Preparation¶

In [ ]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from feature_engine.imputation import CategoricalImputer
from sklearn.base import BaseEstimator, TransformerMixin
In [ ]:
# load train and test data
train_raw = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv", index_col='Id')
test_raw = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv", index_col='Id')
print("The shape of train: {}".format(train_raw.shape))
print("The shape of test: {}".format(test_raw.shape))

Check Data¶

Tools for inspecting columns and missing data¶

In [ ]:
## getColInfo returns a table of column names, their dtypes, and their number of unique values.
## The dtype and cardinality_threshold parameters filter which columns are shown.
## It is useful for checking how many categories a column has when choosing an
## encoding method for categorical variables.
def getColInfo(train, dtype=None, cardinality_threshold=0):
    train_dtypes = [train[dt].dtype for dt in train.columns]
    nuniques = [train[col].nunique() for col in train.columns]
    res = pd.DataFrame({'Columns Names': train.columns,
                                 'Dtypes': train_dtypes,
                                 'Unique Value Numbers': nuniques })
    if dtype is not None:
        res = res.loc[res['Dtypes']==dtype]
    res = pd.DataFrame(res.loc[res['Unique Value Numbers'] >= cardinality_threshold])
    return(res.sort_values(by=['Unique Value Numbers'], axis=0))

## getMissingData shows, for each column containing missing data, the percentage of missing values
def getMissingData(train):
    all_data_na = (train.isnull().sum() / len(train)) * 100
    all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
    all_data_na_dtypes = [train[dt].dtype for dt in all_data_na.index]
    nuniques = train[all_data_na.index].nunique()
    missing_data = pd.DataFrame({'Dtypes': all_data_na_dtypes,
                                 'Missing Ratio' : all_data_na,
                                 'Unique Value Numbers': nuniques })
    if missing_data.shape[0] == 0:
        return("no missing vlaue")
    return(missing_data)
In [ ]:
getColInfo(train_raw, dtype='object', cardinality_threshold=10)
In [ ]:
ob = getMissingData(train_raw)
# inspect the index of columns whose missing ratio exceeds 5%
type(ob.loc[ob['Missing Ratio'] > 5].index)

Preprocessing¶

Split Training Data¶

In [ ]:
X_train, X_valid, y_train, y_valid = train_test_split(train_raw.drop('SalePrice', axis=1), 
                                                      train_raw['SalePrice'], 
                                                      test_size=0.1, 
                                                      random_state=99)
In [ ]:
X_train.shape

Delete Unwanted Columns¶

Deletion Strategy¶

In [ ]:
## define a class to filter unwanted columns
class ColumnFilter(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=0):
        super().__init__()
        self.threshold = threshold
        self.col_names_deleted = []
        
    def fit(self, X, y=None):
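        # record the columns whose missing ratio meets or exceeds the threshold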
        missing_df = self.get_missing_data(X, self.threshold)
        self.col_names_deleted = missing_df.index
        return self
    
    def transform(self, X, y=None):
        return(X.drop(columns=self.col_names_deleted))
        
        
    def get_missing_data(self, X, threshold):
        all_data_na = (X.isnull().sum() / len(X)) * 100
        all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
        all_data_na_dtypes = [X[dt].dtype for dt in all_data_na.index]
        nuniques = X[all_data_na.index].nunique()
        missing_data = pd.DataFrame({'Dtypes': all_data_na_dtypes,
                                 'Missing Ratio' : all_data_na,
                                 'Unique Value Numbers': nuniques })
        return(missing_data.loc[missing_data['Missing Ratio'] >= self.threshold])
    
In [ ]:
# test
# drop all columns whose missing ratio is at least 5%
cf = ColumnFilter(5)
X_train_transformed = cf.fit_transform(X_train)
X_train_transformed.shape
In [ ]:
X_train_transformed.head()

Impute Missing Values¶

Imputation Strategy¶

For numerical values, impute the column mean if there is no specified requirement. For categorical values, impute the mode (most frequent value) if there is no specified requirement.

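For comparison, the same strategy can be sketched with sklearn's built-in SimpleImputer (already imported above) inside a ColumnTransformer. The cell below is an untested alternative, not the transformer this notebook actually uses; ColumnTransformer and make_column_selector are the only names not already imported.

In [ ]:
## rough sketch: built-in imputers instead of the custom MyImputer defined below
from sklearn.compose import ColumnTransformer, make_column_selector

reference_imputer = ColumnTransformer([
    # most frequent value (mode) for categorical (object) columns
    ('cat', SimpleImputer(strategy='most_frequent'),
     make_column_selector(dtype_include=object)),
    # column mean for numeric columns
    ('num', SimpleImputer(strategy='mean'),
     make_column_selector(dtype_include=np.number)),
])
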
Realization¶

In [ ]:
## Below is a class you can use to perform customized imputation
class MyImputer(BaseEstimator, TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.
        """
        
    def fit(self, X, y=None):

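        # per-column fill value: mode for object columns, mean for numeric columns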
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
In [ ]:
# test
# impute X_train
my_imputer = MyImputer()
X_train_imputed = my_imputer.fit_transform(X_train_transformed)
# check missing data after imputation
getMissingData(X_train_imputed)
In [ ]:
X_train_imputed.shape

Encode Categorical Variables¶

Encoding Strategy¶

We will one-hot encode every categorical column whose cardinality exceeds a specified threshold (with the default threshold of 0, all categorical columns are encoded).

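As a quick illustration of what one-hot encoding produces (a toy example with hypothetical data, not the competition columns): each category becomes its own 0/1 column, and handle_unknown='ignore' maps categories unseen during fit to a row of all zeros instead of raising an error.

In [ ]:
## toy illustration of OneHotEncoder (hypothetical data)
demo = pd.DataFrame({'Street': ['Pave', 'Grvl', 'Pave']})
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(demo)
print(enc.get_feature_names_out())   # ['Street_Grvl' 'Street_Pave']
# the unseen category 'Dirt' encodes to a row of zeros
enc.transform(pd.DataFrame({'Street': ['Grvl', 'Dirt']})).toarray()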
In [ ]:
class MyEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, cardinality_threshold=0):
        self.cardinality_threshold=cardinality_threshold
        self.cat_cols = []
        self.one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        
        
    def fit(self, X, y=None):
        # collect categorical column names whose cardinality exceeds the threshold;
        # columns at or below the threshold pass through transform() unencoded
        self.cat_cols = list(X.columns[X.dtypes == 'O'])
        self.cat_cols = list(X[self.cat_cols].columns[X[self.cat_cols].nunique() > self.cardinality_threshold])
        self.one_hot_encoder.fit(X[self.cat_cols])
        return(self)

    def transform(self, X, y=None):
        # one hot encoder transform 
        X_encoded = pd.DataFrame(self.one_hot_encoder.transform(X[self.cat_cols]).toarray(), 
                                 index=X.index, 
                                 columns=self.one_hot_encoder.get_feature_names_out())
        #print('The shape of X_encoded is {}'.format(X_encoded.shape))
        # delete original categorical columns
        X_dropped = X.drop(self.cat_cols,axis=1)
        #print('The shape of X_dropped is {}'.format(X_dropped.shape))
        # concat
        return(pd.concat([X_dropped,X_encoded],axis=1))
In [ ]:
my_encoder= MyEncoder()
my_encoder.fit(X_train)
X_encoded = my_encoder.transform(X_train)
X_encoded.shape

Building Pipeline¶

In [ ]:
pipe_ranf = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('model_ranf', RandomForestRegressor())]) 
In [ ]:
grid_params_ranf = {
    'ColumnFilter_ranf__threshold':[5],
    'model_ranf__n_estimators':[500],
    'model_ranf__max_depth':[12,14,16],
    'model_ranf__criterion':['squared_error'],
    'model_ranf__min_samples_leaf':[5,10,20],
    'model_ranf__oob_score':[True],
    'model_ranf__random_state':[2021]
}
ranf_cv_tune = GridSearchCV(estimator=pipe_ranf,
            param_grid=grid_params_ranf,
            scoring='neg_mean_squared_error',
            cv=5)
In [ ]:
# log1p-transform the target so the squared-error objective works on a relative (log) scale
ranf_cv_tune.fit(X_train, np.log1p(y_train))
In [ ]:
ranf_best = ranf_cv_tune.best_estimator_
pred_valid = ranf_best.predict(X_valid)
In [ ]:
def MSE(y_valid,y_pred):
    res = np.square(np.subtract(y_valid,y_pred)).mean()
    return(res)
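
Since the model was fit on np.log1p(y_train), this MSE is evaluated on the log scale; its square root is an RMSE comparable to the competition's root-mean-squared-log-error metric. For example:

In [ ]:
# RMSE on the log scale (square root of the validation MSE computed below)
np.sqrt(MSE(np.log1p(y_valid), pred_valid))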
In [ ]:
MSE(np.log1p(y_valid), pred_valid)
In [ ]:
# test
pipe_ranf2 = Pipeline([('ColumnFilter_ranf', ColumnFilter(5)),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),]) 

X_transformed = pipe_ranf2.fit_transform(X_train)

ranf2 = RandomForestRegressor()
In [ ]:
grid_params_ranf2 = {
    'n_estimators':[500],
    'max_depth':[4,6,10,14],
    'criterion':['squared_error'],
    'min_samples_leaf':[5,10,20],
    'oob_score':[True],
    'random_state':[2021]
}
grid2 = GridSearchCV(estimator=ranf2,
            param_grid=grid_params_ranf2,
            scoring='neg_mean_squared_error',
            cv=5)


grid2.fit(X_transformed.values, np.log1p(y_train))
In [ ]:
grid2.best_estimator_
In [ ]:
ranf_best = RandomForestRegressor(
    n_estimators=500,
    max_depth=14,
    criterion='squared_error',
    min_samples_leaf=5,
    oob_score=True,
    random_state=2021
)
ranf_best.fit(X_transformed.values,  np.log1p(y_train))
In [ ]:
X_valid_transformed = pipe_ranf2.transform(X_valid)
pred_value = ranf_best.predict(X_valid_transformed.values)
In [ ]:
MSE_valid = np.square(np.subtract(np.log1p(y_valid),pred_value)).mean();MSE_valid
In [ ]:
# define the inverse transform for y: np.expm1 undoes the np.log1p applied to the target
def trans_Y(Y):
    return(np.expm1(Y))
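
A quick round-trip check that the inverse transform recovers the original target values:

In [ ]:
# np.expm1 exactly inverts np.log1p
np.allclose(trans_Y(np.log1p(y_train)), y_train)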

Prediction¶

In [ ]:
test_raw_transformed = pipe_ranf2.transform(test_raw)
pred_test = ranf_best.predict(test_raw_transformed.values)
In [ ]:
pred_test_reversed = trans_Y(pred_test)
In [ ]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission.csv', index=False)

XGBoost¶

Preprocessing¶

In [ ]:
import xgboost as xgb
pipe_xgb = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('XGBoost', xgb.XGBRegressor())]) 

Training¶

In [ ]:
grid_params_xgb = {
    'ColumnFilter_ranf__threshold':[5],
    'XGBoost__n_estimators':[200,500],
    'XGBoost__max_depth':[2,4],
    'XGBoost__min_child_weight': [5,8],
    'XGBoost__learning_rate': [0.1,0.05],
    'XGBoost__gamma':[0.01,0.1],
    'XGBoost__reg_lambda': [0.1,0.5,2,5],
    'XGBoost__random_state': [2022]
}
xgb_cv_tune = GridSearchCV(estimator=pipe_xgb,
            param_grid=grid_params_xgb,
            scoring='neg_mean_squared_error',
            cv=5)
In [ ]:
xgb_cv_tune.fit(X_train,np.log1p(y_train))
In [ ]:
xgb_best = xgb_cv_tune.best_estimator_;xgb_best
In [ ]:
pred_xgb = xgb_best.predict(X_valid)
In [ ]:
MSE_valid = MSE(np.log1p(y_valid), pred_xgb);MSE_valid

Prediction¶

In [ ]:
pred_test = xgb_best.predict(test_raw)
pred_test_reversed = trans_Y(pred_test)
In [ ]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_xgboost.csv', index=False)

Lasso¶

In [ ]:
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
In [ ]:
pipe_lasso = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('Lasso', Lasso())
                      ]) 
In [ ]:
grid_params_lasso = {
    'ColumnFilter_ranf__threshold':[5],
    'Lasso__alpha':[0.01,0.05,0.1,1,2],
    'Lasso__random_state':[2021],
    'Lasso__selection':['random'],
    'Lasso__max_iter': [1000,1500,2000]
}
lasso_cv_tune = GridSearchCV(estimator=pipe_lasso,
            param_grid=grid_params_lasso,
            scoring='neg_mean_squared_error',
            cv=5)
In [ ]:
lasso_cv_tune.fit(X_train, np.log1p(y_train))
In [ ]:
lasso_best = lasso_cv_tune.best_estimator_
In [ ]:
lasso_best.fit(X_train, np.log1p(y_train))
In [ ]:
lasso_pred = lasso_best.predict(X_valid)
In [ ]:
MSE(np.log1p(y_valid), lasso_pred)

Kernel Ridge Regression¶

In [ ]:
from sklearn.kernel_ridge import KernelRidge
In [ ]:
pipe_ridge = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('KernelRidge', KernelRidge())
                      ]) 
In [ ]:
grid_params_ridge = {
    'ColumnFilter_ranf__threshold':[5],
    'KernelRidge__alpha':[0.01,0.1,0.5],
    'KernelRidge__kernel':['polynomial','linear'],
    'KernelRidge__degree': [3],
    'KernelRidge__coef0': [2,2.5,3]
}
ridge_cv_tune = GridSearchCV(estimator=pipe_ridge,
            param_grid=grid_params_ridge,
            scoring='neg_mean_squared_error',
            cv=5)
In [ ]:
ridge_cv_tune.fit(X_train, np.log1p(y_train))
In [ ]:
ridge_best = ridge_cv_tune.best_estimator_
In [ ]:
ridge_cv_tune.scorer_
In [ ]:
ridge_pred = ridge_best.predict(X_valid)
MSE(np.log1p(y_valid), ridge_pred)

Elastic Net Regression¶

In [ ]:
from sklearn.linear_model import ElasticNet
In [ ]:
pipe_elastic = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('ElasticNet', ElasticNet())
                      ]) 
In [ ]:
grid_params_elastic = {
    'ColumnFilter_ranf__threshold':[5],
    'ElasticNet__alpha':[0.01,0.1,0.5,0.6,0.8],
    'ElasticNet__l1_ratio':[0.1,0.3,0.5,0.7,1]
}
elastic_cv_tune = GridSearchCV(estimator=pipe_elastic,
            param_grid=grid_params_elastic,
            scoring='neg_mean_squared_error',
            cv=5)
In [ ]:
elastic_cv_tune.fit(X_train, np.log1p(y_train))
In [ ]:
elastic_best = elastic_cv_tune.best_estimator_
In [ ]:
elastic_pred = elastic_best.predict(X_valid)
MSE(np.log1p(y_valid), elastic_pred)

Prediction¶

In [ ]:
pred_elastic = elastic_best.predict(test_raw)
In [ ]:
pred_test_reversed = trans_Y(pred_elastic)
In [ ]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_elastic.csv', index=False)

Ensemble¶

Ensemble with an averaging strategy¶

In [ ]:
from sklearn.ensemble import VotingRegressor
In [ ]:
votingR = VotingRegressor(estimators=[
    ('xgb', xgb_best),
    ('lasso',lasso_best),
    ('elastic', elastic_best)],                    
     n_jobs=4)

votingR = votingR.fit(X_train, np.log1p(y_train))
In [ ]:
voting_pred = votingR.predict(X_valid)
In [ ]:
MSE(np.log1p(y_valid), voting_pred)
In [ ]:
pred_test = votingR.predict(test_raw)
pred_test_reversed = trans_Y(pred_test)
In [ ]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_ensemble_average.csv', index=False)