{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.dummy import DummyRegressor\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import HistGradientBoostingRegressor\n", "from sklearn.metrics import mean_absolute_percentage_error" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Read the lab-04 training CSV straight from its URL into a DataFrame.\n", "ames_train = pd.read_csv(\"https://cs307.org/lab-04/data/ames-train.csv\")\n", "# Read the matching lab-04 test CSV the same way; it is kept as a separate frame.\n", "ames_test = pd.read_csv(\"https://cs307.org/lab-04/data/ames-test.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Order | \n", "PID | \n", "MS SubClass | \n", "MS Zoning | \n", "Lot Frontage | \n", "Lot Area | \n", "Street | \n", "Alley | \n", "Lot Shape | \n", "Land Contour | \n", "... | \n", "Pool Area | \n", "Pool QC | \n", "Fence | \n", "Misc Feature | \n", "Misc Val | \n", "Mo Sold | \n", "Yr Sold | \n", "Sale Type | \n", "Sale Condition | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2832 | \n", "908188140 | \n", "160 | \n", "RM | \n", "24.0 | \n", "2522 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "4 | \n", "2006 | \n", "WD | \n", "Normal | \n", "137500 | \n", "
1 | \n", "2736 | \n", "905426150 | \n", "80 | \n", "RL | \n", "NaN | \n", "19690 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "... | \n", "738 | \n", "Gd | \n", "GdPrv | \n", "NaN | \n", "0 | \n", "8 | \n", "2006 | \n", "WD | \n", "Alloca | \n", "274970 | \n", "
2 | \n", "2135 | \n", "907200130 | \n", "20 | \n", "RL | \n", "97.0 | \n", "11800 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Bnk | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "8 | \n", "2007 | \n", "WD | \n", "Family | \n", "131000 | \n", "
3 | \n", "2424 | \n", "528228415 | \n", "120 | \n", "RM | \n", "NaN | \n", "3072 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2006 | \n", "WD | \n", "Normal | \n", "178740 | \n", "
4 | \n", "1967 | \n", "535457020 | \n", "20 | \n", "RL | \n", "80.0 | \n", "8000 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "MnPrv | \n", "NaN | \n", "0 | \n", "11 | \n", "2007 | \n", "WD | \n", "Normal | \n", "156500 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1870 | \n", "1020 | \n", "527302070 | \n", "20 | \n", "RL | \n", "NaN | \n", "10825 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "7 | \n", "2008 | \n", "WD | \n", "Normal | \n", "181900 | \n", "
1871 | \n", "237 | \n", "905426200 | \n", "20 | \n", "RL | \n", "65.0 | \n", "11479 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "MnPrv | \n", "NaN | \n", "0 | \n", "6 | \n", "2010 | \n", "WD | \n", "Normal | \n", "144500 | \n", "
1872 | \n", "1547 | \n", "910202050 | \n", "30 | \n", "RM | \n", "40.0 | \n", "3636 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "MnPrv | \n", "NaN | \n", "0 | \n", "1 | \n", "2008 | \n", "WD | \n", "Normal | \n", "55000 | \n", "
1873 | \n", "1855 | \n", "533251110 | \n", "20 | \n", "RL | \n", "80.0 | \n", "12000 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "3 | \n", "2007 | \n", "WD | \n", "Normal | \n", "255000 | \n", "
1874 | \n", "2836 | \n", "908227030 | \n", "50 | \n", "RL | \n", "62.0 | \n", "8707 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2006 | \n", "WD | \n", "AdjLand | \n", "107000 | \n", "
1875 rows × 82 columns
\n", "Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley',\n", " 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'St...\n", " 'Year Remod/Add',\n", " 'Mas Vnr Area',\n", " 'BsmtFin SF 1',\n", " 'BsmtFin SF 2',\n", " 'Bsmt Unf SF',\n", " 'Total Bsmt SF',\n", " '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF',\n", " 'Gr Liv Area',\n", " 'Bsmt Full Bath',\n", " 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath',\n", " 'Bedroom AbvGr',\n", " 'Kitchen AbvGr',\n", " 'TotRms AbvGrd',\n", " 'Fireplaces',\n", " 'Garage Yr Blt',\n", " 'Garage Cars', 'Garage Area',\n", " 'Wood Deck SF',\n", " 'Open Porch SF',\n", " 'Enclosed Porch', ...])])),\n", " ('Regressor', DummyRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley',\n", " 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'St...\n", " 'Year Remod/Add',\n", " 'Mas Vnr Area',\n", " 'BsmtFin SF 1',\n", " 'BsmtFin SF 2',\n", " 'Bsmt Unf SF',\n", " 'Total Bsmt SF',\n", " '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF',\n", " 'Gr Liv Area',\n", " 'Bsmt Full Bath',\n", " 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath',\n", " 'Bedroom AbvGr',\n", " 'Kitchen AbvGr',\n", " 'TotRms AbvGrd',\n", " 'Fireplaces',\n", " 'Garage Yr Blt',\n", " 'Garage Cars', 'Garage Area',\n", " 'Wood Deck SF',\n", " 'Open Porch SF',\n", " 'Enclosed Porch', ...])])),\n", " ('Regressor', DummyRegressor())])
ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley', 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'Street', 'Lot Shape',\n", " 'Land Contour', '...\n", " 'Overall Qual', 'Overall Cond', 'Year Built',\n", " 'Year Remod/Add', 'Mas Vnr Area',\n", " 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',\n", " 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF', 'Gr Liv Area',\n", " 'Bsmt Full Bath', 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath', 'Bedroom AbvGr',\n", " 'Kitchen AbvGr', 'TotRms AbvGrd',\n", " 'Fireplaces', 'Garage Yr Blt', 'Garage Cars',\n", " 'Garage Area', 'Wood Deck SF',\n", " 'Open Porch SF', 'Enclosed Porch', ...])])
['Order', 'PID', 'Alley', 'Pool QC', 'Fence', 'Misc Feature']
drop
['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Sale Type', 'Sale Condition']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5)
['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold']
SimpleImputer(strategy='median')
StandardScaler()
DummyRegressor()
GridSearchCV(cv=5,\n", " estimator=Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns',\n", " 'drop',\n", " ['Order',\n", " 'PID',\n", " 'Alley',\n", " 'Pool '\n", " 'QC',\n", " 'Fence',\n", " 'Misc '\n", " 'Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_...\n", " 'Regressor__n_neighbors': [1, 3, 5, 7, 9, 11, 15]},\n", " {'Regressor': [DecisionTreeRegressor()],\n", " 'Regressor__max_depth': [1, 3, 5, 7, 9, 11, 15,\n", " None]},\n", " {'Regressor': [HistGradientBoostingRegressor()],\n", " 'Regressor__l2_regularization': [0.1, 1.0],\n", " 'Regressor__learning_rate': [0.1, 0.01, 0.001],\n", " 'Regressor__max_depth': [None, 3],\n", " 'Regressor__max_iter': [1000]}],\n", " scoring='neg_mean_absolute_percentage_error', verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=5,\n", " estimator=Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns',\n", " 'drop',\n", " ['Order',\n", " 'PID',\n", " 'Alley',\n", " 'Pool '\n", " 'QC',\n", " 'Fence',\n", " 'Misc '\n", " 'Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_...\n", " 'Regressor__n_neighbors': [1, 3, 5, 7, 9, 11, 15]},\n", " {'Regressor': [DecisionTreeRegressor()],\n", " 'Regressor__max_depth': [1, 3, 5, 7, 9, 11, 15,\n", " None]},\n", " {'Regressor': [HistGradientBoostingRegressor()],\n", " 'Regressor__l2_regularization': [0.1, 1.0],\n", " 'Regressor__learning_rate': [0.1, 0.01, 0.001],\n", " 'Regressor__max_depth': [None, 3],\n", " 'Regressor__max_iter': [1000]}],\n", " scoring='neg_mean_absolute_percentage_error', verbose=2)
Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley',\n", " 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'St...\n", " '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF',\n", " 'Gr Liv Area',\n", " 'Bsmt Full Bath',\n", " 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath',\n", " 'Bedroom AbvGr',\n", " 'Kitchen AbvGr',\n", " 'TotRms AbvGrd',\n", " 'Fireplaces',\n", " 'Garage Yr Blt',\n", " 'Garage Cars', 'Garage Area',\n", " 'Wood Deck SF',\n", " 'Open Porch SF',\n", " 'Enclosed Porch', ...])])),\n", " ('Regressor',\n", " HistGradientBoostingRegressor(l2_regularization=1.0,\n", " learning_rate=0.01,\n", " max_iter=1000))])
ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley', 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'Street', 'Lot Shape',\n", " 'Land Contour', '...\n", " 'Overall Qual', 'Overall Cond', 'Year Built',\n", " 'Year Remod/Add', 'Mas Vnr Area',\n", " 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',\n", " 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF', 'Gr Liv Area',\n", " 'Bsmt Full Bath', 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath', 'Bedroom AbvGr',\n", " 'Kitchen AbvGr', 'TotRms AbvGrd',\n", " 'Fireplaces', 'Garage Yr Blt', 'Garage Cars',\n", " 'Garage Area', 'Wood Deck SF',\n", " 'Open Porch SF', 'Enclosed Porch', ...])])
['Order', 'PID', 'Alley', 'Pool QC', 'Fence', 'Misc Feature']
drop
['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Sale Type', 'Sale Condition']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5)
['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold']
SimpleImputer(strategy='median')
StandardScaler()
HistGradientBoostingRegressor(l2_regularization=1.0, learning_rate=0.01,\n", " max_iter=1000)