{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "import seaborn as sns\n",
 "import matplotlib.pyplot as plt\n",
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn.compose import ColumnTransformer\n",
 "from sklearn.impute import SimpleImputer\n",
 "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
 "from sklearn.model_selection import GridSearchCV\n",
 "from sklearn.dummy import DummyRegressor\n",
 "from sklearn.neighbors import KNeighborsRegressor\n",
 "from sklearn.tree import DecisionTreeRegressor\n",
 "from sklearn.linear_model import LinearRegression\n",
 "from sklearn.ensemble import HistGradientBoostingRegressor\n",
 "from sklearn.metrics import mean_absolute_percentage_error"
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "# read the training and test CSV files straight from their URLs\n",
 "ames_train = pd.read_csv(\"https://cs307.org/lab-04/data/ames-train.csv\")\n",
 "ames_test = pd.read_csv(\"https://cs307.org/lab-04/data/ames-test.csv\")"
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OrderPIDMS SubClassMS ZoningLot FrontageLot AreaStreetAlleyLot ShapeLand Contour...Pool AreaPool QCFenceMisc FeatureMisc ValMo SoldYr SoldSale TypeSale ConditionSalePrice
02832908188140160RM24.02522PaveNaNRegLvl...0NaNNaNNaN042006WDNormal137500
1273690542615080RLNaN19690PaveNaNIR1Lvl...738GdGdPrvNaN082006WDAlloca274970
2213590720013020RL97.011800PaveNaNIR1Bnk...0NaNNaNNaN082007WDFamily131000
32424528228415120RMNaN3072PaveNaNRegLvl...0NaNNaNNaN052006WDNormal178740
4196753545702020RL80.08000PaveNaNRegLvl...0NaNMnPrvNaN0112007WDNormal156500
..................................................................
1870102052730207020RLNaN10825PaveNaNIR1Lvl...0NaNNaNNaN072008WDNormal181900
187123790542620020RL65.011479PaveNaNRegLvl...0NaNMnPrvNaN062010WDNormal144500
1872154791020205030RM40.03636PaveNaNRegLvl...0NaNMnPrvNaN012008WDNormal55000
1873185553325111020RL80.012000PaveNaNRegLvl...0NaNNaNNaN032007WDNormal255000
1874283690822703050RL62.08707PaveNaNRegLvl...0NaNNaNNaN052006WDAdjLand107000
\n", "

1875 rows × 82 columns

\n", "
" ], "text/plain": [ " Order PID MS SubClass MS Zoning Lot Frontage Lot Area Street \\\n", "0 2832 908188140 160 RM 24.0 2522 Pave \n", "1 2736 905426150 80 RL NaN 19690 Pave \n", "2 2135 907200130 20 RL 97.0 11800 Pave \n", "3 2424 528228415 120 RM NaN 3072 Pave \n", "4 1967 535457020 20 RL 80.0 8000 Pave \n", "... ... ... ... ... ... ... ... \n", "1870 1020 527302070 20 RL NaN 10825 Pave \n", "1871 237 905426200 20 RL 65.0 11479 Pave \n", "1872 1547 910202050 30 RM 40.0 3636 Pave \n", "1873 1855 533251110 20 RL 80.0 12000 Pave \n", "1874 2836 908227030 50 RL 62.0 8707 Pave \n", "\n", " Alley Lot Shape Land Contour ... Pool Area Pool QC Fence Misc Feature \\\n", "0 NaN Reg Lvl ... 0 NaN NaN NaN \n", "1 NaN IR1 Lvl ... 738 Gd GdPrv NaN \n", "2 NaN IR1 Bnk ... 0 NaN NaN NaN \n", "3 NaN Reg Lvl ... 0 NaN NaN NaN \n", "4 NaN Reg Lvl ... 0 NaN MnPrv NaN \n", "... ... ... ... ... ... ... ... ... \n", "1870 NaN IR1 Lvl ... 0 NaN NaN NaN \n", "1871 NaN Reg Lvl ... 0 NaN MnPrv NaN \n", "1872 NaN Reg Lvl ... 0 NaN MnPrv NaN \n", "1873 NaN Reg Lvl ... 0 NaN NaN NaN \n", "1874 NaN Reg Lvl ... 0 NaN NaN NaN \n", "\n", " Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice \n", "0 0 4 2006 WD Normal 137500 \n", "1 0 8 2006 WD Alloca 274970 \n", "2 0 8 2007 WD Family 131000 \n", "3 0 5 2006 WD Normal 178740 \n", "4 0 11 2007 WD Normal 156500 \n", "... ... ... ... ... ... ... \n", "1870 0 7 2008 WD Normal 181900 \n", "1871 0 6 2010 WD Normal 144500 \n", "1872 0 1 2008 WD Normal 55000 \n", "1873 0 3 2007 WD Normal 255000 \n", "1874 0 5 2006 WD AdjLand 107000 \n", "\n", "[1875 rows x 82 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ames_train" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# distribution of the target column in the training data\n",
 "sns.histplot(ames_train[\"SalePrice\"], kde=True)\n",
 "plt.title(\"Distribution of Sale Price\")\n",
 "plt.xlabel(\"Sale Price ($)\")\n",
 "plt.ylabel(\"Frequency\")\n",
 "plt.show()"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "# create X and y for train dataset\n",
 "X_train = ames_train.drop(columns=\"SalePrice\")\n",
 "y_train = ames_train[\"SalePrice\"]\n",
 "\n",
 "# create X and y for test dataset\n",
 "X_test = ames_test.drop(columns=\"SalePrice\")\n",
 "y_test = ames_test[\"SalePrice\"]"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "# helper function to find columns that are mostly missing\n",
 "def columns_missing_above_threshold(df, threshold):\n",
 "    \"\"\"Return the names of the columns of `df` whose share of missing values exceeds `threshold`.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    df : pandas.DataFrame\n",
 "        Frame whose columns are checked for missing values.\n",
 "    threshold : float\n",
 "        Proportion of rows; a column is reported only when its missing share is strictly greater.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    list\n",
 "        Column labels, in the order they appear in `df`.\n",
 "    \"\"\"\n",
 "    # mean of the boolean missing-mask == missing count / number of rows\n",
 "    missing_proportion = df.isna().mean()\n",
 "    return missing_proportion[missing_proportion > threshold].index.tolist()"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "# define columns to be excluded\n",
 "# (thresholds are computed on the training features only, so the test set does not influence them)\n",
 "many_missing_columns = columns_missing_above_threshold(X_train, 0.6)\n",
 "exclude_columns = [\"Order\", \"PID\"] + many_missing_columns\n",
 "\n",
 "# find columns with string data type (will be considered categorical)\n",
 "string_columns = X_train.select_dtypes(include=[\"object\"]).columns.tolist()\n",
 "string_columns = [col for col in string_columns if col not in exclude_columns]\n",
 "\n",
 "# find columns with numeric data type\n",
 "numeric_columns = X_train.select_dtypes(include=[\"int64\", \"float64\"]).columns.tolist()\n",
 "numeric_columns = [col for col in numeric_columns if col not in exclude_columns]"
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
 "# pipeline for numeric\n",
 "numeric_preprocessor = Pipeline(\n",
 "    steps=[\n",
 "        (\"MedianImputer\", SimpleImputer(strategy=\"median\")),\n",
 "        (\"Standardize\", StandardScaler()),\n",
 "    ]\n",
 ")\n",
 "\n",
 "# pipeline for categorical\n",
 "categorical_preprocessor = Pipeline(\n",
 "    steps=[\n",
 "        (\"ModalImputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
 "        (\"OneHotEncoder\", OneHotEncoder(handle_unknown=\"infrequent_if_exist\", max_categories=5)),\n",
 "    ]\n",
 ")\n",
 "\n",
 "# column transformer\n",
 "preprocessor = ColumnTransformer(\n",
 "    transformers=[\n",
 "        (\"ExcludeColumns\", \"drop\", exclude_columns),\n",
 "        (\"CategoricalPreprocessing\", categorical_preprocessor, string_columns),\n",
 "        (\"NumericProcessing\", numeric_preprocessor, numeric_columns),\n",
 "    ],\n",
 "    remainder=\"drop\",\n",
 "    # always return a dense array: the one-hot block is sparse by default, and\n",
 "    # HistGradientBoostingRegressor (one of the grid candidates) does not accept sparse input\n",
 "    sparse_threshold=0,\n",
 ")\n",
 "\n",
 "# full pipeline\n",
 "# (the DummyRegressor is only a placeholder; the grid search swaps in each candidate regressor)\n",
 "pipeline = Pipeline(\n",
 "    steps=[\n",
 "        (\"Preprocessor\", preprocessor),\n",
 "        (\"Regressor\", DummyRegressor()),\n",
 "    ]\n",
 ")"
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "# fixed seed so the estimators that use a random number generator while fitting\n",
 "# give the same cross-validation scores on every run\n",
 "RANDOM_STATE = 42\n",
 "\n",
 "# one dict per model family; the \"Regressor\" key replaces the pipeline's final step\n",
 "param_grid = [\n",
 "    {\n",
 "        \"Regressor\": [DummyRegressor()],\n",
 "    },\n",
 "    {\n",
 "        \"Regressor\": [LinearRegression()],\n",
 "    },\n",
 "    {\n",
 "        \"Regressor\": [KNeighborsRegressor()],\n",
 "        \"Regressor__n_neighbors\": [1, 3, 5, 7, 9, 11, 15],\n",
 "    },\n",
 "    {\n",
 "        \"Regressor\": [DecisionTreeRegressor(random_state=RANDOM_STATE)],\n",
 "        \"Regressor__max_depth\": [1, 3, 5, 7, 9, 11, 15, None],\n",
 "    },\n",
 "    {\n",
 "        \"Regressor\": [HistGradientBoostingRegressor(random_state=RANDOM_STATE)],\n",
 "        \"Regressor__learning_rate\": [0.1, 0.01, 0.001],\n",
 "        \"Regressor__max_iter\": [1000],\n",
 "        \"Regressor__max_depth\": [None, 3],\n",
 "        \"Regressor__l2_regularization\": [0.1, 1.0],\n",
 "    },\n",
 "]"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('Preprocessor',\n",
       "                 ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n",
       "                                                  ['Order', 'PID', 'Alley',\n",
       "                                                   'Pool QC', 'Fence',\n",
       "                                                   'Misc Feature']),\n",
       "                                                 ('CategoricalPreprocessing',\n",
       "                                                  Pipeline(steps=[('ModalImputer',\n",
       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
       "                                                                  ('OneHotEncoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='infrequent_if_exist',\n",
       "                                                                                 max_categories=5))]),\n",
       "                                                  ['MS Zoning', 'St...\n",
       "                                                   'Year Remod/Add',\n",
       "                                                   'Mas Vnr Area',\n",
       "                                                   'BsmtFin SF 1',\n",
       "                                                   'BsmtFin SF 2',\n",
       "                                                   'Bsmt Unf SF',\n",
       "                                                   'Total Bsmt SF',\n",
       "                                                   '1st Flr SF', '2nd Flr SF',\n",
       "                                                   'Low Qual Fin SF',\n",
       "                                                   'Gr Liv Area',\n",
       "                                                   'Bsmt Full Bath',\n",
       "                                                   'Bsmt Half Bath',\n",
       "                                                   'Full Bath', 'Half Bath',\n",
       "                                                   'Bedroom AbvGr',\n",
       "                                                   'Kitchen AbvGr',\n",
       "                                                   'TotRms AbvGrd',\n",
       "                                                   'Fireplaces',\n",
       "                                                   'Garage Yr Blt',\n",
       "                                                   'Garage Cars', 'Garage Area',\n",
       "                                                   'Wood Deck SF',\n",
       "                                                   'Open Porch SF',\n",
       "                                                   'Enclosed Porch', ...])])),\n",
       "                ('Regressor', DummyRegressor())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns', 'drop',\n", " ['Order', 'PID', 'Alley',\n", " 'Pool QC', 'Fence',\n", " 'Misc Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_categories=5))]),\n", " ['MS Zoning', 'St...\n", " 'Year Remod/Add',\n", " 'Mas Vnr Area',\n", " 'BsmtFin SF 1',\n", " 'BsmtFin SF 2',\n", " 'Bsmt Unf SF',\n", " 'Total Bsmt SF',\n", " '1st Flr SF', '2nd Flr SF',\n", " 'Low Qual Fin SF',\n", " 'Gr Liv Area',\n", " 'Bsmt Full Bath',\n", " 'Bsmt Half Bath',\n", " 'Full Bath', 'Half Bath',\n", " 'Bedroom AbvGr',\n", " 'Kitchen AbvGr',\n", " 'TotRms AbvGrd',\n", " 'Fireplaces',\n", " 'Garage Yr Blt',\n", " 'Garage Cars', 'Garage Area',\n", " 'Wood Deck SF',\n", " 'Open Porch SF',\n", " 'Enclosed Porch', ...])])),\n", " ('Regressor', DummyRegressor())])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "pipeline"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "# grid search over every candidate in param_grid, using 5-fold cross-validation\n",
 "mod = GridSearchCV(\n",
 "    estimator=pipeline,\n",
 "    param_grid=param_grid,\n",
 "    # candidates are ranked by negated mean absolute percentage error\n",
 "    scoring=\"neg_mean_absolute_percentage_error\",\n",
 "    cv=5,\n",
 "    # n_jobs=-1 runs fits in parallel; verbose=2 logs one line per fit\n",
 "    n_jobs=-1,\n",
 "    verbose=2,\n",
 ")"
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 29 candidates, totalling 145 fits\n", "[CV] END .........................Regressor=DummyRegressor(); total time= 0.1s\n", "[CV] END .........................Regressor=DummyRegressor(); total time= 0.1s\n", "[CV] END .........................Regressor=DummyRegressor(); total time= 0.1s\n", "[CV] END .......................Regressor=LinearRegression(); total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(),
Regressor__n_neighbors=1; total time= 0.1s\n", "[CV] END .........................Regressor=DummyRegressor(); total time= 0.1s\n", "[CV] END .........................Regressor=DummyRegressor(); total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=1; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=1; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=1; total time= 0.1s\n", "[CV] END .......................Regressor=LinearRegression(); total time= 0.2s\n", "[CV] END .......................Regressor=LinearRegression(); total time= 0.2s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=1; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=3; total time= 0.1s\n", "[CV] END .......................Regressor=LinearRegression(); total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=3; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=3; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=3; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=5; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=5; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=3; total time= 0.1s\n", "[CV] END .......................Regressor=LinearRegression(); total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=5; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=7; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=5; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=7; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=7; total time= 0.1s\n", "[CV] 
END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=7; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=7; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=5; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=9; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=9; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=9; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=9; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=9; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=11; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=11; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=11; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=11; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=11; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=15; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=15; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=15; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=1; total time= 0.1s\n", "[CV] END Regressor=KNeighborsRegressor(), Regressor__n_neighbors=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=1; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=1; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=1; total time= 0.1s\n", "[CV] END 
Regressor=DecisionTreeRegressor(), Regressor__max_depth=1; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=3; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=3; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=3; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=3; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=3; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=5; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=5; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=5; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=5; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=5; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=7; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=7; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=7; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=7; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=7; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=9; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=9; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=9; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=9; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=9; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=11; total time= 0.1s\n", "[CV] END 
Regressor=DecisionTreeRegressor(), Regressor__max_depth=11; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=11; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=None; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=None; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=11; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=11; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=None; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=15; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=None; total time= 0.1s\n", "[CV] END Regressor=DecisionTreeRegressor(), Regressor__max_depth=None; total time= 0.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.9s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=3, 
Regressor__max_iter=1000; total time= 3.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.9s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.2s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.9s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.9s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), 
Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.2s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.2s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=None, 
Regressor__max_iter=1000; total time= 8.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 11.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.7s\n", "[CV] END 
Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.4s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 2.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=0.1, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.5s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, 
Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 8.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.1, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.9s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.6s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.4s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.3s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.1s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=3, Regressor__max_iter=1000; total time= 3.2s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; 
total time= 10.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.7s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 9.8s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.001, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 10.0s\n", "[CV] END Regressor=HistGradientBoostingRegressor(), Regressor__l2_regularization=1.0, Regressor__learning_rate=0.01, Regressor__max_depth=None, Regressor__max_iter=1000; total time= 7.8s\n" ] }, { "data": { "text/html": [ "
GridSearchCV(cv=5,\n",
       "             estimator=Pipeline(steps=[('Preprocessor',\n",
       "                                        ColumnTransformer(transformers=[('ExcludeColumns',\n",
       "                                                                         'drop',\n",
       "                                                                         ['Order',\n",
       "                                                                          'PID',\n",
       "                                                                          'Alley',\n",
       "                                                                          'Pool '\n",
       "                                                                          'QC',\n",
       "                                                                          'Fence',\n",
       "                                                                          'Misc '\n",
       "                                                                          'Feature']),\n",
       "                                                                        ('CategoricalPreprocessing',\n",
       "                                                                         Pipeline(steps=[('ModalImputer',\n",
       "                                                                                          SimpleImputer(strategy='most_frequent')),\n",
       "                                                                                         ('OneHotEncoder',\n",
       "                                                                                          OneHotEncoder(handle_unknown='infrequent_if_exist',\n",
       "                                                                                                        max_...\n",
       "                          'Regressor__n_neighbors': [1, 3, 5, 7, 9, 11, 15]},\n",
       "                         {'Regressor': [DecisionTreeRegressor()],\n",
       "                          'Regressor__max_depth': [1, 3, 5, 7, 9, 11, 15,\n",
       "                                                   None]},\n",
       "                         {'Regressor': [HistGradientBoostingRegressor()],\n",
       "                          'Regressor__l2_regularization': [0.1, 1.0],\n",
       "                          'Regressor__learning_rate': [0.1, 0.01, 0.001],\n",
       "                          'Regressor__max_depth': [None, 3],\n",
       "                          'Regressor__max_iter': [1000]}],\n",
       "             scoring='neg_mean_absolute_percentage_error', verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "GridSearchCV(cv=5,\n", " estimator=Pipeline(steps=[('Preprocessor',\n", " ColumnTransformer(transformers=[('ExcludeColumns',\n", " 'drop',\n", " ['Order',\n", " 'PID',\n", " 'Alley',\n", " 'Pool '\n", " 'QC',\n", " 'Fence',\n", " 'Misc '\n", " 'Feature']),\n", " ('CategoricalPreprocessing',\n", " Pipeline(steps=[('ModalImputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('OneHotEncoder',\n", " OneHotEncoder(handle_unknown='infrequent_if_exist',\n", " max_...\n", " 'Regressor__n_neighbors': [1, 3, 5, 7, 9, 11, 15]},\n", " {'Regressor': [DecisionTreeRegressor()],\n", " 'Regressor__max_depth': [1, 3, 5, 7, 9, 11, 15,\n", " None]},\n", " {'Regressor': [HistGradientBoostingRegressor()],\n", " 'Regressor__l2_regularization': [0.1, 1.0],\n", " 'Regressor__learning_rate': [0.1, 0.01, 0.001],\n", " 'Regressor__max_depth': [None, 3],\n", " 'Regressor__max_iter': [1000]}],\n", " scoring='neg_mean_absolute_percentage_error', verbose=2)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mod.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Regressor': HistGradientBoostingRegressor(),\n", " 'Regressor__l2_regularization': 1.0,\n", " 'Regressor__learning_rate': 0.01,\n", " 'Regressor__max_depth': None,\n", " 'Regressor__max_iter': 1000}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mod.best_params_" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# mod.cv_results_" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CV MAPE: 0.09323748138117231\n" ] } ], "source": [ "print(f\"CV MAPE: {-mod.best_score_}\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Test MAPE: 0.08164649164771304\n" ] } ], "source": [ "y_test_pred = mod.predict(X_test)\n", "test_mape = mean_absolute_percentage_error(y_test, y_test_pred)\n", "print(f\"Test MAPE: {test_mape}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }