{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# basics\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# machine learning\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# helper function to simulate with a mix of numeric and categorical features\n",
"# also include missing data\n",
"def simulate_classification_data(n_samples=100, seed=42):\n",
" np.random.seed(seed)\n",
" numeric_features = pd.DataFrame(\n",
" {\n",
" \"num_small\": np.round(np.random.randn(n_samples), 2),\n",
" \"num_large\": np.round(100 * np.random.randn(n_samples), 2),\n",
" }\n",
" )\n",
" categorical_features = pd.DataFrame(\n",
" {\n",
" \"animal\": np.random.choice([\"cat\", \"dog\"], size=n_samples),\n",
" \"letter\": np.random.choice([\"X\", \"Y\", \"Z\"], size=n_samples),\n",
" }\n",
" )\n",
" df = pd.concat([numeric_features, categorical_features], axis=1)\n",
" df.loc[df.sample(frac=0.1).index, \"num_small\"] = np.nan\n",
" df.loc[df.sample(frac=0.2).index, \"animal\"] = np.nan\n",
" df.loc[df.sample(frac=0.3).index, \"letter\"] = np.nan\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num_small \n",
" num_large \n",
" animal \n",
" letter \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" NaN \n",
" -46.34 \n",
" dog \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" -0.14 \n",
" -46.57 \n",
" dog \n",
" X \n",
" \n",
" \n",
" 2 \n",
" 0.65 \n",
" 24.20 \n",
" cat \n",
" NaN \n",
" \n",
" \n",
" 3 \n",
" 1.52 \n",
" -191.33 \n",
" dog \n",
" Z \n",
" \n",
" \n",
" 4 \n",
" -0.23 \n",
" -172.49 \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 5 \n",
" -0.23 \n",
" -56.23 \n",
" dog \n",
" X \n",
" \n",
" \n",
" 6 \n",
" 1.58 \n",
" -101.28 \n",
" cat \n",
" Z \n",
" \n",
" \n",
" 7 \n",
" 0.77 \n",
" 31.42 \n",
" dog \n",
" Y \n",
" \n",
" \n",
" 8 \n",
" -0.47 \n",
" -90.80 \n",
" NaN \n",
" X \n",
" \n",
" \n",
" 9 \n",
" 0.54 \n",
" -141.23 \n",
" cat \n",
" Y \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num_small num_large animal letter\n",
"0 NaN -46.34 dog NaN\n",
"1 -0.14 -46.57 dog X\n",
"2 0.65 24.20 cat NaN\n",
"3 1.52 -191.33 dog Z\n",
"4 -0.23 -172.49 NaN NaN\n",
"5 -0.23 -56.23 dog X\n",
"6 1.58 -101.28 cat Z\n",
"7 0.77 31.42 dog Y\n",
"8 -0.47 -90.80 NaN X\n",
"9 0.54 -141.23 cat Y"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"example_data = simulate_classification_data(n_samples=10)\n",
"example_data_numeric = example_data[[\"num_small\", \"num_large\"]]\n",
"example_data_categorical = example_data[[\"animal\", \"letter\"]]\n",
"example_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num_small \n",
" num_large \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" NaN \n",
" -46.34 \n",
" \n",
" \n",
" 1 \n",
" -0.14 \n",
" -46.57 \n",
" \n",
" \n",
" 2 \n",
" 0.65 \n",
" 24.20 \n",
" \n",
" \n",
" 3 \n",
" 1.52 \n",
" -191.33 \n",
" \n",
" \n",
" 4 \n",
" -0.23 \n",
" -172.49 \n",
" \n",
" \n",
" 5 \n",
" -0.23 \n",
" -56.23 \n",
" \n",
" \n",
" 6 \n",
" 1.58 \n",
" -101.28 \n",
" \n",
" \n",
" 7 \n",
" 0.77 \n",
" 31.42 \n",
" \n",
" \n",
" 8 \n",
" -0.47 \n",
" -90.80 \n",
" \n",
" \n",
" 9 \n",
" 0.54 \n",
" -141.23 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num_small num_large\n",
"0 NaN -46.34\n",
"1 -0.14 -46.57\n",
"2 0.65 24.20\n",
"3 1.52 -191.33\n",
"4 -0.23 -172.49\n",
"5 -0.23 -56.23\n",
"6 1.58 -101.28\n",
"7 0.77 31.42\n",
"8 -0.47 -90.80\n",
"9 0.54 -141.23"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"example_data_numeric"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 5.4000e-01, -4.6340e+01],\n",
" [-1.4000e-01, -4.6570e+01],\n",
" [ 6.5000e-01, 2.4200e+01],\n",
" [ 1.5200e+00, -1.9133e+02],\n",
" [-2.3000e-01, -1.7249e+02],\n",
" [-2.3000e-01, -5.6230e+01],\n",
" [ 1.5800e+00, -1.0128e+02],\n",
" [ 7.7000e-01, 3.1420e+01],\n",
" [-4.7000e-01, -9.0800e+01],\n",
" [ 5.4000e-01, -1.4123e+02]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_imp = SimpleImputer(strategy=\"median\")\n",
"num_imp.fit(example_data_numeric)\n",
"num_imp.transform(example_data_numeric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scaling"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num_small \n",
" num_large \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" NaN \n",
" -46.34 \n",
" \n",
" \n",
" 1 \n",
" -0.14 \n",
" -46.57 \n",
" \n",
" \n",
" 2 \n",
" 0.65 \n",
" 24.20 \n",
" \n",
" \n",
" 3 \n",
" 1.52 \n",
" -191.33 \n",
" \n",
" \n",
" 4 \n",
" -0.23 \n",
" -172.49 \n",
" \n",
" \n",
" 5 \n",
" -0.23 \n",
" -56.23 \n",
" \n",
" \n",
" 6 \n",
" 1.58 \n",
" -101.28 \n",
" \n",
" \n",
" 7 \n",
" 0.77 \n",
" 31.42 \n",
" \n",
" \n",
" 8 \n",
" -0.47 \n",
" -90.80 \n",
" \n",
" \n",
" 9 \n",
" 0.54 \n",
" -141.23 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num_small num_large\n",
"0 NaN -46.34\n",
"1 -0.14 -46.57\n",
"2 0.65 24.20\n",
"3 1.52 -191.33\n",
"4 -0.23 -172.49\n",
"5 -0.23 -56.23\n",
"6 1.58 -101.28\n",
"7 0.77 31.42\n",
"8 -0.47 -90.80\n",
"9 0.54 -141.23"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"example_data_numeric"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ nan, 0.45669399],\n",
" [-0.80797156, 0.45348423],\n",
" [ 0.28625278, 1.44111551],\n",
" [ 1.49128465, -1.56671508],\n",
" [-0.93263003, -1.30379332],\n",
" [-0.93263003, 0.31867402],\n",
" [ 1.5743903 , -0.31002161],\n",
" [ 0.45246407, 1.54187428],\n",
" [-1.26505261, -0.16376788],\n",
" [ 0.13389243, -0.86754414]])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler = StandardScaler()\n",
"scaler.fit(example_data_numeric)\n",
"scaler.transform(example_data_numeric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Categorical Encoding"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" animal \n",
" letter \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" dog \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" dog \n",
" X \n",
" \n",
" \n",
" 2 \n",
" cat \n",
" NaN \n",
" \n",
" \n",
" 3 \n",
" dog \n",
" Z \n",
" \n",
" \n",
" 4 \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 5 \n",
" dog \n",
" X \n",
" \n",
" \n",
" 6 \n",
" cat \n",
" Z \n",
" \n",
" \n",
" 7 \n",
" dog \n",
" Y \n",
" \n",
" \n",
" 8 \n",
" NaN \n",
" X \n",
" \n",
" \n",
" 9 \n",
" cat \n",
" Y \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" animal letter\n",
"0 dog NaN\n",
"1 dog X\n",
"2 cat NaN\n",
"3 dog Z\n",
"4 NaN NaN\n",
"5 dog X\n",
"6 cat Z\n",
"7 dog Y\n",
"8 NaN X\n",
"9 cat Y"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"example_data_categorical"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 1., 0., 0., 0., 0., 1.],\n",
" [0., 1., 0., 1., 0., 0., 0.],\n",
" [1., 0., 0., 0., 0., 0., 1.],\n",
" [0., 1., 0., 0., 0., 1., 0.],\n",
" [0., 0., 1., 0., 0., 0., 1.],\n",
" [0., 1., 0., 1., 0., 0., 0.],\n",
" [1., 0., 0., 0., 0., 1., 0.],\n",
" [0., 1., 0., 0., 1., 0., 0.],\n",
" [0., 0., 1., 1., 0., 0., 0.],\n",
" [1., 0., 0., 0., 1., 0., 0.]])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"one_hot = OneHotEncoder()\n",
"one_hot.fit(example_data_categorical)\n",
"one_hot.fit_transform(example_data_categorical).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 0., 0., 0., 1.],\n",
" [1., 0., 0., 0., 0.],\n",
" [0., 0., 0., 0., 1.],\n",
" [1., 0., 0., 1., 0.],\n",
" [0., 1., 0., 0., 1.],\n",
" [1., 0., 0., 0., 0.],\n",
" [0., 0., 0., 1., 0.],\n",
" [1., 0., 1., 0., 0.],\n",
" [0., 1., 0., 0., 0.],\n",
" [0., 0., 1., 0., 0.]])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dummy = OneHotEncoder(drop=\"first\")\n",
"dummy.fit(example_data_categorical)\n",
"dummy.fit_transform(example_data_categorical).toarray()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# note numeric and categorical features\n",
"numeric_features = [\"num_small\", \"num_large\"]\n",
"categorical_features = [\"animal\", \"letter\"]\n",
"\n",
"# define preprocessing for numeric features\n",
"numeric_transformer = Pipeline(\n",
" steps=[\n",
" (\"imputer\", SimpleImputer(strategy=\"mean\")),\n",
" (\"scaler\", StandardScaler()),\n",
" ]\n",
")\n",
"\n",
"# define preprocessing for categorical features\n",
"categorical_transformer = Pipeline(\n",
" steps=[\n",
" (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
" (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\", drop=\"first\")),\n",
" ]\n",
")\n",
"\n",
"# combine preprocessing steps\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" (\"num\", numeric_transformer, numeric_features),\n",
" (\"cat\", categorical_transformer, categorical_features),\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer', SimpleImputer()),\n",
" ('scaler', StandardScaler())]),\n",
" ['num_small', 'num_large']),\n",
" ('cat',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehot',\n",
" OneHotEncoder(drop='first',\n",
" handle_unknown='ignore'))]),\n",
" ['animal', 'letter'])]) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. ColumnTransformer?Documentation for ColumnTransformer iFitted ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer', SimpleImputer()),\n",
" ('scaler', StandardScaler())]),\n",
" ['num_small', 'num_large']),\n",
" ('cat',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehot',\n",
" OneHotEncoder(drop='first',\n",
" handle_unknown='ignore'))]),\n",
" ['animal', 'letter'])]) "
],
"text/plain": [
"ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer', SimpleImputer()),\n",
" ('scaler', StandardScaler())]),\n",
" ['num_small', 'num_large']),\n",
" ('cat',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehot',\n",
" OneHotEncoder(drop='first',\n",
" handle_unknown='ignore'))]),\n",
" ['animal', 'letter'])])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessor.fit(example_data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , 0.45669399, 1. , 0. , 0. ],\n",
" [-0.85167681, 0.45348423, 1. , 0. , 0. ],\n",
" [ 0.30173693, 1.44111551, 0. , 0. , 0. ],\n",
" [ 1.57195205, -1.56671508, 1. , 0. , 1. ],\n",
" [-0.98307837, -1.30379332, 1. , 0. , 0. ],\n",
" [-0.98307837, 0.31867402, 1. , 0. , 0. ],\n",
" [ 1.65955309, -0.31002161, 0. , 0. , 1. ],\n",
" [ 0.47693901, 1.54187428, 1. , 1. , 0. ],\n",
" [-1.33348254, -0.16376788, 1. , 0. , 0. ],\n",
" [ 0.14113501, -0.86754414, 0. , 1. , 0. ]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessor.transform(example_data)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num__num_small \n",
" num__num_large \n",
" cat__animal_dog \n",
" cat__letter_Y \n",
" cat__letter_Z \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.000000 \n",
" 0.456694 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" -0.851677 \n",
" 0.453484 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 0.301737 \n",
" 1.441116 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 1.571952 \n",
" -1.566715 \n",
" 1.0 \n",
" 0.0 \n",
" 1.0 \n",
" \n",
" \n",
" 4 \n",
" -0.983078 \n",
" -1.303793 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 5 \n",
" -0.983078 \n",
" 0.318674 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 6 \n",
" 1.659553 \n",
" -0.310022 \n",
" 0.0 \n",
" 0.0 \n",
" 1.0 \n",
" \n",
" \n",
" 7 \n",
" 0.476939 \n",
" 1.541874 \n",
" 1.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" 8 \n",
" -1.333483 \n",
" -0.163768 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 9 \n",
" 0.141135 \n",
" -0.867544 \n",
" 0.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num__num_small num__num_large cat__animal_dog cat__letter_Y \\\n",
"0 0.000000 0.456694 1.0 0.0 \n",
"1 -0.851677 0.453484 1.0 0.0 \n",
"2 0.301737 1.441116 0.0 0.0 \n",
"3 1.571952 -1.566715 1.0 0.0 \n",
"4 -0.983078 -1.303793 1.0 0.0 \n",
"5 -0.983078 0.318674 1.0 0.0 \n",
"6 1.659553 -0.310022 0.0 0.0 \n",
"7 0.476939 1.541874 1.0 1.0 \n",
"8 -1.333483 -0.163768 1.0 0.0 \n",
"9 0.141135 -0.867544 0.0 1.0 \n",
"\n",
" cat__letter_Z \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 1.0 \n",
"4 0.0 \n",
"5 0.0 \n",
"6 1.0 \n",
"7 0.0 \n",
"8 0.0 \n",
"9 0.0 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(\n",
" preprocessor.transform(example_data),\n",
" columns=preprocessor.get_feature_names_out(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"new_data = simulate_classification_data(n_samples=20, seed=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1.71795378, -0.43255096, 0. , 1. , 0. ],\n",
" [-1.53788497, 2.7008792 , 1. , 0. , 0. ],\n",
" [-1.42108358, 2.36162081, 1. , 0. , 0. ],\n",
" [-2.20949297, 1.80465649, 1. , 0. , 0. ],\n",
" [ 0. , 2.36064392, 1. , 0. , 1. ],\n",
" [-4.00531435, 0.14925416, 1. , 0. , 0. ],\n",
" [ 1.89315587, 0.93187903, 1. , 1. , 0. ],\n",
" [-1.75688758, -0.2025642 , 1. , 0. , 0. ],\n",
" [-0.18006881, 0.72952417, 1. , 1. , 0. ],\n",
" [-1.01227872, 1.84359236, 1. , 0. , 0. ],\n",
" [ 1.484351 , 0.13808975, 1. , 0. , 0. ],\n",
" [-3.65491017, 0.54963767, 0. , 0. , 1. ],\n",
" [-1.11447993, 0.14436973, 1. , 0. , 0. ],\n",
" [-1.20208098, -0.0761273 , 1. , 1. , 0. ],\n",
" [ 1.00254527, 0.16669854, 0. , 0. , 1. ],\n",
" [ 0. , 1.08566873, 0. , 0. , 0. ],\n",
" [-0.89547733, -0.45585666, 0. , 1. , 0. ],\n",
" [-1.93208967, 1.43050933, 1. , 1. , 0. ],\n",
" [-0.58887368, 3.41972741, 1. , 0. , 0. ],\n",
" [ 0.19953571, 2.1388909 , 1. , 0. , 1. ]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessor.transform(new_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}