{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basics\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# machine learning\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# helper function to simulate with a mix of numeric and categorical features\n", "# also include missing data\n", "def simulate_classification_data(n_samples=100, seed=42):\n", " np.random.seed(seed)\n", " numeric_features = pd.DataFrame(\n", " {\n", " \"num_small\": np.round(np.random.randn(n_samples), 2),\n", " \"num_large\": np.round(100 * np.random.randn(n_samples), 2),\n", " }\n", " )\n", " categorical_features = pd.DataFrame(\n", " {\n", " \"animal\": np.random.choice([\"cat\", \"dog\"], size=n_samples),\n", " \"letter\": np.random.choice([\"X\", \"Y\", \"Z\"], size=n_samples),\n", " }\n", " )\n", " df = pd.concat([numeric_features, categorical_features], axis=1)\n", " df.loc[df.sample(frac=0.1).index, \"num_small\"] = np.nan\n", " df.loc[df.sample(frac=0.2).index, \"animal\"] = np.nan\n", " df.loc[df.sample(frac=0.3).index, \"letter\"] = np.nan\n", " return df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num_smallnum_largeanimalletter
0NaN-46.34dogNaN
1-0.14-46.57dogX
20.6524.20catNaN
31.52-191.33dogZ
4-0.23-172.49NaNNaN
5-0.23-56.23dogX
61.58-101.28catZ
70.7731.42dogY
8-0.47-90.80NaNX
90.54-141.23catY
\n", "
" ], "text/plain": [ " num_small num_large animal letter\n", "0 NaN -46.34 dog NaN\n", "1 -0.14 -46.57 dog X\n", "2 0.65 24.20 cat NaN\n", "3 1.52 -191.33 dog Z\n", "4 -0.23 -172.49 NaN NaN\n", "5 -0.23 -56.23 dog X\n", "6 1.58 -101.28 cat Z\n", "7 0.77 31.42 dog Y\n", "8 -0.47 -90.80 NaN X\n", "9 0.54 -141.23 cat Y" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_data = simulate_classification_data(n_samples=10)\n", "example_data_numeric = example_data[[\"num_small\", \"num_large\"]]\n", "example_data_categorical = example_data[[\"animal\", \"letter\"]]\n", "example_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputation" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num_smallnum_large
0NaN-46.34
1-0.14-46.57
20.6524.20
31.52-191.33
4-0.23-172.49
5-0.23-56.23
61.58-101.28
70.7731.42
8-0.47-90.80
90.54-141.23
\n", "
" ], "text/plain": [ " num_small num_large\n", "0 NaN -46.34\n", "1 -0.14 -46.57\n", "2 0.65 24.20\n", "3 1.52 -191.33\n", "4 -0.23 -172.49\n", "5 -0.23 -56.23\n", "6 1.58 -101.28\n", "7 0.77 31.42\n", "8 -0.47 -90.80\n", "9 0.54 -141.23" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_data_numeric" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 5.4000e-01, -4.6340e+01],\n", " [-1.4000e-01, -4.6570e+01],\n", " [ 6.5000e-01, 2.4200e+01],\n", " [ 1.5200e+00, -1.9133e+02],\n", " [-2.3000e-01, -1.7249e+02],\n", " [-2.3000e-01, -5.6230e+01],\n", " [ 1.5800e+00, -1.0128e+02],\n", " [ 7.7000e-01, 3.1420e+01],\n", " [-4.7000e-01, -9.0800e+01],\n", " [ 5.4000e-01, -1.4123e+02]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "num_imp = SimpleImputer(strategy=\"median\")\n", "num_imp.fit(example_data_numeric)\n", "num_imp.transform(example_data_numeric)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scaling" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num_smallnum_large
0NaN-46.34
1-0.14-46.57
20.6524.20
31.52-191.33
4-0.23-172.49
5-0.23-56.23
61.58-101.28
70.7731.42
8-0.47-90.80
90.54-141.23
\n", "
" ], "text/plain": [ " num_small num_large\n", "0 NaN -46.34\n", "1 -0.14 -46.57\n", "2 0.65 24.20\n", "3 1.52 -191.33\n", "4 -0.23 -172.49\n", "5 -0.23 -56.23\n", "6 1.58 -101.28\n", "7 0.77 31.42\n", "8 -0.47 -90.80\n", "9 0.54 -141.23" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_data_numeric" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ nan, 0.45669399],\n", " [-0.80797156, 0.45348423],\n", " [ 0.28625278, 1.44111551],\n", " [ 1.49128465, -1.56671508],\n", " [-0.93263003, -1.30379332],\n", " [-0.93263003, 0.31867402],\n", " [ 1.5743903 , -0.31002161],\n", " [ 0.45246407, 1.54187428],\n", " [-1.26505261, -0.16376788],\n", " [ 0.13389243, -0.86754414]])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scaler = StandardScaler()\n", "scaler.fit(example_data_numeric)\n", "scaler.transform(example_data_numeric)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Categorical Encoding" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
animalletter
0dogNaN
1dogX
2catNaN
3dogZ
4NaNNaN
5dogX
6catZ
7dogY
8NaNX
9catY
\n", "
" ], "text/plain": [ " animal letter\n", "0 dog NaN\n", "1 dog X\n", "2 cat NaN\n", "3 dog Z\n", "4 NaN NaN\n", "5 dog X\n", "6 cat Z\n", "7 dog Y\n", "8 NaN X\n", "9 cat Y" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_data_categorical" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 1., 0., 0., 0., 0., 1.],\n", " [0., 1., 0., 1., 0., 0., 0.],\n", " [1., 0., 0., 0., 0., 0., 1.],\n", " [0., 1., 0., 0., 0., 1., 0.],\n", " [0., 0., 1., 0., 0., 0., 1.],\n", " [0., 1., 0., 1., 0., 0., 0.],\n", " [1., 0., 0., 0., 0., 1., 0.],\n", " [0., 1., 0., 0., 1., 0., 0.],\n", " [0., 0., 1., 1., 0., 0., 0.],\n", " [1., 0., 0., 0., 1., 0., 0.]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "one_hot = OneHotEncoder()\n", "one_hot.fit(example_data_categorical)\n", "one_hot.fit_transform(example_data_categorical).toarray()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1., 0., 0., 0., 1.],\n", " [1., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 1.],\n", " [1., 0., 0., 1., 0.],\n", " [0., 1., 0., 0., 1.],\n", " [1., 0., 0., 0., 0.],\n", " [0., 0., 0., 1., 0.],\n", " [1., 0., 1., 0., 0.],\n", " [0., 1., 0., 0., 0.],\n", " [0., 0., 1., 0., 0.]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummy = OneHotEncoder(drop=\"first\")\n", "dummy.fit(example_data_categorical)\n", "dummy.fit_transform(example_data_categorical).toarray()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pipeline" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# note numeric and categorical features\n", "numeric_features = [\"num_small\", \"num_large\"]\n", "categorical_features = [\"animal\", \"letter\"]\n", "\n", "# define preprocessing for numeric features\n", "numeric_transformer = Pipeline(\n", " steps=[\n", " (\"imputer\", SimpleImputer(strategy=\"mean\")),\n", " (\"scaler\", StandardScaler()),\n", " ]\n", ")\n", "\n", "# define preprocessing for categorical features\n", "categorical_transformer = Pipeline(\n", " steps=[\n", " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\", drop=\"first\")),\n", " ]\n", ")\n", "\n", "# combine preprocessing steps\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " (\"num\", numeric_transformer, numeric_features),\n", " (\"cat\", categorical_transformer, categorical_features),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('imputer', SimpleImputer()),\n", " ('scaler', StandardScaler())]),\n", " ['num_small', 'num_large']),\n", " ('cat',\n", " Pipeline(steps=[('imputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehot',\n", " OneHotEncoder(drop='first',\n", " handle_unknown='ignore'))]),\n", " ['animal', 'letter'])])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor.fit(example_data)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0. , 0.45669399, 1. , 0. , 0. ],\n", " [-0.85167681, 0.45348423, 1. , 0. , 0. ],\n", " [ 0.30173693, 1.44111551, 0. , 0. , 0. ],\n", " [ 1.57195205, -1.56671508, 1. , 0. , 1. ],\n", " [-0.98307837, -1.30379332, 1. , 0. , 0. ],\n", " [-0.98307837, 0.31867402, 1. , 0. , 0. ],\n", " [ 1.65955309, -0.31002161, 0. , 0. , 1. ],\n", " [ 0.47693901, 1.54187428, 1. , 1. , 0. ],\n", " [-1.33348254, -0.16376788, 1. , 0. , 0. ],\n", " [ 0.14113501, -0.86754414, 0. , 1. , 0. ]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor.transform(example_data)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__num_smallnum__num_largecat__animal_dogcat__letter_Ycat__letter_Z
00.0000000.4566941.00.00.0
1-0.8516770.4534841.00.00.0
20.3017371.4411160.00.00.0
31.571952-1.5667151.00.01.0
4-0.983078-1.3037931.00.00.0
5-0.9830780.3186741.00.00.0
61.659553-0.3100220.00.01.0
70.4769391.5418741.01.00.0
8-1.333483-0.1637681.00.00.0
90.141135-0.8675440.01.00.0
\n", "
" ], "text/plain": [ " num__num_small num__num_large cat__animal_dog cat__letter_Y \\\n", "0 0.000000 0.456694 1.0 0.0 \n", "1 -0.851677 0.453484 1.0 0.0 \n", "2 0.301737 1.441116 0.0 0.0 \n", "3 1.571952 -1.566715 1.0 0.0 \n", "4 -0.983078 -1.303793 1.0 0.0 \n", "5 -0.983078 0.318674 1.0 0.0 \n", "6 1.659553 -0.310022 0.0 0.0 \n", "7 0.476939 1.541874 1.0 1.0 \n", "8 -1.333483 -0.163768 1.0 0.0 \n", "9 0.141135 -0.867544 0.0 1.0 \n", "\n", " cat__letter_Z \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 1.0 \n", "4 0.0 \n", "5 0.0 \n", "6 1.0 \n", "7 0.0 \n", "8 0.0 \n", "9 0.0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(\n", " preprocessor.transform(example_data),\n", " columns=preprocessor.get_feature_names_out(),\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "new_data = simulate_classification_data(n_samples=20, seed=1)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.71795378, -0.43255096, 0. , 1. , 0. ],\n", " [-1.53788497, 2.7008792 , 1. , 0. , 0. ],\n", " [-1.42108358, 2.36162081, 1. , 0. , 0. ],\n", " [-2.20949297, 1.80465649, 1. , 0. , 0. ],\n", " [ 0. , 2.36064392, 1. , 0. , 1. ],\n", " [-4.00531435, 0.14925416, 1. , 0. , 0. ],\n", " [ 1.89315587, 0.93187903, 1. , 1. , 0. ],\n", " [-1.75688758, -0.2025642 , 1. , 0. , 0. ],\n", " [-0.18006881, 0.72952417, 1. , 1. , 0. ],\n", " [-1.01227872, 1.84359236, 1. , 0. , 0. ],\n", " [ 1.484351 , 0.13808975, 1. , 0. , 0. ],\n", " [-3.65491017, 0.54963767, 0. , 0. , 1. ],\n", " [-1.11447993, 0.14436973, 1. , 0. , 0. ],\n", " [-1.20208098, -0.0761273 , 1. , 1. , 0. ],\n", " [ 1.00254527, 0.16669854, 0. , 0. , 1. ],\n", " [ 0. , 1.08566873, 0. , 0. , 0. ],\n", " [-0.89547733, -0.45585666, 0. , 1. , 0. ],\n", " [-1.93208967, 1.43050933, 1. , 1. , 0. ],\n", " [-0.58887368, 3.41972741, 1. , 0. , 0. ],\n", " [ 0.19953571, 2.1388909 , 1. , 0. , 1. ]])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor.transform(new_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }