# Preprocessing

In [1]:
# basics
import numpy as np
import pandas as pd

# machine learning
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# helper function to simulate data with a mix of numeric and categorical
# features, including missing values
def simulate_classification_data(n_samples=100, seed=42):
    """Simulate a small feature table with numeric and categorical columns.

    Parameters
    ----------
    n_samples : int, default=100
        Number of rows to generate.
    seed : int, default=42
        Seed for the random number generator used for every draw.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_small`` and ``num_large`` (normal draws rounded to two
        decimals, the latter multiplied by 100), ``animal`` ("cat"/"dog") and
        ``letter`` ("X"/"Y"/"Z"). A sampled 10% of ``num_small``, 20% of
        ``animal`` and 30% of ``letter`` is replaced by NaN.
    """
    # Use a private legacy RandomState rather than np.random.seed(): the
    # module-level np.random functions are backed by a global RandomState, so
    # seeding a private instance is expected to give the same stream for a
    # given seed without reseeding NumPy's global generator as a side effect.
    rng = np.random.RandomState(seed)
    numeric_features = pd.DataFrame(
        {
            "num_small": np.round(rng.randn(n_samples), 2),
            "num_large": np.round(100 * rng.randn(n_samples), 2),
        }
    )
    categorical_features = pd.DataFrame(
        {
            "animal": rng.choice(["cat", "dog"], size=n_samples),
            "letter": rng.choice(["X", "Y", "Z"], size=n_samples),
        }
    )
    df = pd.concat([numeric_features, categorical_features], axis=1)
    # blank out a sampled fraction of each column, drawing the rows from the
    # same generator so the missingness pattern is reproducible as well
    df.loc[df.sample(frac=0.1, random_state=rng).index, "num_small"] = np.nan
    df.loc[df.sample(frac=0.2, random_state=rng).index, "animal"] = np.nan
    df.loc[df.sample(frac=0.3, random_state=rng).index, "letter"] = np.nan
    return df

In [3]:
# simulate a ten-row example, then split it into its numeric and
# categorical column groups for the demonstrations that follow
example_data = simulate_classification_data(n_samples=10)
example_data_numeric = example_data.loc[:, ["num_small", "num_large"]]
example_data_categorical = example_data.loc[:, ["animal", "letter"]]
example_data

Unnamed: 0,num_small,num_large,animal,letter
0,,-46.34,dog,
1,-0.14,-46.57,dog,X
2,0.65,24.2,cat,
3,1.52,-191.33,dog,Z
4,-0.23,-172.49,,
5,-0.23,-56.23,dog,X
6,1.58,-101.28,cat,Z
7,0.77,31.42,dog,Y
8,-0.47,-90.8,,X
9,0.54,-141.23,cat,Y


## Imputation

In [4]:
# numeric columns before imputation; num_small has a missing value
example_data_numeric

Unnamed: 0,num_small,num_large
0,,-46.34
1,-0.14,-46.57
2,0.65,24.2
3,1.52,-191.33
4,-0.23,-172.49
5,-0.23,-56.23
6,1.58,-101.28
7,0.77,31.42
8,-0.47,-90.8
9,0.54,-141.23


In [5]:
# learn each column's median, then fill the missing entries with it
num_imp = SimpleImputer(strategy="median").fit(example_data_numeric)
num_imp.transform(example_data_numeric)

array([[ 5.4000e-01, -4.6340e+01],
       [-1.4000e-01, -4.6570e+01],
       [ 6.5000e-01,  2.4200e+01],
       [ 1.5200e+00, -1.9133e+02],
       [-2.3000e-01, -1.7249e+02],
       [-2.3000e-01, -5.6230e+01],
       [ 1.5800e+00, -1.0128e+02],
       [ 7.7000e-01,  3.1420e+01],
       [-4.7000e-01, -9.0800e+01],
       [ 5.4000e-01, -1.4123e+02]])

## Scaling

In [6]:
# numeric columns before scaling; num_small and num_large differ in magnitude
example_data_numeric

Unnamed: 0,num_small,num_large
0,,-46.34
1,-0.14,-46.57
2,0.65,24.2
3,1.52,-191.33
4,-0.23,-172.49
5,-0.23,-56.23
6,1.58,-101.28
7,0.77,31.42
8,-0.47,-90.8
9,0.54,-141.23


In [7]:
# standardize each column; the scaler does not impute, so the missing
# num_small entry comes back as NaN
scaler = StandardScaler().fit(example_data_numeric)
scaler.transform(example_data_numeric)

array([[        nan,  0.45669399],
       [-0.80797156,  0.45348423],
       [ 0.28625278,  1.44111551],
       [ 1.49128465, -1.56671508],
       [-0.93263003, -1.30379332],
       [-0.93263003,  0.31867402],
       [ 1.5743903 , -0.31002161],
       [ 0.45246407,  1.54187428],
       [-1.26505261, -0.16376788],
       [ 0.13389243, -0.86754414]])

## Categorical Encoding

In [8]:
# categorical columns before encoding; both contain missing values
example_data_categorical

Unnamed: 0,animal,letter
0,dog,
1,dog,X
2,cat,
3,dog,Z
4,,
5,dog,X
6,cat,Z
7,dog,Y
8,,X
9,cat,Y


In [9]:
# one-hot encode both columns: one indicator column per observed category,
# with missing values receiving an indicator column of their own
one_hot = OneHotEncoder()
one_hot.fit(example_data_categorical)
# the encoder is already fitted above, so transform alone is enough here
# (fit_transform would needlessly fit a second time)
one_hot.transform(example_data_categorical).toarray()

array([[0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0.]])

In [10]:
dummy = OneHotEncoder(drop="first")
dummy.fit(example_data_categorical)
dummy.fit_transform(example_data_categorical).toarray()

array([[1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])

## Pipeline

In [11]:
# column groups, by feature type
numeric_features = ["num_small", "num_large"]
categorical_features = ["animal", "letter"]

# numeric columns: fill gaps with the column mean, then standardize
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# categorical columns: fill gaps with the most frequent category, then
# dummy-encode
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit should encode as all zeros, i.e. the same as the dropped
# first category; confirm against the installed scikit-learn version
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

# route each column group through its own pipeline and concatenate the results
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [12]:
# fit every step of both pipelines on the example data
preprocessor.fit(example_data)

In [13]:
# apply the fitted preprocessing to the same data it was fitted on
preprocessor.transform(example_data)

array([[ 0.        ,  0.45669399,  1.        ,  0.        ,  0.        ],
       [-0.85167681,  0.45348423,  1.        ,  0.        ,  0.        ],
       [ 0.30173693,  1.44111551,  0.        ,  0.        ,  0.        ],
       [ 1.57195205, -1.56671508,  1.        ,  0.        ,  1.        ],
       [-0.98307837, -1.30379332,  1.        ,  0.        ,  0.        ],
       [-0.98307837,  0.31867402,  1.        ,  0.        ,  0.        ],
       [ 1.65955309, -0.31002161,  0.        ,  0.        ,  1.        ],
       [ 0.47693901,  1.54187428,  1.        ,  1.        ,  0.        ],
       [-1.33348254, -0.16376788,  1.        ,  0.        ,  0.        ],
       [ 0.14113501, -0.86754414,  0.        ,  1.        ,  0.        ]])

In [14]:
# wrap the transformed array in a DataFrame labelled with the generated
# feature names; reuse the input's index so rows stay aligned with
# example_data even when it does not carry a default 0..n-1 index
pd.DataFrame(
    preprocessor.transform(example_data),
    columns=preprocessor.get_feature_names_out(),
    index=example_data.index,
)

Unnamed: 0,num__num_small,num__num_large,cat__animal_dog,cat__letter_Y,cat__letter_Z
0,0.0,0.456694,1.0,0.0,0.0
1,-0.851677,0.453484,1.0,0.0,0.0
2,0.301737,1.441116,0.0,0.0,0.0
3,1.571952,-1.566715,1.0,0.0,1.0
4,-0.983078,-1.303793,1.0,0.0,0.0
5,-0.983078,0.318674,1.0,0.0,0.0
6,1.659553,-0.310022,0.0,0.0,1.0
7,0.476939,1.541874,1.0,1.0,0.0
8,-1.333483,-0.163768,1.0,0.0,0.0
9,0.141135,-0.867544,0.0,1.0,0.0


In [15]:
# simulate a separate, larger data set using a different seed
new_data = simulate_classification_data(n_samples=20, seed=1)

In [16]:
# transform only: the preprocessor is not refit on new_data here
preprocessor.transform(new_data)

array([[ 1.71795378, -0.43255096,  0.        ,  1.        ,  0.        ],
       [-1.53788497,  2.7008792 ,  1.        ,  0.        ,  0.        ],
       [-1.42108358,  2.36162081,  1.        ,  0.        ,  0.        ],
       [-2.20949297,  1.80465649,  1.        ,  0.        ,  0.        ],
       [ 0.        ,  2.36064392,  1.        ,  0.        ,  1.        ],
       [-4.00531435,  0.14925416,  1.        ,  0.        ,  0.        ],
       [ 1.89315587,  0.93187903,  1.        ,  1.        ,  0.        ],
       [-1.75688758, -0.2025642 ,  1.        ,  0.        ,  0.        ],
       [-0.18006881,  0.72952417,  1.        ,  1.        ,  0.        ],
       [-1.01227872,  1.84359236,  1.        ,  0.        ,  0.        ],
       [ 1.484351  ,  0.13808975,  1.        ,  0.        ,  0.        ],
       [-3.65491017,  0.54963767,  0.        ,  0.        ,  1.        ],
       [-1.11447993,  0.14436973,  1.        ,  0.        ,  0.        ],
       [-1.20208098, -0.0761273 ,  1. 