Estimators, Preprocessing & Feature Engineering
scikit-learn: Estimators, Preprocessing & Feature Engineering scikit-learn is Python's standard machine learning library. All objects follow a consistent API: e…
scikit-learn: Estimators, Preprocessing & Feature Engineering
scikit-learn is Python's standard machine learning library. All objects follow a consistent API: every estimator implements fit(), transformers add transform() (with fit_transform() as a shortcut for doing both steps at once), and predictors add predict(). This uniformity makes it easy to chain objects into pipelines.
Data Splitting
import numpy as np
from sklearn.model_selection import train_test_split

# load_data() supplies the features (X) and the labels (y)
X, y = load_data()

# Two-way split: hold out 20% of the rows as the test set.
# stratify=y preserves the class distribution (for classification).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Three-way split, done in two passes.
# Pass 1: set aside 20% of all rows for testing.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Pass 2: 25% of the remaining 80% becomes validation (0.25 * 0.8 = 0.2 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)
# Result: 60% train / 20% val / 20% test
Preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)

# Numerical scaling
# StandardScaler applies a z-score: zero mean, unit variance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on the training split only!
X_test_scaled = scaler.transform(X_test)  # the test split is only transformed

# Alternative scalers
MinMaxScaler()  # scale to [0, 1]
RobustScaler()  # uses median/IQR, so it is robust to outliers
Normalizer(norm='l2')  # normalize each sample to unit norm

# Categorical encoding
# One-hot encoding creates one 0/1 column per category.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = enc.fit_transform(X_categorical)

# Alternative encoders
OrdinalEncoder()  # encode categories as integers (for ordinal data)
LabelEncoder()  # encode target labels (y), not features

# Missing values
# Two alternatives; the second assignment replaces the first.
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent', 'constant'
imputer = KNNImputer(n_neighbors=5)  # fill using nearest neighbors

# Feature engineering
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)  # adds x1^2, x2^2, x1*x2, etc.
PowerTransformer(method='yeo-johnson') # makes distribution more Gaussian
Feature Selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    VarianceThreshold,
    chi2,
    f_classif,
    mutual_info_classif,
)
from sklearn.linear_model import Lasso

# Every selector below is fitted on the training split only and then applied
# to the full feature matrix. Fitting on X / y directly would let the test
# rows influence which features are kept (data leakage).

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)
X_selected = selector.transform(X)

# Statistical tests (for classification)
# The three assignments are alternatives; only the last one is in effect.
selector = SelectKBest(score_func=f_classif, k=10)  # ANOVA F-value
selector = SelectKBest(score_func=chi2, k=10)  # Chi-squared (non-negative features)
selector = SelectKBest(score_func=mutual_info_classif, k=10)  # mutual information
selector.fit(X_train, y_train)
X_selected = selector.transform(X)

# Recursive Feature Elimination
# random_state is fixed so the selected features are reproducible.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)
rfe.fit(X_train, y_train)
X_selected = rfe.transform(X)

# L1-based selection (Lasso zeroes out unimportant features)
# NOTE: Lasso is a regression model, so y_train is treated as a numeric
# target here. For class labels an L1-penalised classifier is the usual choice.
lasso = Lasso(alpha=0.01)
selector = SelectFromModel(lasso)
selector.fit(X_train, y_train)
X_selected = selector.transform(X)

# Feature importance from tree-based models
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)[::-1]  # feature indices, most important first