datarekha

scikit-learn cheat sheet

The estimator API, preprocessing, pipelines, model selection, and metrics — the sklearn workflow end to end.

The estimator API

Every sklearn estimator follows the same four-step contract: instantiate, fit, predict, score.

from sklearn.linear_model import LinearRegression

# The same four steps apply to any estimator. X_train / X_test / y_train / y_test
# are assumed to come from an earlier train/test split.
model = LinearRegression()          # 1. instantiate (no data yet)
model.fit(X_train, y_train)         # 2. learn from training data
preds = model.predict(X_test)       # 3. apply to new data
score = model.score(X_test, y_test) # 4. R² for regressors, accuracy for classifiers

Transformers add transform and fit_transform:

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split, then reuse the same
# fitted scaler on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit ON train, then transform
X_test_scaled  = scaler.transform(X_test)        # transform ONLY — never fit on test

Rule: fit transformers on training data only. Fitting on test data leaks distribution information and inflates scores.

Train / test split

from sklearn.model_selection import train_test_split

# Hold out the test set once, up front; everything fitted afterwards should
# see only the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,       # 20 % held out
    random_state=42,     # reproducibility
    stratify=y,          # preserve class proportions — always use for classification
)

For regression, drop stratify. For time series, never shuffle: hold out the most recent rows with train_test_split(..., shuffle=False), and use TimeSeriesSplit as the cross-validation splitter.

Preprocessing

Scaling

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Each scaler below is fitted on the training split only; call .transform()
# on the same fitted object for the test split instead of fitting again.

# StandardScaler: zero mean, unit variance — default choice for most models
sc = StandardScaler()
X_scaled = sc.fit_transform(X_train)

# MinMaxScaler: maps to [0, 1] — use when you need bounded range (e.g. neural net inputs)
mm = MinMaxScaler()
X_mm = mm.fit_transform(X_train)

# RobustScaler: uses median and IQR — better when outliers are present
rb = RobustScaler()
X_rb = rb.fit_transform(X_train)

Encoding categoricals

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# OneHotEncoder: creates a binary column per category — for nominal features
# sparse_output=False returns a dense array; handle_unknown="ignore" encodes a
# category not seen during fit as all zeros in that feature's columns instead
# of raising an error.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_ohe = ohe.fit_transform(X_cat_train)

# OrdinalEncoder: maps categories to integers 0, 1, 2 … — for ordinal features
# `categories` fixes the order explicitly (one inner list per column):
# low -> 0, medium -> 1, high -> 2.
oe = OrdinalEncoder(categories=[["low", "medium", "high"]])
X_oe = oe.fit_transform(X_ord_train)

Imputing missing values

from sklearn.impute import SimpleImputer, KNNImputer

# As with scalers, imputers are fitted on training data and then reused
# (via .transform) on any later data.

# Numeric: fill with median (robust to outliers)
num_imp = SimpleImputer(strategy="median")
X_num = num_imp.fit_transform(X_num_train)

# Categorical: fill with most frequent value
cat_imp = SimpleImputer(strategy="most_frequent")
X_cat = cat_imp.fit_transform(X_cat_train)

# KNN imputation: uses neighbouring rows — better quality, slower
# NOTE(review): KNNImputer needs numeric input; X_train is assumed to hold
# only numeric columns here — confirm.
knn_imp = KNNImputer(n_neighbors=5)
X_knn = knn_imp.fit_transform(X_train)

ColumnTransformer — mixed-type data

Apply different transformers to numeric and categorical columns in one step.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Illustrative column names — they must exist in the DataFrame passed to fit.
num_cols = ["age", "income", "hours_per_week"]
cat_cols = ["education", "occupation", "marital_status"]

# Numeric branch: fill gaps with the median, then standardise.
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale",  StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe",    OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Each entry is (name, transformer, columns). Columns listed in neither group
# are dropped, because ColumnTransformer's `remainder` defaults to "drop".
preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# Returns a numpy array with all features transformed and concatenated
X_prep = preprocessor.fit_transform(df_train)

Pipeline — no-leakage end-to-end

A Pipeline chains preprocessing and a model. fit trains every step in order; predict applies them in order. The test set never touches fit.

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Step names ("pre", "model") become the prefixes used when tuning,
# e.g. "model__n_estimators".
pipe = Pipeline([
    ("pre",   preprocessor),          # ColumnTransformer from above
    ("model", RandomForestClassifier(n_estimators=200, random_state=42)),
])

# The preprocessor selects columns by name, so X_train / X_test are assumed to
# be DataFrames containing num_cols and cat_cols.
pipe.fit(X_train, y_train)            # preprocessor.fit_transform + model.fit
preds = pipe.predict(X_test)          # preprocessor.transform + model.predict
proba = pipe.predict_proba(X_test)    # probability scores per class
score = pipe.score(X_test, y_test)    # accuracy by default

Pass a Pipeline anywhere you would pass a model: cross_val_score, GridSearchCV, etc. Each fold refits the whole pipeline on that fold's training rows only, so the preprocessing steps never see the validation rows and cannot leak.

Common models — one-line setup

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# Constructors only set hyperparameters; nothing is learned until .fit().

# --- Regression ---
lr  = LinearRegression()                          # OLS, no regularisation
rid = Ridge(alpha=1.0)                            # L2 — shrinks coefficients
las = Lasso(alpha=0.01)                           # L1 — sparse, zeros out features

# --- Classification ---
# LogisticRegression, SVC and KNeighborsClassifier are sensitive to feature
# scale — put a scaler in front of them (ideally inside a Pipeline).
log = LogisticRegression(C=1.0, max_iter=1000)    # linear, probabilistic output
rfc = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05)
hgb = HistGradientBoostingClassifier(max_iter=300, early_stopping=True)  # fast, handles NaN natively
# probability=True runs an extra internal cross-validation during fit, so
# training is noticeably slower.
svc = SVC(kernel="rbf", C=1.0, probability=True)  # probability=True enables predict_proba
knn = KNeighborsClassifier(n_neighbors=5)

# --- Clustering (unsupervised — no y) ---
km  = KMeans(n_clusters=5, random_state=42, n_init="auto")
km.fit(X)
labels = km.labels_               # cluster assignment per row
centers = km.cluster_centers_     # centroid coordinates

HistGradientBoostingClassifier is the modern default for tabular data: it handles missing values, trains faster than GradientBoosting, and matches or beats XGBoost on many benchmarks.

Cross-validation

from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

# 5-fold CV — returns one score per fold
# With an integer cv and a classifier, sklearn uses unshuffled stratified folds.
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc")
print(scores.mean(), scores.std())   # mean ± std is the reliable estimate

# KFold: for regression (no class imbalance concern)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold: for classification — preserves class ratios in every fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Passing a splitter object instead of an int makes shuffling and the seed
# explicit. Note this overwrites the `scores` computed above.
scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="f1_weighted")

Multiple metrics at once:

from sklearn.model_selection import cross_validate

# Evaluate several metrics in one pass over the same folds.
results = cross_validate(
    pipe, X_train, y_train,
    cv=skf,
    scoring=["accuracy", "roc_auc", "f1_weighted"],
    return_train_score=True,   # detect overfitting: train >> val
)
# results is a dict; each key is an array of length n_splits
# Metric keys follow the pattern test_<metric> and, because
# return_train_score=True, train_<metric>.
import numpy as np
print("Val ROC AUC:", results["test_roc_auc"].mean().round(4))

GridSearchCV — exhaustive

from sklearn.model_selection import GridSearchCV

# 3 x 3 x 2 = 18 parameter combinations; with the 5-fold `skf` that is 90 fits,
# plus one final refit of the best combination.
param_grid = {
    "model__n_estimators": [100, 300, 500],
    "model__max_depth":    [None, 5, 10],
    "model__min_samples_leaf": [1, 5],
}
# Key: prefix each param with the Pipeline step name + "__"

gs = GridSearchCV(
    pipe, param_grid,
    cv=skf,
    scoring="roc_auc",
    n_jobs=-1,        # use all CPU cores
    verbose=1,
)
gs.fit(X_train, y_train)

print(gs.best_params_)
print(gs.best_score_)   # mean cross-validated roc_auc of the best combination

best_model = gs.best_estimator_   # already fitted on full X_train
preds = best_model.predict(X_test)

RandomizedSearchCV — faster for wide spaces

from scipy.stats import loguniform, randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# learning_rate and subsample are gradient-boosting hyperparameters. The
# RandomForestClassifier inside `pipe` accepts neither, so searching `pipe`
# with this space fails with "Invalid parameter". Search a boosting pipeline
# that reuses the same preprocessing instead.
gb_pipe = Pipeline([
    ("pre",   preprocessor),          # same ColumnTransformer used by `pipe`
    ("model", GradientBoostingClassifier(random_state=42)),
])

# Distributions (objects with .rvs) are sampled; plain lists are sampled uniformly.
param_dist = {
    "model__n_estimators":      randint(100, 800),        # integers in [100, 800)
    "model__max_depth":         [2, 3, 5, 8],             # shallow trees: boosting builds on weak learners
    "model__learning_rate":     loguniform(0.01, 0.3),    # continuous, log-scaled
    "model__subsample":         [0.6, 0.8, 1.0],          # < 1.0 gives stochastic gradient boosting
}

rs = RandomizedSearchCV(
    gb_pipe, param_dist,
    n_iter=50,         # number of random samples — trade time for coverage
    cv=skf,
    scoring="roc_auc",
    random_state=42,   # makes the sampled configurations reproducible
    n_jobs=-1,
)
rs.fit(X_train, y_train)
print(rs.best_params_, rs.best_score_)

Prefer RandomizedSearchCV when there are more than 3 hyperparameters or continuous ranges. It finds good configs with far fewer evaluations.

Metrics

Classification

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix,
    ConfusionMatrixDisplay,
)

# `model` must be a fitted classifier that exposes predict_proba.
y_pred  = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]   # probability of positive class

# Label-based metrics take hard predictions; ROC AUC takes scores/probabilities.
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="weighted")
rec  = recall_score(y_test, y_pred,    average="weighted")
f1   = f1_score(y_test, y_pred,        average="weighted")
auc  = roc_auc_score(y_test, y_proba)          # binary only; for multiclass pass the full predict_proba matrix with multi_class="ovr"

# Full breakdown per class — the most useful single call
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

average options: "binary" (two classes), "macro" (equal weight per class), "weighted" (weight by support — use when class sizes differ).

Regression

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error,
    root_mean_squared_error, r2_score,
    mean_absolute_percentage_error,
)

# `model` must be a fitted regressor here.
y_pred = model.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)       # same units as y
mse  = mean_squared_error(y_test, y_pred)        # squared units of y
rmse = root_mean_squared_error(y_test, y_pred)   # sklearn 1.4+
r2   = r2_score(y_test, y_pred)                  # 1.0 is perfect, can be negative
mape = mean_absolute_percentage_error(y_test, y_pred)   # a fraction, not a percentage (0.05 means 5 %)

print(f"RMSE {rmse:.3f} | R² {r2:.3f}")

Handling class imbalance

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# class_weight="balanced": sklearn computes weights inversely proportional
# to class frequency and applies them during fit — minority class gets
# higher weight without oversampling

log = LogisticRegression(class_weight="balanced", max_iter=1000)
rfc = RandomForestClassifier(class_weight="balanced", n_estimators=300)

# For custom weights:
# The dict maps class label -> weight; the labels 0 and 1 are assumed to match y.
weights = {0: 1, 1: 10}   # penalise misclassifying class 1 ten times more
log2 = LogisticRegression(class_weight=weights)

# Use roc_auc or f1 (not accuracy) to evaluate imbalanced problems
# cross_val_score and skf are not defined in this snippet; they come from the
# cross-validation section.
scores = cross_val_score(rfc, X_train, y_train, cv=skf, scoring="roc_auc")

Feature importance

Tree-based (built-in)

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Impurity-based importances are available on the fitted forest itself.
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train, y_train)

# feature_names is assumed to list the columns of X_train in order — it is not
# defined in this snippet. Plotting requires matplotlib to be installed.
importances = pd.Series(rfc.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(15).plot(kind="barh")

Tree-based importances can be biased toward high-cardinality features. Use permutation importance for a more reliable estimate.

Permutation importance — model-agnostic

from sklearn.inspection import permutation_importance

# `model` must already be fitted. Each feature is shuffled in turn and the
# resulting change in the chosen score is recorded.
result = permutation_importance(
    model, X_test, y_test,
    n_repeats=10,          # shuffle each feature 10 times
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
)

# One row per input column of X_test; feature_names is assumed to list those
# columns in order. `pd` is imported in the previous snippet, not here.
imp = pd.DataFrame({
    "feature":   feature_names,
    "mean":      result.importances_mean,
    "std":       result.importances_std,
}).sort_values("mean", ascending=False)

print(imp.head(10))

Permutation importance works on any fitted model (including a Pipeline) and uses the test set, so it reflects real held-out impact.

Saving and loading a model

import joblib

# Save — saves the entire fitted pipeline (preprocessor + model)
joblib.dump(pipe, "model_v1.joblib")

# Load — returns a ready-to-use estimator; no fit needed
# SECURITY: joblib files are pickle-based, and loading one can execute
# arbitrary code — only load files from a source you trust.
pipe_loaded = joblib.load("model_v1.joblib")
preds = pipe_loaded.predict(X_new)   # X_new needs the same columns the pipeline was fitted on

Save the Pipeline, not just the model. The preprocessor must transform new data identically to how it transformed training data, so saving them together is the only safe approach.

Full minimal workflow

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# End-to-end example: load, split, preprocess, cross-validate, evaluate, save.

# Load the data and separate the features from the label column.
df = pd.read_csv("data.csv")
X = df.drop("target", axis=1)
y = df["target"]

# Route columns by dtype. "category" is selected alongside "object" so pandas
# categorical columns are one-hot encoded instead of silently lost:
# ColumnTransformer drops every column that appears in neither list, because
# its `remainder` defaults to "drop".
# NOTE(review): other dtypes (bool, datetime, ...) still match neither list —
# confirm the dataset has none, or handle them explicitly.
num_cols = X.select_dtypes("number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Split before any fitting so the test rows stay unseen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Numeric columns: median impute + standardise.
# Categorical columns: most-frequent impute + one-hot encode.
pre = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                      ("sc",  StandardScaler())]), num_cols),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("ohe", OneHotEncoder(handle_unknown="ignore",
                                            sparse_output=False))]), cat_cols),
])

# Preprocessing and model live in one estimator, so every fit (including each
# CV fold) learns the preprocessing from that fit's training rows only.
pipe = Pipeline([
    ("pre",   pre),
    ("model", HistGradientBoostingClassifier(max_iter=300,
                                              early_stopping=True,
                                              random_state=42)),
])

# Estimate generalisation on the training split only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="roc_auc")
print(f"CV ROC AUC: {cv_auc.mean():.4f} ± {cv_auc.std():.4f}")

# Final fit on the whole training split, then a single look at the test split.
pipe.fit(X_train, y_train)
y_pred  = pipe.predict(X_test)
# Column 1 is the second class in pipe.classes_; this assumes a binary target.
y_proba = pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Persist preprocessing and model together.
joblib.dump(pipe, "pipeline_final.joblib")
Go deeper The full scikit-learn course →

Explore further

Skip to content