The estimator API
Every sklearn estimator follows the same basic contract; for a predictor (regressor or classifier) it is four steps: instantiate, fit, predict, score.
from sklearn.linear_model import LinearRegression
model = LinearRegression() # 1. instantiate (no data yet)
model.fit(X_train, y_train) # 2. learn from training data
preds = model.predict(X_test) # 3. apply to new data
score = model.score(X_test, y_test) # 4. R² for regressors, accuracy for classifiers
Transformers add transform and fit_transform:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # fit ON train, then transform
X_test_scaled = scaler.transform(X_test) # transform ONLY — never fit on test
Rule: fit transformers on training data only. Fitting on test data leaks distribution information and inflates scores.
Train / test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2, # 20 % held out
random_state=42, # reproducibility
stratify=y, # preserve class proportions — always use for classification
)
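For regression, drop stratify. For time series, never shuffle: pass shuffle=False (and drop stratify) for a single chronological hold-out,
or use TimeSeriesSplit for cross-validation (each fold trains on the past and validates on the future).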
Preprocessing
Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# StandardScaler: zero mean, unit variance — default choice for most models
sc = StandardScaler()
X_scaled = sc.fit_transform(X_train)
# MinMaxScaler: maps to [0, 1] — use when you need bounded range (e.g. neural net inputs)
mm = MinMaxScaler()
X_mm = mm.fit_transform(X_train)
# RobustScaler: uses median and IQR — better when outliers are present
rb = RobustScaler()
X_rb = rb.fit_transform(X_train)
Encoding categoricals
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# OneHotEncoder: creates a binary column per category — for nominal features
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_ohe = ohe.fit_transform(X_cat_train)
# OrdinalEncoder: maps categories to integers 0, 1, 2 … — for ordinal features
oe = OrdinalEncoder(categories=[["low", "medium", "high"]])
X_oe = oe.fit_transform(X_ord_train)
Imputing missing values
from sklearn.impute import SimpleImputer, KNNImputer
# Numeric: fill with median (robust to outliers)
num_imp = SimpleImputer(strategy="median")
X_num = num_imp.fit_transform(X_num_train)
# Categorical: fill with most frequent value
cat_imp = SimpleImputer(strategy="most_frequent")
X_cat = cat_imp.fit_transform(X_cat_train)
# KNN imputation: uses neighbouring rows — better quality, slower
# KNNImputer accepts numeric features only and finds neighbours with a
# NaN-aware Euclidean distance, so features on larger scales dominate —
# scale numeric columns before imputing
knn_imp = KNNImputer(n_neighbors=5)
X_knn = knn_imp.fit_transform(X_train)
ColumnTransformer — mixed-type data
Apply different transformers to numeric and categorical columns in one step.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Example column names — they must match columns of the DataFrame passed to fit/transform
num_cols = ["age", "income", "hours_per_week"]
cat_cols = ["education", "occupation", "marital_status"]
# Numeric branch: median-impute, then standardise
num_pipe = Pipeline([
("impute", SimpleImputer(strategy="median")),
("scale", StandardScaler()),
])
# Categorical branch: mode-impute, then one-hot encode
# handle_unknown="ignore": a category unseen during fit encodes as all zeros instead of raising
cat_pipe = Pipeline([
("impute", SimpleImputer(strategy="most_frequent")),
("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
# Columns not listed in num_cols or cat_cols are dropped: ColumnTransformer's
# default is remainder="drop" (pass remainder="passthrough" to keep them)
preprocessor = ColumnTransformer([
("num", num_pipe, num_cols),
("cat", cat_pipe, cat_cols),
])
# Returns a dense numpy array (sparse_output=False above) with all features
# transformed and concatenated in the order the branches are listed: num, then cat
X_prep = preprocessor.fit_transform(df_train)
Pipeline — no-leakage end-to-end
A Pipeline chains preprocessing and a model. fit trains every step in
order; predict applies them in order. The test set never touches fit.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([
("pre", preprocessor), # ColumnTransformer from above
("model", RandomForestClassifier(n_estimators=200, random_state=42)),
])
pipe.fit(X_train, y_train) # preprocessor.fit_transform + model.fit
preds = pipe.predict(X_test) # preprocessor.transform + model.predict
proba = pipe.predict_proba(X_test) # probability scores per class
score = pipe.score(X_test, y_test) # accuracy by default
Pass a Pipeline anywhere you would pass a model: cross_val_score,
GridSearchCV, etc. Each CV split is made before fit, so every preprocessing step is
re-fitted on the training folds only and never sees the validation fold.
Common models — one-line setup
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
# --- Regression ---
lr = LinearRegression() # OLS, no regularisation
rid = Ridge(alpha=1.0) # L2 — shrinks coefficients
las = Lasso(alpha=0.01) # L1 — sparse, zeros out features
# --- Classification ---
log = LogisticRegression(C=1.0, max_iter=1000) # linear, probabilistic output
rfc = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05)
hgb = HistGradientBoostingClassifier(max_iter=300, early_stopping=True) # fast, handles NaN natively
svc = SVC(kernel="rbf", C=1.0, probability=True) # probability=True enables predict_proba
knn = KNeighborsClassifier(n_neighbors=5)
# --- Clustering (unsupervised — no y) ---
km = KMeans(n_clusters=5, random_state=42, n_init="auto")
km.fit(X)
labels = km.labels_ # cluster assignment per row
centers = km.cluster_centers_ # centroid coordinates
HistGradientBoostingClassifier is a strong default for tabular data: it handles missing values natively, trains much faster than GradientBoostingClassifier on large datasets (roughly 10,000+ samples), and is competitive with XGBoost and LightGBM on many benchmarks.
Cross-validation
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
# 5-fold CV — returns one score per fold
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc")
print(scores.mean(), scores.std()) # mean ± std is the reliable estimate
# KFold: for regression (no class imbalance concern)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# StratifiedKFold: for classification — preserves class ratios in every fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="f1_weighted")
Multiple metrics at once:
from sklearn.model_selection import cross_validate
results = cross_validate(
pipe, X_train, y_train,
cv=skf,
scoring=["accuracy", "roc_auc", "f1_weighted"],
return_train_score=True, # detect overfitting: train >> val
)
# results is a dict; each key is an array of length n_splits
import numpy as np
print("Val ROC AUC:", results["test_roc_auc"].mean().round(4))
Hyperparameter search
GridSearchCV — exhaustive
from sklearn.model_selection import GridSearchCV
param_grid = {
"model__n_estimators": [100, 300, 500],
"model__max_depth": [None, 5, 10],
"model__min_samples_leaf": [1, 5],
}
# Key: prefix each param with the Pipeline step name + "__"
gs = GridSearchCV(
pipe, param_grid,
cv=skf,
scoring="roc_auc",
n_jobs=-1, # use all CPU cores
verbose=1,
)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)
best_model = gs.best_estimator_ # already fitted on full X_train
preds = best_model.predict(X_test)
RandomizedSearchCV — faster for wide spaces
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import randint, loguniform
# learning_rate and subsample are gradient-boosting hyperparameters. The
# RandomForestClassifier inside `pipe` has neither, so searching them on
# `pipe` fails with "Invalid parameter". Search a boosting pipeline instead.
# A separate name is used so `pipe` stays intact for the other sections.
gb_pipe = Pipeline([
    ("pre", preprocessor),  # ColumnTransformer from above
    ("model", GradientBoostingClassifier(random_state=42)),
])
# Lists are sampled uniformly; scipy distributions are sampled via .rvs()
param_dist = {
    "model__n_estimators": randint(100, 800),       # integers in [100, 800)
    "model__max_depth": [None, 5, 10, 20],          # None = grow until leaves are pure
    "model__learning_rate": loguniform(0.01, 0.3),  # continuous, sampled on a log scale
    "model__subsample": [0.6, 0.8, 1.0],            # fraction of rows used per tree
}
rs = RandomizedSearchCV(
    gb_pipe, param_dist,
    n_iter=50,  # number of random samples — trade time for coverage
    cv=skf,
    scoring="roc_auc",
    random_state=42,  # makes the sampled configurations reproducible
    n_jobs=-1,
)
rs.fit(X_train, y_train)
print(rs.best_params_, rs.best_score_)
Prefer RandomizedSearchCV when there are more than 3 hyperparameters or continuous ranges. It finds good configs with far fewer evaluations.
Metrics
Classification
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, classification_report, confusion_matrix,
ConfusionMatrixDisplay,
)
# Hard labels feed the threshold-based metrics; probabilities feed ROC AUC
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] # probability of positive class (columns follow model.classes_)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="weighted")
rec = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")
auc = roc_auc_score(y_test, y_proba) # binary only; for multiclass pass the full predict_proba matrix (no [:, 1]) with multi_class="ovr"
# Full breakdown per class — the most useful single call
print(classification_report(y_test, y_pred))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()
average options: "binary" (two classes), "macro" (equal weight per
class), "weighted" (weight by support — use when class sizes differ).
Regression
from sklearn.metrics import (
mean_absolute_error, mean_squared_error,
root_mean_squared_error, r2_score,
mean_absolute_percentage_error,
)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred) # sklearn 1.4+
r2 = r2_score(y_test, y_pred) # 1.0 is perfect, can be negative
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"RMSE {rmse:.3f} | R² {r2:.3f}")
Handling class imbalance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# class_weight="balanced": sklearn computes weights inversely proportional
# to class frequency and applies them during fit — minority class gets
# higher weight without oversampling
log = LogisticRegression(class_weight="balanced", max_iter=1000)
rfc = RandomForestClassifier(class_weight="balanced", n_estimators=300)
# For custom weights:
weights = {0: 1, 1: 10} # penalise misclassifying class 1 ten times more
log2 = LogisticRegression(class_weight=weights)
# Use roc_auc or f1 (not accuracy) to evaluate imbalanced problems
scores = cross_val_score(rfc, X_train, y_train, cv=skf, scoring="roc_auc")
Feature importance
Tree-based (built-in)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train, y_train)
importances = pd.Series(rfc.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(15).plot(kind="barh")
Tree-based importances can be biased toward high-cardinality features. Use permutation importance for a more reliable estimate.
Permutation importance — model-agnostic
from sklearn.inspection import permutation_importance
result = permutation_importance(
model, X_test, y_test,
n_repeats=10, # shuffle each feature 10 times
scoring="roc_auc",
random_state=42,
n_jobs=-1,
)
imp = pd.DataFrame({
"feature": feature_names,
"mean": result.importances_mean,
"std": result.importances_std,
}).sort_values("mean", ascending=False)
print(imp.head(10))
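Permutation importance works on any fitted model (including a Pipeline). Computed on the test set, as here, it reflects real held-out impact. When the model is a Pipeline, importances are reported per raw input column of X_test, so feature_names must list those columns, not the preprocessed features.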
Saving and loading a model
import joblib
# Save — saves the entire fitted pipeline (preprocessor + model)
joblib.dump(pipe, "model_v1.joblib")
# Load — returns a ready-to-use estimator; no fit needed
# SECURITY: joblib files are pickle-based, so loading one can execute
# arbitrary code — only load files from a source you trust
# Load with the same scikit-learn version that wrote the file; loading
# across versions is unsupported and may fail or give different results
pipe_loaded = joblib.load("model_v1.joblib")
preds = pipe_loaded.predict(X_new)
Save the Pipeline, not just the model. The preprocessor must transform new data identically to how it transformed training data, so saving them together is the only safe approach.
Full minimal workflow
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
# End-to-end example: load, split, preprocess, cross-validate, evaluate, save
df = pd.read_csv("data.csv")
X = df.drop("target", axis=1)
y = df["target"]
# Route columns by dtype. Only object-dtype columns are treated as categorical;
# a column that is neither numeric nor object is in neither list and is
# dropped by the ColumnTransformer (remainder defaults to "drop")
# NOTE(review): assumes string columns load as object dtype — confirm for your pandas version
num_cols = X.select_dtypes("number").columns.tolist()
cat_cols = X.select_dtypes("object").columns.tolist()
# Hold out 20 % as a final test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
# Numeric: median-impute + standardise; categorical: mode-impute + one-hot
pre = ColumnTransformer([
("num", Pipeline([("imp", SimpleImputer(strategy="median")),
("sc", StandardScaler())]), num_cols),
("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
("ohe", OneHotEncoder(handle_unknown="ignore",
sparse_output=False))]), cat_cols),
])
# Preprocessing and model in one estimator, so CV re-fits both on each training fold
pipe = Pipeline([
("pre", pre),
("model", HistGradientBoostingClassifier(max_iter=300,
early_stopping=True,
random_state=42)),
])
# Estimate generalisation with CV on the training split only; the test set is used once, below
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="roc_auc")
print(f"CV ROC AUC: {cv_auc.mean():.4f} ± {cv_auc.std():.4f}")
# Final fit on the full training split, then a single evaluation on the held-out test set
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# Column 1 is the second class in pipe.classes_; this and the roc_auc calls assume a binary target
y_proba = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
# Persist the whole fitted pipeline so new data is preprocessed exactly like the training data
joblib.dump(pipe, "pipeline_final.joblib")