datarekha

Pandas cheat sheet

Read, clean, reshape, group, and join data — the pandas operations you reach for every day.

Import and Create

import pandas as pd
import numpy as np

# Series
s = pd.Series([10, 20, 30], index=["a", "b", "c"])

# DataFrame from dict
df = pd.DataFrame({
    "name":   ["Alice", "Bob", "Carol"],
    "age":    [25, 32, 28],
    "salary": [70_000, 85_000, 90_000],
})

# From list of dicts
rows = [{"x": 1, "y": 2}, {"x": 3, "y": 4}]
df = pd.DataFrame(rows)

# From numpy array
df = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])

# Range index, explicit dtypes at construction (pandas 2.x)
df = pd.DataFrame({"id": pd.array([1, 2, 3], dtype="int32"),
                   "score": pd.array([9.1, 8.4, 7.7], dtype="float32")})

Reading and Writing

# --- CSV ---
df = pd.read_csv("data.csv")
df = pd.read_csv("data.csv",
    usecols=["id", "value"],     # read subset of columns
    dtype={"id": "int32"},       # specify dtypes upfront
    parse_dates=["created_at"],  # parse date column
    na_values=["NA", "N/A", ""],
    nrows=10_000,                # read first N rows
    chunksize=50_000,            # returns iterator of chunks
)

df.to_csv("out.csv", index=False)

# --- Parquet (recommended for large data) ---
df = pd.read_parquet("data.parquet", columns=["id", "value"])
df.to_parquet("out.parquet", index=False, compression="snappy")

# --- Excel ---
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df.to_excel("out.xlsx", sheet_name="Results", index=False)

# --- SQL ---
import sqlalchemy
engine = sqlalchemy.create_engine("postgresql+psycopg2://user:pw@host/db")
df = pd.read_sql("SELECT * FROM orders WHERE status = 'active'", engine)
df = pd.read_sql_table("orders", engine)
df.to_sql("results", engine, if_exists="replace", index=False)

# --- JSON ---
df = pd.read_json("data.json", orient="records")

# --- Clipboard (handy for quick imports) ---
df = pd.read_clipboard()

Inspecting

df.head(10)          # first N rows
df.tail(5)           # last N rows
df.sample(5)         # random sample
df.shape             # (rows, cols) tuple
df.columns.tolist()  # list of column names
df.dtypes            # dtype of each column
df.info()            # dtypes + non-null counts + memory
df.describe()        # count/mean/std/min/quartiles/max for numerics
df.describe(include="all")   # include object/category columns
df.describe(percentiles=[.1, .25, .5, .75, .9])

df["col"].value_counts()          # frequency table
df["col"].value_counts(normalize=True)  # relative frequencies
df["col"].nunique()               # number of distinct values
df["col"].unique()                # array of distinct values

df.memory_usage(deep=True)        # bytes per column (deep=True counts strings)
df.memory_usage(deep=True).sum()  # total bytes

Selecting

# Single column → Series
s = df["name"]

# Multiple columns → DataFrame
sub = df[["name", "salary"]]

# --- loc: label-based ---
df.loc[0]                        # row by label
df.loc[0:4]                      # rows 0 through 4 inclusive
df.loc[0:4, "name":"salary"]     # row slice, column slice
df.loc[[0, 2, 4], ["name"]]      # specific rows + columns

# --- iloc: position-based ---
df.iloc[0]                       # first row
df.iloc[:5]                      # first 5 rows
df.iloc[:5, [0, 2]]              # first 5 rows, columns 0 and 2
df.iloc[-1]                      # last row

# --- Boolean mask ---
df[df["age"] > 28]
df[(df["age"] > 25) & (df["salary"] >= 80_000)]
df[~df["name"].str.startswith("A")]

# --- query (readable, safe with spaces in names) ---
df.query("age > 28 and salary >= 80_000")
threshold = 80_000
df.query("salary >= @threshold")  # pass local variables with @

# --- isin ---
df[df["name"].isin(["Alice", "Carol"])]
df[~df["status"].isin(["cancelled", "refunded"])]

# --- where: keep shape, replace False cells ---
df["salary"].where(df["salary"] > 75_000, other=0)

# --- at / iat: fastest single-cell access ---
df.at[0, "name"]     # label
df.iat[0, 0]         # position

Adding and Dropping Columns

# Add derived column
df["bonus"] = df["salary"] * 0.10

# assign() for chaining (returns new DataFrame)
df = df.assign(
    tax    = df["salary"] * 0.30,
    net    = lambda x: x["salary"] - x["tax"],
)

# Insert at specific position
df.insert(1, "country", "USA")

# Drop columns
df = df.drop(columns=["bonus", "tax"])

# Drop rows by index
df = df.drop(index=[0, 3])

# Rename columns
df = df.rename(columns={"salary": "annual_salary", "age": "age_years"})

# Rename all at once
df.columns = ["id", "name", "age", "salary"]

# Lowercase all column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

Missing Data

df.isna()                         # boolean mask
df.isna().sum()                   # count NaN per column
df.isna().sum() / len(df) * 100   # % missing per column
df.notna()                        # inverse

# Drop rows/columns
df.dropna()                       # drop rows with any NaN
df.dropna(how="all")              # drop rows where ALL values are NaN
df.dropna(subset=["salary"])      # drop only if salary is NaN
df.dropna(thresh=3)               # keep rows with at least 3 non-null values
df.dropna(axis=1)                 # drop columns with any NaN

# Fill
df.fillna(0)
df.fillna({"salary": df["salary"].median(), "name": "Unknown"})
df["col"].fillna(method="ffill")  # forward fill (propagate last valid)
df["col"].fillna(method="bfill")  # backward fill
df["col"].ffill()                 # pandas 2.x preferred alias
df["col"].bfill()                 # pandas 2.x preferred alias

# Interpolate numeric gaps
df["price"].interpolate(method="linear")
df["price"].interpolate(method="time")  # for DatetimeIndex

# Replace arbitrary values
df.replace(-999, np.nan)
df.replace({"status": {"Y": True, "N": False}})

Dtypes and Memory Optimization

# Check dtypes
df.dtypes

# Cast single column
df["age"] = df["age"].astype("int16")
df["score"] = df["score"].astype("float32")
df["flag"] = df["flag"].astype(bool)

# Cast multiple columns at once
df = df.astype({"age": "int16", "salary": "float32"})

# Convert object → category (saves memory when cardinality is low)
df["status"] = df["status"].astype("category")

# Nullable integer types (pandas 2.x — allow NaN without float promotion)
df["count"] = df["count"].astype("Int32")   # capital I = nullable

# Nullable string type
df["name"] = df["name"].astype("string")    # StringDtype, not object

# Infer better dtypes automatically (pandas 2.x)
df = df.convert_dtypes()  # upgrades to best nullable types

# Memory before vs after
before = df.memory_usage(deep=True).sum()
df = df.astype({"age": "int16", "salary": "float32",
                "status": "category"})
after = df.memory_usage(deep=True).sum()
print(f"Saved {(before - after) / 1e6:.1f} MB")

Sorting

df.sort_values("salary", ascending=False)
df.sort_values(["department", "salary"], ascending=[True, False])
df.sort_values("score", na_position="last")   # NaN at end (default)
df.sort_values("score", na_position="first")

# Sort index
df.sort_index()
df.sort_index(ascending=False)

# Rank (useful for percentiles)
df["rank"] = df["salary"].rank(method="dense", ascending=False)

# nsmallest / nlargest (faster than sort_values + head)
df.nlargest(10, "salary")
df.nsmallest(5, "price")

GroupBy

g = df.groupby("department")

# Single aggregation
g["salary"].mean()
g["salary"].sum()

# Multiple columns
g[["salary", "bonus"]].mean()

# Multiple aggregations on one column
g["salary"].agg(["mean", "median", "std", "count"])

# Named aggregations (pandas 0.25+, essential in 2.x)
result = df.groupby("department").agg(
    avg_salary   = ("salary", "mean"),
    max_salary   = ("salary", "max"),
    headcount    = ("id",     "count"),
    total_bonus  = ("bonus",  "sum"),
)

# Custom function
g["salary"].agg(lambda s: s.quantile(0.9))

# Multiple columns, multiple aggs
result = df.groupby("department").agg(
    salary_mean  = ("salary", "mean"),
    salary_std   = ("salary", "std"),
    age_median   = ("age",    "median"),
)

# transform: broadcast group result back to original index (same length as df)
df["dept_avg_salary"]  = df.groupby("department")["salary"].transform("mean")
df["salary_vs_dept"]   = df["salary"] - df["dept_avg_salary"]

# filter: keep groups satisfying a condition
df.groupby("department").filter(lambda g: g["salary"].mean() > 75_000)

# apply: arbitrary function returning a DataFrame/Series per group
df.groupby("department").apply(lambda g: g.nlargest(3, "salary"),
                               include_groups=False)

# as_index=False → result looks like a flat DataFrame
df.groupby("department", as_index=False)["salary"].mean()

# Multiple grouping keys
df.groupby(["department", "level"])["salary"].mean().reset_index()

# Groupby on time periods
df.set_index("date").groupby(pd.Grouper(freq="ME"))["revenue"].sum()

Pivot, Pivot Table, and Melt

# pivot: unique values in index/columns required
df.pivot(index="date", columns="product", values="sales")

# pivot_table: handles duplicates via aggregation
pt = df.pivot_table(
    index="region",
    columns="quarter",
    values="revenue",
    aggfunc="sum",
    fill_value=0,
    margins=True,   # adds row/column totals
)

# melt: wide → long (unpivot)
df_long = df.melt(
    id_vars=["id", "name"],
    value_vars=["q1", "q2", "q3", "q4"],
    var_name="quarter",
    value_name="revenue",
)

# stack / unstack (work with MultiIndex)
df.stack()    # column level → row level
df.unstack()  # innermost row level → column level

# crosstab: frequency table
pd.crosstab(df["region"], df["product"])
pd.crosstab(df["region"], df["product"], values=df["sales"], aggfunc="sum")

Merge, Join, and Concat

# merge (SQL-style joins)
out = pd.merge(left, right, on="id")                        # inner join (default)
out = pd.merge(left, right, on="id", how="left")            # left join
out = pd.merge(left, right, on="id", how="right")           # right join
out = pd.merge(left, right, on="id", how="outer")           # full outer
out = pd.merge(left, right, on="id", how="cross")           # cartesian product

# join on different column names
out = pd.merge(left, right, left_on="user_id", right_on="id")

# merge on multiple keys
out = pd.merge(left, right, on=["date", "product"])

# suffix for overlapping column names
out = pd.merge(left, right, on="id", suffixes=("_left", "_right"))

# join by index
out = left.join(right, how="left")

# concat: stack vertically or horizontally
out = pd.concat([df1, df2, df3], ignore_index=True)         # vertical stack
out = pd.concat([df1, df2], axis=1)                         # horizontal stack
out = pd.concat([df1, df2], ignore_index=True, sort=False)

# concat with keys → MultiIndex
out = pd.concat([df1, df2], keys=["train", "test"])

# Merge cheat table
# how       | rows kept
# --------- | -----------------------
# "inner"   | only matching rows
# "left"    | all left + matching right
# "right"   | all right + matching left
# "outer"   | all rows from both

Apply, Map, and Vectorization

# --- map: element-wise on a Series ---
df["grade"] = df["score"].map({90: "A", 80: "B", 70: "C"})
df["name_upper"] = df["name"].map(str.upper)

# --- apply on Series: same as map for functions ---
df["log_salary"] = df["salary"].apply(np.log)

# --- apply on DataFrame: row-wise or column-wise ---
df.apply(np.sum, axis=0)          # column-wise (default)
df.apply(np.sum, axis=1)          # row-wise

# --- applymap / map (DataFrame element-wise, pandas 2.1+) ---
df.map(lambda x: round(x, 2))    # pandas 2.1+ (applymap is deprecated)

# --- PREFER vectorized operations over apply ---
# Slow (apply):
df["tax"] = df["salary"].apply(lambda x: x * 0.30)
# Fast (vectorized):
df["tax"] = df["salary"] * 0.30

# Slow (apply row-wise):
df["full_name"] = df.apply(lambda r: r["first"] + " " + r["last"], axis=1)
# Fast (vectorized string concat):
df["full_name"] = df["first"] + " " + df["last"]

# np.where: vectorized if/else
df["category"] = np.where(df["salary"] > 80_000, "senior", "junior")

# np.select: vectorized multi-condition
conditions = [df["score"] >= 90, df["score"] >= 75, df["score"] >= 60]
choices    = ["A", "B", "C"]
df["grade"] = np.select(conditions, choices, default="F")

String Methods

s = df["name"]

s.str.lower()
s.str.upper()
s.str.title()
s.str.strip()           # strip whitespace
s.str.strip("$")        # strip specific chars

s.str.len()             # length of each string
s.str.contains("alice", case=False, na=False)   # returns boolean mask
s.str.startswith("A")
s.str.endswith("n")

s.str.replace("old", "new", regex=False)
s.str.replace(r"\d+", "", regex=True)

s.str.split(",")                    # returns Series of lists
s.str.split(",", expand=True)       # returns DataFrame of columns
s.str.split(",", n=1, expand=True)  # split at most once

s.str.extract(r"(\d{4})")           # first capture group → DataFrame col
s.str.extractall(r"(\d+)")          # all matches → MultiIndex DataFrame

s.str.get(0)            # first element of each list (after split)
s.str[0]                # first character
s.str[1:4]              # slice

s.str.cat(sep=", ")     # concatenate all strings into one

# Pad and zfill
s.str.pad(10, side="right", fillchar=" ")
df["zip"].str.zfill(5)

Datetime

# Parse on read
df = pd.read_csv("data.csv", parse_dates=["date"])

# Convert after load
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = pd.to_datetime(df["date"], errors="coerce")  # NaT on failure

# dt accessor
df["date"].dt.year
df["date"].dt.month
df["date"].dt.day
df["date"].dt.day_name()       # "Monday", "Tuesday" ...
df["date"].dt.hour
df["date"].dt.minute
df["date"].dt.date             # Python date objects
df["date"].dt.is_month_end
df["date"].dt.is_quarter_start
df["date"].dt.floor("h")       # floor to hour
df["date"].dt.normalize()      # floor to midnight
df["date"].dt.tz_localize("UTC")
df["date"].dt.tz_convert("US/Eastern")

# Timedelta arithmetic
df["duration"] = df["end_date"] - df["start_date"]
df["duration"].dt.days
df["duration"].dt.total_seconds()

# Date range
pd.date_range("2024-01-01", periods=12, freq="ME")   # month end
pd.date_range("2024-01-01", "2024-12-31", freq="W")  # weekly

# Resample (requires DatetimeIndex or specify on= column)
ts = df.set_index("date")
ts.resample("ME")["revenue"].sum()     # monthly total
ts.resample("W")["price"].mean()       # weekly mean
ts.resample("QE")["sales"].agg(["sum", "mean"])

# Rolling window
df["rolling_7d_avg"] = df["price"].rolling(window=7).mean()
df["rolling_30d_std"] = df["price"].rolling(window=30).std()
df["ewm_alpha0.3"]   = df["price"].ewm(alpha=0.3).mean()

# Expanding (cumulative)
df["cumsum"]  = df["revenue"].expanding().sum()
df["cummean"] = df["revenue"].expanding().mean()

# Shift
df["prev_day_price"] = df["price"].shift(1)
df["next_day_price"] = df["price"].shift(-1)
df["pct_change"]     = df["price"].pct_change()

Deduplication

df.duplicated()                          # boolean mask: True for duplicate rows
df.duplicated(subset=["email"])          # based on specific columns
df.duplicated(keep="first")             # mark all dupes except first occurrence
df.duplicated(keep="last")              # mark all dupes except last
df.duplicated(keep=False)               # mark ALL duplicates

df.drop_duplicates()
df.drop_duplicates(subset=["email"])
df.drop_duplicates(subset=["email", "date"], keep="last")

# Count duplicates
df.duplicated().sum()

Renaming

# Rename specific columns
df.rename(columns={"old_name": "new_name", "x": "feature_x"})

# Rename index labels
df.rename(index={0: "row_a", 1: "row_b"})

# Rename via function
df.rename(columns=str.upper)
df.rename(columns=lambda c: c.strip().lower().replace(" ", "_"))

# Reset index (turn index into a column)
df.reset_index()
df.reset_index(drop=True)    # discard old index, don't add as column

# Set a column as index
df.set_index("id")
df.set_index(["year", "month"])  # MultiIndex

Method Chaining

# All pandas operations that return DataFrames support chaining.
# Use assign, query, pipe, and rename instead of in-place mutations.

result = (
    pd.read_csv("orders.csv", parse_dates=["order_date"])
    .rename(columns=str.lower)
    .assign(
        order_date = lambda x: pd.to_datetime(x["order_date"]),
        revenue    = lambda x: x["quantity"] * x["unit_price"],
        year       = lambda x: x["order_date"].dt.year,
    )
    .query("status != 'cancelled'")
    .dropna(subset=["customer_id"])
    .astype({"region": "category"})
    .groupby(["year", "region"], as_index=False)
    .agg(
        total_revenue = ("revenue", "sum"),
        order_count   = ("order_id", "count"),
    )
    .sort_values("total_revenue", ascending=False)
    .reset_index(drop=True)
)

# pipe: call any function in the chain
def add_pct_of_total(df, col):
    df = df.copy()
    df[col + "_pct"] = df[col] / df[col].sum() * 100
    return df

result = (
    df.groupby("department", as_index=False)["salary"].sum()
    .pipe(add_pct_of_total, "salary")
)

Avoiding SettingWithCopyWarning

# The warning fires when pandas cannot tell if you are mutating a copy or a view.

# BAD — may silently fail or warn:
df[df["age"] > 30]["salary"] = 99_000

# GOOD — use .loc with the original DataFrame:
df.loc[df["age"] > 30, "salary"] = 99_000

# GOOD — work on an explicit copy if you need a separate object:
subset = df[df["age"] > 30].copy()
subset["salary"] = 99_000

# GOOD — assign() always returns a new DataFrame:
df = df.assign(salary=lambda x: np.where(x["age"] > 30, 99_000, x["salary"]))

# pandas 2.0 introduced Copy-on-Write (CoW) — enable it now to future-proof code:
pd.options.mode.copy_on_write = True   # will be the default in pandas 3.0
# With CoW enabled, every setitem on a slice automatically copies → no more ambiguity.

Useful One-Liners

# Frequency table with %
df["status"].value_counts(normalize=True).mul(100).round(1).astype(str) + "%"

# Top N per group
df.groupby("dept").apply(lambda g: g.nlargest(3, "salary"), include_groups=False)

# Lag features for time series
df[[f"lag_{n}" for n in range(1, 4)]] = df["value"].shift([1, 2, 3])

# Cumulative share (Pareto)
vc = df["product"].value_counts()
vc_df = vc.reset_index()
vc_df["cum_pct"] = vc_df["count"].cumsum() / vc_df["count"].sum()

# Flatten MultiIndex columns after groupby
df.columns = ["_".join(c).strip("_") for c in df.columns]

# Profile memory-heavy columns
df.memory_usage(deep=True).sort_values(ascending=False).head(10)

# Clip outliers
df["salary"] = df["salary"].clip(lower=30_000, upper=200_000)

# Correlation matrix
df[numeric_cols].corr().style.background_gradient(cmap="RdYlGn")

# Quick scatter via pandas (wraps matplotlib)
df.plot.scatter(x="age", y="salary", alpha=0.4)

# Export column types for documentation
df.dtypes.reset_index().rename(columns={"index": "column", 0: "dtype"}).to_markdown()
Go deeper The full Pandas course →

Explore further

Skip to content