Pandas Cheat Sheet

Import and Create

import pandas as pd
import numpy as np

# Series
s = pd.Series([10, 20, 30], index=["a", "b", "c"])

# DataFrame from dict
df = pd.DataFrame({
    "name":   ["Alice", "Bob", "Carol"],
    "age":    [25, 32, 28],
    "salary": [70_000, 85_000, 90_000],
})

# From list of dicts
rows = [{"x": 1, "y": 2}, {"x": 3, "y": 4}]
df = pd.DataFrame(rows)

# From numpy array
df = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])

# Range index, explicit dtypes at construction (pandas 2.x)
df = pd.DataFrame({"id": pd.array([1, 2, 3], dtype="int32"),
                   "score": pd.array([9.1, 8.4, 7.7], dtype="float32")})

Reading and Writing

# --- CSV ---
df = pd.read_csv("data.csv")
df = pd.read_csv("data.csv",
    usecols=["id", "value"],     # read subset of columns
    dtype={"id": "int32"},       # specify dtypes upfront
    parse_dates=["created_at"],  # parse date column
    na_values=["NA", "N/A", ""],
    nrows=10_000,                # read first N rows
    chunksize=50_000,            # returns iterator of chunks
)

df.to_csv("out.csv", index=False)

# --- Parquet (recommended for large data) ---
df = pd.read_parquet("data.parquet", columns=["id", "value"])
df.to_parquet("out.parquet", index=False, compression="snappy")

# --- Excel ---
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df.to_excel("out.xlsx", sheet_name="Results", index=False)

# --- SQL ---
import sqlalchemy
engine = sqlalchemy.create_engine("postgresql+psycopg2://user:pw@host/db")
df = pd.read_sql("SELECT * FROM orders WHERE status = 'active'", engine)
df = pd.read_sql_table("orders", engine)
df.to_sql("results", engine, if_exists="replace", index=False)

# --- JSON ---
df = pd.read_json("data.json", orient="records")

# --- Clipboard (handy for quick imports) ---
df = pd.read_clipboard()

Inspecting

df.head(10)          # first N rows
df.tail(5)           # last N rows
df.sample(5)         # random sample
df.shape             # (rows, cols) tuple
df.columns.tolist()  # list of column names
df.dtypes            # dtype of each column
df.info()            # dtypes + non-null counts + memory
df.describe()        # count/mean/std/min/quartiles/max for numerics
df.describe(include="all")   # include object/category columns
df.describe(percentiles=[.1, .25, .5, .75, .9])

df["col"].value_counts()          # frequency table
df["col"].value_counts(normalize=True)  # relative frequencies
df["col"].nunique()               # number of distinct values
df["col"].unique()                # array of distinct values

df.memory_usage(deep=True)        # bytes per column (deep=True counts strings)
df.memory_usage(deep=True).sum()  # total bytes

Selecting

# Single column → Series
s = df["name"]

# Multiple columns → DataFrame
sub = df[["name", "salary"]]

# --- loc: label-based ---
df.loc[0]                        # row by label
df.loc[0:4]                      # rows 0 through 4 inclusive
df.loc[0:4, "name":"salary"]     # row slice, column slice
df.loc[[0, 2, 4], ["name"]]      # specific rows + columns

# --- iloc: position-based ---
df.iloc[0]                       # first row
df.iloc[:5]                      # first 5 rows
df.iloc[:5, [0, 2]]              # first 5 rows, columns 0 and 2
df.iloc[-1]                      # last row

# --- Boolean mask ---
df[df["age"] > 28]
df[(df["age"] > 25) & (df["salary"] >= 80_000)]
df[~df["name"].str.startswith("A")]

# --- query (readable, safe with spaces in names) ---
df.query("age > 28 and salary >= 80_000")
threshold = 80_000
df.query("salary >= @threshold")  # pass local variables with @

# --- isin ---
df[df["name"].isin(["Alice", "Carol"])]
df[~df["status"].isin(["cancelled", "refunded"])]

# --- where: keep shape, replace False cells ---
df["salary"].where(df["salary"] > 75_000, other=0)

# --- at / iat: fastest single-cell access ---
df.at[0, "name"]     # label
df.iat[0, 0]         # position

Adding and Dropping Columns

# Add derived column
df["bonus"] = df["salary"] * 0.10

# assign() for chaining (returns new DataFrame)
df = df.assign(
    tax    = df["salary"] * 0.30,
    net    = lambda x: x["salary"] - x["tax"],
)

# Insert at specific position
df.insert(1, "country", "USA")

# Drop columns
df = df.drop(columns=["bonus", "tax"])

# Drop rows by index
df = df.drop(index=[0, 3])

# Rename columns
df = df.rename(columns={"salary": "annual_salary", "age": "age_years"})

# Rename all at once
df.columns = ["id", "name", "age", "salary"]

# Lowercase all column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

Missing Data

df.isna()                         # boolean mask
df.isna().sum()                   # count NaN per column
df.isna().sum() / len(df) * 100   # % missing per column
df.notna()                        # inverse

# Drop rows/columns
df.dropna()                       # drop rows with any NaN
df.dropna(how="all")              # drop rows where ALL values are NaN
df.dropna(subset=["salary"])      # drop only if salary is NaN
df.dropna(thresh=3)               # keep rows with at least 3 non-null values
df.dropna(axis=1)                 # drop columns with any NaN

# Fill
df.fillna(0)
df.fillna({"salary": df["salary"].median(), "name": "Unknown"})
df["col"].fillna(method="ffill")  # forward fill (propagate last valid)
df["col"].fillna(method="bfill")  # backward fill
df["col"].ffill()                 # pandas 2.x preferred alias
df["col"].bfill()                 # pandas 2.x preferred alias

# Interpolate numeric gaps
df["price"].interpolate(method="linear")
df["price"].interpolate(method="time")  # for DatetimeIndex

# Replace arbitrary values
df.replace(-999, np.nan)
df.replace({"status": {"Y": True, "N": False}})

Dtypes and Memory Optimization

# Check dtypes
df.dtypes

# Cast single column
df["age"] = df["age"].astype("int16")
df["score"] = df["score"].astype("float32")
df["flag"] = df["flag"].astype(bool)

# Cast multiple columns at once
df = df.astype({"age": "int16", "salary": "float32"})

# Convert object → category (saves memory when cardinality is low)
df["status"] = df["status"].astype("category")

# Nullable integer types (pandas 2.x — allow NaN without float promotion)
df["count"] = df["count"].astype("Int32")   # capital I = nullable

# Nullable string type
df["name"] = df["name"].astype("string")    # StringDtype, not object

# Infer better dtypes automatically (pandas 2.x)
df = df.convert_dtypes()  # upgrades to best nullable types

# Memory before vs after
before = df.memory_usage(deep=True).sum()
df = df.astype({"age": "int16", "salary": "float32",
                "status": "category"})
after = df.memory_usage(deep=True).sum()
print(f"Saved {(before - after) / 1e6:.1f} MB")

Sorting

df.sort_values("salary", ascending=False)
df.sort_values(["department", "salary"], ascending=[True, False])
df.sort_values("score", na_position="last")   # NaN at end (default)
df.sort_values("score", na_position="first")

# Sort index
df.sort_index()
df.sort_index(ascending=False)

# Rank (useful for percentiles)
df["rank"] = df["salary"].rank(method="dense", ascending=False)

# nsmallest / nlargest (faster than sort_values + head)
df.nlargest(10, "salary")
df.nsmallest(5, "price")

GroupBy

g = df.groupby("department")

# Single aggregation
g["salary"].mean()
g["salary"].sum()

# Multiple columns
g[["salary", "bonus"]].mean()

# Multiple aggregations on one column
g["salary"].agg(["mean", "median", "std", "count"])

# Named aggregations (pandas 0.25+, essential in 2.x)
result = df.groupby("department").agg(
    avg_salary   = ("salary", "mean"),
    max_salary   = ("salary", "max"),
    headcount    = ("id",     "count"),
    total_bonus  = ("bonus",  "sum"),
)

# Custom function
g["salary"].agg(lambda s: s.quantile(0.9))

# Multiple columns, multiple aggs
result = df.groupby("department").agg(
    salary_mean  = ("salary", "mean"),
    salary_std   = ("salary", "std"),
    age_median   = ("age",    "median"),
)

# transform: broadcast group result back to original index (same length as df)
df["dept_avg_salary"]  = df.groupby("department")["salary"].transform("mean")
df["salary_vs_dept"]   = df["salary"] - df["dept_avg_salary"]

# filter: keep groups satisfying a condition
df.groupby("department").filter(lambda g: g["salary"].mean() > 75_000)

# apply: arbitrary function returning a DataFrame/Series per group
df.groupby("department").apply(lambda g: g.nlargest(3, "salary"),
                               include_groups=False)

# as_index=False → result looks like a flat DataFrame
df.groupby("department", as_index=False)["salary"].mean()

# Multiple grouping keys
df.groupby(["department", "level"])["salary"].mean().reset_index()

# Groupby on time periods
df.set_index("date").groupby(pd.Grouper(freq="ME"))["revenue"].sum()

Pivot, Pivot Table, and Melt

# pivot: unique values in index/columns required
df.pivot(index="date", columns="product", values="sales")

# pivot_table: handles duplicates via aggregation
pt = df.pivot_table(
    index="region",
    columns="quarter",
    values="revenue",
    aggfunc="sum",
    fill_value=0,
    margins=True,   # adds row/column totals
)

# melt: wide → long (unpivot)
df_long = df.melt(
    id_vars=["id", "name"],
    value_vars=["q1", "q2", "q3", "q4"],
    var_name="quarter",
    value_name="revenue",
)

# stack / unstack (work with MultiIndex)
df.stack()    # column level → row level
df.unstack()  # innermost row level → column level

# crosstab: frequency table
pd.crosstab(df["region"], df["product"])
pd.crosstab(df["region"], df["product"], values=df["sales"], aggfunc="sum")

Merge, Join, and Concat

# merge (SQL-style joins)
out = pd.merge(left, right, on="id")                        # inner join (default)
out = pd.merge(left, right, on="id", how="left")            # left join
out = pd.merge(left, right, on="id", how="right")           # right join
out = pd.merge(left, right, on="id", how="outer")           # full outer
out = pd.merge(left, right, on="id", how="cross")           # cartesian product

# join on different column names
out = pd.merge(left, right, left_on="user_id", right_on="id")

# merge on multiple keys
out = pd.merge(left, right, on=["date", "product"])

# suffix for overlapping column names
out = pd.merge(left, right, on="id", suffixes=("_left", "_right"))

# join by index
out = left.join(right, how="left")

# concat: stack vertically or horizontally
out = pd.concat([df1, df2, df3], ignore_index=True)         # vertical stack
out = pd.concat([df1, df2], axis=1)                         # horizontal stack
out = pd.concat([df1, df2], ignore_index=True, sort=False)

# concat with keys → MultiIndex
out = pd.concat([df1, df2], keys=["train", "test"])

# Merge cheat table
# how       | rows kept
# --------- | -----------------------
# "inner"   | only matching rows
# "left"    | all left + matching right
# "right"   | all right + matching left
# "outer"   | all rows from both

Apply, Map, and Vectorization

# --- map: element-wise on a Series ---
df["grade"] = df["score"].map({90: "A", 80: "B", 70: "C"})
df["name_upper"] = df["name"].map(str.upper)

# --- apply on Series: same as map for functions ---
df["log_salary"] = df["salary"].apply(np.log)

# --- apply on DataFrame: row-wise or column-wise ---
df.apply(np.sum, axis=0)          # column-wise (default)
df.apply(np.sum, axis=1)          # row-wise

# --- applymap / map (DataFrame element-wise, pandas 2.1+) ---
df.map(lambda x: round(x, 2))    # pandas 2.1+ (applymap is deprecated)

# --- PREFER vectorized operations over apply ---
# Slow (apply):
df["tax"] = df["salary"].apply(lambda x: x * 0.30)
# Fast (vectorized):
df["tax"] = df["salary"] * 0.30

# Slow (apply row-wise):
df["full_name"] = df.apply(lambda r: r["first"] + " " + r["last"], axis=1)
# Fast (vectorized string concat):
df["full_name"] = df["first"] + " " + df["last"]

# np.where: vectorized if/else
df["category"] = np.where(df["salary"] > 80_000, "senior", "junior")

# np.select: vectorized multi-condition
conditions = [df["score"] >= 90, df["score"] >= 75, df["score"] >= 60]
choices    = ["A", "B", "C"]
df["grade"] = np.select(conditions, choices, default="F")

String Methods

s = df["name"]

s.str.lower()
s.str.upper()
s.str.title()
s.str.strip()           # strip whitespace
s.str.strip("$")        # strip specific chars

s.str.len()             # length of each string
s.str.contains("alice", case=False, na=False)   # returns boolean mask
s.str.startswith("A")
s.str.endswith("n")

s.str.replace("old", "new", regex=False)
s.str.replace(r"\d+", "", regex=True)

s.str.split(",")                    # returns Series of lists
s.str.split(",", expand=True)       # returns DataFrame of columns
s.str.split(",", n=1, expand=True)  # split at most once

s.str.extract(r"(\d{4})")           # first capture group → DataFrame col
s.str.extractall(r"(\d+)")          # all matches → MultiIndex DataFrame

s.str.get(0)            # first element of each list (after split)
s.str[0]                # first character
s.str[1:4]              # slice

s.str.cat(sep=", ")     # concatenate all strings into one

# Pad and zfill
s.str.pad(10, side="right", fillchar=" ")
df["zip"].str.zfill(5)

Datetime

# Parse on read
df = pd.read_csv("data.csv", parse_dates=["date"])

# Convert after load
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = pd.to_datetime(df["date"], errors="coerce")  # NaT on failure

# dt accessor
df["date"].dt.year
df["date"].dt.month
df["date"].dt.day
df["date"].dt.day_name()       # "Monday", "Tuesday" ...
df["date"].dt.hour
df["date"].dt.minute
df["date"].dt.date             # Python date objects
df["date"].dt.is_month_end
df["date"].dt.is_quarter_start
df["date"].dt.floor("h")       # floor to hour
df["date"].dt.normalize()      # floor to midnight
df["date"].dt.tz_localize("UTC")
df["date"].dt.tz_convert("US/Eastern")

# Timedelta arithmetic
df["duration"] = df["end_date"] - df["start_date"]
df["duration"].dt.days
df["duration"].dt.total_seconds()

# Date range
pd.date_range("2024-01-01", periods=12, freq="ME")   # month end
pd.date_range("2024-01-01", "2024-12-31", freq="W")  # weekly

# Resample (requires DatetimeIndex or specify on= column)
ts = df.set_index("date")
ts.resample("ME")["revenue"].sum()     # monthly total
ts.resample("W")["price"].mean()       # weekly mean
ts.resample("QE")["sales"].agg(["sum", "mean"])

# Rolling window
df["rolling_7d_avg"] = df["price"].rolling(window=7).mean()
df["rolling_30d_std"] = df["price"].rolling(window=30).std()
df["ewm_alpha0.3"]   = df["price"].ewm(alpha=0.3).mean()

# Expanding (cumulative)
df["cumsum"]  = df["revenue"].expanding().sum()
df["cummean"] = df["revenue"].expanding().mean()

# Shift
df["prev_day_price"] = df["price"].shift(1)
df["next_day_price"] = df["price"].shift(-1)
df["pct_change"]     = df["price"].pct_change()

Deduplication

df.duplicated()                          # boolean mask: True for duplicate rows
df.duplicated(subset=["email"])          # based on specific columns
df.duplicated(keep="first")             # mark all dupes except first occurrence
df.duplicated(keep="last")              # mark all dupes except last
df.duplicated(keep=False)               # mark ALL duplicates

df.drop_duplicates()
df.drop_duplicates(subset=["email"])
df.drop_duplicates(subset=["email", "date"], keep="last")

# Count duplicates
df.duplicated().sum()

Renaming

# Rename specific columns
df.rename(columns={"old_name": "new_name", "x": "feature_x"})

# Rename index labels
df.rename(index={0: "row_a", 1: "row_b"})

# Rename via function
df.rename(columns=str.upper)
df.rename(columns=lambda c: c.strip().lower().replace(" ", "_"))

# Reset index (turn index into a column)
df.reset_index()
df.reset_index(drop=True)    # discard old index, don't add as column

# Set a column as index
df.set_index("id")
df.set_index(["year", "month"])  # MultiIndex

Method Chaining

# All pandas operations that return DataFrames support chaining.
# Use assign, query, pipe, and rename instead of in-place mutations.

result = (
    pd.read_csv("orders.csv", parse_dates=["order_date"])
    .rename(columns=str.lower)
    .assign(
        order_date = lambda x: pd.to_datetime(x["order_date"]),
        revenue    = lambda x: x["quantity"] * x["unit_price"],
        year       = lambda x: x["order_date"].dt.year,
    )
    .query("status != 'cancelled'")
    .dropna(subset=["customer_id"])
    .astype({"region": "category"})
    .groupby(["year", "region"], as_index=False)
    .agg(
        total_revenue = ("revenue", "sum"),
        order_count   = ("order_id", "count"),
    )
    .sort_values("total_revenue", ascending=False)
    .reset_index(drop=True)
)

# pipe: call any function in the chain
def add_pct_of_total(df, col):
    df = df.copy()
    df[col + "_pct"] = df[col] / df[col].sum() * 100
    return df

result = (
    df.groupby("department", as_index=False)["salary"].sum()
    .pipe(add_pct_of_total, "salary")
)

Avoiding SettingWithCopyWarning

# The warning fires when pandas cannot tell if you are mutating a copy or a view.

# BAD — may silently fail or warn:
df[df["age"] > 30]["salary"] = 99_000

# GOOD — use .loc with the original DataFrame:
df.loc[df["age"] > 30, "salary"] = 99_000

# GOOD — work on an explicit copy if you need a separate object:
subset = df[df["age"] > 30].copy()
subset["salary"] = 99_000

# GOOD — assign() always returns a new DataFrame:
df = df.assign(salary=lambda x: np.where(x["age"] > 30, 99_000, x["salary"]))

# pandas 2.0 introduced Copy-on-Write (CoW) — enable it now to future-proof code:
pd.options.mode.copy_on_write = True   # will be the default in pandas 3.0
# With CoW enabled, every setitem on a slice automatically copies → no more ambiguity.

Useful One-Liners

# Frequency table with %
df["status"].value_counts(normalize=True).mul(100).round(1).astype(str) + "%"

# Top N per group
df.groupby("dept").apply(lambda g: g.nlargest(3, "salary"), include_groups=False)

# Lag features for time series
df[[f"lag_{n}" for n in range(1, 4)]] = df["value"].shift([1, 2, 3])

# Cumulative share (Pareto)
vc = df["product"].value_counts()
vc_df = vc.reset_index()
vc_df["cum_pct"] = vc_df["count"].cumsum() / vc_df["count"].sum()

# Flatten MultiIndex columns after groupby
df.columns = ["_".join(c).strip("_") for c in df.columns]

# Profile memory-heavy columns
df.memory_usage(deep=True).sort_values(ascending=False).head(10)

# Clip outliers
df["salary"] = df["salary"].clip(lower=30_000, upper=200_000)

# Correlation matrix
df[numeric_cols].corr().style.background_gradient(cmap="RdYlGn")

# Quick scatter via pandas (wraps matplotlib)
df.plot.scatter(x="age", y="salary", alpha=0.4)

# Export column types for documentation
df.dtypes.reset_index().rename(columns={"index": "column", 0: "dtype"}).to_markdown()

Import and Create

Reading and Writing

Inspecting

Selecting

Adding and Dropping Columns

Missing Data

Dtypes and Memory Optimization

Sorting

GroupBy

Pivot, Pivot Table, and Melt

Merge, Join, and Concat

Apply, Map, and Vectorization

String Methods

Datetime

Deduplication

Renaming

Method Chaining

Avoiding SettingWithCopyWarning

Useful One-Liners

Explore further