Import and Create
import pandas as pd
import numpy as np
# Series
s = pd.Series([10, 20, 30], index=["a", "b", "c"])
# DataFrame from dict
df = pd.DataFrame({
"name": ["Alice", "Bob", "Carol"],
"age": [25, 32, 28],
"salary": [70_000, 85_000, 90_000],
})
# From list of dicts
rows = [{"x": 1, "y": 2}, {"x": 3, "y": 4}]
df = pd.DataFrame(rows)
# From numpy array
df = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
# Range index, explicit dtypes at construction (pandas 2.x)
df = pd.DataFrame({"id": pd.array([1, 2, 3], dtype="int32"),
"score": pd.array([9.1, 8.4, 7.7], dtype="float32")})
Reading and Writing
# --- CSV ---
df = pd.read_csv("data.csv")
df = pd.read_csv("data.csv",
usecols=["id", "value"], # read subset of columns
dtype={"id": "int32"}, # specify dtypes upfront
parse_dates=["created_at"], # parse date column
na_values=["NA", "N/A", ""],
nrows=10_000, # read first N rows
chunksize=50_000, # returns iterator of chunks
)
df.to_csv("out.csv", index=False)
# --- Parquet (recommended for large data) ---
df = pd.read_parquet("data.parquet", columns=["id", "value"])
df.to_parquet("out.parquet", index=False, compression="snappy")
# --- Excel ---
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df.to_excel("out.xlsx", sheet_name="Results", index=False)
# --- SQL ---
import sqlalchemy
engine = sqlalchemy.create_engine("postgresql+psycopg2://user:pw@host/db")
df = pd.read_sql("SELECT * FROM orders WHERE status = 'active'", engine)
df = pd.read_sql_table("orders", engine)
df.to_sql("results", engine, if_exists="replace", index=False)
# --- JSON ---
df = pd.read_json("data.json", orient="records")
# --- Clipboard (handy for quick imports) ---
df = pd.read_clipboard()
Inspecting
df.head(10) # first N rows
df.tail(5) # last N rows
df.sample(5) # random sample
df.shape # (rows, cols) tuple
df.columns.tolist() # list of column names
df.dtypes # dtype of each column
df.info() # dtypes + non-null counts + memory
df.describe() # count/mean/std/min/quartiles/max for numerics
df.describe(include="all") # include object/category columns
df.describe(percentiles=[.1, .25, .5, .75, .9])
df["col"].value_counts() # frequency table
df["col"].value_counts(normalize=True) # relative frequencies
df["col"].nunique() # number of distinct values
df["col"].unique() # array of distinct values
df.memory_usage(deep=True) # bytes per column (deep=True counts strings)
df.memory_usage(deep=True).sum() # total bytes
Selecting
# Single column → Series
s = df["name"]
# Multiple columns → DataFrame
sub = df[["name", "salary"]]
# --- loc: label-based ---
df.loc[0] # row by label
df.loc[0:4] # rows 0 through 4 inclusive
df.loc[0:4, "name":"salary"] # row slice, column slice
df.loc[[0, 2, 4], ["name"]] # specific rows + columns
# --- iloc: position-based ---
df.iloc[0] # first row
df.iloc[:5] # first 5 rows
df.iloc[:5, [0, 2]] # first 5 rows, columns 0 and 2
df.iloc[-1] # last row
# --- Boolean mask ---
df[df["age"] > 28]
df[(df["age"] > 25) & (df["salary"] >= 80_000)]
df[~df["name"].str.startswith("A")]
# --- query (readable, safe with spaces in names) ---
df.query("age > 28 and salary >= 80_000")
threshold = 80_000
df.query("salary >= @threshold") # pass local variables with @
# --- isin ---
df[df["name"].isin(["Alice", "Carol"])]
df[~df["status"].isin(["cancelled", "refunded"])]
# --- where: keep shape, replace False cells ---
df["salary"].where(df["salary"] > 75_000, other=0)
# --- at / iat: fastest single-cell access ---
df.at[0, "name"] # label
df.iat[0, 0] # position
Adding and Dropping Columns
# Add derived column
df["bonus"] = df["salary"] * 0.10
# assign() for chaining (returns new DataFrame)
df = df.assign(
tax = df["salary"] * 0.30,
net = lambda x: x["salary"] - x["tax"],
)
# Insert at specific position
df.insert(1, "country", "USA")
# Drop columns
df = df.drop(columns=["bonus", "tax"])
# Drop rows by index
df = df.drop(index=[0, 3])
# Rename columns
df = df.rename(columns={"salary": "annual_salary", "age": "age_years"})
# Rename all at once
df.columns = ["id", "name", "age", "salary"]
# Lowercase all column names
df.columns = df.columns.str.lower().str.replace(" ", "_")
Missing Data
df.isna() # boolean mask
df.isna().sum() # count NaN per column
df.isna().sum() / len(df) * 100 # % missing per column
df.notna() # inverse
# Drop rows/columns
df.dropna() # drop rows with any NaN
df.dropna(how="all") # drop rows where ALL values are NaN
df.dropna(subset=["salary"]) # drop only if salary is NaN
df.dropna(thresh=3) # keep rows with at least 3 non-null values
df.dropna(axis=1) # drop columns with any NaN
# Fill
df.fillna(0)
df.fillna({"salary": df["salary"].median(), "name": "Unknown"})
df["col"].fillna(method="ffill") # forward fill (propagate last valid)
df["col"].fillna(method="bfill") # backward fill
df["col"].ffill() # pandas 2.x preferred alias
df["col"].bfill() # pandas 2.x preferred alias
# Interpolate numeric gaps
df["price"].interpolate(method="linear")
df["price"].interpolate(method="time") # for DatetimeIndex
# Replace arbitrary values
df.replace(-999, np.nan)
df.replace({"status": {"Y": True, "N": False}})
Dtypes and Memory Optimization
# Check dtypes
df.dtypes
# Cast single column
df["age"] = df["age"].astype("int16")
df["score"] = df["score"].astype("float32")
df["flag"] = df["flag"].astype(bool)
# Cast multiple columns at once
df = df.astype({"age": "int16", "salary": "float32"})
# Convert object → category (saves memory when cardinality is low)
df["status"] = df["status"].astype("category")
# Nullable integer types (pandas 2.x — allow NaN without float promotion)
df["count"] = df["count"].astype("Int32") # capital I = nullable
# Nullable string type
df["name"] = df["name"].astype("string") # StringDtype, not object
# Infer better dtypes automatically (pandas 2.x)
df = df.convert_dtypes() # upgrades to best nullable types
# Memory before vs after
before = df.memory_usage(deep=True).sum()
df = df.astype({"age": "int16", "salary": "float32",
"status": "category"})
after = df.memory_usage(deep=True).sum()
print(f"Saved {(before - after) / 1e6:.1f} MB")
Sorting
df.sort_values("salary", ascending=False)
df.sort_values(["department", "salary"], ascending=[True, False])
df.sort_values("score", na_position="last") # NaN at end (default)
df.sort_values("score", na_position="first")
# Sort index
df.sort_index()
df.sort_index(ascending=False)
# Rank (useful for percentiles)
df["rank"] = df["salary"].rank(method="dense", ascending=False)
# nsmallest / nlargest (faster than sort_values + head)
df.nlargest(10, "salary")
df.nsmallest(5, "price")
GroupBy
g = df.groupby("department")
# Single aggregation
g["salary"].mean()
g["salary"].sum()
# Multiple columns
g[["salary", "bonus"]].mean()
# Multiple aggregations on one column
g["salary"].agg(["mean", "median", "std", "count"])
# Named aggregations (pandas 0.25+, essential in 2.x)
result = df.groupby("department").agg(
avg_salary = ("salary", "mean"),
max_salary = ("salary", "max"),
headcount = ("id", "count"),
total_bonus = ("bonus", "sum"),
)
# Custom function
g["salary"].agg(lambda s: s.quantile(0.9))
# Multiple columns, multiple aggs
result = df.groupby("department").agg(
salary_mean = ("salary", "mean"),
salary_std = ("salary", "std"),
age_median = ("age", "median"),
)
# transform: broadcast group result back to original index (same length as df)
df["dept_avg_salary"] = df.groupby("department")["salary"].transform("mean")
df["salary_vs_dept"] = df["salary"] - df["dept_avg_salary"]
# filter: keep groups satisfying a condition
df.groupby("department").filter(lambda g: g["salary"].mean() > 75_000)
# apply: arbitrary function returning a DataFrame/Series per group
df.groupby("department").apply(lambda g: g.nlargest(3, "salary"),
include_groups=False)
# as_index=False → result looks like a flat DataFrame
df.groupby("department", as_index=False)["salary"].mean()
# Multiple grouping keys
df.groupby(["department", "level"])["salary"].mean().reset_index()
# Groupby on time periods
df.set_index("date").groupby(pd.Grouper(freq="ME"))["revenue"].sum()
Pivot, Pivot Table, and Melt
# pivot: unique values in index/columns required
df.pivot(index="date", columns="product", values="sales")
# pivot_table: handles duplicates via aggregation
pt = df.pivot_table(
index="region",
columns="quarter",
values="revenue",
aggfunc="sum",
fill_value=0,
margins=True, # adds row/column totals
)
# melt: wide → long (unpivot)
df_long = df.melt(
id_vars=["id", "name"],
value_vars=["q1", "q2", "q3", "q4"],
var_name="quarter",
value_name="revenue",
)
# stack / unstack (work with MultiIndex)
df.stack() # column level → row level
df.unstack() # innermost row level → column level
# crosstab: frequency table
pd.crosstab(df["region"], df["product"])
pd.crosstab(df["region"], df["product"], values=df["sales"], aggfunc="sum")
Merge, Join, and Concat
# merge (SQL-style joins)
out = pd.merge(left, right, on="id") # inner join (default)
out = pd.merge(left, right, on="id", how="left") # left join
out = pd.merge(left, right, on="id", how="right") # right join
out = pd.merge(left, right, on="id", how="outer") # full outer
out = pd.merge(left, right, on="id", how="cross") # cartesian product
# join on different column names
out = pd.merge(left, right, left_on="user_id", right_on="id")
# merge on multiple keys
out = pd.merge(left, right, on=["date", "product"])
# suffix for overlapping column names
out = pd.merge(left, right, on="id", suffixes=("_left", "_right"))
# join by index
out = left.join(right, how="left")
# concat: stack vertically or horizontally
out = pd.concat([df1, df2, df3], ignore_index=True) # vertical stack
out = pd.concat([df1, df2], axis=1) # horizontal stack
out = pd.concat([df1, df2], ignore_index=True, sort=False)
# concat with keys → MultiIndex
out = pd.concat([df1, df2], keys=["train", "test"])
# Merge cheat table
# how | rows kept
# --------- | -----------------------
# "inner" | only matching rows
# "left" | all left + matching right
# "right" | all right + matching left
# "outer" | all rows from both
Apply, Map, and Vectorization
# --- map: element-wise on a Series ---
df["grade"] = df["score"].map({90: "A", 80: "B", 70: "C"})
df["name_upper"] = df["name"].map(str.upper)
# --- apply on Series: same as map for functions ---
df["log_salary"] = df["salary"].apply(np.log)
# --- apply on DataFrame: row-wise or column-wise ---
df.apply(np.sum, axis=0) # column-wise (default)
df.apply(np.sum, axis=1) # row-wise
# --- applymap / map (DataFrame element-wise, pandas 2.1+) ---
df.map(lambda x: round(x, 2)) # pandas 2.1+ (applymap is deprecated)
# --- PREFER vectorized operations over apply ---
# Slow (apply):
df["tax"] = df["salary"].apply(lambda x: x * 0.30)
# Fast (vectorized):
df["tax"] = df["salary"] * 0.30
# Slow (apply row-wise):
df["full_name"] = df.apply(lambda r: r["first"] + " " + r["last"], axis=1)
# Fast (vectorized string concat):
df["full_name"] = df["first"] + " " + df["last"]
# np.where: vectorized if/else
df["category"] = np.where(df["salary"] > 80_000, "senior", "junior")
# np.select: vectorized multi-condition
conditions = [df["score"] >= 90, df["score"] >= 75, df["score"] >= 60]
choices = ["A", "B", "C"]
df["grade"] = np.select(conditions, choices, default="F")
String Methods
s = df["name"]
s.str.lower()
s.str.upper()
s.str.title()
s.str.strip() # strip whitespace
s.str.strip("$") # strip specific chars
s.str.len() # length of each string
s.str.contains("alice", case=False, na=False) # returns boolean mask
s.str.startswith("A")
s.str.endswith("n")
s.str.replace("old", "new", regex=False)
s.str.replace(r"\d+", "", regex=True)
s.str.split(",") # returns Series of lists
s.str.split(",", expand=True) # returns DataFrame of columns
s.str.split(",", n=1, expand=True) # split at most once
s.str.extract(r"(\d{4})") # first capture group → DataFrame col
s.str.extractall(r"(\d+)") # all matches → MultiIndex DataFrame
s.str.get(0) # first element of each list (after split)
s.str[0] # first character
s.str[1:4] # slice
s.str.cat(sep=", ") # concatenate all strings into one
# Pad and zfill
s.str.pad(10, side="right", fillchar=" ")
df["zip"].str.zfill(5)
Datetime
# Parse on read
df = pd.read_csv("data.csv", parse_dates=["date"])
# Convert after load
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = pd.to_datetime(df["date"], errors="coerce") # NaT on failure
# dt accessor
df["date"].dt.year
df["date"].dt.month
df["date"].dt.day
df["date"].dt.day_name() # "Monday", "Tuesday" ...
df["date"].dt.hour
df["date"].dt.minute
df["date"].dt.date # Python date objects
df["date"].dt.is_month_end
df["date"].dt.is_quarter_start
df["date"].dt.floor("h") # floor to hour
df["date"].dt.normalize() # floor to midnight
df["date"].dt.tz_localize("UTC")
df["date"].dt.tz_convert("US/Eastern")
# Timedelta arithmetic
df["duration"] = df["end_date"] - df["start_date"]
df["duration"].dt.days
df["duration"].dt.total_seconds()
# Date range
pd.date_range("2024-01-01", periods=12, freq="ME") # month end
pd.date_range("2024-01-01", "2024-12-31", freq="W") # weekly
# Resample (requires DatetimeIndex or specify on= column)
ts = df.set_index("date")
ts.resample("ME")["revenue"].sum() # monthly total
ts.resample("W")["price"].mean() # weekly mean
ts.resample("QE")["sales"].agg(["sum", "mean"])
# Rolling window
df["rolling_7d_avg"] = df["price"].rolling(window=7).mean()
df["rolling_30d_std"] = df["price"].rolling(window=30).std()
df["ewm_alpha0.3"] = df["price"].ewm(alpha=0.3).mean()
# Expanding (cumulative)
df["cumsum"] = df["revenue"].expanding().sum()
df["cummean"] = df["revenue"].expanding().mean()
# Shift
df["prev_day_price"] = df["price"].shift(1)
df["next_day_price"] = df["price"].shift(-1)
df["pct_change"] = df["price"].pct_change()
Deduplication
df.duplicated() # boolean mask: True for duplicate rows
df.duplicated(subset=["email"]) # based on specific columns
df.duplicated(keep="first") # mark all dupes except first occurrence
df.duplicated(keep="last") # mark all dupes except last
df.duplicated(keep=False) # mark ALL duplicates
df.drop_duplicates()
df.drop_duplicates(subset=["email"])
df.drop_duplicates(subset=["email", "date"], keep="last")
# Count duplicates
df.duplicated().sum()
Renaming
# Rename specific columns
df.rename(columns={"old_name": "new_name", "x": "feature_x"})
# Rename index labels
df.rename(index={0: "row_a", 1: "row_b"})
# Rename via function
df.rename(columns=str.upper)
df.rename(columns=lambda c: c.strip().lower().replace(" ", "_"))
# Reset index (turn index into a column)
df.reset_index()
df.reset_index(drop=True) # discard old index, don't add as column
# Set a column as index
df.set_index("id")
df.set_index(["year", "month"]) # MultiIndex
Method Chaining
# All pandas operations that return DataFrames support chaining.
# Use assign, query, pipe, and rename instead of in-place mutations.
result = (
pd.read_csv("orders.csv", parse_dates=["order_date"])
.rename(columns=str.lower)
.assign(
order_date = lambda x: pd.to_datetime(x["order_date"]),
revenue = lambda x: x["quantity"] * x["unit_price"],
year = lambda x: x["order_date"].dt.year,
)
.query("status != 'cancelled'")
.dropna(subset=["customer_id"])
.astype({"region": "category"})
.groupby(["year", "region"], as_index=False)
.agg(
total_revenue = ("revenue", "sum"),
order_count = ("order_id", "count"),
)
.sort_values("total_revenue", ascending=False)
.reset_index(drop=True)
)
# pipe: call any function in the chain
def add_pct_of_total(df, col):
df = df.copy()
df[col + "_pct"] = df[col] / df[col].sum() * 100
return df
result = (
df.groupby("department", as_index=False)["salary"].sum()
.pipe(add_pct_of_total, "salary")
)
Avoiding SettingWithCopyWarning
# The warning fires when pandas cannot tell if you are mutating a copy or a view.
# BAD — may silently fail or warn:
df[df["age"] > 30]["salary"] = 99_000
# GOOD — use .loc with the original DataFrame:
df.loc[df["age"] > 30, "salary"] = 99_000
# GOOD — work on an explicit copy if you need a separate object:
subset = df[df["age"] > 30].copy()
subset["salary"] = 99_000
# GOOD — assign() always returns a new DataFrame:
df = df.assign(salary=lambda x: np.where(x["age"] > 30, 99_000, x["salary"]))
# pandas 2.0 introduced Copy-on-Write (CoW) — enable it now to future-proof code:
pd.options.mode.copy_on_write = True # will be the default in pandas 3.0
# With CoW enabled, every setitem on a slice automatically copies → no more ambiguity.
Useful One-Liners
# Frequency table with %
df["status"].value_counts(normalize=True).mul(100).round(1).astype(str) + "%"
# Top N per group
df.groupby("dept").apply(lambda g: g.nlargest(3, "salary"), include_groups=False)
# Lag features for time series
df[[f"lag_{n}" for n in range(1, 4)]] = df["value"].shift([1, 2, 3])
# Cumulative share (Pareto)
vc = df["product"].value_counts()
vc_df = vc.reset_index()
vc_df["cum_pct"] = vc_df["count"].cumsum() / vc_df["count"].sum()
# Flatten MultiIndex columns after groupby
df.columns = ["_".join(c).strip("_") for c in df.columns]
# Profile memory-heavy columns
df.memory_usage(deep=True).sort_values(ascending=False).head(10)
# Clip outliers
df["salary"] = df["salary"].clip(lower=30_000, upper=200_000)
# Correlation matrix
df[numeric_cols].corr().style.background_gradient(cmap="RdYlGn")
# Quick scatter via pandas (wraps matplotlib)
df.plot.scatter(x="age", y="salary", alpha=0.4)
# Export column types for documentation
df.dtypes.reset_index().rename(columns={"index": "column", 0: "dtype"}).to_markdown()