pandas

Series, DataFrames & I/O

pandas: Series, DataFrames & I/O pandas is the standard Python library for data manipulation and analysis. A DataFrame is a table of labeled columns; a Series i…

pandas: Series, DataFrames & I/O

pandas is the standard Python library for data manipulation and analysis. A DataFrame is a table of labeled columns; a Series is a single labeled column. pandas is built on NumPy and integrates with the entire Python data stack.

Series

import pandas as pd
import numpy as np

# Create Series
s = pd.Series([1, 2, 3, 4, 5])
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s = pd.Series({'x': 10, 'y': 20, 'z': 30})

# Access
s[0]          # by position
s['a']        # by label
s.iloc[0]     # positional
s.loc['a']    # label-based

# Operations (vectorized)
s * 2
s[s > 2]      # boolean filter
s.mean(), s.sum(), s.max(), s.min()
s.value_counts()
s.sort_values()
s.rank()
s.cumsum()
s.diff()
s.pct_change()
s.map(lambda x: x ** 2)
s.apply(str)
s.astype('float32')

DataFrames

# Create DataFrame
df = pd.DataFrame({
    'name':   ['Alice', 'Bob', 'Charlie'],
    'age':    [30, 25, 35],
    'salary': [70000, 55000, 90000],
})

df = pd.DataFrame(records_list)          # from list of dicts
df = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])

# Inspect
df.shape          # (rows, cols)
df.dtypes         # column types
df.info()         # types + non-null counts
df.describe()     # stats: count, mean, std, min, quartiles, max
df.head(10)       # first 10 rows
df.tail(5)        # last 5 rows
df.sample(5)      # random 5 rows
df.columns.tolist()
df.index.tolist()

# Add/modify columns
df['bonus'] = df['salary'] * 0.1
df['full_name'] = df['first'] + ' ' + df['last']
df.assign(tax=lambda x: x['salary'] * 0.3)   # non-mutating

# Delete columns
df.drop(columns=['bonus'], inplace=True)
df.drop(columns=['a', 'b'])

# Rename
df.rename(columns={'name': 'full_name', 'age': 'years'}, inplace=True)

# Sort
df.sort_values('salary', ascending=False)
df.sort_values(['dept', 'salary'], ascending=[True, False])
df.sort_index()

Reading & Writing Data

# CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv',
    sep=';',
    header=0,                   # row to use as column names
    index_col='id',             # column to use as index
    usecols=['name', 'salary'], # only load these columns
    dtype={'salary': 'float32'},
    parse_dates=['hire_date'],
    na_values=['NA', 'NULL', '-'],
    nrows=1000,                 # read only first 1000 rows
    encoding='utf-8',
    chunksize=10000,            # iterate in chunks for large files
)
df.to_csv('output.csv', index=False)

# Excel
df = pd.read_excel('report.xlsx', sheet_name='Sheet1')
df.to_excel('output.xlsx', sheet_name='Results', index=False)

# JSON
df = pd.read_json('data.json')
df = pd.read_json('data.json', orient='records')
df.to_json('output.json', orient='records', indent=2)

# Parquet (columnar, fast, compressed)
df = pd.read_parquet('data.parquet')
df.to_parquet('output.parquet', compression='snappy')

# SQL
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@host/db')
df = pd.read_sql('SELECT * FROM users WHERE active = TRUE', engine)
df = pd.read_sql_table('orders', engine)
df.to_sql('processed_orders', engine, if_exists='append', index=False, chunksize=5000)

# Large CSV with chunking
chunk_list = []
for chunk in pd.read_csv('large.csv', chunksize=100_000):
    processed = chunk[chunk['status'] == 'active']
    chunk_list.append(processed)
df = pd.concat(chunk_list, ignore_index=True)

pandas

Indexing, Selection & GroupBy

pandas: Indexing, Selection & GroupBy Selection & Filtering # Column selection df['name'] # Series df[['name', 'salary']] # DataFrame (list of columns) # Row se…

pandas: Indexing, Selection & GroupBy

Selection & Filtering

# Column selection
df['name']                          # Series
df[['name', 'salary']]              # DataFrame (list of columns)

# Row selection by position (iloc)
df.iloc[0]                          # first row as Series
df.iloc[0:5]                        # first 5 rows
df.iloc[:, 0:2]                     # all rows, first 2 columns
df.iloc[[0, 2, 4]]                  # specific rows
df.iloc[0, 1]                       # row 0, col 1 (scalar)

# Row selection by label (loc)
df.loc[0]                           # row with index label 0
df.loc[0:5]                         # rows 0–5 inclusive (label-based!)
df.loc[0, 'salary']                 # specific cell
df.loc[[0, 2], ['name', 'salary']]  # specific rows and columns

# Boolean filtering
df[df['salary'] > 60000]
df[(df['salary'] > 60000) & (df['age'] < 35)]
df[(df['dept'] == 'Engineering') | (df['dept'] == 'Design')]
df[df['status'].isin(['active', 'pending'])]
df[~df['name'].str.contains('Alice')]  # negate

# query() — readable string-based filtering
df.query('salary > 60000 and age < 35')
df.query('dept in ["Engineering", "Design"]')
df.query('name == @name_variable')   # use variable with @

# where / mask
df.where(df > 0, other=0)           # keep values > 0, replace rest with 0
df.mask(df > 0, other=np.nan)       # replace values > 0 with NaN

Apply & Vectorized Operations

# apply — row or column-wise (slow; prefer vectorized)
df['grade'] = df['score'].apply(lambda x: 'A' if x >= 90 else 'B' if x >= 80 else 'C')
df['full_name'] = df.apply(lambda row: f"{row['first']} {row['last']}", axis=1)

# map — element-wise on Series
df['dept_code'] = df['dept'].map({'Engineering': 'ENG', 'Design': 'DES'})

# Vectorized string operations (.str)
df['name'].str.upper()
df['name'].str.lower()
df['email'].str.split('@').str[1]   # domain
df['name'].str.startswith('A')
df['desc'].str.contains('python', case=False, na=False)
df['name'].str.strip()
df['phone'].str.replace(r'[^0-9]', '', regex=True)
df['name'].str.len()

# Date operations (.dt)
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.day_name()
df['days_since'] = (pd.Timestamp.now() - df['date']).dt.days

GroupBy & Aggregation

# GroupBy basics
grouped = df.groupby('dept')
grouped['salary'].mean()              # mean salary per dept
grouped['salary'].agg(['mean', 'min', 'max', 'count'])

# Multiple columns
df.groupby(['dept', 'level'])['salary'].mean()

# agg with different functions per column
df.groupby('dept').agg({
    'salary': ['mean', 'sum'],
    'age': 'mean',
    'name': 'count',
})

# Named aggregations (pandas 0.25+)
df.groupby('dept').agg(
    avg_salary=('salary', 'mean'),
    total_salary=('salary', 'sum'),
    headcount=('name', 'count'),
    max_age=('age', 'max'),
)

# Custom aggregation
df.groupby('dept')['salary'].agg(lambda x: x.quantile(0.9))  # 90th percentile

# transform — broadcast result back to original index
df['dept_avg_salary'] = df.groupby('dept')['salary'].transform('mean')
df['salary_deviation'] = df['salary'] - df['dept_avg_salary']

# filter — keep groups satisfying a condition
df.groupby('dept').filter(lambda x: x['salary'].mean() > 70000)

# Pivot table
pd.pivot_table(df,
    values='salary',
    index='dept',
    columns='level',
    aggfunc='mean',
    fill_value=0,
)

# crosstab
pd.crosstab(df['dept'], df['gender'], margins=True)

pandas

Cleaning, Merging & Performance

pandas: Cleaning, Merging & Performance Missing Data # Detect missing values df.isnull() # boolean DataFrame df.isnull().sum() # count per column df.isnull().su…

pandas: Cleaning, Merging & Performance

Missing Data

# Detect missing values
df.isnull()                         # boolean DataFrame
df.isnull().sum()                   # count per column
df.isnull().sum() / len(df) * 100  # % missing per column

# Drop missing
df.dropna()                         # drop rows with ANY null
df.dropna(subset=['name', 'email']) # drop rows with null in specific columns
df.dropna(how='all')                # drop rows where ALL values are null
df.dropna(thresh=3)                 # keep rows with at least 3 non-null values
df.dropna(axis=1)                   # drop columns with any null

# Fill missing values
df.fillna(0)
df.fillna({'salary': df['salary'].median(), 'dept': 'Unknown'})
df['salary'].fillna(df['salary'].mean(), inplace=True)
df.ffill()                          # forward fill (propagate last valid)
df.bfill()                          # backward fill
df['salary'].interpolate(method='linear')

# Replace specific values
df.replace(-999, np.nan)
df.replace({'dept': {'Eng': 'Engineering', 'Des': 'Design'}})

Merging & Joining

# merge — like SQL JOIN
pd.merge(df_left, df_right, on='id', how='inner')   # inner, left, right, outer
pd.merge(df_left, df_right, left_on='user_id', right_on='id')  # different key names
pd.merge(df_left, df_right, on=['dept', 'level'])   # multiple keys
pd.merge(df_left, df_right, on='id', how='left', suffixes=('_left', '_right'))  # duplicate column names

# concat — stack DataFrames
pd.concat([df1, df2])               # stack rows (union)
pd.concat([df1, df2], ignore_index=True)  # reset index
pd.concat([df1, df2], axis=1)       # stack columns side by side

# join — index-based merge
df1.join(df2, how='left')
df1.join(df2, on='dept_id')        # join on a column to other's index

# Update values from another DataFrame
df1.update(df2)                     # overwrite non-null values from df2

# merge_asof — time-series merge (nearest key)
pd.merge_asof(events, prices, on='timestamp', direction='backward')

Reshaping

# melt — wide to long (unpivot)
df_long = pd.melt(df,
    id_vars=['name', 'dept'],          # columns to keep
    value_vars=['q1', 'q2', 'q3', 'q4'],
    var_name='quarter',
    value_name='revenue',
)

# pivot — long to wide
df_wide = df_long.pivot(index='name', columns='quarter', values='revenue')
df_wide = df_long.pivot_table(index='name', columns='quarter', values='revenue', aggfunc='sum')

# stack / unstack (multi-index)
df.stack()      # columns → index level
df.unstack()    # innermost index level → columns

# explode — one element per row from list column
df_exploded = df.explode('tags')

Performance Tips

Use categorical dtype for low-cardinality string columns: df["dept"] = df["dept"].astype("category") — saves 5-10x memory.
Downcast numeric types: pd.to_numeric(df["age"], downcast="integer") — int64 → int8 saves 8x.
Avoid loops: use vectorized operations, .str, .dt, .apply with caution. Loops kill performance.
Read only needed columns: pd.read_csv(usecols=[...]) — avoids loading unused data.
Use read_parquet over read_csv for repeated analysis — 10-100x faster loading.
eval() and query(): use pandas.eval("a + b") for large arithmetic — avoids creating intermediate arrays.
inplace=True rarely helps performance and makes code harder to chain. Return new DataFrames instead.
polars: if pandas is too slow, polars is a Rust-based DataFrame library that is 10-100x faster for most operations.

pandas notes for developers