Series, DataFrames & I/O
pandas: Series, DataFrames & I/O pandas is the standard Python library for data manipulation and analysis. A DataFrame is a table of labeled columns; a Series i…
pandas: Series, DataFrames & I/O
pandas is the standard Python library for data manipulation and analysis. A DataFrame is a table of labeled columns; a Series is a single labeled column. pandas is built on NumPy and integrates with the entire Python data stack.
Series
import pandas as pd
import numpy as np
# Create Series
s = pd.Series([1, 2, 3, 4, 5])
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s = pd.Series({'x': 10, 'y': 20, 'z': 30})
# Access
s[0] # by position
s['a'] # by label
s.iloc[0] # positional
s.loc['a'] # label-based
# Operations (vectorized)
s * 2
s[s > 2] # boolean filter
s.mean(), s.sum(), s.max(), s.min()
s.value_counts()
s.sort_values()
s.rank()
s.cumsum()
s.diff()
s.pct_change()
s.map(lambda x: x ** 2)
s.apply(str)
s.astype('float32')DataFrames
# Create DataFrame
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [30, 25, 35],
'salary': [70000, 55000, 90000],
})
df = pd.DataFrame(records_list) # from list of dicts
df = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
# Inspect
df.shape # (rows, cols)
df.dtypes # column types
df.info() # types + non-null counts
df.describe() # stats: count, mean, std, min, quartiles, max
df.head(10) # first 10 rows
df.tail(5) # last 5 rows
df.sample(5) # random 5 rows
df.columns.tolist()
df.index.tolist()
# Add/modify columns
df['bonus'] = df['salary'] * 0.1
df['full_name'] = df['first'] + ' ' + df['last']
df.assign(tax=lambda x: x['salary'] * 0.3) # non-mutating
# Delete columns
df.drop(columns=['bonus'], inplace=True)
df.drop(columns=['a', 'b'])
# Rename
df.rename(columns={'name': 'full_name', 'age': 'years'}, inplace=True)
# Sort
df.sort_values('salary', ascending=False)
df.sort_values(['dept', 'salary'], ascending=[True, False])
df.sort_index()Reading & Writing Data
# CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv',
sep=';',
header=0, # row to use as column names
index_col='id', # column to use as index
usecols=['name', 'salary'], # only load these columns
dtype={'salary': 'float32'},
parse_dates=['hire_date'],
na_values=['NA', 'NULL', '-'],
nrows=1000, # read only first 1000 rows
encoding='utf-8',
chunksize=10000, # iterate in chunks for large files
)
df.to_csv('output.csv', index=False)
# Excel
df = pd.read_excel('report.xlsx', sheet_name='Sheet1')
df.to_excel('output.xlsx', sheet_name='Results', index=False)
# JSON
df = pd.read_json('data.json')
df = pd.read_json('data.json', orient='records')
df.to_json('output.json', orient='records', indent=2)
# Parquet (columnar, fast, compressed)
df = pd.read_parquet('data.parquet')
df.to_parquet('output.parquet', compression='snappy')
# SQL
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@host/db')
df = pd.read_sql('SELECT * FROM users WHERE active = TRUE', engine)
df = pd.read_sql_table('orders', engine)
df.to_sql('processed_orders', engine, if_exists='append', index=False, chunksize=5000)
# Large CSV with chunking
chunk_list = []
for chunk in pd.read_csv('large.csv', chunksize=100_000):
processed = chunk[chunk['status'] == 'active']
chunk_list.append(processed)
df = pd.concat(chunk_list, ignore_index=True)