Pandas
https://anaconda.org/TomAugspurger/pandas-performance/notebook
IO
Create dataframe
import pandas as pd
df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
df = pd.DataFrame(data={'time': pd.date_range('2000', freq='D', periods=3)})
df = pd.DataFrame.from_dict({0: 1}, orient="index", columns=["col1"])#.T
df = pd.DataFrame([1, "a", 1.5], index=list("abc")).T
Read parquet
#filters = [("state", "==", "California")]
#filters = [("FIPS", "in", [6, 36])]
#filters = [("datetime", ">=", datetime(2023, 10, 4, 11, 30, 0, tzinfo=pytz.UTC))]
pd.read_parquet("file.parquet", filters=filtes)
Read JSON
import json
import pandas as pd
df = pd.read_json("out.json", orient="index")
Normalize
with open("out.json") as f:
data = json.load(f)
df = pd.json_normalize(
data,
record_path="counties",
meta=["state", "shortname", ["info", "governor"]],
sep="_",
)
Read csv
df = pd.read_csv(
    "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
)
df = pd.read_csv(
    "runs.csv",
    engine="pyarrow",
    parse_dates=["Date"],
    dtype={"Distance (km)": "float32"},
)
for col in ["Time", "5 km pace", "Km pace"]:
    df[col] = pd.to_timedelta(df[col])
df
Read sql
Download the Oracle Instant Client from https://www.oracle.com/database/technologies/instant-client/winx64-64-downloads.html and extract it to C:\oracle\
$ pip install cx_Oracle
import pandas as pd
import cx_Oracle
import os
os.chdir(r"C:\oracle\instantclient_19_5")
dsn = cx_Oracle.makedsn('HOST', 'PORT', service_name='SERVICENAME')
u = "USERNAME"
p = "PASSWORD"
con_oracle = cx_Oracle.connect(user=u, password=p, dsn=dsn)
df = pd.read_sql('select * from TABLE WHERE ROWNUM < 5', con_oracle)
con_oracle.close()
Read sql query
with open("file.sql") as f:
    df = pd.read_sql_query(f.read(), con)
Read xml
df = pd.read_xml("file.xml")
Read html
_l = pd.read_html("https://www.cdc.gov/growthcharts/html_charts/wtage.htm")
df = _l[1]
To xarray
df.to_xarray()
To parquet
import pyarrow as pa
d = {"a": pa.int32(), "b": pa.string(), "c": pa.float32()}
schema = pa.schema(d)
df.to_parquet("tmp.parquet", schema=schema)
df.to_parquet("tmp.parquet", allow_truncated_timestamps=True, coerce_timestamps="ms")
To csv
df.to_csv(index=False)
To json
df.to_json(orient="index", date_format="iso")
To excel
df.to_excel("file.xlsx")
Dataframe to series
df.squeeze()
Dtypes
Convert dtypes automatically / manually
df.convert_dtypes()
df.astype(DTYPE)
Specify dtypes
d_fmt, s_fmt, i_fmt, f_fmt, c_fmt = "datetime64[D]", "string", "Int64", "float64", "category"
Convert multiple columns to categorical
for col in cat_cols:
    df[col] = df[col].astype("category")
Convert column to ordered categorical
from pandas.api.types import CategoricalDtype
cat_type = CategoricalDtype(categories=[0, 1, 2, 3], ordered=True)
df[col].astype(cat_type)
Convert objects to integers
# table_dtypes: dict mapping column name -> dtype string (see "Specify dtypes" above)
loc_int = [i for i, v in enumerate(list(table_dtypes.values())) if v == i_fmt]
df.iloc[:, loc_int].apply(pd.to_numeric)  # can add errors='coerce' to .to_numeric()
Dates
df["COL"].values.astype("datetime64[s]")
Index
Make an index a column
df.reset_index()
Make a MultiIndex a column
g.index.to_flat_index().to_frame()
Append a column to the index
df.set_index('COL', append=True)
First index of a series
s.first_valid_index()
Querying
Query a multiindex
df.loc[(1, 2)]
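A minimal sketch, assuming a toy frame with a two-level integer index:
idx = pd.MultiIndex.from_product([[1, 2], [1, 2]], names=["lvl0", "lvl1"])
mdf = pd.DataFrame({"val": range(4)}, index=idx)
mdf.loc[(1, 2)]  # the row where lvl0 == 1 and lvl1 == 2
mdf.loc[pd.IndexSlice[1, :], :]  # all rows where lvl0 == 1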
Between
df[df['COL'].between(1, 2, inclusive="neither")]
Multiple where conditions
df[(df['DATE'] <= 'YYYY-MM-DD') & (df['FLAG'] == 'Y')]
Drop columns which contain all 0's
df.loc[:, (df != 0).any(axis=0)]
Drop a column
df.drop(columns=['COLUMN_NAME'], inplace=True)
Split a dataframe into duplicated and not duplicated
single_df = df.drop_duplicates(subset="COL", keep=False)
dup_df = df[df.duplicated(subset="COL", keep=False)]
Find columns not in another dataframe
df.columns[~df.columns.isin(df2.columns)]
Create a column based on if another string column contains an element
df['COL2'] = df['COL'].str.contains('a')
Use query
df_students.query('Name == "Alex"')
value = ["a", "b", "c"]
df_students.query("Name in @value")
Find columns using wildcard
df.filter(like='ABC ', axis=1)
df.filter(regex="NET")
Find the index of a column
df.columns.get_loc("COL")
Keep/remove certain dtypes
df.select_dtypes(include=np.number)
df.select_dtypes(include="datetimetz")
df.select_dtypes(exclude="datetimetz")
Keep rows which are not null based on one column
df[pd.notna(df["COL"])]
Keep rows which are not null based on multiple columns
df.dropna(subset=["COL1", "COL2"])
ETL utils
Rename columns
df.rename(columns={"A": "a", "B": "b"})
Append str to some column names
df.columns = [c + "_new" if c in ["COL2", "COL4"] else c for c in df.columns]
Replace value in column
df["COL"] = df["COL"].replace(to_replace=val, value=new_val)
Replace characters in all column names
df.columns = df.columns.str.replace("[()]", "_", regex=True)
List of values to single columns / split list to new columns
df["COL2"] = df["COL"].apply(lambda x: x[0])
df["COL2"] = df["COL"].apply(lambda x: x[1])
Sorting
df.sort_values(by=['COL1', 'COL2'], ascending=[True, False])
N largest values for a column
df.nlargest(10, "COL")
Mapping
Convert values (similar to a case statement)
df["COL"].map({'A':0, 'B':1})
Joining
Use merge if you are not joining on the index.
Rename the join column first if you don't want both key columns in the result (see the sketch below).
df.merge(df2, left_on=['DATE'], right_on=['DDATE'])
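A minimal sketch of the rename-then-merge pattern, assuming df has a DATE column and df2 a DDATE column:
df.merge(df2.rename(columns={"DDATE": "DATE"}), on="DATE")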
inner join
df.merge(df2, on='UUID')
outer join
l.merge(r, left_on=['lcol'], right_on=['rcol'], how="outer")
Append column to the right of the dataframe
pd.concat([df.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
Append column to the left of the dataframe
df.insert(0, 'COL', value)
Concat
df = pd.DataFrame()
for i in range(0, 1):
    df = pd.concat([df, etl_df])  # etl_df: the frame produced on each iteration
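Concatenating inside a loop copies the whole frame on every pass; collecting the pieces in a list and calling pd.concat once is usually faster. A minimal sketch, where make_etl_df is a hypothetical stand-in for whatever produces each chunk:
frames = [make_etl_df(i) for i in range(10)]  # make_etl_df is hypothetical
df = pd.concat(frames, ignore_index=True)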
Update value
df.loc[index, ["col"]]
Feature engineering
One-hot encode
pd.get_dummies(df, columns=['COL'])
Concat columns
import itertools
cat_cols = ['a', 'b', 'c']
pairs = list(itertools.combinations(cat_cols, 2))
for c1, c2 in pairs:
    df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)
Cyclical features
col_scale = [("DAY", 7), ("MONTH", 12)]
for col, scale in col_scale:
    df[col + '_SIN'] = np.sin(2 * np.pi * df[col] / scale)
    df[col + '_COS'] = np.cos(2 * np.pi * df[col] / scale)
df['lon_sin'] = np.sin(df["lon"])  # np.sin/np.cos expect radians; use np.deg2rad if lon is in degrees
df['lon_cos'] = np.cos(df["lon"])
Group by
Do different aggregations
df.agg({'COL1': 'sum', 'COL2': 'nunique', 'COL3': 'first'})
If you have a MultiIndex
df = df.groupby(by=["MULTIINDEX1", "MULTIINDEX2"], as_index=False).agg(["min", "max"])
# df.columns = ["_".join(col).rstrip("_") for col in df.columns.values]
UUID by column
gdf["GROUP_UUID"] = gdf.groupby(by="GROUP").cumcount()
Apply a lambda function
Apply a scikit-learn function to two columns
from sklearn.metrics import roc_auc_score
def df_roc_auc_score(y, pred):
    try:
        return roc_auc_score(y, pred)
    except ValueError:  # e.g. only one class present in the group
        return np.nan
df.groupby('COLUMN').apply(
    lambda df: df_roc_auc_score(df['COLUMN1'].astype(float), df['COLUMN2'].astype(float))
).reset_index()
or
def mean_absolute_error_group(df):
    return mean_absolute_error(df["y_true"], df["y_pred"])
df.groupby('COLUMN').apply(mean_absolute_error_group)
or
df.groupby('COLUMN').apply(lambda df: mean_squared_error(df.COLUMN1, df.COLUMN2))
or
np.vectorize(df_roc_auc_score)(df["y_true"], df["y_pred"])  # note: np.vectorize calls the function elementwise
Apply scikit-learn regression metrics
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
    explained_variance_score, max_error, mean_squared_log_error, median_absolute_error,
    r2_score, mean_absolute_percentage_error)
def regression_error_metrics_group(df):
    out = {}
    out['Mean'] = df["y_true"].mean()
    out['Mean Error'] = np.average(df["y_pred"] - df["y_true"])
    out['Mean Absolute Error'] = mean_absolute_error(df["y_true"], df["y_pred"])
    out['Mean Absolute Percentage Error'] = mean_absolute_percentage_error(df["y_true"], df["y_pred"])
    out['Mean Squared Error'] = mean_squared_error(df["y_true"], df["y_pred"])
    out['Root Mean Squared Error'] = mean_squared_error(df["y_true"], df["y_pred"], squared=False)
    out['Max Error'] = max_error(df["y_true"], df["y_pred"])
    out['Median Absolute Error'] = median_absolute_error(df["y_true"], df["y_pred"])
    out['Mean Squared Log Error'] = mean_squared_log_error(df["y_true"], df["y_pred"])
    return pd.Series(out, index=['Mean', 'Mean Error', 'Mean Absolute Error',
                                 'Mean Absolute Percentage Error', 'Mean Squared Error',
                                 'Root Mean Squared Error', 'Max Error',
                                 'Median Absolute Error', 'Mean Squared Log Error'])
df.groupby('COLUMN').apply(regression_error_metrics_group)
Transform
df["minute_since_FIELD_start"] = df.groupby("FIELD")["time"].transform(lambda x: (x - x.min()).dt.seconds // 60 % 60)
Resample
48 hours
df.set_index("Timestamp")["COL"].resample("48H").max()
Rolling
def mad(x):
    return np.fabs(x - x.mean()).mean()
df = pd.DataFrame({"A": np.random.randn(100_000)},
                  index=pd.date_range('1/1/2000', periods=100_000, freq='T')
                  ).cumsum()
df.rolling(10).apply(mad, engine="numba", raw=True)
Analysis
Correlation between all columns
df.corr()
Quantile/percentile of dataframe
df.quantile(0.1)
Round a dataframe to 2 d.p
df.round(2)
Describe dataframe
df["date"].describe(datetime_is_numeric=True)
Memory of a dataframe
from dask.utils import format_bytes
df.memory_usage(deep=True).apply(format_bytes)
df.info()
Count values
df.value_counts()
Split string
s.str.split('_', expand=True)
df[["a", "b"]] = s.str.split("-", expand=True)
df["path"].str.split("/").str[-1]
Concat two columns
df["NEW_COL"] = df["COL"] + df["DATE_COL"].astype(str)
For loops
Loop over rows
for index, row in df.iterrows():
    row["COL"]
for row in df.itertuples(index=False):
    row.COL
Create a count on groups of continuous Trues
df["COL"] = 0
last_value = True
counter = 1
for row in df.itertuples():
index = row.Index
value = row.bool_col
if index == 0 and not value:
counter = 0
if value:
if not last_value:
counter += 1
df.loc[index, "COL"] = counter
else:
df.loc[index, "COL"] = counter
last_value = True
else:
last_value = False
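A vectorized sketch of the same idea, assuming bool_col is boolean: count False-to-True edges with cumsum to number each run of consecutive Trues (rows outside a run get 0, matching the loop's default):
s = df["bool_col"]
run_id = (s & ~s.shift(fill_value=False)).cumsum()  # increments at each False -> True edge
df["COL"] = run_id.where(s, 0)  # 0 outside runs, run number inside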
Datetime
Datetime from string
pd.to_datetime(t, format="%m/%d/%Y (%H:00)")  # pd.Timedelta("1 hour")
pd.to_datetime("2015020100", format="%Y%m%d%H")
pd.to_datetime(df["COL"], unit="s")
Round to nearest 3 hours
s.round('3H')
Round up to nearest 3 hours
s.ceil("3H")  # works on Timestamp and Timedelta; use s.dt.ceil("3H") for a Series
Add or subtract an integer series to/from a date series
df['DATE'] - pd.to_timedelta(df['WEEKS'] * 7, unit='D')
Current date
pd.to_datetime('now')
pd.Timestamp('now').strftime("%Y-%m-%d")
pd.Timestamp('now').isoformat()
df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
pd.Timestamp('now').to_pydatetime()
Add a timezone
pd.to_datetime("2020-01-01 00:00:00").tz_localize('US/Eastern')
pd.to_datetime("2020-01-01 00:00:00").tz_localize('UTC').tz_convert("America/Oregon")
Remove a timezone
df["Timestamp"].dt.tz_localize(None)
Day diff to week diff
df['WEEKDIFF'] = np.ceil((df['DATE1'] - df['DATE2']).dt.days / 7).astype(int)
Days since start
df["day_number"] = (df["date"] - df["date"].min()).dt.days
Timedelta to minutes
df["minutes"] = df["timedelta"].dt.total_seconds() / 60
Time features
df["year"] = df["time"].dt.year
df["is_leap_year"] = df["time"].dt.is_leap_year
df["month"] = df["time"].dt.month
df['next_month'] = df['month'].apply(lambda x: 1 if x == 12 else x + 1)
df['previous_month'] = df['month'].apply(lambda x: 12 if x == 1 else x - 1)
# 0 DJF, 1 MAM, 2 JJA, 3 SON
month_to_season_d = {
1: 0,
2: 0,
3: 1,
4: 1,
5: 1,
6: 2,
7: 2,
8: 2,
9: 3,
10: 3,
11: 3,
12: 0,
}
df["season"] = events["month"].map(month_to_season_d)
df["dayofyear"] = df["time"].dt.dayofyear
df["dayofweek"] = df["time"].dt.dayofweek
df["hour"] = df["time"].dt.hour
Date range
from datetime import datetime
pd.date_range("YYYY-MM-DD", "YYYY-MM-DD") # .to_pydatetime()
pd.date_range(f"{time_min:%Y%m%d}", f"{time_max:%Y%m%d}")
pd.date_range("2020-01", datetime.now(), freq="M")
pd.date_range("2020-01-01", periods=2, freq="min")
pd.date_range("2012", "2013", freq="3H", inclusive="left")
Machine learning
Convert probabilities to binary
df['COLUMN'].where(df['COLUMN'] > 0.5, 0).where(df['COLUMN'] <= 0.5, 1)
np.where(df.COLUMN > 0.5, 1, 0)
Convert binary labels to 1 and 0
df['COLUMN'] = df['COLUMN'].map({'A':0, 'B':1})
Plotting
Add label to y axis
ax = df.plot()
ax.set_ylabel("y label")
Plot with reverse x axis
df.plot(xlim=(df.index.max(), df.index.min()))
Scatter matrix (similar to seaborn pairplot)
from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
Overlay lines
ax = df["COL"].plot()
df2["COL"].plot(c="red", ax=ax)
Bar plot (with two columns)
df.plot.bar(x='X', y='Y', color='teal', figsize=(6, 4))
df.plot(x='X', y=['Y1', 'Y2'], kind='bar', figsize=(8, 5))
Histogram plot
df.plot.hist()
Density plot
df.plot.density()
Display
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format', lambda x: '%.3f' % x)
A value in dollars
df['COL'].agg(['sum']).to_frame().style.format('${0:,.2f}')
Show all columns
print(df.columns.values)
Config
pd.options.compute.use_bottleneck = True
pd.set_option('display.max_rows', None)
Testing
import pandas.testing as pd_testing
pd_testing.assert_frame_equal(a, b)  # also assert_series_equal, assert_index_equal