Pandas

https://anaconda.org/TomAugspurger/pandas-performance/notebook 

IO

Create dataframe

import pandas as pd


df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})

df = pd.DataFrame(data={'time': pd.date_range('2000', freq='D', periods=3)})


df = pd.DataFrame.from_dict({0: 1}, orient="index", columns=["col1"])#.T

df = pd.DataFrame([1, "a", 1.5], index=list("abc")).T

Read parquet

#filters = [("state", "==", "California")]

#filters = [("FIPS", "in", [6, 36])]

#filters = [("datetime", ">=", datetime(2023, 10, 4, 11, 30, 0, tzinfo=pytz.UTC))]

pd.read_parquet("file.parquet", filters=filters)
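A note on combining filters (a sketch, reusing the "state" and "FIPS" columns from the commented examples): a flat list of tuples is ANDed together, while a list of lists expresses OR of AND groups.

# AND: state == "California" and FIPS == 6
filters = [("state", "==", "California"), ("FIPS", "==", 6)]
# OR of AND groups: California rows OR rows with FIPS 36
filters = [[("state", "==", "California")], [("FIPS", "==", 36)]]
df = pd.read_parquet("file.parquet", filters=filters)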

Read JSON

import json

import pandas as pd


df = pd.read_json("out.json", orient="index")

Normalize

with open("out.json") as f:

    data = json.load(f)

df = pd.json_normalize(

    data,

    record_path="counties",

    meta=["state", "shortname", ["info", "governor"]],

    sep="_",

)
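For reference, json_normalize with these arguments expects nested data shaped roughly like this (a minimal sketch based on the keys used above):

data = [
    {
        "state": "Florida",
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
        ],
    },
]
# Each element of "counties" becomes a row; state, shortname and info.governor
# are repeated onto those rows as metadata columns (info_governor with sep="_").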

Read csv

df = pd.read_csv(

    "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"

)


df = pd.read_csv(

    "runs.csv",

    engine="pyarrow",

    parse_dates=["Date"],

    dtype={"Distance (km)": "float32"},

)

for col in ["Time", "5 km pace", "Km pace"]:

    df[col] = pd.to_timedelta(df[col])

df

Read sql

Download the Oracle Instant Client from https://www.oracle.com/database/technologies/instant-client/winx64-64-downloads.html and extract it to C:\oracle\

$ pip install cx_Oracle

import pandas as pd

import cx_Oracle

import os


os.chdir(r"C:\oracle\instantclient_19_5")

dsn = cx_Oracle.makedsn('HOST', 'PORT', service_name='SERVICENAME')

u = "USERNAME"

p = "PASSWORD"

con_oracle = cx_Oracle.connect(user=u, password=p, dsn=dsn)

df = pd.read_sql('select * from TABLE WHERE ROWNUM < 5', con_oracle)

con_oracle.close()
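Newer pandas versions warn when given a raw DBAPI connection; a SQLAlchemy engine can be used instead. A sketch, assuming the sqlalchemy package is installed and HOST/PORT/SERVICENAME are the same placeholders as above:

import sqlalchemy

engine = sqlalchemy.create_engine(f"oracle+cx_oracle://{u}:{p}@HOST:PORT/?service_name=SERVICENAME")
df = pd.read_sql("select * from TABLE WHERE ROWNUM < 5", engine)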

Read sql query

with open("file.sql") as f:

    df = pd.read_sql_query(f.read(), con)

Read xml

df = pd.read_xml("file.xml")

Read html

_l = pd.read_html("https://www.cdc.gov/growthcharts/html_charts/wtage.htm")

df = _l[1]

To xarray

df.to_xarray()

To parquet

import pyarrow as pa


d = {"a": pa.int32(), "b": pa.string(), "c": pa.float32()}

schema = pa.schema(d)

df.to_parquet("tmp.parquet", schema=schema)

df.to_parquet("tmp.parquet", allow_truncated_timestamps=True, coerce_timestamps="ms")

To csv

df.to_csv(index=False)

To json

df.to_json(orient="index", date_format="iso")

To excel 

df.to_excel("file.xlsx")

Dataframe to series

df.squeeze()

Dtypes

Convert dtypes automatically (or manually with astype)

df.convert_dtypes()

df.astype(DTYPE)

Specify dtypes

d_fmt, s_fmt, i_fmt, f_fmt, c_fmt = "datetime64[D]", "string", "Int64", "float64", "category"

Convert multiple columns to categorical

for col in cat_cols:

    df[col] = df[col].astype("category")

Convert column to ordered categorical

from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(categories=[0, 1, 2, 3], ordered=True)

df[col].astype(cat_type)
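Once assigned back to the column, the declared order is respected by comparisons and sorting (a minimal sketch):

df[col] = df[col].astype(cat_type)
df[df[col] >= 2]      # comparison uses the category order
df.sort_values(col)   # sorts 0 < 1 < 2 < 3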

Convert objects to integers

# table_dtypes maps column name -> target dtype; find positions of the Int64 columns
loc_int = [i for i, v in enumerate(list(table_dtypes.values())) if v == i_fmt]

df.iloc[:, loc_int].apply(pd.to_numeric)  # can add errors="coerce" to to_numeric

Dates

df["COL"].values.astype("datetime64[s]")

Index

Make an index a column

df.reset_index()

Make a Multi index a column

g.index.to_flat_index().to_frame()

Append a column to the index

df.set_index('COL', append=True)

First index of a series

s.first_valid_index()

Querying

Query a multiindex

df.loc[(1, 2)]
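To select on a single level of the MultiIndex rather than the full key, xs or IndexSlice can be used (a sketch, assuming a two-level index):

df.xs(2, level=1)                # rows whose second index level equals 2
df.loc[pd.IndexSlice[:, 2], :]   # same selection with IndexSlice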

Between

df[df['COL'].between(1, 2, inclusive="neither")]

Multiple where conditions

df[(df['DATE'] <= 'YYYY-MM-DD') & (df['FLAG'] == 'Y')]

Drop columns which contain all 0's

df.loc[:, (df != 0).any(axis=0)]

Drop a column

df.drop(columns=['COLUMN_NAME'], inplace=True)

Split a dataframe into duplicated and not duplicated

single_df = df.drop_duplicates(subset="COL", keep=False)

dup_df = df[df.duplicated(subset="COL", keep=False)]

Find columns not in another dataframe

df.columns[~df.columns.isin(df2.columns)]

Create a column based on if another string column contains an element

df['COL2'] = df['COL'].str.contains('a')

Use query

df_students.query('Name == "Alex"')


value = ["a", "b", "c"]

df_students.query("Name in @value")

Find columns using wildcard

df.filter(like='ABC ', axis=1)

df.filter(regex="NET")

Find the index of a column

df.columns.get_loc("COL")

Keep/remove certain dtypes

df.select_dtypes(include=np.number)

df.select_dtypes(include="datetimetz")

df.select_dtypes(exclude="datetimetz")

Keep values which are not null based on one column

df[pd.notna(df["COL"])]

Keep values which are not null based on multiple columns

df.dropna(subset=["COL1", "COL2"])

ETL utils

Rename columns

df.rename(columns={"A": "a", "B": "b"})

Append str to some column names

df.columns = [c + "_new" if c in ["COL2", "COL4"] else c for c in df.columns]

Replace value in column

df["COL"] = df["COL"].replace(to_replace=val, value=new_val)

Replace characters in all column names

df.columns = df.columns.str.replace("[()]", "_", regex=True)

List of values to single columns / split list to new columns

df["COL2"] = df["COL"].apply(lambda x: x[0])

df["COL3"] = df["COL"].apply(lambda x: x[1])
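If every list in the column has the same length, the whole thing can be expanded in one go (a sketch):

df[["COL2", "COL3"]] = pd.DataFrame(df["COL"].tolist(), index=df.index)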

Sorting

df.sort_values(by=['COL1', 'COL2'], ascending=[True, False])

N largest values for a column

df.nlargest(10, "COL")

Mapping

Convert values (similar to a case statement)

df["COL"].map({'A':0, 'B':1})

Joining

Use merge when you are not joining on the index.

Rename columns beforehand if you don't want overlapping column names to come out suffixed (see the suffixes example below).

df.merge(df2, left_on=['DATE'], right_on=['DDATE'])
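Overlapping column names that are not join keys get suffixed ('_x'/'_y' by default); the suffixes argument controls this (a sketch):

df.merge(df2, left_on="DATE", right_on="DDATE", suffixes=("", "_right"))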

inner join

df.merge(df2, on='UUID')

outer join

l.merge(r, left_on=['lcol'], right_on=['rcol'], how="outer")

Append column to the right of the dataframe

pd.concat([df.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)

Append column to the left of the dataframe

df.insert(0, 'COL', value)

Concat

df = pd.DataFrame()

for i in range(0, 1):

    df = pd.concat([df, etl_df])
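Growing a DataFrame with concat inside a loop copies data on every iteration; collecting the pieces in a list and concatenating once is usually faster (a sketch, where build_df is a hypothetical per-iteration ETL step):

dfs = [build_df(i) for i in range(10)]
df = pd.concat(dfs, ignore_index=True)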

Update value

df.loc[index, "col"] = new_value

Feature engineering

One-hot encode

pd.get_dummies(df, prefix=['COL'])

Concat columns

import itertools

cat_cols = ['a', 'b', 'c']

_ = list(itertools.combinations(cat_cols, 2))

for c1, c2 in _:

    df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)

Cyclical features

col_scale = [("DAY", 7), ("MONTH", 12)]

for col, scale in col_scale:

    df[col + '_SIN'] = np.sin(2 * np.pi * df[col] / scale)

    df[col + '_COS'] = np.cos(2 * np.pi * df[col] / scale)

 

df["lon_sin"] = np.sin(df["lon"])  # assumes lon is already in radians

df["lon_cos"] = np.cos(df["lon"])

Group by

Do different aggregations

df.agg({'COL1': 'sum', 'COL2': 'nunique', 'COL3': 'first'})

If have multi-index

df = df.groupby(by=["MULTIINDEX1", "MULTIINDEX2"], as_index=False).agg(["min", "max"])

# df.columns = ["_".join(col).rstrip("_") for col in df.columns.values]

UUID by column

gdf["GROUP_UUID"] = gdf.groupby(by="GROUP").cumcount()
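cumcount numbers the rows within each group (0, 1, 2, …); if the goal is a single identifier shared by all rows of a group, ngroup does that (a sketch):

gdf["GROUP_ID"] = gdf.groupby(by="GROUP").ngroup()  # same integer for every row of a group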

Apply a lambda function

Apply a scikit-learn function to two columns

from sklearn.metrics import roc_auc_score


def df_roc_auc_score(y, pred):

    try:

        return roc_auc_score(y, pred)

    except ValueError:  # e.g. only one class present in the group

        return np.nan


df.groupby('COLUMN').apply(

    lambda df: df_roc_auc_score(df['COLUMN1'].astype(float), df['COLUMN2'].astype(float))

    ).reset_index()

or

def mean_absolute_error_group(df):

    return mean_absolute_error(df["y_true"], df["y_pred"])


df.groupby('COLUMN').apply(mean_absolute_error_group)

or

df.groupby('COLUMN').apply(lambda df: mean_squared_error(df.COLUMN1, df.COLUMN2))

or

np.vectorize(df_roc_auc_score)(df["y_true"], df["y_pred"])

Apply scikit-learn regression metrics

from sklearn.metrics import (mean_absolute_error, mean_squared_error,

explained_variance_score, max_error, mean_squared_log_error, median_absolute_error,

r2_score, mean_absolute_percentage_error)


def regression_error_metrics_group(df):

    out = {}

    out['Mean'] = df["y_true"].mean()

    out['Mean Error'] = np.average(df["y_pred"] - df["y_true"])

    out['Mean Absolute Error'] = mean_absolute_error(df["y_true"], df["y_pred"])

    out['Mean Absolute Percentage Error'] = mean_absolute_percentage_error(df["y_true"], df["y_pred"]) 

    out['Mean Squared Error'] = mean_squared_error(df["y_true"], df["y_pred"])

    out['Root Mean Squared Error'] = mean_squared_error(df["y_true"], df["y_pred"], squared=False)

    out['Max Error'] = max_error(df["y_true"], df["y_pred"])

    out['Median Absolute Error'] = median_absolute_error(df["y_true"], df["y_pred"])

    out['Mean Squared Log Error'] = mean_squared_log_error(df["y_true"], df["y_pred"])

    return pd.Series(out, index=['Mean', 'Mean Error', 'Mean Absolute Error',

                                 'Mean Absolute Percentage Error', 'Mean Squared Error',

                                 'Root Mean Squared Error', 'Max Error',

                                 'Median Absolute Error', 'Mean Squared Log Error'])


df.groupby('COLUMN').apply(regression_error_metrics_group)

Transform

df["minute_since_FIELD_start"] = df.groupby("FIELD")["time"].transform(lambda x: (x - x.min()).dt.seconds // 60 % 60)

Resample

48 hours

df.set_index("Timestamp")["COL"].resample("48H").max()

Rolling

def mad(x):

    return np.fabs(x - x.mean()).mean()


df = pd.DataFrame({"A": np.random.randn(100_000)},

                  index=pd.date_range('1/1/2000', periods=100_000,freq='T')

).cumsum()

df.rolling(10).apply(mad, engine="numba", raw=True)

Analysis

Correlation between all columns

df.corr()

Quantile/percentile of dataframe

df.quantile(0.1)

Round a dataframe to 2 d.p

df.round(2)

Describe dataframe

df["date"].describe(datetime_is_numeric=True)

Memory of a dataframe

from dask.utils import format_bytes


df.memory_usage(deep=True).apply(format_bytes)

df.info()

Count values

df.value_counts()

Split string

s.str.split('_', expand=True)

df[["a", "b"]] = s.str.split("-", expand=True)

df["path"].str.split("/").str[-1]

Concat two columns

df["NEW_COL"] = df["COL"] + df["DATE_COL"].astype(str)

For loops

Loop over rows

for index, row in df.iterrows():

    row["COL"]


for row in df.itertuples(index=False):

    row.COL

Create a counter for runs of consecutive Trues

df["COL"] = 0

last_value = True

counter = 1

for row in df.itertuples():

    index = row.Index

    value = row.bool_col

    if index == 0 and not value:

        counter = 0

    if value:

        if not last_value:

            counter += 1

            df.loc[index, "COL"] = counter

        else:

            df.loc[index, "COL"] = counter

        last_value = True

    else:

        last_value = False
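The same run counter can be computed without a Python loop by numbering the False-to-True edges (a sketch, assuming bool_col holds booleans):

s = df["bool_col"]
run_start = s & ~s.shift(fill_value=False)    # True at the first row of each run of Trues
df["COL"] = run_start.cumsum().where(s, 0)    # run number for True rows, 0 elsewhere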

Datetime

Datetime from string

pd.to_datetime(t, format="%m/%d/%Y (%H:00)") # pd.Timedelta("1 hour")

pd.to_datetime("2015020100", format="%Y%m%d%H")

pd.to_datetime(df["COL"], unit="s")

Round to nearest 3 hours

s.round('3H')

Round up to nearest 3 hours

s.ceil("3H")  # use s.dt.ceil("3H") for a Series

Add or subtract an integer series to/from a date series

df['DATE'] - pd.to_timedelta(df['WEEKS'] * 7, unit='D')

Current date

pd.to_datetime('now')

pd.Timestamp('now').strftime("%Y-%m-%d")

pd.Timestamp('now').isoformat()

df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")

pd.Timestamp('now').to_pydatetime()

Add a timezone

pd.to_datetime("2020-01-01 00:00:00").tz_localize('US/Eastern')

pd.to_datetime("2020-01-01 00:00:00").tz_localize('UTC').tz_convert("America/Los_Angeles")

Remove a timezone

df["Timestamp"].dt.tz_localize(None)

Day diff to week diff

df['WEEKDIFF'] = np.ceil((df['DATE1'] - df['DATE2']).dt.days / 7).astype(int)

Days since start

df["day_number"] = (df["date"] - df["date"].min()).dt.days

Timedelta to minutes

df["minutes"] = df["timedelta"].dt.total_seconds() / 60

Time features

df["year"] = df["time"].dt.year

df["is_leap_year"] = df["time"].dt.is_leap_year

df["month"] = df["time"].dt.month

df['next_month'] = df['month'].apply(lambda x: 1 if x == 12 else x + 1)

df['previous_month'] = df['month'].apply(lambda x: 12 if x == 1 else x - 1)
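# Vectorized alternative to the two apply calls above (a sketch): wrap-around via modular arithmetic
df["next_month"] = df["month"] % 12 + 1
df["previous_month"] = (df["month"] - 2) % 12 + 1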

# 0 DJF, 1 MAM, 2 JJA, 3 SON

month_to_season_d = {

    1: 0,

    2: 0,

    3: 1,

    4: 1,

    5: 1,

    6: 2,

    7: 2,

    8: 2,

    9: 3,

    10: 3,

    11: 3,

    12: 0,

}

df["season"] = df["month"].map(month_to_season_d)

df["dayofyear"] = df["time"].dt.dayofyear

df["dayofweek"] = df["time"].dt.dayofweek

df["hour"] = df["time"].dt.hour

Date range

from datetime import datetime


pd.date_range("YYYY-MM-DD", "YYYY-MM-DD") # .to_pydatetime()

pd.date_range(f"{time_min:%Y%m%d}", f"{time_max:%Y%m%d}")

pd.date_range("2020-01", datetime.now(), freq="M")

pd.date_range("2020-01-01", periods=2, freq="min")

pd.date_range("2012", "2013", freq="3H", inclusive="left")


Machine learning

Convert probabilities to binary

df['COLUMN'].where(df['COLUMN'] > 0.5, 0).where(df['COLUMN'] <= 0.5, 1)

np.where(df.COLUMN > 0.5, 1, 0)
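Casting the boolean comparison gives the same 0/1 result (a sketch):

df["COLUMN"].gt(0.5).astype(int)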

Convert binary labels to 1 and 0

df['COLUMN'] = df['COLUMN'].map({'A':0, 'B':1})

Plotting

Add label to y axis

ax = df.plot()

ax.set_ylabel("y label")

Plot with reverse x axis

df.plot(xlim=(df.index.max(), df.index.min()))

Scatter matrix (similar to seaborn pairplot)

from pandas.plotting import scatter_matrix

scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')

Overlay lines

ax = df["COL"].plot()

df2["COL"].plot(c="red", ax=ax)

Bar plot (with two columns)

df.plot.bar(x='X', y='Y', color='teal', figsize=(6, 4))

df.plot(x='X', y=['Y1', 'Y2'], kind='bar', figsize=(8, 5))

Histogram plot

df.plot.hist()

Density plot

df.plot.density()

Display

pd.set_option("display.max_rows", 150)

pd.set_option("display.max_columns", 150)


pd.set_option("display.precision", 2)

pd.set_option("display.float_format", lambda x: "%.3f" % x)

Format a value in dollars

df['COL'].agg(['sum']).to_frame().style.format('${0:,.2f}')

Show all columns

print(df.columns.values)

Config

pd.options.compute.use_bottleneck = True

pd.set_option('display.max_rows', None)

Testing

import pandas._testing as pd_testing

pd_testing.assert_equal(a, b)
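The public pandas.testing module exposes the same checks for specific objects:

import pandas.testing as tm

tm.assert_frame_equal(df_a, df_b)
tm.assert_series_equal(s_a, s_b)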