# Polars getting started: https://docs.pola.rs/user-guide/getting-started/#installing-polars
import polars as pl
from datetime import datetime
# Small demo frame with one column per basic dtype: integers, datetimes
# (1-3 January 2025), floats and strings.
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2025, 1, day) for day in range(1, 4)],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)
# Read a local Parquet file eagerly into a Polars DataFrame.
# Uses the `pl` alias: Polars is the only dataframe library imported in this
# file, so a `pd.` prefix would raise NameError.
df = pl.read_parquet("file.parquet")
# Read Parquet from S3
# S3 location of the dataset. The scan further down appends "/*/*.parquet" to
# this value, so it is used as a directory prefix rather than as a single file.
# NOTE(review): FOLDER/FILE look like placeholders -- substitute the real
# bucket and key before running.
path = "s3://FOLDER/FILE.parquet"
import json
import boto3
import polars as pl
# Take the AWS credentials from boto3's default session and hand them to
# Polars through `storage_options`.
session = boto3.session.Session()
credentials = session.get_credentials()
storage_options = dict(
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_session_token=credentials.token,
)

# State has been added later on so can't read in cleanly: instead of scanning
# the glob as one dataset, expand it into the individual file paths first.
# NOTE(review): the paths are read out of the serialised scan plan
# (["Scan"]["paths"]), so this depends on serialize() returning JSON with that
# layout -- confirm against the installed Polars version.
file_paths = json.loads(
    pl.scan_parquet(
        f"{path}/*/*.parquet",
        storage_options=storage_options,
    ).serialize()
)["Scan"]["paths"]

# Scan every file on its own, keep only the four columns needed (renamed, with
# FIPS cast to Int32 and "Customers Out" cast to Int64) so that all per-file
# frames share one schema, then stack them.
df = pl.concat(
    [
        pl.scan_parquet(file_path, storage_options=storage_options).select(
            pl.col("Timestamp").alias("timestamp"),
            pl.col("FIPS").cast(pl.Int32).alias("FIPS"),
            pl.col("Utility").alias("utility"),
            pl.col("Customers Out").cast(pl.Int64).alias("customers_out"),
        )
        for file_path in file_paths
    ]
)
# Plot
import altair as alt
# Switch Altair to the "vegafusion" data transformer before plotting.
# NOTE(review): presumably enabled so that large frames do not hit Altair's
# default row limit -- confirm.
alt.data_transformers.enable("vegafusion")

# Generic form: scatter of two columns coloured by a grouping column
# (COL1 / COL2 / GROUP look like placeholder column names).
df.plot.point(
    x="COL1",
    y="COL2",
    color="GROUP",
)

# Concrete example: nominal colour channel on county_name using the
# "category20" scheme, with the legend laid out in 4 columns.
# symbolLimit=0 is meant to stop the legend from truncating its entries
# (NOTE(review): confirm against the Vega-Lite legend documentation).
df.plot.point(
    x="timestamp_utc_ceil_1hour",
    y="max_sum_customers_out",
    color=(
        alt.Color("county_name:N")
        .scale(scheme="category20")
        .legend(columns=4, symbolLimit=0)
    ),
)