mamba install python-duckdb
DuckDB Python API overview: https://duckdb.org/docs/api/python/overview.html
import duckdb

# Quick smoke-test queries against the default in-memory DuckDB connection.
duckdb.sql("from duckdb_extensions()")       # list available/installed extensions
duckdb.sql("PRAGMA platform")                # extension platform of this build
# duckdb_memory() and duckdb_temporary_files() are table functions: they must
# be invoked with parentheses and selected FROM — the originals
# ("duckdb_memory()" / "duckdb_temporary_files") were not valid SQL statements.
duckdb.sql("from duckdb_memory()")           # per-tag memory usage of this instance
duckdb.sql("from duckdb_temporary_files()")  # temp files currently spilled to disk
df = duckdb.read_parquet("titanic.parquet")  # local Parquet -> DuckDB relation
Read a Parquet file from S3 using locally resolved AWS credentials
import duckdb
import boto3

# Resolve temporary AWS credentials from the ambient boto3 session
# (env vars, ~/.aws/credentials, instance profile, ...).
aws_creds = boto3.Session().get_credentials()

# Original cell lost its indentation (the `with` body was flat — SyntaxError);
# restored here. NOTE(review): interpolating KEY_ID/SECRET/SESSION_TOKEN into
# SQL text works but copies secrets into query strings/logs; the commented
# PROVIDER CREDENTIAL_CHAIN form lets DuckDB resolve credentials itself and
# is the safer default — confirm it works in this environment.
with duckdb.connect() as con:
    con.install_extension("httpfs")
    con.install_extension("aws")
    con.load_extension("httpfs")
    con.load_extension("aws")
    df = con.execute(
        f"""
        --CREATE SECRET s3_secret (
        --    TYPE S3,
        --    PROVIDER CREDENTIAL_CHAIN
        --);
        CREATE SECRET s3_secret (
            TYPE S3,
            KEY_ID '{aws_creds.access_key}',
            SECRET '{aws_creds.secret_key}',
            SESSION_TOKEN '{aws_creds.token}'
        );
        SELECT * FROM read_parquet('s3://dtn-dsci-prd-eda/ml_datasets/bike_sharing_demand.parquet');
        --DESCRIBE SELECT * FROM 's3://dtn-dsci-prd-eda/ml_datasets/bike_sharing_demand.parquet';
        """
    ).pl()  # .pl() -> polars DataFrame
df
import duckdb

conn = duckdb.connect()
conn.install_extension("httpfs")
conn.install_extension("aws")
conn.load_extension("httpfs")
conn.load_extension("aws")

# The original cell had a bare `try:` with no except/finally (SyntaxError) and
# a second SQL fragment spliced raw into the middle of it. Reconstructed as
# two execute() calls guarded by try/finally so the connection always closes.
try:
    # Authenticate via the AWS credential chain and read the demo dataset.
    df = conn.execute(
        """
        CREATE SECRET s3_secret (
            TYPE S3,
            PROVIDER CREDENTIAL_CHAIN
        );
        SELECT *
        FROM read_parquet('s3://dtn-dsci-prd-eda/ml_datasets/bike_sharing_demand.parquet');
        """
    ).df()
    # Materialize the hive-partitioned customers-out dataset and inspect its
    # schema; the trailing commented queries are kept as exploration notes.
    schema_df = conn.execute(
        """
        PRAGMA enable_print_progress_bar;
        SET TIMEZONE='UTC';
        CREATE TABLE tbl AS SELECT *
        FROM read_parquet('s3://dtn-dsci-prd-sia/data_fusion/processed/data_fusion_customers_out.parquet/*/*.parquet');
        DESCRIBE tbl;
        --SELECT MAX("Customers Out") FROM tbl
        --SELECT "FIPS", MAX("Customers Out") FROM tbl
        --GROUP BY "FIPS"
        """
    ).df()
finally:
    conn.close()
Read Overture Maps data (water features, places, transportation segments)
import duckdb

conn = duckdb.connect()
conn.install_extension("httpfs")
conn.load_extension("httpfs")
conn.install_extension("aws")
conn.load_extension("aws")
conn.install_extension("spatial")
conn.load_extension("spatial")

# Overture Maps "water" features within a bounding box (roughly the
# Mississippi delta / New Orleans area), geometry returned as WKT text.
# Fixed: the projection previously read `geometry AS` on one line with a
# dangling `ST_AsText(geometry)` on the next — invalid SQL.
df = conn.execute(
    """
    PRAGMA enable_print_progress_bar;
    SET s3_region='us-west-2';
    SELECT
        id,
        names.primary AS primary_name,
        ST_AsText(geometry) AS geometry
    FROM
        read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=base/type=water/*', filename=true, hive_partitioning=1)
    WHERE
        bbox.xmin >= -91.3994
        and bbox.xmax <= -89.3864
        and bbox.ymin >= 29.152
        and bbox.ymax <= 30.5161
    """
).df()
import duckdb

# Overture Maps places: pizza restaurants inside a rough NYC bounding box.
conn = duckdb.connect()

# httpfs + aws for S3 access, spatial for the geometry type / ST_AsText.
for extension in ("httpfs", "aws", "spatial"):
    conn.install_extension(extension)
    conn.load_extension(extension)

places_query = """
PRAGMA enable_print_progress_bar;
SET s3_region='us-west-2';
SELECT
id,
names.primary as name,
confidence AS confidence,
CAST(socials AS JSON) as socials, -- Ensure each attribute can be serialized to JSON
ST_AsText(geometry) as geometry -- DuckDB understands this to be a geometry type
FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=places/type=place/*', filename=true, hive_partitioning=1)
WHERE categories.primary = 'pizza_restaurant'
AND bbox.xmin BETWEEN -75 AND -73 -- Only use the bbox min values
AND bbox.ymin BETWEEN 40 AND 41 -- because they are point geometries.
"""
df = conn.execute(places_query).df()
import duckdb

# Overture Maps transportation segments in central Paris, loading a locally
# downloaded build of the spatial extension instead of the repository copy.
conn = duckdb.connect()

for extension in ("httpfs", "aws"):
    conn.install_extension(extension)
    conn.load_extension(extension)

# Load spatial from an explicit local file (replaces install/load of the
# repository-distributed extension).
duckdb_spatial_extension_path = "/Users/ray/Downloads/spatial.duckdb_extension"
conn.execute(f"LOAD '{duckdb_spatial_extension_path}';")

segments_query = """
SET s3_region='us-west-2';
SELECT
id,
names.primary as name,
class,
geometry -- DuckDB v.1.1.0 will autoload this as a `geometry` type
FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=transportation/type=segment/*', filename=true, hive_partitioning=1)
WHERE bbox.xmin > 2.276
AND bbox.ymin > 48.865
AND bbox.xmax < 2.314
AND bbox.ymax < 48.882
"""
df = conn.execute(segments_query).df()