The idea here is: if you have a NetCDF file in the cloud with many variables and you only want one of them, kerchunk can find the byte ranges of that variable, store that information in a .json file, and that reference file then lets you treat the NetCDF file in the cloud as a Zarr store.
kerchunk.hdf.SingleHdf5ToZarr - Extracts byte ranges, compression information and metadata
import kerchunk.hdf  # requires h5py
import s3fs
import ujson # faster than json
uri = "s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc"
with s3fs.S3FileSystem(
    anon=True,
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri).translate()
with open(uri.replace(":", "").replace("/", "_").replace(".nc", "_kerchunk.json"), "wb") as f:
    f.write(ujson.dumps(h5chunks).encode())
import pandas as pd
variable = "air_temperature"
time = f"{pd.Timestamp('now').floor('1h') - pd.Timedelta('1h'):%Y%m%dT%H0000Z}"
uri = f"s3://BUCKET/{time}/{time}-{variable}-{time}.nc"
with s3fs.S3FileSystem(
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri).translate()
with open(uri.replace(":", "").replace("/", "_").replace(".nc", "_kerchunk.json"), "wb") as f:
    f.write(ujson.dumps(h5chunks).encode())
import fsspec
import xarray as xr
fs = fsspec.filesystem(
    "reference",
    fo=h5chunks,
    remote_protocol="s3",
    remote_options=dict(anon=True),
    skip_instance_cache=True,
)
ds = xr.open_dataset(
    fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
)
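The open is lazy, so only metadata has been read at this point. A hedged usage sketch (assuming the variable really is stored under that name in the file): selecting the one variable of interest and computing on it fetches just that variable's byte ranges.
da = ds[variable]        # lazy selection, no chunk data read yet
print(da.mean().values)  # triggers byte-range reads for this variable's chunks only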
Write the references to a Parquet store instead of JSON:
import kerchunk.hdf  # requires h5py
import s3fs
from fsspec.implementations.reference import LazyReferenceMapper
uri = "s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc"
out = LazyReferenceMapper.create(root="file.parquet", record_size=1)
with s3fs.S3FileSystem(
    anon=True,
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    _ = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri, out=out).translate()
out.flush()  # write any buffered references out to the Parquet store
Read this back using fsspec's reference filesystem and pass it to xarray with engine="zarr":
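A sketch of that read, mirroring the JSON example above and assuming the "file.parquet" store written above plus anonymous S3 access to the source file:
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "fo": "file.parquet",  # directory created by LazyReferenceMapper
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
        "consolidated": False,
    },
)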
from tempfile import TemporaryDirectory
import fsspec
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from tqdm import tqdm
Create a list of files
fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
file_paths = [
    "s3://" + f
    for f in fs_read.glob(
        f"s3://BUCKET/0.1x0.1-global/{init_time}/{init_time}-{variable}-*.nc"
    )
][0:forecast_len]
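With per-file references (for example from SingleHdf5ToZarr as above), MultiZarrToZarr can combine them into one logical dataset. A minimal sketch, assuming the files share a grid and concatenate along a "time" dimension:
def generate_refs(path):
    # open each remote file and extract its byte-range references
    with fs_read.open(path) as inf:
        return SingleHdf5ToZarr(inf, path).translate()
single_refs = [generate_refs(p) for p in tqdm(file_paths)]
mzz = MultiZarrToZarr(
    single_refs,
    concat_dims=["time"],
    remote_protocol="s3",
    remote_options={"anon": True},
)
combined_refs = mzz.translate()
with open("combined_kerchunk.json", "w") as f:
    f.write(ujson.dumps(combined_refs))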
Scan GRIB2 files:
import json
from kerchunk.grib2 import scan_grib
out = scan_grib(
    "s3://BUCKET/20220730T000000Z-air_temperature-20220730T000000Z.grib",
    common_vars=[
        "time",
        "step",
        "heightAboveGround",
        "latitude",
        "longitude",
        "valid_time",
    ],
    storage_options={},
)
with open("out.json", "w") as f:
    json.dump(out, f)
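The combine step below also expects an out2.json; a second GRIB file (the path here is a hypothetical placeholder) can be scanned and written the same way:
out2 = scan_grib(
    "s3://BUCKET/20220730T060000Z-air_temperature-20220730T060000Z.grib",  # hypothetical second file
    common_vars=["time", "step", "heightAboveGround", "latitude", "longitude", "valid_time"],
    storage_options={},
)
with open("out2.json", "w") as f:
    json.dump(out2, f)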
from kerchunk.combine import MultiZarrToZarr
mzz = MultiZarrToZarr(["out.json", "out2.json"], concat_dims=["valid_time"])
mzzt = mzz.translate()  # combined reference dict passed to fsspec below
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "fo": mzzt,
        },
        "consolidated": False,
    },
)
import kerchunk.zarr