The idea here is: if you have a NetCDF file in the cloud with many variables and you only want one of them, kerchunk can find the byte ranges of that variable, store that information in a .json file, and that reference file then lets you treat the NetCDF file in the cloud as a Zarr store.
kerchunk.hdf.SingleHdf5ToZarr - Extracts byte ranges, compression information and metadata
import kerchunk.hdf  # requires h5py
import s3fs
import ujson # faster than json
uri = "s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc"
with s3fs.S3FileSystem(
    anon=True,
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri).translate()
with open(uri.replace(":", "").replace("/", "_").replace(".nc", "_kerchunk.json"), "wb") as f:
    f.write(ujson.dumps(h5chunks).encode())
import pandas as pd
variable = "air_temperature"
time = f"{pd.Timestamp('now').floor('1h') - pd.Timedelta('1h'):%Y%m%dT%H0000Z}"
uri = f"s3://BUCKET/{time}/{time}-{variable}-{time}.nc"
with s3fs.S3FileSystem(
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri).translate()
with open(uri.replace(":", "").replace("/", "_").replace(".nc", "_kerchunk.json"), "wb") as f:
    f.write(ujson.dumps(h5chunks).encode())
import fsspec
import xarray as xr
fs = fsspec.filesystem(
    "reference",
    fo=h5chunks,
    remote_protocol="s3",
    remote_options=dict(anon=True),
    skip_instance_cache=True,
)
ds = xr.open_dataset(
    fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
)
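The open is lazy, so only metadata has been read at this point. A hedged usage sketch (assuming the variable really is stored under that name in the file): selecting the one variable of interest and computing on it fetches just that variable's byte ranges.
da = ds[variable]        # lazy selection, no chunk data read yet
print(da.mean().values)  # triggers byte-range reads for this variable's chunks only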
Write the references to a Parquet store instead of JSON:
import kerchunk.hdf  # requires h5py
import s3fs
from fsspec.implementations.reference import LazyReferenceMapper
uri = "s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc"
out = LazyReferenceMapper.create(root="file.parquet", record_size=1)
with s3fs.S3FileSystem(
    anon=True,
    default_fill_cache=False,
    default_cache_type="first",
).open(uri) as f:
    _ = kerchunk.hdf.SingleHdf5ToZarr(h5f=f, url=uri, out=out).translate()
out.flush()  # write any buffered references out to the Parquet store
Read this back using fsspec's reference filesystem and pass it to xarray with engine="zarr":
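A sketch of that read, mirroring the JSON example above and assuming the "file.parquet" store written above plus anonymous S3 access to the source file:
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "fo": "file.parquet",  # directory created by LazyReferenceMapper
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
        "consolidated": False,
    },
)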
from tempfile import TemporaryDirectory
import fsspec
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from tqdm import tqdm
Create a list of files
fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
file_paths = [
    "s3://" + f
    for f in fs_read.glob(
        f"s3://BUCKET/0.1x0.1-global/{init_time}/{init_time}-{variable}-*.nc"
    )
][0:forecast_len]
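With per-file references (for example from SingleHdf5ToZarr as above), MultiZarrToZarr can combine them into one logical dataset. A minimal sketch, assuming the files share a grid and concatenate along a "time" dimension:
def generate_refs(path):
    # open each remote file and extract its byte-range references
    with fs_read.open(path) as inf:
        return SingleHdf5ToZarr(inf, path).translate()
single_refs = [generate_refs(p) for p in tqdm(file_paths)]
mzz = MultiZarrToZarr(
    single_refs,
    concat_dims=["time"],
    remote_protocol="s3",
    remote_options={"anon": True},
)
combined_refs = mzz.translate()
with open("combined_kerchunk.json", "w") as f:
    f.write(ujson.dumps(combined_refs))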
Scan GRIB2 files:
import json
from kerchunk.grib2 import scan_grib
out = scan_grib(
    "s3://BUCKET/20220730T000000Z-air_temperature-20220730T000000Z.grib",
    common_vars=[
        "time",
        "step",
        "heightAboveGround",
        "latitude",
        "longitude",
        "valid_time",
    ],
    storage_options={},
)
with open("out.json", "w") as f:
    json.dump(out, f)
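The combine step below also expects an out2.json; a second GRIB file (the path here is a hypothetical placeholder) can be scanned and written the same way:
out2 = scan_grib(
    "s3://BUCKET/20220730T060000Z-air_temperature-20220730T060000Z.grib",  # hypothetical second file
    common_vars=["time", "step", "heightAboveGround", "latitude", "longitude", "valid_time"],
    storage_options={},
)
with open("out2.json", "w") as f:
    json.dump(out2, f)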
from kerchunk.combine import MultiZarrToZarr
mzz = MultiZarrToZarr(["out.json", "out2.json"], concat_dims=["valid_time"])
mzzt = mzz.translate()  # combined reference dict passed to fsspec below
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "fo": mzzt,
        },
        "consolidated": False,
    },
)
import kerchunk.zarr