Xarray
IO
Read
Read tutorial data
import xarray as xr
ds = xr.tutorial.open_dataset("air_temperature").isel(time=0, lat=0, lon=0)
da = xr.tutorial.open_dataset("air_temperature").isel(time=0)["air"]
ds = xr.tutorial.open_dataset("eraint_uvz").isel(month=0, level=0)
da = xr.tutorial.open_dataset("eraint_uvz").isel(month=0, level=0)["u"].reset_coords(drop=True)
Read in a GRIB file:
import fsspec
import xarray as xr
ds = xr.open_dataset("f.grb", engine="cfgib")
ds = xr.open_mfdataset(
    "f.grb",
    engine="cfgrib",
    backend_kwargs=dict(filter_by_keys={"typeOfLevel": "heightAboveGround"}),
    # backend_kwargs=dict(filter_by_keys={"stepType": "instant", "typeOfLevel": "heightAboveGround", "level": 10}),
    # backend_kwargs=dict(filter_by_keys={"cfVarName": "u"}),
)
uri = "simplecache::s3://mf-nwp-models/arpege-world/v2/2021-10-24/00/UGRD/10m/0h.grib2"
file = fsspec.open_local(
    uri, s3={"anon": True}, filecache={"cache_storage": "/tmp/files"}
)
ds = xr.open_dataset(file, engine="cfgrib")
ds = xr.open_dataset(fsspec.open_local("simplecache::s3://BUCKET/FILE.grib"), engine="cfgrib")
Read in multiple files when there is no existing dimension to concatenate along:
def preprocess(ds):
    return ds.expand_dims("time")
ds = xr.open_mfdataset("*.grb2", engine="cfgrib", preprocess=preprocess, parallel=True)
xr.open_mfdataset(
    files,
    engine="zarr",
    consolidated=True,
    combine="nested",
    concat_dim="valid_time",
)
Read NetCDF
import s3fs
import xarray as xr
ds = xr.open_dataset(
    s3fs.S3FileSystem(
        anon=True, default_fill_cache=False, default_cache_type="first"
    ).open("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc")
)
with s3fs.S3FileSystem(
    anon=True, default_fill_cache=False, default_cache_type="first"
).open("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc") as f:
    ds = xr.open_dataset(f)
    print(ds)
import fsspec
file = fsspec.open_local("simplecache::s3://BUCKET/file.nc", s3={"anon": True}, filecache={"cache_storage": "/tmp/files"})
ds = xr.open_dataset(file)
import s3fs
fs = s3fs.S3FileSystem()
with fs.open("s3://BUCKET/file.nc") as f:
ds = xr.open_dataset(f, engine="h5netcdf")
Read data through THREDDS
ds = xr.open_dataset("https://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/uv3z", decode_times=False)
Read zarr
ds = xr.open_zarr("test.zarr", consolidated=True)
ds = xr.open_zarr("test.zarr", storage_options={"anon": True})
ds = xr.open_dataset("test.zarr", engine="zarr", chunks={}) # chunks="auto"
ds = xr.open_mfdataset("year*.zarr", parallel=True, engine="zarr")
ds = xr.open_mfdataset("year*.zarr", parallel=True, combine="nested", engine="zarr")
https://registry.opendata.aws/ecmwf-era5/
ds = xr.open_mfdataset(
    "s3://era5-pds/zarr/2020/12/data/eastward_wind_at_10_metres.zarr",
    engine="zarr",
    backend_kwargs=dict(storage_options={"anon": True}),
)
ds = xr.open_mfdataset(
    "s3://era5-pds/zarr/2020/12/data/eastward_wind_at_10_metres.zarr",
    engine="zarr",
    backend_kwargs=dict(storage_options={"anon": True}, consolidated=True),  # consolidated here probably has no extra benefit
)
https://github.com/google-research/arco-era5
ds = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/co/single-level-reanalysis.zarr",
    chunks={"time": 48},
    consolidated=True,
)
Create DataArray
import numpy as np
import pandas as pd
import xarray as xr
da = xr.DataArray(
    data=np.arange(4),
    coords=dict(lon=(["lon"], [0, 1, 358, 359])),
    dims="lon",
)
da = xr.DataArray(
    np.random.rand(3, 4, 5),
    coords=[
        pd.date_range("2000-01-01", "2000-01-03", freq="D"),
        np.arange(4),
        np.arange(5),
    ],
    dims=["time", "lat", "lon"],
    name="var",
)
NX, NY = 10, 10  # example grid sizes
da = xr.DataArray(
    np.random.rand(NX, NY),
    coords=[np.linspace(0, 90, num=NX), np.linspace(0, 90, num=NY)],
    dims=["lat", "lon"],
    name="var",
)
Dataset to numpy array
ds.to_array().values
Add a field to a dataset and keep the existing dims
ds_point["field"] = (("point"), np.zeros(len(ds_point["point"])))
Read OPeNDAP:
If you require authentication see https://stackoverflow.com/a/66179413/6046019
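A minimal sketch of one authentication approach, using a pydap session (the URL, USERNAME, and PASSWORD are placeholders; other servers may need a different auth scheme, see the linked answer):
import xarray as xr
from pydap.cas.urs import setup_session
url = "https://SERVER/thredds/dodsC/DATASET"
session = setup_session("USERNAME", "PASSWORD", check_url=url)  # NASA URS-style login
store = xr.backends.PydapDataStore.open(url, session=session)
ds = xr.open_dataset(store)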
from siphon.catalog import TDSCatalog
import xarray as xr
# cat = TDSCatalog('http://thredds.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_0p25deg/catalog.xml?dataset=grib/NCEP/GFS/Global_0p25deg/Best')
# url = cat.datasets[0].access_urls['OPENDAP']
url = 'https://thredds.ucar.edu/thredds/dodsC/grib/NCEP/GFS/Global_0p25deg/Best'
url = "https://thredds-jumbo.unidata.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_0p25deg/catalog.html"
ds = xr.open_dataset(url)
concat files
f1 = "https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/gfs.0p25.2020010100.f000.grib2"
f2 = "https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/gfs.0p25.2020010100.f003.grib2"
da1 = xr.open_dataset(f1)["u-component_of_wind_height_above_ground"]
da2 = xr.open_dataset(f2)["u-component_of_wind_height_above_ground"]
da = xr.concat([da1, da2], dim="time")
using intake-thredds (doesn't work)
import intake
import fsspec
fsspec.config.conf["simplecache"] = {"cache_storage": "/tmp/files", "same_names": True}
url = "simplecache::https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/catalog.xml"
intake.open_thredds_merged(
    url,
    path=["gfs.0p25.2020010100.f*.grib2"],
    driver="netcdf",
    concat_kwargs={"dim": "time"},
    xarray_kwargs=dict(
        engine="cfgrib", backend_kwargs=dict(filter_by_keys={"cfVarName": "t2m"})
    ),
)
Preprocess utils
def select_location(ds: xr.Dataset) -> xr.Dataset:
    return ds.sel(lat=lat, lon=lon, method="nearest")
lambda x: x.sel(
    latitude=location.point.latitude,
    longitude=location.point.longitude,
    method="nearest",
)
def select_region(ds: xr.Dataset) -> xr.Dataset:
    return ds.sel(latitude=slice(lat_min, lat_max), longitude=slice(lon_min, lon_max))
def select_region_and_time(ds: xr.Dataset) -> xr.Dataset:
    return ds.sel(
        latitude=slice(lat_min, lat_max),
        longitude=slice(lon_min, lon_max),
        time=slice(time_min, time_max),
    )
def expand_time_dim(ds: xr.Dataset) -> xr.Dataset:
    return ds.expand_dims("time")
def swap_time_dim_to_lead_time_dim(ds: xr.Dataset) -> xr.Dataset:
    lead_time = (ds["time"] - ds["reftime"]).squeeze().values
    ds = ds.assign_coords(lead_time=("time", lead_time))  # coord must live on the time dim for swap_dims
    data_var = list(ds.data_vars)[0]
    ds[data_var] = ds[data_var].swap_dims({"time": "lead_time"})
    return ds.drop_dims("time")
Store
Write to zarr
from numcodecs import Blosc
# ds["var"].encoding = {}
ds.to_zarr("ds.zarr", consolidated=True, mode="w")
ds.to_zarr("memory://ds.zarr")
ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True)
ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="w")
compressor = Blosc(cname="zstd", clevel=9, shuffle=Blosc.BITSHUFFLE)
ds.to_zarr("ds.zarr", encoding={"data_var": {"compressor": compressor}})
# append variables to zarr store
ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="w")
ds["var2"] = ds["var"]
ds = ds.drop_vars(["var"])
ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="a")
# no chunks
ds.chunk(-1).to_zarr("ds.zarr", consolidated=True, mode="w")
to pickle
import pickle
pickle.dump(ds, open("/tmp/ds.pkl", "wb"), protocol=-1)
ds = pickle.load(open("/tmp/ds.pkl", "rb"))
import s3fs
fs = s3fs.S3FileSystem()
pickle.dump(ds, fs.open("s3://BUCKET/tmp.pkl", "wb"), protocol=-1)
ds = pickle.load(fs.open("s3://BUCKET/tmp.pkl", "rb"))
DataArray to Dataset
da.to_dataset()
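If the DataArray is unnamed, pass a name (here "var" is just a placeholder):
da.to_dataset(name="var")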
To dataframe
ds.to_dataframe()
ds.to_dask_dataframe()
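The round trip back from pandas works too (the index levels become dims):
df = ds.to_dataframe()
ds2 = xr.Dataset.from_dataframe(df)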
ETL
Reverse latitude
ds = ds.reindex(latitude=ds["latitude"][::-1])
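Equivalently, with positional indexing:
ds = ds.isel(latitude=slice(None, None, -1))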
Reassign longitude (0 - 360 -> -180 - 180)
with xr.set_options(keep_attrs=True):
    ds = ds.assign_coords(longitude=(((ds["longitude"] + 180) % 360) - 180))
    ds = ds.sortby("longitude")
Reset / drop coords (drop scalar/empty coords)
ds.reset_coords(drop=True)
Add a forecast hour coordinate
ds.assign_coords(time_delta=ds["time"] - ds["reftime"])
Roll longitude (over meridian)
ds.roll(lon=1, roll_coords=True)
Drop variables
ds.drop_vars("var")
Rename variables
ds.rename({'lat': 'latitude', 'lon': 'longitude'})
Convert time to a timezone
ds = ds.assign_coords(
    {
        "time": ds["time"]
        .to_index()
        .tz_localize("UTC")
        .tz_convert("America/Los_Angeles")
        .tz_localize(None)
        .values
    }
)
Drop dimensions
ds.drop_dims(["a", "b"])
Get variable names
list(ds.data_vars)
Fill values
ds.fillna(5)
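Or interpolate over gaps along a dimension instead of filling with a constant (assuming a "time" dim):
ds.interpolate_na(dim="time", method="linear")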
Update values
da.loc[dict(axis_A="A_1", axis_B="B_1", axis_C="C_1")] = 1
See size in bytes
from dask.utils import format_bytes
format_bytes(ds.nbytes)
Concatenating, Merging and Joining
xr.merge([ds1, ds2])
xr.combine_by_coords([ds1, ds2])
ds = xr.open_mfdataset(files, combine="nested")
concat zarrs
da = xr.DataArray(
    np.random.rand(2, 2, 1),
    coords=[
        np.arange(2),
        np.arange(2),
        np.arange(1),
    ],
    dims=["x", "y", "z"],
    name="da",
)
da.to_dataset().to_zarr("/tmp/ds_z0.zarr", consolidated=True, mode="w")
da2 = da.copy()
da2 = da2.assign_coords(z=da2["z"] + 1)
da2.to_dataset().to_zarr("/tmp/ds_z1.zarr", consolidated=True, mode="w")
ds = xr.open_mfdataset("/tmp/ds_z*.zarr", engine="zarr")
Reorder dimensions
ds.transpose("lon", "lat", "time")
Selecting
See data variables and dimensions:
import pandas as pd
df = pd.DataFrame(
    [[name, var.dims] for name, var in ds.data_vars.items()],
    columns=["Data Variable", "Dimensions"],
)
Select lat-lon region:
ds.sel(lat=slice(50, 55), lon=slice(0, 5))
isel
ds.isel(depth=slice(0, 6))
Select points:
ds.sel(lat=lats, lon=lons, method="nearest")
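For pointwise selection (one value per lat/lon pair rather than the outer product), wrap the coordinates in DataArrays that share a dim:
points = ds.sel(
    lat=xr.DataArray(lats, dims="points"),
    lon=xr.DataArray(lons, dims="points"),
    method="nearest",
)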
Select time:
da.sel(time="2021-03-03 00")
Select time-range
da.loc["2020-01-01T00":"2020-01-01T12"]
query
da = xr.DataArray(np.arange(0, 5, 1), dims="x", name="a")
da.query(x="a > 2")
Resizing
Interpolate to double spatial resolution
new_lat = np.linspace(
    ds["lat"].min(), ds["lat"].max(), num=len(ds["lat"]) * 2, dtype="float32"
)
new_lon = np.linspace(
    ds["lon"].min(), ds["lon"].max(), num=len(ds["lon"]) * 2, dtype="float32"
)
ds_out = xr.Dataset(
    {
        "lat": (["lat"], new_lat),
        "lon": (["lon"], new_lon),
    }
)
da.interp_like(ds_out)
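Equivalently, pass the new coordinates straight to interp:
da.interp(lat=new_lat, lon=new_lon)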
Coarsening to half spatial resolution
ds.coarsen(lat=2, lon=2).mean()
Filtering
Replace values less than 4 with 1 and give everything else a nan
xr.where(da < 4, 1, np.nan)
Return the coordinates of a value (e.g. the max)
ds.where(ds == ds.max(), drop=True)
Drop nas
ds.dropna(dim="time")
Stacking and unstacking
da.stack({'index': ["time", "latitude", "longitude"]})
da.stack({'point': ["latitude", "longitude"]})
da.stack({'index': ['y', 'x']}).isel(index=index).unstack()
da.unstack(sparse=True)
Analytics
See if a data_var has NaNs (isnull / isnan)
da.isnull().any()
Find location of max
da.isel(da.argmax(...))
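To get the coordinate label of the max along a single dimension (assuming a "time" dim):
da.idxmax(dim="time")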
See chunks
ds.chunks
Rechunk
ds = ds.chunk({"time": 12, "latitude": 227, "longitude": 350})
from rechunker import rechunk
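A minimal rechunker sketch (the store paths, variable name, chunk sizes, and memory limit are all placeholder assumptions; rechunker writes a rechunked copy via a temp store):
from rechunker import rechunk
plan = rechunk(
    ds,  # an xarray.Dataset or zarr group/array
    target_chunks={"var": {"time": 24, "latitude": 227, "longitude": 350}},
    max_mem="1GB",
    target_store="ds_rechunked.zarr",
    temp_store="ds_tmp.zarr",
)
plan.execute()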
Apply
Apply a function to every data point in a Dataset:
def magnitude(u, v):
    return np.sqrt(u ** 2 + v ** 2)
ws10 = xr.apply_ufunc(magnitude, ds["u10"], ds["v10"])
ws10 = xr.apply_ufunc(magnitude, ds["u10"], ds["v10"], dask="allowed").compute()
ws10.attrs = ds["u10"].attrs
ws10.attrs["long_name"] = "10 metre wind speed"
def uv_to_direction_from(u, v):
    return np.mod(90 - np.arctan2(-v, -u) * (180 / np.pi), 360)
wdir10 = xr.apply_ufunc(uv_to_direction_from, ds["u10"], ds["v10"])
calm_mask = (ds["u10"] == 0) & (ds["v10"] == 0)
wdir10 = wdir10.where(~calm_mask, 0.0)  # direction is undefined in calm conditions
wdir10.attrs["units"] = "deg"
wdir10.attrs["long_name"] = "10 metre wind direction from"
def wind_dir_from_to_cat(x):
    if 45 <= x < 135:
        return "E"
    elif 135 <= x < 225:
        return "S"
    elif 225 <= x < 315:
        return "W"
    else:
        return "N"
wind_dir_from_to_cat_vec = np.vectorize(wind_dir_from_to_cat)
ds["wind_direction_categorical"] = xr.apply_ufunc(wind_dir_from_to_cat_vec, ds["wind_direction"])
Plotting
Change matplotlib style
import matplotlib.pyplot as plt
plt.style.use("ggplot")
da.plot()
Choose color map
da.plot(cmap=plt.cm.Blues)
Modify colorbar
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
import matplotlib.pyplot as plt
color_list = ["blue", "red", "green", "purple", "white", "black"]
cmap = LinearSegmentedColormap.from_list("front_colors", color_list, N=len(color_list))
labels = ["Cold", "Warm", "Stationary", "Occluded", "None", "NA"]
# Match colormap segments to data categories
bounds = np.arange(0, len(color_list) + 1, 1)
norm = BoundaryNorm(bounds, cmap.N)
fig, ax = plt.subplots()
cax = da.plot(ax=ax, cmap=cmap, norm=norm, add_colorbar=False)
cbar = fig.colorbar(cax, ticks=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5], label="Fronts")
cbar.ax.set_yticklabels(labels)
plt.show()
Don't plot extremes (robust=True sets the colour limits from the 2nd and 98th percentiles)
da.plot(robust=True)
Change fig size
da.plot(figsize=(16, 8))
da.plot(aspect=3, size=6)
Add geofeatures
import cartopy.crs as ccrs
import cartopy.feature as cf
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 8))
ax = plt.subplot(projection=ccrs.PlateCarree())
da.plot(ax=ax)
ax.coastlines(resolution="110m")
ax.coastlines()
ax.add_feature(cf.BORDERS)
ax.gridlines(draw_labels=True);
facet by column
da.plot(col="DIMENSION")
overlay geopandas dataframe
ax = plt.subplot(projection=ccrs.PlateCarree())
da.plot.contour(ax=ax, levels=30)
gdf.plot(ax=ax, color="blue")
gdf2.plot(ax=ax, color="red")
Display
xr.set_options(display_max_rows=20, keep_attrs=True)
Query using CF parameters
import cf_xarray as cfxr
da.cf["standard_name_var"]