Xarray

IO

Read

Read tutorial data

import xarray as xr


ds = xr.tutorial.open_dataset("air_temperature").isel(time=0, lat=0, lon=0)

da = xr.tutorial.open_dataset("air_temperature").isel(time=0)["air"]


ds = xr.tutorial.open_dataset("eraint_uvz").isel(month=0, level=0)

da = xr.tutorial.open_dataset("eraint_uvz").isel(month=0, level=0)["u"].reset_coords(drop=True)

Read in a grib file:

import fsspec

import xarray as xr


ds = xr.open_dataset("f.grb", engine="cfgib")


ds = xr.open_mfdataset(

    "f.grb",

    engine="cfgrib",

    backend_kwargs=dict(filter_by_keys={"typeOfLevel": "heightAboveGround"}),

    #backend_kwargs=dict(filter_by_keys={"stepType": "instant", "typeOfLevel": "heightAboveGround", "level": 10}),

    #backend_kwargs=dict(filter_by_keys={"cfVarName": "u"}),

)


uri = "simplecache::s3://mf-nwp-models/arpege-world/v2/2021-10-24/00/UGRD/10m/0h.grib2"

file = fsspec.open_local(

    uri, s3={"anon": True}, filecache={"cache_storage": "/tmp/files"}

)

ds = xr.open_dataset(file, engine="cfgrib")

ds = xr.open_dataset(fsspec.open_local("simplecache::s3://BUCKET/FILE.grib"), engine="cfgrib")

Read in multiple files but don't have a dimension to concat along:

def preprocess(ds):

    return ds.expand_dims('time')


ds = xr.open_mfdataset("*.grb2", engine="cfgrib", preprocess=preprocess, parallel=True)


xr.open_mfdataset(

    files,

    engine="zarr",

    consolidated=True,

    combine="nested",

    concat_dim="valid_time",

)

Read NetCDF

import s3fs

import xarray as xr


ds = xr.open_dataset(

    s3fs.S3FileSystem(

        anon=True, default_fill_cache=False, default_cache_type="first"

    ).open("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc")

)

fs = s3fs.S3FileSystem(anon=True, default_fill_cache=False, default_cache_type="first")

with fs.open("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/WRFDS_2060-01-01.nc") as f:

    ds = xr.open_dataset(f)

    print(ds)


import fsspec

file = fsspec.open_local("simplecache::s3://BUCKET/file.nc", s3={"anon": True}, filecache={"cache_storage": "/tmp/files"})

ds = xr.open_dataset(file)

 

import s3fs

fs = s3fs.S3FileSystem()

with fs.open("s3://BUCKET/file.nc") as f:

    ds = xr.open_dataset(f, engine="h5netcdf")

Read data through thredds

ds = xr.open_dataset("https://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/uv3z", decode_times=False)

Read zarr

ds = xr.open_zarr("test.zarr", consolidated=True)

ds = xr.open_zarr("test.zarr", storage_options={"anon": True})

ds = xr.open_dataset("test.zarr", engine="zarr", chunks={}) # chunks="auto"

ds = xr.open_mfdataset("year*.zarr", parallel=True, engine="zarr")

ds = xr.open_mfdataset("year*.zarr", parallel=True, combine="nested", engine="zarr")


https://registry.opendata.aws/ecmwf-era5/ 

ds = xr.open_mfdataset(

    "s3://era5-pds/zarr/2020/12/data/eastward_wind_at_10_metres.zarr",

    engine="zarr",

    backend_kwargs=dict(storage_options={"anon": True}),

)


ds = xr.open_mfdataset(

    "s3://era5-pds/zarr/2020/12/data/eastward_wind_at_10_metres.zarr",

    engine="zarr",

    backend_kwargs=dict(storage_options={"anon": True}, consolidated=True), # don't think has any benefit

)


https://github.com/google-research/arco-era5 

ds = xr.open_zarr(

    "gs://gcp-public-data-arco-era5/co/single-level-reanalysis.zarr", 

    chunks={'time': 48},

    consolidated=True,

)

Create DataArray

import numpy as np

import pandas as pd

import xarray as xr


da = xr.DataArray(

    data=np.arange(4),

    coords=dict(lon=(["lon"], [0, 1, 358, 359])),

    dims="lon",

)


da = xr.DataArray(

       np.random.rand(3, 4, 5),

       coords=[

           pd.date_range("2000-01-01", "2000-01-03", freq="D"),

           np.arange(4),

           np.arange(5),

       ],

       dims=["time", "lat", "lon"],

       name='var'

)


da = xr.DataArray(

    np.random.rand(NX, NY),

    coords=[np.linspace(0, 90, num=NX), np.linspace(0, 90, num=NY)],

    dims=["lat", "lon"],

    name="var",

)

Dataset to numpy array

ds.to_array().values

Add a field to a dataset and keep the existing dims

ds_point["field"] = (("point"), np.zeros(len(ds_point["point"])))

Read OPENDAP:

If you require authentication see https://stackoverflow.com/a/66179413/6046019 
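If the server needs a login (e.g. NASA Earthdata), one option is to open a pydap session and hand it to xarray's pydap backend. A minimal sketch, assuming pydap is installed; the URL and credentials below are placeholders.

from pydap.cas.urs import setup_session

import xarray as xr


url = "https://SERVER/opendap/PATH/FILE.nc"  # placeholder OPeNDAP endpoint

session = setup_session("USERNAME", "PASSWORD", check_url=url)  # placeholder credentials

store = xr.backends.PydapDataStore.open(url, session=session)

ds = xr.open_dataset(store)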

from siphon.catalog import TDSCatalog

import xarray as xr


# cat = TDSCatalog('http://thredds.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_0p25deg/catalog.xml?dataset=grib/NCEP/GFS/Global_0p25deg/Best')

# url = cat.datasets[0].access_urls['OPENDAP']

url = 'https://thredds.ucar.edu/thredds/dodsC/grib/NCEP/GFS/Global_0p25deg/Best'

url = "https://thredds-jumbo.unidata.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_0p25deg/catalog.html"

ds = xr.open_dataset(url)

 concat files

f1 = "https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/gfs.0p25.2020010100.f000.grib2"

f2 = "https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/gfs.0p25.2020010100.f003.grib2"

ds1 = xr.open_dataset(f1)['u-component_of_wind_height_above_ground']

ds2 = xr.open_dataset(f2)['u-component_of_wind_height_above_ground']

ds = xr.concat([ds1, ds2], dim="time")

 using intake thredds (doesn't work)

import intake

import fsspec


fsspec.config.conf["simplecache"] = {"cache_storage": "/tmp/files", "same_names": True}


url = "simplecache::https://rda.ucar.edu/thredds/dodsC/files/g/ds084.1/2020/20200101/catalog.xml"


intake.open_thredds_merged(

    url,

    path=["gfs.0p25.2020010100.f*.grib2"],

    driver="netcdf",

    concat_kwargs={"dim": "time"},

    xarray_kwargs=dict(

        engine="cfgrib", backend_kwargs=dict(filter_by_keys={"cfVarName": "t2m"})

    ),

)

Preprocess utils

def select_location(ds: xr.Dataset) -> xr.Dataset:

    return ds.sel(lat=lat, lon=lon, method="nearest")


lambda ds: ds.sel(

    latitude=location.point.latitude,

    longitude=location.point.longitude,

    method="nearest",

)


def select_region(ds: xr.Dataset) -> xr.Dataset:

    return ds.sel(latitude=slice(lat_min, lat_max), longitude=slice(lon_min, lon_max))


def select_region_and_time(ds: xr.Dataset) -> xr.Dataset:

    return ds.sel(

        latitude=slice(lat_min, lat_max),

        longitude=slice(lon_min, lon_max),

        time=slice(time_min, time_max),

    )


def expand_time_dim(ds: xr.Dataset) -> xr.Dataset:

    return ds.expand_dims('time')


def swap_time_dim_to_lead_time_dim(ds: xr.Dataset) -> xr.Dataset:

    lead_time = (ds["time"] - ds["reftime"]).squeeze().values

    ds = ds.assign_coords(lead_time=lead_time)

    data_var = list(ds.data_vars)[0]

    ds[data_var] = ds[data_var].swap_dims({"time": "lead_time"})

    return ds.drop_dims("time")

Store

Write to zarr

from numcodecs import Blosc

import zarr


# ds["var"].encoding = {}

ds.to_zarr("ds.zarr", consolidated=True, mode="w")

ds.to_zarr("memory://ds.zarr")

ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True)

ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="w")


compressor = Blosc(cname="zstd", clevel=9, shuffle=Blosc.BITSHUFFLE)

ds.to_zarr("ds.zarr", encoding={"data_var": {"compressor": compressor}})


# append variables to zarr store

ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="w")

ds["var2"] = ds["var"]

ds = ds.drop_vars(["var"])

ds.to_zarr("s3://BUCKET/ds.zarr", consolidated=True, mode="a")


# no chunks

ds.chunk(-1).to_zarr("ds.zarr", consolidated=True, mode="w")

to pickle

import pickle


pickle.dump(ds, open("/tmp/ds.pkl", "wb"), protocol=-1)

ds = pickle.load(open("/tmp/ds.pkl", "rb"))


import s3fs

fs = s3fs.S3FileSystem()

pickle.dump(ds, fs.open("s3://BUCKET/tmp.pkl", "wb"), protocol=-1)

ds = pickle.load(fs.open("s3://BUCKET/tmp.pkl", "rb"))

DataArray to Dataset

da.to_dataset()

To dataframe

ds.to_dataframe()

da.to_series()

ds.to_dask_dataframe()

ETL

Reverse latitude

ds = ds.reindex(latitude=ds["latitude"][::-1])

Reassign longitude (0 - 360 -> -180 - 180)

with xr.set_options(keep_attrs=True):

    ds = ds.assign_coords(longitude=(((ds["longitude"] + 180) % 360) - 180))

ds = ds.sortby("longitude")

Reset / drop coords (drop scalar/empty coords)

ds.reset_coords(drop=True)

Add a forecast hour coordinate

ds.assign_coords(time_delta=ds["time"] - ds["reftime"])

Roll longitude (over meridian)

ds.roll(lon=1, roll_coords=True)

Drop variables

ds.drop_vars("var")

Rename variables

ds.rename({'lat': 'latitude', 'lon': 'longitude'})

Convert time to a timezone

ds = ds.assign_coords(

    {

        "time": ds["time"]

        .to_index()

        .tz_localize("UTC")

        .tz_convert("America/Los_Angeles")

        .tz_localize(None)

        .values

    }

)

Drop dimensions

ds.drop_dims(["a", "b"])

Get variable name

list(ds.data_vars)

Fill values

ds.fillna(5)

Update values

da.loc[dict(axis_A="A_1", axis_B="B_1", axis_C="C_1")] = 1

See bytes

from dask.utils import format_bytes

format_bytes(ds.nbytes)

Concatenating, Merging and Joining

xr.merge([ds1, ds2])

 

xr.combine_by_coords([ds1, ds2])

 

ds = xr.open_mfdataset(files, combine="nested")

concat zarrs

da = xr.DataArray(

       np.random.rand(2, 2, 1),

       coords=[

           np.arange(2),

           np.arange(2),

           np.arange(1),

       ],

       dims=["x", "y", "z"],

       name="da"

)

da.to_dataset().to_zarr("/tmp/ds_z0.zarr", consolidated=True, mode="w")

da2 = da.copy()

da2 = da2.assign_coords(z=da2["z"] + 1)

da2.to_dataset().to_zarr("/tmp/ds_z1.zarr", consolidated=True, mode="w")

ds = xr.open_mfdataset("/tmp/ds_z*.zarr", engine="zarr")

Reorder dimensions (transpose)

ds.transpose("lon", "lat", "time")

Selecting

See data variables and dimensions:

import pandas as pd


df = pd.DataFrame(

    [[name, var.dims] for name, var in ds.data_vars.items()],

    columns=["Data Variable", "Dimensions"],

)

Select lat-lon region:

ds.sel(lat=slice(50, 55), lon=slice(0, 5))

isel

ds.isel(depth=slice(0, 6))

Select points:

ds.sel(lat=lats, lon=lons, method="nearest")

Select time:

da.sel(time="2021-03-03 00")

 Select time-range

da.loc["2020-01-01T00":"2020-01-01T12"]

query

da = xr.DataArray(np.arange(0, 5, 1), dims="x", name="a")

da.query(x="a > 2")

Resizing

Interpolate to double spatial resolution

new_lat = np.linspace(

    ds["lat"].min(), ds["lat"].max(), num=len(ds["lat"]) * 2, dtype="float32"

)

new_lon = np.linspace(

    ds["lon"].min(), ds["lon"].max(), num=len(ds["lon"]) * 2, dtype="float32"

)

ds_out = xr.Dataset(

    {

        "lat": (["lat"], new_lat),

        "lon": (["lon"], new_lon),

    }

)

da.interp_like(ds_out)

Coarsening to half spatial resolution

ds.coarsen(lat=2, lon=2).mean()

Filtering

Replace values less than 4 with 1 and give everything else a nan

xr.where(da < 4, 1, np.nan)

Return the coordinates where a value occurs (e.g. the max)

ds.where(ds == ds.max(), drop=True)

Drop nas

ds.dropna(dim="time")  # dim is required

Stacking and unstacking

da.stack({'index': ["time", "latitude", "longitude"]})

da.stack({'point': ["latitude", "longitude"]})

da.stack({'index': ['y', 'x']}).isel(index=index).unstack()

da.unstack(sparse=True)

Analytics

See if a data_var has NaNs (isnull)

da.isnull().any()

Find location of max

da.isel(da.argmax(...))

See chunks

ds.chunks

 Rechunk

ds = ds.chunk({"time": 12, "latitude": 227, "longitude": 350})


from rechunker import rechunk
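rechunker rewrites a store to new chunks within a memory budget, via an intermediate temp store. A minimal sketch against a single zarr array; the paths, chunk shape and memory limit below are placeholder assumptions.

import zarr


source = zarr.open("ds.zarr/var")  # existing zarr array to rechunk (placeholder path)

plan = rechunk(

    source,

    target_chunks=(12, 227, 350),  # desired chunk shape per dimension

    max_mem="1GB",  # per-worker memory budget

    target_store="ds_rechunked.zarr",

    temp_store="rechunker_tmp.zarr",

)

plan.execute()  # runs the copy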


Apply

Every data point in Dataset:

def magnitude(u, v):

    return np.sqrt(u ** 2 + v ** 2)


ws10 = xr.apply_ufunc(magnitude, ds["u10"], ds["v10"])

ws10 = xr.apply_ufunc(magnitude, ds["u10"], ds["v10"], dask="allowed").compute()

ws10.attrs = ds["u10"].attrs

ws10.attrs["long_name"] = "10 metre wind speed"

 

def uv_to_direction_from(u, v):

    return np.mod(90 - np.arctan2(-v, -u) * (180 / np.pi), 360)


wdir10 = xr.apply_ufunc(uv_to_direction_from, ds["u10"], ds["v10"])

calm_mask = xr.where((ds["u10"] == 0) & (ds["v10"] == 0), True, False)

wdir10 = wdir10.where(~calm_mask, 0.0)

wdir10.attrs["units"] = "deg"

wdir10.attrs["long_name"] = "10 metre wind direction from"

 

def wind_dir_from_to_cat(x):

    if 45 <= x < 135:

        return "E"

    elif 135 <= x < 225:

        return "S"

    elif 225 <= x < 315:

        return "W"

    else:

        return "N"

wind_dir_from_to_cat_vec = np.vectorize(wind_dir_from_to_cat)

ds["wind_direction_categorical"] = xr.apply_ufunc(wind_dir_from_to_cat_vec, ds["wind_direction"])


Plotting

Change matplotlib style

import matplotlib.pyplot as plt


plt.style.use("ggplot")

da.plot()

Choose color map

da.plot(cmap=plt.cm.Blues)

Modify colorbar

from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap

import matplotlib.pyplot as plt


color_list = ["blue", "red", "green", "purple", "white", "black"]

cmap = LinearSegmentedColormap.from_list("front_colors", color_list, N=len(color_list))

labels = ["Cold", "Warm", "Stationary", "Occluded", "None", "NA"]


# Match colormap segments to data categories

bounds = np.arange(0, len(color_list) + 1, 1)

norm = BoundaryNorm(bounds, cmap.N)


fig, ax = plt.subplots()

cax = da.plot(ax=ax, cmap=cmap, norm=norm, add_colorbar=False)

cbar = fig.colorbar(cax, ticks=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5], label="Fronts")

cbar.ax.set_yticklabels(labels)

plt.show()

Don't plot extremes (robust color limits)

da.plot(robust=True)

Change fig size

da.plot(figsize=(16, 8))

da.plot(aspect=3, size=6)

Add geofeatures

import cartopy.crs as ccrs

import cartopy.feature as cf

import matplotlib.pyplot as plt



plt.figure(figsize=(16, 8))

ax = plt.subplot(projection=ccrs.PlateCarree())

da.plot(ax=ax)

ax.coastlines(resolution="110m")

ax.coastlines()

ax.add_feature(cf.BORDERS)

ax.gridlines(draw_labels=True);

facet by column

da.plot(col="DIMENSION")

overlay geopandas dataframe

ax = plt.subplot(projection=ccrs.PlateCarree())

da.plot.contour(ax=ax, levels=30)

gdf.plot(ax=ax, color="blue")

gdf2.plot(ax=ax, color="red")

Display

xr.set_options(display_max_rows=20, keep_attrs=True)

Query using CF parameters


import cf_xarray as cfxr


da.cf["standard_name_var"]