This notebook demonstrates using dask and AWS Fargate to calculate 95th percentile September wind gust climatologies for each gridpoint from 2016-2019 for the entire CONUS in under 10 minutes.
In this example, the "win" of cloud computing has nothing to do with performing calculations across large data. Instead, we run a calculation across small data (2,880 hours) a large number of times (there are almost 2 million gridpoints). I/O takes much more time than computation, so beefier machines don't help––what helps is having many separate machines, each with its own network bandwidth. Because we're I/O bound and don't need much CPU or memory, we can request many small, cheap machines to work in parallel.
Fargate is an AWS product that allows us to abstract away all management of physical or virtual machines. Dask Cloud Provider adds another level of ease-of-use in deploying our code. Dask Cloud Provider will create a cluster for us––48 worker nodes managed by 1 central scheduler node that will divvy up tasks and collect the computation results.
This is called an "ephemeral cluster", created to perform just one job and then disappear. It's an approach that allows us to perform the analyses we need when we need them without having to purchase physical machines, handle scheduling machine use with colleagues, or pay for cloud resources that are running idle.
Every worker will run the same code in parallel. For this example, we'll be writing low-level code to grab each chunk of the zarr array and hand it to a separate worker.
We should be able to use libraries for this––dask, zarr, and xarray all have built-in support for parallelizing computations over chunks. However, I was running into trouble using them since the largest dimension in the dataset (time) is represented as separate datasets, not a dimension within the data array. Zarr arrays aren't easy to update, so it makes sense to represent the hourly-updating HRRR data as multiple arrays based on time. I'll update this example if/when I find a good way to get one of the libraries to partition the data by chunk and aggregate by time efficiently.
import dataclasses
import datetime
import numcodecs as ncd
import numpy as np
import s3fs
# Code for low-level access to chunk data (could be simplified for this specific use case)
s3 = s3fs.S3FileSystem(anon=True)
@dataclasses.dataclass
class ZarrId:
    run_hour: datetime.datetime
    level_type: str
    var_level: str
    var_name: str
    model_type: str

    def format_chunk_id(self, chunk_id):
        # Forecast arrays have an extra (time) dimension, so chunk ids get a "0." prefix
        return f"0.{chunk_id}" if self.model_type == "fcst" else chunk_id
def create_s3_chunk_url(zarr_id, chunk_id, prefix=False):
    url = zarr_id.run_hour.strftime(
        f"hrrrzarr/{zarr_id.level_type}/%Y%m%d/%Y%m%d_%Hz_{zarr_id.model_type}.zarr/")
    url += f"{zarr_id.var_level}/{zarr_id.var_name}/{zarr_id.var_level}/{zarr_id.var_name}/"
    url += f"{zarr_id.format_chunk_id(chunk_id)}"
    return url
def retrieve_data(zarr_id, s3_url):
    with s3.open(s3_url, 'rb') as compressed_data:
        buffer = ncd.blosc.decompress(compressed_data.read())

    dtype = "<f2"
    if zarr_id.var_level == "surface" and zarr_id.var_name == "PRES":
        dtype = "<f4"
    chunk = np.frombuffer(buffer, dtype=dtype)

    if zarr_id.model_type == "anl":
        data_array = np.reshape(chunk, (150, 150))
    else:
        entry_size = 22500
        data_array = np.reshape(chunk, (len(chunk) // entry_size, 150, 150))
    return data_array
def get_chunk(zid, chunk_id):
    try:
        return retrieve_data(zid, create_s3_chunk_url(zid, chunk_id))
    except Exception:
        # Some hours are missing from the archive; skip them
        return None
# The actual calculation
def get_percentile_for_chunk(chunk_id):
    gust_array = np.array([chunk for chunk in (get_chunk(zid, chunk_id) for zid in zids)
                           if chunk is not None])
    return np.percentile(gust_array, 95, axis=0)
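As a toy illustration of what get_percentile_for_chunk computes, np.percentile with axis=0 collapses a stack of per-hour grids into one grid of 95th-percentile values (synthetic numbers below, not real HRRR data):

```python
import numpy as np

# Three fake hourly "chunks" of a 2x2 grid, stacked along a new time axis
hours = np.array([
    [[1.0, 2.0], [3.0, 4.0]],
    [[5.0, 6.0], [7.0, 8.0]],
    [[9.0, 10.0], [11.0, 12.0]],
])

# 95th percentile over the time axis: one value per gridpoint
p95 = np.percentile(hours, 95, axis=0)
print(p95.shape)  # (2, 2)
```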
## Define the data we want to retrieve
from pandas import date_range
dates = []
for year in range(2016, 2020):
    dates += [hour.to_pydatetime()
              for hour in date_range(start=f"{year}-09-01", end=f"{year}-10-01",
                                     freq="H", closed="left")]
zids = [ZarrId(run_hour=date,
               level_type="sfc",
               var_level="surface",
               var_name="GUST",
               model_type="anl")
        for date in dates]
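As a sanity check on the list above (a pure-datetime sketch, sidestepping pandas version differences around the closed/inclusive argument), generating the same hours directly confirms the 2,880 hours mentioned at the top:

```python
from datetime import datetime, timedelta

# Hourly timestamps for each September, 2016-2019 (start inclusive, end exclusive)
september_hours = []
for year in range(2016, 2020):
    t = datetime(year, 9, 1)
    while t < datetime(year, 10, 1):
        september_hours.append(t)
        t += timedelta(hours=1)

print(len(september_hours))  # 4 Septembers x 30 days x 24 hours = 2880
```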
Using dask_cloudprovider, starting the cluster is one line of code. Note that I added some extra arguments to make it easy to find the logs on CloudWatch.
It takes 2-3 minutes to start up for me. You'll start owing Amazon a (small) bill once these resources start up, but note that they'll automatically get killed if you leave the cluster running idle for too long, reducing the risk to you of accidentally incurring large charges. (You should still check manually though and kill any resources left running.) The downside is that it's quite possible for the cluster to time out while you're trying to do interactive data analysis (or debug your code). You can pass the scheduler_timeout argument to mitigate this.
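For example (the "30 minutes" value is just an illustration; check the dask_cloudprovider docs for your version), the timeout can be raised when the cluster is created:

```python
from dask_cloudprovider.aws import FargateCluster

# Keep the idle scheduler alive longer during interactive work
# (the "30 minutes" value here is an assumed example, not a recommendation)
cluster = FargateCluster(n_workers=48, worker_cpu=256, worker_mem=1024,
                         scheduler_timeout="30 minutes")
```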
The "image" argument below is a reference to a docker image on Amazon's Elastic Container Registry (ECR). You'll need to have your notebook run from an environment with compatible dependency versions––the contents of a requirements.txt stating these are at the bottom of this notebook.
To make the image, I copied the Dockerfile from the daskdev/dask image and added my own dependencies. The documentation describes passing $EXTRA_CONDA_PACKAGES to the default image, but that option doesn't appear to be available with FargateCluster. It should be possible with EC2Cluster.
%%time
import datetime
from dask_cloudprovider.aws import FargateCluster
time_string = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")  # just used to identify logs
cluster = FargateCluster(n_workers=48, worker_cpu=256, worker_mem=1024,  # use cheap, wimpy machines
                         image="public.ecr.aws/i5z4g4e5/climo",
                         cloudwatch_logs_group=f"dask-climo-{time_string}")
cluster
CPU times: user 33.7 s, sys: 3.17 s, total: 36.8 s Wall time: 2min 36s
The displayed link shows how far along the workers are in the required computations. It also has a tab showing each node's resource usage, so you can make sure to request machines with the minimum possible resources and minimize your costs.
from dask.distributed import Client
client = Client(cluster) ## If you forget this line, Dask will run locally or on the last-configured client!
We use the dask library to set up a "delayed" object with the instructions for calculating the desired percentile for every chunk. Calling compute() then launches the actual calculation.
The job will take about 5-6 minutes. It could theoretically be halved by using 96 machines instead of 48 without appreciably increasing the cost (which should be under 50 cents). But when I tried, the cluster struggled to start. YMMV.
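To see the delayed pattern in isolation, here's a toy version with a cheap stand-in function (no cluster needed; without a configured Client, dask runs this on the local machine):

```python
import numpy as np
from dask import delayed

def square(x):
    return x * x

# Build a task graph lazily; nothing executes yet
lazy = delayed(np.array)([delayed(square)(i) for i in range(4)])

# compute() runs the graph (locally here, or on the cluster when a Client is active)
result = lazy.compute()
print(result)  # [0 1 4 9]
```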
%%time
from dask import delayed
chunk_y = range(8)
chunk_x = range(12)

gust_climo_data = delayed(np.array)([
    [delayed(get_percentile_for_chunk)(f"{y}.{x}") for y in chunk_y]
    for x in chunk_x])
gust_climo_data = gust_climo_data.compute()
CPU times: user 2.25 s, sys: 202 ms, total: 2.45 s Wall time: 5min 45s
cluster.close()
I've noticed that calling cluster.close() often produces errors and sometimes leaves the scheduler hanging, at least if I'm being impatient and starting/killing cluster after cluster in the same notebook. I often finish cleaning up resources manually using the ECS dashboard. (ECS = Elastic Container Service, the parent service for Fargate.)
Now that we have a small-enough dataset returned to a single machine, we can finish our analysis without using cloud resources.
We knit the chunk data into a single array, then get the latitude/longitude values so we can plot it.
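On toy data, the hstack-of-vstacks assembly looks like this, with chunks[x][y] holding the tile in column x, row y (2x2 tiles standing in for the 150x150 HRRR chunks):

```python
import numpy as np

def tile(v):
    # A 2x2 tile filled with a constant, standing in for one 150x150 chunk
    return np.full((2, 2), v)

# chunks[x][y]: tile in column x, row y of the grid
chunks = [[tile(10 * x + y) for y in range(3)] for x in range(2)]

# Stack each column of tiles vertically, then join the columns horizontally
grid = np.hstack([np.vstack(col) for col in chunks])
print(grid.shape)  # (6, 4): 3 tiles tall, 2 tiles wide
```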
%%time
gust_climo_data = np.hstack(
    [np.vstack(chunk_data) for chunk_data in gust_climo_data])
CPU times: user 10.6 ms, sys: 10.7 ms, total: 21.2 ms Wall time: 21.1 ms
%%time
# Get lats and lons from chunk index
import xarray as xr
chunk_index = xr.open_zarr(s3fs.S3Map("s3://hrrrzarr/grid/HRRR_chunk_index.zarr", s3=s3))
CPU times: user 239 ms, sys: 30.2 ms, total: 269 ms Wall time: 2.26 s
%%time
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
# Trim dummy gridpoints: edge chunks are padded out to 150x150, but the HRRR grid is 1059x1799
matrix = gust_climo_data[:1059, :1799]
fig = plt.figure(figsize=(10,10))
ax = plt.axes(projection=ccrs.PlateCarree())
plt.contourf(chunk_index.longitude, chunk_index.latitude, matrix)
ax.coastlines()
plt.show()
plt.close()
CPU times: user 2.44 s, sys: 381 ms, total: 2.82 s Wall time: 34.8 s
python==3.8.10
python-blosc==1.9.2
cytoolz==0.11.0
dask==2021.5.0
dask_cloudprovider==2021.6.0
lz4==3.1.3
nomkl
numpy==1.20.3
pandas==1.0.1
xarray==0.18.2
s3fs==2021.5.0
zarr==2.8.3
boto3==1.16.52
msgpack-python==1.0.2
matplotlib
cartopy
metpy