Last active
December 9, 2025 17:34
-
-
Save bmorris3/6b110f19cc21f0ec56c4cbf99acdaa40 to your computer and use it in GitHub Desktop.
example for building MAESTRO Zarr arrays
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "8f988550-1925-4851-9f0d-3ecd939257ae", | |
| "metadata": {}, | |
| "source": [ | |
| "# Minimal Example: MAESTRO HDF5 to Zarr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "95a4b1a1-f682-400e-99e4-2ef3e6274327", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import datetime\n", | |
| "from tqdm.auto import tqdm\n", | |
| "\n", | |
| "import h5py\n", | |
| "import numpy as np\n", | |
| "import zarr\n", | |
| "\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "from astropy.table import Table" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# read the grid description table from CSV:\n", | |
| "grid = Table.read('grid1460.csv')\n", | |
| "# derived 'index' column = file_number - 1\n", | |
| "# (presumably file_number is 1-based, making 'index' zero-based -- TODO confirm)\n", | |
| "grid['index'] = grid['file_number'] - 1\n", | |
| "# register 'index' as a table index so rows can be looked up with `grid.loc[...]`:\n", | |
| "grid.add_index('index')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d68cac62-2d88-4054-bac6-6ace55100060", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# HDF5 archive to convert; its file name (without the '.h5' extension)\n", | |
| "# is used as the molecule name and as the name of the output store:\n", | |
| "archive_path = '../12C-H4.h5'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# open the archive read-only; later cells read datasets from this handle.\n", | |
| "# NOTE(review): the handle is never closed in this notebook -- consider\n", | |
| "# calling h5_file.close() once the conversion has finished.\n", | |
| "h5_file = h5py.File(archive_path, 'r')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d54ea68a-041e-4e3e-9242-899801a88752", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# read the pressure and temperature coordinate datasets fully into memory:\n", | |
| "press_coords = h5_file['pressure_coords'][:]\n", | |
| "temp_coords = h5_file['temperature_coords'][:]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "557aa663-760a-452e-8160-596e38513d7b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# extract only the molecule name:\n", | |
| "path = archive_path.split('/')[-1].split('.h5')[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "799245d8-2991-45a9-bba3-c62331bf5f6e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Natasha's recommended uniform wavenumber sampling \n", | |
| "# (MAESTRO slack Nov 25 2025)\n", | |
| "row = grid[0]\n", | |
| "start = row['start_wavenumber']\n", | |
| "number_wave_pts = row['number_wave_pts']\n", | |
| "delta_wavenumber = row['delta_wavenumber']\n", | |
| "new_wvno_grid = np.arange(number_wave_pts) * delta_wavenumber + start\n", | |
| "\n", | |
| "wavenumber_sampling = new_wvno_grid[::2]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "compressed_path = f'{path}.zarr'\n", | |
| "min_wavenumber_pts = int(grid['number_wave_pts'].min())\n", | |
| "\n", | |
| "temperatures = np.sort(list(set(temp_coords)))\n", | |
| "pressures = np.sort(list(set(press_coords)))\n", | |
| "\n", | |
| "# these compression settings will apply to every array:\n", | |
| "compression = dict(\n", | |
| " compressors=zarr.codecs.BloscCodec(\n", | |
| " cname=\"zstd\",\n", | |
| " # here we use the maximum compression level,\n", | |
| " # which takes longer to compress (one time)\n", | |
| " # but not more time to decompress (many times)\n", | |
| " clevel=9,\n", | |
| " shuffle=zarr.codecs.BloscShuffle.shuffle\n", | |
| " )\n", | |
| ")\n", | |
| "\n", | |
| "store = zarr.storage.LocalStore(compressed_path)\n", | |
| "root = zarr.create_group(store, overwrite='w', zarr_format=3)\n", | |
| "root.create_array(\n", | |
| " 'temperature', \n", | |
| " data=temperatures, \n", | |
| " dimension_names=('temperature',), \n", | |
| " attributes=dict(coordinates='temperature'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "root.create_array(\n", | |
| " 'pressure', \n", | |
| " data=pressures, \n", | |
| " dimension_names=('pressure',), \n", | |
| " attributes=dict(coordinates='pressure'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "root.create_array(\n", | |
| " 'wavenumber', \n", | |
| " data=wavenumber_sampling, \n", | |
| " dimension_names=('wavenumber',), \n", | |
| " attributes=dict(coordinates='wavenumber'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "\n", | |
| "arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)\n", | |
| "arr_attrs = dict(\n", | |
| " # `coordinates` is required for remote indexing with xarray:\n", | |
| " coordinates='wavenumber temperature pressure',\n", | |
| "\n", | |
| " # for MAESTRO versioning:\n", | |
| " molecule=path,\n", | |
| " source=dict(\n", | |
| " database='MAESTRO',\n", | |
| " created=str(datetime.datetime.now()),\n", | |
| " version=0.1 # or pick your number\n", | |
| " ),\n", | |
| " compression=dict(\n", | |
| " codec=compression['compressors'].__class__.__name__,\n", | |
| " algorithm=compression['compressors'].cname.name,\n", | |
| " clevel=compression['compressors'].clevel,\n", | |
| " shuffle=compression['compressors'].shuffle.__class__.__name__,\n", | |
| " )\n", | |
| ")\n", | |
| "dimension_names = 'wavenumber temperature pressure'.split()\n", | |
| "\n", | |
| "arr = root.create_array(\n", | |
| " 'csx', \n", | |
| " shape=arr_shape, \n", | |
| " dtype=np.float64, \n", | |
| " dimension_names=dimension_names,\n", | |
| " attributes=arr_attrs,\n", | |
| " **compression\n", | |
| ")\n", | |
| "\n", | |
| "# this promotes efficient remote indexing:\n", | |
| "zarr.consolidate_metadata(store)\n", | |
| "\n", | |
| "\n", | |
| "# loop over temperature and pressure to downsample the \n", | |
| "# wavenumber grid and store the results in the zarr array:\n", | |
| "for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n", | |
| " for j, pressure in enumerate(pressures):\n", | |
| " nearest_coord = np.argmin(np.hypot(grid['pressure_bar'] - pressure, grid['temperature_K'] - temperature)) \n", | |
| " grid_point = dict(grid.loc[nearest_coord])\n", | |
| " cross_section = h5_file['cxs'][nearest_coord]\n", | |
| " wavenumber = np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber'] + grid_point['start_wavenumber']\n", | |
| "\n", | |
| "\n", | |
| " arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.12" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment