bmorris3 · December 9, 2025 17:34
diff --git a/hdf5-to-zarr-minimal.ipynb b/hdf5-to-zarr-minimal.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8f988550-1925-4851-9f0d-3ecd939257ae",
   "metadata": {},
   "source": [
    "# Minimal Example: MAESTRO HDF5 to Zarr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95a4b1a1-f682-400e-99e4-2ef3e6274327",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "import h5py\n",
    "import numpy as np\n",
    "import zarr\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from astropy.table import Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the grid table from CSV, then add an `index` column equal to\n",
    "# `file_number - 1` and register it as a table index so rows can be\n",
    "# looked up with `grid.loc[...]`.\n",
    "# NOTE(review): presumably `file_number` is one-based, making `index`\n",
    "# the zero-based position of each grid point in the archive - confirm.\n",
    "grid = Table.read('grid1460.csv')\n",
    "grid['index'] = grid['file_number'] - 1\n",
    "grid.add_index('index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d68cac62-2d88-4054-bac6-6ace55100060",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative path to the HDF5 archive that this notebook converts;\n",
    "# the file name (minus `.h5`) is later reused as the molecule name.\n",
    "archive_path = '../12C-H4.h5'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the archive read-only. The handle is left open for the cells\n",
    "# below; nothing in this notebook closes it.\n",
    "h5_file = h5py.File(archive_path, 'r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d54ea68a-041e-4e3e-9242-899801a88752",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `[:]` reads each coordinate dataset out of the archive in full.\n",
    "# NOTE(review): presumably these hold one pressure / temperature value\n",
    "# per stored grid point, with repeats - confirm against the archive.\n",
    "press_coords = h5_file['pressure_coords'][:]\n",
    "temp_coords = h5_file['temperature_coords'][:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "557aa663-760a-452e-8160-596e38513d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract only the molecule name: keep what follows the last '/',\n",
    "# then cut at the first '.h5'\n",
    "path = archive_path.rsplit('/', 1)[-1].partition('.h5')[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "799245d8-2991-45a9-bba3-c62331bf5f6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Natasha's recommended uniform wavenumber sampling\n",
    "# (MAESTRO Slack, Nov 25 2025): rebuild the uniform grid described\n",
    "# by the first row of the grid table, then keep every other point.\n",
    "row = grid[0]\n",
    "start, number_wave_pts, delta_wavenumber = (\n",
    "    row[column]\n",
    "    for column in ('start_wavenumber', 'number_wave_pts', 'delta_wavenumber')\n",
    ")\n",
    "new_wvno_grid = start + delta_wavenumber * np.arange(number_wave_pts)\n",
    "\n",
    "wavenumber_sampling = new_wvno_grid[0::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the HDF5 archive into a compressed Zarr v3 store: write one\n",
    "# coordinate array per axis, then fill the (wavenumber, temperature,\n",
    "# pressure) cross-section cube one grid point at a time.\n",
    "compressed_path = f'{path}.zarr'\n",
    "\n",
    "# sorted, de-duplicated coordinate values for the output axes:\n",
    "temperatures = np.unique(temp_coords)\n",
    "pressures = np.unique(press_coords)\n",
    "\n",
    "# these compression settings will apply to every array:\n",
    "compression = dict(\n",
    "    compressors=zarr.codecs.BloscCodec(\n",
    "        cname=\"zstd\",\n",
    "        # here we use the maximum compression level,\n",
    "        # which takes longer to compress (one time)\n",
    "        # but not more time to decompress (many times)\n",
    "        clevel=9,\n",
    "        shuffle=zarr.codecs.BloscShuffle.shuffle\n",
    "    )\n",
    ")\n",
    "codec = compression['compressors']\n",
    "\n",
    "store = zarr.storage.LocalStore(compressed_path)\n",
    "# `overwrite` is a boolean flag, not a file-mode string:\n",
    "root = zarr.create_group(store, overwrite=True, zarr_format=3)\n",
    "\n",
    "# one 1D coordinate array per axis, each named after its dimension:\n",
    "coordinate_values = dict(\n",
    "    temperature=temperatures,\n",
    "    pressure=pressures,\n",
    "    wavenumber=wavenumber_sampling,\n",
    ")\n",
    "for name, values in coordinate_values.items():\n",
    "    root.create_array(\n",
    "        name,\n",
    "        data=values,\n",
    "        dimension_names=(name,),\n",
    "        attributes=dict(coordinates=name),\n",
    "        **compression\n",
    "    )\n",
    "\n",
    "dimension_names = ['wavenumber', 'temperature', 'pressure']\n",
    "arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)\n",
    "arr_attrs = dict(\n",
    "    # `coordinates` is required for remote indexing with xarray:\n",
    "    coordinates=' '.join(dimension_names),\n",
    "\n",
    "    # for MAESTRO versioning:\n",
    "    molecule=path,\n",
    "    source=dict(\n",
    "        database='MAESTRO',\n",
    "        created=str(datetime.datetime.now()),\n",
    "        version=0.1  # or pick your number\n",
    "    ),\n",
    "    compression=dict(\n",
    "        codec=type(codec).__name__,\n",
    "        algorithm=codec.cname.name,\n",
    "        clevel=codec.clevel,\n",
    "        # record the shuffle mode (here 'shuffle'); `__class__.__name__`\n",
    "        # would only give the enum's class name, 'BloscShuffle':\n",
    "        shuffle=codec.shuffle.name,\n",
    "    )\n",
    ")\n",
    "\n",
    "# NOTE(review): this array is named 'csx' while the HDF5 dataset read\n",
    "# below is 'cxs' - confirm which spelling readers of the store expect.\n",
    "arr = root.create_array(\n",
    "    'csx',\n",
    "    shape=arr_shape,\n",
    "    dtype=np.float64,\n",
    "    dimension_names=dimension_names,\n",
    "    attributes=arr_attrs,\n",
    "    **compression\n",
    ")\n",
    "\n",
    "# this promotes efficient remote indexing:\n",
    "zarr.consolidate_metadata(store)\n",
    "\n",
    "# loop-invariant lookups, hoisted out of the nested loop below:\n",
    "cxs = h5_file['cxs']\n",
    "grid_pressure = grid['pressure_bar']\n",
    "grid_temperature = grid['temperature_K']\n",
    "\n",
    "# loop over temperature and pressure to downsample the\n",
    "# wavenumber grid and store the results in the zarr array:\n",
    "for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n",
    "    for j, pressure in enumerate(pressures):\n",
    "        # NOTE(review): the distance mixes bar and K without any scaling -\n",
    "        # confirm that the nearest neighbour it picks is the intended one\n",
    "        # when a (temperature, pressure) pair has no exact match in the grid.\n",
    "        distance = np.hypot(grid_pressure - pressure, grid_temperature - temperature)\n",
    "\n",
    "        # `argmin` returns a row *position*, so select the row positionally\n",
    "        # and take the archive index from its `index` column.\n",
    "        # NOTE(review): presumably rows of 'cxs' are ordered by `index`\n",
    "        # (file_number - 1) - confirm.\n",
    "        grid_point = grid[int(np.argmin(distance))]\n",
    "        cross_section = cxs[int(grid_point['index'])]\n",
    "        wavenumber = (\n",
    "            np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber']\n",
    "            + grid_point['start_wavenumber']\n",
    "        )\n",
    "\n",
    "        arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "8f988550-1925-4851-9f0d-3ecd939257ae",
	"metadata": {},
	"source": [
	"# Minimal Example: MAESTRO HDF5 to Zarr"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "95a4b1a1-f682-400e-99e4-2ef3e6274327",
	"metadata": {},
	"outputs": [],
	"source": [
	"import datetime\n",
	"from tqdm.auto import tqdm\n",
	"\n",
	"import h5py\n",
	"import numpy as np\n",
	"import zarr\n",
	"\n",
	"import matplotlib.pyplot as plt\n",
	"from astropy.table import Table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read the grid table from CSV, then add an `index` column equal to\n",
	"# `file_number - 1` and register it as a table index so rows can be\n",
	"# looked up with `grid.loc[...]`.\n",
	"# NOTE(review): presumably `file_number` is one-based, making `index`\n",
	"# the zero-based position of each grid point in the archive - confirm.\n",
	"grid = Table.read('grid1460.csv')\n",
	"grid['index'] = grid['file_number'] - 1\n",
	"grid.add_index('index')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d68cac62-2d88-4054-bac6-6ace55100060",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Relative path to the HDF5 archive that this notebook converts;\n",
	"# the file name (minus `.h5`) is later reused as the molecule name.\n",
	"archive_path = '../12C-H4.h5'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Open the archive read-only. The handle is left open for the cells\n",
	"# below; nothing in this notebook closes it.\n",
	"h5_file = h5py.File(archive_path, 'r')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d54ea68a-041e-4e3e-9242-899801a88752",
	"metadata": {},
	"outputs": [],
	"source": [
	"# `[:]` reads each coordinate dataset out of the archive in full.\n",
	"# NOTE(review): presumably these hold one pressure / temperature value\n",
	"# per stored grid point, with repeats - confirm against the archive.\n",
	"press_coords = h5_file['pressure_coords'][:]\n",
	"temp_coords = h5_file['temperature_coords'][:]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "557aa663-760a-452e-8160-596e38513d7b",
	"metadata": {},
	"outputs": [],
	"source": [
	"# extract only the molecule name: keep what follows the last '/',\n",
	"# then cut at the first '.h5'\n",
	"path = archive_path.rsplit('/', 1)[-1].partition('.h5')[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "799245d8-2991-45a9-bba3-c62331bf5f6e",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Natasha's recommended uniform wavenumber sampling\n",
	"# (MAESTRO Slack, Nov 25 2025): rebuild the uniform grid described\n",
	"# by the first row of the grid table, then keep every other point.\n",
	"row = grid[0]\n",
	"start, number_wave_pts, delta_wavenumber = (\n",
	"    row[column]\n",
	"    for column in ('start_wavenumber', 'number_wave_pts', 'delta_wavenumber')\n",
	")\n",
	"new_wvno_grid = start + delta_wavenumber * np.arange(number_wave_pts)\n",
	"\n",
	"wavenumber_sampling = new_wvno_grid[0::2]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert the HDF5 archive into a compressed Zarr v3 store: write one\n",
	"# coordinate array per axis, then fill the (wavenumber, temperature,\n",
	"# pressure) cross-section cube one grid point at a time.\n",
	"compressed_path = f'{path}.zarr'\n",
	"\n",
	"# sorted, de-duplicated coordinate values for the output axes:\n",
	"temperatures = np.unique(temp_coords)\n",
	"pressures = np.unique(press_coords)\n",
	"\n",
	"# these compression settings will apply to every array:\n",
	"compression = dict(\n",
	"    compressors=zarr.codecs.BloscCodec(\n",
	"        cname=\"zstd\",\n",
	"        # here we use the maximum compression level,\n",
	"        # which takes longer to compress (one time)\n",
	"        # but not more time to decompress (many times)\n",
	"        clevel=9,\n",
	"        shuffle=zarr.codecs.BloscShuffle.shuffle\n",
	"    )\n",
	")\n",
	"codec = compression['compressors']\n",
	"\n",
	"store = zarr.storage.LocalStore(compressed_path)\n",
	"# `overwrite` is a boolean flag, not a file-mode string:\n",
	"root = zarr.create_group(store, overwrite=True, zarr_format=3)\n",
	"\n",
	"# one 1D coordinate array per axis, each named after its dimension:\n",
	"coordinate_values = dict(\n",
	"    temperature=temperatures,\n",
	"    pressure=pressures,\n",
	"    wavenumber=wavenumber_sampling,\n",
	")\n",
	"for name, values in coordinate_values.items():\n",
	"    root.create_array(\n",
	"        name,\n",
	"        data=values,\n",
	"        dimension_names=(name,),\n",
	"        attributes=dict(coordinates=name),\n",
	"        **compression\n",
	"    )\n",
	"\n",
	"dimension_names = ['wavenumber', 'temperature', 'pressure']\n",
	"arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)\n",
	"arr_attrs = dict(\n",
	"    # `coordinates` is required for remote indexing with xarray:\n",
	"    coordinates=' '.join(dimension_names),\n",
	"\n",
	"    # for MAESTRO versioning:\n",
	"    molecule=path,\n",
	"    source=dict(\n",
	"        database='MAESTRO',\n",
	"        created=str(datetime.datetime.now()),\n",
	"        version=0.1  # or pick your number\n",
	"    ),\n",
	"    compression=dict(\n",
	"        codec=type(codec).__name__,\n",
	"        algorithm=codec.cname.name,\n",
	"        clevel=codec.clevel,\n",
	"        # record the shuffle mode (here 'shuffle'); `__class__.__name__`\n",
	"        # would only give the enum's class name, 'BloscShuffle':\n",
	"        shuffle=codec.shuffle.name,\n",
	"    )\n",
	")\n",
	"\n",
	"# NOTE(review): this array is named 'csx' while the HDF5 dataset read\n",
	"# below is 'cxs' - confirm which spelling readers of the store expect.\n",
	"arr = root.create_array(\n",
	"    'csx',\n",
	"    shape=arr_shape,\n",
	"    dtype=np.float64,\n",
	"    dimension_names=dimension_names,\n",
	"    attributes=arr_attrs,\n",
	"    **compression\n",
	")\n",
	"\n",
	"# this promotes efficient remote indexing:\n",
	"zarr.consolidate_metadata(store)\n",
	"\n",
	"# loop-invariant lookups, hoisted out of the nested loop below:\n",
	"cxs = h5_file['cxs']\n",
	"grid_pressure = grid['pressure_bar']\n",
	"grid_temperature = grid['temperature_K']\n",
	"\n",
	"# loop over temperature and pressure to downsample the\n",
	"# wavenumber grid and store the results in the zarr array:\n",
	"for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n",
	"    for j, pressure in enumerate(pressures):\n",
	"        # NOTE(review): the distance mixes bar and K without any scaling -\n",
	"        # confirm that the nearest neighbour it picks is the intended one\n",
	"        # when a (temperature, pressure) pair has no exact match in the grid.\n",
	"        distance = np.hypot(grid_pressure - pressure, grid_temperature - temperature)\n",
	"\n",
	"        # `argmin` returns a row *position*, so select the row positionally\n",
	"        # and take the archive index from its `index` column.\n",
	"        # NOTE(review): presumably rows of 'cxs' are ordered by `index`\n",
	"        # (file_number - 1) - confirm.\n",
	"        grid_point = grid[int(np.argmin(distance))]\n",
	"        cross_section = cxs[int(grid_point['index'])]\n",
	"        wavenumber = (\n",
	"            np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber']\n",
	"            + grid_point['start_wavenumber']\n",
	"        )\n",
	"\n",
	"        arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found