Skip to content

Instantly share code, notes, and snippets.

@hqucms
Created December 21, 2018 17:06
Show Gist options
  • Select an option

  • Save hqucms/f381d1121ca3a6220c6270c2e7c59a81 to your computer and use it in GitHub Desktop.

Select an option

Save hqucms/f381d1121ca3a6220c6270c2e7c59a81 to your computer and use it in GitHub Desktop.
Convert top tagging dataset
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
import logging
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')
import pandas as pd
import numpy as np
import numexpr as ne
import math
import tables
filters = tables.Filters(complevel=7, complib='blosc')
# In[2]:
def _write_carray(a, h5file, name, group_path='/', **kwargs):
h5file.create_carray(group_path, name, obj=a, filters=filters, createparents=True, **kwargs)
# In[3]:
def _transform(dataframe, max_particles=100, start=0, stop=-1):
from collections import OrderedDict
v = OrderedDict()
df = dataframe.iloc[start:stop]
def _col_list(prefix):
return ['%s_%d'%(prefix,i) for i in range(max_particles)]
E = df[_col_list('E')].as_matrix()
PX = df[_col_list('PX')].as_matrix()
PY = df[_col_list('PY')].as_matrix()
PZ = df[_col_list('PZ')].as_matrix()
# -> PT, eta, phi
PT = np.sqrt(PX**2 + PY**2)
Eta = 0.5*np.log((E+PZ)/(E-PZ))
Phi = np.arctan2(PY, PX)
# -> Jet
Jet_E = np.sum(E, axis=1)
Jet_PX = np.sum(PX, axis=1)
Jet_PY = np.sum(PY, axis=1)
Jet_PZ = np.sum(PZ, axis=1)
_jet_PT2 = Jet_PX**2 + Jet_PY**2
Jet_PT = np.sqrt(_jet_PT2)
_jet_P2 = _jet_PT2 + Jet_PZ**2
_jet_cosTheta = Jet_PZ/np.sqrt(_jet_P2)
Jet_Eta = -0.5*np.log( (1.0-_jet_cosTheta)/(1.0+_jet_cosTheta) )
Jet_Phi = np.arctan2(Jet_PY, Jet_PX)
Jet_M = np.sqrt(Jet_E**2 - _jet_P2)
# transformed
_label = df['is_signal_new'].as_matrix()
v['label'] = np.stack((_label, 1-_label), axis=-1)
v['train_val_test'] = df['ttv'].as_matrix()
v['jet_px'] = Jet_PX
v['jet_py'] = Jet_PY
v['jet_pz'] = Jet_PZ
v['jet_energy'] = Jet_E
v['jet_pt'] = Jet_PT
v['jet_eta'] = Jet_Eta
v['jet_phi'] = Jet_Phi
v['jet_mass'] = Jet_M
v['part_px'] = PX
v['part_py'] = PY
v['part_pz'] = PZ
v['part_energy'] = E
v['part_pt'] = PT
v['part_eta'] = Eta
v['part_phi'] = Phi
v['part_pt_log'] = np.log(PT)
v['part_ptrel'] = PT/Jet_PT[:,None]
v['part_ptrel_log'] = np.log(v['part_ptrel'])
v['part_energy_log'] = np.log(E)
v['part_erel'] = E/Jet_E[:,None]
v['part_erel_log'] = np.log(v['part_erel'])
_jet_etasign = np.sign(Jet_Eta)
_jet_etasign[_jet_etasign==0] = 1
v['part_etarel'] = (Eta - Jet_Eta[:,None]) * _jet_etasign[:,None]
v['part_etarel_noreflect'] = (Eta - Jet_Eta[:,None])
_dphi = Phi - Jet_Phi[:,None]
_pos = (np.abs(_dphi)> np.pi)
_n = np.round(_dphi/(2*np.pi))
_dphi[_pos] -= _n[_pos]*(2*np.pi)
v['part_phirel'] = _dphi
v['part_deltaR'] = np.sqrt(v['part_etarel']**2 + v['part_phirel']**2)
# fix nan/inf
for k in v:
if k.startswith('part_'):
v[k][E==0]=0
def _make_image(var_img, rec, n_pixels = 64, img_ranges = [[-0.8, 0.8], [-0.8, 0.8]]):
wgt = rec[var_img]
x = rec['part_etarel']
y = rec['part_phirel']
img = np.zeros(shape=(len(wgt), n_pixels, n_pixels))
for i in range(len(wgt)):
hist2d, xedges, yedges = np.histogram2d(x[i], y[i], bins=[n_pixels, n_pixels], range=img_ranges, weights=wgt[i])
img[i] = hist2d
return img
v['img_pt'] = _make_image('part_ptrel', v)
v['img_energy'] = _make_image('part_erel', v)
return v
# In[4]:
def convert(source, destdir, basename, step=50000, limit=None):
df = pd.read_hdf(source, key='table')
logging.info('Total events:', str(df.shape[0]))
idx=-1
while True:
idx+=1
start=idx*step
if start>=df.shape[0]: break
if limit is not None and start>=limit: break
if not os.path.exists(destdir):
os.makedirs(destdir)
output = os.path.join(destdir, '%s_%d.h5'%(basename, idx))
logging.info(output)
if os.path.exists(output):
logging.warning('... file already exist: continue ...')
continue
with tables.open_file(output, mode='w') as h5file:
v=_transform(df, start=start, stop=start+step)
for k in v.keys():
if k=='label':
_write_carray(v[k], h5file, name=k, title='isTop,isQCD')
else:
_write_carray(v[k], h5file, name=k)
# In[ ]:
# In[5]:
# conver training file
convert('/data/hqu/ntuples/GMT/v0_2018_03_27/orig/train.h5', destdir='/data/hqu/ntuples/GMT/v0_2018_03_27/converted', basename='train_file')
# In[6]:
# conver validation file
convert('/data/hqu/ntuples/GMT/v0_2018_03_27/orig/val.h5', destdir='/data/hqu/ntuples/GMT/v0_2018_03_27/converted', basename='val_file')
# In[8]:
# conver testing file
convert('/data/hqu/ntuples/GMT/v0_2018_03_27/orig/test.h5', destdir='/data/hqu/ntuples/GMT/v0_2018_03_27/converted/test', basename='test_file')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment