import re
import ast
import numpy as np
from numpy.typing import ArrayLike
import uproot as ur
from typing import Any, Iterable
from concurrent.futures import ThreadPoolExecutor

class Rootable:
"""
    this class uses uproot to load PXD data from ROOT files and converts it into
    native Python data structures.
    it can load the cluster information, use the digits to generate the ADC matrices,
    compute coordinates, layers and ladders, and finally load the Monte Carlo data.
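
    typical usage (a sketch; the file path and event count are placeholders):

        pxd = Rootable()
        pxd.loadData('/path/to/file.root', events=1000, selection='random')
        pxd.getClusters()
        pxd.getMatrices()
        pxd.getCoordinates()
        pxd.getLayers()
        pxd.getMCData()
        array = pxd.getStructuredArray()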
"""
    def __init__(self, data: dict = None) -> None:
        # these are the sensor IDs of the pxd modules/panels from the root file;
        # they are used to identify on which panel a cluster event happened
self.panelIDs = np.array([ 8480, 8512, 8736, 8768, 8992, 9024, 9248, 9280,
9504, 9536, 9760, 9792, 10016, 10048, 10272, 10304,
16672, 16704, 16928, 16960, 17184, 17216, 17440, 17472,
17696, 17728, 17952, 17984, 18208, 18240, 18464, 18496,
18720, 18752, 18976, 19008, 19232, 19264, 19488, 19520])
        # every row here corresponds to one entry in the array above; this is used
        # to put the projected uv plane in the right position
self.panelShifts = np.array([[1.3985 , 0.2652658 , 3.68255],
[ 1.3985 , 0.23238491, -0.88255],
[ 0.80146531, 1.17631236, 3.68255],
[ 0.82407264, 1.15370502, -0.88255],
[-0.2582769 , 1.3985 , 3.68255],
[-0.2322286 , 1.3985 , -0.88255],
[-1.17531186, 0.80246583, 3.68255 ],
[-1.15510614, 0.82267151, -0.88255],
[-1.3985 , -0.2645974 , 3.68255],
[-1.3985 , -0.23012119, -0.88255],
[-0.80591227, -1.17186534, 3.68255],
[-0.82344228, -1.15433536, -0.88255],
[ 0.26975836, -1.3985 , 3.68255],
[ 0.23326624, -1.3985 , -0.88255],
[ 1.1746111 , -0.80316652, 3.68255],
[ 1.15205703, -0.82572062, -0.88255],
[ 2.2015 , 0.26959865, 5.01305],
[ 2.2015 , 0.2524582 , -1.21305],
[ 1.77559093, 1.32758398, 5.01305],
[ 1.78212569, 1.31626522, -1.21305],
[ 0.87798948, 2.03516717, 5.01305],
[ 0.88478563, 2.03124357, -1.21305],
[-0.26129975, 2.2015 , 5.01305],
[-0.25184137, 2.2015 , -1.21305],
[-1.32416655, 1.77756402, 5.01305],
[-1.31417539, 1.78333226, -1.21305],
[-2.03421133, 0.87964512, 5.01305],
[-2.02960691, 0.88762038, -1.21305],
[-2.2015 , -0.25954151, 5.01305],
[-2.2015 , -0.24969109, -1.21305],
[-1.77636043, -1.32625112, 5.01305],
[-1.78138268, -1.31755219, -1.21305],
[-0.87493138, -2.03693277, 5.01305 ],
[-0.8912978 , -2.02748378, -1.21305],
[ 0.26489725, -2.2015 , 5.01305],
[ 0.25364439, -2.2015 , -1.21305],
[ 1.3269198 , -1.7759744 , 5.01305],
[ 1.32258793, -1.77847528, -1.21305],
[ 2.03616649, -0.87625871, 5.01305],
[ 2.02936825, -0.8880338 , -1.21305]])
        # every entry here corresponds to one entry in the array above; these angles
        # are used for rotating the projected uv plane
self.panelRotations = np.array([ 90, 90, 135, 135, 180, 180, 225, 225, 270, 270, 315, 315, 360,
360, 405, 405, 90, 90, 120, 120, 150, 150, 180, 180, 210, 210,
240, 240, 270, 270, 300, 300, 330, 330, 360, 360, 390, 390, 420,
420])
# the layer and ladder arrays, for finding them from sensor id

self.panelLayer = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
self.panelLadder = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21])
        # all transformations are stored in a dict, with the sensor id as the key
self.transformation = {}
self.layersLadders = {}
for i in range(len(self.panelIDs)):
self.transformation[str(self.panelIDs[i])] = [self.panelShifts[i], self.panelRotations[i]]
self.layersLadders[str(self.panelIDs[i])] = [self.panelLayer[i], self.panelLadder[i]]
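        # e.g. self.transformation['8480'] -> [array([1.3985, 0.2652658, 3.68255]), 90]
        # and  self.layersLadders['8480'] -> [1, 1], i.e. (layer, ladder)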
# these are the branch names for cluster info in the root file
self.gotClusters = False
self.clusters = ['PXDClusters/PXDClusters.m_clsCharge',
'PXDClusters/PXDClusters.m_seedCharge',
'PXDClusters/PXDClusters.m_clsSize',
'PXDClusters/PXDClusters.m_uSize',
'PXDClusters/PXDClusters.m_vSize',
'PXDClusters/PXDClusters.m_uPosition',
'PXDClusters/PXDClusters.m_vPosition',
'PXDClusters/PXDClusters.m_sensorID']
# these are the branch names for cluster digits in the root file
self.digits = ['PXDDigits/PXDDigits.m_uCellID',
'PXDDigits/PXDDigits.m_vCellID',
'PXDDigits/PXDDigits.m_charge']
        # this establishes the relationship between clusters and digits,
        # because for some reason the digits branch has a different
        # size than the cluster branch
self.clusterToDigis = 'PXDClustersToPXDDigits/m_elements/m_elements.m_to'
# these are the branch names for monte carlo data in the root file
self.mcData = ['MCParticles/MCParticles.m_pdg',
'MCParticles/MCParticles.m_momentum_x',
'MCParticles/MCParticles.m_momentum_y',
'MCParticles/MCParticles.m_momentum_z']

# indices for events to be imported
self.eventIndices = None
        # these two establish the relationship to and from clusters and monte carlo;
        # there are more entries than in the cluster data, yet MC data is still
        # missing for some clusters
self.clusterToMC = 'PXDClustersToMCParticles/m_elements/m_elements.m_to'
self.mcToCluster = 'PXDClustersToMCParticles/m_elements/m_elements.m_from'
# this dict stores the data
self.data = data if data is not None else {}
# list of pxd panels
self.pxdPanels = [[[-0.89 , 0.36 , 0.36 , -0.89 , -0.89 ], [ 1.4 , 1.4 , 1.4 , 1.4 , 1.4 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 00
[[ 1.25 , 0.365, 0.365, 1.25 , 1.25 ], [ 0.72 , 1.615, 1.615, 0.72 , 0.72 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 01
[[ 1.4 , 1.4 , 1.4 , 1.4 , 1.4 ], [-0.36 , 0.89 , 0.89 , -0.36 , -0.36 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 02
[[ 0.72 , 1.615, 1.615, 0.72 , 0.72 ], [-1.25 , -0.365, -0.365, -1.25 , -1.25 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 03
[[ 0.89 , -0.36 , -0.36 , 0.89 , 0.89 ], [-1.4 , -1.4 , -1.4 , -1.4 , -1.4 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 04
[[-1.25 , -0.365, -0.365, -1.25 , -1.25 ], [-0.72 , -1.615, -1.615, -0.72 , -0.72 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 05
[[-1.4 , -1.4 , -1.4 , -1.4 , -1.4 ], [ 0.36 , -0.89 , -0.89 , 0.36 , 0.36 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 06
[[-0.72 , -1.615, -1.615, -0.72 , -0.72 ], [ 1.25 , 0.365, 0.365, 1.25 , 1.25 ], [-3.12, -3.12, 5.92, 5.92, -3.12]], # 07
[[-0.89 , 0.36 , 0.36 , -0.89 , -0.89 ], [ 2.2 , 2.2 , 2.2 , 2.2 , 2.2 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 08
[[ 0.345, 1.4 , 1.4 , 0.345, 0.345], [ 2.35 , 1.725, 1.725, 2.35 , 2.35 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 09
[[ 1.48 , 2.1 , 2.1 , 1.48 , 1.48 ], [ 1.85 , 0.78 , 0.78 , 1.85 , 1.85 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 10
[[ 2.2 , 2.2 , 2.2 , 2.2 , 2.2 ], [ 0.89 , -0.36 , -0.36 , 0.89 , 0.89 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 11
[[ 2.35 , 1.725, 1.725, 2.35 , 2.35 ], [-0.345, -1.4 , -1.4 , -0.345, -0.345], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 12
[[ 1.85 , 0.78 , 0.78 , 1.85 , 1.85 ], [-1.48 , -2.1 , -2.1 , -1.48 , -1.48 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 13
[[ 0.89 , -0.36 , -0.36 , 0.89 , 0.89 ], [-2.2 , -2.2 , -2.2 , -2.2 , -2.2 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 14
[[-0.345, -1.4 , -1.4 , -0.345, -0.345], [-2.35 , -1.725, -1.725, -2.35 , -2.35 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 15
[[-1.48 , -2.1 , -2.1 , -1.48 , -1.48 ], [-1.85 , -0.78 , -0.78 , -1.85 , -1.85 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 16
[[-2.2 , -2.2 , -2.2 , -2.2 , -2.2 ], [-0.89 , 0.36 , 0.36 , -0.89 , -0.89 ], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 17
[[-2.35 , -1.725, -1.725, -2.35 , -2.35 ], [ 0.345, 1.4 , 1.4 , 0.345, 0.345], [-4.28, -4.28, 8.08, 8.08, -4.28]], # 18
[[-1.85 , -0.78 , -0.78 , -1.85 , -1.85 ], [ 1.48 , 2.1 , 2.1 , 1.48 , 1.48 ], [-4.28, -4.28, 8.08, 8.08, -4.28]]] # 19

# parameter for checking if coordinates have been loaded
self.gotCoordinates = False
def __getitem__(self, index: str | int | ArrayLike) -> np.ndarray | dict:
"""
        this makes the class subscriptable: one can retrieve a column by using a
        string keyword, or get a row by using integer indices or arrays
"""
if isinstance(index, str):
return self.data[index]
return {key: value[index] for key, value in self.data.items()}
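
    # usage sketch (rootable is a hypothetical, populated instance):
    #     charges = rootable['clsCharge']   # one column as a numpy array
    #     firstRow = rootable[0]            # one row as a dict of scalars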

def __setitem__(self, index: str | int | ArrayLike, value: dict | Any) -> None:
"""
Allows setting the value of a column by using strings as keywords,
setting the value of a row by using integer indices or arrays,
or setting a specific value using a tuple of key and index.
:param index: The column name, row index, or tuple of key and index.
:param value: The value to set.
"""
if isinstance(index, str):
assert len(value) == len(self.data[list(self.data.keys())[0]]), 'value should have same length as data'
self.data[index] = value
elif isinstance(index, tuple) and len(index) == 2 and isinstance(index[0], str) and isinstance(index[1], int):
key, idx = index
assert key in self.data, f"key {key} not found in data"
self.data[key][idx] = value
else:
assert isinstance(value, dict), "value must be a dictionary when setting rows"
assert set(value.keys()) == set(self.data.keys()), "keys of value must match keys of data"
for key in self.data:
self.data[key][index] = value[key]
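
    # usage sketch (rootable is a hypothetical, populated instance):
    #     rootable['flag'] = np.zeros(len(rootable['clsCharge']))  # whole column
    #     rootable['flag', 3] = 1                                  # single cell via (key, index)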
    def where(self, *conditions: str) -> 'Rootable':
        """
        Filters the data based on the provided conditions.
        :param conditions: Conditions as strings for filtering. Each condition
            consists of a key (the name of a data field), an operator
            (==, <, >, <=, >= or 'in') and a value.
        :return: Instance of the class containing the filtered data.
        """
        mask = np.ones(len(next(iter(self.data.values()))), dtype=bool)  # initial mask allowing all elements
        # applying the conditions to create the mask
        for condition in conditions:
            match = re.match(r'(\w+)\s*([<>=]=?| in )\s*(.+)', condition)
            if match is None:
                raise ValueError(f"Invalid condition: {condition}")
            key, op, value = match.groups()
            op = op.strip()  # remove any leading and trailing spaces
            if op == 'in':
                value = ast.literal_eval(value)  # safer than eval for literal lists
                mask &= np.isin(self.data[key], value)
            else:
                comparisonValue = float(value)
                fieldValues = self.data[key].astype(float)
                # determine the correct comparison to apply
                operation = {
                    '==': np.equal,
                    '<': np.less,
                    '>': np.greater,
                    '<=': np.less_equal,
                    '>=': np.greater_equal,
                }.get(op)
                if operation is None:
                    raise ValueError(f"Invalid operator {op}")
                mask &= operation(fieldValues, comparisonValue)
        # applying the mask to filter the data
        filteredData = {key: values[mask] for key, values in self.data.items()}
        return self.__class__(data=filteredData)
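
    # usage sketch (field names assume getClusters, getLayers and getMCData ran):
    #     subset = rootable.where('clsCharge > 30', 'layer == 1', 'pdg in [11, -11]')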
def __repr__(self) -> str:
return str(self.data)
def __iter__(self) -> Iterable:
keys = list(self.data.keys())
numRows = len(self.data[keys[0]])
for i in range(numRows):
yield {key: self.data[key][i] for key in keys}
def keys(self) -> list:
return list(self.data.keys())
    def items(self) -> Iterable:
        return self.data.items()
    def values(self) -> Iterable:
        return self.data.values()
def get(self, key: str) -> np.ndarray:
return self.data.get(key)
    def pop(self, key: str) -> np.ndarray:
        return self.data.pop(key)
def stack(self, *columns, toKey: str, pop: bool = True) -> None:
"""
Stacks specified columns into a single column and stores it under a new key.
:param columns: The columns to stack.
:param toKey: The new key where the stacked column will be stored.
:param pop: Whether to remove the original columns.
"""
# Check that all specified columns exist
for column in columns:
if column not in self.data:
raise KeyError(f"Column '{column}' does not exist.")
# Column stack the specified columns
stackedColumn = np.column_stack([self.data[col] for col in columns])
# Flatten if it's 1D for consistency
if stackedColumn.shape[1] == 1:
stackedColumn = stackedColumn.flatten()
# Store it under the new key
self.data[toKey] = stackedColumn
# Remove the original columns if pop is True
if pop:
for column in columns:
self.data.pop(column)
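
    # usage sketch: merge the momentum columns (from getMCData) into one (n, 3) column
    #     rootable.stack('momentumX', 'momentumY', 'momentumZ', toKey='momentum')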

    def loadData(self, file: str, events: int = None, selection: str = None) -> None:
        """
Reads the file off of the hard drive; it automatically creates event numbers.
        file: str = the whole file path, including the .root ending

events: int = the number of events to import (None for all)
selection: str = method of event selection ('random' for random selection)
"""
self.eventTree = ur.open(f'{file}:tree')

        branch = 'PXDClusters/PXDClusters.m_clsCharge'
        allClusters = self.eventTree.arrays(branch, library='np')[branch]
        numEvents = len(allClusters)
        if events is not None:
            if selection == 'random':
                self.eventIndices = np.random.permutation(numEvents)[:events]
            else:
                self.eventIndices = np.arange(min(events, numEvents))
            clusters = allClusters[self.eventIndices]
        else:
            clusters = allClusters
self._getEventNumbers(clusters)
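
    # usage sketch (the path is a placeholder): import 500 randomly chosen events
    #     rootable.loadData('run42.root', events=500, selection='random')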
def _getEventNumbers(self, clusters: np.ndarray, offset: int = 0) -> None:
eventNumbers = []
for i in range(len(clusters)):

eventNumbers.append(np.array([i]*len(clusters[i])) + offset)
self.data['eventNumber'] = self._flatten(eventNumbers)
def _getData(self, keyword: str, library: str = 'np') -> np.ndarray:
"""
a private method for converting branches into something useful, namely
        into numpy arrays, if the keyword `library` is set to 'np'.
        keyword: str = the full branch name
        library: str = can be 'np' (numpy), 'pd' (pandas) or 'ak' (awkward)
        see the uproot documentation for more info
"""
        try:
            if self.eventIndices is not None:
                data = self.eventTree.arrays(keyword, library=library)[keyword][self.eventIndices]
            else:
                data = self.eventTree.arrays(keyword, library=library)[keyword]
            return self._flatten(data)
        except Exception as err:
            raise KeyError(f"could not load branch '{keyword}'") from err
def _flatten(self, structure: ArrayLike, maxDepth: int = None, currentDepth: int = 0) -> np.ndarray:
"""
        this is a private function that gets called while loading branches;
        it flattens ragged arrays, and one can set the depth to which to flatten
        structure: the list/array to flatten
        maxDepth: int = the amount of flattening (None flattens completely)
        currentDepth: int = don't touch this, it's used for the recursive calls
"""
flat_list = []
for element in structure:
if isinstance(element, (list, np.ndarray)) and (maxDepth is None or currentDepth < maxDepth):
flat_list.extend(self._flatten(element, maxDepth, currentDepth + 1))
else:
flat_list.append(element)
return np.array(flat_list)
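
    # behaviour sketch: ragged structures collapse into one flat array, e.g.
    #     self._flatten([[1, 2], [3, [4]]]) -> array([1, 2, 3, 4])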
def getClusters(self) -> None:
"""
this uses the array from __init__ to load different branches into the data dict
"""
self.gotClusters = True
for branch in self.clusters:
data = self._getData(branch)
keyword = branch.split('_')[-1]
self.data[keyword] = data
def getMatrices(self, matrixSize: tuple = (9, 9)) -> None:
"""
loads the digit branches into arrays and converts them into adc matrices
"""

if self.eventIndices is not None:
uCellIDs = self.eventTree.arrays(self.digits[0], library='np')[self.digits[0]][self.eventIndices]
vCellIDs = self.eventTree.arrays(self.digits[1], library='np')[self.digits[1]][self.eventIndices]
cellCharges = self.eventTree.arrays(self.digits[2], library='np')[self.digits[2]][self.eventIndices]
else:
uCellIDs = self.eventTree.arrays(self.digits[0], library='np')[self.digits[0]]
vCellIDs = self.eventTree.arrays(self.digits[1], library='np')[self.digits[1]]
cellCharges = self.eventTree.arrays(self.digits[2], library='np')[self.digits[2]]
        # this establishes the relation between digits and clusters; it is still
        # surprising that this is necessary. if digits were stored in the same
        # way as clusters, one wouldn't need to jump through hoops just to get
        # the data into a usable and sensible shape

if self.eventIndices is not None:
clusterDigits = self.eventTree.arrays(self.clusterToDigis, library='np')[self.clusterToDigis][self.eventIndices]
else:
clusterDigits = self.eventTree.arrays(self.clusterToDigis, library='np')[self.clusterToDigis]
        indexChunks = np.array_split(range(len(cellCharges)), 4)
        with ThreadPoolExecutor(max_workers=None) as executor:
            futures = [executor.submit(self._getMatrices, chunk, uCellIDs, vCellIDs, cellCharges, clusterDigits, matrixSize) for chunk in indexChunks]
results = [future.result() for future in futures]
# Combine the results from all chunks
self.data['cluster'] = np.concatenate(results).astype('int')

@staticmethod
def _getMatrices(indexChunks: ArrayLike, uCellIDs: ArrayLike, vCellIDs: ArrayLike, cellCharges: ArrayLike, clusterDigits: ArrayLike, matrixSize: tuple = (9, 9)) -> np.ndarray:
"""
this takes the ragged/jagged digit arrays and converts them into 9x9 matrices
it's a rather slow process because of all the looping
"""
plotRange = np.array(matrixSize) // 2
        events = []
        for event in indexChunks:
            digitsU, digitsV, digitsCharge = np.array(uCellIDs[event]), np.array(vCellIDs[event]), np.array(cellCharges[event])
            digitIndices = clusterDigits[event]  # the digit indices belonging to each cluster in this event
            adcValues = []
            for indices in digitIndices:
                cacheImg = np.zeros(matrixSize)
                maxChargeIndex = digitsCharge[indices].argmax()
                uMax, vMax = digitsU[indices[maxChargeIndex]], digitsV[indices[maxChargeIndex]]
                uPos, vPos = digitsU[indices] - uMax + plotRange[0], digitsV[indices] - vMax + plotRange[1]
                validIndices = (uPos >= 0) & (uPos < matrixSize[0]) & (vPos >= 0) & (vPos < matrixSize[1])
                cacheImg[uPos[validIndices].astype(int), vPos[validIndices].astype(int)] = digitsCharge[indices][validIndices]
                adcValues.append(cacheImg)
            events.extend(adcValues)
        return np.array(events, dtype=object)
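
    # layout sketch: each matrix is matrixSize (default 9x9) and centred on the
    # digit with the highest charge; digits outside the window are dropped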
def getCoordinates(self) -> None:
"""
converting the uv coordinates, together with sensor ids, into xyz coordinates
"""

# checking if cluster parameters have been loaded
if self.gotClusters is False:
self.getClusters()

# setting a bool for checking if coordinates were calculated
self.gotCoordinates = True
        indexChunks = np.array_split(range(len(self.data['sensorID'])), 4)
        with ThreadPoolExecutor(max_workers=None) as executor:
            futures = [executor.submit(self._getCoordinates, self.data['uPosition'][chunk], self.data['vPosition'][chunk], self.data['sensorID'][chunk]) for chunk in indexChunks]
xResults, yResults, zResults = [], [], []
for future in futures:
x, y, z = future.result()
xResults.append(x)
yResults.append(y)
zResults.append(z)
self.data['xPosition'] = np.concatenate(xResults)
self.data['yPosition'] = np.concatenate(yResults)
self.data['zPosition'] = np.concatenate(zResults)
    def _getCoordinates(self, uPositions: ArrayLike, vPositions: ArrayLike, sensorIDs: ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
a private method for transposing/converting 2d uv coords into 3d xyz coordinates
"""
length = len(sensorIDs)
xArr, yArr, zArr = np.zeros(length), np.zeros(length), np.zeros(length)
        # iterating over the cluster arrays
for index, (u, v, sensor_id) in enumerate(zip(uPositions, vPositions, sensorIDs)):
# grabbing the shift vector and rotation angle
shift, angle = self.transformation[str(sensor_id)]
# setting up rotation matrix
theta = np.deg2rad(angle)
rotMatrix = np.array([[np.cos(theta), -np.sin(theta), 0], [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
# projecting uv coordinates into 3d space
point = np.array([u, 0, v])
# shifting and rotating the projected vector
shifted = rotMatrix.dot(point) + shift
xArr[index], yArr[index], zArr[index] = shifted
return xArr, yArr, zArr
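
    # math sketch: a cluster at (u, v) on its sensor is embedded as (u, 0, v),
    # rotated about the z axis by the panel angle theta and shifted to the panel:
    #     xyz = Rz(theta) @ (u, 0, v) + shift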

def getSphericals(self) -> None:
"""
Calculate spherical coordinates for each cluster.
"""
        # Checking if coordinates have been calculated
        if self.gotCoordinates is False:
            self.getCoordinates()
xSquare = np.square(self.data['xPosition'])
ySquare = np.square(self.data['yPosition'])
zSquare = np.square(self.data['zPosition'])
# Avoid division by zero by replacing zeros with a small number
r = np.sqrt(xSquare + ySquare + zSquare)
rSafe = np.where(r == 0, 1e-10, r)
theta = np.arccos(self.data['zPosition'] / rSafe)
phi = np.arctan2(self.data['yPosition'], self.data['xPosition'])
self.data['rPosition'] = r
self.data['thetaPosition'] = theta
self.data['phiPosition'] = phi
def getLayers(self) -> None:
"""
looks up the corresponding layers and ladders for every cluster
"""
if self.gotClusters is False:
self.getClusters()
layers, ladders = [], []
for id in self.data['sensorID']:
layer, ladder = self.layersLadders[str(id)]
layers.append(layer)
ladders.append(ladder)

self.data['layer'] = np.array(layers)
self.data['ladder'] = np.array(ladders)
def getMCData(self) -> None:
"""
this loads the monte carlo from the root file
"""
        # the monte carlo data; these arrays are longer than the cluster data

if self.eventIndices is not None:
pdg = self.eventTree.arrays(self.mcData[0], library='np')[self.mcData[0]][self.eventIndices]
momentumX = self.eventTree.arrays(self.mcData[1], library='np')[self.mcData[1]][self.eventIndices]
momentumY = self.eventTree.arrays(self.mcData[2], library='np')[self.mcData[2]][self.eventIndices]
momentumZ = self.eventTree.arrays(self.mcData[3], library='np')[self.mcData[3]][self.eventIndices]
else:
pdg = self.eventTree.arrays(self.mcData[0], library='np')[self.mcData[0]]
momentumX = self.eventTree.arrays(self.mcData[1], library='np')[self.mcData[1]]
momentumY = self.eventTree.arrays(self.mcData[2], library='np')[self.mcData[2]]
momentumZ = self.eventTree.arrays(self.mcData[3], library='np')[self.mcData[3]]
        # this loads the relationships to and from clusters and MC data;
        # the indirection is the same as with the cluster digits

if self.eventIndices is not None:
clusterToMC = self.eventTree.arrays(self.clusterToMC, library='np')[self.clusterToMC][self.eventIndices]
mcToCluster = self.eventTree.arrays(self.mcToCluster, library='np')[self.mcToCluster][self.eventIndices]
else:
clusterToMC = self.eventTree.arrays(self.clusterToMC, library='np')[self.clusterToMC]
mcToCluster = self.eventTree.arrays(self.mcToCluster, library='np')[self.mcToCluster]
        # the cluster charge is needed as a jagged/ragged array; maybe one could
        # simply use the event numbers instead, but this works with the file as is

if self.eventIndices is not None:
clsCharge = self.eventTree.arrays('PXDClusters/PXDClusters.m_clsCharge', library='np')['PXDClusters/PXDClusters.m_clsCharge'][self.eventIndices]
else:
clsCharge = self.eventTree.arrays('PXDClusters/PXDClusters.m_clsCharge', library='np')['PXDClusters/PXDClusters.m_clsCharge']
# reorganizing MC data
momentumXList = []
momentumYList = []
momentumZList = []
pdgList = []
clusterNumbersList = []
for i in range(len(clusterToMC)):
            # _fillMCList fills in the missing spots, because there is no MC data
            # for every cluster, even though this branch has more entries than
            # the cluster branch
fullClusterReferences = self._fillMCList(mcToCluster[i], clusterToMC[i], len(clsCharge[i]))
clusterNumbersList.append(fullClusterReferences)
pdgs, xmom, ymom, zmom = self._getMCData(fullClusterReferences, pdg[i], momentumX[i], momentumY[i], momentumZ[i])
momentumXList.append(xmom)
momentumYList.append(ymom)
momentumZList.append(zmom)
pdgList.append(pdgs)
self.data['momentumX'] = self._flatten(momentumXList)
self.data['momentumY'] = self._flatten(momentumYList)
self.data['momentumZ'] = self._flatten(momentumZList)
self.data['pdg'] = self._flatten(pdgList)
self.data['clsNumber'] = self._flatten(clusterNumbersList)

@staticmethod
def _findMissing(lst: list, length: int) -> list:
"""
a private method for finding missing elements in mc data arrays
"""
return sorted(set(range(0, length)) - set(lst))
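
    # behaviour sketch: indices 1 and 3 are absent from the reference list
    #     Rootable._findMissing([0, 2], 4) -> [1, 3]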
    def _fillMCList(self, fromClusters: ArrayLike, toClusters: ArrayLike, length: int) -> list:
"""
a private method for filling MC data arrays where clusters don't have
any information
"""
missingIndex = self._findMissing(fromClusters, length)
testList = [-1] * length
fillIndex = 0
for i in range(len(testList)):
if i in missingIndex:
testList[i] = -1
else:
try:
testList[i] = int(toClusters[fillIndex])
except TypeError:
testList[i] = int(toClusters[fillIndex][0])
fillIndex += 1
return testList
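
    # behaviour sketch: clusters without MC information get the placeholder -1,
    # the others receive their MC indices in order
    #     self._fillMCList(fromClusters=[0, 2], toClusters=[5, 7], length=4) -> [5, -1, 7, -1]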

@staticmethod
    def _getMCData(toClusters: ArrayLike, pdgs: ArrayLike, xMom: ArrayLike, yMom: ArrayLike, zMom: ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""
        after filling and reorganizing the MC data arrays one can finally collect
        the actual MC data; where data is missing, the entries are filled with zeros
"""
pxList, pyList, pzList = [], [], []
pdgList = []
for references in toClusters:
if references == -1:
pxList.append(0)
pyList.append(0)
pzList.append(0)
pdgList.append(0)
else:
pxList.append(xMom[references])
pyList.append(yMom[references])
pzList.append(zMom[references])
pdgList.append(pdgs[references])
        return np.array(pdgList, dtype=list), np.array(pxList, dtype=list), np.array(pyList, dtype=list), np.array(pzList, dtype=list)
def getStructuredArray(self) -> np.ndarray:
"""
this converts the data dict of this class into a structured numpy array
"""
# Create a list to hold the dtype specifications
dtype = []
# Iterate through the dictionary keys and values
for key, value in self.data.items():
# Determine the data type of the first value in the list
sampleValue = value[0]
if isinstance(sampleValue, np.ndarray):
# If the value is an array, use its shape and dtype
fieldDtype = (sampleValue.dtype, sampleValue.shape)
else:
# Otherwise, use the type of the value itself
fieldDtype = type(sampleValue)
# Append the key and data type to the dtype list
dtype.append((key, fieldDtype))
# Convert the dictionary to a list of tuples
keys = list(self.data.keys())
dataList = [tuple(self.data[key][i] for key in keys) for i in range(len(self.data[keys[0]]))]
# Create the structured array
structuredArray = np.array(dataList, dtype=dtype)
return structuredArray
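
    # usage sketch: fields of the structured array are addressed by column name
    #     arr = rootable.getStructuredArray()
    #     firstCharges = arr['clsCharge'][:10]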