wrote comments/doc strings

e4d3420e · johannes bilk · 0d40ffb1 · e4d3420e · e4d3420e · e4d3420e
Commit e4d3420e authored 1 year ago by johannes bilk
--- a/README.md
+++ b/README.md
@@ -9,4 +9,78 @@ native Python formats.
 It's using [Numpy](https://numpy.org) and a library called [Uproot](https://github.com/scikit-hep/uproot5)
 to read and process these damn root files. So far it is specialist for one task
 and I will have to work on it to make it actually viable for more use cases. That task
-is to extract PXD data from Belle 2 data files.
\ No newline at end of file
+is to extract PXD data from Belle 2 data files.
+
+
+## How to use this?
+
+This is a single class that needs to be instantiated; its constructor doesn't take any arguments.
+Just import it like this:
+
+> from rootable import Rootable
+
+Then you can create an instance:
+
+> loadFromRoot = Rootable()
+
+and load the root file and all the data:
+
+> loadFromRoot.loadData('/root-files/slow_pions_2.root')
+> loadFromRoot.getClusters()
+> loadFromRoot.genCoordisnate()
+> loadFromRoot.getLayers()
+> loadFromRoot.getMatrices()
+> loadFromRoot.getMCData()
+
+These commands don't return anything; they work in-place instead.
+All data is then stored inside the object as a dict:
+
+> loadFromRoot.data
+
+Here follows a list of keywords contained in the dict:
+
+- cluster data:
+    - 'eventNumbers'
+    - 'clsCharge'
+    - 'seedCharge'
+    - 'clsSize'
+    - 'uSize'
+    - 'vSize'
+    - 'uPosition'
+    - 'vPosition'
+    - 'sensorID'
+- coordinates:
+    - 'xPosition'
+    - 'yPosition'
+    - 'zPosition'
+- layers:
+    - 'layers'
+    - 'ladder'
+- matrices:
+    - 'cluster'
+- Monte Carlo data:
+    - 'momentumX'
+    - 'momentumY'
+    - 'momentumZ'
+    - 'pdg'
+    - 'clsNumber'
+
+Since the class is subscriptable, one can access every column directly using the keywords
+like this:
+
+> loadFromRoot['clsCharge']
+
+And finally you can convert the dict into a structured Numpy array by simply writing:
+
+> loadFromRoot.getStructuredArray()
+
+This last command returns a Numpy array. From there the user can save it using
+Numpy's built-in functions, convert it to Pandas or use it in any way that is
+compatible with Numpy.
+
+
+## Installation
+
+Download the repo, navigate in the terminal to the folder and run the following command:
+
+> python3 setup.py install
\ No newline at end of file
--- a/rootable/__init__.py
+++ b/rootable/__init__.py
--- a/rootable/rootable.py
+++ b/rootable/rootable.py
+import numpy as np
+from numpy.typing import ArrayLike
+import uproot as ur
+
+
+class Rootable:
+    """
+    this class uses uproot to load pxd data from root files and converts them into
+    native python data structures.
+    it can load the cluster information, uses the digits to generate the adc matrices,
+    coordinates, layer and ladders and finally also monte carlo data.
+    """
+    def __init__(self) -> None:
+        # these are the sensor IDs of the pxd modules/panels from the root file, they are
+        # use to identify on which panels a cluster event happened
+        self.panelIDs = np.array([ 8480,  8512,  8736,  8768,  8992,  9024,  9248,  9280,
+                              9504,  9536,  9760,  9792, 10016, 10048, 10272, 10304,
+                             16672, 16704, 16928, 16960, 17184, 17216, 17440, 17472,
+                             17696, 17728, 17952, 17984, 18208, 18240, 18464, 18496,
+                             18720, 18752, 18976, 19008, 19232, 19264, 19488, 19520])
+
+        # every line in this corresponds to one entry in the array above, this is used
+        # to put the projected uv plane in the right position
+        self.panelShifts = np.array([[1.3985    ,  0.2652658 ,  3.68255],
+                               [ 1.3985    ,  0.23238491, -0.88255],
+                               [ 0.80146531,  1.17631236,  3.68255],
+                               [ 0.82407264,  1.15370502, -0.88255],
+                               [-0.2582769 ,  1.3985    ,  3.68255],
+                               [-0.2322286 ,  1.3985    , -0.88255],
+                               [-1.17531186,  0.80246583, 3.68255 ],
+                               [-1.15510614,  0.82267151, -0.88255],
+                               [-1.3985    , -0.2645974 ,  3.68255],
+                               [-1.3985    , -0.23012119, -0.88255],
+                               [-0.80591227, -1.17186534,  3.68255],
+                               [-0.82344228, -1.15433536, -0.88255],
+                               [ 0.26975836, -1.3985    ,  3.68255],
+                               [ 0.23326624, -1.3985    , -0.88255],
+                               [ 1.1746111 , -0.80316652,  3.68255],
+                               [ 1.15205703, -0.82572062, -0.88255],
+                               [ 2.2015    ,  0.26959865,  5.01305],
+                               [ 2.2015    ,  0.2524582 , -1.21305],
+                               [ 1.77559093,  1.32758398,  5.01305],
+                               [ 1.78212569,  1.31626522, -1.21305],
+                               [ 0.87798948,  2.03516717,  5.01305],
+                               [ 0.88478563,  2.03124357, -1.21305],
+                               [-0.26129975,  2.2015    ,  5.01305],
+                               [-0.25184137,  2.2015    , -1.21305],
+                               [-1.32416655,  1.77756402,  5.01305],
+                               [-1.31417539,  1.78333226, -1.21305],
+                               [-2.03421133,  0.87964512,  5.01305],
+                               [-2.02960691,  0.88762038, -1.21305],
+                               [-2.2015    , -0.25954151,  5.01305],
+                               [-2.2015    , -0.24969109, -1.21305],
+                               [-1.77636043, -1.32625112,  5.01305],
+                               [-1.78138268, -1.31755219, -1.21305],
+                               [-0.87493138, -2.03693277, 5.01305 ],
+                               [-0.8912978 , -2.02748378, -1.21305],
+                               [ 0.26489725, -2.2015    ,  5.01305],
+                               [ 0.25364439, -2.2015    , -1.21305],
+                               [ 1.3269198 , -1.7759744 ,  5.01305],
+                               [ 1.32258793, -1.77847528, -1.21305],
+                               [ 2.03616649, -0.87625871,  5.01305],
+                               [ 2.02936825, -0.8880338 , -1.21305]])
+
+        # every entry here corresponds to the entries in the array above, these are
+        # used for rotating the projected uv plane
+        self.panelRotations = np.array([ 90,  90, 135, 135, 180, 180, 225, 225, 270, 270, 315, 315, 360,
+                                   360, 405, 405,  90,  90, 120, 120, 150, 150, 180, 180, 210, 210,
+                                   240, 240, 270, 270, 300, 300, 330, 330, 360, 360, 390, 390, 420,
+                                   420])
+
+        # the layer and ladder arrays, for finding them from sensor id
+        self.panelLayer = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
+        self.panelLadder = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21])
+
+        # all transpormaations are stored in a dict, with the sensor id as a keyword
+        self.transformation = {}
+        self.layersLadders = {}
+        for i in range(len(self.panelIDs)):
+            self.transformation[str(self.panelIDs[i])] = [self.panelShifts[i], self.panelRotations[i]]
+            self.layersLadders[str(self.panelIDs[i])] = [self.panelLayer[i], self.panelLadder[i]]
+
+        # these are the branch names for cluster info in the root file
+        self.gotClusters = False
+        self.clusters = ['PXDClusters/PXDClusters.m_clsCharge',
+                         'PXDClusters/PXDClusters.m_seedCharge',
+                         'PXDClusters/PXDClusters.m_clsSize',
+                         'PXDClusters/PXDClusters.m_uSize',
+                         'PXDClusters/PXDClusters.m_vSize',
+                         'PXDClusters/PXDClusters.m_uPosition',
+                         'PXDClusters/PXDClusters.m_vPosition',
+                         'PXDClusters/PXDClusters.m_sensorID']
+
+        # these are the branch names for cluster digits in the root file
+        self.digits = ['PXDDigits/PXDDigits.m_uCellID',
+                       'PXDDigits/PXDDigits.m_vCellID',
+                       'PXDDigits/PXDDigits.m_charge']
+
+        # this establishes the relationship between clusters and digits
+        # because for some reaseon the branch for digits has a different
+        # size than the cluster branch
+        self.clusterToDigis = 'PXDClustersToPXDDigits/m_elements/m_elements.m_to'
+
+        # these are the branch names for monte carlo data in the root file
+        self.mcData = ['MCParticles/MCParticles.m_pdg',
+                       'MCParticles/MCParticles.m_momentum_x',
+                       'MCParticles/MCParticles.m_momentum_y',
+                       'MCParticles/MCParticles.m_momentum_z']
+
+        # these two establish the relation ship to an from clusters and monte carlo
+        # there more entries than in the cluster data, but there still mc data missing
+        # for some cluster files
+        self.clusterToMC = 'PXDClustersToMCParticles/m_elements/m_elements.m_to'
+        self.mcToCluster = 'PXDClustersToMCParticles/m_elements/m_elements.m_from'
+
+        # this dict stores the data
+        self.data = {}
+
+    def __getitem__(self, index: str | int | ArrayLike) -> np.ndarray | dict:
+        """
+        this makes the class subscriptable, one can retrieve one coloumn by using
+        strings as keywords, or get a row by using integer indices or arrays
+        """
+        if isinstance(index, str):
+            return self.data[index]
+        return {key: value[index] for key, value in self.data.items()}
+
+    def loadData(self, file: str) -> None:
+        """
+        reads the file off of the harddrive, it automatically creates event numbers
+        file: str = it's the whole file path + .root ending
+        """
+        self.eventTree = ur.open(f'{file}:tree')
+        self._genEventNumbers()
+
+    def _genEventNumbers(self) -> None:
+        """
+        a private method that gets called on file import
+        it generates the event numbers from the jagged arrays
+        coming from the branches
+        """
+        eventNumbers = []
+        clusters = self.eventTree.arrays('PXDClusters/PXDClusters.m_clsCharge', library='np')['PXDClusters/PXDClusters.m_clsCharge']
+        for i in range(len(clusters)):
+            eventNumbers.append(np.array([i]*len(clusters[i])))
+        self.data['eventNumbers'] = self._flatten(eventNumbers)
+
+    def _getData(self, keyword: str, library: str = 'np') -> np.ndarray:
+        """
+        a private method for converting branches into something useful, namely
+        into numpy arrays, if the keyward library is set to np.
+        keyword: str = the full branch name
+        library: str = can be 'np' (numpy), 'pd' (pandas) or 'ak' (akward)
+                       see uproot documentation for more info
+        """
+        try:
+            data = self.eventTree.arrays(keyword, library=library)[keyword]
+            return self._flatten(data)
+        except:
+            return KeyError
+
+    def _flatten(self, structure: ArrayLike, maxDepth: int = None, currentDepth: int = 0) -> np.ndarray:
+        """
+        this is a private function, that gets called during loading branches
+        it flattens ragged array, one can set the depths to which one wants to flatten
+        structure: the list/array to flatten
+        maxDepth: int = the amount of flattening
+        currentDepth: int = don't touch this, it's used for recursively calling
+        """
+        flat_list = []
+
+        for element in structure:
+            if isinstance(element, (list, np.ndarray)) and (maxDepth is None or currentDepth < maxDepth):
+                flat_list.extend(self._flatten(element, maxDepth, currentDepth + 1))
+            else:
+                flat_list.append(element)
+
+        return np.array(flat_list, dtype=object)
+
+    def getClusters(self) -> None:
+        """
+        this uses the array from __init__ to load different branches into the data dict
+        """
+        self.gotClusters = True
+        for branch in self.clusters:
+            data = self._getData(branch)
+            keyword = branch.split('_')[-1]
+            self.data[keyword] = data
+
+    def getMatrices(self) -> None:
+        """
+        loads the digit branches into arrays and converts them into adc matrices
+        """
+        uCellIDs = self.eventTree.arrays(self.digits[0], library='np')[self.digits[0]]
+        vCellIDs = self.eventTree.arrays(self.digits[1], library='np')[self.digits[1]]
+        cellCharges = self.eventTree.arrays(self.digits[2], library='np')[self.digits[2]]
+
+        # this establishes the relation between digits and clusters, it's still
+        # shocking to me, that this is necessary, why aren't digits stored in the
+        # same way as clusters, than one wouldn't need to jump through hoops just
+        # to have the data in a usable und sensible manner
+        # root is such a retarded file format
+        clusterDigits = self.eventTree.arrays(self.clusterToDigis, library='np')[self.clusterToDigis]
+
+        self.data['cluster'] = self._genMatrices(uCellIDs, vCellIDs, cellCharges, clusterDigits).astype('int')
+
+    def _genMatrices(self, uCellIDs: ArrayLike, vCellIDs: ArrayLike, cellCharges: ArrayLike, clusterDigits: ArrayLike, matrixSize: tuple = (9, 9)) -> np.ndarray:
+        """
+        this takes the ragged/jagged digit arrays and converts them into 9x9 matrices
+        it's a rather slow process because of all the looping
+        """
+        plotRange = np.array(matrixSize) // 2
+        events = []
+
+        for event in range(len(cellCharges)):
+            adcValues = []
+            digitsU = np.array(uCellIDs[event])
+            digitsV = np.array(vCellIDs[event])
+            digitsCharge = np.array(cellCharges[event])
+            digitIndices = clusterDigits[event]
+
+            for indices in digitIndices:
+                cacheImg = np.zeros(matrixSize)
+                maxChargeIndex = digitsCharge[indices].argmax()
+                uMax, vMax = digitsU[indices[maxChargeIndex]], digitsV[indices[maxChargeIndex]]
+                uPos, vPos = digitsU[indices] - uMax + plotRange[0], digitsV[indices] - vMax + plotRange[1]
+
+                valid_indices = (uPos >= 0) & (uPos < matrixSize[0]) & (vPos >= 0) & (vPos < matrixSize[1])
+                cacheImg[uPos[valid_indices].astype(int), vPos[valid_indices].astype(int)] = digitsCharge[indices][valid_indices]
+                adcValues.append(cacheImg)
+
+            events.extend(adcValues)
+
+        return np.array(events, dtype=object)
+
+    def genCoordisnate(self) -> None:
+        """
+        converting the uv coordinates, together with sensor ids, into xyz coordinates
+        """
+        if self.gotClusters is False:
+            self.getClusters()
+        xcoords, ycoords, zcoords = self._getCartesian(self.data['uPosition'], self.data['vPosition'], self.data['sensorID'])
+        self.data['xPosition'] = xcoords
+        self.data['yPosition'] = ycoords
+        self.data['zPosition'] = zcoords
+
+    def _getCartesian(self, uPositions: ArrayLike, vPositions: ArrayLike, sensorIDs: ArrayLike) -> tuple[np.ndarray]:
+        """
+        a private method for transposing/converting 2d uv coords into 3d xyz coordinates
+        """
+        length = len(sensorIDs)
+        xArr, yArr, zArr = np.zeros(length), np.zeros(length), np.zeros(length)
+
+        # iterting over the cluster arrays
+        for index, (u, v, sensor_id) in enumerate(zip(uPositions, vPositions, sensorIDs)):
+            # grabbing the shift vector and rotation angle
+            shift, angle = self.transformation[str(sensor_id)]
+
+            # setting up rotation matrix
+            theta = np.deg2rad(angle)
+            rotMatrix = np.array([[np.cos(theta), -np.sin(theta), 0], [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
+
+            # projecting uv coordinates into 3d space
+            point = np.array([u, 0, v])
+
+            # shifting and rotating the projected vector
+            shifted = rotMatrix.dot(point) + shift
+            xArr[index], yArr[index], zArr[index] = shifted
+
+        return xArr, yArr, zArr
+
+    def getLayers(self) -> None:
+        """
+        looks up the corresponding layers and ladders for every cluster
+        """
+        if self.gotClusters is False:
+            self.getClusters()
+        layers, ladders = [], []
+        for id in self.data['sensorID']:
+            layer, ladder = self.layersLadders[str(id)]
+            layers.append(layer)
+            ladders.append(ladder)
+        self.data['layers'] = np.array(layers)
+        self.data['ladder'] = np.array(ladders)
+
+    def getMCData(self) -> None:
+        """
+        this loads the monte carlo from the root file
+        """
+
+        # the monte carlo data, they are longer than the cluster data
+        pdg = self.eventTree.arrays(self.mcData[0], library='np')[self.mcData[0]]
+        momentumX = self.eventTree.arrays(self.mcData[1], library='np')[self.mcData[1]]
+        momentumY = self.eventTree.arrays(self.mcData[2], library='np')[self.mcData[2]]
+        momentumZ = self.eventTree.arrays(self.mcData[3], library='np')[self.mcData[3]]
+
+        # this loads the relation ships to and from clusters and mc data
+        # this is the same level of retardedness as with the cluster digits
+        clusterToMC = self.eventTree.arrays(self.clusterToMC, library='np')[self.clusterToMC]
+        mcToCluster = self.eventTree.arrays(self.mcToCluster, library='np')[self.mcToCluster]
+
+        # it need the cluster charge as a jagged/ragged array, maybe I could simply
+        # use the event numbers, but I am too tired to fix this shitty file format
+        clsCharge = self.eventTree.arrays('PXDClusters/PXDClusters.m_clsCharge', library='np')['PXDClusters/PXDClusters.m_clsCharge']
+
+        # reorganizing MC data
+        momentumXList = []
+        momentumYList = []
+        momentumZList = []
+        pdgList = []
+        clusterNumbersList = []
+        for i in range(len(clusterToMC)):
+            # _fillMCList fills in the missing spots, because there are not mc data for
+            # every cluster, even though there are more entries in this branch than
+            # in the cluster branch... as I said, the root format is retarded
+            fullClusterReferences = self._fillMCList(mcToCluster[i], clusterToMC[i], len(clsCharge[i]))
+            clusterNumbersList.append(fullClusterReferences)
+            pdgs, xmom, ymom, zmom = self._getMCData(fullClusterReferences, pdg[i], momentumX[i], momentumY[i], momentumZ[i])
+            momentumXList.append(xmom)
+            momentumYList.append(ymom)
+            momentumZList.append(zmom)
+            pdgList.append(pdgs)
+
+        self.data['momentumX'] = self._flatten(momentumXList)
+        self.data['momentumY'] = self._flatten(momentumYList)
+        self.data['momentumZ'] = self._flatten(momentumZList)
+        self.data['pdg'] = self._flatten(pdgList)
+        self.data['clsNumber'] = self._flatten(clusterNumbersList)
+
+    def _findMissing(self, lst: list, length: int) -> list:
+        """
+        a private method for finding missing elements in mc data arrays
+        """
+        return sorted(set(range(0, length)) - set(lst))
+
+    def _fillMCList(self, fromClusters: ArrayLike, toClusters: ArrayLike, length: ArrayLike) -> list:
+        """
+        a private method for filling MC data arrays where clusters don't have
+        any information
+        """
+        missingIndex = self._findMissing(fromClusters, length)
+        testList = [-1] * length
+        fillIndex = 0
+        for i in range(len(testList)):
+            if i in missingIndex:
+                testList[i] = -1
+            else:
+                try:
+                    testList[i] = int(toClusters[fillIndex])
+                except TypeError:
+                    testList[i] = int(toClusters[fillIndex][0])
+                fillIndex += 1
+        return testList
+
+    def _getMCData(self, toClusters: ArrayLike, pdgs: ArrayLike, xMom: ArrayLike, yMom: ArrayLike, zMom: ArrayLike) -> tuple[np.ndarray]:
+        """
+        after filling and reorganizing MC data arrays one can finally collect the
+        actual MC data, where there's data missing I will with zeros
+        """
+        pxList, pyList, pzList = [], [], []
+        pdgList = []
+        for references in toClusters:
+            if references == -1:
+                pxList.append(0)
+                pyList.append(0)
+                pzList.append(0)
+                pdgList.append(0)
+            else:
+                pxList.append(xMom[references])
+                pyList.append(yMom[references])
+                pzList.append(zMom[references])
+                pdgList.append(pdgs[references])
+        return np.array(pdgList,dtype=list), np.array(pxList,dtype=list), np.array(pyList,dtype=list), np.array(pzList,dtype=list)
+
+    def getStructuredArray(self) -> np.ndarray:
+        """
+        this converts the data dict of this class into a structured numpy array
+        """
+        # Create a list to hold the dtype specifications
+        dtype = []
+
+        # Iterate through the dictionary keys and values
+        for key, value in self.data.items():
+            # Determine the data type of the first value in the list
+            sampleValue = value[0]
+
+            if isinstance(sampleValue, np.ndarray):
+                # If the value is an array, use its shape and dtype
+                fieldDtype = (sampleValue.dtype, sampleValue.shape)
+            else:
+                # Otherwise, use the type of the value itself
+                fieldDtype = type(sampleValue)
+
+            # Append the key and data type to the dtype list
+            dtype.append((key, fieldDtype))
+
+        # Convert the dictionary to a list of tuples
+        keys = list(self.data.keys())
+        dataList = [tuple(self.data[key][i] for key in keys) for i in range(len(self.data[keys[0]]))]
+
+        # Create the structured array
+        structuredArray = np.array(dataList, dtype=dtype)
+
+        return structuredArray
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
+import setuptools
+
+with open("README.md", "r") as fh:
+    description = fh.read()
+
+setuptools.setup(
+    name="rootable",
+    version="0.0.1",
+    author="Johannes Bilk",
+    author_email="johannes.bilk@physik.uni-giessen.de",
+    packages=["rootable"],
+    description="A simple packages for extracting PXD data from root files",
+    long_description=description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/gituser/test-tackage",
+    license='MIT',
+    python_requires='>=3.10',
+    install_requires=[],
+    keywords=['python', 'pxd', 'root'],
+    classifiers= [
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Researchers",
+        "Programming Language :: Python :: 3",
+        "Operating System :: MacOS :: MacOS X",
+        "Operating System :: Microsoft :: Windows",
+    ]
+)