Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import operator
import re
from typing import Iterable, Any

import numpy as np
from numpy.typing import ArrayLike
class FancyDict:
def __init__(self, data: dict = None) -> None:
self.data = data if data is not None else {}
def __getitem__(self, index: str | int | ArrayLike):
"""
this makes the class subscriptable, one can retrieve one coloumn by using
strings as keywords, or get a row by using integer indices or arrays
"""
if isinstance(index, str):
return self.data[index]
return self.__class__({key: value[index] for key, value in self.data.items()})
def __setitem__(self, index: str | int | ArrayLike, value: dict | Any) -> None:
"""
Allows setting the value of a column by using strings as keywords,
setting the value of a row by using integer indices or arrays,
or setting a specific value using a tuple of key and index.
:param index: The column name, row index, or tuple of key and index.
:param value: The value to set.
"""
if isinstance(index, str):
assert len(value) == len(self.data[list(self.data.keys())[0]]), 'value should have same length as data'
self.data[index] = value
elif isinstance(index, tuple) and len(index) == 2 and isinstance(index[0], str) and isinstance(index[1], int):
key, idx = index
assert key in self.data, f"key {key} not found in data"
self.data[key][idx] = value
else:
assert isinstance(value, dict), "value must be a dictionary when setting rows"
assert set(value.keys()) == set(self.data.keys()), "keys of value must match keys of data"
for key in self.data:
self.data[key][index] = value[key]
def set(self, keyWord: str, value: list | np.ndarray) -> None:
"""
an in-place method for setting values
"""
if keyWord in self.data:
self.data[keyWord] = np.concatenate((self.data[keyWord], value))
else:
self.data[keyWord] = np.array(value)
def extend(self, value: dict, axis: int = None) -> None:
"""
an in-place method for extending certain keys
"""
assert isinstance(value, dict), "value must be a dictionary when setting rows"
assert set(value.keys()).issubset(set(self.data.keys())), "keys of value must be a subset of keys of data"
for key in value:
self.data[key] = np.concatenate((self.data[key], value[key]), axis=axis)
def where(self, *conditions: str) -> dict:
"""
Filters the data based on the provided conditions.
:param conditions: List of conditions as strings for filtering. The keys should be the names of the data fields, and the conditions should be in a format that can be split into key, operator, and value.
:return: Instance of the class containing the filtered data.
"""
filteredData = self.data.copy()
mask = np.ones(len(next(iter(self.data.values()))), dtype=bool) # Initial mask allowing all elements
# Applying the conditions to create the mask
for condition in conditions:
match = re.match(r'(\w+)\s*([<>=]=?| in )\s*(.+)', condition)
if match is None:
raise ValueError(f"Invalid condition: {condition}")
key, op, value = match.groups()
op = op.strip() # remove any leading and trailing spaces
if op == 'in':
value = eval(value)
mask &= np.isin(self.data[key], value)
else:
try:
# Attempt to convert value to float or boolean
if value.lower() in ['true', 'false']:
comparisionValue = value.lower() == 'true'
else:
comparisionValue = float(value)
except ValueError:
# If conversion fails, treat it as a string
fieldValues = self.data[key]
# Determine the correct comparison to apply
operation = {
'==': np.equal,
'<': np.less,
'>': np.greater,
'<=': np.less_equal,
'>=': np.greater_equal,
}.get(op)
if operation is None:
raise ValueError(f"Invalid operator {op}")
mask &= operation(fieldValues, comparisionValue)
# Applying the mask to filter the data
for key, values in filteredData.items():
filteredData[key] = values[mask]
return self.__class__(data=filteredData)
def __repr__(self) -> str:
return f'fancyDict({repr(self.data)})'
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def __iter__(self) -> Iterable:
keys = list(self.data.keys())
numRows = len(self.data[keys[0]])
for i in range(numRows):
yield {key: self.data[key][i] for key in keys}
def __len__(self) -> int:
return len(self.data)
def keys(self) -> list:
return list(self.data.keys())
def items(self) -> list:
return self.data.items()
def values(self) -> list:
return self.data.values()
def get(self, key: str) -> np.ndarray:
return self.data.get(key)
def pop(self, key: str) -> None:
return self.data.pop(key)
@property
def numClusters(self) -> int:
key = list(self.keys())[0]
return len(self.data[key])
def stack(self, *columns, toKey: str, pop: bool = True) -> None:
"""
Stacks specified columns into a single column and stores it under a new key.
:param columns: The columns to stack.
:param toKey: The new key where the stacked column will be stored.
:param pop: Whether to remove the original columns.
"""
# Check that all specified columns exist
for column in columns:
if column not in self.data:
raise KeyError(f"Column '{column}' does not exist.")
# Column stack the specified columns
stackedColumn = np.column_stack([self.data[col] for col in columns])
# Flatten if it's 1D for consistency
if stackedColumn.shape[1] == 1:
stackedColumn = stackedColumn.flatten()
# Store it under the new key
self.data[toKey] = stackedColumn
# Remove the original columns if pop is True
if pop:
for column in columns:
self.data.pop(column)