Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
from numpy.typing import ArrayLike
from typing import Iterable, Any
import re
class FancyDict:
def __init__(self, data: dict = None) -> None:
self.data = data if data is not None else {}
def __getitem__(self, index: str | int | ArrayLike):
"""
this makes the class subscriptable, one can retrieve one coloumn by using
strings as keywords, or get a row by using integer indices or arrays
"""
if isinstance(index, str):
return self.data[index]
return self.__class__({key: value[index] for key, value in self.data.items()})
def __setitem__(self, index: str | int | ArrayLike, value: dict | Any) -> None:
"""
Allows setting the value of a column by using strings as keywords,
setting the value of a row by using integer indices or arrays,
or setting a specific value using a tuple of key and index.
:param index: The column name, row index, or tuple of key and index.
:param value: The value to set.
"""
if isinstance(index, str):
assert len(value) == len(self.data[list(self.data.keys())[0]]), 'value should have same length as data'
self.data[index] = value
elif isinstance(index, tuple) and len(index) == 2 and isinstance(index[0], str) and isinstance(index[1], int):
key, idx = index
assert key in self.data, f"key {key} not found in data"
self.data[key][idx] = value
else:
assert isinstance(value, dict), "value must be a dictionary when setting rows"
assert set(value.keys()) == set(self.data.keys()), "keys of value must match keys of data"
for key in self.data:
self.data[key][index] = value[key]
def where(self, *conditions: str) -> dict:
"""
Filters the data based on the provided conditions.
:param conditions: List of conditions as strings for filtering. The keys should be the names of the data fields, and the conditions should be in a format that can be split into key, operator, and value.
:return: Instance of the class containing the filtered data.
"""
filteredData = self.data.copy()
mask = np.ones(len(next(iter(self.data.values()))), dtype=bool) # Initial mask allowing all elements
# Applying the conditions to create the mask
for condition in conditions:
match = re.match(r'(\w+)\s*([<>=]=?| in )\s*(.+)', condition)
if match is None:
raise ValueError(f"Invalid condition: {condition}")
key, op, value = match.groups()
op = op.strip() # remove any leading and trailing spaces
if op == 'in':
value = eval(value)
mask &= np.isin(self.data[key], value)
else:
if value.replace('.', '').isdigit():
comparisionValue = float(value)
fieldValues = self.data[key].astype(float)
elif value.lower() == 'true' or value.lower() == 'false':
comparisionValue = True if value.lower() == 'true' else False
fieldValues = self.data[key]
else:
comparisionValue = value
fieldValues = self.data[key]
# Determine the correct comparison to apply
operation = {
'==': np.equal,
'<': np.less,
'>': np.greater,
'<=': np.less_equal,
'>=': np.greater_equal,
}.get(op)
if operation is None:
raise ValueError(f"Invalid operator {op}")
mask &= operation(fieldValues, comparisionValue)
# Applying the mask to filter the data
for key, values in filteredData.items():
filteredData[key] = values[mask]
return self.__class__(data=filteredData)
def __repr__(self) -> str:
return str(self.data)
def __iter__(self) -> Iterable:
keys = list(self.data.keys())
numRows = len(self.data[keys[0]])
for i in range(numRows):
yield {key: self.data[key][i] for key in keys}
def __len__(self) -> int:
return len(self.data)
def keys(self) -> list:
return list(self.data.keys())
def items(self) -> list:
return self.data.items()
def values(self) -> list:
return self.data.values()
def get(self, key: str) -> np.ndarray:
return self.data.get(key)
def pop(self, key: str) -> None:
return self.data.pop(key)
@property
def numClusters(self) -> int:
key = list(self.keys())[0]
return len(self.data[key])
def stack(self, *columns, toKey: str, pop: bool = True) -> None:
"""
Stacks specified columns into a single column and stores it under a new key.
:param columns: The columns to stack.
:param toKey: The new key where the stacked column will be stored.
:param pop: Whether to remove the original columns.
"""
# Check that all specified columns exist
for column in columns:
if column not in self.data:
raise KeyError(f"Column '{column}' does not exist.")
# Column stack the specified columns
stackedColumn = np.column_stack([self.data[col] for col in columns])
# Flatten if it's 1D for consistency
if stackedColumn.shape[1] == 1:
stackedColumn = stackedColumn.flatten()
# Store it under the new key
self.data[toKey] = stackedColumn
# Remove the original columns if pop is True
if pop:
for column in columns:
self.data.pop(column)