-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloader.py
More file actions
116 lines (79 loc) · 2.79 KB
/
loader.py
File metadata and controls
116 lines (79 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import os
import torch
import numpy as np
import re
# Dataset partitions expected on disk: one CSV (and optional feather cache)
# per split under the data directory.
_SPLITS = ["train", "dev", "test"]
# Target columns excluded from feature normalization: a binary "Label" and a
# categorical "attack_cat".
# NOTE(review): these names suggest the UNSW-NB15 intrusion-detection
# dataset — confirm against the preprocessing pipeline.
_LABELS = ["Label", "attack_cat"]
def _load(csv_file: str, cache_file: str):
if os.path.exists(cache_file):
return pd.read_feather(cache_file)
else:
return pd.read_csv(csv_file, low_memory=False)
def load(dir: str = "data/processed/"):
    """Load every split from *dir*, returning {split_name: DataFrame}."""
    splits = {}
    for name in _SPLITS:
        csv_path = os.path.join(dir, f"{name}.csv")
        cache_path = os.path.join(dir, f"{name}.feather")
        splits[name] = _load(csv_path, cache_path)
    return splits
def must_norm(col):
    """Return True if *col* is a feature column that should be z-normalized.

    Label columns and columns whose name ends in "_<digits>" (one-hot /
    dummy-encoded columns, by the look of the naming scheme) are skipped.
    """
    is_label = col in _LABELS
    is_dummy = re.match(r".+_[0-9]+$", col) is not None
    return not (is_label or is_dummy)
class DataLoader:
    """Batched sliding-window iterator over the train/dev/test splits.

    Feature columns are z-normalized with the *train* split's statistics
    (applied to every split, avoiding test-set leakage), and batches are
    yielded as float32 tensors of shape (batch_size, W, n_features) and
    (batch_size, W, n_labels).
    """

    def __init__(self, batch_size=32, device="cuda", W=100):
        # W is the sliding-window length (rows per sequence).
        self.data = load()
        self.batch_size = batch_size
        self.device = device
        self.W = W
        # Normalize features to zero mean and unit variance using the
        # train split's statistics for every split.
        mean = self.data["train"].mean()
        var = self.data["train"].var()
        for split in self.data:
            for col in self.data[split]:
                if must_norm(col):
                    # 1e-6 guards against division by zero on constant columns.
                    self.data[split][col] = (self.data[split][col] - mean[col]) / (
                        var[col] + 1e-6
                    ) ** 0.5
        # Per-split caches of the raw feature/label ndarrays so the
        # DataFrame -> numpy conversion happens once per split.
        self.cache_x = {}
        self.cache_y = {}

    def len(self, split):
        """Number of full batches one pass of iter() yields for *split*."""
        return self.data[split].shape[0] // self.batch_size // self.W

    def iter(self, split, drop_last=True, W=None):
        """Yield (x, y) batches of random length-W windows from *split*.

        Args:
            split: one of "train", "dev", "test".
            drop_last: if False, additionally yield the final partial batch.
                NOTE(review): the remainder is yielded unreshaped, i.e. as
                (n_rem, W, features) without a batch dimension — confirm
                callers expect that.
            W: window-length override; defaults to self.W.
        """
        if W is None:
            W = self.W
        # BUG FIX: the caches were checked but never populated, so the
        # expensive DataFrame -> ndarray conversion ran on every call.
        # Only the raw 2-D arrays are cached (before any windowing), and
        # all later steps rebind rather than mutate, so caching is safe
        # even when W varies between calls.
        if split in self.cache_x:
            x = self.cache_x[split]
        else:
            x = self.data[split].drop(columns=_LABELS).to_numpy()
            self.cache_x[split] = x
        if split in self.cache_y:
            y = self.cache_y[split]
        else:
            y = self.data[split].loc[:, _LABELS].to_numpy()
            self.cache_y[split] = y
        # Sample random window start offsets (with replacement), enough to
        # cover the split roughly once per pass.
        # NOTE(review): requires x.shape[0] > W, otherwise randint raises.
        starts = np.random.randint(0, x.shape[0] - W, x.shape[0] // W)
        x = np.stack([x[i : i + W] for i in starts], axis=0)
        y = np.stack([y[i : i + W] for i in starts], axis=0)
        # Shuffle the sequences
        idx = np.random.permutation(x.shape[0])
        x = x[idx]
        y = y[idx]
        # Drop last elements so that the batch size divides evenly
        n = (x.shape[0] // self.batch_size) * self.batch_size
        x, x_rem = x[:n], x[n:]
        y, y_rem = y[:n], y[n:]
        x = x.reshape(-1, self.batch_size, W, x.shape[-1])
        y = y.reshape(-1, self.batch_size, W, y.shape[-1])
        kwargs = {"dtype": torch.float32, "device": self.device}
        for xx, yy in zip(x, y):
            yield torch.tensor(xx, **kwargs), torch.tensor(yy, **kwargs)
        if not drop_last:
            yield torch.tensor(x_rem, **kwargs), torch.tensor(y_rem, **kwargs)
if __name__ == "__main__":
    # Smoke test: pull a single training batch and report its shapes.
    loader = DataLoader()
    for batch_x, batch_y in loader.iter("train"):
        print(batch_x.shape, batch_y.shape)
        break