-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloader.py
More file actions
116 lines (79 loc) · 2.79 KB
/
loader.py
File metadata and controls
116 lines (79 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import os
import torch
import numpy as np
import re
# Dataset partitions expected on disk: one CSV (and optional feather cache)
# per split under the data directory.
_SPLITS = ["train", "dev", "test"]
# Target columns excluded from feature normalization: a binary "Label" and a
# categorical "attack_cat".
# NOTE(review): these names suggest the UNSW-NB15 intrusion-detection
# dataset — confirm against the preprocessing pipeline.
_LABELS = ["Label", "attack_cat"]
def _load(csv_file: str, cache_file: str):
if os.path.exists(cache_file):
return pd.read_feather(cache_file)
else:
return pd.read_csv(csv_file, low_memory=False)
def load(dir: str = "data/processed/"):
    """Load every split from *dir*, returning {split_name: DataFrame}."""
    splits = {}
    for name in _SPLITS:
        csv_path = os.path.join(dir, f"{name}.csv")
        cache_path = os.path.join(dir, f"{name}.feather")
        splits[name] = _load(csv_path, cache_path)
    return splits
def must_norm(col):
    """Return True if *col* is a feature column that should be z-normalized.

    Label columns and columns whose name ends in "_<digits>" (one-hot /
    dummy-encoded columns, by the look of the naming scheme) are skipped.
    """
    is_label = col in _LABELS
    is_dummy = re.match(r".+_[0-9]+$", col) is not None
    return not (is_label or is_dummy)
class DataLoader:
    """Batched sliding-window iterator over the train/dev/test splits.

    Feature columns are z-normalized with the *train* split's statistics
    (applied to every split, avoiding test-set leakage), and batches are
    yielded as float32 tensors of shape (batch_size, W, n_features) and
    (batch_size, W, n_labels).
    """

    def __init__(self, batch_size=32, device="cuda", W=100):
        # W is the sliding-window length (rows per sequence).
        self.data = load()
        self.batch_size = batch_size
        self.device = device
        self.W = W
        # Normalize features to zero mean and unit variance using the
        # train split's statistics for every split.
        mean = self.data["train"].mean()
        var = self.data["train"].var()
        for split in self.data:
            for col in self.data[split]:
                if must_norm(col):
                    # 1e-6 guards against division by zero on constant columns.
                    self.data[split][col] = (self.data[split][col] - mean[col]) / (
                        var[col] + 1e-6
                    ) ** 0.5
        # Per-split caches of the raw feature/label ndarrays so the
        # DataFrame -> numpy conversion happens once per split.
        self.cache_x = {}
        self.cache_y = {}

    def len(self, split):
        """Number of full batches one pass of iter() yields for *split*."""
        return self.data[split].shape[0] // self.batch_size // self.W

    def iter(self, split, drop_last=True, W=None):
        """Yield (x, y) batches of random length-W windows from *split*.

        Args:
            split: one of "train", "dev", "test".
            drop_last: if False, additionally yield the final partial batch.
                NOTE(review): the remainder is yielded unreshaped, i.e. as
                (n_rem, W, features) without a batch dimension — confirm
                callers expect that.
            W: window-length override; defaults to self.W.
        """
        if W is None:
            W = self.W
        # BUG FIX: the caches were checked but never populated, so the
        # expensive DataFrame -> ndarray conversion ran on every call.
        # Only the raw 2-D arrays are cached (before any windowing), and
        # all later steps rebind rather than mutate, so caching is safe
        # even when W varies between calls.
        if split in self.cache_x:
            x = self.cache_x[split]
        else:
            x = self.data[split].drop(columns=_LABELS).to_numpy()
            self.cache_x[split] = x
        if split in self.cache_y:
            y = self.cache_y[split]
        else:
            y = self.data[split].loc[:, _LABELS].to_numpy()
            self.cache_y[split] = y
        # Sample random window start offsets (with replacement), enough to
        # cover the split roughly once per pass.
        # NOTE(review): requires x.shape[0] > W, otherwise randint raises.
        starts = np.random.randint(0, x.shape[0] - W, x.shape[0] // W)
        x = np.stack([x[i : i + W] for i in starts], axis=0)
        y = np.stack([y[i : i + W] for i in starts], axis=0)
        # Shuffle the sequences
        idx = np.random.permutation(x.shape[0])
        x = x[idx]
        y = y[idx]
        # Drop last elements so that the batch size divides evenly
        n = (x.shape[0] // self.batch_size) * self.batch_size
        x, x_rem = x[:n], x[n:]
        y, y_rem = y[:n], y[n:]
        x = x.reshape(-1, self.batch_size, W, x.shape[-1])
        y = y.reshape(-1, self.batch_size, W, y.shape[-1])
        kwargs = {"dtype": torch.float32, "device": self.device}
        for xx, yy in zip(x, y):
            yield torch.tensor(xx, **kwargs), torch.tensor(yy, **kwargs)
        if not drop_last:
            yield torch.tensor(x_rem, **kwargs), torch.tensor(y_rem, **kwargs)
if __name__ == "__main__":
    # Smoke test: pull a single training batch and report its shapes.
    loader = DataLoader()
    for batch_x, batch_y in loader.iter("train"):
        print(batch_x.shape, batch_y.shape)
        break