fyler_code_fm/data.py at main · cavalab/fyler_code_fm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import torch
import h5py
import pandas as pd
import numpy as np
from operator import itemgetter
from scipy.sparse import load_npz
import pickle

class H5Dataset(torch.utils.data.Dataset):
    """
    Index-addressable view over the ``"tracings"`` dataset of an `h5` file.

    The file is opened briefly in ``__init__`` only to count the records; the
    handle used for reading is created on the first ``__getitem__`` call and
    then kept on the instance. For the background of this pattern see
    https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16?fbclid=IwAR2jFrRkKXv4PL9urrZeiHT_a3eEn7eZDWjUaQ-zcLP6BRtMO7e0nMgwlKU

    path: str
        Path to an `h5` file that contains a ``"tracings"`` dataset.
    preprocessing: callable or None
        Optional function applied to each raw sample before it is returned.
    """

    def __init__(self, path, preprocessing=None):
        self.file_path = path
        self.preprocessing = preprocessing
        # Filled in lazily by __getitem__.
        self.dataset = None
        # Only the record count is needed up front, so close the file again.
        with h5py.File(self.file_path, "r") as h5_file:
            self.dataset_len = len(h5_file["tracings"])

    def __len__(self):
        """Return the number of records counted at construction time."""
        return self.dataset_len

    def __getitem__(self, index):
        """Return record `index`, passed through `preprocessing` when one is set."""
        if self.dataset is None:
            self.dataset = h5py.File(self.file_path, "r")["tracings"]
        record = self.dataset[index]
        if self.preprocessing is None:
            return record
        return self.preprocessing(record)


class H5DatasetMap(torch.utils.data.Dataset):
    """
    Read ECG data corresponding to a file specifying the records to sample.

    ecg_path: str
        Path to an `h5` file. Keys are ECG IDs, values are ECG tracings.
    sample_path: str
        Path to a text file with one ECG ID per line. Only these will be used.
        Surrounding whitespace is stripped and blank lines are ignored.
    preprocessing: callable
        Optional function applied to each sample (after converting to tensor).
    return_id: bool
        When true, ``__getitem__`` returns ``(ecg_id, sample)`` instead of
        just ``sample``.
    """
    def __init__(self, ecg_path, sample_path, preprocessing=None, return_id=False):
        self.file_path = ecg_path
        self.sample_path = sample_path
        # The h5 file is opened on first access in __getitem__, not here.
        self.dataset = None
        self.preprocessing = preprocessing
        self.return_id = return_id
        with open(sample_path, "r") as file:
            stripped = (line.strip() for line in file)
            # A blank line (e.g. an extra trailing newline) is not an ECG ID.
            # Keeping it would inflate the dataset length and fail later on
            # the h5 lookup, so such lines are dropped here.
            self.sample_ids = dict(enumerate(sid for sid in stripped if sid))
        self.dataset_len = len(self.sample_ids)

    def __getitem__(self, index):
        """Return the float32 tensor for position `index` (with its ID if `return_id`)."""
        if self.dataset is None:
            self.dataset = h5py.File(self.file_path, "r")
        sample_id = self.sample_ids[index]
        # read h5 dataset and convert to tensor
        sample = torch.tensor(self.dataset[sample_id][()], dtype=torch.float32)
        # apply preprocessing if any
        if self.preprocessing is not None:
            sample = self.preprocessing(sample)
        if self.return_id:
            return sample_id, sample
        # NOTE(review): the original comment stated shape [C,H,W] — confirm
        # against the stored tracings.
        return sample

    def __len__(self):
        """Return the number of ECG IDs read from `sample_path`."""
        return self.dataset_len


# Task identifiers. The labelled dataset classes in this module look for these
# strings inside `label_path` to decide how to read the covariate file.
task = [
    'chd_dx',
    'chd_lvef',
    'cmr',
]

# Per-task name of the covariate column that is renamed to 'age'.
age_name = {
    'chd_dx': 'Patient Age at Event in Years',
    'chd_lvef': 'age_echo',
    'cmr': 'Age_CMR',
}

# Per-task name of the sex/gender covariate column (only listed for 'cmr').
gender_name = {
    'cmr': 'SEX',
}

# Per-task name of the covariate column used as the ECG identifier index.
ecg_id = {
    'chd_dx': 'Event ID Number',
    'chd_lvef': 'ecg_id',
    'cmr': 'ECG_EVENTID',
}

# Alternative diagnosis column set for CMR (currently disabled):
# sv_cols = ["sv", "hlhs", "fontan", "glenn", "triatresia", "dolv", "dilv", "dorv", "dirv", "composite_svproblem"]
# rv_cols = ["tof", "rhf", "rdcavc", "asd", "pa", "tapvr", "ebstein", "truncus", "composite_rvproblem"]
# lv_cols = ["lhf", "coa", "mi", "ldcavc", "bcavc", "ltga", "dtga", "alcapa", "cardiomyopathy", "rejection", "vsd", "myocarditis", "iron", "hohf", "composite_lvproblem"]
# diagnosis_columns = sv_cols + rv_cols + lv_cols

# Active diagnosis column set (CHD LVEF); used when filtering by `train_group`.
diagnosis_columns = [
    'tof', 'cardiomyopathy', 'asd', 'cavc', 'coa', 'dorv', 'dtga',
    'ebstein', 'hlhs', 'ltga', 'pa', 'tapvr', 'triatresia', 'truncus',
    'vsd', 'dextrocardia', 'pacemaker',
]

class H5LabelledDataset(torch.utils.data.Dataset):
    """
    Load an ECG dataset with labels. Meant for supervised tasks.

    ecg_path: str
        A path to an `h5` file. The keys of the file should be ECG IDs, with corresponding ECG tracing values.
    label_path: str
        A path to a csv file containing ECG IDs and their corresponding labels.
        The label/ECG_ID pairs in this file are the only ones that will be used in training.
    labels: list[str] or None
        Optionally specify which columns of `label_path` to use for training. By default every
        column other than the `ECG_ID` index is used.
    train_group: str or None
        Optional diagnosis column name. When given, only rows whose covariate value in that
        column equals 1 are kept. Requires covariates to have been loaded (see below).
    covariate_path: str or None
        Optional path to a csv file of covariates. It is only read when `label_path` contains
        one of the module-level task names; the task decides which column is the ECG ID
        (prefixed with 'BCH') and which column is renamed to 'age'.
    preprocessing: callable or None
        Optional function applied to each raw sample before it is returned.

    Raises
    ------
    ValueError
        If `train_group` is given but no covariates could be loaded.
    """

    # CMR lesion groupings consumed by `assign_lesion_group`. They are defined on the
    # class because no module-level `sv_cols` / `rv_cols` / `lv_cols` names exist.
    SV_COLS = ["sv", "hlhs", "fontan", "glenn", "triatresia", "dolv", "dilv", "dorv", "dirv", "composite_svproblem"]
    RV_COLS = ["tof", "rhf", "rdcavc", "asd", "pa", "tapvr", "ebstein", "truncus", "composite_rvproblem"]
    LV_COLS = ["lhf", "coa", "mi", "ldcavc", "bcavc", "ltga", "dtga", "alcapa", "cardiomyopathy", "rejection", "vsd", "myocarditis", "iron", "hohf", "composite_lvproblem"]

    def __init__(self, ecg_path, label_path, labels=None, train_group=None, covariate_path=None, preprocessing=None):
        self.df_labels = pd.read_csv(label_path).set_index("ECG_ID")
        if labels is None:
            # "ECG_ID" is the index after set_index, so every remaining column is a label.
            self.labels = [c for c in self.df_labels.columns if c != "ECG_ID"]
        else:
            self.labels = labels
        self.ecg_path = ecg_path
        # The h5 file is opened on first access in __getitem__, not here.
        self.ecg_dataset = None
        self.n_labels = len(self.labels)
        self.covariate_path = covariate_path
        self.preprocessing = preprocessing

        # Stays None unless a covariate file is given AND the task can be inferred.
        self.covs = None
        if covariate_path:
            matching_tasks = [t for t in task if t in label_path]
            if matching_tasks:
                # The last matching task wins, as in the original per-task loop,
                # but the csv is only read once.
                t = matching_tasks[-1]
                covs = pd.read_csv(covariate_path, index_col=ecg_id[t])
                covs.index = 'BCH' + covs.index.astype(str)
                covs = covs.rename(columns={age_name[t]: 'age'})
                covs.index.name = 'ECG_ID'
                self.covs = covs

        # Optionally keep only the records belonging to one lesion group.
        if train_group:
            if self.covs is None:
                raise ValueError(
                    "train_group requires covariates: pass covariate_path and use a "
                    "label_path that contains one of the known task names"
                )
            self.covs_unique = self.covs[diagnosis_columns].reset_index().drop_duplicates(subset='ECG_ID').set_index('ECG_ID')
            merged_df = self.df_labels.merge(self.covs_unique[diagnosis_columns], left_index=True, right_index=True, how='left')
            self.df_labels = merged_df[merged_df[train_group] == 1]
        self.dataset_len = len(self.df_labels)

    def assign_lesion_group(self, row):
        '''
        For CMR: map a covariate row to its lesion group.

        `row` must contain every column in SV_COLS, RV_COLS and LV_COLS. Groups are
        checked in that order and the first one with a positive sum is returned.
        '''
        if row.loc[self.SV_COLS].sum() > 0:
            return "Functionally SV"
        if row.loc[self.RV_COLS].sum() > 0:
            return "RV at risk"
        if row.loc[self.LV_COLS].sum() > 0:
            return "LV at risk"
        return "Other"

    def assign_age_group(self, row):
        '''
        Bucket `row['age']` into "<=20", "20-40" or ">40".

        Returns None when the age compares false against every bound (e.g. NaN).
        '''
        age = row['age']
        if age <= 20:
            return "<=20"
        if age <= 40:
            return "20-40"
        if age > 40:
            return ">40"
        return None

    def __getitem__(self, index):
        """Return `(sample, labels)` for row `index`; labels are a float32 array."""
        row = self.df_labels.iloc[index]
        labels = row[self.labels].astype(np.float32).values
        if self.ecg_dataset is None:
            self.ecg_dataset = h5py.File(self.ecg_path, "r")
        # The row's index value is the ECG ID, which is the key into the h5 file.
        sample = self.ecg_dataset[row.name][:]
        # Disabled variant that also returns the diagnoses:
        # if self.covariate_path:
        #     cov_row = self.covs.loc[row.name]
        #     if isinstance(cov_row, pd.DataFrame):
        #         cov_row = cov_row.iloc[0]
        #     diagnoses = [cov_row[col] for col in diagnosis_columns]
        #     diagnoses = torch.tensor(diagnoses, dtype=torch.float32)
        #     return sample, labels, diagnoses
        # else:
        #     return sample, labels
        if self.preprocessing is not None:
            sample = self.preprocessing(sample)
        return sample, labels

    def __len__(self):
        """Return the number of label rows kept after optional filtering."""
        return self.dataset_len


class H5LVEFLabelledDataset(torch.utils.data.Dataset):
    """
    ECG dataset whose single regression target (LVEF) is taken from a covariate file.

    ecg_path: str
        A path to an `h5` file. The keys of the file should be ECG IDs, with corresponding ECG tracing values.
    label_path: str
        A path to a csv file with an `ECG_ID` column listing the ECGs to include.
        If the path contains one of the module-level task names, that task decides
        how the covariate file is indexed.
    covariate_path: str
        A path to a csv file containing covariates including an LVEF column. Required.
    lvef_col: str
        Name of the LVEF column in covariates. Defaults to "LVEF".
    train_group: str or None
        Optional diagnosis column name; when given, only rows whose value in that
        column equals 1 are kept.
    preprocessing: callable or None
        Optional function applied to each raw sample before it is returned.

    Raises
    ------
    ValueError
        If `covariate_path` is None, if no task matches and the covariates have no
        'ECG_ID' column, or if `lvef_col` is missing from the covariates.
    """

    def __init__(self, ecg_path, label_path, covariate_path, lvef_col="LVEF", train_group=None, preprocessing=None):
        if covariate_path is None:
            raise ValueError("covariate_path is required for H5LVEFLabelledDataset")

        self.ecg_path = ecg_path
        self.covariate_path = covariate_path
        self.lvef_col = lvef_col
        self.preprocessing = preprocessing
        # The h5 file is opened on first access in __getitem__, not here.
        self.ecg_dataset = None
        self.df_labels = pd.read_csv(label_path).set_index("ECG_ID")

        # First task name found inside label_path, or None when nothing matches.
        matched_task = next((name for name in task if name in label_path), None)

        if matched_task is None:
            # No task to infer from: the covariates must carry an 'ECG_ID' column already.
            self.covs = pd.read_csv(covariate_path)
            if 'ECG_ID' not in self.covs.columns:
                raise ValueError("Cannot infer ECG ID column in covariates. Please include an 'ECG_ID' column.")
            self.covs = self.covs.set_index('ECG_ID')
        else:
            covariates = pd.read_csv(covariate_path, index_col=ecg_id[matched_task])
            covariates.index = 'BCH' + covariates.index.astype(str)
            covariates = covariates.rename(columns={age_name[matched_task]: 'age'})
            covariates.index.name = 'ECG_ID'
            self.covs = covariates

        if lvef_col not in self.covs.columns:
            raise ValueError(f"'{lvef_col}' not found in covariate file columns")

        # One LVEF value per ECG ID: keep the first occurrence of each index value.
        is_first = ~self.covs.index.duplicated(keep='first')
        cov_lvef = self.covs.loc[is_first, [lvef_col]]

        frame = self.df_labels.merge(cov_lvef, left_index=True, right_index=True, how='inner')

        if train_group:
            deduplicated = self.covs[diagnosis_columns].reset_index().drop_duplicates(subset='ECG_ID')
            self.covs_unique = deduplicated.set_index('ECG_ID')
            frame = frame.merge(self.covs_unique[diagnosis_columns], left_index=True, right_index=True, how='left')
            frame = frame[frame[train_group] == 1]

        # Rows without an LVEF value cannot serve as regression targets.
        self.df_labels = frame.dropna(subset=[lvef_col])

        self.labels = [lvef_col]
        self.n_labels = 1
        self.dataset_len = len(self.df_labels)

    def __len__(self):
        """Return the number of rows that have an LVEF value after filtering."""
        return self.dataset_len

    def __getitem__(self, index):
        """Return `(sample, label)`; `label` is a one-element float32 array."""
        row = self.df_labels.iloc[index]
        label = np.array([row[self.lvef_col]], dtype=np.float32)
        if self.ecg_dataset is None:
            self.ecg_dataset = h5py.File(self.ecg_path, "r")
        # The row's index value is the ECG ID, which is the key into the h5 file.
        sample = self.ecg_dataset[row.name][:]
        if self.preprocessing is None:
            return sample, label
        return self.preprocessing(sample), label

class H5FCLabelledDataset(torch.utils.data.Dataset):
    """
    Load an ECG dataset with labels (Fyler codes).

    The `h5` file is not opened in ``__init__``; it is opened the first time
    ``ecgs`` is accessed (normally the first ``__getitem__`` call), matching the
    other datasets in this module. A consequence is that a bad `ecg_path` is
    reported on first access rather than at construction.

    ecg_path: str
        A path to an `h5` file. The keys of the file should be ECG IDs, with corresponding ECG tracing values.
    label_path: str
        A path to a npz file containing Fyler code labels (one-hot vectors).
    ecg_id_path: str
        A path to a pkl file containing corresponding ECG IDs (each entry corresponds exactly to the same row in label_path).
    code_name_path: str
        A path to a pkl file containing the name of all codes (order of codes corresponds exactly to the columns of the one-hot labels).
    """

    def __init__(self, ecg_path, label_path, ecg_id_path, code_name_path):
        self.ecg_path = ecg_path
        # Backing store for the `ecgs` property; None until first use.
        self._ecgs = None

        self.fyler_code_onehot = load_npz(label_path)
        # NOTE(security): pickle.load can execute arbitrary code from the file.
        # Only point these paths at trusted, locally produced pickles.
        with open(ecg_id_path, 'rb') as f:
            self.ecg_ids = pickle.load(f)
        with open(code_name_path, 'rb') as f:
            self.labels = pickle.load(f)

        self.n_labels = len(self.labels)
        self.dataset_len = len(self.ecg_ids)

    @property
    def ecgs(self):
        """The opened `h5` file, created on first access."""
        if self._ecgs is None:
            self._ecgs = h5py.File(self.ecg_path, "r")
        return self._ecgs

    @ecgs.setter
    def ecgs(self, value):
        # Keeps `dataset.ecgs = ...` working as it did when `ecgs` was a plain attribute.
        self._ecgs = value

    def __getitem__(self, index):
        """Return `(sample, label)`; `label` is the float32 one-hot row for `index`."""
        # Named `record_id` so it does not shadow the module-level `ecg_id` mapping.
        record_id = self.ecg_ids[index]
        sample = self.ecgs[record_id][:]
        label = self.fyler_code_onehot.getrow(index).toarray()[0].astype(np.float32)

        return sample, label

    def __len__(self):
        """Return the number of ECG IDs loaded from `ecg_id_path`."""
        return self.dataset_len


class H5MFMLabelledDataset(torch.utils.data.Dataset):
    """
    Load an MFM dataset with labels. Meant for supervised tasks.

    ecg_path: str
        A path to an `h5` file. The keys of the file should be IDs, with corresponding tracing values.
    label_path: str
        A path to a parquet file with a `PID` column and the label columns.
        Only the IDs in this file are used.
    labels: list[str] or None
        Optionally specify which columns of `label_path` to use for training. By default
        every column whose name contains "pH Cord <" is used.
    train_group: None
        Accepted for signature parity with the other datasets; not used here.
    covariate_path: str or None
        Stored on the instance; not read by this class.
    preprocessing: callable or None
        Optional function applied to each sample after conversion to a tensor.
    """

    def __init__(self, ecg_path, label_path, labels=None, train_group=None, covariate_path=None, preprocessing=None):
        self.df_labels = pd.read_parquet(label_path).set_index("PID")
        if labels is not None:
            self.labels = labels
        else:
            self.labels = [name for name in self.df_labels.columns if "pH Cord <" in name]
        print("training labels:", self.labels)
        self.n_labels = len(self.labels)
        self.dataset_len = len(self.df_labels)
        self.ecg_path = ecg_path
        # The h5 file is opened on first access in __getitem__, not here.
        self.ecg_dataset = None
        self.covariate_path = covariate_path
        self.preprocessing = preprocessing

    def __len__(self):
        """Return the number of rows in the label table."""
        return self.dataset_len

    def __getitem__(self, index):
        """Return `(sample, labels)`: a float32 tensor and a float32 label array."""
        row = self.df_labels.iloc[index]
        # Labels go through int first, then float32, as in the stored table's convention.
        targets = row[self.labels].astype(int).astype(np.float32).values
        if self.ecg_dataset is None:
            self.ecg_dataset = h5py.File(self.ecg_path, "r")
        # The row's index value (PID) is the key into the h5 file.
        tracing = torch.tensor(self.ecg_dataset[row.name][()], dtype=torch.float32)
        if self.preprocessing is None:
            return tracing, targets
        return self.preprocessing(tracing), targets