From 84129d4e91669fdba2d415260f3e11478d86c7bd Mon Sep 17 00:00:00 2001 From: Peverell <1079572226@qq.com> Date: Sat, 25 Feb 2023 11:00:08 +0800 Subject: [PATCH] add GraphDataProvider --- singleVis/data.py | 300 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 267 insertions(+), 33 deletions(-) diff --git a/singleVis/data.py b/singleVis/data.py index a6af1b4..5e35b33 100644 --- a/singleVis/data.py +++ b/singleVis/data.py @@ -15,7 +15,7 @@ 3. provide data """ class DataProviderAbstractClass(ABC): - + def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period): self.mode = "abstract" self.content_path = content_path @@ -23,7 +23,7 @@ def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period): self.s = epoch_start self.e = epoch_end self.p = epoch_period - + @property @abstractmethod def train_num(self): @@ -41,7 +41,7 @@ def _meta_data(self): @abstractmethod def _estimate_boundary(self): pass - + def update_interval(self, epoch_s, epoch_e): self.s = epoch_s self.e = epoch_e @@ -76,7 +76,7 @@ def test_num(self): del testing_data gc.collect() return test_num - + def _meta_data(self): raise NotImplementedError @@ -88,7 +88,7 @@ class NormalDataProvider(DataProvider): def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name, verbose=1): super().__init__(content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name, verbose) self.mode = "normal" - + @property def representation_dim(self): train_data_loc = os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, self.s), "train_data.npy") @@ -248,7 +248,7 @@ def train_representation(self, epoch): print("no train data saved for Epoch {}".format(epoch)) train_data = None return train_data - + def train_labels(self, epoch): # load train data training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -275,7 +275,7 @@ def test_representation(self, epoch): 
test_data = None # max_x = self.max_norm(epoch) return test_data - + def test_labels(self, epoch): # load train data testing_data_loc = os.path.join(self.content_path, "Testing_data", "testing_dataset_label.pth") @@ -299,7 +299,7 @@ def border_representation(self, epoch): print("no border points saved for Epoch {}".format(epoch)) border_centers = np.array([]) return border_centers - + def test_border_representation(self, epoch): border_centers_loc = os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, epoch), "test_border_centers.npy") @@ -309,7 +309,7 @@ def test_border_representation(self, epoch): print("no border points saved for Epoch {}".format(epoch)) border_centers = np.array([]) return border_centers - + def max_norm(self, epoch): train_data_loc = os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, epoch), "train_data.npy") index_file = os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, epoch), "index.json") @@ -373,7 +373,7 @@ def testing_accu(self, epoch): pred = self.get_pred(epoch, data).argmax(-1) val = evaluate_inv_accu(labels, pred) return val - + def is_deltaB(self, epoch, data): """ check wheter input vectors are lying on delta-boundary or not @@ -384,19 +384,19 @@ def is_deltaB(self, epoch, data): preds = self.get_pred(epoch, data) border = is_B(preds) return border - + def checkpoint_path(self, epoch): path = os.path.join(self.model_path, "{}_{}".format(self.epoch_name, epoch)) return path - + class ActiveLearningDataProvider(DataProvider): def __init__(self, content_path, model, base_epoch_start, device, classes, iteration_name="Iteration",verbose=1): # dummy input as epoch_end and epoch_period super().__init__(content_path, model, base_epoch_start, base_epoch_start, 1, device, classes, iteration_name, verbose) self.mode = "al" self.iteration_name = iteration_name - + def label_num(self, iteration): return len(self.get_labeled_idx(iteration)) @@ -408,7 +408,7 @@ def representation_dim(self, iteration): return 
repr_dim except Exception as e: return None - + def get_labeled_idx(self, iteration): index_file = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "index.json") lb_idxs = np.array(load_labelled_data_index(index_file)) @@ -452,7 +452,7 @@ def _meta_data(self, iteration): save_dir = os.path.join(self.model_path, "time_al.json") if not os.path.exists(save_dir): evaluation = dict() - + else: f = open(save_dir, "r") evaluation = json.load(f) @@ -543,7 +543,7 @@ def train_representation(self, iteration): print("no train data saved for Iteration {}".format(iteration)) train_data = None return train_data - + def train_labels(self, epoch): # load train data training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -554,7 +554,7 @@ def train_labels(self, epoch): print("no train labels saved for Iteration {}".format(epoch)) training_labels = None return training_labels.numpy() - + def train_representation_ulb(self, iteration): # load train data train_data_loc = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "train_data.npy") @@ -567,7 +567,7 @@ def train_representation_ulb(self, iteration): print("no train data saved for Iteration {}".format(iteration)) train_data = None return train_data - + def train_labels_ulb(self, epoch): # load train data training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -580,7 +580,7 @@ def train_labels_ulb(self, epoch): print("no train labels saved for Iteration {}".format(epoch)) training_labels = None return training_labels.numpy() - + def train_representation_all(self, iteration): # load train data train_data_loc = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "train_data.npy") @@ -590,7 +590,7 @@ def train_representation_all(self, iteration): print("no train data saved for Iteration {}".format(iteration)) train_data = None return train_data - + def 
train_labels_all(self, epoch): # load train data training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -614,7 +614,7 @@ def test_representation(self, epoch): test_data = None # max_x = self.max_norm(epoch) return test_data - + def test_labels(self, epoch): # load train data testing_data_loc = os.path.join(self.content_path, "Testing_data", "testing_dataset_label.pth") @@ -638,7 +638,7 @@ def border_representation(self, epoch): print("no border points saved for Iteration {}".format(epoch)) border_centers = np.array([]) return border_centers - + def test_border_representation(self, epoch): border_centers_loc = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, epoch), "test_border_centers.npy") @@ -648,7 +648,7 @@ def test_border_representation(self, epoch): print("no border points saved for Epoch {}".format(epoch)) border_centers = np.array([]) return border_centers - + def max_norm(self, epoch): train_data = self.train_representation(epoch) max_x = np.linalg.norm(train_data, axis=1).max() @@ -699,7 +699,7 @@ def testing_accu(self, epoch): pred = self.get_pred(epoch, data).argmax(1) val = evaluate_inv_accu(labels, pred) return val - + def is_deltaB(self, epoch, data): """ check wheter input vectors are lying on delta-boundary or not @@ -710,11 +710,11 @@ def is_deltaB(self, epoch, data): preds = self.get_pred(epoch, data) border = is_B(preds) return border - + def checkpoint_path(self, epoch): path = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, epoch)) return path - + class DenseActiveLearningDataProvider(ActiveLearningDataProvider): def __init__(self, content_path, model, base_epoch_start, epoch_num, device, classes, iteration_name="Iteration", epoch_name="Epoch", verbose=1): super().__init__(content_path, model, base_epoch_start, device, classes, iteration_name, verbose) @@ -877,7 +877,7 @@ def train_representation_all(self, iteration, epoch): print("no train data saved for 
Iteration {} Epoch {}".format(iteration, epoch)) train_data = None return train_data - + def train_representation_ulb(self, iteration, epoch): # load train data train_data_loc = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "{}_{}".format(self.epoch_name, epoch), "train_data.npy") @@ -891,7 +891,7 @@ def train_representation_ulb(self, iteration, epoch): print("no train data saved for Iteration {}".format(iteration)) train_data = None return train_data - + def train_labels_ulb(self, iteration, epoch): # load train data training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -904,7 +904,7 @@ def train_labels_ulb(self, iteration, epoch): print("no train labels saved for Iteration {}".format(epoch)) training_labels = None return training_labels.numpy() - + def train_labels(self, iteration, epoch): # load labelled train labels training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth") @@ -950,7 +950,7 @@ def border_representation(self, iteration, epoch): print("no border points saved for Epoch {}".format(epoch)) border_centers = np.array([]) return border_centers - + def test_border_representation(self, iteration, epoch): border_centers_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{:d}".format(self.epoch_name, epoch), "test_border_centers.npy") @@ -960,7 +960,7 @@ def test_border_representation(self, iteration, epoch): print("no border points saved for Iteration {} Epoch {}".format(iteration, epoch)) border_centers = np.array([]) return border_centers - + def max_norm(self, iteration, epoch): train_data_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{:d}".format(self.epoch_name, epoch), "train_data.npy") index_file = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "index.json") @@ -1023,7 +1023,7 @@ def testing_accu(self, iteration, epoch): 
class GraphDataProvider(DataProvider):
    """Data provider for graph subject models (``mode == "graph_normal"``).

    Every epoch has its own directory ``<model_path>/<epoch_name>_<epoch>`` that
    holds the checkpoint ``subject_model.pth`` together with the cached
    representations (``train_data.npy`` / ``test_data.npy``), the optional
    border centers and the index files (``index.json`` / ``test_index.json``).
    """

    def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name,
                 verbose=1):
        super().__init__(content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name,
                         verbose)
        self.mode = "graph_normal"

    # ------------------------------------------------------------------ helpers
    def _epoch_dir(self, epoch):
        """Return the directory that stores every artefact of ``epoch``."""
        return os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, epoch))

    def _load_subject_model(self, epoch):
        """Load the checkpoint of ``epoch`` into ``self.model`` and return the model in eval mode."""
        model_location = os.path.join(self._epoch_dir(epoch), "subject_model.pth")
        # Weights are always deserialised on CPU first, then moved to the target device.
        self.model.load_state_dict(torch.load(model_location, map_location=torch.device("cpu")))
        self.model = self.model.to(self.DEVICE)
        self.model.eval()
        return self.model

    # --------------------------------------------------------------- properties
    @property
    def representation_dim(self):
        """Flattened size of one cached train representation of the first epoch.

        :return: product of all dimensions but the first of ``train_data.npy``,
            or None when that file cannot be loaded
        """
        train_data_loc = os.path.join(self._epoch_dir(self.s), "train_data.npy")
        try:
            train_data = np.load(train_data_loc)
        except Exception:
            # best effort: a missing/unreadable cache simply means "unknown"
            return None
        return np.prod(train_data.shape[1:])

    # ------------------------------------------------------------ preprocessing
    def _meta_data(self):
        """Compute and cache train/test representations for every epoch.

        For each epoch in ``range(self.s, self.e + 1, self.p)`` the checkpoint is
        loaded, the model's feature function is run over the training and the
        testing dataset through ``graph_batch_run`` and the results are saved as
        ``train_data.npy`` / ``test_data.npy`` in the epoch directory.  The
        average wall-clock time per epoch is stored under the key
        ``"data_inference"`` in ``<model_path>/time.json``.
        """
        training_data = torch.load(
            os.path.join(self.content_path, "Training_data", "training_dataset_data.pth"), map_location="cpu")
        testing_data = torch.load(
            os.path.join(self.content_path, "Testing_data", "testing_dataset_data.pth"), map_location="cpu")

        time_inference = []
        for n_epoch in range(self.s, self.e + 1, self.p):
            t_s = time.time()

            # feature_function already loads the checkpoint, moves the model to
            # the device and switches to eval mode, so it is not done twice here.
            repr_model = self.feature_function(n_epoch)
            epoch_dir = self._epoch_dir(n_epoch)

            # training data
            data_pool_representation = graph_batch_run(repr_model, training_data)
            np.save(os.path.join(epoch_dir, "train_data.npy"), data_pool_representation)

            # test data
            test_data_representation = graph_batch_run(repr_model, testing_data)
            np.save(os.path.join(epoch_dir, "test_data.npy"), test_data_representation)

            time_inference.append(time.time() - t_s)
            if self.verbose > 0:
                print("Finish inferencing data for Epoch {:d}...".format(n_epoch))

        # the raw datasets are no longer needed; release them before returning
        del training_data
        del testing_data
        gc.collect()

        if not time_inference:
            # Empty epoch range: there is no average to report (the mean would
            # otherwise be a division by zero), so leave time.json untouched.
            return

        average_time = sum(time_inference) / len(time_inference)
        print("Average time for inferencing data: {:.4f}".format(average_time))

        # save result, keeping whatever other entries time.json already holds
        save_dir = os.path.join(self.model_path, "time.json")
        if os.path.exists(save_dir):
            with open(save_dir, "r") as f:
                evaluation = json.load(f)
        else:
            evaluation = dict()
        evaluation["data_inference"] = round(average_time, 3)
        with open(save_dir, 'w') as f:
            json.dump(evaluation, f)

    def _estimate_boundary(self, num, l_bound):
        """No boundary estimation is performed by this provider (no-op)."""
        pass

    def initialize(self, num, l_bound):
        """Run the preprocessing: cache the representations, then estimate the boundary."""
        self._meta_data()
        self._estimate_boundary(num, l_bound)

    # ------------------------------------------------------------------ loaders
    def train_representation(self, epoch):
        """Load the cached train representations of ``epoch`` selected by ``index.json``.

        :param epoch: epoch number
        :return: numpy.ndarray, or None when the cache cannot be loaded
        """
        epoch_dir = self._epoch_dir(epoch)
        index = load_labelled_data_index(os.path.join(epoch_dir, "index.json"))
        try:
            train_data = np.load(os.path.join(epoch_dir, "train_data.npy"))
            train_data = train_data[index]
        except Exception:
            print("no train data saved for Epoch {}".format(epoch))
            train_data = None
        return train_data

    def train_labels(self, epoch):
        """Load the training labels selected by the ``index.json`` of ``epoch``.

        :param epoch: epoch number
        :return: numpy.ndarray, or None when the labels cannot be loaded
        """
        training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
        index = load_labelled_data_index(os.path.join(self._epoch_dir(epoch), "index.json"))
        try:
            training_labels = torch.load(training_data_loc, map_location="cpu")
            training_labels = training_labels[index]
        except Exception:
            print("no train labels saved for Epoch {}".format(epoch))
            # returning None here instead of calling ``.numpy()`` on None
            return None
        return training_labels.numpy()

    def test_representation(self, epoch):
        """Load the cached test representations of ``epoch``.

        When ``test_index.json`` exists in the epoch directory only the rows it
        lists are returned.

        :param epoch: epoch number
        :return: numpy.ndarray, or None when the cache cannot be loaded
        """
        epoch_dir = self._epoch_dir(epoch)
        try:
            test_data = np.load(os.path.join(epoch_dir, "test_data.npy"))
            index_file = os.path.join(epoch_dir, "test_index.json")
            if os.path.exists(index_file):
                index = load_labelled_data_index(index_file)
                test_data = test_data[index]
        except Exception:
            print("no test data saved for Epoch {}".format(epoch))
            test_data = None
        return test_data

    def test_labels(self, epoch):
        """Load the testing labels of ``epoch``.

        When ``test_index.json`` exists in the epoch directory only the labels
        it lists are returned.

        :param epoch: epoch number
        :return: numpy.ndarray, or None when the labels cannot be loaded
        """
        testing_data_loc = os.path.join(self.content_path, "Testing_data", "testing_dataset_label.pth")
        try:
            # The result is converted to numpy anyway, so load straight onto the
            # CPU instead of round-tripping through self.DEVICE.
            testing_labels = torch.load(testing_data_loc, map_location="cpu")
            index_file = os.path.join(self._epoch_dir(epoch), "test_index.json")
            if os.path.exists(index_file):
                idxs = load_labelled_data_index(index_file)
                testing_labels = testing_labels[idxs]
        except Exception:
            print("no test labels saved for Epoch {}".format(epoch))
            # returning None here instead of calling ``.cpu()`` on None
            return None
        return testing_labels.numpy()

    def border_representation(self, epoch):
        """Load ``border_centers.npy`` of ``epoch``; an empty array when it cannot be loaded."""
        border_centers_loc = os.path.join(self._epoch_dir(epoch), "border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            print("no border points saved for Epoch {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def test_border_representation(self, epoch):
        """Load ``test_border_centers.npy`` of ``epoch``; an empty array when it cannot be loaded."""
        border_centers_loc = os.path.join(self._epoch_dir(epoch), "test_border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            print("no border points saved for Epoch {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def max_norm(self, epoch):
        """Largest L2 norm (along axis 1) among the indexed train representations of ``epoch``.

        :param epoch: epoch number
        :return: the maximum norm, or None when the cache cannot be loaded
        """
        epoch_dir = self._epoch_dir(epoch)
        index = load_labelled_data_index(os.path.join(epoch_dir, "index.json"))
        try:
            train_data = np.load(os.path.join(epoch_dir, "train_data.npy"))
            train_data = train_data[index]
            max_x = np.linalg.norm(train_data, axis=1).max()
        except Exception:
            print("no train data saved for Epoch {}".format(epoch))
            max_x = None
        return max_x

    # ---------------------------------------------------------- model functions
    def prediction_function(self, epoch):
        """Load the checkpoint of ``epoch`` and return the model's ``prediction`` callable."""
        return self._load_subject_model(epoch).prediction

    def feature_function(self, epoch):
        """Load the checkpoint of ``epoch`` and return the model's ``feature`` callable."""
        return self._load_subject_model(epoch).feature

    def get_pred(self, epoch, data):
        """Get the prediction scores of the model of ``epoch`` for ``data``.

        :param epoch: epoch number whose checkpoint is used
        :param data: numpy.ndarray
        :return: pred, the output of ``batch_run`` over the prediction function
        """
        prediction_func = self.prediction_function(epoch)

        data = torch.from_numpy(data)
        data = data.to(self.DEVICE)
        pred = batch_run(prediction_func, data)
        return pred

    # --------------------------------------------------------------- evaluation
    def training_accu(self, epoch):
        """Evaluate the model of ``epoch`` on the train representations with ``evaluate_inv_accu``."""
        data = self.train_representation(epoch)
        labels = self.train_labels(epoch)
        pred = self.get_pred(epoch, data).argmax(-1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def testing_accu(self, epoch):
        """Evaluate the model of ``epoch`` on the test representations with ``evaluate_inv_accu``."""
        data = self.test_representation(epoch)
        # test_labels() already restricts the labels to test_index.json (when it
        # exists), exactly like test_representation() does for the data, so the
        # index must not be applied a second time here.
        labels = self.test_labels(epoch)
        pred = self.get_pred(epoch, data).argmax(-1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def is_deltaB(self, epoch, data):
        """Check whether input vectors are lying on the delta-boundary or not.

        :param epoch: epoch number whose checkpoint is used
        :param data: numpy.ndarray
        :return: numpy.ndarray, boolean, True stands for is_delta_boundary
        """
        preds = self.get_pred(epoch, data)
        border = is_B(preds)
        return border

    def checkpoint_path(self, epoch):
        """Return the checkpoint directory of ``epoch``."""
        path = os.path.join(self.model_path, "{}_{}".format(self.epoch_name, epoch))
        return path