"""SEAL-CI model.""" import torch import random from tqdm import trange from layers import SEAL from utils import hierarchical_graph_reader, GraphDatasetGenerator from sklearn import metrics import pandas as pd class SEALCITrainer(object): """ Semi-Supervised Graph Classification: A Hierarchical Graph Perspective Cautious Iteration model. """ def __init__(self, args): """ Creating dataset, doing dataset split, creating target and node index vectors. :param args: Arguments object. """ self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") self.device = 'cpu' self.args = args self.macro_graph = hierarchical_graph_reader(self.args.hierarchical_graph) # 大图 self.dataset_generator = GraphDatasetGenerator(self.args.graphs, self.args.feature_which) self._setup_macro_graph() # 大图的边 边给加好了self.macro_graph_edges self._create_split() # 区分了带标签的和不带标签的 self.labled_indices, self.unlabeld_indices self._create_labeled_target() # self.labeled_mask, self.labeled_target self._create_node_indices() # node_indices def _setup_model(self): """ Creating a SEAL model. """ self.dataset_generator.number_of_features = self.dataset_generator.number_of_features self.dataset_generator.number_of_labels = self.dataset_generator.number_of_labels self.model = SEAL(self.args, self.dataset_generator.number_of_features, self.dataset_generator.number_of_labels, self.device).to(self.device) def _setup_macro_graph(self): """ Creating an edge list for the hierarchical graph. """ self.macro_graph_edges = [[edge[0], edge[1]] for edge in self.macro_graph.edges()] self.macro_graph_edges = self.macro_graph_edges + [[edge[1], edge[0]] for edge in self.macro_graph.edges()] self.macro_graph_edges = torch.t(torch.LongTensor(self.macro_graph_edges)) def _create_split(self): """ Creating a labeled-unlabeled split. """ # graph_indices = [index for index in range(len(self.dataset_generator.graphs))] random.seed(2) self.train_indices = [] self.var_indices = [] self.test_indices = [] len_type = [len(i) for i in self.dataset_generator.type_ind] print(f"type:len: {len_type}") for i in self.dataset_generator.type_ind: random.shuffle(i) train_count = min(int(len(i) * 0.7), 1000) var_count = int(len(i) * 0.8) self.train_indices.extend(i[0: train_count]) self.var_indices.extend(i[train_count: var_count]) self.test_indices.extend(i[var_count:]) ''' random.shuffle(graph_indices) labeled_count = int(len(graph_indices) * 0.8) self.labeled_indices = graph_indices[0:labeled_count] # ->参数里的label_count self.unlabeled_indices = graph_indices[labeled_count:] ''' def _create_labeled_target(self): """ Creating a mask for labeled instances and a target for them. """ self.labeled_mask = torch.LongTensor([0 for node in self.macro_graph.nodes()]) self.labeled_target = torch.LongTensor([0 for node in self.macro_graph.nodes()]) indices = torch.LongTensor(self.train_indices) self.labeled_mask[indices] = 1 indices = torch.LongTensor(self.test_indices) self.labeled_mask[indices] = 0 indices = torch.LongTensor(self.var_indices) self.labeled_mask[indices] = 2 self.labeled_target = self.dataset_generator.target dict_train = {} dict_var = {} dict_test = {} # temp = torch.LongTensor([0 for node in self.macro_graph.nodes()]) t = self.labeled_target[self.labeled_mask == 1] print(len(t[t==0])) for i in range(len(self.dataset_generator.label_map)): t = self.labeled_target[self.labeled_mask == 1] dict_train[i] = len(t[t == i]) t = self.labeled_target[self.labeled_mask == 2] dict_var[i] = len(t[t == i]) t = self.labeled_target[self.labeled_mask == 0] dict_test[i] = len(t[t == i]) print(f"train : {dict_train}") print(f"test : {dict_test}") print(f"var : {dict_var}") ''' for i in range(len(self.labeled_target)): a = labeled.item() if i not in dict_labeled: dict_labeled[i] = 0 print(f"labeled : {dict_labeled}") unlabeled_indices = torch.LongTensor(self.unlabeled_indices) self.labeled_target[unlabeled_indices] = self.dataset_generator.target[unlabeled_indices] dict_unlabeled = {} for i in self.dataset_generator.target[unlabeled_indices]: i = i.item() if i not in dict_unlabeled: dict_unlabeled[i] = 0 dict_unlabeled[i] += 1 print(f"unlabeled : {dict_unlabeled}") ''' def _create_node_indices(self): """ Creating an index of nodes. """ self.node_indices = [index for index in range(self.macro_graph.number_of_nodes())] self.node_indices = torch.LongTensor(self.node_indices) def fit_a_single_model(self): """ Fitting a single SEAL model. """ self._setup_model() optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay) for _ in range(self.args.epochs): optimizer.zero_grad() predictions, penalty = self.model(self.dataset_generator.graphs, self.macro_graph_edges) loss = torch.nn.functional.nll_loss(predictions[self.labeled_mask == 1], self.labeled_target[self.labeled_mask == 1]) loss = loss + self.args.gamma*penalty print(f"epochs {_}*****loss: {loss} ") scores, prediction_indices = predictions.max(dim=1) correct = prediction_indices[self.labeled_mask == 1] correct = correct.eq(self.labeled_target[self.labeled_mask == 1]).sum().item() normalizer = prediction_indices[self.labeled_mask == 1].shape[0] accuracy = float(correct)/float(normalizer) print(f"accuracy for train: {accuracy}") correct = prediction_indices[self.labeled_mask == 2] correct = correct.eq(self.labeled_target[self.labeled_mask == 2]).sum().item() normalizer = prediction_indices[self.labeled_mask == 2].shape[0] accuracy = float(correct)/float(normalizer) print(f"accuracy for var: {accuracy}") loss.backward() optimizer.step() def score_a_single_model(self): """ Scoring the SEAL model. """ self.model.eval() predictions, _ = self.model(self.dataset_generator.graphs, self.macro_graph_edges) scores, prediction_indices = predictions.max(dim=1) # 打标签的图数目和没打标签的图数目 print("train: %d" % len(self.labeled_target[self.labeled_mask == 1])) print("test: %d" % len(self.labeled_target[self.labeled_mask == 0])) print("var: %d" % len(self.labeled_target[self.labeled_mask == 2])) correct = prediction_indices[self.labeled_mask == 0] correct = correct.eq(self.labeled_target[self.labeled_mask == 0]).sum().item() normalizer = prediction_indices[self.labeled_mask == 0].shape[0] accuracy = float(correct)/float(normalizer) #scores_test = scores[self.labeled_mask == 0] #scores_test = [scores_test[i].item() for i in range(len(scores_test))] y_true = self.labeled_target[self.labeled_mask == 0] y_pred = prediction_indices[self.labeled_mask == 0] #print(y_true,y_pred) f1 = metrics.f1_score(y_true, y_pred, average='micro') precition = metrics.precision_score(y_true, y_pred, average='micro') recall = metrics.recall_score(y_true, y_pred, average='micro') report = pd.DataFrame(metrics.classification_report(y_true, y_pred, output_dict=True)).transpose() print("accuracy: %s" % accuracy) print("f1_score: %s" % f1) print("precision_score: %s" % precition) print("recall_score: %s" % recall) print(report) return scores, prediction_indices, accuracy, f1, precition, recall, report def _choose_best_candidate(self, predictions, indices): """ Choosing the best candidate based on predictions. :param predictions: Scores. :param indices: Vector of likely labels. :return candidate: Node chosen. :return label: Label of node. """ nodes = self.node_indices[self.labeled_mask == 0] sub_predictions = predictions[self.labeled_mask == 0] sub_predictions, candidate = sub_predictions.max(dim=0) candidate = nodes[candidate] label = indices[candidate] return candidate, label def _update_target(self, candidate, label): """ Adding the new node to the mask and the target is updated with the predicted label. :param candidate: Candidate node identifier. :param label: Label of candidate node. """ self.labeled_mask[candidate] = 1 self.labeled_target[candidate] = label def fit(self): """ Training models sequentially. """ print("\nTraining started.\n") self.fit_a_single_model() ''' budget_size = trange(self.args.budget, desc='Unlabeled Accuracy: ', leave=True) for _ in budget_size: scores, prediction_indices, accuracy = self.score_a_single_model() candidate, label = self._choose_best_candidate(scores, prediction_indices) self._update_target(candidate, label) budget_size.set_description("Unlabeled Accuracy:%g" % round(accuracy, 4)) ''' return self.score_a_single_model() def score(self): """ Scoring the model. """ print("\nModel scoring.\n") scores, prediction_indices, accuracy = self.score_a_single_model() print("Unlabeled Accuracy:%g" % round(accuracy, 4))