Source code for SocialED.detector.hcrc

import gc
import random
import numpy as np
import os
import time
from datetime import datetime
from collections import deque
from sklearn import metrics
from scipy.sparse import csr_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import optim
from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
import pickle
import copy
import argparse
import networkx as nx
import scipy.sparse as sp
from torch_geometric import loader
import spacy
import pandas as pd
import networkx as nx
from torch_geometric.data import Data
from torch_geometric import loader
from torch_geometric.nn import GCNConv
from torch.distributions import Categorical, MultivariateNormal
from sklearn.metrics import silhouette_score,calinski_harabasz_score
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import DatasetLoader,Event2012

def currentTime():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    now = datetime.now()
    return now.strftime(timestamp_format)
class HCRC:
    r"""The HCRC model for social event detection.

    HCRC combines node-level and graph-level representation learning with a
    single-pass clustering step whose threshold can be chosen by
    reinforcement learning.

    .. note::
        The ``dataset`` object must provide a ``load_data()`` method; its
        return value is stored on the instance.

    Parameters
    ----------
    dataset : object
        Dataset object exposing ``load_data()``.
    file_path : str, optional
        Path to save model files. Default: ``'../model/model_saved/hcrc/'``.
    result_path : str, optional
        Path of the results file.
        Default: ``'../model/model_saved/hcrc/res.txt'``.
    task : str, optional
        Clustering strategy passed to ``evaluate_fun`` (``'DRL'``,
        ``'random'``, ``'semi-supervised'`` or ``'traditional'``).
        Default: ``'DRL'``.
    layers : str, optional
        Hidden layer dimensions written as a Python list literal.
        Default: ``'[256]'``.
    N_pred_hid : int, optional
        Hidden width of the node-level projection head. Default: ``64``.
    G_pred_hid : int, optional
        Hidden width of the graph-level projection head. Default: ``16``.
    eval_freq : float, optional
        Evaluation frequency. Default: ``5``.
    mad : float, optional
        Moving average decay rate. Default: ``0.9``.
    Glr : float, optional
        Learning rate of the graph-level model. Default: ``0.0000006``.
    Nlr : float, optional
        Learning rate of the node-level model. Default: ``0.00001``.
    Ges : int, optional
        Graph model early stopping patience. Default: ``50``.
    Nes : int, optional
        Node model early stopping patience. Default: ``2000``.
    Gepochs : int, optional
        Number of graph model training epochs. Default: ``105``.
    Nepochs : int, optional
        Number of node model training epochs. Default: ``100``.
    device : int, optional
        GPU device ID to use. Default: ``0``.
    """

    def __init__(self,
                 dataset,
                 # file path parameters
                 file_path: str = '../model/model_saved/hcrc/',
                 result_path: str = '../model/model_saved/hcrc/res.txt',
                 # task parameters
                 task: str = 'DRL',
                 # model structure parameters
                 layers: str = '[256]',
                 N_pred_hid: int = 64,
                 G_pred_hid: int = 16,
                 # training and evaluation parameters
                 eval_freq: float = 5,
                 mad: float = 0.9,
                 Glr: float = 0.0000006,
                 Nlr: float = 0.00001,
                 Ges: int = 50,
                 Nes: int = 2000,
                 Gepochs: int = 105,
                 Nepochs: int = 100,
                 # device parameters
                 device: int = 0):
        # The dataset is materialised eagerly; everything else is stored as-is.
        self.dataset = dataset.load_data()

        # file path parameters
        self.file_path = file_path
        self.result_path = result_path

        # task parameters
        self.task = task

        # model structure parameters
        self.layers = layers
        self.N_pred_hid = N_pred_hid
        self.G_pred_hid = G_pred_hid

        # training and evaluation parameters
        self.eval_freq = eval_freq
        self.mad = mad
        self.Glr = Glr
        self.Nlr = Nlr
        self.Ges = Ges
        self.Nes = Nes
        self.Gepochs = Gepochs
        self.Nepochs = Nepochs

        # device parameters
        self.device = device

    def fit(self):
        """No-op; training happens per message block inside ``detection``."""
        return None

    def detection(self):
        """Train and cluster the 22 message blocks one after another.

        For every block the node-level and graph-level trainers are run (in
        that order), their embeddings are concatenated feature-wise, stacked
        under the embeddings of all earlier blocks, L2-normalised and handed
        to ``evaluate_fun``.

        Returns
        -------
        tuple
            ``(predictions, ground_truths)``: the concatenated per-block
            predicted labels and the concatenated per-block true labels.
        """
        all_embeddings = None
        all_label = None
        all_pred_y = None

        for block in range(22):
            print(f"************Message Block {block} start! ************")

            # Node-level learning
            node_emb, label = Node_ModelTrainer(self, block).get_embedding()
            # Graph-level learning (its labels are the ones used below)
            graph_emb, label = Graph_ModelTrainer(self, block).get_embedding()

            # Combine the two views feature-wise, then append to the history.
            block_emb = np.concatenate((graph_emb, node_emb), axis=1)
            if block == 0:
                stacked = block_emb
                all_label = label
            else:
                stacked = np.concatenate((all_embeddings, block_emb), axis=0)
                all_label = all_label + label

            normalised = F.normalize(torch.tensor(stacked), dim=-1, p=2)
            all_embeddings = normalised.detach().cpu().numpy()

            # ``all_pred_y`` is still None for the first block.
            pred_y = evaluate_fun(all_embeddings, label, block, all_pred_y,
                                  self.result_path, self.task)
            all_pred_y = pred_y if block == 0 else all_pred_y + pred_y

            print(f"************Message Block {block} end! ************\n\n")

        return all_pred_y, all_label

    def evaluate(self, predictions, ground_truths):
        """Print and return ARI, AMI and NMI of ``predictions``.

        Returns
        -------
        tuple
            ``(ars, ami, nmi)``.
        """
        print("************Evaluation start! ************")
        scorers = (
            metrics.adjusted_rand_score,
            metrics.adjusted_mutual_info_score,
            metrics.normalized_mutual_info_score,
        )
        ars, ami, nmi = (score(ground_truths, predictions) for score in scorers)
        print(f"Model Adjusted Rand Index (ARI): {ars}")
        print(f"Model Adjusted Mutual Information (AMI): {ami}")
        print(f"Model Normalized Mutual Information (NMI): {nmi}")
        return ars, ami, nmi
class EMA:
    """Exponential moving average helper.

    Parameters
    ----------
    beta : float
        Weight kept from the old value on every update.
    epochs : int
        Stored as ``total_steps``; not used by ``update_average``.
    """

    def __init__(self, beta, epochs):
        super().__init__()
        self.beta = beta
        self.step = 0
        self.total_steps = epochs

    def update_average(self, old, new):
        """Return ``beta * old + (1 - beta) * new``, or ``new`` if ``old`` is None."""
        if old is not None:
            return old * self.beta + (1 - self.beta) * new
        return new
def get_task(strs):
    """Pick the task name from an argv-style sequence.

    Returns ``"DRL"`` when no task is given (one element, or two elements
    including ``"--task"``), the last element when ``"--task"`` is present in
    a three-element sequence and names a known task, and ``False`` otherwise.
    """
    known_tasks = ("DRL", "random", "semi-supervised", "traditional")
    count = len(strs)

    if count == 1 or (count == 2 and "--task" in strs):
        return "DRL"
    if count != 3 or "--task" not in strs:
        return False

    requested = strs[-1]
    return requested if requested in known_tasks else False
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; ignore every other module.

    Weights get Xavier-uniform initialisation and the bias is filled with
    ``0.01``. Intended for use with ``module.apply(init_weights)``.

    Parameters
    ----------
    m : torch.nn.Module
        Module visited by ``apply``. Only exact ``nn.Linear`` instances are
        touched (subclasses are skipped, as before).
    """
    if type(m) is nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with ``bias=False`` have ``bias is None``; filling it
        # unconditionally used to raise AttributeError.
        if m.bias is not None:
            m.bias.data.fill_(0.01)
def sim(z1, z2):
    """Return the matrix of cosine similarities between rows of ``z1`` and ``z2``.

    Both inputs are L2-normalised with ``F.normalize`` defaults before the
    matrix product, so entry ``[i, j]`` compares row ``i`` of ``z1`` with row
    ``j`` of ``z2``.
    """
    unit_rows_a = F.normalize(z1)
    unit_rows_b = F.normalize(z2)
    return torch.mm(unit_rows_a, unit_rows_b.t())
def semi_loss(z1, z2):
    """Per-row contrastive loss of ``z1`` against ``z2``.

    Similarities are exponentiated with temperature ``0.05``. For each row
    the positive is the matching row of ``z2``; the denominator sums all
    cross-view terms plus all same-view terms except the row's similarity
    with itself.
    """
    temperature = 0.05
    intra_view = torch.exp(sim(z1, z1) / temperature)
    cross_view = torch.exp(sim(z1, z2) / temperature)

    positives = cross_view.diag()
    denominator = intra_view.sum(1) + cross_view.sum(1) - intra_view.diag()
    return -torch.log(positives / denominator)
def get_loss(h1, h2):
    """Return the symmetric contrastive loss between two views as a scalar.

    ``semi_loss`` is evaluated in both directions, the two results are
    averaged element-wise, and the mean over rows is returned.
    """
    forward_term = semi_loss(h1, h2)
    backward_term = semi_loss(h2, h1)
    return ((forward_term + backward_term) * 0.5).mean()
def update_moving_average(ema_updater, ma_model, current_model):
    """Blend ``current_model``'s parameters into ``ma_model`` in place.

    Parameters are paired positionally; each moving-average parameter's
    ``.data`` is replaced by ``ema_updater.update_average(old, new)``.
    ``current_model`` is left untouched.
    """
    paired = zip(current_model.parameters(), ma_model.parameters())
    for source_param, target_param in paired:
        target_param.data = ema_updater.update_average(target_param.data, source_param.data)
def set_requires_grad(model, val):
    """Set ``requires_grad`` to ``val`` on every parameter of ``model``."""
    for parameter in model.parameters():
        parameter.requires_grad = val
def enumerateConfig(args):
    """Return ``(names, values)`` for the instance attributes of ``args``.

    Names follow the order of ``vars(args)``; values are read with
    ``getattr`` and are in the same order.
    """
    args_names = list(vars(args))
    args_vals = [getattr(args, name) for name in args_names]
    return args_names, args_vals
def config2string(args):
    """Serialise a config object as ``name_value_name_value...``.

    Attributes whose value compares equal to ``False`` are skipped, and so
    are a fixed set of names (``device``, ``root``, ...). The trailing
    underscore is removed.
    """
    excluded = ('device', 'root', 'epochs', 'isAnneal', 'dropout', 'warmup_step', 'clus_num_iters')
    names, values = enumerateConfig(args)

    pieces = []
    for name, val in zip(names, values):
        # Equality (not identity) on purpose: 0 and 0.0 are skipped as well.
        if val == False:  # noqa: E712
            continue
        if name not in excluded:
            pieces.append("{}_{}_".format(name, val))
    return "".join(pieces)[:-1]
def printConfig(args):
    """Print the attribute names of ``args``, then their values, as two lists."""
    names, values = enumerateConfig(args)
    for row in (names, values):
        print(row)
class embedder:
    """Base class of the model trainers; stores ``args`` and the parsed layer sizes.

    Attributes
    ----------
    args : object
        The configuration object passed in.
    hidden_layers : object
        Result of evaluating the ``args.layers`` string (``[256]`` for the
        default ``'[256]'``).
    """

    def __init__(self, args):
        self.args = args
        layer_spec = args.layers
        # NOTE(review): eval() runs arbitrary Python; ``layers`` must only
        # ever come from trusted configuration. ``ast.literal_eval`` would be
        # the safe choice if only list literals need to be supported.
        self.hidden_layers = eval(layer_spec)
class Encoder(nn.Module):
    """Stack of ``GCNConv -> BatchNorm1d -> PReLU`` layers.

    Parameters
    ----------
    layer_config : sequence of int
        Feature sizes; consecutive entries give the input and output width
        of each layer, so ``len(layer_config) - 1`` layers are built.
    """

    def __init__(self, layer_config):
        super().__init__()
        # (in_dim, out_dim) for every consecutive pair of sizes.
        widths = list(zip(layer_config[:-1], layer_config[1:]))

        self.stacked_gnn = nn.ModuleList(
            [GCNConv(in_dim, out_dim) for in_dim, out_dim in widths])
        self.stacked_bns = nn.ModuleList(
            [nn.BatchNorm1d(out_dim, momentum=0.01) for _, out_dim in widths])
        self.stacked_prelus = nn.ModuleList(
            [nn.PReLU() for _ in widths])

    def forward(self, x, edge_index):
        """Run ``x`` through every layer using ``edge_index`` (no edge weights)."""
        stages = zip(self.stacked_gnn, self.stacked_bns, self.stacked_prelus)
        for conv, norm, activation in stages:
            x = conv(x, edge_index, edge_weight=None)
            x = norm(x)
            x = activation(x)
        return x
# Per-block boundaries used by the *_cluster functions in this module:
# M[0] is passed as the size of the initial message block and
# M[i] - M[i - 1] as the size of block i, so the values are presumably
# cumulative message counts of the 22 blocks (TODO confirm against the dataset).
M =[20254,28976,30467,32302,34312,36146,37422,42700,44260,45623,46719, 47951,51188,53160,56116,58665,59575,62251,64138,65537,66430,68840]
def DRL_cluster(all_embeddings, block_num, pred_label):
    """Cluster the newest message block with a PPO-chosen threshold.

    Block 0 is clustered with the fixed threshold ``0.87``. For later blocks
    a fresh ``PPO`` agent is first run through a simulation pass
    (``sim=True``) and then used by a second ``SinglePass`` to pick the
    threshold and cluster the block.

    Returns
    -------
    tuple
        ``(cluster_result, sim_threshold)`` of the final ``SinglePass``.
    """
    para = 0.1

    if block_num == 0:
        print("Evaluating initial message block...")
        tic = time.time()
        sp = SinglePass(0.87, all_embeddings, 0, pred_label, M[0], None, para, 0, sim=False)
        elapsed = time.time() - tic
        print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
        return sp.cluster_result, sp.sim_threshold

    print("Using DRL-Single-Pass to learn threshold...")
    agent = PPO([5], 1, continuous=True)
    block_size = M[block_num] - M[block_num - 1]
    sim_start = M[block_num - 1] - 2000

    # Simulation pass: run only for its effect on ``agent``.
    SinglePass(0.6, all_embeddings, 1, pred_label, block_size, agent, para, sim_start, sim=True)
    # Real pass: the agent selects the threshold, then the block is clustered.
    sp = SinglePass(0.6, all_embeddings, 1, pred_label, block_size, agent, para, sim_start, sim=False)
    return sp.cluster_result, sp.sim_threshold
def random_cluster(all_embeddings, block_num, pred_label):
    """Cluster the newest block with a threshold drawn uniformly from [0.6, 0.8].

    Block 0 always uses the fixed threshold ``0.87``.

    Returns
    -------
    tuple
        ``(cluster_result, threshold)``.
    """
    # Drawn unconditionally so the random stream is consumed for block 0 too.
    threshold = random.uniform(0.6, 0.8)

    if block_num == 0:
        print("Evaluating initial message block...")
        threshold = 0.87
        flag, block_size = 0, M[0]
    else:
        print("Evaluating message block...")
        flag, block_size = 2, M[block_num] - M[block_num - 1]

    tic = time.time()
    sp = SinglePass(threshold, all_embeddings, flag, pred_label, block_size, None, 0, 0, sim=False)
    elapsed = time.time() - tic
    print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
    return sp.cluster_result, threshold
def semi_cluster(all_embeddings, label, block_num, pred_label):
    """Cluster the newest block with a threshold tuned on a labelled prefix.

    Block 0 uses the fixed threshold ``0.87``. For later blocks the last 90%
    of the new block is cut off, each candidate threshold is scored by NMI
    on what remains, and the best candidate is then used on the full block.

    Returns
    -------
    tuple
        ``(cluster_result, threshold)``.
    """
    if block_num == 0:
        print("Evaluating initial message block...")
        tic = time.time()
        sp = SinglePass(0.87, all_embeddings, 0, pred_label, M[0], None, 0, 0, sim=False)
        elapsed = time.time() - tic
        print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
        return sp.cluster_result, 0.87

    print("Evaluating message block...")
    tic = time.time()
    block_size = M[block_num] - M[block_num - 1]

    # Keep the history plus the first part of the new block.
    rows = all_embeddings.tolist()
    rows = rows[0:len(rows) - int(block_size * 0.9)]
    prefix_labels = pred_label[0:len(rows)]
    prefix_size = len(rows) - M[block_num - 1]
    prefix_embeddings = np.array(rows)

    candidates = [0.6, 0.65, 0.7, 0.75, 0.8]
    scores = []
    for candidate in candidates:
        trial = SinglePass(candidate, prefix_embeddings, 2, prefix_labels, prefix_size, None, 0, 0, sim=False)
        true_label = label[0:len(trial.cluster_result)]
        scores.append(metrics.normalized_mutual_info_score(
            true_label, trial.cluster_result, average_method='arithmetic'))

    # First candidate reaching the best score wins.
    threshold = candidates[scores.index(max(scores))]
    sp = SinglePass(threshold, all_embeddings, 2, pred_label, block_size, None, 0, 0, sim=False)
    elapsed = time.time() - tic
    print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
    return sp.cluster_result, threshold
def NMI_cluster(all_embeddings, label, block_num, pred_label):
    """Cluster the newest block with the candidate threshold of highest NMI.

    Block 0 uses the fixed threshold ``0.87``. For later blocks every
    candidate threshold is tried on the whole block and scored against
    ``label``; the block is then clustered once more with the winner.

    Returns
    -------
    tuple
        ``(cluster_result, threshold)``.
    """
    if block_num == 0:
        print("Evaluating initial message block...")
        tic = time.time()
        sp = SinglePass(0.87, all_embeddings, 0, pred_label, M[0], None, 0, 0, sim=False)
        elapsed = time.time() - tic
        print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
        return sp.cluster_result, 0.87

    print("Evaluating message block...")
    tic = time.time()
    block_size = M[block_num] - M[block_num - 1]

    candidates = [0.6, 0.65, 0.7, 0.75, 0.8]
    scores = []
    for candidate in candidates:
        trial = SinglePass(candidate, all_embeddings, 2, pred_label, block_size, None, 0, 0, sim=False)
        scores.append(metrics.normalized_mutual_info_score(
            label, trial.cluster_result, average_method='arithmetic'))

    # First candidate reaching the best score wins.
    threshold = candidates[scores.index(max(scores))]
    sp = SinglePass(threshold, all_embeddings, 2, pred_label, block_size, None, 0, 0, sim=False)
    elapsed = time.time() - tic
    print("Done! " + "It takes "+str(int(elapsed))+" seconds.\n")
    return sp.cluster_result, threshold
def evaluate_fun(all_embeddings, label, block_num, pred_label, result_path, task):
    """Cluster one message block, report NMI/AMI/ARI and log them to a file.

    Parameters
    ----------
    all_embeddings : numpy.ndarray
        Embeddings handed to the selected clustering routine.
    label : sequence
        Ground-truth labels the predictions are scored against.
    block_num : int
        Index of the message block being evaluated.
    pred_label : list or None
        Predicted labels of all earlier blocks (``None`` for block 0).
    result_path : str
        Text file the scores are appended to. Existing content is kept
        (right-stripped); missing parent directories are created.
    task : str
        One of ``"DRL"``, ``"random"``, ``"semi-supervised"``,
        ``"traditional"``.

    Returns
    -------
    list
        Predicted cluster labels for the block.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported names. Previously this
        surfaced later as an ``UnboundLocalError``.
    """
    if task == "DRL":
        y_pred, threshold = DRL_cluster(all_embeddings, block_num, pred_label)
    elif task == "random":
        y_pred, threshold = random_cluster(all_embeddings, block_num, pred_label)
    elif task == "semi-supervised":
        y_pred, threshold = semi_cluster(all_embeddings, label, block_num, pred_label)
    elif task == "traditional":
        y_pred, threshold = NMI_cluster(all_embeddings, label, block_num, pred_label)
    else:
        raise ValueError(
            "Unknown task {!r}; expected 'DRL', 'random', 'semi-supervised' "
            "or 'traditional'".format(task))

    # NMI
    s1 = metrics.normalized_mutual_info_score(label, y_pred, average_method='arithmetic')
    # AMI
    s2 = metrics.adjusted_mutual_info_score(label, y_pred, average_method='arithmetic')
    # ARI
    s3 = metrics.adjusted_rand_score(label, y_pred)

    print('** Theta:{:.2f} **\n'.format(threshold))
    print('** NMI: {:.2f} **\n'.format(s1))
    print('** AMI: {:.2f} **\n'.format(s2))
    print('** ARI: {:.2f} **\n'.format(s3))

    result = ('\nmessage_block_' + str(block_num)
              + '\nthreshold: {:.2f} '.format(threshold)
              + '\n** NMI: {:.2f} **\n'.format(s1)
              + '** AMI: {:.2f} **\n'.format(s2)
              + '** ARI: {:.2f} **\n'.format(s3))

    # Keep whatever earlier blocks already logged.
    previous = ''
    if os.path.exists(result_path):
        with open(result_path, encoding='utf-8') as file:
            previous = file.read().rstrip()
    else:
        # First write: the default result path lives in a directory that may
        # not exist yet, which made the original open() fail.
        parent = os.path.dirname(result_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Write with the same encoding used for reading, and always close the file.
    with open(result_path, mode='w', encoding='utf-8') as file:
        file.write(previous + result)

    return y_pred
class SinglePass:
    """Single-pass clustering of a new message block on top of earlier ones.

    Each new vector is compared (dot product) with every vector seen so far
    and joins the cluster of its most similar vector if that similarity
    reaches ``sim_threshold``; otherwise it opens a new cluster.

    Parameters
    ----------
    sim_threshold : float
        Similarity threshold (replaced by the agent's choice when ``flag``
        is 1).
    data : numpy.ndarray
        All embeddings; the last ``size`` rows are the new block, the rest
        is history.
    flag : int
        ``0``: cluster the new block from scratch. ``2``: cluster it on top
        of the history and ``label``. Any other value (``1`` in this
        module): reinforcement-learning mode using ``agent``.
    label : list or None
        Cluster ids of the history rows.
    size : int
        Number of rows of the new block.
    agent : object or None
        RL agent exposing ``select_action``, ``add_buffer`` and ``learn``.
    para : float
        Fraction of the new block visited by the simulation pass.
    sim_init : int
        Start offset of the window used for RL state and reward in
        simulation.
    sim : bool, optional
        ``True`` runs the simulation pass that feeds the agent; ``False``
        asks the agent for a threshold and clusters the block.
    global_step : int, optional
        Initial step counter for the simulation pass.

    Attributes
    ----------
    cluster_result : list
        Cluster ids of the new block.
    sim_threshold : float
        Threshold finally in effect.
    """

    def __init__(self, sim_threshold, data, flag, label, size, agent, para, sim_init, sim=False, global_step=0):
        # Kept for compatibility; nothing in this class reads it.
        self.device = torch.device('cuda:0')
        self.text_vec = None
        self.topic_serial = None
        self.topic_cnt = 0
        self.sim_threshold = sim_threshold

        split = data.shape[0] - size
        self.done_data = data[0:split]
        self.new_data = data[split:]
        self.done_label = label

        if flag == 0 or flag == 2:
            self.cluster_result = self.run_cluster(flag, size)
            return

        self.agent = agent
        self.scheme = ["state", "action", "reward", "done", "log_prob"]
        self.global_step = global_step
        self.sim = sim

        if self.sim:
            start_time = time.time()
            self.cluster_result = self.run_cluster_sim(flag, size, para, sim_init, sim, data)
            self.time = time.time() - start_time
            print("Creating Environment Done! " + "It takes "+str(int(self.time))+" seconds.")
            return

        start_time = time.time()
        # Pseudo labels of the new block give the agent its observation.
        self.pseudo_labels = self.run_cluster_init(0.6, size)
        if flag == 1:
            self.text_vec = self.done_data
            self.topic_serial = copy.deepcopy(self.done_label)
            self.topic_cnt = max(self.topic_serial)
        state = self.get_state(sim, sim_init, data)
        action, action_log_prob = self.agent.select_action(state)
        # Action projection: clamp to [-1, 1], then map to [0.6, 0.8].
        projected = torch.clamp(action, -1, 1).detach()
        projected += 7
        projected /= 10
        self.sim_threshold = projected.item()
        self.time = time.time() - start_time
        print("Getting Threshold Done! " + "It takes "+str(int(self.time))+" seconds. ")
        print("Threshold is "+str(self.sim_threshold)+".\n")

        print("Evaluating message block...")
        start_time = time.time()
        self.cluster_result = self.run_cluster(flag, size)
        self.time = time.time() - start_time
        print("Done! " + "It takes "+str(int(self.time))+" seconds.\n")

    def _reset_state(self, use_history):
        """Clear the running clustering state, optionally preloading history."""
        self.text_vec = []
        self.topic_serial = []
        self.topic_cnt = 0
        if use_history:
            self.text_vec = self.done_data
            # Deep copy so appending never mutates the caller's label list.
            self.topic_serial = copy.deepcopy(self.done_label)
            self.topic_cnt = max(self.topic_serial)

    def clustering(self, sen_vec):
        """Assign ``sen_vec`` to a cluster using ``self.sim_threshold``."""
        if self.topic_cnt == 0:
            # Very first vector: it becomes cluster 1.
            self.text_vec = sen_vec
            self.topic_cnt += 1
            self.topic_serial = [self.topic_cnt]
            return

        sim_vec = np.dot(sen_vec, self.text_vec.T)
        max_value = np.max(sim_vec)
        nearest_topic = self.topic_serial[np.argmax(sim_vec)]
        self.text_vec = np.vstack([self.text_vec, sen_vec])
        if max_value >= self.sim_threshold:
            self.topic_serial.append(nearest_topic)
        else:
            self.topic_cnt += 1
            self.topic_serial.append(self.topic_cnt)

    def clustering_init(self, t, sen_vec):
        """Same rule as ``clustering`` with threshold ``t`` on the ``*_init`` state."""
        if self.topic_cnt_init == 0:
            self.text_vec_init = sen_vec
            self.topic_cnt_init += 1
            self.topic_serial_init = [self.topic_cnt_init]
            return

        sim_vec = np.dot(sen_vec, self.text_vec_init.T)
        max_value = np.max(sim_vec)
        nearest_topic = self.topic_serial_init[np.argmax(sim_vec)]
        self.text_vec_init = np.vstack([self.text_vec_init, sen_vec])
        if max_value >= t:
            self.topic_serial_init.append(nearest_topic)
        else:
            self.topic_cnt_init += 1
            self.topic_serial_init.append(self.topic_cnt_init)

    def run_cluster_init(self, t, size):
        """Cluster the new block alone with threshold ``t``; return its labels."""
        self.text_vec_init = []
        self.topic_serial_init = []
        self.topic_cnt_init = 0
        for vec in self.new_data:
            self.clustering_init(t, vec)
        return self.topic_serial_init

    def run_cluster_sim(self, flag, size, para, sim_init, sim, data):
        """Simulation pass: cluster a prefix of the new block under agent control.

        For each visited vector the agent picks a threshold, the vector is
        clustered, the Calinski-Harabasz score is stored as reward and the
        transition is pushed to the agent's buffer. The agent learns whenever
        ``global_step`` is a multiple of 200. At most the first 201 vectors,
        and no more than the fraction ``para`` of the block, are visited.
        """
        self._reset_state(flag == 1)
        for i, vec in enumerate(self.new_data):
            self.global_step += 1
            if i > 200:
                break
            if i > self.new_data.shape[0] * para:
                break
            state = self.get_state(sim, sim_init, data)
            action, action_log_prob = self.agent.select_action(state)
            # NOTE(review): the raw action is used as threshold here, while
            # __init__ applies the (action + 7) / 10 projection. Confirm
            # this asymmetry is intended.
            self.sim_threshold = action.item()
            self.clustering(vec)
            reward = self.get_reward(sim_init, data)
            done = False
            transition = make_transition(self.scheme, state, action, reward, done, action_log_prob)
            self.agent.add_buffer(transition)
            if self.global_step % 200 == 0:
                self.agent.learn()
        return self.topic_serial[len(self.topic_serial) - size:]

    def run_cluster(self, flag, size):
        """Cluster the whole new block; return the last ``size`` labels."""
        self._reset_state(flag == 1 or flag == 2)
        for vec in self.new_data:
            self.clustering(vec)
        return self.topic_serial[len(self.topic_serial) - size:]

    def get_center(self, label, data):
        """Return ``(centers, indexs_per_cluster)`` for the rows of ``data``.

        ``centers[k]`` is the mean row (as a list) of the k-th distinct label
        and ``indexs_per_cluster[k]`` the row indices carrying that label.
        Only the first ``data.shape[0]`` labels are matched against rows.
        """
        row_labels = np.asarray(label[:data.shape[0]])
        centers = []
        indexs_per_cluster = []
        for cluster_id in list(set(label)):
            member_mask = row_labels == cluster_id
            centers.append(np.mean(data[member_mask], 0).tolist())
            indexs_per_cluster.append(np.flatnonzero(member_mask).tolist())
        return centers, indexs_per_cluster

    def get_info_cluster(self, text_vec, indexs_per_cluster):
        """Return one array of member vectors per cluster."""
        return [np.array([text_vec[index] for index in members])
                for members in indexs_per_cluster]

    def get_state(self, sim, sim_init, data):
        """Build the 5-dimensional RL observation.

        Entries, in order: minimum centre-to-centre dot product, average
        separation term, Dunn-style ratio, average cohesion term, silhouette
        score (euclidean).
        """
        if sim:
            data = data[sim_init:len(self.topic_serial)]
            topic_serial = self.topic_serial[sim_init:]
        else:
            data = self.new_data
            topic_serial = self.pseudo_labels

        centers, indexs_per_cluster = self.get_center(topic_serial, data)
        centers = np.array(centers)
        neighbor_dists = np.dot(centers, centers.T)
        # ``nan`` must be passed by keyword: the second positional argument
        # of np.nan_to_num is ``copy``, so the former call replaced NaN with
        # 0.0 instead of the intended 0.0001.
        neighbor_dists = np.nan_to_num(neighbor_dists, nan=0.0001)
        max_label = max(topic_serial)

        state = []
        # the minimum neighbor distance
        state.append(neighbor_dists.min())
        # the average separation distance
        state.append((neighbor_dists.mean() * max_label - 1) / max_label)

        info_of_cluster = self.get_info_cluster(data, indexs_per_cluster)

        # Dunn index: min separation over the largest within-cluster product.
        largest_intra = 0
        for cluster in info_of_cluster:
            if cluster.shape[0] == 1:
                continue
            gram = np.dot(cluster, cluster.T)
            if gram.max() > largest_intra:
                largest_intra = gram.max()
        state.append(neighbor_dists.min() / largest_intra)

        # the average cohesion distance
        coh_dists = 0
        for cluster in info_of_cluster:
            members = cluster.shape[0]
            if members == 1:
                continue
            pair_count = members * (members - 1) / 2
            gram = np.dot(cluster, cluster.T)
            gram = np.maximum(gram, -gram)
            coh_dists = coh_dists + (gram.sum() - members) / (2 * pair_count + 0.0001)
        state.append(coh_dists / max_label)

        state.append(silhouette_score(data, topic_serial, metric='euclidean'))
        return np.array(state)

    def get_reward(self, sim_init, data):
        """Return the Calinski-Harabasz score of the current simulation window."""
        data = data[sim_init:len(self.topic_serial)]
        topic_serial = self.topic_serial[sim_init:]
        return calinski_harabasz_score(data, topic_serial)
class Node_ModelTrainer(embedder):
    """Train the node-level model on one message block and expose its embeddings.

    Construction loads the block's data, trains immediately and stores the
    L2-normalised embeddings as a NumPy array.

    Parameters
    ----------
    args : object
        Configuration object (``device``, ``layers``, ``Nlr``, ``Nepochs``,
        ``Nes``, ...). Its ``Nes`` attribute is overwritten, see ``_init``.
    block_num : int
        Index of the message block to train on.
    """

    def __init__(self, args, block_num):
        embedder.__init__(self, args)
        self._args = args
        self.block_num = block_num
        self._init()

    def _init(self):
        """Select the device, build loader/model/optimizer, train, normalise."""
        block_num = self.block_num
        args = self._args

        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
        cuda_available = torch.cuda.is_available()
        self._device = f'cuda:{args.device}' if cuda_available else "cpu"
        # torch.cuda.set_device only accepts CUDA devices; calling it with
        # "cpu" used to crash on machines without a GPU.
        if cuda_available:
            torch.cuda.set_device(self._device)

        # NOTE(review): this overrides the patience given by the caller.
        args.Nes = 200 if block_num in (0, 1) else 2000

        self.true_label = []
        # load data
        self._loader, self.true_label = get_Node_Dataset(args, block_num)
        layers = [302] + self.hidden_layers
        self._model = NodeLevel(layers, args)
        self._optimizer = optim.AdamW(params=self._model.parameters(), lr=args.Nlr, weight_decay=1e-5)

        self.train()
        self.all_embeddings = F.normalize(self.all_embeddings, dim=-1, p=2).detach().cpu().numpy()

    def get_embedding(self):
        """Return ``(all_embeddings, true_label)`` of the trained block."""
        return self.all_embeddings, self.true_label

    def train(self):
        """Run up to ``Nepochs`` epochs and keep the last epoch's embeddings.

        An epoch whose mean loss does not improve on the best seen so far
        increments a counter (never reset); training stops once the counter
        exceeds ``Nes``.
        """
        best_loss = 1e10
        cnt_wait = 0
        embs = []  # stays empty if no epoch runs

        # Start Model Training
        print("----Node-Level Training Start! ----\n")
        for epoch in range(self._args.Nepochs):
            losses = []
            embs = []
            for batch in self._loader:
                emb, loss = self._model(batch)
                self._optimizer.zero_grad()
                loss.backward()
                self._optimizer.step()
                self._model.update_moving_average()
                losses.append(loss.item())
                # In-place extend avoids re-copying the whole list per batch.
                embs.extend(emb)

            epoch_loss = np.mean(np.array(losses))
            st = '[{}][Epoch {}/{}] Loss: {:.4f}'.format(currentTime(), epoch, self._args.Nepochs, epoch_loss)
            print(st)

            # Early Stopping Criterion
            if epoch_loss < best_loss:
                best_loss = epoch_loss
            else:
                cnt_wait = cnt_wait + 1
            if cnt_wait > self._args.Nes:
                print("Early Stopping Criterion")
                break

        self.all_embeddings = torch.tensor(embs)
        print("\n----Node-Level Training Done! ----\n")
class NodeLevel(nn.Module):
    """Student/teacher model for node-level contrastive learning.

    The student encoder and projector are trained by gradient descent. The
    teacher encoder and projector are copies with gradients disabled that
    are updated through ``update_moving_average``.

    Parameters
    ----------
    layer_config : list of int
        Layer sizes passed to ``Encoder``; the last entry is the
        representation size.
    args : object
        Configuration providing ``device``, ``mad``, ``Nepochs`` and
        ``N_pred_hid``.
    """

    def __init__(self, layer_config, args):
        super().__init__()
        self._device = f'cuda:{args.device}' if torch.cuda.is_available() else "cpu"

        # encoder
        self.student_encoder = Encoder(layer_config=layer_config)
        self.teacher_encoder = copy.deepcopy(self.student_encoder)
        self.student_encoder = self.student_encoder.to(self._device)
        self.teacher_encoder = self.teacher_encoder.to(self._device)
        set_requires_grad(self.teacher_encoder, False)
        self.teacher_ema_updater = EMA(args.mad, args.Nepochs)

        rep_dim = layer_config[-1]

        # projection head
        self.student_projector = nn.Sequential(
            nn.Linear(rep_dim, args.N_pred_hid),
            nn.BatchNorm1d(args.N_pred_hid),
            nn.PReLU(),
            nn.Linear(args.N_pred_hid, rep_dim))
        self.student_projector = self.student_projector.to(self._device)
        self.student_projector.apply(init_weights)
        self.teacher_projector = copy.deepcopy(self.student_projector)
        set_requires_grad(self.teacher_projector, False)

    def reset_moving_average(self):
        """Drop the teacher encoder (``update_moving_average`` then asserts)."""
        del self.teacher_encoder
        self.teacher_encoder = None

    def update_moving_average(self):
        """Move teacher encoder and projector towards the student weights."""
        assert self.teacher_encoder is not None, 'teacher encoder has not been created yet'
        update_moving_average(self.teacher_ema_updater, self.teacher_encoder, self.student_encoder)
        update_moving_average(self.teacher_ema_updater, self.teacher_projector, self.student_projector)

    def forward(self, batch):
        """Return ``(emb, loss)`` for one batch.

        ``batch`` must carry two views (``x1``/``edge_index1`` and
        ``x2``/``edge_index2``) plus the unaugmented ``x``/``edge_index``.
        ``emb`` is the student encoding of the unaugmented graph as a nested
        Python list; ``loss`` is the contrastive loss between the student
        projection of view 1 and the teacher projection of view 2.
        """
        device = self._device

        student = self.student_encoder(batch.x1.to(torch.float32).to(device),
                                       batch.edge_index1.to(device))
        h1 = self.student_projector(student)

        with torch.no_grad():  # stop gradient
            teacher = self.teacher_encoder(batch.x2.to(torch.float32).to(device),
                                           batch.edge_index2.to(device))
            h2 = self.teacher_projector(teacher)
            # The embeddings are only exported, so no autograd graph is
            # needed for this pass (it was built and then detached before).
            emb = self.student_encoder(batch.x.to(torch.float32).to(device),
                                       batch.edge_index.to(device))

        emb = emb.detach().cpu().numpy().tolist()
        loss = get_loss(h1, h2)
        return emb, loss
class Graph_ModelTrainer(embedder):
    """Train the graph-level model on one message block and expose its embeddings.

    Construction builds the model, loads the block's data, trains
    immediately and stores the L2-normalised embeddings as a NumPy array.

    Parameters
    ----------
    args : object
        Configuration object (``device``, ``layers``, ``Glr``, ``Gepochs``,
        ...).
    block_num : int
        Index of the message block to train on.
    """

    def __init__(self, args, block_num):
        self.block_num = block_num
        embedder.__init__(self, args)
        self._args = args
        self._init()

    def _init(self):
        """Select the device, build model/optimizer/loader, train, normalise."""
        args = self._args
        block_num = self.block_num

        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
        cuda_available = torch.cuda.is_available()
        self._device = f'cuda:{args.device}' if cuda_available else "cpu"
        # torch.cuda.set_device only accepts CUDA devices; calling it with
        # "cpu" used to crash on machines without a GPU.
        if cuda_available:
            torch.cuda.set_device(self._device)
            print("using cuda")

        layers = [300] + self.hidden_layers
        self._model = GraphLevel(layers, args).to(self._device)
        self._optimizer = optim.AdamW(params=self._model.parameters(), lr=args.Glr, weight_decay=1e-5)
        # load data
        self._loader, self.true_label = get_Graph_Dataset(args, block_num)

        self.train()
        self.all_embeddings = F.normalize(self.all_embeddings, dim=-1, p=2).detach().cpu().numpy()

    def get_embedding(self):
        """Return ``(all_embeddings, true_label)`` of the trained block."""
        return self.all_embeddings, self.true_label

    def train(self):
        """Run up to ``Gepochs`` epochs and keep the last epoch's embeddings.

        An epoch whose mean loss is strictly worse than the best seen so far
        increments a counter (never reset); training stops once the counter
        exceeds 5.
        """
        best_loss = 1e10
        cnt_wait = 0
        embs = []  # stays empty if no epoch runs

        # Start Model Training
        print("----Graph-Level Training Start! ----\n")
        for epoch in range(self._args.Gepochs):
            losses = []
            embs = []
            for batch in self._loader:
                batch = batch.to(self._device)
                emb, loss = self._model(batch)
                self._optimizer.zero_grad()
                loss.backward()
                self._optimizer.step()
                self._model.update_moving_average()
                losses.append(loss.item())
                # In-place extend avoids re-copying the whole list per batch.
                embs.extend(emb)

            epoch_loss = np.mean(np.array(losses))
            st = '[{}][Epoch {}/{}] Loss: {:.4f}'.format(currentTime(), epoch, self._args.Gepochs, epoch_loss)
            print(st)

            # Early Stopping Criterion
            # NOTE(review): the patience is hard-coded to 5 and ``args.Ges``
            # is never read. Confirm which value is intended.
            if epoch_loss < best_loss:
                best_loss = epoch_loss
            elif epoch_loss > best_loss:
                cnt_wait = cnt_wait + 1
            if cnt_wait > 5:
                break

        self.all_embeddings = torch.tensor(embs)
        print("\n----Graph-Level Training Done! ----")
class GraphLevel(nn.Module):
    """Student/teacher model for graph-level contrastive learning.

    Node encodings are pooled per graph with a shared ``GlobalAttention``
    layer. The student encoder and projector are trained by gradient
    descent. The teacher copies have gradients disabled and are updated
    through ``update_moving_average``.

    Parameters
    ----------
    layer_config : list of int
        Layer sizes passed to ``Encoder``; the last entry is the
        representation size.
    args : object
        Configuration providing ``device``, ``mad``, ``Gepochs`` and
        ``G_pred_hid``.
    """

    def __init__(self, layer_config, args):
        # nn.Module must be initialised before any attribute is assigned.
        super().__init__()
        self.args = args
        self._device = f'cuda:{args.device}' if torch.cuda.is_available() else "cpu"

        # encoder
        self.student_encoder = Encoder(layer_config)
        self.teacher_encoder = copy.deepcopy(self.student_encoder)
        self.student_encoder.to(self._device)
        self.teacher_encoder.to(self._device)
        set_requires_grad(self.teacher_encoder, False)
        self.teacher_ema_updater = EMA(args.mad, args.Gepochs)

        rep_dim = layer_config[-1]

        # projection head
        self.student_projector = nn.Sequential(
            nn.Linear(rep_dim, args.G_pred_hid),
            nn.BatchNorm1d(args.G_pred_hid),
            nn.PReLU(),
            nn.Linear(args.G_pred_hid, rep_dim))
        self.student_projector.to(self._device)
        self.student_projector.apply(init_weights)
        self.teacher_projector = copy.deepcopy(self.student_projector)
        set_requires_grad(self.teacher_projector, False)

        # pooling (not moved to the device here; the caller moves the model)
        self.pool = GlobalAttention(gate_nn=nn.Sequential(
            nn.Linear(rep_dim, rep_dim),
            nn.BatchNorm1d(rep_dim),
            nn.ReLU(),
            nn.Linear(rep_dim, 1)))

    def reset_moving_average(self):
        """Drop the teacher encoder (``update_moving_average`` then asserts)."""
        del self.teacher_encoder
        self.teacher_encoder = None

    def update_moving_average(self):
        """Move teacher encoder and projector towards the student weights."""
        assert self.teacher_encoder is not None, 'teacher encoder has not been created yet'
        update_moving_average(self.teacher_ema_updater, self.teacher_encoder, self.student_encoder)
        update_moving_average(self.teacher_ema_updater, self.teacher_projector, self.student_projector)

    def forward(self, batch):
        """Return ``(emb, loss)`` for one batch of graphs.

        ``batch`` must carry two views (``x1``/``edge_index1`` and
        ``x2``/``edge_index2``), the unaugmented ``x``/``edge_index`` and the
        node-to-graph assignment ``batch.batch``. ``emb`` is the pooled
        student encoding of the unaugmented graphs as a nested Python list;
        ``loss`` is the contrastive loss between the student projection of
        view 1 and the teacher projection of view 2.
        """
        device = self._device

        student = self.student_encoder(batch.x1.to(device), batch.edge_index1.to(device))
        h1 = self.pool(student, batch.batch)
        h1 = self.student_projector(h1)

        with torch.no_grad():  # stop gradient
            teacher = self.teacher_encoder(batch.x2.to(device), batch.edge_index2.to(device))
            h2 = self.pool(teacher, batch.batch)
            h2 = self.teacher_projector(h2)
            # The embeddings are only exported, so no autograd graph is
            # needed for this pass (it was built and then detached before).
            emb = self.student_encoder(batch.x.to(device), batch.edge_index.to(device))
            emb = self.pool(emb, batch.batch)

        res_emb = emb.detach().cpu().numpy().tolist()
        loss = get_loss(h1, h2)
        return res_emb, loss
def make_transition(trans, *items):
    """Build a ``{key: tensor}`` transition dict from keys and raw items.

    Keys and items are paired positionally. Lists are stacked with
    ``torch.stack``, NumPy arrays are wrapped with ``torch.from_numpy``,
    tensors are kept as they are, and anything else becomes a one-element
    ``torch.Tensor``.
    """

    def _to_tensor(item):
        if isinstance(item, list):
            return torch.stack(item)
        if isinstance(item, np.ndarray):
            return torch.from_numpy(item)
        if isinstance(item, torch.Tensor):
            return item
        return torch.Tensor([item])

    transition = {}
    for key, item in zip(trans, items):
        transition[key] = _to_tensor(item)
    return transition
def make_batch(state, action, old_log_prob, advantage, old_value, learn_size, batch_size, use_cuda):
    """Split the first ``learn_size`` samples into shuffled mini-batches.

    A random permutation of ``range(learn_size)`` is cut into
    ``learn_size // batch_size`` chunks; leftover samples are dropped.

    Returns
    -------
    list
        One ``[state, action, old_log_prob, advantage, old_value]`` list of
        tensors per mini-batch, all gathered with the same indices.
    """
    sources = (state, action, old_log_prob, advantage, old_value)
    shuffled = torch.randperm(learn_size)

    batch = []
    for chunk in range(learn_size // batch_size):
        picked = shuffled[batch_size * chunk: batch_size * (chunk + 1)]
        mini_batch = []
        for source in sources:
            # Start from an empty float tensor and append row by row.
            gathered = torch.Tensor([])
            if use_cuda:
                gathered = gathered.cuda()
            for ind in picked:
                gathered = torch.cat((gathered, source[ind].unsqueeze(0)), dim=0)
            mini_batch.append(gathered)
        batch.append(mini_batch)
    return batch
def calculate_nature_cnn_out_dim(height, weight):
    """Return the spatial output size of a three-layer convolution stack.

    The stack applies (kernel, stride) = (8, 4), (4, 2), (3, 1) without
    padding. ``height`` and ``weight`` (the input width) are reduced
    independently with ``floor((size - kernel) / stride) + 1`` per layer.

    Returns
    -------
    tuple
        ``(size_h, size_w)`` as NumPy floats.
    """
    conv_stack = ((8, 4), (4, 2), (3, 1))

    def _reduce(extent):
        for kernel, stride in conv_stack:
            extent = np.floor((extent - kernel) / stride) + 1
        return extent

    return _reduce(height), _reduce(weight)
class DQN_Config:
    """Default hyper-parameters for ``DQN``.

    Parameters
    ----------
    input_type : str
        ``"vector"`` selects an MLP encoder, ``"image"`` a CNN encoder. Any
        other value leaves the encoder attributes unset.
    input_size : sequence, optional
        Required for ``"image"``: index 0 is used as the input channel count
        and indices 1 and 2 as height and width.
    """

    def __init__(self, input_type, input_size=None):
        defaults = {
            # replay buffer and target-network schedule
            "max_buffer": 100000,
            "update_freq": 200,
            "use_cuda": True,
            "trans": ["state", "action", "reward", "next_state", "done"],
            # optimisation
            "lr": 0.001,
            "tau": 0.005,
            "gamma": 0.99,
            "batch_size": 128,
            "max_grad_norm": 1,
            # epsilon-greedy exploration
            "epsilon_init": 1,
            "epsilon_decay": 0.995,
            "epsilon_min": 0.05,
            # Q-network head
            "q_layer": [256, 256],
        }
        for name, value in defaults.items():
            setattr(self, name, value)

        if input_type == "vector":
            self.encoder = "mlp"
            self.encoder_layer = [512, 256]
            self.feature_dim = 256
        elif input_type == "image":
            self.encoder = "cnn"
            in_channels = input_size[0]
            # Each entry: [in_channels, out_channels, kernel, stride].
            self.encoder_layer = [[in_channels, 32, 8, 4],
                                  [32, 64, 4, 2],
                                  [64, 64, 3, 1]]
            size_h, size_w = calculate_nature_cnn_out_dim(input_size[1], input_size[2])
            self.feature_dim = [int(64 * size_h * size_w), 256]
class DQN:
    """Deep Q-learning agent with a replay buffer and a target network.

    Parameters
    ----------
    state_dim : sequence
        Shape of one state. ``state_dim[0]`` is handed to the Q-networks and
        the whole value is forwarded to ``DQN_Config`` as ``input_size``.
    action_dim : int
        Number of discrete actions.
    input_type : str, optional
        Forwarded to ``DQN_Config`` when ``args`` is ``None``.
    args : object, optional
        Ready-made configuration exposing the same attributes as
        ``DQN_Config``. When omitted a default ``DQN_Config`` is created.
    """

    def __init__(self, state_dim, action_dim, input_type="vector", args=None):
        # Fix: a caller-supplied config used to be dropped, leaving
        # ``self.args`` undefined and crashing on the next line.
        self.args = DQN_Config(input_type, state_dim) if args is None else args
        self.action_dim = action_dim
        self.buffer = BaseBuffer(self.args.trans, self.args.max_buffer)
        self.policy_net = QNet(state_dim[0], action_dim, self.args.q_layer, self.args.encoder,
                               self.args.encoder_layer, self.args.feature_dim)
        self.target_net = QNet(state_dim[0], action_dim, self.args.q_layer, self.args.encoder,
                               self.args.encoder_layer, self.args.feature_dim)
        if self.args.use_cuda:
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        # Start with identical policy and target weights.
        self.update_network()
        self.policy_net.eval()
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)
        self.epsilon = self.args.epsilon_init

    def select_action(self, state, epsilon=None):
        """Pick an action epsilon-greedily.

        Returns a one-element ``int64`` CPU tensor holding the action index.
        ``epsilon`` defaults to the agent's current exploration rate.
        """
        if epsilon is None:
            epsilon = self.epsilon
        if random.random() > epsilon:
            state = torch.Tensor(state)
            if self.args.use_cuda:
                state = state.cuda()
            # No gradient is needed for acting; the result is detached anyway.
            with torch.no_grad():
                q_value = self.policy_net(state)
            return torch.argmax(q_value).cpu().unsqueeze(0).detach()
        explored = random.choice(np.arange(self.action_dim))
        return torch.tensor([explored], dtype=torch.int64)

    def add_buffer(self, transition):
        """Store one transition dict in the replay buffer."""
        self.buffer.add(transition)

    def epsilon_decay(self):
        """Multiply epsilon by the decay factor, never going below the minimum."""
        self.epsilon = max(self.epsilon * self.args.epsilon_decay, self.args.epsilon_min)

    def update_network(self):
        """Copy the policy network weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_model(self, model_path):
        """Save the policy network's ``state_dict`` to ``model_path``."""
        torch.save(self.policy_net.state_dict(), model_path)

    def learn(self, step):
        """Run one gradient step on a sampled mini-batch.

        Parameters
        ----------
        step : int
            Global step counter; the target network is refreshed whenever it
            is a multiple of ``args.update_freq``.

        Returns
        -------
        float
            The Smooth-L1 loss of this update.
        """
        self.policy_net.train()
        data = self.buffer.sample(self.args.batch_size)
        states = torch.stack(data["state"])  # [batch_size, state_dim]
        actions = torch.stack(data["action"])  # [batch_size, 1]
        rewards = torch.stack(data["reward"])  # [batch_size, 1]
        next_states = torch.stack(data["next_state"])  # [batch_size, state_dim]
        dones = torch.stack(data["done"])  # [batch_size, 1]

        if self.args.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            next_states = next_states.cuda()
            dones = dones.cuda()

        actions = actions.type(torch.int64)
        # The standard deviation of a single sample is NaN, which would poison
        # the loss and the weights, so only normalise real batches.
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # Q-values of the actions that were actually taken.
        predicted_q = self.policy_net(states)  # [batch_size, action_dim]
        predicted_q_actions = torch.gather(predicted_q, -1, actions)  # [batch_size, 1]

        # Bootstrap from the target network: V(next_state) = max_a Q_target.
        predicted_next_q = self.target_net(next_states)  # [batch_size, action_dim]
        next_state_values, _ = torch.max(predicted_next_q, dim=-1)  # [batch_size]
        next_state_values = next_state_values.unsqueeze(-1)  # [batch_size, 1]

        # TD target: r + gamma * V(next_state), with no bootstrap on terminal steps.
        target_q_actions = rewards + self.args.gamma * next_state_values * (1 - dones)

        loss = nn.SmoothL1Loss()(predicted_q_actions, target_q_actions.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.policy_net.eval()
        self.epsilon_decay()

        if step % self.args.update_freq == 0:
            print("update target network")
            self.update_network()

        return loss.item()
class PPO_Config:
    """Default hyper-parameters consumed by the ``PPO`` agent.

    Parameters
    ----------
    input_type : str
        ``"vector"`` selects the MLP encoder settings, ``"image"`` the CNN
        encoder settings. Any other value leaves the encoder attributes unset.
    input_size : sequence, optional
        Only read for ``"image"`` inputs, where indices 0, 1 and 2 are used as
        channel count, height and width.
    """

    def __init__(self, input_type, input_size=None):
        # Rollout buffer and runtime
        self.max_buffer = 2048
        self.use_cuda = True
        self.trans = ["state", "action", "reward", "done", "log_prob"]

        # Optimisation
        self.lr = 0.0003
        self.gamma = 0.99
        self.lambda_ = 0.95
        self.train_epoch = 80
        self.clip_ratio = 0.2
        self.critic_coef = 0.5
        self.entropy_coef = 0.01
        self.max_grad_norm = 0.5

        # Action noise settings for continuous control
        self.trainable_std = False
        self.action_std_init = 0.6
        self.action_std_decay_rate = 0.05
        self.action_std_min = 0.1
        self.action_std_update_freq = 100

        # Network shapes
        self.actor_layer = [32, 32]
        self.critic_layer = [32, 32]
        if input_type == "image":
            self.encoder = "cnn"
            # Each entry is [in_channels, out_channels, kernel, stride].
            self.encoder_layer = [[input_size[0], 32, 8, 4], [32, 64, 4, 2], [64, 64, 3, 1]]
            out_h, out_w = calculate_nature_cnn_out_dim(input_size[1], input_size[2])
            self.feature_dim = [int(64 * out_h * out_w), 256]
        elif input_type == "vector":
            self.encoder = "mlp"
            self.encoder_layer = [64, 64]
            self.feature_dim = 32
class PPO:
    """Proximal Policy Optimisation agent built on ``ActorCritic``.

    Parameters
    ----------
    state_dim : sequence
        Shape of one state. ``state_dim[0]`` is handed to the network and the
        whole value is forwarded to ``PPO_Config`` as ``input_size``.
    action_dim : int
        Size of the action space.
    continuous : bool, optional
        Whether actions are continuous (Gaussian policy) or discrete.
    input_type : str, optional
        Forwarded to ``PPO_Config`` when ``args`` is ``None``.
    args : object, optional
        Ready-made configuration exposing the same attributes as
        ``PPO_Config``. When omitted a default ``PPO_Config`` is created.
    """

    def __init__(self, state_dim, action_dim, continuous=True, input_type="vector", args=None):
        # Fix: a caller-supplied config used to be dropped, leaving
        # ``self.args`` undefined and crashing on the next line.
        self.args = PPO_Config(input_type, state_dim) if args is None else args
        self.buffer = BaseBuffer(self.args.trans, self.args.max_buffer)
        self.model = ActorCritic(state_dim[0], action_dim, self.args.actor_layer, self.args.critic_layer,
                                 self.args.encoder, self.args.encoder_layer, self.args.feature_dim,
                                 continuous, self.args.action_std_init)
        if self.args.use_cuda:
            self.model = self.model.cuda()
        self.model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.args.lr)

    def select_action(self, state):
        """Sample an action for ``state``; returns ``(action, log_prob)`` on CPU."""
        state = torch.Tensor(state).float()
        if self.args.use_cuda:
            state = state.cuda()
        action, action_log_prob = self.model.act(state)
        return action.detach().cpu(), action_log_prob.detach().cpu()

    def add_buffer(self, transition):
        """Store one transition dict in the rollout buffer."""
        self.buffer.add(transition)

    def save_model(self, model_path):
        """Save the actor-critic ``state_dict`` to ``model_path``."""
        torch.save(self.model.state_dict(), model_path)

    def learn(self):
        """Optimise the policy on the buffered rollout, then empty the buffer.

        Returns
        -------
        float
            Mean loss over the ``args.train_epoch`` optimisation passes.
        """
        self.model.train()
        data, size = self.buffer.get_data_buffer()
        states = data["state"]
        actions = data["action"]
        rewards = data["reward"]
        dones = data["done"]
        old_log_probs = data["log_prob"]

        # Monte Carlo estimate of returns, accumulated backwards in time.
        # Appending and reversing once avoids the quadratic ``insert(0, ...)``.
        returns = []
        discounted_reward = 0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                discounted_reward = 0
            discounted_reward = reward + (self.args.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32)
        # The standard deviation of a single sample is NaN, so only normalise
        # when there is more than one return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        states = torch.stack(states).float()
        actions = torch.stack(actions)
        old_log_probs = torch.stack(old_log_probs)
        if self.args.use_cuda:
            states = states.cuda()  # [batch_size, state_dim]
            actions = actions.cuda()  # [batch_size]
            old_log_probs = old_log_probs.cuda()  # [batch_size]
            returns = returns.cuda()  # [batch_size]

        loss_list = []
        for _ in range(self.args.train_epoch):
            # Re-evaluate the stored actions under the current policy.
            log_probs, values, dist_entropy = self.model.evaluate_AC(states, actions)
            values = values.squeeze(-1)

            # Probability ratio pi_theta / pi_theta_old.
            ratios = torch.exp(log_probs - old_log_probs.detach())

            # Clipped surrogate objective.
            advantages = returns - values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.args.clip_ratio, 1 + self.args.clip_ratio) * advantages

            actor_loss = -torch.min(surr1, surr2)
            critic_loss = nn.MSELoss()(values, returns)
            entropy_bonus = -dist_entropy
            # Fix: honour the configured coefficients instead of hard-coded
            # 0.5 / 0.01 (the defaults in PPO_Config are the same values).
            loss = (actor_loss
                    + self.args.critic_coef * critic_loss
                    + self.args.entropy_coef * entropy_bonus)
            loss = loss.mean()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_list.append(loss.item())

        self.model.eval()
        self.buffer.clear()

        return np.mean(loss_list)
class ActorCritic(nn.Module):
    """Actor and critic heads on top of an optional shared encoder.

    Parameters
    ----------
    state_dim : int
        Input feature size (used directly when no encoder is given).
    action_dim : int
        Number of actions (discrete) or action dimensions (continuous).
    actor_layer, critic_layer : list of int
        Hidden layer widths of the actor and critic MLPs.
    encoder : {None, "mlp", "cnn"}, optional
        Type of feature encoder placed in front of both heads.
    encoder_layer, feature_dim : optional
        Forwarded to ``MLPEncoder`` / ``CNNEncoder``.
    continuous : bool, optional
        ``True`` for a Gaussian policy with fixed diagonal covariance,
        ``False`` for a categorical policy (softmax output).
    std_init : float, optional
        Initial action standard deviation for the continuous policy.
    """

    def __init__(self, state_dim, action_dim, actor_layer, critic_layer,
                 encoder=None, encoder_layer=None, feature_dim=None, continuous=False, std_init=0.6):
        super(ActorCritic, self).__init__()
        self.continuous = continuous
        if continuous:
            # Plain tensor attribute: it stays on the CPU when the module moves.
            self.action_var = torch.full((action_dim,), std_init * std_init)

        if encoder is None:
            self.encoder = None
            input_dim = state_dim
        elif encoder == "mlp":
            self.encoder = MLPEncoder(state_dim, encoder_layer, feature_dim)
            input_dim = self.encoder.get_dim()
        elif encoder == "cnn":
            self.encoder = CNNEncoder(encoder_layer, feature_dim)
            input_dim = self.encoder.get_dim()
        else:
            raise NotImplementedError

        actor = self._mlp_layers(input_dim, actor_layer, action_dim)
        if not self.continuous:
            # Discrete policies output action probabilities.
            actor.append(nn.Softmax(dim=-1))
        self.actor = nn.Sequential(*actor)
        self.critic = nn.Sequential(*self._mlp_layers(input_dim, critic_layer, 1))

        self.__network_init()

    @staticmethod
    def _mlp_layers(input_dim, hidden, out_dim):
        """Return the layer list Linear/ReLU ... Linear for the given widths."""
        layers = [nn.Linear(input_dim, hidden[0]), nn.ReLU(inplace=True)]
        for i in range(len(hidden) - 1):
            layers.append(nn.Linear(hidden[i], hidden[i + 1]))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(hidden[-1], out_dim))
        return layers

    def forward(self):
        """Not used; call ``act`` or ``evaluate_AC`` instead."""
        raise NotImplementedError

    def __network_init(self):
        # Orthogonal weights and zero biases for every linear layer.
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                nn.init.orthogonal_(layer.weight)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def act(self, state):
        """Sample an action; returns detached ``(action, log_prob)``."""
        if self.encoder is not None:
            state = self.encoder(state)
        if self.continuous:
            # The covariance lives on the CPU, so the mean is moved there too.
            mu = self.actor(state).cpu()
            cov_mat = torch.diag(self.action_var)
            dist = MultivariateNormal(mu, cov_mat)
        else:
            action_prob = self.actor(state)
            dist = Categorical(action_prob)
        action = dist.sample()
        action_log_prob = dist.log_prob(action)
        return action.detach(), action_log_prob.detach()

    def evaluate_AC(self, state, action):
        """Score ``action`` under the current policy.

        Returns ``(log_prob, value, entropy)``, all placed on the device of
        the critic output.
        """
        if self.encoder is not None:
            state = self.encoder(state)
        if self.continuous:
            # Distribution is evaluated on the CPU (see ``act``).
            mu = self.actor(state).cpu()
            action_var = self.action_var.expand_as(mu)
            cov_mat = torch.diag_embed(action_var)
            dist = MultivariateNormal(mu, cov_mat)
            action = action.cpu()
        else:
            action_prob = self.actor(state)
            dist = Categorical(action_prob)
            # Fix: keep the action on the same device as the distribution
            # instead of forcing it to the CPU.
            action = action.to(action_prob.device)
        action_log_prob = dist.log_prob(action)
        dist_entropy = dist.entropy()
        value = self.critic(state)
        # Fix: follow the model's device rather than calling ``.cuda()``
        # unconditionally, which fails on CPU-only machines.
        return action_log_prob.to(value.device), value, dist_entropy.to(value.device)
class MLPEncoder(nn.Module):
    """Fully connected feature encoder.

    Stacks ``Linear`` + ``ReLU`` pairs for every width in ``layer_dim`` and
    finishes with a bias-free ``Linear`` projection to ``feature_dim``.
    """

    def __init__(self, state_dim, layer_dim: list, feature_dim: int):
        super(MLPEncoder, self).__init__()
        widths = [state_dim, *layer_dim]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(inplace=True)]
        # Final projection carries no bias term.
        stack.append(nn.Linear(layer_dim[-1], feature_dim, bias=False))
        self.encoder = nn.Sequential(*stack)
        self.out_dim = feature_dim

    def forward(self, x):
        """Encode ``x`` into a ``feature_dim``-sized representation."""
        return self.encoder(x)

    def get_dim(self):
        """Return the size of the encoder output."""
        return self.out_dim
class CNNEncoder(nn.Module):
    """Convolutional feature encoder followed by a linear projection.

    Parameters
    ----------
    layer_dim : list
        One ``[in_channels, out_channels, kernel_size, stride]`` entry per
        convolution; each convolution is followed by a ReLU.
    feature_dim : list
        ``[flattened_conv_size, output_size]`` of the bias-free projection.
    """

    def __init__(self, layer_dim: list, feature_dim: list):
        super(CNNEncoder, self).__init__()
        conv_stack = []
        for spec in layer_dim:
            conv_stack += [nn.Conv2d(spec[0], spec[1], spec[2], spec[3]), nn.ReLU(inplace=True)]
        self.encoder = nn.Sequential(*conv_stack)
        flat_size, projected_size = feature_dim[0], feature_dim[1]
        self.projector = nn.Linear(flat_size, projected_size, bias=False)
        self.out_dim = projected_size

    def forward(self, x):
        """Convolve ``x``, flatten per sample and project to ``out_dim``."""
        maps = self.encoder(x)
        flat = maps.reshape(maps.shape[0], -1)
        return self.projector(flat)

    def get_dim(self):
        """Return the size of the encoder output."""
        return self.out_dim
class QNet(nn.Module):
    """Q-value network: optional encoder followed by an MLP head.

    Parameters
    ----------
    state_dim : int
        Input feature size (used directly when no encoder is given).
    action_dim : int
        Number of actions; one Q-value is produced per action.
    q_layer : list of int
        Hidden layer widths of the Q head.
    encoder : {None, "mlp", "cnn"}, optional
        Type of feature encoder placed in front of the head.
    encoder_layer, feature_dim : optional
        Forwarded to ``MLPEncoder`` / ``CNNEncoder``.
    """

    def __init__(self, state_dim, action_dim, q_layer, encoder=None, encoder_layer=None, feature_dim=None):
        super(QNet, self).__init__()
        if encoder is None:
            self.encoder = None
            head_in = state_dim
        elif encoder == "mlp":
            self.encoder = MLPEncoder(state_dim, encoder_layer, feature_dim)
            head_in = self.encoder.get_dim()
        elif encoder == "cnn":
            self.encoder = CNNEncoder(encoder_layer, feature_dim)
            head_in = self.encoder.get_dim()
        else:
            raise NotImplementedError

        head = [nn.Linear(head_in, q_layer[0]), nn.ReLU(inplace=True)]
        for fan_in, fan_out in zip(q_layer[:-1], q_layer[1:]):
            head += [nn.Linear(fan_in, fan_out), nn.ReLU(inplace=True)]
        head.append(nn.Linear(q_layer[-1], action_dim))
        self.q_net = nn.Sequential(*head)

        self.__network_init()

    def __network_init(self):
        # Orthogonal weights and zero biases for every linear layer.
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.orthogonal_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, state):
        """Return the Q-values of every action for ``state``."""
        features = state if self.encoder is None else self.encoder(state)
        return self.q_net(features)
class BaseBuffer:
    """Bounded column-wise store for transitions.

    Parameters
    ----------
    trans : list of str
        Names of the fields kept for every transition.
    max_len : int
        Maximum number of transitions retained per field; older entries are
        dropped first once the limit is reached.
    """

    def __init__(self, trans, max_len):
        self.trans = trans
        self.max_len = max_len
        self.data = {}
        self.total_idx = 0
        self.clear()

    def get_len(self):
        """Return how many transitions were added since the last ``clear``.

        This counter is not capped at ``max_len``.
        """
        return self.total_idx

    def clear(self):
        """Empty the buffer and reset the counter.

        Fix: the columns are recreated as bounded deques. Previously they
        became plain lists, so ``max_len`` stopped being enforced after the
        first ``clear``.
        """
        self.data = {key: deque(maxlen=self.max_len) for key in self.trans}
        self.total_idx = 0

    def add(self, transition):
        """Append one transition (a dict keyed by field name)."""
        for key in transition:
            self.data[key].append(transition[key])
        self.total_idx += 1

    def get_data_buffer(self):
        """Return ``(data, size)`` where ``data`` maps each field to a list copy."""
        data_size = len(self.data["state"])
        data = {key: list(column) for key, column in self.data.items()}
        return data, data_size

    def sample(self, size):
        """Draw up to ``size`` transitions without replacement.

        The same random indices are used for every field, so rows stay aligned.
        """
        data_size = len(self.data["state"])
        size = min(data_size, size)
        picks = torch.randperm(data_size)[:size].tolist()
        return {key: [self.data[key][i] for i in picks] for key in self.trans}
def unique(lists):
    """Lower-case the values and drop duplicates and empty strings.

    The first occurrence of every value is kept and the original order is
    preserved. An empty input returns an empty list (previously it raised
    ``IndexError``).

    Parameters
    ----------
    lists : iterable of str
        Attribute values.

    Returns
    -------
    list of str
    """
    seen = set()
    res = []
    for value in lists:
        value = value.lower()
        if value == '' or value in seen:
            continue
        seen.add(value)
        res.append(value)
    return res
def construct_graph_from_df(df, G=None):
    """Add the tweets of ``df`` to a heterogeneous message graph.

    Every row contributes a tweet node ``t_<tweet_id>`` connected to user
    nodes (``u_``: mentioned users plus the author), word nodes (``w_``) and
    hashtag nodes (``h_``). Word and hashtag names are lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``tweet_id``, ``user_mentions``, ``user_id``,
        ``filtered_words`` and ``hashtags``.
    G : graph, optional
        Graph to extend; a new ``nx.Graph`` is created when omitted.

    Returns
    -------
    graph
        The graph that was passed in or created.
    """
    if G is None:
        G = nx.Graph()
    for _, row in df.iterrows():
        tid = 't_' + str(row['tweet_id'])
        G.add_node(tid)

        # Fix: build a new list instead of appending to the one stored in the
        # DataFrame, which silently modified the caller's data on every call.
        user_ids = list(row['user_mentions']) + [row['user_id']]
        user_ids = ['u_' + str(each) for each in user_ids]
        G.add_nodes_from(user_ids)

        words = [('w_' + each).lower() for each in row['filtered_words']]
        G.add_nodes_from(words)

        hashtags = [('h_' + each).lower() for each in row['hashtags']]
        G.add_nodes_from(hashtags)

        # Connect the message node with each related user, word and hashtag node.
        G.add_edges_from([(tid, each) for each in user_ids + words + hashtags])

    return G
def construct_graph(data, feature, index):
    """Build the star-shaped graph of a single tweet.

    The tweet id is the centre node; its neighbours are the de-duplicated,
    lower-cased user location, filtered words and hashtags. One feature row
    is consumed from ``feature`` for the tweet and for each neighbour,
    starting at ``index``.

    Returns
    -------
    tuple
        ``(G, X)`` with the graph and the list of node feature rows.
    """
    graph = nx.Graph()
    _ = data["text"].values  # column is read but not used further

    node_features = [feature[index].tolist()]
    cursor = index + 1

    centre = data["tweet_id"].values[0]
    graph.add_node(centre)

    location = data["user_loc"].values[0]
    words = data["filtered_words"].tolist()[0]
    tags = data["hashtags"].tolist()[0]
    attributes = unique([location] + words + tags)

    links = []
    if attributes:
        # One consecutive feature row per attribute node.
        for offset in range(len(attributes)):
            node_features.append(feature[cursor + offset].tolist())
        graph.add_nodes_from(attributes)
        links.extend((centre, each) for each in attributes)
    graph.add_edges_from(links)

    return graph, node_features
def normalize_adj(adj):
    """Symmetrically normalise an adjacency matrix.

    Every entry is scaled by the inverse square roots of the row sums of its
    two end points; rows whose sum is zero are scaled by zero. The result is
    returned in COO format.
    """
    matrix = sp.coo_matrix(adj)
    degree = np.array(matrix.sum(1))
    inv_sqrt = np.power(degree, -0.5).flatten()
    # Isolated nodes give 1/sqrt(0) = inf; treat them as zero instead.
    inv_sqrt[np.isinf(inv_sqrt)] = 0.
    scaling = sp.diags(inv_sqrt)
    column_scaled = matrix.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()
def aug_edge(adj):
    """Create two randomly edge-perturbed copies of an adjacency matrix.

    For each copy one random node has its connection to node 0 cleared, and
    is then linked to another randomly chosen node. The input is not changed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(aug_adj1, aug_adj2)``.
    """
    base = np.array(adj)
    upper = len(base) - 1

    def _perturbed():
        view = base.copy()
        # Remove the (symmetric) edge between a random node and node 0 ...
        dropped = np.random.randint(0, upper)
        view[dropped][0] = 0
        view[0][dropped] = 0
        # ... and connect that node to a random node other than node 0.
        target = np.random.randint(1, upper)
        view[target][dropped] = 1
        view[dropped][target] = 1
        return view

    aug_adj1 = _perturbed()
    aug_adj2 = _perturbed()
    return aug_adj1, aug_adj2
def get_edge_index(adj):
    """List the edges stored in the upper triangle of an adjacency matrix.

    Returns ``[sources, targets]`` where every pair ``(sources[k],
    targets[k])`` satisfies ``source < target`` and ``adj[source][target] ==
    1``. Pairs appear in row-major order.
    """
    node_count = len(adj)
    pairs = [
        (src, dst)
        for src in range(node_count)
        for dst in range(src + 1, node_count)
        if adj[src][dst] == 1
    ]
    sources = [src for src, _ in pairs]
    targets = [dst for _, dst in pairs]
    return [sources, targets]
def get_data(message_num, start, tweet_sum, save_path):
    """Build the homogeneous tweet graph for rows ``start:tweet_sum``.

    The Event2012 data is sorted by ``created_at``; a heterogeneous graph is
    built for the selected rows and collapsed into a tweet-tweet graph in
    which two tweets are linked when they share a neighbour. Two edge-dropped
    views of the edge list are produced as augmentations.

    Parameters
    ----------
    message_num : int
        Not used by this function; kept for interface compatibility.
    start, tweet_sum : int
        Positional slice bounds into the sorted dataframe.
    save_path : str
        Directory prefix under which the combined feature matrix is saved.

    Returns
    -------
    dict
        Keys ``x``, ``x1``, ``x2`` (node features, identical tensors),
        ``edge_index``, ``edge_index1``, ``edge_index2`` and ``y`` (event ids).
    """
    os.makedirs(save_path, exist_ok=True)
    dataset = Event2012().load_data()
    df = dataset.sort_values(by='created_at').reset_index()
    ini_df = df[start:tweet_sum]
    G = construct_graph_from_df(ini_df)

    d_features = documents_to_features(df)
    print("Document features generated.")
    t_features = df_to_t_features(df)
    print("Time features generated.")
    combined_features = np.concatenate((d_features, t_features), axis=1)
    print("Concatenated document features and time features.")
    # The matrix is still saved for later use; it is no longer read back
    # immediately because the in-memory copy is identical.
    np.save(save_path + 'features_69612_0709_spacy_lg_zero_multiclasses_filtered.npy', combined_features)
    print("Initial features saved.")

    # Fix: features are computed for the whole dataframe, so each tweet must
    # be looked up by its row in ``df``. After ``reset_index`` the index label
    # of a row in ``ini_df`` is exactly that row number. The old code used the
    # position inside the slice, which is only correct when ``start`` is 0.
    feature_row = {}
    for row_label, tweet_id in zip(ini_df.index, ini_df["tweet_id"]):
        feature_row.setdefault(tweet_id, row_label)  # first occurrence wins

    A = nx.adjacency_matrix(G).todense().tolist()
    X = []
    tweet = []  # positions of tweet nodes inside the node list
    for i, node in enumerate(G.nodes):
        if node[0:2] == "t_":
            tweet.append(i)
            X.append(list(combined_features[feature_row[int(node[2:])]]))
    X = torch.tensor(X)

    # Two tweets are adjacent when they share at least one neighbour node.
    tweet_count = len(tweet)
    adj = np.array([[0] * tweet_count] * tweet_count)
    for i in range(tweet_count):
        for j in range(len(A)):
            if A[tweet[i]][j] == 1:
                for s in range(tweet_count):
                    if A[j][tweet[s]] == 1 and s != i:
                        adj[i][s] = 1

    edge_index = get_edge_index(adj)
    edge_index1 = copy.deepcopy(edge_index)
    edge_index2 = copy.deepcopy(edge_index)
    true_y = torch.tensor(list(ini_df['event_id']))

    # Drop a different random edge from each view until the budget is spent.
    drop_percent = 0.2
    i = 0
    while i < len(G.edges) * drop_percent:
        # Fix: two distinct indices are required. With a single edge left the
        # old loop never terminated, and with none ``randint`` raised.
        if len(edge_index1[0]) < 2:
            break
        m1 = random.randint(0, len(edge_index1[0]) - 1)
        m2 = random.randint(0, len(edge_index2[0]) - 1)
        if m1 == m2:
            continue
        del edge_index1[0][m1]
        del edge_index1[1][m1]
        del edge_index2[0][m2]
        del edge_index2[1][m2]
        i = i + 1

    edge_index = torch.tensor(edge_index)
    edge_index1 = torch.tensor(edge_index1)
    edge_index2 = torch.tensor(edge_index2)

    dict_graph = {}
    dict_graph['x'] = X
    dict_graph['x1'] = X
    dict_graph['x2'] = X
    dict_graph['edge_index'] = edge_index
    dict_graph['edge_index1'] = edge_index1
    dict_graph['edge_index2'] = edge_index2
    dict_graph['y'] = true_y
    return dict_graph
def getData(args, M_num):
    """Build and save every sub-graph of message block ``M_num``.

    The block is cut into consecutive windows that are each turned into a
    graph by ``get_data``; the resulting list is written with ``save_data``
    and returned.

    Parameters
    ----------
    args : object
        Must expose ``file_path``, the directory used for saving.
    M_num : int
        Index of the message block.
    """
    # Upper row boundary of each message block.
    boundaries = [20254, 28976, 30467, 32302, 34312, 36146, 37422, 42700, 44260, 45623, 46719,
                  47951, 51188, 53160, 56116, 58665, 59575, 62251, 64138, 65537, 66430, 68840]

    if M_num == 0:
        cursor, window = 0, 500
    else:
        cursor = boundaries[M_num - 1]
        if M_num == 1:
            window = 500
        else:
            span = boundaries[M_num] - boundaries[M_num - 1]
            # Large blocks are processed in windows of 1000 rows,
            # small ones in a single window.
            window = 1000 if span > 2000 else span

    block_end = boundaries[M_num]
    graphs = []
    finished = 0
    while True:
        if cursor + window >= block_end:
            # Last (possibly shorter) window of the block.
            graphs.append(get_data(M_num, cursor, block_end, args.file_path))
            break
        graphs.append(get_data(M_num, cursor, cursor + window, args.file_path))
        finished += 1
        print("***************Block "+str(finished)+" is done.****************")
        cursor += window

    save_data(graphs, args.file_path, M_num)
    return graphs
def save_data(data, save_path, M_num):
    """Pickle ``data`` to ``<save_path>/data_<M_num>.pkl``.

    The directory is created when it does not exist yet.
    """
    target = os.path.join(save_path, f'data_{M_num}.pkl')
    os.makedirs(save_path, exist_ok=True)
    with open(target, 'wb') as handle:
        pickle.dump(data, handle)
    print(f"Data saved at {target}")
def get_Graph_Dataset(args, message_number):
    """Load the graphs of one message block for graph-level contrastive learning.

    Reads ``<args.file_path>GCL-data/message_block_<message_number>.npy`` and
    wraps every stored record in a ``Data`` object.

    Returns
    -------
    tuple
        ``(loader, labels)``: a ``DataLoader`` over the graphs (batch size
        4096 for block 0, otherwise one batch with all graphs) and the list
        of per-graph labels.
    """
    print("\nBuilding graph-level social network...")
    began = time.time()

    block_file = args.file_path + 'GCL-data/message_block_'+str(message_number)+'.npy'
    records = np.load(block_file, allow_pickle=True)

    graphs = []
    label = []
    for record in records:
        graphs.append(Data(x=record['X'], x1=record['x1'], x2=record['x2'],
                           edge_index=record['edge_index'],
                           edge_index1=record['edge_index1'],
                           edge_index2=record['edge_index2']))
        label.append(record['label'])

    batch_size = 4096 if message_number == 0 else len(graphs)
    dataset = loader.DataLoader(graphs, batch_size=batch_size)

    run_time = time.time() - began
    print("Done! It takes "+str(int(run_time))+" seconds.\n")
    return dataset, label
def get_Node_Dataset(args, message_number):
    """Load the sub-graphs of one message block for node-level contrastive learning.

    A cached ``data_<message_number>.pkl`` under ``args.file_path`` is used
    when present; otherwise the block is built with ``getData``.

    Returns
    -------
    tuple
        ``(dataset, labels)``: the list of ``Data`` objects and the labels of
        all their nodes concatenated into one Python list.
    """
    print("\nBuilding node-level social network...")
    start_time = time.time()

    file_path = os.path.join(args.file_path, f'data_{message_number}.pkl')
    if os.path.exists(file_path):
        # NOTE(review): unpickling runs arbitrary code; only load cache files
        # produced by this project.
        with open(file_path, 'rb') as f:
            datas = pickle.load(f)
        print("Data loaded successfully.")
    else:
        print(f"No data file found at {file_path}")
        # Fix: ``getData`` takes ``(args, M_num)``; calling it with the block
        # number alone raised a TypeError.
        datas = getData(args, message_number)

    dataset = []
    labels = []
    for dict_data in datas:
        # Normalise every field to a tensor (entries are updated in place).
        for key in ('x', 'x1', 'x2', 'edge_index', 'edge_index1', 'edge_index2'):
            dict_data[key] = torch.tensor(np.array(dict_data[key]))
        data = Data(x=dict_data['x'], x1=dict_data['x1'], x2=dict_data['x2'],
                    edge_index=dict_data['edge_index'], edge_index1=dict_data['edge_index1'],
                    edge_index2=dict_data['edge_index2'])
        label = dict_data['y']
        if len(labels) == 0:
            labels = label
        else:
            labels = torch.cat([labels, label])
        dataset.append(data)

    end_time = time.time()
    run_time = end_time - start_time
    print("Done! It takes "+str(int(run_time))+" seconds.\n")
    return dataset, np.array(labels).tolist()
# Compute an embedding for every document in the dataframe; each document's
# embedding is the average of the pre-trained embeddings of all the words in it.
def documents_to_features(df):
    """Embed the ``filtered_words`` of every row with spaCy.

    The words of a row are joined with spaces and passed through the
    ``en_core_web_lg`` pipeline; the document vector of each row becomes one
    row of the returned array.
    """
    nlp = spacy.load("en_core_web_lg")

    def _embed(words):
        return nlp(' '.join(words)).vector

    vectors = df.filtered_words.apply(_embed).values
    return np.stack(vectors, axis=0)
# Encode one timestamp.
# t_str: a string of format '2012-10-11 07:19:34'
def extract_time_feature(t_str):
    """Encode a timestamp as two scaled floats.

    Parameters
    ----------
    t_str : str or datetime-like
        Value whose ``str()`` is an ISO timestamp such as
        ``'2012-10-11 07:19:34'``.

    Returns
    -------
    list of float
        ``[days / 100000, seconds_in_day / 86400]`` measured from
        1899-12-30 00:00:00.
    """
    epoch = datetime(1899, 12, 30)
    elapsed = datetime.fromisoformat(str(t_str)) - epoch
    day_part = float(elapsed.days) / 100000.
    time_part = float(elapsed.seconds) / 86400  # 86,400 seconds in a day
    return [day_part, time_part]
# Encode the timestamps of all the messages in the dataframe.
def df_to_t_features(df):
    """Return the time features of every row's ``created_at`` value.

    Each row of the result is the two-element encoding produced by
    ``extract_time_feature``.
    """
    encoded = list(map(extract_time_feature, df['created_at']))
    return np.asarray(encoded)