Source code for SocialED.detector.adpsemevent

import networkx as nx
from itertools import combinations, chain
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from sklearn.cluster import SpectralClustering
import sys
from datetime import datetime
import math
import pickle
import pandas as pd
import os
from os.path import exists
import time
import multiprocessing
import torch
from matplotlib import pyplot as plt
from networkx.algorithms import cuts
from sentence_transformers import SentenceTransformer
import re
from sklearn.model_selection import train_test_split
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


[docs]class ADPSEMEvent: """ADPSEMEvent class for event detection. This class implements adaptive semantic event detection. Args: dataset: Input dataset ... """ # 修复缩进问题 def __init__(self, dataset): self.dataset = dataset self.language = dataset.get_dataset_language() self.dataset_name = dataset.get_dataset_name() self.save_path = "../model/model_saved/adpsemevent/"+self.dataset_name+"/"
[docs] def preprocess(self): preprocessor = Preprocessor(self.dataset) preprocessor.preprocess()
[docs] def detection(self): ground_truths, predictions = run_hier_2D_SE_mini_closed_set(self.save_path, n=300, e_a=True, e_s=True) return ground_truths, predictions
[docs] def evaluate(self, ground_truths, predictions): """ Evaluate the model. """ # Calculate Normalized Mutual Information (NMI) nmi = metrics.normalized_mutual_info_score(ground_truths, predictions) print(f"Normalized Mutual Information (NMI): {nmi}") # Calculate Adjusted Mutual Information (AMI) ami = metrics.adjusted_mutual_info_score(ground_truths, predictions) print(f"Adjusted Mutual Information (AMI): {ami}") # Calculate Adjusted Rand Index (ARI) ari = metrics.adjusted_rand_score(ground_truths, predictions) print(f"Adjusted Rand Index (ARI): {ari}")
[docs]class Preprocessor:
[docs] def __init__(self, dataset, mode='close'): """Initialize preprocessor Args: dataset: Dataset calss (e.g. Event2012, Event2018, etc.) language: Language of the dataset (default 'English') mode: 'open' or 'close' (default 'close') - determines preprocessing mode """ self.dataset = dataset self.language = dataset.get_dataset_language() self.dataset_name = dataset.get_dataset_name() self.mode = mode self.columns = ['tweet_id', 'text', 'event_id', 'words', 'filtered_words', 'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions']
[docs] def get_closed_set_test_df(self, df): """Get closed set test dataframe""" save_path = f'../model/model_saved/adpsemevent/{self.dataset_name}/closed_set/' if not exists(save_path): os.makedirs(save_path) test_set_df_np_path = save_path + 'test_set.npy' if not exists(test_set_df_np_path): # Use 2012-style processing for all datasets test_mask = torch.load(f'../model/model_saved/adpsemevent/{self.dataset_name}/masks/test_mask.pt').cpu().detach().numpy() test_mask = list(np.where(test_mask==True)[0]) test_df = df.iloc[test_mask] test_df_np = test_df.to_numpy() np.save(test_set_df_np_path, test_df_np) return
[docs] def get_closed_set_messages_embeddings(self): """Get SBERT embeddings for closed set messages""" save_path = f'../model/model_saved/adpsemevent/{self.dataset_name}/closed_set/' SBERT_embedding_path = f'{save_path}/SBERT_embeddings.pkl' if not exists(SBERT_embedding_path): test_set_df_np_path = save_path + 'test_set.npy' test_df_np = np.load(test_set_df_np_path, allow_pickle=True) test_df = pd.DataFrame(data=test_df_np, columns=self.columns) print("Dataframe loaded.") processed_text = [preprocess_sentence(s) for s in test_df['text'].values] print('message text contents preprocessed.') embeddings = SBERT_embed(processed_text, language=self.language) with open(SBERT_embedding_path, 'wb') as fp: pickle.dump(embeddings, fp) print('SBERT embeddings stored.') return
[docs] def get_open_set_messages_embeddings(self): """Get SBERT embeddings for open set messages""" save_path = f'../model/model_saved/adpsemevent/{self.dataset_name}/open_set/' num_blocks = 21 # Use 2012-style processing for all datasets for i in range(num_blocks): block = i + 1 print('\n\n====================================================') print('block: ', block) SBERT_embedding_path = f'{save_path}{block}/SBERT_embeddings.pkl' if not exists(SBERT_embedding_path): df_np = np.load(f'{save_path}{block}/{block}.npy', allow_pickle=True) df = pd.DataFrame(data=df_np, columns=self.columns + ['original_index', 'date']) print("Dataframe loaded.") df['processed_text'] = [preprocess_sentence(s) for s in df['text']] print('message text contents preprocessed.') embeddings = SBERT_embed(df['processed_text'].tolist(), language=self.language) with open(SBERT_embedding_path, 'wb') as fp: pickle.dump(embeddings, fp) print('SBERT embeddings stored.') return
[docs] def split_open_set(self, df, root_path): """Split data into open set blocks""" if not exists(root_path): os.makedirs(root_path) df = df.sort_values(by='created_at').reset_index() df['date'] = [d.date() for d in df['created_at']] distinct_dates = df.date.unique() # First week -> block 0 folder = root_path + '0/' if not exists(folder): os.mkdir(folder) df_np_path = folder + '0.npy' if not exists(df_np_path): ini_df = df.loc[df['date'].isin(distinct_dates[:7])] ini_df_np = ini_df.to_numpy() np.save(df_np_path, ini_df_np) # Following dates -> block 1, 2, ... end = len(distinct_dates) - 1 # Use 2012-style processing for i in range(7, end): folder = root_path + str(i - 6) + '/' if not exists(folder): os.mkdir(folder) df_np_path = folder + str(i - 6) + '.npy' if not exists(df_np_path): incr_df = df.loc[df['date'] == distinct_dates[i]] incr_df_np = incr_df.to_numpy() np.save(df_np_path, incr_df_np) return
[docs] def preprocess(self): """Main preprocessing function""" # Load raw data using 2012-style processing df_np = self.dataset.load_data() print("Loaded data.") df = pd.DataFrame(data=df_np, columns=self.columns) print("Data converted to dataframe.") if self.mode == 'open': # Open-set setting root_path = f'../model/model_saved/adpsemevent/{self.dataset_name}/open_set/' self.split_open_set(df, root_path) self.get_open_set_messages_embeddings() else: # Close-set setting # Create masks directory and generate train/val/test splits save_dir = os.path.join(f'../model/model_saved/adpsemevent/{self.dataset_name}', 'masks') os.makedirs(save_dir, exist_ok=True) # Split and save masks self.split_and_save_masks(df, save_dir) print("Generated and saved train/val/test masks.") self.get_closed_set_test_df(df) self.get_closed_set_messages_embeddings() return
[docs] def split_and_save_masks(self, df, save_dir, train_size=0.7, val_size=0.1, test_size=0.2, random_seed=42): """ Splits the DataFrame into training, validation, and test sets, and saves the indices (masks) as .pt files. Parameters: - df (pd.DataFrame): The DataFrame to be split - save_dir (str): Directory to save the masks - train_size (float): Proportion for training (default 0.7) - val_size (float): Proportion for validation (default 0.1) - test_size (float): Proportion for testing (default 0.2) - random_seed (int): Random seed for reproducibility """ if train_size + val_size + test_size != 1.0: raise ValueError("train_size + val_size + test_size must equal 1.0") if df.empty: raise ValueError("The input DataFrame is empty.") print(f"Total samples in DataFrame: {len(df)}") # Set random seed torch.manual_seed(random_seed) # Split into train and temp train_data, temp_data = train_test_split(df, train_size=train_size, random_state=random_seed) # Split temp into val and test val_data, test_data = train_test_split(temp_data, train_size=val_size/(val_size + test_size), random_state=random_seed) # Create boolean masks full_train_mask = torch.zeros(len(df), dtype=torch.bool) full_val_mask = torch.zeros(len(df), dtype=torch.bool) full_test_mask = torch.zeros(len(df), dtype=torch.bool) # Set indices full_train_mask[train_data.index] = True full_val_mask[val_data.index] = True full_test_mask[test_data.index] = True print(f"Training samples: {full_train_mask.sum()}") print(f"Validation samples: {full_val_mask.sum()}") print(f"Test samples: {full_test_mask.sum()}") # Save masks mask_paths = { 'train_mask.pt': full_train_mask, 'val_mask.pt': full_val_mask, 'test_mask.pt': full_test_mask } for filename, mask in mask_paths.items(): mask_path = os.path.join(save_dir, filename) if not os.path.exists(mask_path): try: torch.save(mask, mask_path) print(f"Saved {filename}") except Exception as e: print(f"Error saving {filename}: {e}") else: print(f"{filename} already exists") # Verify saved file if os.path.exists(mask_path): saved_mask = torch.load(mask_path) if saved_mask.numel() == 0: print(f"Warning: {filename} is empty") else: print(f"Verified {filename} with {saved_mask.numel()} elements") print("Mask generation completed")
[docs]def get_stable_point(path, if_updata, epsilon): stable_point_path = path + f'stable_point_{epsilon}.pkl' if not exists(stable_point_path) or if_updata == True: embeddings_path = path + 'SBERT_embeddings.pkl' with open(embeddings_path, 'rb') as f: embeddings = pickle.load(f) first_stable_point, global_stable_point, Sensitivity = search_stable_points(embeddings, epsilon, path) stable_points = {'first': first_stable_point, 'global': global_stable_point} with open(stable_point_path, 'wb') as fp: pickle.dump(stable_points, fp) print('stable points stored.') with open(stable_point_path, 'rb') as f: stable_points = pickle.load(f) print('stable points loaded.') return stable_points, Sensitivity
[docs]def run_hier_2D_SE_mini_open_set(save_path, n=400, e_a=True, e_s=True, test_with_one_block=True, epsilon=0.2): if test_with_one_block: blocks = [16] else: blocks = [i+1 for i in range(20) if i+1>=1] for block in blocks: print('\n\n====================================================') print('block: ', block) print(datetime.now().strftime("%H:%M:%S")) folder = f'{save_path}{block}/' # Load message embeddings embeddings_path = folder + 'SBERT_embeddings.pkl' with open(embeddings_path, 'rb') as f: embeddings = pickle.load(f) # Load and process dataframe df_np = np.load(f'{folder}{block}.npy', allow_pickle=True) columns = ['tweet_id', 'text', 'event_id', 'words', 'filtered_words', 'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions'] df = pd.DataFrame(data=df_np, columns=columns) all_node_features = [list(set([str(u)] + \ [str(each) for each in um] + \ [h.lower() for h in hs] + \ e)) \ for u, um, hs, e in \ zip(df['user_id'], df['user_mentions'], df['hashtags'], df['entities'])] start_time = time.time() stable_points, Sensitivity = get_stable_point(folder, if_updata=True, epsilon=epsilon) if e_a == False: # only rely on e_s (semantic-similarity-based edges) default_num_neighbors = stable_points['global'] else: default_num_neighbors = stable_points['first'] if default_num_neighbors == 0: default_num_neighbors = math.ceil((len(embeddings)/1000)*10) global_edges = get_global_edges(all_node_features, epsilon, folder, default_num_neighbors, e_a=e_a, e_s=e_s) corr_matrix = np.load(f"{folder}corr_matrix_{epsilon}.npy") weighted_global_edges = [(edge[0], edge[1], corr_matrix[edge[0]-1, edge[1]-1]) for edge in global_edges \ if corr_matrix[edge[0]-1, edge[1]-1] > 0] division = hier_2D_SE_mini(weighted_global_edges, len(embeddings), n=n) print(datetime.now().strftime("%H:%M:%S")) prediction = decode(division) labels_true = df['event_id'].tolist() n_clusters = len(list(set(labels_true))) print('n_clusters gt: ', n_clusters) nmi, ami, ari = evaluate_labels(labels_true, prediction) print('n_clusters pred: ', len(division)) print('nmi: ', nmi) print('ami: ', ami) print('ari: ', ari) with open(f"open_set_{epsilon}.txt", 'a') as f: f.write("block:" + str(block) + '\n') f.write("division:"+str(division)+ '\n') f.write('Runtime: ' + str(time.time() - start_time) + " Seconds" + '\n') f.write('n_clusters gt: '+ str(len(list(set(labels_true))))+ '\n') f.write('n_clusters pred: ' + str(len(division)) + '\n') f.write('epsilon: ' + str(epsilon) + '\n') f.write('n: ' + str(n) + '\n') f.write('Sensitivity: ' + str(Sensitivity) + '\n') f.write('nmi: ' + str(nmi) + '\n') f.write('ami: ' + str(ami) + '\n') f.write('ari: ' + str(ari) + '\n' + '\n') return
[docs]def run_hier_2D_SE_mini_closed_set(save_path, n=300, e_a=True, e_s=True, epsilon=None): save_path = save_path + 'closed_set/' # Load test set dataframe test_set_df_np_path = save_path + 'test_set.npy' test_df_np = np.load(test_set_df_np_path, allow_pickle=True) columns = ['tweet_id', 'text', 'event_id', 'words', 'filtered_words', 'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions'] test_df = pd.DataFrame(data=test_df_np, columns=columns) print("Dataframe loaded.") all_node_features = [[str(u)] + \ [str(each) for each in (um if isinstance(um, (list, tuple)) else [])] + \ [str(h).lower() if isinstance(h, str) else str(h) for h in (hs if isinstance(hs, (list, tuple)) else [])] + \ [str(e) for e in (e if isinstance(e, (list, tuple)) else [])] \ for u, um, hs, e in \ zip(test_df['user_id'], test_df['user_mentions'], test_df['hashtags'], test_df['entities'])] # Load embeddings with open(f'{save_path}/SBERT_embeddings.pkl', 'rb') as f: embeddings = pickle.load(f) start_time = time.time() stable_points, Sensitivity = get_stable_point(save_path, if_updata=True, epsilon=epsilon) default_num_neighbors = stable_points['first'] global_edges = get_global_edges(all_node_features, epsilon, save_path, default_num_neighbors, e_a=e_a, e_s=e_s) corr_matrix = np.load(f"{save_path}corr_matrix_{epsilon}.npy") weighted_global_edges = [(edge[0], edge[1], corr_matrix[edge[0]-1, edge[1]-1]) for edge in global_edges \ if corr_matrix[edge[0]-1, edge[1]-1] > 0] division = hier_2D_SE_mini(weighted_global_edges, len(embeddings), n=n) prediction = decode(division) labels_true = test_df['event_id'].tolist() n_clusters = len(list(set(labels_true))) print('n_clusters gt: ', n_clusters) print('n_clusters pred: ', len(division)) return labels_true, prediction
[docs]def create_process_open_set(epsilon): target = run_hier_2D_SE_mini_open_set kwargs = { "n": 100, "e_a": True, "e_s": True, "test_with_one_block": True, "epsilon": epsilon } p = multiprocessing.Process(target=target, kwargs=kwargs) p.start() return p
[docs]def create_process_closed_set(epsilon): target = run_hier_2D_SE_mini_closed_set n = 300 kwargs = { "n": n, "e_a": True, "e_s": True, "epsilon": epsilon } p = multiprocessing.Process(target=target, kwargs=kwargs) p.start() return p
[docs]def run_processes(epsilons, dataset_name, mode='close'): if mode == 'open': processes = [create_process_open_set(dataset_name, epsilon) for epsilon in epsilons] else: processes = [create_process_closed_set(dataset_name, epsilon) for epsilon in epsilons] for process in processes: process.join() print("All processes have completed their tasks.")
[docs]def make_symmetric(matrix): return np.triu(matrix) + np.triu(matrix, 1).T
[docs]def search_stable_points(embeddings, epsilon, path, max_num_neighbors = 200): print("size_of_embeddings",len(embeddings)) corr_matrix = np.corrcoef(embeddings) np.fill_diagonal(corr_matrix, 0) print("epsilon=",epsilon) s = -1 if epsilon != None: max_ = np.max(corr_matrix) min_ = np.min(corr_matrix) print("Local Sensitivity:",(max_- min_)) # delta = 10e-6 delta = 1 / len(embeddings)**2 beta = epsilon / (2 * np.log(2/delta)) S = np.exp(-beta) * (max_- min_) * 2 print("Smooth Sensitivity:", S) if S < 2: s = S else: s = 2 print("Sensitivity=",s) corr_matrix = [[i+np.random.laplace(loc=0, scale=s/epsilon) for i in corr_matrix_] for corr_matrix_ in corr_matrix] corr_matrix = np.array(corr_matrix) corr_matrix = make_symmetric(corr_matrix) np.fill_diagonal(corr_matrix, 0) print(f"{path}"+f'corr_matrix_{epsilon}.npy') np.save(f"{path}"+f'corr_matrix_{epsilon}.npy', corr_matrix) corr_matrix_sorted_indices = np.argsort(corr_matrix) all_1dSEs = [] seg = None for i in range(max_num_neighbors): dst_ids = corr_matrix_sorted_indices[:, -(i+1)] knn_edges = [(s+1, d+1, corr_matrix[s, d]) \ for s, d in enumerate(dst_ids) if corr_matrix[s, d] > 0] # (s+1, d+1): +1 as node indexing starts from 1 instead of 0 if i == 0: g = nx.Graph() g.add_weighted_edges_from(knn_edges) seg = SE(g) all_1dSEs.append(seg.calc_1dSE()) else: all_1dSEs.append(seg.update_1dSE(all_1dSEs[-1], knn_edges)) #print('all_1dSEs: ', all_1dSEs) stable_indices = [] for i in range(1, len(all_1dSEs) - 1): if all_1dSEs[i] < all_1dSEs[i - 1] and all_1dSEs[i] < all_1dSEs[i + 1]: stable_indices.append(i) if len(stable_indices) == 0: print('No stable points found after checking k = 1 to ', max_num_neighbors) return 0, 0, s else: stable_SEs = [all_1dSEs[index] for index in stable_indices] index = stable_indices[stable_SEs.index(min(stable_SEs))] print('stable_indices: ', stable_indices) print('stable_SEs: ', stable_SEs) print('First stable point: k = ', stable_indices[0]+1, ', correspoding 1dSE: ', stable_SEs[0]) # n_neighbors should be index + 1 print('Global stable point within the searching range: k = ', index + 1, \ ', correspoding 1dSE: ', all_1dSEs[index]) # n_neighbors should be index + 1 return stable_indices[0]+1, index + 1, s # first stable point, global stable point
[docs]def get_graph_edges(attributes): attr_nodes_dict = {} for i, l in enumerate(attributes): for attr in l: if attr not in attr_nodes_dict: attr_nodes_dict[attr] = [i+1] # node indexing starts from 1 else: attr_nodes_dict[attr].append(i+1) for attr in attr_nodes_dict.keys(): attr_nodes_dict[attr].sort() graph_edges = [] for l in attr_nodes_dict.values(): graph_edges += list(combinations(l, 2)) return list(set(graph_edges))
[docs]def get_knn_edges(epsilon, path, default_num_neighbors): # corr_matrix = np.corrcoef(embeddings) # np.fill_diagonal(corr_matrix, 0) corr_matrix = np.load(f"{path}"+f'corr_matrix_{epsilon}.npy') corr_matrix_sorted_indices = np.argsort(corr_matrix) knn_edges = [] for i in range(default_num_neighbors): dst_ids = corr_matrix_sorted_indices[:, -(i+1)] knn_edges += [(s+1, d+1) if s < d else (d+1, s+1) \ for s, d in enumerate(dst_ids) if corr_matrix[s, d] > 0] # (s+1, d+1): +1 as node indexing starts from 1 instead of 0 return list(set(knn_edges))
[docs]def get_global_edges(attributes, epsilon, folder, default_num_neighbors, e_a = True, e_s = True): graph_edges, knn_edges = [], [] if e_a == True: graph_edges = get_graph_edges(attributes) if e_s == True: knn_edges = get_knn_edges(epsilon, folder, default_num_neighbors) return list(set(knn_edges + graph_edges))
[docs]def get_subgraphs_edges(clusters, graph_splits, weighted_global_edges): """Get subgraph edges. Args: clusters: a list containing the current clusters, each cluster is a list of nodes of the original graph graph_splits: a list of (start_index, end_index) pairs, each (start_index, end_index) pair indicates a subset of clusters, which will serve as the nodes of a new subgraph weighted_global_edges: a list of (start node, end node, edge weight) tuples, each tuple is an edge in the original graph Returns: all_subgraphs_edges: a list containing the edges of all subgraphs """ # 修复缩进和块引用问题 all_subgraphs_edges = [] for split in graph_splits: subgraph_clusters = clusters[split[0]:split[1]] subgraph_nodes = list(chain(*subgraph_clusters)) subgraph_edges = [edge for edge in weighted_global_edges if edge[0] in subgraph_nodes and edge[1] in subgraph_nodes] all_subgraphs_edges.append(subgraph_edges) return all_subgraphs_edges
[docs]def get_best_egde(adj_matrix_, subgraphs_, all_subgraphs): adj_matrix = adj_matrix_.copy() mask_nodes = list(set(all_subgraphs+subgraphs_)) if len(mask_nodes) >0: adj_matrix[mask_nodes, :] = 0 adj_matrix[:, mask_nodes] = 0 flat_index = np.argmax(adj_matrix) egde = np.unravel_index(flat_index, adj_matrix.shape) weight = adj_matrix[egde] if weight > 0: return list(egde), weight else: print("There is no egdes in current G") return -1, -1
[docs]def get_best_node(adj_matrix_, subgraphs_, all_subgraphs): adj_matrix = adj_matrix_.copy() mask_nodes = list(set(all_subgraphs+subgraphs_)) nodes_to_modify = np.array(mask_nodes) adj_matrix[np.ix_(nodes_to_modify, nodes_to_modify)] = 0 distance = adj_matrix[subgraphs_].sum(axis=0) distance_sort_arg = np.argsort(distance)[::-1] distance_sort = np.sort(distance)[::-1] avg = np.mean(distance[distance>0]) indices = distance_sort[distance_sort>avg] if len(indices) > 0: return distance_sort_arg[:len(indices)].tolist(), distance_sort[:len(indices)].tolist() else: print("There are no edges connected to the current subgraph") return -1, -1
[docs]def get_subgraphs(adj_matrix, division, n, k_max): merged_rows_matrix = np.vstack([ adj_matrix[np.array(ls_)-1].sum(axis=0).tolist() for ls_ in division ]) final_sum = np.array([ merged_rows_matrix[:, np.array(ls_)-1].sum(axis=1).tolist() for ls_ in division ] ) np.fill_diagonal(final_sum, 0) G = nx.from_numpy_array(final_sum) subgraphs = [] all_subgraphs = [] for k in range(k_max): subgraphs_ = [] if len(final_sum) - len(all_subgraphs)<= n: G.remove_nodes_from(all_subgraphs) subgraphs_ = list(G.nodes) subgraphs.append(subgraphs_) print(len(subgraphs_), subgraphs_) break max_edge_or_node, max_weight = get_best_egde(final_sum, subgraphs_, all_subgraphs) subgraphs_.extend(max_edge_or_node) all_subgraphs.extend(max_edge_or_node) while True: if len(subgraphs_) >= n: break node_, weight_ = get_best_node(final_sum, subgraphs_, all_subgraphs) if node_ == -1: max_edge_or_node, max_weight = get_best_egde(final_sum, subgraphs_, all_subgraphs) subgraphs_.extend(max_edge_or_node) all_subgraphs.extend(max_edge_or_node) continue else: if len(subgraphs_) + len(node_) > n: index_ = n - len(subgraphs_) subgraphs_.extend(node_[:index_]) all_subgraphs.extend(node_[:index_]) else: subgraphs_.extend(node_) all_subgraphs.extend(node_) subgraphs.append(subgraphs_) # print(len(subgraphs_), subgraphs_) # subgraphs = [[element + 1 for element in row] for row in subgraphs] new_division = [] for subgraphs_index in subgraphs: new_division_ = [] for index in subgraphs_index: new_division_.append(division[index]) new_division.append(new_division_) return new_division
[docs]def hier_2D_SE_mini(weighted_global_edges, n_messages, n = 100): ''' hierarchical 2D SE minimization ''' ite = 0 # initially, each node (message) is in its own cluster # node encoding starts from 1 G = nx.Graph() G.add_weighted_edges_from(weighted_global_edges) adj_matrix = nx.to_numpy_array(G) clusters = [[i] for i in list(G.nodes)] while True: ite += 1 print('\n=========Iteration ', str(ite), '=========') n_clusters = len(clusters) graph_splits = [(s, min(s+n, n_clusters)) for s in range(0, n_clusters, n)] # [s, e) # all_subgraphs_edges = get_subgraphs_edges(clusters, graph_splits, weighted_global_edges) if 1: subgraphs = get_subgraphs(adj_matrix, clusters, n, len(graph_splits)) all_subgraphs_edges = [] for subgraph_nodes in subgraphs: subgraph_nodes = [str(item) for sublist in subgraph_nodes for item in sublist] subgraph_edges = [(int(edge[0]),int(edge[1]),edge[2]) for edge in weighted_global_edges if str(edge[0]) in subgraph_nodes and str(edge[1]) in subgraph_nodes] all_subgraphs_edges.append(subgraph_edges) else: all_subgraphs_edges = get_subgraphs_edges(clusters, graph_splits, weighted_global_edges) last_clusters = clusters print(f"the number of clusters: {len(last_clusters)}") clusters = [] for i, subgraph_edges in enumerate(all_subgraphs_edges): print('\tSubgraph ', str(i+1)) g = nx.Graph() g.add_weighted_edges_from(subgraph_edges) seg = SE(g) if 1: seg.division = {j: cluster for j, cluster in enumerate(subgraphs[i]) } # print({j: cluster for j, cluster in enumerate(subgraphs[i]) }) else: seg.division = {j: cluster for j, cluster in enumerate(last_clusters[graph_splits[i][0]:graph_splits[i][1]])} # print(seg.division) seg.add_isolates() for k in seg.division.keys(): for node in seg.division[k]: seg.graph.nodes[node]['comm'] = k seg.update_struc_data() seg.update_struc_data_2d() seg.update_division_MinSE() print(f"size of subgraph{str(i+1)}: {len(subgraphs[i])} to {len(list(seg.division.values()))}") clusters += list(seg.division.values()) if len(graph_splits) == 1: break if clusters == last_clusters: n *= 2 return clusters
[docs]class SE: def __init__(self, graph: nx.Graph): self.graph = graph.copy() self.vol = self.get_vol() self.division = {} # {comm1: [node11, node12, ...], comm2: [node21, node22, ...], ...} self.struc_data = {} # {comm1: [vol1, cut1, community_node_SE, leaf_nodes_SE], comm2:[vol2, cut2, community_node_SE, leaf_nodes_SE],... } self.struc_data_2d = {} # {comm1: {comm2: [vol_after_merge, cut_after_merge, comm_node_SE_after_merge, leaf_nodes_SE_after_merge], comm3: [], ...}, ...}
[docs] def get_vol(self): ''' get the volume of the graph ''' return cuts.volume(self.graph, self.graph.nodes, weight = 'weight')
[docs] def calc_1dSE(self): ''' get the 1D SE of the graph ''' SE = 0 for n in self.graph.nodes: d = cuts.volume(self.graph, [n], weight = 'weight') SE += - (d / self.vol) * math.log2(d / self.vol) return SE
[docs] def update_1dSE(self, original_1dSE, new_edges): ''' get the updated 1D SE after new edges are inserted into the graph ''' affected_nodes = [] for edge in new_edges: affected_nodes += [edge[0], edge[1]] affected_nodes = set(affected_nodes) original_vol = self.vol original_degree_dict = {node:0 for node in affected_nodes} for node in affected_nodes.intersection(set(self.graph.nodes)): original_degree_dict[node] = self.graph.degree(node, weight = 'weight') # insert new edges into the graph self.graph.add_weighted_edges_from(new_edges) self.vol = self.get_vol() updated_vol = self.vol updated_degree_dict = {} for node in affected_nodes: updated_degree_dict[node] = self.graph.degree(node, weight = 'weight') updated_1dSE = (original_vol / updated_vol) * (original_1dSE - math.log2(original_vol / updated_vol)) for node in affected_nodes: d_original = original_degree_dict[node] d_updated = updated_degree_dict[node] if d_original != d_updated: if d_original != 0: updated_1dSE += (d_original / updated_vol) * math.log2(d_original / updated_vol) updated_1dSE -= (d_updated / updated_vol) * math.log2(d_updated / updated_vol) return updated_1dSE
[docs] def get_cut(self, comm): ''' get the sum of the degrees of the cut edges of community comm ''' return cuts.cut_size(self.graph, comm, weight = 'weight')
[docs] def get_volume(self, comm): ''' get the volume of community comm ''' return cuts.volume(self.graph, comm, weight = 'weight')
[docs] def calc_2dSE(self): ''' get the 2D SE of the graph ''' SE = 0 for comm in self.division.values(): g = self.get_cut(comm) v = self.get_volume(comm) SE += - (g / self.vol) * math.log2(v / self.vol) for node in comm: d = self.graph.degree(node, weight = 'weight') SE += - (d / self.vol) * math.log2(d / v) return SE
[docs] def show_division(self): print(self.division) return self.division
[docs] def show_struc_data(self): print(self.struc_data)
[docs] def show_struc_data_2d(self): print(self.struc_data_2d) return self.struc_data_2d
[docs] def print_graph(self): fig, ax = plt.subplots() nx.draw(self.graph, ax=ax, with_labels=True) plt.show()
[docs] def update_struc_data(self): ''' calculate the volume, cut, communitiy mode SE, and leaf nodes SE of each cummunity, then store them into self.struc_data ''' self.struc_data = {} # {comm1: [vol1, cut1, community_node_SE, leaf_nodes_SE], comm2:[vol2, cut2, community_node_SE, leaf_nodes_SE],... } for vname in self.division.keys(): comm = self.division[vname] volume = self.get_volume(comm) cut = self.get_cut(comm) if volume == 0: vSE = 0 else: vSE = - (cut / self.vol) * math.log2(volume / self.vol) vnodeSE = 0 for node in comm: d = self.graph.degree(node, weight = 'weight') if d != 0: vnodeSE -= (d / self.vol) * math.log2(d / volume) self.struc_data[vname] = [volume, cut, vSE, vnodeSE]
[docs] def update_struc_data_2d(self): ''' calculate the volume, cut, communitiy mode SE, and leaf nodes SE after merging each pair of cummunities, then store them into self.struc_data_2d ''' self.struc_data_2d = {} # {(comm1, comm2): [vol_after_merge, cut_after_merge, comm_node_SE_after_merge, leaf_nodes_SE_after_merge], (comm1, comm3): [], ...} comm_num = len(self.division) for i in range(comm_num): for j in range(i + 1, comm_num): v1 = list(self.division.keys())[i] v2 = list(self.division.keys())[j] if v1 < v2: k = (v1, v2) else: k = (v2, v1) comm_merged = self.division[v1] + self.division[v2] gm = self.get_cut(comm_merged) vm = self.struc_data[v1][0] + self.struc_data[v2][0] if self.struc_data[v1][0] == 0 or self.struc_data[v2][0] == 0: vmSE = self.struc_data[v1][2] + self.struc_data[v2][2] vmnodeSE = self.struc_data[v1][3] + self.struc_data[v2][3] else: vmSE = - (gm / self.vol) * math.log2(vm / self.vol) vmnodeSE = self.struc_data[v1][3] - (self.struc_data[v1][0]/ self.vol) * math.log2(self.struc_data[v1][0] / vm) + \ self.struc_data[v2][3] - (self.struc_data[v2][0]/ self.vol) * math.log2(self.struc_data[v2][0] / vm) self.struc_data_2d[k] = [vm, gm, vmSE, vmnodeSE]
[docs] def init_division(self): ''' initialize self.division such that each node assigned to its own community ''' self.division = {} for node in self.graph.nodes: new_comm = node self.division[new_comm] = [node] self.graph.nodes[node]['comm'] = new_comm
[docs] def add_isolates(self): ''' add any isolated nodes into graph ''' all_nodes = list(chain(*list(self.division.values()))) all_nodes.sort() edge_nodes = list(self.graph.nodes) edge_nodes.sort() if all_nodes != edge_nodes: for node in set(all_nodes)-set(edge_nodes): self.graph.add_node(node)
[docs] def update_division_MinSE(self): ''' greedily update the encoding tree to minimize 2D SE ''' def Mg_operator(v1, v2): ''' MERGE operator. It calculates the delta SE caused by mergeing communities v1 and v2, without actually merging them, i.e., the encoding tree won't be changed ''' v1SE = self.struc_data[v1][2] v1nodeSE = self.struc_data[v1][3] v2SE = self.struc_data[v2][2] v2nodeSE = self.struc_data[v2][3] if v1 < v2: k = (v1, v2) else: k = (v2, v1) vm, gm, vmSE, vmnodeSE = self.struc_data_2d[k] delta_SE = vmSE + vmnodeSE - (v1SE + v1nodeSE + v2SE + v2nodeSE) return delta_SE # continue merging any two communities that can cause the largest decrease in SE, # until the SE can't be further reduced while True: comm_num = len(self.division) delta_SE = 99999 vm1 = None vm2 = None for i in range(comm_num): for j in range(i + 1, comm_num): v1 = list(self.division.keys())[i] v2 = list(self.division.keys())[j] new_delta_SE = Mg_operator(v1, v2) if new_delta_SE < delta_SE: delta_SE = new_delta_SE vm1 = v1 vm2 = v2 if delta_SE < 0: # Merge v2 into v1, and update the encoding tree accordingly for node in self.division[vm2]: self.graph.nodes[node]['comm'] = vm1 self.division[vm1] += self.division[vm2] self.division.pop(vm2) volume = self.struc_data[vm1][0] + self.struc_data[vm2][0] cut = self.get_cut(self.division[vm1]) vmSE = - (cut / self.vol) * math.log2(volume / self.vol) vmnodeSE = self.struc_data[vm1][3] - (self.struc_data[vm1][0]/ self.vol) * math.log2(self.struc_data[vm1][0] / volume) + \ self.struc_data[vm2][3] - (self.struc_data[vm2][0]/ self.vol) * math.log2(self.struc_data[vm2][0] / volume) self.struc_data[vm1] = [volume, cut, vmSE, vmnodeSE] self.struc_data.pop(vm2) struc_data_2d_new = {} for k in self.struc_data_2d.keys(): if k[0] == vm2 or k[1] == vm2: continue elif k[0] == vm1 or k[1] == vm1: v1 = k[0] v2 = k[1] comm_merged = self.division[v1] + self.division[v2] gm = self.get_cut(comm_merged) vm = self.struc_data[v1][0] + self.struc_data[v2][0] if self.struc_data[v1][0] == 0 or self.struc_data[v2][0] == 0: vmSE = self.struc_data[v1][2] + self.struc_data[v2][2] vmnodeSE = self.struc_data[v1][3] + self.struc_data[v2][3] else: vmSE = - (gm / self.vol) * math.log2(vm / self.vol) vmnodeSE = self.struc_data[v1][3] - (self.struc_data[v1][0]/ self.vol) * math.log2(self.struc_data[v1][0] / vm) + \ self.struc_data[v2][3] - (self.struc_data[v2][0]/ self.vol) * math.log2(self.struc_data[v2][0] / vm) struc_data_2d_new[k] = [vm, gm, vmSE, vmnodeSE] else: struc_data_2d_new[k] = self.struc_data_2d[k] self.struc_data_2d = struc_data_2d_new else: break
[docs]def vanilla_2D_SE_mini(weighted_edges): ''' vanilla (greedy) 2D SE minimization ''' g = nx.Graph() g.add_weighted_edges_from(weighted_edges) seg = SE(g) seg.init_division() #seg.show_division() SE1D = seg.calc_1dSE() seg.update_struc_data() #seg.show_struc_data() seg.update_struc_data_2d() #seg.show_struc_data_2d() initial_SE2D = seg.calc_2dSE() seg.update_division_MinSE() communities = seg.division minimized_SE2D = seg.calc_2dSE() return SE1D, initial_SE2D, minimized_SE2D, communities
[docs]def test_vanilla_2D_SE_mini(): weighted_edges = [(1, 2, 2), (1, 3, 4)] g = nx.Graph() g.add_weighted_edges_from(weighted_edges) A = nx.adjacency_matrix(g).todense() print('adjacency matrix: \n', A) print('g.nodes: ', g.nodes) print('g.edges: ', g.edges) print('degrees of nodes: ', list(g.degree(g.nodes, weight = 'weight'))) SE1D, initial_SE2D, minimized_SE2D, communities = vanilla_2D_SE_mini(weighted_edges) print('\n1D SE of the graph: ', SE1D) print('initial 2D SE of the graph: ', initial_SE2D) print('the minimum 2D SE of the graph: ', minimized_SE2D) print('communities detected: ', communities) return
[docs]def replaceAtUser(text): """ Replaces "@user" with "" """ text = re.sub('@[^\s]+|RT @[^\s]+','',text) return text
[docs]def removeUnicode(text): """ Removes unicode strings like "\u002c" and "x96" """ text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text) text = re.sub(r'[^\x00-\x7f]',r'',text) return text
[docs]def replaceURL(text): """ Replaces url address with "url" """ text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','url',text) text = re.sub(r'#([^\s]+)', r'\1', text) return text
[docs]def replaceMultiExclamationMark(text): """ Replaces repetitions of exlamation marks """ text = re.sub(r"(\!)\1+", '!', text) return text
[docs]def replaceMultiQuestionMark(text): """ Replaces repetitions of question marks """ text = re.sub(r"(\?)\1+", '?', text) return text
[docs]def removeEmoticons(text): """ Removes emoticons from text """ text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', text) return text
[docs]def removeNewLines(text): text = re.sub('\n', '', text) return text
[docs]def preprocess_sentence(s): return removeNewLines(replaceAtUser(removeEmoticons(replaceMultiQuestionMark(replaceMultiExclamationMark(removeUnicode(replaceURL(s)))))))
[docs]def preprocess_french_sentence(s): return removeNewLines(replaceAtUser(removeEmoticons(replaceMultiQuestionMark(replaceMultiExclamationMark(replaceURL(s))))))
[docs]def SBERT_embed(s_list, language): ''' Use Sentence-BERT to embed sentences. s_list: a list of sentences/ tokens to be embedded. language: the language of the sentences ('English', 'French', 'Arabic'). output: the embeddings of the sentences/ tokens. ''' # Model paths or names for each language model_map = { 'English': '../model/model_needed/all-MiniLM-L6-v2', 'French': '../model/model_needed/distiluse-base-multilingual-cased-v1', 'Arabic': '../model/model_needed/paraphrase-multilingual-mpnet-base-v2' } # Default model for Hugging Face hf_model_map = { 'English': 'sentence-transformers/all-MiniLM-L6-v2', 'French': 'sentence-transformers/distiluse-base-multilingual-cased-v1', 'Arabic': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' } # Print language and model being used print(f"Embedding sentences in language: {language}") # Determine model path model_path = model_map.get(language) if not model_path: raise ValueError(f"Unsupported language: {language}. Supported languages are: {', '.join(model_map.keys())}") print(f"Using model: {model_path}") # Load the model, downloading if necessary try: model = SentenceTransformer(model_path) print(f"Successfully loaded model from local path: {model_path}") except Exception as e: print(f"Model {model_path} not found locally. Attempting to download from Hugging Face...") model = SentenceTransformer(hf_model_map[language]) print(f"Model downloaded from Hugging Face: {hf_model_map[language]}") # Compute embeddings embeddings = model.encode(s_list, convert_to_tensor=True, normalize_embeddings=True) print(f"Computed embeddings for {len(s_list)} sentences/tokens.") return embeddings.cpu()
[docs]def evaluate_labels(labels_true, labels_pred): nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred) ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred) ari = metrics.adjusted_rand_score(labels_true, labels_pred) return nmi, ami, ari
[docs]def decode(division): if type(division) is dict: prediction_dict = {m: event for event, messages in division.items() for m in messages} elif type(division) is list: prediction_dict = {m: event for event, messages in enumerate(division) for m in messages} prediction_dict_sorted = dict(sorted(prediction_dict.items())) return list(prediction_dict_sorted.values())