import os
import numpy as np
import pandas as pd
from os.path import exists
import pickle
import torch
from sentence_transformers import SentenceTransformer
import re
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, adjusted_rand_score
from sklearn import metrics
from itertools import combinations, chain
import networkx as nx
from datetime import datetime
import math
from networkx.algorithms import cuts
from sklearn.model_selection import train_test_split
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
class HISEvent():
    """HISEvent class for event detection.

    This class implements hierarchical structure-based event detection by
    chaining the helpers of this module: ``Preprocessor`` (data preparation),
    ``run_hier_2D_SE_mini_data`` (clustering) and scikit-learn metrics (scoring).

    Args:
        dataset: Input dataset object. It must provide ``get_dataset_language()``
            and ``get_dataset_name()``; ``Preprocessor`` additionally calls
            ``load_data()`` on it.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.language = dataset.get_dataset_language()
        self.dataset_name = dataset.get_dataset_name()
        self.save_path = "../model/model_saved/hisevent/"+self.dataset_name+"/"

    def preprocess(self):
        """Run ``Preprocessor`` (default closed-set mode) on the dataset."""
        preprocessor = Preprocessor(self.dataset)
        preprocessor.preprocess()

    def detection(self):
        """Cluster the test messages.

        Returns:
            tuple: ``(ground_truths, predictions)`` as returned by
            ``run_hier_2D_SE_mini_data``.
        """
        # ``Preprocessor`` writes test_set.npy / SBERT_embeddings.pkl to
        # ``../model/model_saved/<dataset_name>/closed_set/`` while ``self.save_path``
        # points to ``.../hisevent/<dataset_name>/``. Keep the legacy location when it
        # already holds the test set; otherwise read what ``preprocess()`` produced.
        data_dir = self.save_path
        if not exists(data_dir + 'test_set.npy'):
            data_dir = f'../model/model_saved/{self.dataset_name}/closed_set/'
        ground_truths, predictions = run_hier_2D_SE_mini_data(data_dir, n=300, e_a=True, e_s=True)
        return ground_truths, predictions

    def evaluate(self, ground_truths, predictions):
        """Print and return clustering scores.

        Args:
            ground_truths: True event labels, one per message.
            predictions: Predicted cluster labels, one per message.

        Returns:
            tuple: ``(nmi, ami, ari)``. Earlier versions only printed the scores
            and returned ``None``.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        return nmi, ami, ari
class Preprocessor:
    """Prepare the on-disk inputs used by HISEvent: data splits and SBERT embeddings.

    Every artefact is written below ``../model/model_saved/<dataset_name>/``
    (a path relative to the current working directory). Existing files are
    treated as a cache and are not regenerated.
    """

    def __init__(self, dataset, mode='close'):
        """Initialize preprocessor.

        Args:
            dataset: Dataset object (e.g. Event2012, Event2018, etc.). It must
                provide ``load_data()``, ``get_dataset_language()`` and
                ``get_dataset_name()``.
            mode: 'open' or 'close' (default 'close') - determines preprocessing mode.
                Any value other than 'open' selects the closed-set branch.
        """
        self.dataset = dataset.load_data()
        self.language = dataset.get_dataset_language()
        self.dataset_name = dataset.get_dataset_name()
        self.mode = mode
        self.columns = ['tweet_id', 'text', 'event_id', 'words', 'filtered_words',
                        'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions']

    def get_closed_set_test_df(self, df):
        """Save the test-split rows of *df* to ``closed_set/test_set.npy``.

        The rows are selected with the boolean mask stored in ``masks/test_mask.pt``.
        Nothing is done when ``test_set.npy`` already exists.
        """
        save_path = f'../model/model_saved/{self.dataset_name}/closed_set/'
        if not exists(save_path):
            os.makedirs(save_path)

        test_set_df_np_path = save_path + 'test_set.npy'
        if not exists(test_set_df_np_path):
            # Use 2012-style processing for all datasets
            test_mask = torch.load(f'../model/model_saved/{self.dataset_name}/masks/test_mask.pt').cpu().detach().numpy()
            test_mask = list(np.where(test_mask==True)[0])
            test_df = df.iloc[test_mask]
            test_df_np = test_df.to_numpy()
            np.save(test_set_df_np_path, test_df_np)
        return

    def get_closed_set_messages_embeddings(self):
        """Compute and pickle SBERT embeddings of the closed-set test messages.

        Reads ``closed_set/test_set.npy`` and writes ``closed_set/SBERT_embeddings.pkl``;
        skipped when the pickle already exists.
        """
        save_path = f'../model/model_saved/{self.dataset_name}/closed_set/'

        SBERT_embedding_path = f'{save_path}/SBERT_embeddings.pkl'
        if not exists(SBERT_embedding_path):
            test_set_df_np_path = save_path + 'test_set.npy'
            test_df_np = np.load(test_set_df_np_path, allow_pickle=True)

            test_df = pd.DataFrame(data=test_df_np, columns=self.columns)
            print("Dataframe loaded.")

            processed_text = [preprocess_sentence(s) for s in test_df['text'].values]
            print('message text contents preprocessed.')

            embeddings = SBERT_embed(processed_text, language=self.language)

            with open(SBERT_embedding_path, 'wb') as fp:
                pickle.dump(embeddings, fp)
            print('SBERT embeddings stored.')
        return

    def get_open_set_messages_embeddings(self):
        """Compute and pickle SBERT embeddings for open-set blocks 1..21.

        For each block ``b`` the rows are read from ``open_set/<b>/<b>.npy`` and the
        embeddings are written to ``open_set/<b>/SBERT_embeddings.pkl`` (skipped
        when that pickle exists). The block count is hard-coded, so a missing
        block file makes ``np.load`` fail.
        """
        save_path = f'../model/model_saved/{self.dataset_name}/open_set/'
        num_blocks = 21  # Use 2012-style processing for all datasets

        # split_open_set() calls reset_index() before saving, which stores the former
        # index as the FIRST column and appends 'date' last. The column labels used
        # to reload a block must follow that order, otherwise every field is shifted
        # by one (e.g. 'text' would contain tweet ids).
        block_columns = ['original_index'] + self.columns + ['date']

        for i in range(num_blocks):
            block = i + 1
            print('\n\n====================================================')
            print('block: ', block)

            SBERT_embedding_path = f'{save_path}{block}/SBERT_embeddings.pkl'

            if not exists(SBERT_embedding_path):
                df_np = np.load(f'{save_path}{block}/{block}.npy', allow_pickle=True)

                df = pd.DataFrame(data=df_np, columns=block_columns)
                print("Dataframe loaded.")

                df['processed_text'] = [preprocess_sentence(s) for s in df['text']]
                print('message text contents preprocessed.')

                embeddings = SBERT_embed(df['processed_text'].tolist(), language=self.language)

                with open(SBERT_embedding_path, 'wb') as fp:
                    pickle.dump(embeddings, fp)
                print('SBERT embeddings stored.')
        return

    def split_open_set(self, df, root_path):
        """Split *df* into per-date blocks saved as ``<root_path><k>/<k>.npy``.

        Rows are sorted by ``created_at``. Block 0 holds the first seven distinct
        dates; each following date becomes its own block, except the last distinct
        date, which is not written. Existing block files are left untouched.
        Saved column order: former index, the original columns, then ``date``.
        """
        if not exists(root_path):
            os.makedirs(root_path)

        df = df.sort_values(by='created_at').reset_index()
        df['date'] = [d.date() for d in df['created_at']]

        distinct_dates = df.date.unique()

        # First week -> block 0
        folder = root_path + '0/'
        if not exists(folder):
            os.mkdir(folder)

        df_np_path = folder + '0.npy'
        if not exists(df_np_path):
            ini_df = df.loc[df['date'].isin(distinct_dates[:7])]
            ini_df_np = ini_df.to_numpy()
            np.save(df_np_path, ini_df_np)

        # Following dates -> block 1, 2, ...
        end = len(distinct_dates) - 1  # Use 2012-style processing
        for i in range(7, end):
            folder = root_path + str(i - 6) + '/'
            if not exists(folder):
                os.mkdir(folder)

            df_np_path = folder + str(i - 6) + '.npy'
            if not exists(df_np_path):
                incr_df = df.loc[df['date'] == distinct_dates[i]]
                incr_df_np = incr_df.to_numpy()
                np.save(df_np_path, incr_df_np)
        return

    def preprocess(self):
        """Main preprocessing function.

        In 'open' mode the data is split into blocks and embedded per block.
        Otherwise train/val/test masks are generated, the test rows are saved and
        their embeddings are computed.
        """
        # Load raw data using 2012-style processing
        df_np = self.dataset
        print("Loaded data.")

        df = pd.DataFrame(data=df_np, columns=self.columns)
        print("Data converted to dataframe.")

        if self.mode == 'open':
            # Open-set setting
            root_path = f'../model/model_saved/{self.dataset_name}/open_set/'
            self.split_open_set(df, root_path)
            self.get_open_set_messages_embeddings()
        else:
            # Close-set setting
            # Create masks directory and generate train/val/test splits
            save_dir = os.path.join(f'../model/model_saved/{self.dataset_name}', 'masks')
            os.makedirs(save_dir, exist_ok=True)

            # Split and save masks
            self.split_and_save_masks(df, save_dir)
            print("Generated and saved train/val/test masks.")

            self.get_closed_set_test_df(df)
            self.get_closed_set_messages_embeddings()
        return

    def split_and_save_masks(self, df, save_dir, train_size=0.7, val_size=0.1, test_size=0.2, random_seed=42):
        """
        Splits the DataFrame into training, validation, and test sets, and saves
        boolean masks (one entry per row of ``df``) as .pt files.

        Parameters:
        - df (pd.DataFrame): The DataFrame to be split
        - save_dir (str): Directory to save the masks
        - train_size (float): Proportion for training (default 0.7)
        - val_size (float): Proportion for validation (default 0.1)
        - test_size (float): Proportion for testing (default 0.2)
        - random_seed (int): Random seed for reproducibility

        Raises:
        - ValueError: if the proportions do not sum to 1 or ``df`` is empty.

        A mask file that already exists is not overwritten.
        """
        # Compare with a tolerance: an exact float test rejects valid splits such as
        # 0.7 / 0.2 / 0.1, whose sum is 0.9999999999999999 in binary floating point.
        if not math.isclose(train_size + val_size + test_size, 1.0):
            raise ValueError("train_size + val_size + test_size must equal 1.0")

        if df.empty:
            raise ValueError("The input DataFrame is empty.")

        print(f"Total samples in DataFrame: {len(df)}")

        # Set random seed
        torch.manual_seed(random_seed)

        # Split into train and temp
        train_data, temp_data = train_test_split(df, train_size=train_size, random_state=random_seed)

        # Split temp into val and test
        val_data, test_data = train_test_split(temp_data,
                                               train_size=val_size/(val_size + test_size),
                                               random_state=random_seed)

        # Create boolean masks
        full_train_mask = torch.zeros(len(df), dtype=torch.bool)
        full_val_mask = torch.zeros(len(df), dtype=torch.bool)
        full_test_mask = torch.zeros(len(df), dtype=torch.bool)

        # Set indices
        # NOTE(review): index labels are used as positions, which assumes df has the
        # default 0..n-1 index (true for the frame built in preprocess()).
        full_train_mask[train_data.index] = True
        full_val_mask[val_data.index] = True
        full_test_mask[test_data.index] = True

        print(f"Training samples: {full_train_mask.sum()}")
        print(f"Validation samples: {full_val_mask.sum()}")
        print(f"Test samples: {full_test_mask.sum()}")

        # Save masks
        mask_paths = {
            'train_mask.pt': full_train_mask,
            'val_mask.pt': full_val_mask,
            'test_mask.pt': full_test_mask
        }

        for filename, mask in mask_paths.items():
            mask_path = os.path.join(save_dir, filename)
            if not os.path.exists(mask_path):
                try:
                    torch.save(mask, mask_path)
                    print(f"Saved {filename}")
                except Exception as e:
                    print(f"Error saving {filename}: {e}")
            else:
                print(f"{filename} already exists")

            # Verify saved file
            if os.path.exists(mask_path):
                saved_mask = torch.load(mask_path)
                if saved_mask.numel() == 0:
                    print(f"Warning: {filename} is empty")
                else:
                    print(f"Verified {filename} with {saved_mask.numel()} elements")

        print("Mask generation completed")
def get_stable_point(path):
    """Return the stable points cached under *path*, computing them on first use.

    *path* is used as a string prefix (``path + 'stable_point.pkl'``), so it is
    expected to end with a path separator. When the cache file is missing, the
    embeddings are read from ``path + 'SBERT_embeddings.pkl'``, passed to
    ``search_stable_points`` and the result is pickled. The cache file is then
    always re-read and its content returned.

    Returns:
        dict: ``{'first': ..., 'global': ...}`` as stored in the cache file.
    """
    cache_file = path + 'stable_point.pkl'

    if not exists(cache_file):
        with open(path + 'SBERT_embeddings.pkl', 'rb') as emb_file:
            embeddings = pickle.load(emb_file)
        print(f"Loaded embeddings: {embeddings}")
        print(f"Shape of embeddings: {np.shape(embeddings)}")

        first_k, global_k = search_stable_points(embeddings)
        with open(cache_file, 'wb') as out_file:
            pickle.dump({'first': first_k, 'global': global_k}, out_file)
        print('stable points stored.')

    with open(cache_file, 'rb') as in_file:
        loaded = pickle.load(in_file)
    print('stable points loaded.')
    return loaded
def run_hier_2D_SE_mini_data(save_path, n=300, e_a=True, e_s=True):
    """Cluster the saved test messages with hierarchical 2D SE minimization.

    Args:
        save_path: Directory prefix holding ``test_set.npy`` and
            ``SBERT_embeddings.pkl`` (``stable_point.pkl`` is created there by
            ``get_stable_point`` when missing).
        n: Sub-graph size forwarded to ``hier_2D_SE_mini``.
        e_a: Forwarded to ``get_global_edges`` (attribute-based edges).
        e_s: Forwarded to ``get_global_edges`` (embedding kNN edges).

    Returns:
        tuple: ``(labels_true, prediction)`` - the ``event_id`` column as a list
        and the decoded cluster label of every message.
    """
    column_names = ['tweet_id', 'text', 'event_id', 'words', 'filtered_words',
                    'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions']

    # load test_set_df
    raw_rows = np.load(save_path + 'test_set.npy', allow_pickle=True)
    test_df = pd.DataFrame(data=raw_rows, columns=column_names)
    print("Dataframe loaded.")

    # one attribute list per message: author, mentioned users, lower-cased hashtags, entities
    all_node_features = []
    for user, mentions, tags, entities in zip(test_df['user_id'], test_df['user_mentions'],
                                              test_df['hashtags'], test_df['entities']):
        all_node_features.append(
            [str(user)] + [str(each) for each in mentions] + [tag.lower() for tag in tags] + entities
        )

    # load embeddings of the test set messages
    with open(f'{save_path}/SBERT_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    num_neighbors = get_stable_point(save_path)['first']
    global_edges = get_global_edges(all_node_features, embeddings, num_neighbors, e_a=e_a, e_s=e_s)

    corr_matrix = np.corrcoef(embeddings)
    np.fill_diagonal(corr_matrix, 0)

    # keep only positively correlated pairs; node encoding starts from 1
    weighted_global_edges = []
    for edge in global_edges:
        if corr_matrix[edge[0] - 1, edge[1] - 1] > 0:
            weighted_global_edges.append((edge[0], edge[1], corr_matrix[edge[0] - 1, edge[1] - 1]))

    division = hier_2D_SE_mini(weighted_global_edges, len(embeddings), n=n)
    prediction = decode(division)

    labels_true = test_df['event_id'].tolist()
    n_clusters = len(set(labels_true))
    print('n_clusters gt: ', n_clusters)

    return labels_true, prediction
def search_stable_points(embeddings, max_num_neighbors=50):
    """Search the neighbour counts k at which the kNN graph's 1D SE is a local minimum.

    For k = 1, 2, ... the k-th most correlated neighbour of every message is
    added to the graph (only pairs with positive correlation) and the 1D
    structural entropy is recorded. A "stable point" is a k whose entropy is
    lower than that of both k-1 and k+1.

    Args:
        embeddings: Message embeddings, one row per message (passed to
            ``np.corrcoef``).
        max_num_neighbors: Largest k to examine. It is clamped to the number of
            messages, because a message cannot have more ranked neighbours than
            there are columns in the correlation matrix.

    Returns:
        tuple: ``(first_stable_k, global_stable_k)``, or ``(0, 0)`` when no stable
        point exists in the searched range.
    """
    corr_matrix = np.corrcoef(embeddings)
    np.fill_diagonal(corr_matrix, 0)
    corr_matrix_sorted_indices = np.argsort(corr_matrix)

    # Without this clamp, a data set with fewer messages than max_num_neighbors
    # raises IndexError on the negative column lookup below.
    num_candidates = min(max_num_neighbors, corr_matrix_sorted_indices.shape[1])

    all_1dSEs = []
    seg = None
    for i in range(num_candidates):
        dst_ids = corr_matrix_sorted_indices[:, -(i + 1)]
        # (s+1, d+1): +1 as node indexing starts from 1 instead of 0
        knn_edges = [(s + 1, d + 1, corr_matrix[s, d])
                     for s, d in enumerate(dst_ids) if corr_matrix[s, d] > 0]
        if i == 0:
            g = nx.Graph()
            g.add_weighted_edges_from(knn_edges)
            seg = SE(g)
            all_1dSEs.append(seg.calc_1dSE())
        else:
            # incremental update instead of recomputing the entropy from scratch
            all_1dSEs.append(seg.update_1dSE(all_1dSEs[-1], knn_edges))

    # strict local minima, end points excluded
    stable_indices = [i for i in range(1, len(all_1dSEs) - 1)
                      if all_1dSEs[i] < all_1dSEs[i - 1] and all_1dSEs[i] < all_1dSEs[i + 1]]

    if not stable_indices:
        print('No stable points found after checking k = 1 to ', num_candidates)
        return 0, 0

    stable_SEs = [all_1dSEs[index] for index in stable_indices]
    index = stable_indices[stable_SEs.index(min(stable_SEs))]
    print('stable_indices: ', stable_indices)
    print('stable_SEs: ', stable_SEs)
    print('First stable point: k = ', stable_indices[0] + 1, ', correspoding 1dSE: ',
          stable_SEs[0])  # n_neighbors should be index + 1
    print('Global stable point within the searching range: k = ', index + 1,
          ', correspoding 1dSE: ', all_1dSEs[index])  # n_neighbors should be index + 1

    return stable_indices[0] + 1, index + 1  # first stable point, global stable point
def get_graph_edges(attributes):
    """Connect every pair of messages that share at least one attribute.

    Args:
        attributes: One iterable of hashable attributes per message; message ``i``
            (0-based position) becomes node ``i + 1``.

    Returns:
        list: Unique ``(node_a, node_b)`` tuples with ``node_a < node_b``, in no
        particular order.
    """
    attr_nodes_dict = {}
    for i, attrs in enumerate(attributes):
        node = i + 1  # node indexing starts from 1
        # dict.fromkeys() drops repeated attributes of one message while keeping
        # their order; a repeated attribute would otherwise list the node twice and
        # combinations() below would emit a (node, node) self-loop.
        for attr in dict.fromkeys(attrs):
            attr_nodes_dict.setdefault(attr, []).append(node)

    graph_edges = []
    for nodes in attr_nodes_dict.values():
        # nodes were appended in increasing order, so each pair is (smaller, larger)
        graph_edges += list(combinations(nodes, 2))
    return list(set(graph_edges))
def get_knn_edges(embeddings, default_num_neighbors):
    """Build undirected kNN edges from the correlation between embeddings.

    Every message is linked to its ``default_num_neighbors`` most correlated
    messages; pairs whose correlation is not positive are skipped.

    Args:
        embeddings: Message embeddings, one row per message (passed to
            ``np.corrcoef``).
        default_num_neighbors: Number of neighbours per message. Values larger
            than the number of messages are clamped.

    Returns:
        list: Unique ``(node_a, node_b)`` tuples with ``node_a < node_b``; nodes
        are numbered from 1.
    """
    corr_matrix = np.corrcoef(embeddings)
    np.fill_diagonal(corr_matrix, 0)
    corr_matrix_sorted_indices = np.argsort(corr_matrix)

    # A row has only as many ranked entries as there are messages; asking for more
    # used to raise IndexError on the negative column lookup.
    num_neighbors = min(default_num_neighbors, corr_matrix_sorted_indices.shape[1])

    knn_edges = []
    for i in range(num_neighbors):
        dst_ids = corr_matrix_sorted_indices[:, -(i + 1)]
        # (s+1, d+1): +1 as node indexing starts from 1 instead of 0
        knn_edges += [(s + 1, d + 1) if s < d else (d + 1, s + 1)
                      for s, d in enumerate(dst_ids) if corr_matrix[s, d] > 0]
    return list(set(knn_edges))
def get_global_edges(attributes, embeddings, default_num_neighbors, e_a=True, e_s=True):
    """Return the union of attribute-based and embedding-based message edges.

    Args:
        attributes: Per-message attribute lists, forwarded to ``get_graph_edges``.
        embeddings: Message embeddings, forwarded to ``get_knn_edges``.
        default_num_neighbors: Neighbour count forwarded to ``get_knn_edges``.
        e_a: Include edges between messages sharing an attribute.
        e_s: Include kNN edges derived from the embeddings.

    Returns:
        list: Unique edge tuples, in no particular order.
    """
    # Flags are tested for truthiness rather than compared with ``== True``.
    graph_edges = get_graph_edges(attributes) if e_a else []
    knn_edges = get_knn_edges(embeddings, default_num_neighbors) if e_s else []
    return list(set(knn_edges + graph_edges))
def get_subgraphs_edges(clusters, graph_splits, weighted_global_edges):
    """Get subgraph edges.

    Args:
        clusters: a list containing the current clusters, each cluster is a list of nodes of the original graph
        graph_splits: a list of (start_index, end_index) pairs, each (start_index, end_index) pair indicates a subset of clusters,
            which will serve as the nodes of a new subgraph
        weighted_global_edges: a list of (start node, end node, edge weight) tuples, each tuple is an edge in the original graph

    Returns:
        all_subgraphs_edges: a list containing the edges of all subgraphs; entry ``i`` keeps, in their
            original order, the edges whose two end points both belong to split ``i``
    """
    all_subgraphs_edges = []
    for start, end in graph_splits:
        # A set gives O(1) membership tests; the previous list lookup made the
        # filter below quadratic in the sub-graph size.
        subgraph_nodes = set(chain.from_iterable(clusters[start:end]))
        all_subgraphs_edges.append(
            [edge for edge in weighted_global_edges
             if edge[0] in subgraph_nodes and edge[1] in subgraph_nodes]
        )
    return all_subgraphs_edges
def hier_2D_SE_mini(weighted_global_edges, n_messages, n=100):
    '''
    hierarchical 2D SE minimization

    The current clusters are processed in consecutive groups of at most ``n``;
    each group forms a sub-graph on which 2D SE is greedily minimized. The
    procedure repeats on the resulting clusters until they all fit into a single
    sub-graph. When a pass merges nothing, ``n`` is doubled.

    Args:
        weighted_global_edges: list of (start node, end node, weight) tuples.
        n_messages: number of messages; nodes are numbered 1..n_messages.
        n: maximum number of clusters per sub-graph.

    Returns:
        list: the final clusters, each a list of node ids.
    '''
    # Nothing to cluster. Without this guard an empty input never reaches the
    # single-sub-graph exit condition and the loop below spins forever.
    if n_messages <= 0:
        return []

    ite = 1
    # initially, each node (message) is in its own cluster
    # node encoding starts from 1
    clusters = [[i + 1] for i in range(n_messages)]
    while True:
        print('\n=========Iteration ', str(ite), '=========')
        n_clusters = len(clusters)
        graph_splits = [(s, min(s + n, n_clusters)) for s in range(0, n_clusters, n)]  # [s, e)
        all_subgraphs_edges = get_subgraphs_edges(clusters, graph_splits, weighted_global_edges)
        last_clusters = clusters
        clusters = []
        print('Number of subgraphs: ', len(graph_splits))
        for i, subgraph_edges in enumerate(all_subgraphs_edges):
            print('\tSubgraph ', str(i + 1))

            g = nx.Graph()
            g.add_weighted_edges_from(subgraph_edges)
            seg = SE(g)
            # start from the clusters produced by the previous pass
            seg.division = {j: cluster for j, cluster in
                            enumerate(last_clusters[graph_splits[i][0]:graph_splits[i][1]])}
            # nodes without any edge inside this sub-graph are missing from g
            seg.add_isolates()
            for k in seg.division.keys():
                for node in seg.division[k]:
                    seg.graph.nodes[node]['comm'] = k
            seg.update_struc_data()
            seg.update_struc_data_2d()
            seg.update_division_MinSE()

            clusters += list(seg.division.values())

        if len(graph_splits) == 1:
            break
        if clusters == last_clusters:
            # no merge happened in this pass: widen the sub-graphs
            n *= 2
        # the counter was previously never advanced, so every pass printed "1"
        ite += 1
    return clusters
# SE
class SE:
    """Structural entropy (SE) of a weighted graph and greedy 2D SE minimization.

    The instance works on a private copy of the graph passed to the constructor.

    Attributes:
        graph: copy of the input graph.
        vol: volume of the whole graph (see ``get_vol``).
        division: ``{comm: [node, ...]}`` - current partition of the nodes.
        struc_data: ``{comm: [volume, cut, community_node_SE, leaf_nodes_SE]}``.
        struc_data_2d: ``{(comm_a, comm_b): [volume, cut, community_node_SE,
            leaf_nodes_SE]}`` - the values the pair would have after merging;
            keys are ordered so that ``comm_a < comm_b``.
    """

    def __init__(self, graph: nx.Graph):
        self.graph = graph.copy()
        self.vol = self.get_vol()
        self.division = {}  # {comm1: [node11, node12, ...], comm2: [node21, node22, ...], ...}
        self.struc_data = {}  # {comm1: [vol1, cut1, community_node_SE, leaf_nodes_SE], comm2:[vol2, cut2, community_node_SE, leaf_nodes_SE],... }
        self.struc_data_2d = {}  # {(comm1, comm2): [vol_after_merge, cut_after_merge, comm_node_SE_after_merge, leaf_nodes_SE_after_merge], ...}

    def get_vol(self):
        '''
        get the volume of the graph
        '''
        return cuts.volume(self.graph, self.graph.nodes, weight='weight')

    def calc_1dSE(self):
        '''
        get the 1D SE of the graph
        '''
        entropy = 0
        for n in self.graph.nodes:
            d = cuts.volume(self.graph, [n], weight='weight')
            # a zero-degree node contributes nothing (and log2(0) is undefined);
            # same convention as update_struc_data
            if d != 0:
                entropy += - (d / self.vol) * math.log2(d / self.vol)
        return entropy

    def update_1dSE(self, original_1dSE, new_edges):
        '''
        get the updated 1D SE after new edges are inserted into the graph

        original_1dSE: the 1D SE before the insertion
        new_edges: (node, node, weight) tuples; they are added to self.graph and
            self.vol is refreshed as a side effect
        '''
        affected_nodes = []
        for edge in new_edges:
            affected_nodes += [edge[0], edge[1]]
        affected_nodes = set(affected_nodes)

        original_vol = self.vol
        # nodes that are new to the graph start with degree 0
        original_degree_dict = {node: 0 for node in affected_nodes}
        for node in affected_nodes.intersection(set(self.graph.nodes)):
            original_degree_dict[node] = self.graph.degree(node, weight='weight')

        # insert new edges into the graph
        self.graph.add_weighted_edges_from(new_edges)

        self.vol = self.get_vol()
        updated_vol = self.vol
        updated_degree_dict = {}
        for node in affected_nodes:
            updated_degree_dict[node] = self.graph.degree(node, weight='weight')

        # rescale the old entropy to the new volume, then swap the terms of the
        # nodes whose degree changed
        updated_1dSE = (original_vol / updated_vol) * (original_1dSE - math.log2(original_vol / updated_vol))
        for node in affected_nodes:
            d_original = original_degree_dict[node]
            d_updated = updated_degree_dict[node]
            if d_original != d_updated:
                if d_original != 0:
                    updated_1dSE += (d_original / updated_vol) * math.log2(d_original / updated_vol)
                updated_1dSE -= (d_updated / updated_vol) * math.log2(d_updated / updated_vol)

        return updated_1dSE

    def get_cut(self, comm):
        '''
        get the weighted cut size of community comm, i.e. the total weight of the
        edges between comm and the rest of the graph (networkx cuts.cut_size)
        '''
        return cuts.cut_size(self.graph, comm, weight='weight')

    def get_volume(self, comm):
        '''
        get the volume of community comm
        '''
        return cuts.volume(self.graph, comm, weight='weight')

    def calc_2dSE(self):
        '''
        get the 2D SE of the graph
        '''
        entropy = 0
        for comm in self.division.values():
            g = self.get_cut(comm)
            v = self.get_volume(comm)
            # zero-volume communities / zero-degree nodes contribute nothing;
            # same convention as update_struc_data (avoids log2(0) and x / 0)
            if v == 0:
                continue
            entropy += - (g / self.vol) * math.log2(v / self.vol)
            for node in comm:
                d = self.graph.degree(node, weight='weight')
                if d != 0:
                    entropy += - (d / self.vol) * math.log2(d / v)
        return entropy

    def show_division(self):
        print(self.division)

    def show_struc_data(self):
        print(self.struc_data)

    def show_struc_data_2d(self):
        print(self.struc_data_2d)

    def print_graph(self):
        # NOTE(review): `plt` is not imported anywhere in the visible part of this
        # file, so this method raises NameError unless matplotlib.pyplot is bound to
        # `plt` elsewhere - confirm and add the import at module level.
        fig, ax = plt.subplots()
        nx.draw(self.graph, ax=ax, with_labels=True)
        plt.show()

    def update_struc_data(self):
        '''
        calculate the volume, cut, communitiy mode SE, and leaf nodes SE of each cummunity,
        then store them into self.struc_data
        '''
        self.struc_data = {}  # {comm1: [vol1, cut1, community_node_SE, leaf_nodes_SE], comm2:[vol2, cut2, community_node_SE, leaf_nodes_SE],... }
        for vname in self.division.keys():
            comm = self.division[vname]
            volume = self.get_volume(comm)
            cut = self.get_cut(comm)
            if volume == 0:
                vSE = 0
            else:
                vSE = - (cut / self.vol) * math.log2(volume / self.vol)
            vnodeSE = 0
            for node in comm:
                d = self.graph.degree(node, weight='weight')
                if d != 0:
                    vnodeSE -= (d / self.vol) * math.log2(d / volume)
            self.struc_data[vname] = [volume, cut, vSE, vnodeSE]

    def _merged_struc_data(self, v1, v2):
        '''
        return [volume, cut, community node SE, leaf nodes SE] that communities v1 and v2
        would have after being merged; nothing is modified
        '''
        comm_merged = self.division[v1] + self.division[v2]
        gm = self.get_cut(comm_merged)
        vm = self.struc_data[v1][0] + self.struc_data[v2][0]
        if self.struc_data[v1][0] == 0 or self.struc_data[v2][0] == 0:
            # merging with a zero-volume community: the SE terms simply add up
            vmSE = self.struc_data[v1][2] + self.struc_data[v2][2]
            vmnodeSE = self.struc_data[v1][3] + self.struc_data[v2][3]
        else:
            vmSE = - (gm / self.vol) * math.log2(vm / self.vol)
            vmnodeSE = self.struc_data[v1][3] - (self.struc_data[v1][0] / self.vol) * math.log2(
                self.struc_data[v1][0] / vm) + \
                       self.struc_data[v2][3] - (self.struc_data[v2][0] / self.vol) * math.log2(
                self.struc_data[v2][0] / vm)
        return [vm, gm, vmSE, vmnodeSE]

    def update_struc_data_2d(self):
        '''
        calculate the volume, cut, communitiy mode SE, and leaf nodes SE after merging each pair of cummunities,
        then store them into self.struc_data_2d
        '''
        self.struc_data_2d = {}  # {(comm1, comm2): [vol_after_merge, cut_after_merge, comm_node_SE_after_merge, leaf_nodes_SE_after_merge], (comm1, comm3): [], ...}
        # combinations() walks the keys once; rebuilding list(self.division.keys())
        # for every pair made this loop cubic in the number of communities
        for v1, v2 in combinations(self.division, 2):
            k = (v1, v2) if v1 < v2 else (v2, v1)
            self.struc_data_2d[k] = self._merged_struc_data(v1, v2)

    def init_division(self):
        '''
        initialize self.division such that each node assigned to its own community
        '''
        self.division = {}
        for node in self.graph.nodes:
            new_comm = node
            self.division[new_comm] = [node]
            self.graph.nodes[node]['comm'] = new_comm

    def add_isolates(self):
        '''
        add any isolated nodes into graph, i.e. the nodes listed in self.division
        that are not yet in self.graph
        '''
        all_nodes = list(chain(*list(self.division.values())))
        all_nodes.sort()
        edge_nodes = list(self.graph.nodes)
        edge_nodes.sort()
        if all_nodes != edge_nodes:
            for node in set(all_nodes) - set(edge_nodes):
                self.graph.add_node(node)

    def update_division_MinSE(self):
        '''
        greedily update the encoding tree to minimize 2D SE

        expects self.struc_data and self.struc_data_2d to be up to date for the
        current self.division; all three are updated in place
        '''

        def Mg_operator(v1, v2):
            '''
            MERGE operator. It calculates the delta SE caused by mergeing communities v1 and v2,
            without actually merging them, i.e., the encoding tree won't be changed
            '''
            v1SE = self.struc_data[v1][2]
            v1nodeSE = self.struc_data[v1][3]

            v2SE = self.struc_data[v2][2]
            v2nodeSE = self.struc_data[v2][3]

            k = (v1, v2) if v1 < v2 else (v2, v1)
            vm, gm, vmSE, vmnodeSE = self.struc_data_2d[k]
            delta_SE = vmSE + vmnodeSE - (v1SE + v1nodeSE + v2SE + v2nodeSE)
            return delta_SE

        # continue merging any two communities that can cause the largest decrease in SE,
        # until the SE can't be further reduced
        while True:
            delta_SE = math.inf
            vm1 = None
            vm2 = None
            # the first pair (in key order) with the smallest delta wins
            for v1, v2 in combinations(self.division, 2):
                new_delta_SE = Mg_operator(v1, v2)
                if new_delta_SE < delta_SE:
                    delta_SE = new_delta_SE
                    vm1 = v1
                    vm2 = v2

            if delta_SE >= 0:
                break

            # Merge v2 into v1, and update the encoding tree accordingly
            for node in self.division[vm2]:
                self.graph.nodes[node]['comm'] = vm1
            self.division[vm1] += self.division[vm2]
            self.division.pop(vm2)

            volume = self.struc_data[vm1][0] + self.struc_data[vm2][0]
            cut = self.get_cut(self.division[vm1])
            vmSE = - (cut / self.vol) * math.log2(volume / self.vol)
            vmnodeSE = self.struc_data[vm1][3] - (self.struc_data[vm1][0] / self.vol) * math.log2(
                self.struc_data[vm1][0] / volume) + \
                       self.struc_data[vm2][3] - (self.struc_data[vm2][0] / self.vol) * math.log2(
                self.struc_data[vm2][0] / volume)
            self.struc_data[vm1] = [volume, cut, vmSE, vmnodeSE]
            self.struc_data.pop(vm2)

            struc_data_2d_new = {}
            for k in self.struc_data_2d:
                if vm2 in k:
                    # pairs involving the absorbed community no longer exist
                    continue
                elif vm1 in k:
                    # pairs involving the grown community must be recomputed
                    struc_data_2d_new[k] = self._merged_struc_data(k[0], k[1])
                else:
                    struc_data_2d_new[k] = self.struc_data_2d[k]
            self.struc_data_2d = struc_data_2d_new
def vanilla_2D_SE_mini(weighted_edges):
    """
    vanilla (greedy) 2D SE minimization

    Builds a graph from ``weighted_edges`` ((node, node, weight) tuples), starts
    with every node in its own community and greedily merges communities.

    Returns:
        tuple: (1D SE, 2D SE before merging, 2D SE after merging, final division
        dict ``{community: [nodes]}``).
    """
    graph = nx.Graph()
    graph.add_weighted_edges_from(weighted_edges)

    seg = SE(graph)
    seg.init_division()
    se_1d = seg.calc_1dSE()

    # per-community and per-pair statistics required by the greedy merge
    seg.update_struc_data()
    seg.update_struc_data_2d()
    se_2d_initial = seg.calc_2dSE()

    seg.update_division_MinSE()
    return se_1d, se_2d_initial, seg.calc_2dSE(), seg.division
def test_vanilla_2D_SE_mini():
    """Print a small demonstration of ``vanilla_2D_SE_mini`` on a three-node graph."""
    weighted_edges = [(1, 2, 2), (1, 3, 4)]

    demo_graph = nx.Graph()
    demo_graph.add_weighted_edges_from(weighted_edges)
    adjacency = nx.adjacency_matrix(demo_graph).todense()
    weighted_degrees = list(demo_graph.degree(demo_graph.nodes, weight='weight'))

    print('adjacency matrix: \n', adjacency)
    print('g.nodes: ', demo_graph.nodes)
    print('g.edges: ', demo_graph.edges)
    print('degrees of nodes: ', weighted_degrees)

    results = vanilla_2D_SE_mini(weighted_edges)
    SE1D, initial_SE2D, minimized_SE2D, communities = results
    print('\n1D SE of the graph: ', SE1D)
    print('initial 2D SE of the graph: ', initial_SE2D)
    print('the minimum 2D SE of the graph: ', minimized_SE2D)
    print('communities detected: ', communities)
def replaceAtUser(text):
    """Remove mentions from *text*.

    Every run of non-whitespace characters starting with "@" is deleted, together
    with a directly preceding "RT " retweet marker.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r'@[^\s]+|RT @[^\s]+', '', text)
def removeUnicode(text):
    r"""Strip escaped unicode sequences and every non-ASCII character.

    First removes literal ``\u`` followed by hexadecimal digits (e.g. the six
    characters ``\u002c``), then removes all characters outside the ASCII range.
    """
    without_escapes = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
    return re.sub(r'[^\x00-\x7f]', r'', without_escapes)
def replaceURL(text):
    """Replace URLs with the token "url" and strip the "#" from hashtags.

    A URL is any run of non-whitespace characters starting with "www.",
    "http://" or "https://". "#tag" becomes "tag".
    """
    # Raw strings: "\." and "\s" are invalid escape sequences in normal literals.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text
def replaceMultiExclamationMark(text):
    """Collapse every run of two or more "!" into a single "!"."""
    return re.sub(r"(\!)\1+", '!', text)
def replaceMultiQuestionMark(text):
    """Collapse every run of two or more "?" into a single "?"."""
    return re.sub(r"(\?)\1+", '?', text)
def removeEmoticons(text):
    """Remove emoticons from text.

    The pattern is now a raw string, so it no longer relies on invalid escape
    sequences; what it matches is unchanged.
    """
    # NOTE(review): the alternative ":-*" is not escaped, so it matches ":" followed
    # by any number of "-" and therefore deletes EVERY colon (it also shadows the
    # later ":-(" / ":\S" alternatives). It was presumably meant to be ":-\*".
    # Kept as is because fixing it changes the preprocessed text and hence the
    # cached embeddings - confirm before changing.
    return re.sub(
        r":\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:",
        '', text)
def removeNewLines(text):
    """Delete every newline character from *text*."""
    return re.sub('\n', '', text)
def preprocess_sentence(s):
    """Clean one message before embedding.

    Steps, in order: URLs/hashtags, unicode removal, repeated "!" and "?",
    emoticons, mentions, newlines.
    """
    cleaned = replaceURL(s)
    cleaned = removeUnicode(cleaned)
    cleaned = replaceMultiExclamationMark(cleaned)
    cleaned = replaceMultiQuestionMark(cleaned)
    cleaned = removeEmoticons(cleaned)
    cleaned = replaceAtUser(cleaned)
    return removeNewLines(cleaned)
def preprocess_french_sentence(s):
    """Clean one message while keeping non-ASCII characters.

    Same steps as ``preprocess_sentence`` except that ``removeUnicode`` is not
    applied: URLs/hashtags, repeated "!" and "?", emoticons, mentions, newlines.
    """
    cleaned = replaceURL(s)
    cleaned = replaceMultiExclamationMark(cleaned)
    cleaned = replaceMultiQuestionMark(cleaned)
    cleaned = removeEmoticons(cleaned)
    cleaned = replaceAtUser(cleaned)
    return removeNewLines(cleaned)
def SBERT_embed(s_list, language):
    '''
    Use Sentence-BERT to embed sentences.

    s_list: a list of sentences/ tokens to be embedded.
    language: the language of the sentences ('English', 'French', 'Arabic').
    output: the embeddings of the sentences/ tokens (normalized, moved to CPU).

    The model is first loaded from a local directory; if that fails for any
    reason, the model of the same name is loaded by its Hugging Face identifier.
    Raises ValueError for a language without a configured model.
    '''
    # Local model directories for each language
    model_map = {
        'English': '../model/model_needed/all-MiniLM-L6-v2',
        'French': '../model/model_needed/distiluse-base-multilingual-cased-v1',
        'Arabic': '../model/model_needed/paraphrase-multilingual-mpnet-base-v2'
    }
    # Hugging Face identifiers used as fallback
    hf_model_map = {
        'English': 'sentence-transformers/all-MiniLM-L6-v2',
        'French': 'sentence-transformers/distiluse-base-multilingual-cased-v1',
        'Arabic': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
    }

    print(f"Embedding sentences in language: {language}")

    model_path = model_map.get(language)
    if not model_path:
        supported = ', '.join(model_map)
        raise ValueError(f"Unsupported language: {language}. Supported languages are: {supported}")

    print(f"Using model: {model_path}")

    # Load the model, downloading if necessary
    try:
        model = SentenceTransformer(model_path)
        print(f"Successfully loaded model from local path: {model_path}")
    except Exception:
        hub_name = hf_model_map[language]
        print(f"Model {model_path} not found locally. Attempting to download from Hugging Face...")
        model = SentenceTransformer(hub_name)
        print(f"Model downloaded from Hugging Face: {hub_name}")

    # Compute embeddings
    embeddings = model.encode(s_list, convert_to_tensor=True, normalize_embeddings=True)
    print(f"Computed embeddings for {len(s_list)} sentences/tokens.")

    return embeddings.cpu()
def evaluate(labels_true, labels_pred):
    """Return ``(nmi, ami, ari)`` for a predicted clustering against the true labels."""
    return (
        normalized_mutual_info_score(labels_true, labels_pred),
        adjusted_mutual_info_score(labels_true, labels_pred),
        adjusted_rand_score(labels_true, labels_pred),
    )
def decode(division):
    """Convert a division into one cluster label per message.

    Args:
        division: Either a dict ``{label: [message, ...]}`` or a list/tuple of
            message lists, in which case the position of each list is its label.

    Returns:
        list: The labels ordered by ascending message id.

    Raises:
        TypeError: If ``division`` is neither a dict nor a list/tuple (this case
            previously failed with UnboundLocalError).
    """
    if isinstance(division, dict):
        groups = division.items()
    elif isinstance(division, (list, tuple)):
        groups = enumerate(division)
    else:
        raise TypeError(f"division must be a dict or a list, got {type(division).__name__}")

    prediction_dict = {m: event for event, messages in groups for m in messages}
    return [event for _, event in sorted(prediction_dict.items())]