Source code for SocialED.detector.glove

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import metrics
import logging
import datetime
import pickle
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import DatasetLoader
from huggingface_hub import hf_hub_download
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class GloVe:
    r"""The GloVe model for social event detection that uses GloVe word
    embeddings to detect events in social media data.

    Each document is represented by the mean of the GloVe vectors of its
    tokens; documents are then grouped with KMeans and the cluster ids are
    compared against the ``event_id`` ground truth.

    .. note::
        This detector uses word embeddings to identify events in social media
        data. The model requires a dataset object with a ``load_data()``
        method.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data. Must provide a
        ``load_data()`` method that returns the raw data. The returned object
        is indexed with the columns ``'filtered_words'`` and ``'event_id'``.
    num_clusters : int, optional
        Number of clusters for KMeans clustering. Default: ``50``.
    random_state : int, optional
        Random seed for reproducibility. Default: ``1``.
    file_path : str, optional
        Path to save model files.
        Default: ``'../model/model_saved/GloVe/'``.
    model : str, optional
        Path to pre-trained GloVe word vectors file.
        Default: ``'../model/model_needed/glove.6B.100d.txt'``.
    """

    def __init__(self, dataset, num_clusters=50, random_state=1,
                 file_path='../model/model_saved/GloVe/',
                 model='../model/model_needed/glove.6B.100d.txt'):
        self.dataset = dataset.load_data()
        self.num_clusters = num_clusters
        self.random_state = random_state
        self.model_path = os.path.join(file_path, 'kmeans_model')
        self.df = None
        self.train_df = None
        self.test_df = None
        # Populated by fit() / detection() / load_model(); initialised here so
        # that "not yet available" can be detected instead of raising
        # AttributeError.
        self.train_vectors = None
        self.test_vectors = None
        self.kmeans_model = None
        self.model = model
        self.embeddings_index = self.load_glove_vectors()

    def load_glove_vectors(self):
        """
        Load GloVe pre-trained word vectors.

        The file at ``self.model`` is read as UTF-8 text with one
        ``token coef_1 ... coef_d`` entry per line. The vector width ``d`` is
        taken from the first line that parses; on later lines the last ``d``
        fields are the coefficients and everything before them is the token,
        so tokens that themselves contain whitespace do not break parsing.
        Blank lines and a leading ``<count> <dim>`` header are ignored, and
        rows that cannot be parsed are skipped (their count is logged).

        Returns
        -------
        dict
            Mapping from token to a ``float32`` numpy vector.
        """
        embeddings_index = {}
        dim = None
        skipped = 0
        with open(self.model, 'r', encoding='utf8') as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    # Blank line, or a token without any coefficient.
                    continue
                if dim is None and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
                    # word2vec-style "<vocab size> <dim>" header, not an embedding.
                    continue
                n_coefs = len(values) - 1 if dim is None else dim
                if len(values) <= n_coefs:
                    # Fewer coefficients than the established width.
                    skipped += 1
                    continue
                try:
                    coefs = np.asarray(values[-n_coefs:], dtype='float32')
                except ValueError:
                    skipped += 1
                    continue
                dim = n_coefs
                embeddings_index[' '.join(values[:-n_coefs])] = coefs
        if skipped:
            logging.warning(f"Skipped {skipped} malformed line(s) while reading {self.model}")
        return embeddings_index

    @staticmethod
    def _normalize_tokens(tokens):
        """Lower-case a token container; anything that is not a list, tuple or ndarray maps to []."""
        if isinstance(tokens, (list, tuple, np.ndarray)):
            return [str(word).lower() for word in tokens]
        return []

    def preprocess(self):
        """
        Data preprocessing: select the text and label columns and lower-case
        the tokens.

        Copies the ``'filtered_words'`` and ``'event_id'`` columns of the
        dataset and adds a ``'processed_text'`` column holding the lower-cased
        tokens of each row. Rows whose ``'filtered_words'`` value is not a
        list, tuple or numpy array get an empty token list.

        Returns
        -------
        pandas.DataFrame
            The prepared frame, also stored on ``self.df``.
        """
        df = self.dataset[['filtered_words', 'event_id']].copy()
        df['processed_text'] = df['filtered_words'].apply(self._normalize_tokens)
        self.df = df
        return df

    def text_to_glove_vector(self, text, embedding_dim=100):
        """
        Convert text to GloVe vector representation.

        Parameters
        ----------
        text : iterable of str, str or None
            The tokens of one document. A plain string is split on whitespace
            (rather than being iterated character by character) and ``None``
            is treated as an empty document.
        embedding_dim : int, optional
            Width of the returned vector; it must match the width of the
            loaded embeddings. Default: ``100``.

        Returns
        -------
        numpy.ndarray
            Mean of the vectors of the tokens found in the vocabulary, or an
            all-zero vector when none of the tokens is known.
        """
        if text is None:
            words = ()
        elif isinstance(text, str):
            words = text.split()
        else:
            words = text
        embedding = np.zeros(embedding_dim)
        valid_words = 0
        for word in words:
            vector = self.embeddings_index.get(word)
            if vector is not None:
                embedding += vector
                valid_words += 1
        if valid_words > 0:
            embedding /= valid_words
        return embedding

    def _embedding_dim(self, default=100):
        """Return the width of the loaded vectors, or ``default`` when the vocabulary is empty."""
        first_vector = next(iter(self.embeddings_index.values()), None)
        return default if first_vector is None else len(first_vector)

    def create_vectors(self, df, text_column):
        """
        Create GloVe vectors for each document.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding one document per row.
        text_column : str
            Name of the column that contains the token lists.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(df), dim)`` where ``dim`` is the width of
            the loaded embeddings, so vector files other than the 100-d
            default work as well.
        """
        dim = self._embedding_dim()
        vectors = [self.text_to_glove_vector(text, embedding_dim=dim)
                   for text in df[text_column].tolist()]
        if not vectors:
            # Keep the result two-dimensional even when there are no documents.
            return np.empty((0, dim))
        return np.array(vectors)

    def load_model(self):
        """
        Load the KMeans model from a file.

        The model pickled by ``fit()`` at ``self.model_path`` is read back.
        When that file does not exist, a fresh KMeans is fitted on
        ``self.train_vectors`` instead.

        Returns
        -------
        object
            The loaded (or re-fitted) model, also stored on
            ``self.kmeans_model``.

        Raises
        ------
        RuntimeError
            If there is neither a saved model file nor training vectors.
        """
        if os.path.isfile(self.model_path):
            logging.info(f"Loading KMeans model from {self.model_path}...")
            # NOTE: unpickling can execute arbitrary code; model_path must
            # only ever point at a trusted file such as the one fit() wrote.
            with open(self.model_path, 'rb') as f:
                kmeans_model = pickle.load(f)
        else:
            if getattr(self, 'train_vectors', None) is None:
                raise RuntimeError(
                    f"No saved KMeans model at {self.model_path} and no training vectors; call fit() first.")
            logging.warning(f"No saved KMeans model at {self.model_path}; fitting a new one.")
            kmeans_model = KMeans(n_clusters=self.num_clusters, random_state=self.random_state)
            kmeans_model = kmeans_model.fit(self.train_vectors)
        logging.info("KMeans model loaded successfully.")
        self.kmeans_model = kmeans_model
        return kmeans_model

    def fit(self):
        """
        Train the KMeans model and save it to ``self.model_path``.

        Runs ``preprocess()`` first if it has not been called yet, splits the
        data 80/20 into ``self.train_df`` / ``self.test_df`` using
        ``self.random_state``, vectorises the training split into
        ``self.train_vectors``, fits KMeans on it and pickles the fitted
        model. The fitted model is also kept on ``self.kmeans_model``.
        """
        if self.df is None:
            self.preprocess()

        model_dir = os.path.dirname(self.model_path)
        if model_dir:
            # os.makedirs('') raises, which happens when file_path is empty.
            os.makedirs(model_dir, exist_ok=True)

        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=self.random_state)
        self.train_df = train_df
        self.test_df = test_df
        self.train_vectors = self.create_vectors(train_df, 'processed_text')

        logging.info("Training KMeans model...")
        kmeans_model = KMeans(n_clusters=self.num_clusters, random_state=self.random_state)
        kmeans_model.fit(self.train_vectors)
        logging.info("KMeans model trained successfully.")
        self.kmeans_model = kmeans_model

        # Save the trained model to a file
        with open(self.model_path, 'wb') as f:
            pickle.dump(kmeans_model, f)
        logging.info(f"KMeans model saved to {self.model_path}")

    def detection(self):
        """
        Assign clusters to each document.

        Loads the model, vectorises the test split into ``self.test_vectors``
        and predicts a cluster id for every test document.

        Returns
        -------
        tuple of (list, list)
            The ``event_id`` ground truths of the test split and the
            predicted cluster labels, in the same order.

        Raises
        ------
        RuntimeError
            If ``fit()`` has not produced a test split yet.
        """
        if self.test_df is None:
            raise RuntimeError("No test split available; call fit() before detection().")

        self.load_model()  # Ensure the model is loaded before making detections
        self.test_vectors = self.create_vectors(self.test_df, 'processed_text')
        labels = self.kmeans_model.predict(self.test_vectors)

        # Get the ground truth labels and predicted labels
        ground_truths = self.test_df['event_id'].tolist()
        predicted_labels = labels.tolist()
        return ground_truths, predicted_labels

    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model.

        Prints the Normalized Mutual Information, Adjusted Mutual Information
        and Adjusted Rand Index of ``predictions`` against ``ground_truths``.

        Parameters
        ----------
        ground_truths : list
            True event labels.
        predictions : list
            Predicted cluster labels, aligned with ``ground_truths``.

        Returns
        -------
        dict
            The three scores under the keys ``'nmi'``, ``'ami'`` and
            ``'ari'``.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        return {'nmi': nmi, 'ami': ami, 'ari': ari}