# Source code for SocialED.detector.lda

import argparse
import os
import logging
import datetime
import pandas as pd
import numpy as np
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import Event2012

class LDA:
    r"""The LDA model for social event detection that uses Latent Dirichlet
    Allocation for topic modeling and event detection.

    .. note::
        This detector uses topic modeling to identify events in social media
        data. The model requires a dataset object with a load_data() method.

    See :cite:`blei2003latent` for details.

    Typical usage is ``preprocess()`` -> ``fit()`` -> ``detection()`` ->
    ``evaluate()``. ``fit()`` runs ``preprocess()`` itself when it has not
    been called yet.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data. Must provide
        load_data() method that returns the raw data. The returned frame is
        indexed with the ``'filtered_words'`` and ``'event_id'`` columns.
    num_topics : int, optional
        Number of topics to extract. Default: ``50``.
    passes : int, optional
        Number of passes through corpus during training. Default: ``20``.
    iterations : int, optional
        Maximum number of iterations through corpus. Default: ``50``.
    alpha : str or float, optional
        Prior document-topic distribution. Default: ``'symmetric'``.
    eta : float, optional
        Prior topic-word distribution. Default: ``None``.
    random_state : int, optional
        Random seed for reproducibility. Also used to seed the train/test
        split. Default: ``1``.
    eval_every : int, optional
        Log perplexity evaluation frequency. Default: ``10``.
    chunksize : int, optional
        Number of documents per training chunk. Default: ``2000``.
    file_path : str, optional
        Path to save model files.
        Default: ``'../model/model_saved/LDA/'``.
    """

    def __init__(self, dataset, num_topics=50, passes=20, iterations=50,
                 alpha='symmetric', eta=None, random_state=1, eval_every=10,
                 chunksize=2000, file_path='../model/model_saved/LDA/'):
        # The raw data is loaded eagerly; everything else is plain config.
        self.dataset = dataset.load_data()
        self.num_topics = num_topics
        self.passes = passes
        self.iterations = iterations
        self.alpha = alpha
        self.eta = eta
        self.random_state = random_state
        self.eval_every = eval_every
        self.chunksize = chunksize
        self.df = None
        self.train_df = None
        self.test_df = None
        # Set by fit() or load_model(); None means "no model in memory yet".
        self.lda_model = None
        self.file_path = file_path
        self.model_path = os.path.join(file_path, 'lda_model')

    @staticmethod
    def _ensure_dir(directory):
        """Create ``directory`` (and parents) if it does not exist yet."""
        # An empty string means "current directory", which always exists.
        os.makedirs(directory or '.', exist_ok=True)

    def preprocess(self):
        """
        Data preprocessing: lower-case every token of ``filtered_words``.

        Rows whose ``filtered_words`` value is not a ``list`` get an empty
        token list. The result is stored on ``self.df`` and returned.

        Returns
        -------
        pandas.DataFrame
            Copy of the ``'filtered_words'`` and ``'event_id'`` columns with
            an added ``'processed_text'`` column.
        """
        df = self.dataset[['filtered_words', 'event_id']].copy()
        df['processed_text'] = df['filtered_words'].apply(
            lambda x: [str(word).lower() for word in x]
            if isinstance(x, list) else [])
        self.df = df
        return df

    def create_corpus(self, df, text_column, dictionary=None):
        """
        Create corpus and dictionary required for LDA model.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding one tokenised document per row.
        text_column : str
            Name of the column containing the token lists.
        dictionary : object, optional
            Existing token-to-id mapping exposing ``doc2bow``. When given it
            is reused as-is, so the resulting ids agree with a model that
            was trained on that mapping. When ``None`` (default) a new
            ``corpora.Dictionary`` is built from ``df``.

        Returns
        -------
        tuple
            ``(corpus, dictionary)`` where ``corpus`` is the list of
            bag-of-words documents.
        """
        texts = df[text_column].tolist()
        if dictionary is None:
            dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        return corpus, dictionary

    def load_model(self):
        """
        Load the LDA model from ``self.model_path`` and keep it on the
        instance as ``self.lda_model``.

        Returns
        -------
        LdaModel
            The loaded model.
        """
        logging.info("Loading LDA model from %s...", self.model_path)
        lda_model = LdaModel.load(self.model_path)
        logging.info("LDA model loaded successfully.")
        self.lda_model = lda_model
        return lda_model

    def display_topics(self, num_words=10):
        """
        Display topics generated by the LDA model.

        The model is loaded from disk first if none is in memory.

        Parameters
        ----------
        num_words : int, optional
            Number of words printed per topic. Default: ``10``.
        """
        if self.lda_model is None:
            self.load_model()
        topics = self.lda_model.show_topics(num_words=num_words,
                                            formatted=False)
        for i, topic in topics:
            print(f"Topic {i}: {[word for word, _ in topic]}")

    def fit(self):
        """
        Split the data 80/20, train an LDA model on the training part and
        save it to ``self.model_path``.

        ``preprocess()`` is run first when ``self.df`` is still unset. The
        split is stored on ``self.train_df`` / ``self.test_df`` and the
        trained model on ``self.lda_model``.
        """
        if self.df is None:
            self.preprocess()
        self._ensure_dir(os.path.dirname(self.model_path))

        train_df, test_df = train_test_split(
            self.df, test_size=0.2, random_state=self.random_state)
        self.train_df = train_df
        self.test_df = test_df

        train_corpus, train_dictionary = self.create_corpus(
            train_df, 'processed_text')

        logging.info("Training LDA model...")
        lda_model = LdaModel(corpus=train_corpus,
                             id2word=train_dictionary,
                             num_topics=self.num_topics,
                             passes=self.passes,
                             iterations=self.iterations,
                             alpha=self.alpha,
                             eta=self.eta,
                             random_state=self.random_state,
                             eval_every=self.eval_every,
                             chunksize=self.chunksize)
        logging.info("LDA model trained successfully.")

        # Save the trained model to a file and keep it available in memory.
        lda_model.save(self.model_path)
        self.lda_model = lda_model
        logging.info("LDA model saved to %s", self.model_path)

    def detection(self):
        """
        Assign topics to each document and save unique ground truths and
        predictions to a CSV file.

        The prediction for a document is the id of its most probable topic.
        The model is loaded from disk only when none is in memory.

        Returns
        -------
        tuple
            ``(ground_truths, predictions)``, two lists aligned with the
            rows of ``self.test_df``.

        Raises
        ------
        ValueError
            If no test split exists (``fit()`` was not run) or the model
            carries no ``id2word`` dictionary.
        """
        if self.test_df is None:
            raise ValueError(
                "No test split available; call fit() before detection().")
        if self.lda_model is None:
            self.load_model()

        # The bag-of-words ids must come from the dictionary the model was
        # trained with; a dictionary rebuilt from the test documents would
        # number the tokens differently and feed the model the wrong words.
        dictionary = self.lda_model.id2word
        if dictionary is None:
            raise ValueError(
                "The LDA model has no id2word dictionary; cannot encode "
                "documents for detection.")
        corpus, _ = self.create_corpus(self.test_df, 'processed_text',
                                       dictionary=dictionary)

        # Get the ground truth labels and predicted labels.
        ground_truths = self.test_df['event_id'].tolist()
        predictions = []
        for bow in corpus:
            # minimum_probability=0.0 keeps every topic in the result, so
            # max() never sees an empty list.
            topic_probs = self.lda_model.get_document_topics(
                bow, minimum_probability=0.0)
            best_topic, _ = max(topic_probs, key=lambda pair: pair[1])
            predictions.append(best_topic)

        # Convert to sets to remove duplicates.
        unique_ground_truths = list(set(ground_truths))
        unique_predictions = list(set(predictions))

        # Pad the shorter list with None to make them the same length.
        max_len = max(len(unique_ground_truths), len(unique_predictions))
        unique_ground_truths.extend(
            [None] * (max_len - len(unique_ground_truths)))
        unique_predictions.extend(
            [None] * (max_len - len(unique_predictions)))

        # Combine into a dataframe.
        df = pd.DataFrame({
            'Unique Ground Truths': unique_ground_truths,
            'Unique Predictions': unique_predictions,
        })

        # Save to a CSV file.
        self._ensure_dir(self.file_path)
        output_file = os.path.join(self.file_path,
                                   "unique_ground_truths_predictions.csv")
        df.to_csv(output_file, index=False)
        print(f"Unique ground truths and predictions have been saved to {output_file}")

        return ground_truths, predictions

    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model.

        Prints NMI, AMI and ARI and appends them, with a timestamp, to
        ``<model_path>_evaluation.txt``.

        Parameters
        ----------
        ground_truths : list
            True event labels.
        predictions : list
            Predicted topic ids, aligned with ``ground_truths``.

        Returns
        -------
        tuple
            ``(nmi, ami, ari)``.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        # Get the current date and time
        current_datetime = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")

        # Save results to a file in append mode
        self._ensure_dir(os.path.dirname(self.model_path))
        with open(self.model_path + "_evaluation.txt", "a",
                  encoding="utf-8") as f:
            f.write(f"Date and Time: {current_datetime}\n")
            f.write(f"Normalized Mutual Information (NMI): {nmi}\n")
            f.write(f"Adjusted Mutual Information (AMI): {ami}\n")
            f.write(f"Adjusted Rand Index (ARI): {ari}\n")
            f.write("\n")  # Add a newline for better readability

        return nmi, ami, ari