import argparse
import os
import logging
import datetime
import pandas as pd
import numpy as np
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import Event2012
class LDA:
    r"""The LDA model for social event detection that uses Latent Dirichlet Allocation
    for topic modeling and event detection.

    .. note::
        This detector uses topic modeling to identify events in social media data.
        The model requires a dataset object with a load_data() method.

    See :cite:`blei2003latent` for details.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data.
        Must provide load_data() method that returns the raw data.
    num_topics : int, optional
        Number of topics to extract. Default: ``50``.
    passes : int, optional
        Number of passes through corpus during training. Default: ``20``.
    iterations : int, optional
        Maximum number of iterations through corpus. Default: ``50``.
    alpha : str or float, optional
        Prior document-topic distribution. Default: ``'symmetric'``.
    eta : float, optional
        Prior topic-word distribution. Default: ``None``.
    random_state : int, optional
        Random seed for reproducibility. Default: ``1``.
    eval_every : int, optional
        Log perplexity evaluation frequency. Default: ``10``.
    chunksize : int, optional
        Number of documents per training chunk. Default: ``2000``.
    file_path : str, optional
        Path to save model files. Default: ``'../model/model_saved/LDA/'``.
    """

    def __init__(self,
                 dataset,
                 num_topics=50,
                 passes=20,
                 iterations=50,
                 alpha='symmetric',
                 eta=None,
                 random_state=1,
                 eval_every=10,
                 chunksize=2000,
                 file_path='../model/model_saved/LDA/'):
        # The raw data is loaded eagerly; preprocess() indexes it by column name.
        self.dataset = dataset.load_data()
        self.num_topics = num_topics
        self.passes = passes
        self.iterations = iterations
        self.alpha = alpha
        self.eta = eta
        self.random_state = random_state
        self.eval_every = eval_every
        self.chunksize = chunksize
        # Populated by preprocess() / fit().
        self.df = None
        self.train_df = None
        self.test_df = None
        # Populated by fit() or load_model(); None means "not in memory yet".
        self.lda_model = None
        self.file_path = file_path
        self.model_path = os.path.join(file_path, 'lda_model')

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (and its parents) unless it is empty.

        An empty path means "current directory"; ``os.makedirs('')`` would
        raise, so it is skipped.
        """
        if path:
            os.makedirs(path, exist_ok=True)

    def _require_model(self):
        """Return the in-memory model, loading it from ``model_path`` if needed."""
        # getattr() keeps this safe for instances that bypassed __init__.
        if getattr(self, 'lda_model', None) is None:
            self.load_model()
        return self.lda_model

    def preprocess(self):
        """
        Data preprocessing: build lower-cased token lists from ``filtered_words``.

        Returns
        -------
        pandas.DataFrame
            Copy of the ``filtered_words`` and ``event_id`` columns plus a
            ``processed_text`` column. Entries of ``filtered_words`` that are
            not lists become empty token lists. The frame is also stored on
            ``self.df``.
        """
        df = self.dataset[['filtered_words', 'event_id']].copy()
        df['processed_text'] = df['filtered_words'].apply(
            lambda x: [str(word).lower() for word in x] if isinstance(x, list) else [])
        self.df = df
        return df

    def create_corpus(self, df, text_column):
        """
        Create corpus and dictionary required for LDA model.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding one token list per row in ``text_column``.
        text_column : str
            Name of the column with the token lists.

        Returns
        -------
        tuple
            ``(corpus, dictionary)`` where ``dictionary`` is built from the
            given texts and ``corpus`` is their bag-of-words encoding.
        """
        texts = df[text_column].tolist()
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        return corpus, dictionary

    def load_model(self):
        """
        Load the LDA model from ``self.model_path`` and keep it on the instance.

        Returns
        -------
        LdaModel
            The loaded model (also stored as ``self.lda_model``).
        """
        logging.info(f"Loading LDA model from {self.model_path}...")
        lda_model = LdaModel.load(self.model_path)
        logging.info("LDA model loaded successfully.")
        self.lda_model = lda_model
        return lda_model

    def display_topics(self, num_words=10):
        """
        Display topics generated by the LDA model.

        Parameters
        ----------
        num_words : int, optional
            Number of words printed per topic. Default: ``10``.
        """
        # Load the saved model on demand instead of failing on a missing attribute.
        model = self._require_model()
        topics = model.show_topics(num_words=num_words, formatted=False)
        for i, topic in topics:
            print(f"Topic {i}: {[word for word, _ in topic]}")

    def fit(self):
        """
        Split the data, train the LDA model on the training part and save it.

        The preprocessed frame is split 80/20 with ``self.random_state``; the
        parts are stored on ``self.train_df`` / ``self.test_df``. The trained
        model is written to ``self.model_path`` and kept on ``self.lda_model``.
        """
        # Allow fit() to be called directly without a prior preprocess().
        if self.df is None:
            self.preprocess()
        self._ensure_dir(os.path.dirname(self.model_path))
        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=self.random_state)
        self.train_df = train_df
        self.test_df = test_df
        train_corpus, train_dictionary = self.create_corpus(train_df, 'processed_text')
        logging.info("Training LDA model...")
        lda_model = LdaModel(corpus=train_corpus, id2word=train_dictionary, num_topics=self.num_topics,
                             passes=self.passes,
                             iterations=self.iterations, alpha=self.alpha, eta=self.eta,
                             random_state=self.random_state,
                             eval_every=self.eval_every, chunksize=self.chunksize)
        logging.info("LDA model trained successfully.")
        # Keep the model in memory so later steps need not reload it from disk.
        self.lda_model = lda_model
        # Save the trained model to a file
        lda_model.save(self.model_path)
        logging.info(f"LDA model saved to {self.model_path}")

    def detection(self):
        """
        Assign topics to each document and save unique ground truths and predictions to a CSV file.

        Returns
        -------
        tuple of list
            ``(ground_truths, predictions)``: the ``event_id`` of every test
            document and the id of its highest-probability topic.

        Raises
        ------
        ValueError
            If no test split is available (``fit()`` has not been run).
        """
        if self.test_df is None:
            raise ValueError("No test split available; call fit() before detection().")
        model = self._require_model()  # Ensure the model is loaded before making detections
        # Encode the test documents with the dictionary the model was trained
        # with. A dictionary rebuilt from the test split would assign different
        # token ids, so the model would see the wrong words.
        dictionary = model.id2word
        texts = self.test_df['processed_text'].tolist()
        corpus = [dictionary.doc2bow(text) for text in texts]
        # minimum_probability=0.0 keeps low-probability topics in the result so
        # the max() below never receives an empty list.
        topics = [model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus]
        # Get the ground truth labels and predicted labels
        ground_truths = self.test_df['event_id'].tolist()
        predictions = [max(topic, key=lambda x: x[1])[0] for topic in topics]
        # Convert to sets to remove duplicates
        unique_ground_truths = list(set(ground_truths))
        unique_predictions = list(set(predictions))
        # Pad the shorter list with None to make them the same length
        max_len = max(len(unique_ground_truths), len(unique_predictions))
        unique_ground_truths.extend([None] * (max_len - len(unique_ground_truths)))
        unique_predictions.extend([None] * (max_len - len(unique_predictions)))
        # Combine into a dataframe
        data = {
            'Unique Ground Truths': unique_ground_truths,
            'Unique Predictions': unique_predictions
        }
        df = pd.DataFrame(data)
        # Save to a CSV file
        self._ensure_dir(self.file_path)
        output_file = os.path.join(self.file_path, "unique_ground_truths_predictions.csv")
        df.to_csv(output_file, index=False)
        print(f"Unique ground truths and predictions have been saved to {output_file}")
        return ground_truths, predictions

    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model.

        Prints NMI, AMI and ARI for the given labels and appends them, with a
        timestamp, to ``<model_path>_evaluation.txt``.

        Parameters
        ----------
        ground_truths : list
            True event labels, one per document.
        predictions : list
            Predicted topic ids, aligned with ``ground_truths``.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")
        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")
        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")
        # Get the current date and time
        current_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Save results to a file in append mode
        self._ensure_dir(os.path.dirname(self.model_path))
        with open(self.model_path + "_evaluation.txt", "a", encoding="utf-8") as f:
            f.write(f"Date and Time: {current_datetime}\n")
            f.write(f"Normalized Mutual Information (NMI): {nmi}\n")
            f.write(f"Adjusted Mutual Information (AMI): {ami}\n")
            f.write(f"Adjusted Rand Index (ARI): {ari}\n")
            f.write("\n")  # Add a newline for better readability