import os
import pandas as pd
import numpy as np
import datetime
from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import Event2012
# event_id, filtered_words
[docs]class WMD:
r"""The WMD model for social event detection that uses Word Mover's Distance
to measure document similarity and detect events.
.. note::
This detector uses word embeddings and Word Mover's Distance to identify similar documents
and detect events in social media data. The model requires a dataset object with a
load_data() method.
Parameters
----------
dataset : object
The dataset object containing social media data.
Must provide load_data() method that returns the raw data.
vector_size : int, optional
Dimensionality of word vectors. Default: ``100``.
window : int, optional
Maximum distance between current and predicted word. Default: ``5``.
min_count : int, optional
Minimum word frequency. Default: ``1``.
sg : int, optional
Training algorithm: Skip-gram (1) or CBOW (0). Default: ``1``.
num_best : int, optional
Number of best matches to return. Default: ``5``.
threshold : float, optional
Similarity threshold for event detection. Default: ``0.6``.
batch_size : int, optional
Batch size for processing. Default: ``1000``.
n_workers : int, optional
Number of worker processes. Default: ``CPU count - 1``.
file_path : str, optional
Path to save model files. Default: ``'../model/model_saved/WMD/'``.
"""
def __init__(self,
dataset,
vector_size=100,
window=5,
min_count=1,
sg=1,
num_best=5,
threshold=0.6,
batch_size=1000, # 新增:批处理大小
n_workers=None, # 新增:进程数
file_path='../model/model_saved/WMD/'):
self.dataset = dataset.load_data()
self.vector_size = vector_size
self.window = window
self.min_count = min_count
self.sg = sg
self.num_best = num_best
self.threshold = threshold
self.batch_size = batch_size
self.n_workers = n_workers or max(1, multiprocessing.cpu_count() - 1)
self.file_path = file_path
self.df = None
self.train_df = None
self.test_df = None
self.word2vec_model = None
self.model_path = os.path.join(self.file_path, 'word2vec_model.model')
[docs] def preprocess(self):
"""
优化的数据预处理
"""
df = self.dataset[['filtered_words', 'event_id']].copy()
# 使用列表推导式优化处理速度
df['processed_text'] = [
[str(word).lower() for word in x] if isinstance(x, list) else []
for x in df['filtered_words']
]
# 过滤掉空文档
df = df[df['processed_text'].map(len) > 0]
self.df = df
return df
[docs] def fit(self):
"""
Train the Word2Vec model and save it to a file.
"""
os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=42)
self.train_df = train_df
self.test_df = test_df
sentences = train_df['processed_text'].tolist()
print("Training Word2Vec model...")
word2vec_model = Word2Vec(sentences=sentences,
vector_size=self.vector_size,
window=self.window,
min_count=self.min_count,
sg=self.sg)
print("Word2Vec model trained successfully.")
word2vec_model.save(self.model_path)
print(f"Word2Vec model saved to {self.model_path}")
self.word2vec_model = word2vec_model.wv
[docs] def detection(self):
"""
优化的事件检测
"""
if self.word2vec_model is None:
word2vec_model = Word2Vec.load(self.model_path)
self.word2vec_model = word2vec_model.wv
test_corpus = self.test_df['processed_text'].tolist()
train_corpus = self.train_df['processed_text'].tolist()
print("Calculating WMD distances...")
instance = WmdSimilarity(train_corpus, self.word2vec_model, num_best=self.num_best)
# 使用多进程处理文档
process_doc = partial(
process_document,
instance=instance,
train_df=self.train_df,
threshold=self.threshold,
num_best=self.num_best
)
predictions = []
with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
# 批处理文档
for i in tqdm(range(0, len(test_corpus), self.batch_size)):
batch = test_corpus[i:i + self.batch_size]
batch_predictions = list(executor.map(process_doc, batch))
predictions.extend(batch_predictions)
# 处理未分配事件的文档
max_event_id = max(self.train_df['event_id'])
new_event_counter = 1
for i in range(len(predictions)):
if predictions[i] == -1:
predictions[i] = max_event_id + new_event_counter
new_event_counter += 1
ground_truths = self.test_df['event_id'].tolist()
# 保存结果
self._save_results(ground_truths, predictions)
return ground_truths, predictions
[docs] def _save_results(self, ground_truths, predictions):
"""
保存结果的辅助方法
"""
unique_ground_truths = list(set(ground_truths))
unique_predictions = list(set(predictions))
max_len = max(len(unique_ground_truths), len(unique_predictions))
unique_ground_truths.extend([None] * (max_len - len(unique_ground_truths)))
unique_predictions.extend([None] * (max_len - len(unique_predictions)))
data = {
'Unique Ground Truths': unique_ground_truths,
'Unique Predictions': unique_predictions
}
df = pd.DataFrame(data)
output_file = os.path.join(self.file_path, "unique_ground_truths_predictions.csv")
df.to_csv(output_file, index=False)
print(f"Unique ground truths and predictions have been saved to {output_file}")
[docs] def evaluate(self, ground_truths, predictions):
"""
Evaluate the model and save results.
"""
nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
print(f"Normalized Mutual Information (NMI): {nmi}")
ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
print(f"Adjusted Mutual Information (AMI): {ami}")
ari = metrics.adjusted_rand_score(ground_truths, predictions)
print(f"Adjusted Rand Index (ARI): {ari}")
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open(self.model_path + "_evaluation.txt", "a") as f:
f.write(f"Date and Time: {current_datetime}\n")
f.write(f"Normalized Mutual Information (NMI): {nmi}\n")
f.write(f"Adjusted Mutual Information (AMI): {ami}\n")
f.write(f"Adjusted Rand Index (ARI): {ari}\n")
f.write("\n")
# 新增:计算单个文档的相似度
[docs]def process_document(doc, instance, train_df, threshold, num_best):
sims = instance[doc]
similar_events = []
for idx, score in sims[:num_best]: # 只处理前num_best个结果
if score > threshold:
similar_events.append(train_df.iloc[idx]['event_id'])
if similar_events:
prediction = max(set(similar_events), key=similar_events.count)
else:
prediction = -1 # 使用临时标记,后续处理
return prediction