Source code for SocialED.detector.wmd

import os
import pandas as pd
import numpy as np
import datetime
from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import Event2012


# event_id, filtered_words
[docs]class WMD:

    r"""The WMD model for social event detection that uses Word Mover's Distance
    to measure document similarity and detect events.

    .. note::
        This detector uses word embeddings and Word Mover's Distance to identify similar documents
        and detect events in social media data. The model requires a dataset object with a 
        load_data() method.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data.
        Must provide load_data() method that returns the raw data.
    vector_size : int, optional
        Dimensionality of word vectors. Default: ``100``.
    window : int, optional
        Maximum distance between current and predicted word. Default: ``5``.
    min_count : int, optional
        Minimum word frequency. Default: ``1``.
    sg : int, optional
        Training algorithm: Skip-gram (1) or CBOW (0). Default: ``1``.
    num_best : int, optional
        Number of best matches to return. Default: ``5``.
    threshold : float, optional
        Similarity threshold for event detection. Default: ``0.6``.
    batch_size : int, optional
        Batch size for processing. Default: ``1000``.
    n_workers : int, optional
        Number of worker processes. Default: ``CPU count - 1``.
    file_path : str, optional
        Path to save model files. Default: ``'../model/model_saved/WMD/'``.
    """
    def __init__(self,
                 dataset,
                 vector_size=100,
                 window=5,
                 min_count=1,
                 sg=1,
                 num_best=5,
                 threshold=0.6,
                 batch_size=1000,  # 新增：批处理大小
                 n_workers=None,   # 新增：进程数
                 file_path='../model/model_saved/WMD/'):
        self.dataset = dataset.load_data()
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg
        self.num_best = num_best
        self.threshold = threshold
        self.batch_size = batch_size
        self.n_workers = n_workers or max(1, multiprocessing.cpu_count() - 1)
        self.file_path = file_path
        self.df = None
        self.train_df = None
        self.test_df = None
        self.word2vec_model = None
        self.model_path = os.path.join(self.file_path, 'word2vec_model.model')

[docs]    def preprocess(self):
        """
        优化的数据预处理
        """
        df = self.dataset[['filtered_words', 'event_id']].copy()
        # 使用列表推导式优化处理速度
        df['processed_text'] = [
            [str(word).lower() for word in x] if isinstance(x, list) else []
            for x in df['filtered_words']
        ]
        # 过滤掉空文档
        df = df[df['processed_text'].map(len) > 0]
        self.df = df
        return df

[docs]    def fit(self):
        """
        Train the Word2Vec model and save it to a file.
        """
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=42)
        self.train_df = train_df
        self.test_df = test_df

        sentences = train_df['processed_text'].tolist()

        print("Training Word2Vec model...")
        word2vec_model = Word2Vec(sentences=sentences, 
                                vector_size=self.vector_size,
                                window=self.window,
                                min_count=self.min_count,
                                sg=self.sg)
        print("Word2Vec model trained successfully.")

        word2vec_model.save(self.model_path)
        print(f"Word2Vec model saved to {self.model_path}")

        self.word2vec_model = word2vec_model.wv

[docs]    def detection(self):
        """
        优化的事件检测
        """
        if self.word2vec_model is None:
            word2vec_model = Word2Vec.load(self.model_path)
            self.word2vec_model = word2vec_model.wv

        test_corpus = self.test_df['processed_text'].tolist()
        train_corpus = self.train_df['processed_text'].tolist()

        print("Calculating WMD distances...")
        instance = WmdSimilarity(train_corpus, self.word2vec_model, num_best=self.num_best)

        # 使用多进程处理文档
        process_doc = partial(
            process_document,
            instance=instance,
            train_df=self.train_df,
            threshold=self.threshold,
            num_best=self.num_best
        )

        predictions = []
        with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
            # 批处理文档
            for i in tqdm(range(0, len(test_corpus), self.batch_size)):
                batch = test_corpus[i:i + self.batch_size]
                batch_predictions = list(executor.map(process_doc, batch))
                predictions.extend(batch_predictions)

        # 处理未分配事件的文档
        max_event_id = max(self.train_df['event_id'])
        new_event_counter = 1
        for i in range(len(predictions)):
            if predictions[i] == -1:
                predictions[i] = max_event_id + new_event_counter
                new_event_counter += 1

        ground_truths = self.test_df['event_id'].tolist()

        # 保存结果
        self._save_results(ground_truths, predictions)

        return ground_truths, predictions

[docs]    def _save_results(self, ground_truths, predictions):
        """
        保存结果的辅助方法
        """
        unique_ground_truths = list(set(ground_truths))
        unique_predictions = list(set(predictions))

        max_len = max(len(unique_ground_truths), len(unique_predictions))
        unique_ground_truths.extend([None] * (max_len - len(unique_ground_truths)))
        unique_predictions.extend([None] * (max_len - len(unique_predictions)))

        data = {
            'Unique Ground Truths': unique_ground_truths,
            'Unique Predictions': unique_predictions
        }
        df = pd.DataFrame(data)
        output_file = os.path.join(self.file_path, "unique_ground_truths_predictions.csv")
        df.to_csv(output_file, index=False)
        print(f"Unique ground truths and predictions have been saved to {output_file}")

[docs]    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model and save results.
        """
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        current_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.model_path + "_evaluation.txt", "a") as f:
            f.write(f"Date and Time: {current_datetime}\n")
            f.write(f"Normalized Mutual Information (NMI): {nmi}\n")
            f.write(f"Adjusted Mutual Information (AMI): {ami}\n")
            f.write(f"Adjusted Rand Index (ARI): {ari}\n")
            f.write("\n")

# 新增：计算单个文档的相似度
[docs]def process_document(doc, instance, train_df, threshold, num_best):
    sims = instance[doc]
    similar_events = []
    for idx, score in sims[:num_best]:  # 只处理前num_best个结果
        if score > threshold:
            similar_events.append(train_df.iloc[idx]['event_id'])
    
    if similar_events:
        prediction = max(set(similar_events), key=similar_events.count)
    else:
        prediction = -1  # 使用临时标记，后续处理
    
    return prediction