Source code for SocialED.detector.word2vec

import argparse
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn import metrics
import logging
from sklearn.cluster import KMeans
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import DatasetLoader
# Setup logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


[docs]class WORD2VEC:
    r"""The Word2Vec model for social event detection that uses word embeddings 
    to detect events in social media data.

    .. note::
        This detector uses word embeddings to identify semantic relationships
        and detect events in social media data. The model requires a dataset
        object with a load_data() method.

    See :cite:`mikolov2013efficient` for details.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data.
        Must provide load_data() method that returns the raw data.
    vector_size : int, optional
        Dimensionality of word vectors. Default: ``100``.
    window : int, optional
        Maximum distance between current and predicted word. Default: ``5``.
    min_count : int, optional
        Minimum word frequency. Default: ``1``.
    sg : int, optional
        Training algorithm: Skip-gram (1) or CBOW (0). Default: ``1``.
    file_path : str, optional
        Path to save model files. Default: ``'../model/model_saved/Word2vec/word2vec_model.model'``.
    """
    def __init__(self,
                 dataset,
                 vector_size=100, 
                 window=5,
                 min_count=1,
                 sg=1,
                 file_path='../model/model_saved/Word2vec/word2vec_model.model'):

        self.dataset = dataset.load_data()
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg
        self.file_path = file_path
        self.df = None
        self.train_df = None
        self.test_df = None
        self.word2vec_model = None

[docs]    def preprocess(self):
        """
        Data preprocessing: tokenization, stop words removal, etc.
        """
        df = self.dataset[['filtered_words', 'event_id']].copy()
        df['processed_text'] = df['filtered_words'].apply(
            lambda x: [str(word).lower() for word in x] if isinstance(x, list) else [])
        self.df = df
        return df

[docs]    def fit(self):
        """
        Train the Word2Vec model and save it to a file.
        """
        # Ensure the directory exists
        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)

        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=42)
        self.train_df = train_df
        self.test_df = test_df

        sentences = train_df['processed_text'].tolist()

        logging.info("Training Word2Vec model...")
        word2vec_model = Word2Vec(sentences=sentences, vector_size=self.vector_size, window=self.window,
                                  min_count=self.min_count, sg=self.sg)
        logging.info("Word2Vec model trained successfully.")

        # Save the trained model to a file
        word2vec_model.save(self.file_path)
        logging.info(f"Word2Vec model saved to {self.file_path}")

        self.word2vec_model = word2vec_model
        return word2vec_model

[docs]    def load_model(self):
        """
        Load the Word2Vec model from a file.
        """
        logging.info(f"Loading Word2Vec model from {self.file_path}...")
        word2vec_model = Word2Vec.load(self.file_path)
        logging.info("Word2Vec model loaded successfully.")

        self.word2vec_model = word2vec_model
        return word2vec_model

[docs]    def document_vector(self, document):
        """
        Create a document vector by averaging the Word2Vec embeddings of its words.
        """
        words = [word for word in document if word in self.word2vec_model.wv]
        if words:
            return np.mean(self.word2vec_model.wv[words], axis=0)
        else:
            return np.zeros(self.vector_size)

[docs]    def detection(self):
        """
        Detect events by representing each document as the average Word2Vec embedding of its words.
        """
        self.load_model()  # Ensure the model is loaded before making detections

        test_vectors = self.test_df['processed_text'].apply(self.document_vector)
        predictions = np.stack(test_vectors)

        ground_truths = self.test_df['event_id'].tolist()
        kmeans = KMeans(n_clusters=len(set(ground_truths)), random_state=42)
        predictions = kmeans.fit_predict(predictions)

        return ground_truths, predictions

[docs]    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model.
        """

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        return ari, ami, nmi