Source code for SocialED.detector.eventx

# eventx original paper:
# Bang Liu, Fred X Han, Di Niu, Linglong Kong, Kunfeng Lai, and Yu Xu. 2020. Story Forest: Extracting Events and Telling Stories from Breaking News. TKDD 14, 3 (2020), 1–28.
import pandas as pd
import numpy as np
import itertools
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import random
from sklearn import metrics
from collections import Counter
import os
import pickle
from sklearn.model_selection import train_test_split
import logging
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


[docs]class EventX:
    r"""The EventX model for social event detection that extracts events from breaking news
    using keyword co-occurrence and graph-based clustering.

    .. note::
        This detector uses keyword co-occurrence and graph-based clustering to identify events in social media data.
        The model requires a dataset object with a load_data() method.

    See :cite:`liu2020story` for details.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data.
        Must provide load_data() method that returns the raw data.
    file_path : str, optional
        Path to save model files. Default: ``'../model/model_saved/eventX/'``.
    num_repeats : int, optional
        Number of times to repeat keyword extraction. Default: ``5``.
    min_cooccur_time : int, optional
        Minimum number of times keywords must co-occur. Default: ``2``.
    min_prob : float, optional
        Minimum probability threshold for keyword selection. Default: ``0.15``.
    max_kw_num : int, optional
        Maximum number of keywords to extract per document. Default: ``3``.
    """
    def __init__(self,
                 dataset,
                 file_path='../model/model_saved/eventX/',
                 num_repeats=5,
                 min_cooccur_time=2,
                 min_prob=0.15,
                 max_kw_num=3):
        self.dataset = dataset.load_data()
        self.file_path = file_path
        self.mask_path = None
        self.num_repeats = num_repeats
        self.min_cooccur_time = min_cooccur_time
        self.min_prob = min_prob
        self.max_kw_num = max_kw_num

    # construct offline df
[docs]    def preprocess(self):

        self.split()
        self.mask_path = '../model/model_saved/eventX/split_indices/test_indices.npy'
        df = self.dataset
        logging.info("Loaded all_df_words_ents_mid.")

        test_mask = np.load(self.mask_path, allow_pickle=True)
        df = df.iloc[test_mask, :]
        logging.info("Test df extracted.")

        np.save(self.file_path + 'corpus_offline.npy', df.values)
        logging.info("corpus_offline saved.")

        self.df = self.dataset[['event_id','filtered_words','tweet_id','entities']].copy()
        logging.info("Data preprocessed.")

[docs]    def split(self):
        """
        Split the dataset into training, validation, and test sets.
        """
        train_ratio = 0.7
        test_ratio = 0.2
        val_ratio = 0.1

        df = self.dataset

        train_data, temp_data = train_test_split(df, test_size=(1 - train_ratio), random_state=42)
        test_size = test_ratio / (test_ratio + val_ratio)
        test_data, val_data = train_test_split(temp_data, test_size=test_size, random_state=42)

        os.makedirs(self.file_path + '/split_indices/', exist_ok=True)
        np.save(self.file_path + '/split_indices/train_indices.npy', train_data.index.to_numpy())
        np.save(self.file_path + '/split_indices/test_indices.npy', test_data.index.to_numpy())
        np.save(self.file_path + '/split_indices/val_indices.npy', val_data.index.to_numpy())

        os.makedirs(self.file_path + '/split_data/', exist_ok=True)
        train_data.to_numpy().dump(self.file_path + '/split_data/train_data.npy')
        test_data.to_numpy().dump(self.file_path + '/split_data/test_data.npy')
        val_data.to_numpy().dump(self.file_path + '/split_data/val_data.npy')

        self.train_df = train_data
        self.test_df = test_data
        self.val_df = val_data

        logging.info(
            f"Data split completed: {len(train_data)} train, {len(test_data)} test, {len(val_data)} validation samples.")

[docs]    def fit(self):
        pass


[docs]    def detection(self):
        kw_pair_dict, kw_dict = self.construct_dict(self.df, self.file_path)
        m_kw_pair_dict, m_kw_dict, map_index_to_kw, map_kw_to_index = self.map_dicts(kw_pair_dict, kw_dict,
                                                                                     self.file_path)
        G = self.construct_kw_graph(kw_pair_dict, kw_dict, self.min_cooccur_time, self.min_prob)
        communities = []
        self.detect_kw_communities_iter(G, communities, kw_pair_dict, kw_dict, self.max_kw_num)
        m_communities = self.map_communities(communities, map_kw_to_index)

        logging.info("Model fitted.")
        m_tweets, ground_truths = self.map_tweets(self.df, self.file_path)
        predictions = self.classify_docs(m_tweets, m_communities, map_kw_to_index, self.file_path)

        logging.info("Events detected.")
        return ground_truths, predictions

[docs]    def evaluate(self, ground_truths, predictions):
        """
        Evaluate the model.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

[docs]    def construct_dict(self, df, dir_path=None):
        kw_pair_dict = {}
        kw_dict = {}

        for _, row in df.iterrows():
            tweet_id = str(row['tweet_id'])
            entities = row['entities']
            entities = ['_'.join(tup) for tup in entities]
            for each in entities:
                if each not in kw_dict.keys():
                    kw_dict[each] = []
                kw_dict[each].append(tweet_id)

            words = row['filtered_words']
            for each in words:
                if each not in kw_dict.keys():
                    kw_dict[each] = []
                kw_dict[each].append(tweet_id)

            for r in itertools.combinations(entities + words, 2):
                r = list(r)
                r.sort()
                pair = (r[0], r[1])
                if pair not in kw_pair_dict.keys():
                    kw_pair_dict[pair] = []
                kw_pair_dict[pair].append(tweet_id)

        if dir_path is not None:
            pickle.dump(kw_dict, open(dir_path + '/kw_dict.pickle', 'wb'))
            pickle.dump(kw_pair_dict, open(dir_path + '/kw_pair_dict.pickle', 'wb'))

        return kw_pair_dict, kw_dict

[docs]    def map_dicts(self, kw_pair_dict, kw_dict, dir_path=None):
        map_index_to_kw = {}
        m_kw_dict = {}
        for i, k in enumerate(kw_dict.keys()):
            map_index_to_kw['k' + str(i)] = k
            m_kw_dict['k' + str(i)] = kw_dict[k]
        map_kw_to_index = {v: k for k, v in map_index_to_kw.items()}
        m_kw_pair_dict = {}
        for _, pair in enumerate(kw_pair_dict.keys()):
            m_kw_pair_dict[(map_kw_to_index[pair[0]], map_kw_to_index[pair[1]])] = kw_pair_dict[pair]

        if dir_path is not None:
            pickle.dump(m_kw_pair_dict, open(dir_path + '/m_kw_pair_dict.pickle', 'wb'))
            pickle.dump(m_kw_dict, open(dir_path + '/m_kw_dict.pickle', 'wb'))
            pickle.dump(map_index_to_kw, open(dir_path + '/map_index_to_kw.pickle', 'wb'))
            pickle.dump(map_kw_to_index, open(dir_path + '/map_kw_to_index.pickle', 'wb'))

        return m_kw_pair_dict, m_kw_dict, map_index_to_kw, map_kw_to_index

[docs]    def construct_kw_graph(self, kw_pair_dict, kw_dict, min_cooccur_time, min_prob):
        G = nx.Graph()
        G.add_nodes_from(list(kw_dict.keys()))
        for pair, co_tid_list in kw_pair_dict.items():
            if len(co_tid_list) > min_cooccur_time:
                if (len(co_tid_list) / len(kw_dict[pair[0]]) > min_prob) and (
                        len(co_tid_list) / len(kw_dict[pair[1]]) > min_prob):
                    G.add_edge(*pair)
        return G

[docs]    def detect_kw_communities_iter(self, G, communities, kw_pair_dict, kw_dict, max_kw_num=3):
        connected_components = [c for c in nx.connected_components(G)]
        while len(connected_components) >= 1:
            c = connected_components[0]
            if len(c) < max_kw_num:
                communities.append(c)
                G.remove_nodes_from(c)
            else:
                c_sub_G = G.subgraph(c).copy()
                d = nx.edge_betweenness_centrality(c_sub_G)
                max_value = max(d.values())
                edges = [key for key, value in d.items() if value == max_value]
                if len(edges) > 1:
                    probs = []
                    for e in edges:
                        e = list(e)
                        e.sort()
                        pair = (e[0], e[1])
                        co_len = len(kw_pair_dict[pair])
                        e_prob = (co_len / len(kw_dict[pair[0]]) + co_len / len(kw_dict[pair[1]])) / 2
                        probs.append(e_prob)
                    min_prob = min(probs)
                    min_index = [i for i, j in enumerate(probs) if j == min_prob]
                    edge_to_remove = edges[min_index[0]]
                else:
                    edge_to_remove = edges[0]
                G.remove_edge(*edge_to_remove)
            connected_components = [c for c in nx.connected_components(G)]

[docs]    def map_communities(self, communities, map_kw_to_index):
        m_communities = []
        for cluster in communities:
            m_cluster = ' '.join(map_kw_to_index[kw] for kw in cluster)
            m_communities.append(m_cluster)
        return m_communities

[docs]    def classify_docs(self, test_tweets, m_communities, map_kw_to_index, dir_path=None):
        m_test_tweets = []
        for doc in test_tweets:
            m_doc = ' '.join(map_kw_to_index[kw] for kw in doc)     
            m_test_tweets.append(m_doc)

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(m_communities + m_test_tweets)
        train_size = len(m_communities)
        test_size = len(m_test_tweets)
        classes = []
        for i in range(test_size):
            cosine_similarities = linear_kernel(X[train_size + i], X[:train_size]).flatten()
            max_similarity = cosine_similarities[cosine_similarities.argsort()[-1]]
            related_clusters = [i for i, sim in enumerate(cosine_similarities) if sim == max_similarity]
            if len(related_clusters) == 1:
                classes.append(related_clusters[0])
            else:
                classes.append(random.choice(related_clusters))

        if dir_path is not None:
            np.save(dir_path + '/classes.npy', classes)

        return classes

[docs]    def map_tweets(self, df, dir_path=None):
        m_tweets = []
        ground_truths = []
        for _, row in df.iterrows():
            entities = row['entities']
            entities = ['_'.join(tup) for tup in entities]
            words = row['filtered_words']
            m_tweets.append(entities + words)
            ground_truths.append(row['event_id'])

        if dir_path is not None:
            with open(os.path.join(dir_path, 'm_tweets.pkl'), 'wb') as f:
                pickle.dump(m_tweets, f)
            with open(os.path.join(dir_path, 'ground_truths.pkl'), 'wb') as f:
                pickle.dump(ground_truths, f)

        return m_tweets, ground_truths


# recursive version, can cause RecursionError when running on large graphs. Changed to iterative version below.
[docs]def detect_kw_communities(G, communities, kw_pair_dict, kw_dict, max_kw_num=3):
    connected_components = [c for c in nx.connected_components(G)]
    if len(connected_components) >= 1:
        c = connected_components[0]
        if len(c) < max_kw_num:
            communities.append(c)
            G.remove_nodes_from(c)
        else:
            c_sub_G = G.subgraph(c).copy()
            d = nx.edge_betweenness_centrality(c_sub_G)
            max_value = max(d.values())
            edges = [key for key, value in d.items() if value == max_value]
            # If two edges have the same betweenness score, the one with lower conditional probability will be removed
            if len(edges) > 1:
                probs = []
                for e in edges:
                    e = list(e)
                    e.sort()
                    pair = (e[0], e[1])
                    co_len = len(kw_pair_dict[pair])
                    e_prob = (co_len / len(kw_dict[pair[0]]) + co_len / len(kw_dict[pair[1]])) / 2
                    probs.append(e_prob)
                min_prob = min(probs)
                min_index = [i for i, j in enumerate(probs) if j == min_prob]
                edge_to_remove = edges[min_index[0]]
            else:
                edge_to_remove = edges[0]
            G.remove_edge(*edge_to_remove)
        detect_kw_communities(G, communities, kw_pair_dict, kw_dict, max_kw_num)
    else:
        return