# Source code for SocialED.detector.bert

import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn import metrics
import torch
import logging
import sys
# Add the parent of this file's directory to the import search path; this has
# to happen before the absolute ``dataset`` import on the next line.
# NOTE(review): presumably needed when the module is used without the package
# being installed -- confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import DatasetLoader
# Setup logging: configure the root logger at import time so that INFO-level
# (and above) records are emitted as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class BERT:
    r"""The BERT model for social event detection that uses BERT embeddings
    to detect events in social media data.

    .. note::
        This detector uses BERT embeddings to identify events in social
        media data. The model requires a dataset object with a
        ``load_data()`` method.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data. Must provide a
        ``load_data()`` method that returns the raw data.
    model_name : str, optional
        Path to a pretrained BERT model or a name from HuggingFace.
        If the path doesn't exist, defaults to ``'bert-base-uncased'``.
        Default: ``'../model/model_needed/bert-base-uncased'``.
    max_length : int, optional
        Maximum sequence length for the BERT tokenizer.
        Longer sequences will be truncated.
        Default: ``128``.
    df : pandas.DataFrame, optional
        Preprocessed dataframe. If None, will be created during
        preprocessing.
        Default: ``None``.
    train_df : pandas.DataFrame, optional
        Training data split. Overwritten by :meth:`detection`.
        Default: ``None``.
    test_df : pandas.DataFrame, optional
        Test data split. Overwritten by :meth:`detection`.
        Default: ``None``.
    """

    def __init__(self,
                 dataset,
                 model_name='../model/model_needed/bert-base-uncased',
                 max_length=128,
                 df=None,
                 train_df=None,
                 test_df=None):
        self.dataset = dataset.load_data()

        # Use the local checkpoint when it exists, otherwise fall back to the
        # stock model name.
        if os.path.exists(model_name):
            self.model_name = model_name
        else:
            self.model_name = 'bert-base-uncased'

        self.max_length = max_length
        self.df = df
        self.train_df = train_df
        self.test_df = test_df

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.model = BertModel.from_pretrained(self.model_name).to(self.device)

    def preprocess(self):
        """Build the ``processed_text`` column from ``filtered_words``.

        Every list in ``filtered_words`` is turned into one string by
        converting each element to ``str``, lower-casing it and joining the
        results with single spaces. Entries that are not lists become the
        empty string.

        The column is added to ``self.dataset`` in place, and that same
        frame is stored as ``self.df``.

        Returns
        -------
        pandas.DataFrame
            The frame stored in ``self.df``.
        """
        df = self.dataset
        df['processed_text'] = df['filtered_words'].apply(
            lambda words: ' '.join(str(word).lower() for word in words)
            if isinstance(words, list) else ''
        )
        self.df = df
        return df

    def get_bert_embeddings(self, text):
        """Get the mean-pooled BERT embedding for a given text.

        Parameters
        ----------
        text : str
            Text to embed. It is truncated / padded to ``self.max_length``
            tokens by the tokenizer.

        Returns
        -------
        numpy.ndarray
            Average of the last-layer hidden states over the positions
            selected by the attention mask. For a single input string the
            leading batch dimension is removed.
        """
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_length,
                                truncation=True, padding='max_length')
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)

        last_hidden_states = outputs.last_hidden_state
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: every position counts equally.
            pooled = torch.mean(last_hidden_states, dim=1)
        else:
            # padding='max_length' pads every input up to max_length, so a
            # plain mean over dim=1 would blend the hidden states of the
            # padding positions into the embedding. Weight by the mask so
            # only real tokens contribute.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (last_hidden_states * mask).sum(dim=1) / token_counts

        return pooled.squeeze(0).cpu().numpy()

    def fit(self):
        """Do nothing: this detector uses the pretrained model as-is."""
        pass

    def detection(self):
        """Detect events by comparing BERT embeddings.

        Splits ``self.df`` 80/20 (``random_state=42``), embeds the
        ``processed_text`` of both parts, and labels every test row with the
        ``event_id`` of the training row whose embedding is closest in
        Euclidean distance. If :meth:`preprocess` has not been run yet
        (``self.df`` is None) it is run first.

        The two splits, including their ``bert_embedding`` column, are stored
        in ``self.train_df`` and ``self.test_df``.

        Returns
        -------
        tuple of (list, list)
            ``(ground_truths, predictions)`` for the test split.
        """
        if self.df is None:
            self.preprocess()

        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=42)
        # Work on independent copies: adding a column to a slice of self.df
        # triggers pandas' SettingWithCopyWarning.
        train_df = train_df.copy()
        test_df = test_df.copy()
        self.train_df = train_df
        self.test_df = test_df

        logging.info("Calculating BERT embeddings for the training set...")
        train_df['bert_embedding'] = train_df['processed_text'].apply(self.get_bert_embeddings)
        logging.info("BERT embeddings calculated for the training set.")

        logging.info("Calculating BERT embeddings for the test set...")
        test_df['bert_embedding'] = test_df['processed_text'].apply(self.get_bert_embeddings)
        logging.info("BERT embeddings calculated for the test set.")

        train_embeddings = np.stack(train_df['bert_embedding'].values)
        test_embeddings = np.stack(test_df['bert_embedding'].values)

        # Fetch the labels once instead of materialising a full row per
        # prediction.
        train_labels = train_df['event_id'].to_numpy()

        # One test vector at a time keeps memory bounded by the size of the
        # training matrix.
        nearest = [
            int(np.argmin(np.linalg.norm(train_embeddings - test_emb, axis=1)))
            for test_emb in test_embeddings
        ]
        predictions = train_labels[nearest].tolist()

        ground_truths = test_df['event_id'].tolist()
        return ground_truths, predictions

    def evaluate(self, ground_truths, predictions):
        """Evaluate the BERT-based model.

        Parameters
        ----------
        ground_truths : list
            True event ids.
        predictions : list
            Predicted event ids, aligned with ``ground_truths``.

        Returns
        -------
        tuple of (float, float, float)
            ``(ari, ami, nmi)``. Each score is also printed.
        """
        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        return ari, ami, nmi