import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from torch.autograd import Variable
import os
import pandas as pd
from collections import Counter
from itertools import combinations
from time import time
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import train_test_split
import argparse
import logging
import spacy
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dataset.dataloader import DatasetLoader
[docs]class BiLSTM:
r"""The BiLSTM model for social event detection that uses bidirectional LSTM
to detect events in social media data.
.. note::
This detector uses bidirectional LSTM to identify events in social media data.
The model requires a dataset object with a load_data() method.
Parameters
----------
dataset : object
The dataset object containing social media data.
Must provide load_data() method that returns the raw data.
lr : float, optional
Learning rate for optimizer. Default: ``1e-3``.
batch_size : int, optional
Batch size for training. Default: ``1000``.
dropout_keep_prob : float, optional
Dropout keep probability. Default: ``0.8``.
embedding_size : int, optional
Size of word embeddings. Default: ``300``.
max_size : int, optional
Maximum vocabulary size. Default: ``5000``.
seed : int, optional
Random seed for reproducibility. Default: ``1``.
num_hidden_nodes : int, optional
Number of LSTM hidden nodes. Default: ``32``.
hidden_dim2 : int, optional
Size of second hidden layer. Default: ``64``.
num_layers : int, optional
Number of LSTM layers. Default: ``1``.
bi_directional : bool, optional
Whether to use bidirectional LSTM. Default: ``True``.
pad_index : int, optional
Index used for padding. Default: ``0``.
num_epochs : int, optional
Number of training epochs. Default: ``20``.
margin : int, optional
Margin for triplet loss. Default: ``3``.
max_len : int, optional
Maximum sequence length. Default: ``10``.
file_path : str, optional
Path to save model files. Default: ``'../model/model_saved/Bilstm/'``.
"""
def __init__(self, dataset,
lr=1e-3,
batch_size=1000,
dropout_keep_prob=0.8,
embedding_size=300,
max_size=5000,
seed=1,
num_hidden_nodes=32,
hidden_dim2=64,
num_layers=1,
bi_directional=True,
pad_index=0,
num_epochs=20,
margin=3,
max_len=10,
file_path='../model/model_saved/Bilstm/'):
self.dataset = dataset.load_data()
self.lr = lr
self.batch_size = batch_size
self.dropout_keep_prob = dropout_keep_prob
self.embedding_size = embedding_size
self.max_size = max_size
self.seed = seed
self.num_hidden_nodes = num_hidden_nodes
self.hidden_dim2 = hidden_dim2
self.num_layers = num_layers
self.bi_directional = bi_directional
self.pad_index = pad_index
self.num_epochs = num_epochs
self.margin = margin
self.max_len = max_len
self.file_path = file_path
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.df = None
self.train_df = None
self.test_df = None
self.word2idx = None
self.idx2word = None
self.weight = None
# Add the rest of the class methods here, like preprocess, fit, detection, etc.
[docs] def preprocess(self):
"""
Data preprocessing: tokenization, stop words removal, etc.
"""
self.split()
df = self.dataset[['event_id', 'words', 'filtered_words']].copy()
# Tokenize tweets
# f_batch_text = df.iloc[:, 5]
f_batch_text = df.iloc[:, 2]
logging.info("Extracted tweets.")
# Count unique words (converted to lowercases)
words = Counter()
for tweet in f_batch_text.values:
words.update(w.lower() for w in tweet)
# Convert words from counter to list (sorted by frequencies from high to low)
words = [key for key, _ in words.most_common()]
words = ['_PAD', '_UNK'] + words
logging.info('Extracted unique words.')
# Construct a mapping of words to indices and vice versa
self.word2idx = {o: i for i, o in enumerate(words)}
self.idx2word = {i: o for i, o in enumerate(words)}
# Save
os.makedirs(self.file_path, exist_ok=True)
np.save(self.file_path + 'word2idx.npy', self.word2idx)
np.save(self.file_path + 'idx2word.npy', self.idx2word)
logging.info('Constructed and saved word2idx and idx2word maps.')
# Load
self.word2idx = np.load(self.file_path + 'word2idx.npy', allow_pickle='TRUE').item()
logging.info('word2idx map loaded.')
df["wordsidx"] = df.words.apply(
lambda tweet: [self.word2idx.get(w.lower(), self.word2idx['_UNK']) for w in tweet])
logging.info('Tokenized tweets in the df to word indices.')
self.df = df
return df
[docs] def split(self):
"""
Split the dataset into training, validation, and test sets.
"""
train_ratio = 0.7
test_ratio = 0.2
val_ratio = 0.1
df = self.dataset
train_data, temp_data = train_test_split(df, test_size=(1 - train_ratio), random_state=42)
test_size = test_ratio / (test_ratio + val_ratio)
test_data, val_data = train_test_split(temp_data, test_size=test_size, random_state=42)
os.makedirs(self.file_path + '/split_indices/', exist_ok=True)
np.save(self.file_path + '/split_indices/train_indices_7170.npy', train_data.index.to_numpy())
np.save(self.file_path + '/split_indices/test_indices_2048.npy', test_data.index.to_numpy())
np.save(self.file_path + '/split_indices/val_indices_1024.npy', val_data.index.to_numpy())
os.makedirs(self.file_path + '/split_data/', exist_ok=True)
train_data.to_numpy().dump(self.file_path + '/split_data/train_data_7170.npy')
test_data.to_numpy().dump(self.file_path + '/split_data/test_data_2048.npy')
val_data.to_numpy().dump(self.file_path + '/split_data/val_data_1024.npy')
self.train_df = train_data
self.test_df = test_data
self.val_df = val_data
logging.info(
f"Data split completed: {len(train_data)} train, {len(test_data)} test, {len(val_data)} validation samples.")
[docs] def load_embeddings(self):
"""
Load pre-trained word embeddings.
"""
# Initialize weight matrix with zeros
self.weight = np.zeros((len(self.word2idx), self.embedding_size), dtype=np.float64)
# Load pre-trained word2vec model
start = time()
nlp = spacy.load("en_core_web_lg")
logging.info('Word2vec model took {:.2f} mins to load.'.format((time() - start) / 60))
# Update word embeddings to weight
for i in range(len(self.word2idx)):
w = self.idx2word.get(i)
token = nlp(w)
if token.has_vector:
self.weight[i] = token.vector
logging.info('Word embeddings extracted. Shape: {}'.format(self.weight.shape))
# Save and load word embeddings
np.save(self.file_path + 'word_embeddings.npy', self.weight)
logging.info('Word embeddings saved.')
self.weight = np.load(self.file_path + 'word_embeddings.npy')
logging.info('Word embeddings loaded. Shape: {}'.format(self.weight.shape))
self.weight = torch.tensor(self.weight, dtype=torch.float)
[docs] def train(self, model, train_iterator, optimizer, loss_func, log_interval=40):
"""
Train the BiLSTM model.
"""
n_batches = len(train_iterator)
epoch_loss = 0
for i, batch in enumerate(train_iterator):
optimizer.zero_grad()
text, text_lengths = batch['text']
predictions = model(text, text_lengths)
loss, num_triplets = loss_func(predictions, batch['label'])
loss.backward()
optimizer.step()
epoch_loss += loss.item()
if i % log_interval == 0:
print(
f'\tBatch: [{i}/{n_batches} ({100. * (i + 1) / n_batches:.0f}%)]\tLoss: {epoch_loss / (i + 1):.4f}\tNum_triplets: {num_triplets}')
return epoch_loss / n_batches
[docs] def evaluate(self, ground_truths, predictions):
"""
Evaluate the model.
"""
# Calculate Normalized Mutual Information (NMI)
nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
print(f"Normalized Mutual Information (NMI): {nmi}")
# Calculate Adjusted Mutual Information (AMI)
ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
print(f"Adjusted Mutual Information (AMI): {ami}")
# Calculate Adjusted Rand Index (ARI)
ari = metrics.adjusted_rand_score(ground_truths, predictions)
print(f"Adjusted Rand Index (ARI): {ari}")
[docs] def run_train(self, epochs, model, train_iterator, test_iterator, optimizer, loss_func):
"""
Run the training and evaluation process for the BiLSTM model.
"""
all_nmi, all_ami, all_ari, all_predictions, all_labels = [], [], [], [], []
for epoch in range(epochs):
# Train the model
start = time()
print(f'Epoch {epoch}. Training.')
train_loss = self.train(model, train_iterator, optimizer, loss_func)
print(f'\tTrain Loss: {train_loss:.4f}')
print(f'\tThis epoch took {(time() - start) / 60:.2f} mins to train.')
# Evaluate the model
start = time()
print(f'Epoch {epoch}. Evaluating.')
model.eval()
with torch.no_grad():
for i, batch in enumerate(test_iterator):
assert i == 0 # cluster all the test tweets at once
text, text_lengths = batch['text']
predictions = model(text, text_lengths)
assert predictions.shape[0] == batch['label'].shape[0]
n_classes = len(set(batch['label'].tolist()))
kmeans = KMeans(n_clusters=n_classes, n_init=10, random_state=0).fit(predictions)
predictions = kmeans.labels_
validate_nmi = metrics.normalized_mutual_info_score(batch['label'], predictions)
validate_ami = metrics.adjusted_mutual_info_score(batch['label'], predictions)
validate_ari = metrics.adjusted_rand_score(batch['label'], predictions)
all_nmi.append(validate_nmi)
all_ami.append(validate_ami)
all_ari.append(validate_ari)
all_predictions.append(predictions)
all_labels.append(batch['label'])
print(f'\tVal. NMI: {validate_nmi:.4f}')
print(f'\tVal. AMI: {validate_ami:.4f}')
print(f'\tVal. ARI: {validate_ari:.4f}')
print(f'\tThis epoch took {(time() - start) / 60:.2f} mins to evaluate.')
return all_nmi, all_ami, all_ari, all_predictions, all_labels
[docs] def fit(self):
"""
Fit the model on the training data and save the best model.
"""
self.load_embeddings()
# Split training and test datasets
train_mask = list(np.load(self.file_path + '/split_indices/train_indices_7170.npy', allow_pickle=True))
test_mask = list(np.load(self.file_path + '/split_indices/test_indices_2048.npy', allow_pickle=True))
train_data = VectorizeData(self.df.iloc[train_mask, :].copy().reset_index(drop=True), self.max_len)
test_data = VectorizeData(self.df.iloc[test_mask, :].copy().reset_index(drop=True), self.max_len)
# Construct training and test iterator
train_iterator = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
test_iterator = DataLoader(test_data, batch_size=len(test_data), shuffle=True)
# Loss function
loss_func = OnlineTripletLoss(self.margin, RandomNegativeTripletSelector(self.margin))
# Model
lstm_model = LSTM(self.embedding_size, self.weight, self.num_hidden_nodes, self.hidden_dim2,
self.num_layers, self.bi_directional, self.dropout_keep_prob, self.pad_index,
self.batch_size)
# Optimizer
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=self.lr)
# Train and evaluation
all_nmi, all_ami, all_ari, all_predictions, all_labels = self.run_train(self.num_epochs, lstm_model,
train_iterator, test_iterator,
optimizer, loss_func)
best_epoch = [i for i, j in enumerate(all_nmi) if j == max(all_nmi)][0]
print("all_nmi: ", all_nmi)
print("all_ami: ", all_ami)
print("all_ari: ", all_ari)
print("\nTraining completed. Best results at epoch ", best_epoch)
# Save the best model
self.best_model_path = os.path.join(self.file_path, "best_model.pth")
torch.save(lstm_model.state_dict(), self.best_model_path)
print(f"Best model saved at {self.best_model_path}")
self.best_epoch = best_epoch
self.best_model = lstm_model
[docs] def detection(self):
"""
Detect events using the best trained model on the test data.
"""
# Load the best model
lstm_model = LSTM(self.embedding_size, self.weight, self.num_hidden_nodes, self.hidden_dim2,
self.num_layers, self.bi_directional, self.dropout_keep_prob, self.pad_index,
self.batch_size)
lstm_model.load_state_dict(torch.load(self.best_model_path))
lstm_model.eval()
# Load test data
test_mask = list(np.load(self.file_path + '/split_indices/test_indices_2048.npy', allow_pickle=True))
test_data = VectorizeData(self.df.iloc[test_mask, :].copy().reset_index(drop=True), self.max_len)
test_iterator = DataLoader(test_data, batch_size=len(test_data), shuffle=False)
with torch.no_grad():
for i, batch in enumerate(test_iterator):
assert i == 0 # Process all test tweets at once
text, text_lengths = batch['text']
predictions = lstm_model(text, text_lengths)
assert predictions.shape[0] == batch['label'].shape[0]
n_classes = len(set(batch['label'].tolist()))
kmeans = KMeans(n_clusters=n_classes, n_init=10, random_state=0).fit(predictions)
predictions = kmeans.labels_
ground_truths = batch['label']
return ground_truths, predictions
[docs]class LSTM(nn.Module):
# define all the layers used in model
def __init__(self, embedding_dim, weight, lstm_units, hidden_dim, lstm_layers,
bidirectional, dropout, pad_index, batch_size):
super().__init__()
# self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
# use pretrained embeddings
self.embedding = nn.Embedding.from_pretrained(weight, padding_idx=pad_index)
self.lstm = nn.LSTM(embedding_dim,
lstm_units,
num_layers=lstm_layers,
bidirectional=bidirectional,
batch_first=True)
num_directions = 2 if bidirectional else 1
self.fc1 = nn.Linear(lstm_units * num_directions, hidden_dim)
# self.fc2 = nn.Linear(hidden_dim, num_classes)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(dropout)
self.lstm_layers = lstm_layers
self.num_directions = num_directions
self.lstm_units = lstm_units
[docs] def init_hidden(self, batch_size):
h, c = (Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)),
Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)))
return h, c
[docs] def forward(self, text, text_lengths):
batch_size = text.shape[0]
h_0, c_0 = self.init_hidden(batch_size)
embedded = self.embedding(text)
packed_embedded = pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
# output of shape (batch, seq_len, num_directions * hidden_size): tensor containing the
# output features (h_t) from the last layer of the LSTM, for each t.
output, (h_n, c_n) = self.lstm(packed_embedded, (h_0, c_0))
output_unpacked, output_lengths = pad_packed_sequence(output, batch_first=True)
# get the hidden state of the last time step
out = output_unpacked[:, -1, :]
rel = self.relu(out)
dense1 = self.fc1(rel)
# drop = self.dropout(dense1)
# preds = self.fc2(drop)
preds = self.dropout(dense1)
return preds
[docs]class VectorizeData(Dataset):
def __init__(self, df, max_len):
self.df = df
self.maxlen = max_len
self.df["lengths"] = self.df.wordsidx.apply(lambda x: self.maxlen if len(x) > self.maxlen else len(x))
self.df = self.df[self.df["lengths"] > 0].reset_index(drop=True)
self.df["wordsidxpadded"] = self.df.wordsidx.apply(self.pad_data)
def __len__(self):
return self.df.shape[0]
def __getitem__(self, idx):
x = self.df.wordsidxpadded[idx]
lens = self.df.lengths[idx] # truncated tweet length
y = self.df.event_id[idx]
sample = {'text': (x, lens), 'label': y}
return sample
[docs] def pad_data(self, tweet):
padded = np.zeros((self.maxlen,), dtype=np.int64)
if len(tweet) > self.maxlen:
padded[:] = tweet[:self.maxlen]
else:
padded[:len(tweet)] = tweet
return padded
[docs]class OnlineTripletLoss(nn.Module):
"""
Online Triplets loss
Takes a batch of embeddings and corresponding labels.
Triplets are generated using triplet_selector object that take embeddings and targets and return indices of
triplets
"""
def __init__(self, margin, triplet_selector):
super(OnlineTripletLoss, self).__init__()
self.margin = margin
self.triplet_selector = triplet_selector
[docs] def forward(self, embeddings, target):
triplets = self.triplet_selector.get_triplets(embeddings, target)
if embeddings.is_cuda:
triplets = triplets.cuda()
ap_distances = (embeddings[triplets[:, 0]] - embeddings[triplets[:, 1]]).pow(2).sum(1) # .pow(.5)
an_distances = (embeddings[triplets[:, 0]] - embeddings[triplets[:, 2]]).pow(2).sum(1) # .pow(.5)
losses = F.relu(ap_distances - an_distances + self.margin)
return losses.mean(), len(triplets)
[docs]def pdist(vectors):
distance_matrix = -2 * vectors.mm(torch.t(vectors)) + vectors.pow(2).sum(dim=1).view(1, -1) + vectors.pow(2).sum(
dim=1).view(-1, 1)
return distance_matrix
[docs]class TripletSelector:
"""
Implementation should return indices of anchors, positive and negative samples
return np array of shape [N_triplets x 3]
"""
def __init__(self):
pass
[docs] def get_triplets(self, embeddings, labels):
raise NotImplementedError
[docs]class FunctionNegativeTripletSelector(TripletSelector):
"""
For each positive pair, takes the hardest negative sample (with the greatest triplet loss value) to create a triplet
Margin should match the margin used in triplet loss.
negative_selection_fn should take array of loss_values for a given anchor-positive pair and all negative samples
and return a negative index for that pair
"""
def __init__(self, margin, negative_selection_fn, cpu=True):
super(FunctionNegativeTripletSelector, self).__init__()
self.cpu = cpu
self.margin = margin
self.negative_selection_fn = negative_selection_fn
[docs] def get_triplets(self, embeddings, labels):
if self.cpu:
embeddings = embeddings.cpu()
distance_matrix = pdist(embeddings)
distance_matrix = distance_matrix.cpu()
labels = labels.cpu().data.numpy()
triplets = []
for label in set(labels):
label_mask = (labels == label)
label_indices = np.where(label_mask)[0]
if len(label_indices) < 2:
continue
negative_indices = np.where(np.logical_not(label_mask))[0]
anchor_positives = list(combinations(label_indices, 2)) # All anchor-positive pairs
anchor_positives = np.array(anchor_positives)
ap_distances = distance_matrix[anchor_positives[:, 0], anchor_positives[:, 1]]
for anchor_positive, ap_distance in zip(anchor_positives, ap_distances):
loss_values = ap_distance - distance_matrix[
torch.LongTensor(np.array([anchor_positive[0]])), torch.LongTensor(negative_indices)] + self.margin
loss_values = loss_values.data.cpu().numpy()
hard_negative = self.negative_selection_fn(loss_values)
if hard_negative is not None:
hard_negative = negative_indices[hard_negative]
triplets.append([anchor_positive[0], anchor_positive[1], hard_negative])
if len(triplets) == 0:
triplets.append([anchor_positive[0], anchor_positive[1], negative_indices[0]])
triplets = np.array(triplets)
return torch.LongTensor(triplets)
[docs]def random_hard_negative(loss_values):
hard_negatives = np.where(loss_values > 0)[0]
return np.random.choice(hard_negatives) if len(hard_negatives) > 0 else None
[docs]def hardest_negative(loss_values):
hard_negative = np.argmax(loss_values)
return hard_negative if loss_values[hard_negative] > 0 else None
[docs]def HardestNegativeTripletSelector(margin, cpu=False): return FunctionNegativeTripletSelector(margin=margin,
negative_selection_fn=hardest_negative,
cpu=cpu)
[docs]def RandomNegativeTripletSelector(margin, cpu=False): return FunctionNegativeTripletSelector(margin=margin,
negative_selection_fn=random_hard_negative,
cpu=cpu)