class SBERT:
    r"""The SBERT model for social event detection that uses Sentence-BERT
    for text embedding and event detection.

    .. note::
        This detector uses Sentence-BERT to generate text embeddings for
        identifying events in social media data. The model requires a
        dataset object with a ``load_data()`` method.

    Parameters
    ----------
    dataset : object
        The dataset object containing social media data. Must provide a
        ``load_data()`` method that returns the raw data. The methods of
        this class index the result by the ``'filtered_words'`` and
        ``'event_id'`` columns.
    model_name : str, optional
        Local path of the SBERT model to use. If the path does not exist on
        disk, ``'sentence-transformers/paraphrase-MiniLM-L6-v2'`` is loaded
        instead.
        Default: ``'../model/model_needed/paraphrase-MiniLM-L6-v2'``
    df : pandas.DataFrame, optional
        Processed dataframe. Overwritten by :meth:`preprocess`.
        Default: ``None``
    train_df : pandas.DataFrame, optional
        Training dataframe. Overwritten by :meth:`detection`.
        Default: ``None``
    test_df : pandas.DataFrame, optional
        Test dataframe. Overwritten by :meth:`detection`.
        Default: ``None``
    """

    def __init__(
        self,
        dataset,
        model_name='../model/model_needed/paraphrase-MiniLM-L6-v2',
        df=None,
        train_df=None,
        test_df=None,
    ):
        self.dataset = dataset.load_data()

        # NOTE(review): any ``model_name`` that is not an existing local path
        # (including a different hub model name) is replaced by the default
        # hub model below. Kept as-is for backward compatibility -- confirm
        # whether hub names are meant to be passed through.
        if os.path.exists(model_name):
            self.model_name = model_name
        else:
            self.model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

        self.df = df
        self.train_df = train_df
        self.test_df = test_df
        self.model = SentenceTransformer(self.model_name)

    @staticmethod
    def _join_tokens(tokens):
        """Return the lower-cased tokens joined by spaces, or ``''`` when
        *tokens* is not a list."""
        if not isinstance(tokens, list):
            return ''
        return ' '.join(str(word).lower() for word in tokens)

    def preprocess(self):
        """Build the ``'processed_text'`` column from ``'filtered_words'``.

        Each ``'filtered_words'`` entry that is a list is converted to a
        single string of its lower-cased items separated by spaces; any
        other entry becomes the empty string. The column is added to
        ``self.dataset`` in place, and that same frame is stored as
        ``self.df``.

        Returns
        -------
        pandas.DataFrame
            ``self.dataset`` with the added ``'processed_text'`` column.
        """
        df = self.dataset
        df['processed_text'] = df['filtered_words'].apply(self._join_tokens)
        self.df = df
        return df

    def get_sbert_embeddings(self, text):
        """Get SBERT embeddings for a given text.

        Parameters
        ----------
        text : str or list of str
            Input forwarded unchanged to ``self.model.encode``.

        Returns
        -------
        The value returned by ``self.model.encode(text)``.
        """
        return self.model.encode(text)

    def _embed_frame(self, frame):
        """Encode ``frame['processed_text']`` in one batch, store one vector
        per row in ``frame['sbert_embedding']`` and return the 2-D matrix."""
        texts = frame['processed_text'].tolist()
        embeddings = np.asarray(self.get_sbert_embeddings(texts))
        # list() splits the matrix into one 1-D vector per row.
        frame['sbert_embedding'] = list(embeddings)
        return embeddings

    def detection(self):
        """Detect events by comparing SBERT embeddings.

        ``self.df`` is split 80/20 into train and test sets with a fixed
        ``random_state`` of 42. Every test text is assigned the
        ``'event_id'`` of the training text whose embedding is closest in
        Euclidean distance. The two splits, each with an added
        ``'sbert_embedding'`` column, are stored on ``self.train_df`` and
        ``self.test_df``.

        If ``self.df`` is still ``None``, :meth:`preprocess` is run first.

        Returns
        -------
        tuple of (list, list)
            ``(ground_truths, predictions)``: the true and the predicted
            ``'event_id'`` for each row of the test split, in row order.
        """
        if self.df is None:
            self.preprocess()

        train_df, test_df = train_test_split(self.df, test_size=0.2, random_state=42)
        # Work on explicit copies so adding a column below never writes into
        # (or warns about) a view of ``self.df``.
        train_df = train_df.copy()
        test_df = test_df.copy()
        self.train_df = train_df
        self.test_df = test_df

        logging.info("Calculating SBERT embeddings for the training set...")
        train_embeddings = self._embed_frame(train_df)
        logging.info("SBERT embeddings calculated for the training set.")

        logging.info("Calculating SBERT embeddings for the test set...")
        test_embeddings = self._embed_frame(test_df)
        logging.info("SBERT embeddings calculated for the test set.")

        # Hoisted out of the loop: positional label lookup without building
        # a row Series for every test sample.
        train_labels = train_df['event_id'].tolist()

        predictions = []
        for test_emb in test_embeddings:
            distances = np.linalg.norm(train_embeddings - test_emb, axis=1)
            closest_idx = int(np.argmin(distances))
            predictions.append(train_labels[closest_idx])

        ground_truths = test_df['event_id'].tolist()
        return ground_truths, predictions

    def evaluate(self, ground_truths, predictions):
        """Evaluate the model.

        Prints the Normalized Mutual Information, Adjusted Mutual
        Information and Adjusted Rand Index between the two labelings.

        Parameters
        ----------
        ground_truths : list
            True event labels.
        predictions : list
            Predicted event labels, aligned with ``ground_truths``.

        Returns
        -------
        tuple of (float, float, float)
            ``(nmi, ami, ari)``.
        """
        # Calculate Normalized Mutual Information (NMI)
        nmi = metrics.normalized_mutual_info_score(ground_truths, predictions)
        print(f"Normalized Mutual Information (NMI): {nmi}")

        # Calculate Adjusted Mutual Information (AMI)
        ami = metrics.adjusted_mutual_info_score(ground_truths, predictions)
        print(f"Adjusted Mutual Information (AMI): {ami}")

        # Calculate Adjusted Rand Index (ARI)
        ari = metrics.adjusted_rand_score(ground_truths, predictions)
        print(f"Adjusted Rand Index (ARI): {ari}")

        return nmi, ami, ari