import numpy as np
import os
from git import Repo, GitCommandError
import pandas as pd
import shutil
from uuid import uuid4
from datetime import datetime
import subprocess
import tempfile
[docs]class DatasetLoader:
r"""Base class for loading social event detection datasets.
.. note::
This is the base dataset loader class that provides common functionality for loading
and preprocessing social event detection datasets. All specific dataset loaders should
inherit from this class.
Parameters
----------
dataset : str, optional
Name of the dataset to load.
Default: ``None``.
dir_path : str, optional
Custom directory path to load data from.
Default: ``None``.
Attributes
----------
required_columns : list
Required columns that must be present in loaded datasets.
repo_url : str
URL of the repository containing the datasets.
target_folder : str
Target folder name for downloaded data.
"""
def __init__(self, dataset=None, dir_path=None):
self.dir_path = dir_path
self.dataset = dataset
self.default_root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/data"))
print(f"Data root path: {self.default_root_path}") # 调试信息
os.makedirs(self.default_root_path, exist_ok=True)
self.required_columns = [
'tweet_id', 'text', 'event_id', 'words', 'filtered_words',
'entities', 'user_id', 'created_at', 'urls', 'hashtags', 'user_mentions'
]
self.repo_url = "https://github.com/ChenBeici/SocialED_datasets.git"
self.target_folder = "npy_data"
[docs] def download_and_cleanup(self, repo_url, dataset_name, local_target_folder):
# 创建临时目录
local_repo_path = os.path.join(os.path.dirname(__file__), "tmp", str(uuid4()))
try:
print(f"Downloading {dataset_name}.npy from {repo_url}")
# 克隆仓库
subprocess.run(['git', 'clone', '--branch', 'main', repo_url, local_repo_path], check=True)
# 确保目标目录存在
os.makedirs(local_target_folder, exist_ok=True)
print(f"Target directory: {local_target_folder}") # 调试信息
# 搜索.npy文件
npy_files = []
for root, dirs, files in os.walk(local_repo_path):
for file in files:
if file == f'{dataset_name}.npy':
npy_files.append(os.path.join(root, file))
if npy_files:
target_file = os.path.join(local_target_folder, f'{dataset_name}.npy')
print(f"Copying from {npy_files[0]} to {target_file}") # 调试信息
shutil.copy2(npy_files[0], target_file)
return True
else:
print(f"Error: {dataset_name}.npy not found in repository")
return False
except Exception as e:
print(f"Error during download: {str(e)}")
return False
finally:
if os.path.exists(local_repo_path):
shutil.rmtree(local_repo_path)
[docs] def download(self):
local_target_folder = os.path.join(self.default_root_path, self.dataset)
return self.download_and_cleanup(
self.repo_url,
self.dataset,
local_target_folder
)
[docs] def load_data(self):
"""Temporary implementation that returns empty dataset"""
print(f"Loading {self.dataset} dataset (mock data)")
return {
'texts': [],
'labels': [],
'metadata': {'name': self.dataset}
}
[docs] def get_dataset_language(self):
"""
Determine the language based on the current dataset.
Returns:
str: The language of the dataset ('English', 'French', 'Arabic').
"""
dataset_language_map = {
'MAVEN': 'English',
'Event2012': 'English',
'Event2018': 'French',
'Arabic_Twitter': 'Arabic',
'CrisisLexT26': 'English',
'CrisisLexT6': 'English',
'CrisisMMD': 'English',
'CrisisNLP': 'English',
'HumAID': 'English',
'Mix_Data': 'English',
'KBP': 'English',
'Event2012_100': 'English',
'Event2018_100': 'French',
'Arabic_7': 'Arabic'
}
language = dataset_language_map.get(self.dataset)
if not language:
raise ValueError(f"Unsupported dataset: {self.dataset}. Supported datasets are: {', '.join(dataset_language_map.keys())}")
return language
[docs] def get_dataset_name(self):
"""
Get the name of the current dataset.
Returns:
str: The name of the dataset.
"""
return self.dataset
[docs] def get_dataset_info(self):
"""
Get the info of the current dataset.
Returns:
list: The info of the dataset.
"""
df = self.load_data().sort_values(by='created_at').reset_index()
print(self.get_dataset_name())
print(self.get_dataset_language())
print("Columns:", df.columns.tolist())
print("First row:")
print(df.iloc[0].to_dict())
print("DataFrame shape:", df.shape)
#return end
[docs]class MAVEN(DatasetLoader):
r"""The MAVEN dataset for social event detection.
.. note::
This dataset contains English language social media posts related to various events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='MAVEN', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("MAVEN dataset loaded successfully.")
return df
[docs]class CrisisNLP(DatasetLoader):
r"""The CrisisNLP dataset for social event detection.
.. note::
This dataset contains English language social media posts related to crisis events.
The dataset provides text content and event labels for crisis event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='CrisisNLP', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("CrisisNLP dataset loaded successfully.")
return df
[docs]class Event2012(DatasetLoader):
r"""The Event2012 dataset for social event detection.
.. note::
This dataset contains English language social media posts from 2012.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Event2012', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Event2012 dataset loaded successfully.")
return df
[docs]class Event2018(DatasetLoader):
r"""The Event2018 dataset for social event detection.
.. note::
This dataset contains French language social media posts from 2018.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Event2018', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Event2018 dataset loaded successfully.")
return df
[docs]class CrisisLexT26(DatasetLoader):
r"""The CrisisLexT26 dataset for social event detection.
.. note::
This dataset contains tweets related to 26 different crisis events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='CrisisLexT26', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("CrisisLexT26 dataset loaded successfully.")
return df
[docs]class CrisisMMD(DatasetLoader):
r"""The CrisisMMD dataset for social event detection.
.. note::
This dataset contains multimodal crisis-related social media data.
The dataset provides text, images and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='CrisisMMD', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("CrisisMMD dataset loaded successfully.")
return df
[docs]class HumAID(DatasetLoader):
r"""The HumAID dataset for social event detection.
.. note::
This dataset contains tweets related to humanitarian crises and disasters.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='HumAID', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("HumAID dataset loaded successfully.")
return df
[docs]class KBP(DatasetLoader):
r"""The KBP dataset for social event detection.
.. note::
This dataset contains knowledge base population event data.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='KBP', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("KBP dataset loaded successfully.")
return df
[docs]class Arabic_7(DatasetLoader):
r"""The Arabic_7 dataset for social event detection.
.. note::
This dataset contains Arabic language social media posts for 7 event types.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Arabic_7', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Arabic_7 dataset loaded successfully.")
return df
[docs]class Event2012_100(DatasetLoader):
r"""The Event2012_100 dataset for social event detection.
.. note::
This dataset contains tweets from 2012 related to 100 different events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Event2012_100', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Event2012_100 dataset loaded successfully.")
return df
[docs]class Event2018_100(DatasetLoader):
r"""The Event2018_100 dataset for social event detection.
.. note::
This dataset contains tweets from 2018 related to 100 different events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Event2018_100', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Event2018_100 dataset loaded successfully.")
return df
[docs]class Mix_Data(DatasetLoader):
r"""The Mix_Data dataset for social event detection.
.. note::
This dataset contains a mixture of social media data from various sources.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='Mix_Data', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("Mix_Data dataset loaded successfully.")
return df
[docs]class CrisisLexT6(DatasetLoader):
r"""The CrisisLexT6 dataset for social event detection.
.. note::
This dataset contains tweets related to 6 different crisis events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='CrisisLexT6', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("CrisisLexT6 dataset loaded successfully.")
return df
[docs]class CrisisLexT7(DatasetLoader):
r"""The CrisisLexT7 dataset for social event detection.
.. note::
This dataset contains tweets related to 7 different crisis events.
The dataset provides text content and event labels for social event detection tasks.
"""
def __init__(self, dir_path=None):
super().__init__(dataset='CrisisLexT7', dir_path=dir_path)
[docs] def load_data(self):
dataset_path = os.path.join(self.default_root_path, self.dataset)
print(f"Dataset path: {dataset_path}")
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
if not self.download():
raise RuntimeError("Failed to download dataset")
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
print(f"Loading file from: {file_path}")
if not os.path.exists(file_path):
print(f"File not found at: {file_path}")
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
raise FileNotFoundError(f"Data file not found at {file_path}")
data = np.load(file_path, allow_pickle=True)
df = pd.DataFrame(data, columns=self.required_columns)
print("CrisisLexT7 dataset loaded successfully.")
return df
if __name__ == "__main__":
# Test MAVEN dataset
#maven = MAVEN()
#dataset = MAVEN().load_data()
print(Event2018().get_dataset_name())
print(Event2018().get_dataset_language())