# -*- coding: utf-8 -*-"""Data processing utilities for multilingual social media data."""importnumpyasnpimporttorchimportosfromdatetimeimportdatetimefromcollectionsimportCounterimportrequestsfromstatisticsimportmean
[docs]defconstruct_graph(df,G=None):"""Construct a graph from a DataFrame containing social media data. Parameters ---------- df : pandas.DataFrame DataFrame containing social media data with columns: tweet_id, user_mentions, user_id, entities, sampled_words G : networkx.Graph, optional (default=None) Existing graph to add nodes/edges to. If None, creates new graph. Returns ------- G : networkx.Graph Graph with nodes for tweets, users, entities and words, and edges between them. """importnetworkxasnxifGisNone:G=nx.Graph()for_,rowindf.iterrows():# Add tweet nodetid='t_'+str(row['tweet_id'])G.add_node(tid)G.nodes[tid]['tweet_id']=True# Add user nodesuser_ids=row['user_mentions']user_ids.append(row['user_id'])user_ids=['u_'+str(each)foreachinuser_ids]G.add_nodes_from(user_ids)foreachinuser_ids:G.nodes[each]['user_id']=True# Add entity nodesentities=row['entities']G.add_nodes_from(entities)foreachinentities:G.nodes[each]['entity']=True# Add word nodeswords=['w_'+eachforeachinrow['sampled_words']]G.add_nodes_from(words)foreachinwords:G.nodes[each]['word']=True# Add edges between tweet and other nodesedges=[]edges+=[(tid,each)foreachinuser_ids]edges+=[(tid,each)foreachinentities]edges+=[(tid,each)foreachinwords]G.add_edges_from(edges)returnG
[docs]defload_data(name,cache_dir=None):""" Data loading function that downloads .npy files from SocialED_datasets repository. Parameters ---------- name : str The name of the dataset. cache_dir : str, optional The directory for dataset caching. Default: ``None``. Returns ------- data : numpy.ndarray The loaded dataset. """ifcache_dirisNone:cache_dir=os.path.join(os.path.expanduser('~'),'.socialed/data')file_path=os.path.join(cache_dir,name+'.npy')ifos.path.exists(file_path):data=np.load(file_path,allow_pickle=True)else:url="https://github.com/ChenBeici/SocialED_datasets/raw/main/npy_data/"+name+".npy"ifnotos.path.exists(cache_dir):os.makedirs(cache_dir)r=requests.get(url,stream=True)ifr.status_code!=200:raiseRuntimeError("Failed downloading url %s"%url)withopen(file_path,'wb')asf:forchunkinr.iter_content(chunk_size=1024):ifchunk:# filter out keep-alive new chunksf.write(chunk)data=np.load(file_path,allow_pickle=True)returndata
[docs]defgraph_statistics(G,save_path):""" Calculate and save basic statistics of a graph. Parameters ---------- G : networkx.Graph The input graph to analyze. save_path : str Directory path to save the statistics. Returns ------- num_isolated_nodes : int Number of isolated nodes in the graph. """message='\nGraph statistics:\n'num_nodes=G.number_of_nodes()num_edges=G.number_of_edges()ave_degree=(num_edges/2)//num_nodesin_degrees=G.in_degrees()isolated_nodes=torch.zeros([in_degrees.size()[0]],dtype=torch.long)isolated_nodes=(in_degrees==isolated_nodes)torch.save(isolated_nodes,save_path+'/isolated_nodes.pt')num_isolated_nodes=torch.sum(isolated_nodes).item()message+='We have '+str(num_nodes)+' nodes.\n'message+='We have '+str(num_edges/2)+' in-edges.\n'message+='Average degree: '+str(ave_degree)+'\n'message+='Number of isolated nodes: '+str(num_isolated_nodes)+'\n'print(message)withopen(save_path+"/graph_statistics.txt","a")asf:f.write(message)returnnum_isolated_nodes
[docs]defextract_time_feature(t_str):""" Extract time features from timestamp string. Parameters ---------- t_str : str Timestamp string in ISO format. Returns ------- list List containing two normalized time features: [days, seconds]. """t=datetime.fromisoformat(str(t_str))OLE_TIME_ZERO=datetime(1899,12,30)delta=t-OLE_TIME_ZEROreturn[(float(delta.days)/100000.),(float(delta.seconds)/86400)]# 86,400 seconds in day
[docs]defget_word2id_emb(wordpath,embpath):""" Load word-to-id mapping and embeddings from files. Parameters ---------- wordpath : str Path to file containing words. embpath : str Path to file containing embeddings. Returns ------- tuple (word2id dictionary, embeddings array). """word2id={}withopen(wordpath,'r')asf:fori,winenumerate(list(f.readlines()[0].split())):word2id[w]=iembeddings=np.load(embpath)returnword2id,embeddings
[docs]defdf_to_t_features(df):""" Convert DataFrame timestamps to time features. Parameters ---------- df : pandas.DataFrame DataFrame with 'created_at' column containing timestamps. Returns ------- numpy.ndarray Array of time features for each timestamp. """t_features=np.asarray([extract_time_feature(t_str)fort_strindf['created_at']])returnt_features
[docs]defcheck_class_sizes(ground_truths,predictions):""" Check sizes of predicted classes against ground truth classes. Parameters ---------- ground_truths : array-like Ground truth class labels. predictions : array-like Predicted class labels. Returns ------- list List of predicted class labels that are larger than average ground truth class size. """count_true_labels=list(Counter(ground_truths).values())ave_true_size=mean(count_true_labels)distinct_predictions=list(Counter(predictions).keys())count_predictions=list(Counter(predictions).values())large_classes=[distinct_predictions[i]fori,countinenumerate(count_predictions)ifcount>ave_true_size]returnlarge_classes