Commit 8ad83712 authored by Shubhankar Patankar's avatar Shubhankar Patankar

.py to compute filtration metrics

parent 773ddc18
%% Cell type:code id:2bff3ff8-b980-4b65-94a8-745dade96bc0 tags:
``` python
import os, sys
import json
import networkx as nx
import bct # Brain Connectivity Toolbox (bctpy)
import numpy as np
import utils_network_metrics
import pickle
import graph_tool.all as gt
import matplotlib.pyplot as plt
from sqlitedict import SqliteDict
import utils_network
import utils_gt
import utils_networkx # functions for generating the different nodes/edges needed for the networks
import matplotlib.cm as cm
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id:4cd29f82-4a94-4693-8ffb-5687682e281c tags:
``` python
snapshot = "2022-03"
wiki_db = "enwiki"
country_code_sel = "US" # country filter: keep sessions whose pageviews all come from this country
N_sample = 1000 # number of sessions in the sample
# mode = "pickle" # better for bulk processing of sessions
mode = "sqlite" # better for processing of individual sessions
```
%% Cell type:code id:6914e5c8-e2f3-452d-802c-97096f5681dd tags:
``` python
def filter_session_country(session, country_code):
    """
    Keep only sessions in which all pageviews come from the same country
    (specified via country_code).
    Returns True if the session should be kept, False otherwise.
    """
    session_country_list = list(set(h["country_code"] for h in session))
    return session_country_list == [country_code]
```
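%% Cell type:markdown tags:

A quick sanity check of `filter_session_country` on hand-made input; the two toy sessions below are hypothetical and only include the `country_code` field that the filter reads.

%% Cell type:code tags:
``` python
# hypothetical toy sessions to illustrate the filter's behavior
session_us = [{"country_code": "US"}, {"country_code": "US"}]
session_mixed = [{"country_code": "US"}, {"country_code": "DE"}]
print(filter_session_country(session_us, "US")) # True: all pageviews from the US
print(filter_session_country(session_mixed, "US")) # False: pageviews from mixed countries
```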
%% Cell type:code id:8bfeecba-fd39-4b4c-85a0-11273cbfe4d9 tags:
``` python
# load the sample of 1000 people's sessions (one JSON line per session)
FNAME_read = "/home/mgerlach/REPOS/curios-critical-readers/data/sessions-app_%s_%s_small.json" % (wiki_db, snapshot)
n_processed = 0
n_filter = 0 # counter for sessions kept by the country filter (see the sketch below)
list_sessions = []
with open(FNAME_read) as fin:
    for line in fin:
        json_in = json.loads(line)
        n_processed += 1
        session = json_in.get("session", [])
        list_sessions.append(session)
print("Number of sessions processed:", n_processed)
```
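%% Cell type:markdown tags:

The country filter defined above is not applied while loading; this is a minimal sketch of how it could be wired in, reusing the `country_code_sel` and `n_filter` variables already defined. The filtered list is illustrative and is not consumed by the later cells.

%% Cell type:code tags:
``` python
# sketch: keep only sessions whose pageviews all come from country_code_sel
list_sessions_filtered = [s for s in list_sessions
                          if filter_session_country(s, country_code_sel)]
n_filter = len(list_sessions_filtered)
print("Number of sessions kept after country filter:", n_filter)
```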
%% Cell type:code id:2d125dac-ff76-4929-a24c-fb119cf222a5 tags:
``` python
# load a links table of the form {page_id: page ids of outlinks}
# the %-placeholders fill in the wiki and snapshot; {0} fills in the file extension below
FNAME_read = "/home/mgerlach/REPOS/curios-critical-readers/data/pages-links_%s_%s.{0}" % (wiki_db, snapshot)
if mode == "pickle":
    # entire table in memory; better for bulk processing of sessions
    with open(FNAME_read.format("pkl"), "rb") as fin:
        dict_links = pickle.load(fin)
elif mode == "sqlite":
    # on-disk key-value store; better for processing individual sessions
    dict_links = SqliteDict(FNAME_read.format("sqlite"))
else:
    dict_links = {}
print(len(dict_links))
```
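%% Cell type:markdown tags:

Both backends behave like a read-only mapping from page id to the page ids of its outlinks, so a single lookup looks the same in either mode. The page id below is hypothetical, and whether keys are stored as ints or strings depends on how the table was written.

%% Cell type:code tags:
``` python
# sketch: inspect the outlinks of one page (the page id is hypothetical)
page_id = 12345 # assumption: integer keys; the sqlite backend may store them as strings
if page_id in dict_links:
    print("page", page_id, "has", len(dict_links[page_id]), "outlinks")
else:
    print("page", page_id, "not in the links table")
```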
%% Cell type:code id:9607e7de-1f85-45af-9cd3-e8c22b7f344a tags:
``` python
# loop through the session networks and compute metrics for each
directed = False # only consider undirected networks
timed = True # also record how long each metrics backend takes
# preallocate with NaNs: 9 metric columns, plus 3 timing columns when timed
if timed:
    output_matrix = np.full((n_processed, 12), np.nan)
else:
    output_matrix = np.full((n_processed, 9), np.nan)
for i in range(n_processed):
    list_nodes, list_edges = utils_network.session2edgelist_links(list_sessions[i], dict_links, directed=directed)
    g = utils_networkx.make_graph_links(list_nodes, list_edges, directed=directed) # networkx graph
    g_gt = utils_gt.make_graph_links(list_nodes, list_edges, directed=directed) # graph-tool graph
    output_matrix[i, :] = utils_network_metrics.calculate_network_metrics(g, g_gt, timed=timed)
FNAME_write = "/home/dalezhou/results/networkMetrics1000.csv"
np.savetxt(FNAME_write, output_matrix, delimiter=",")
```
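%% Cell type:markdown tags:

For the analysis cells below it helps to name the columns of `output_matrix` that this notebook relies on. The timing columns (9-11) are confirmed by the next cell; treating column 0 as the node count is an assumption based on its use as the network size in the plots below. The full metric layout is defined in `utils_network_metrics.calculate_network_metrics`.

%% Cell type:code tags:
``` python
# columns of output_matrix used in this notebook; the remaining metric
# columns are defined by utils_network_metrics.calculate_network_metrics
COL_NETWORK_SIZE = 0 # assumption: node count, used as the x-axis in the plots below
COL_TIME_NETWORKX = 9 # seconds spent computing NetworkX metrics
COL_TIME_BCT = 10 # seconds spent computing BCT metrics
COL_TIME_GT = 11 # seconds spent computing graph-tool metrics
```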
%% Cell type:code id:bdae907c-498c-4bb5-ad0b-fedee978f809 tags:
``` python
# see how much time the NetworkX, BCT, and graph-tool metrics took
output_matrix = np.loadtxt('/home/dalezhou/results/networkMetrics1000.csv', delimiter=",")
print("NetworkX metrics took", output_matrix[:, 9].sum(), "seconds,",
      "BCT metrics took", output_matrix[:, 10].sum(), "seconds,",
      "and graph-tool metrics took", output_matrix[:, 11].sum(), "seconds")
```
%% Cell type:code id:8ab6a988-f96f-486f-8a24-3a26b54aa76a tags:
``` python
# plot how long the metrics took as a function of network size
plt.scatter(output_matrix[:, 0], output_matrix[:, 9], s=3, label="NetworkX")