# metrics.py — ad-hoc script: join a page snapshot with 90-day pageview counts.
from datetime import datetime, timedelta

import pyspark.sql.functions as F

import wmfdata as wmf
6
spark = wmf.spark.get_session(
7
8
9
10
11
12
13
14
15
16
17
18
19
    type='local',
    app_name="knowledge-gaps"
)

from knowledge_gaps import article_features, func
# import importlib; importlib.reload(article_features)

projects = ['cawiki', 'dewiki', 'enwiki', 'eswiki', 'itwiki']
table = "bmansurov.pageview_hourly_20220308"
today = datetime.today()
ninety_days_earlier = today - timedelta(days=90)

pageviews_df = article_features.extract_pageviews(
20
    spark,
21
22
23
24
25
26
27
28
    ninety_days_earlier,
    today,
    projects,
    table
)
print(pageviews_df.head(3))


29
def get_pages_df(spark, table='aikochou.pages_20220310'):
30
    query = f"SELECT * FROM {table}"
31
    return spark.sql(query)
32
33


34
pages_df = get_pages_df(spark)
35
36
37
38
39
40
41
42
43
44
45
46
47
print(pages_df.head(3))

# Here we can add content gaps to pages_df, etc.

pages_df = (pages_df.alias('p')
            .join(pageviews_df.alias('pv'),
                  (F.col('p.wiki_db') == F.col('pv.wiki_db')) &
                  (F.col('p.page_id') == F.col('pv.page_id')) &
                  (F.col('p.page_title') == F.col('pv.page_title')),
                  'left')
            .select('p.*', 'pv.pageviews'))
pages_df = pages_df.fillna(0, ['pageviews'])
print(pages_df.head(3))