Commit a2b6cc4d authored by Miriam Redi

added metric for unillustrated articles

parent d444bf1f
%% Cell type:code id:53285172 tags:
``` python
import re
import pickle
import pandas as pd
import math
import numpy as np
import random
import requests
#from bs4 import BeautifulSoup
import json
import os
from wmfdata.spark import get_session
```
%% Output
You are using wmfdata v1.3.2, but v1.3.3 is available.
To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release --ignore-installed`.
To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md
%% Cell type:code id:1ab757a0 tags:
``` python
!which python
```
%% Output
/home/mirrys/.conda/envs/2021-04-27T11.10.00_mirrys/bin/python
%% Cell type:code id:63bc91c5 tags:
``` python
qids_and_properties = {}
```
%% Cell type:code id:8edc57cc tags:parameters
``` python
# Pass in directory to place output files
output_dir = 'Output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Pass in the full snapshot date
snapshot = '2022-06-26'
# The Spark session type determines the resource pool
# to initialise on YARN
spark_session_type = 'regular'
```
%% Cell type:code id:2d83f992 tags:
``` python
# We use wmfdata boilerplate to init a Spark session.
# Under the hood the library uses findspark to initialise
# Spark's environment. pyspark imports become available
# after initialisation.
spark = get_session(type=spark_session_type, app_name="ImageRec-DEV Training")
import pyspark
import pyspark.sql
```
%% Output
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
%% Cell type:code id:5cc8bc1e tags:
``` python
# Extract the year-month prefix of the snapshot date (e.g. '2022-06')
reg = r'^([\w]+-[\w]+)'
short_snapshot = re.match(reg, snapshot).group()
short_snapshot
```
%% Output
'2022-06'
%% Cell type:code id:0b0206b0 tags:
``` python
def get_threshold(wiki_size):
    # Tune th to trade off precision vs recall; recommended value for accuracy = 5
    sze, th, lim = 50000, 15, 4
    if wiki_size >= sze:
        # If wiki_size >= base size, scale the threshold by log10(wiki_size/base) + 1
        return (math.log(wiki_size / sze, 10) + 1) * th
    # Otherwise scale th down by the ratio wiki_size/base, with a floor of th/lim
    return max((wiki_size / sze) * th, th / lim)
```
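%% Cell type:markdown tags:
To illustrate the scaling, the minimal sketch below evaluates `get_threshold` at a few arbitrary wiki sizes (the sample sizes are illustrative only): at the base size of 50,000 articles the threshold is 15, a tenfold larger wiki gets 30, and small wikis are floored at th/lim = 3.75.
%% Cell type:code tags:
``` python
# Worked example with arbitrary wiki sizes: below the base size the
# threshold is floored at th/lim; above it, it grows with log10(size/base).
for size in [5_000, 50_000, 500_000]:
    print(size, get_threshold(size))
# 5000 -> 3.75, 50000 -> 15.0, 500000 -> 30.0
```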
%% Cell type:code id:5fe4f0f4 tags:
``` python
qids_and_properties = {}
# Build wiki DB names (e.g. 'enwiki') from the language codes listed in ../wikis
with open('../wikis') as f:
    languages = [line.strip() + 'wiki' for line in f]
```
%% Cell type:code id:ba1de990 tags:
``` python
val = {}
total = {}
for wiki in languages:
    querytot = """SELECT COUNT(*) as c
        FROM wmf_raw.mediawiki_page
        WHERE page_namespace=0
        AND page_is_redirect=0
        AND snapshot='""" + short_snapshot + """'
        AND wiki_db='""" + wiki + """'"""
    wikisize = spark.sql(querytot).toPandas()
    val[wiki] = get_threshold(int(wikisize['c']))
    total[wiki] = int(wikisize['c'])
```
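%% Cell type:markdown tags:
As a quick sanity check, a minimal sketch (assuming the loop above has populated `val` and `total`) that prints each wiki's article count and its computed icon threshold:
%% Cell type:code tags:
``` python
# Inspect per-wiki article counts and the icon thresholds derived from them
for wiki in languages:
    print(wiki, total[wiki], round(val[wiki], 2))
```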
%% Cell type:markdown id:2452b85f tags:
The query below retrieves, for each unillustrated article: its Wikidata ID, the image of the Wikidata ID (if any), the Commons category of the Wikidata ID (if any), and the lead images of the articles in other languages (if any).

- `allowed_images` contains the list of icons (images appearing in more than `val` articles).
- `all_image_pageids` contains the list of illustrated articles (articles with non-icon images plus articles with page images).
%% Cell type:code id:6e90e91c tags:
``` python
snapshot = '2022-06-06'
```
%% Cell type:code id:051716e3 tags:
``` python
for wiki in languages:
    print(wiki)
    queryd = """WITH allowed_images AS
        (
        SELECT il_to
        FROM wmf_raw.mediawiki_imagelinks
        WHERE il_from_namespace=0
        AND snapshot='""" + short_snapshot + """'
        AND wiki_db='""" + wiki + """'
        AND il_to not like '%\"%' AND il_to not like '%,%'
        GROUP BY il_to
        HAVING COUNT(il_to)>""" + str(val[wiki]) + """),
        image_pageids AS
        (SELECT DISTINCT il_from as pageid
        FROM wmf_raw.mediawiki_imagelinks il1
        LEFT ANTI JOIN allowed_images
        ON allowed_images.il_to=il1.il_to
        WHERE il1.il_from_namespace=0
        AND il1.wiki_db='""" + wiki + """'