# Language editions to consider (Wikimedia database names).
languages = [
    'enwiki',
    'arwiki',
    'kowiki',
    'cswiki',
    'viwiki',
    'frwiki',
    'fawiki',
    'ptwiki',
    'ruwiki',
    'trwiki',
    'plwiki',
    'hewiki',
    'svwiki',
    'ukwiki',
    'huwiki',
    'hywiki',
    'srwiki',
    'euwiki',
    'arzwiki',
    'cebwiki',
    'dewiki',
    'bnwiki',
]
#val=100 #threshold above which we consider images as icons (i.e. not real illustrations)
#change th to optimize precision vs recall. recommended val for accuracy = 5
def get_threshold(wiki_size):
    """Return the icon threshold scaled to the size of a wiki.

    The threshold is the number of articles an image may appear in before
    it is treated as an icon by the queries that consume this value.

    Args:
        wiki_size: number of articles in the wiki (a non-negative number).

    Returns:
        A float threshold:
        * for wikis at or above the base size (50000 articles), the base
          threshold (15) multiplied by ``log10(wiki_size / base_size) + 1``;
        * for smaller wikis, the base threshold scaled down linearly by
          ``wiki_size / base_size``, but never below ``base_threshold / 4``.
    """
    # sze: base wiki size, th: base threshold, lim: limiting divisor that
    # fixes the smallest threshold a tiny wiki can get (th / lim).
    sze, th, lim = 50000, 15, 4
    if wiki_size >= sze:
        # if wiki_size >= base size, scale threshold by (log of ws/bs) + 1
        return (math.log(wiki_size / sze, 10) + 1) * th
    # else scale th down by ratio ws/bs, with min possible val of th = th/lim
    return max((wiki_size / sze) * th, th / lim)
# In[ ]:
# val: per-wiki threshold returned by get_threshold for that wiki's size.
val={}
# total: per-wiki number of non-redirect pages in namespace 0.
total={}
for wiki in languages:
    # Count the non-redirect namespace-0 pages of this wiki in the snapshot.
    querytot="""SELECT COUNT(*) as c
FROM wmf_raw.mediawiki_page
WHERE page_namespace=0
AND page_is_redirect=0
AND snapshot='"""+short_snapshot+"""'
AND wiki_db='"""+wiki+"""'"""
    wikisize=spark.sql(querytot).toPandas()
    # COUNT(*) yields a single row; pull the scalar out explicitly rather
    # than calling int() on the whole Series (deprecated in recent pandas).
    article_count=int(wikisize['c'].iloc[0])
    val[wiki]=get_threshold(article_count)
    total[wiki]=article_count
# In[ ]:
# Notebook cell output: show the per-wiki thresholds computed above.
val
# In[ ]:
# Notebook cell output: show the per-wiki article counts computed above.
total
# In[ ]:
# Notebook cell output: show the count DataFrame left over from the last
# wiki processed by the loop above.
wikisize
# The query below retrieves, for each unillustrated article: its Wikidata ID, the image of the Wikidata ID (if any), the Commons category of the Wikidata ID (if any), and the lead images of the articles in other languages (if any).
#
# `allowed_images` contains the list of icons (images appearing in more than `val` articles)
#
# `image_pageids` contains the list of illustrated articles (articles with images that are not icons)
#
# `noimage_pages` contains the pageid and Qid of unillustrated articles
#
# `qid_props` contains for each Qid in `noimage_pages`, the values of the following properties, when present:
# * P18: the item's image
# * P373: the item's Commons category
# * P31: the item's "instance of" property
#
# `category_image_list` contains the list of all images in a Commons category in `qid_props`
#
# `lan_page_images` contains the list of lead images in Wikipedia articles in all languages linked to each Qid
#
# `qid_props_with_image_list` is qid_props plus the list of images in the Commons category linked to the Wikidata item
#
#
# In[ ]:
# For every wiki, run one Spark SQL query built from the CTEs below and store
# the resulting pandas DataFrame in qids_and_properties under the wiki name.
for wiki in languages:
    print(wiki)
    # The query text is assembled by string concatenation from short_snapshot,
    # snapshot, wiki, label_lang and the per-wiki threshold val[wiki].
    # NOTE(review): the SQL text below looks incomplete as it stands. The
    # `pageimage_pageids` CTE is followed directly by
    # `AND page_is_redirect=0 ...`, so the head of what is presumably the
    # `noimage_pages` CTE (referenced later in `qid_props`) is absent, and
    # `category_image_list` jumps from its SELECT line straight to an
    # `on qid_props_with_image_list.id=...` join condition. Restore the
    # missing SQL from the original notebook before running this cell.
    queryd="""WITH allowed_images AS
(
SELECT il_to
FROM wmf_raw.mediawiki_imagelinks
WHERE il_from_namespace=0
AND snapshot='"""+short_snapshot+"""'
AND wiki_db='"""+wiki+"""'
AND il_to not like '%\"%' AND il_to not like '%,%'
GROUP BY il_to
HAVING COUNT(il_to)>"""+str(val[wiki])+"""),
image_pageids AS
(SELECT DISTINCT il_from as pageid
FROM wmf_raw.mediawiki_imagelinks il1
LEFT ANTI JOIN allowed_images
ON allowed_images.il_to=il1.il_to
WHERE il1.il_from_namespace=0
AND il1.wiki_db='"""+wiki+"""'
AND il1.snapshot='"""+short_snapshot+"""'
),
pageimage_pageids AS
(
SELECT DISTINCT pp_page as pageid
FROM wmf_raw.mediawiki_page_props pp
WHERE pp.wiki_db ='"""+wiki+"""'
AND pp.snapshot='"""+short_snapshot+"""'
AND pp_propname in ('page_image','page_image_free')),
AND page_is_redirect=0 AND p.wiki_db='"""+wiki+"""'
AND p.snapshot='"""+short_snapshot+"""'
AND wipl.snapshot='"""+snapshot+"""'
AND wipl.page_namespace=0
AND wipl.wiki_db='"""+wiki+"""'
ORDER BY page_len desc
),
qid_props AS
(
SELECT we.id,label_val,
MAX(CASE WHEN claim.mainSnak.property = 'P18' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS hasimage,
MAX(CASE WHEN claim.mainSnak.property = 'P373' THEN REPLACE(REPLACE(claim.mainSnak.datavalue.value,'\"',''),' ','_') ELSE NULL END) AS commonscategory,
MAX(CASE WHEN claim.mainSnak.property = 'P31' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS instanceof
FROM wmf.wikidata_entity we
JOIN noimage_pages
ON we.id=noimage_pages.item_id
LATERAL VIEW explode(labels) t AS label_lang,label_val
LATERAL VIEW OUTER explode(claims) c AS claim
WHERE typ='item'
AND t.label_lang='"""+label_lang+"""'
AND snapshot='"""+snapshot+"""'
AND claim.mainSnak.property in ('P18','P31','P373')
GROUP BY id,label_val
),
category_image_list AS
(
SELECT cl_to,concat_ws(';',collect_list(mp.page_title)) as category_imagelist
on qid_props_with_image_list.id=joined_lan_page_images.item_id
"""
    # Execute on Spark and collect the full result to the driver as pandas.
    qid_props=spark.sql(queryd).toPandas()
    qids_and_properties[wiki]=qid_props
# Below I am just creating different tables according to whether an image is retrieved from a specific source (Wikidata image, Commons Category, or interlingual links)