" #change th to optimize precision vs recall. recommended val for accuracy = 5\n",
" sze, th, lim = 50000, 15, 4 \n",
" if (wiki_size >= sze):\n",
" #if wiki_size > base size, scale threshold by (log of ws/bs) + 1\n",
" return (math.log(wiki_size/sze, 10)+1)*th\n",
" #else scale th down by ratio bs/ws, w min possible val of th = th/limiting val\n",
" return max((wiki_size/sze) * th, th/lim)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "5fe4f0f4",
"metadata": {},
"outputs": [],
"source": [
"\n",
"qids_and_properties={}\n",
"languages=[line.strip()+'wiki' for line in open('../wikis')]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "ba1de990",
"metadata": {},
"outputs": [],
"source": [
"val={}\n",
"total={}\n",
"for wiki in languages:\n",
" querytot=\"\"\"SELECT COUNT(*) as c\n",
" FROM wmf_raw.mediawiki_page\n",
" WHERE page_namespace=0 \n",
" AND page_is_redirect=0\n",
" AND snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
" AND wiki_db='\"\"\"+wiki+\"\"\"'\"\"\"\n",
" wikisize = spark.sql(querytot).toPandas()\n",
" val[wiki]=get_threshold(int(wikisize['c']))\n",
" total[wiki]=int(wikisize['c'])\n"
]
},
{
"cell_type": "markdown",
"id": "2452b85f",
"metadata": {},
"source": [
"The query below retrieves, for each unillustrated article: its Wikidata ID, the image of the Wikidata ID (if any), the Commons category of the Wikidata ID (if any), and the lead images of the articles in other languages (if any).\n",
"\n",
"`allowed_images` contains the list of icons (images appearing in more than `val` articles)\n",
"\n",
"`all_image_pageids` contains the list of illustrated articles (articles with images that are not icons + articles with page images)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "6e90e91c",
"metadata": {},
"outputs": [],
"source": [
"snapshot = '2022-06-06'\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "051716e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"enwiki\n",
"eswiki\n",
"dewiki\n",
"frwiki\n",
"jawiki\n",
"ruwiki\n",
"itwiki\n",
"zhwiki\n",
"ptwiki\n",
"plwiki\n",
"arwiki\n",
"nlwiki\n",
"fawiki\n",
"idwiki\n",
"kowiki\n",
"ukwiki\n",
"viwiki\n",
"svwiki\n",
"thwiki\n",
"cswiki\n",
"hewiki\n",
"huwiki\n",
"hiwiki\n",
"fiwiki\n",
"nowiki\n",
"rowiki\n",
"elwiki\n",
"cawiki\n",
"simplewiki\n",
"srwiki\n",
"dawiki\n",
"bnwiki\n",
"bgwiki\n",
"hrwiki\n",
"trwiki\n",
"mswiki\n",
"skwiki\n",
"azwiki\n",
"tawiki\n",
"hywiki\n",
"ltwiki\n",
"zh-yuewiki\n",
"etwiki\n",
"mrwiki\n",
"slwiki\n",
"kkwiki\n",
"mlwiki\n",
"kawiki\n",
"shwiki\n",
"lvwiki\n",
"sqwiki\n",
"tlwiki\n",
"bswiki\n",
"euwiki\n",
"tewiki\n",
"glwiki\n",
"mkwiki\n",
"urwiki\n",
"arzwiki\n",
"knwiki\n",
"afwiki\n",
"eowiki\n",
"bewiki\n",
"uzwiki\n",
"astwiki\n",
"swwiki\n",
"nnwiki\n",
"mywiki\n",
"iswiki\n",
"jvwiki\n",
"mnwiki\n",
"guwiki\n",
"cebwiki\n",
"lawiki\n",
"scowiki\n",
"pawiki\n",
"newiki\n",
"cywiki\n",
"be-x-oldwiki\n",
"minwiki\n",
"kywiki\n",
"ckbwiki\n",
"kmwiki\n",
"aswiki\n",
"siwiki\n",
"suwiki\n",
"alswiki\n",
"gawiki\n",
"ttwiki\n",
"anwiki\n",
"lbwiki\n",
"azbwiki\n",
"warwiki\n",
"sowiki\n",
"ocwiki\n",
"brwiki\n",
"barwiki\n",
"bawiki\n",
"tgwiki\n",
"bhwiki\n",
"fywiki\n",
"orwiki\n",
"zh-classicalwiki\n",
"ndswiki\n",
"zh-min-nanwiki\n",
"amwiki\n",
"wuuwiki\n",
"kuwiki\n",
"bclwiki\n",
"vecwiki\n",
"maiwiki\n",
"yiwiki\n",
"hawiki\n",
"lmowiki\n",
"pnbwiki\n",
"iowiki\n",
"cvwiki\n",
"iawiki\n",
"mgwiki\n",
"pswiki\n",
"sahwiki\n",
"liwiki\n",
"mznwiki\n",
"hifwiki\n",
"cewiki\n",
"sawiki\n",
"tkwiki\n",
"sdwiki\n",
"gomwiki\n",
"yowiki\n",
"lowiki\n",
"scnwiki\n",
"quwiki\n",
"htwiki\n",
"fowiki\n",
"nds-nlwiki\n",
"vowiki\n",
"bjnwiki\n",
"igwiki\n"
]
}
],
"source": [
"for wiki in languages:\n",
" print(wiki)\n",
" queryd=\"\"\"WITH allowed_images AS \n",
" (\n",
" SELECT il_to\n",
" FROM wmf_raw.mediawiki_imagelinks\n",
" WHERE il_from_namespace=0 \n",
" AND snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
" AND wiki_db='\"\"\"+wiki+\"\"\"' \n",
" AND il_to not like '%\\\"%' AND il_to not like '%,%'\n",
" GROUP BY il_to \n",
" HAVING COUNT(il_to)>\"\"\"+str(val[wiki])+\"\"\"),\n",
" image_pageids AS \n",
" (SELECT DISTINCT il_from as pageid\n",
" FROM wmf_raw.mediawiki_imagelinks il1 \n",
" LEFT ANTI JOIN allowed_images\n",
" ON allowed_images.il_to=il1.il_to\n",
" WHERE il1.il_from_namespace=0 \n",
" AND il1.wiki_db='\"\"\"+wiki+\"\"\"' \n",
" AND il1.snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
" ),\n",
" pageimage_pageids AS \n",
" (\n",
" SELECT DISTINCT pp_page as pageid\n",
" FROM wmf_raw.mediawiki_page_props pp\n",
" WHERE pp.wiki_db ='\"\"\"+wiki+\"\"\"'\n",
" AND pp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
" AND pp_propname in ('page_image','page_image_free')),\n",
The query below retrieves, for each unillustrated article: its Wikidata ID, the image of the Wikidata ID (if any), the Commons category of the Wikidata ID (if any), and the lead images of the articles in other languages (if any).
`allowed_images` contains the list of icons (images appearing in more than `val` articles)
`all_image_pageids` contains the list of illustrated articles (articles with images that are not icons + articles with page images)
%% Cell type:code id:6e90e91c tags:
``` python
snapshot='2022-06-06'
```
%% Cell type:code id:051716e3 tags:
``` python
forwikiinlanguages:
print(wiki)
queryd="""WITH allowed_images AS
(
SELECT il_to
FROM wmf_raw.mediawiki_imagelinks
WHERE il_from_namespace=0
AND snapshot='"""+short_snapshot+"""'
AND wiki_db='"""+wiki+"""'
AND il_to not like '%\"%' AND il_to not like '%,%'