Commit 3cc14e35 authored by Clarakosi

Add algorithm version 2

* Added a new notebook for algorithm_v2
* Modified algorithm script to use algorithm_v2
* Updated setup.py to show the algorithm version
* Fixed minor typos in the README
parent adf6d2ae
@@ -29,8 +29,8 @@ source venv/bin/activate
Install the dependencies
```shell
export=http_proxy=http://webproxy.eqiad.wmnet:8080
export=https_proxy=http://webproxy.eqiad.wmnet:8080
export http_proxy=http://webproxy.eqiad.wmnet:8080
export https_proxy=http://webproxy.eqiad.wmnet:8080
make wheel
pip install dist/algorunner-0.2.0-py3-none-any.whl --no-cache-dir
```
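To sanity-check the install, the package version can be queried from the wheel metadata (a sketch; it assumes the distribution name `algorunner`, taken from the wheel filename above):

```python
# Report the installed distribution version (importlib.metadata is in the
# standard library on Python 3.8+).
from importlib.metadata import version

print(version("algorunner"))  # expected: 0.2.0
```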
@@ -3,7 +3,7 @@
# In[ ]:
#
# Generated by nbconvert on ../notebooks/algorithm.ipynb. Modified to
# Generated by nbconvert on ../notebooks/algorithm_v2.ipynb. Modified to
# work with the `platform-airflow-dags` project.
#
import re
@@ -12,8 +12,6 @@ import pandas as pd
import math
import numpy as np
import random
import json
import os
import sys
@@ -25,13 +23,6 @@ qids_and_properties={}
# In[ ]:
# Pass in directory to place output files
output_dir = '/srv/airflow-platform_eng/image-matching/runs/8419345a-3404-4a7c-93e1-b9e6813706ff/Output/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Pass in the full snapshot date
snapshot = sys.argv[1]
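# Invocation sketch (hypothetical script name and values; the argument
# order follows the sys.argv reads here and in the next hunk):
#   python algorithm.py 2021-04-26 enwiki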
@@ -40,7 +31,7 @@ language = sys.argv[2]
# A spark session type determines the resource pool
# to initialise on yarn
spark_session_type = 'regular'
spark_session_type = 'large'
# Name of placeholder images parquet file
image_placeholders_file = 'image_placeholders'
@@ -51,16 +42,17 @@ image_placeholders_file = 'image_placeholders'
# We use wmfdata boilerplate to init a spark session.
# Under the hood the library uses findspark to initialise
# Spark's environment. pyspark imports will be available
# after initialisation
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
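# Note: the spark_session_type flag above ('regular' vs 'large') is what the
# wmfdata boilerplate uses to pick the yarn resource pool; the mapping to
# concrete executor sizes lives in that library, not in this script.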
# In[ ]:
languages=['enwiki','arwiki','kowiki','cswiki','viwiki','frwiki','fawiki','ptwiki','ruwiki','trwiki','plwiki','hewiki','svwiki','ukwiki','huwiki','hywiki','srwiki','euwiki','arzwiki','cebwiki','dewiki','bnwiki'] #language editions to consider
# languages=['enwiki','arwiki','kowiki','cswiki','viwiki','frwiki','fawiki','ptwiki','ruwiki','trwiki','plwiki','hewiki','svwiki','ukwiki','huwiki','hywiki','srwiki','euwiki','arzwiki','cebwiki','dewiki','bnwiki'] #language editions to consider
#val=100 #threshold above which we consider images as non-icons
languages=[language]
@@ -297,9 +289,8 @@ for wiki in languages:
on qid_props_with_image_list.id=joined_lan_page_images.item_id
"""
qid_props = spark.sql(queryd).toPandas()
qid_props = spark.sql(queryd).cache()
qids_and_properties[wiki]=qid_props
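# Keeping qid_props as a cached Spark DataFrame (instead of collecting it
# with toPandas) is what lets the UDF-based selection below run distributed
# on the cluster rather than on the driver.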
# Below I am just creating different tables according to whether an image is retrieved from a specific source (Wikidata image, Commons Category, or interlingual links)
@@ -307,170 +298,179 @@ for wiki in languages:
# In[ ]:
hasimage={}
commonscategory={}
lanimages={}
allimages={}
for wiki in languages:
print(wiki)
hasimage[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['hasimage'].astype(str).ne('None')]
commonscategory[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['category_imagelist'].astype(str).ne('None')]
lanimages[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['lan_images'].astype(str).ne('None')]
print("number of unillustrated articles: "+str(len(qids_and_properties[wiki])))
print("number of articles items with Wikidata image: "+str(len(hasimage[wiki])))
print("number of articles items with Wikidata Commons Category: "+str(len(commonscategory[wiki])))
print("number of articles items with Language Links: "+str(len(lanimages[wiki])))
####
allimages[wiki]=qids_and_properties[wiki]
# hasimage={}
# commonscategory={}
# lanimages={}
# allimages={}
# for wiki in languages:
# print(wiki)
# hasimage[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['hasimage'].astype(str).ne('None')]
# commonscategory[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['category_imagelist'].astype(str).ne('None')]
# lanimages[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['lan_images'].astype(str).ne('None')]
# print("number of unillustrated articles: "+str(len(qids_and_properties[wiki])))
# print("number of articles items with Wikidata image: "+str(len(hasimage[wiki])))
# print("number of articles items with Wikidata Commons Category: "+str(len(commonscategory[wiki])))
# print("number of articles items with Language Links: "+str(len(lanimages[wiki])))
# ####
# allimages[wiki]=qids_and_properties[wiki]
# Below the two functions to select images depending on the source:
# * `select_image_language` takes as input the list of images from articles in multiple languages and selects the one used most often across languages (after some major filtering)
# * `select_image_category` selects at random one of the images in the Commons category linked to the Wikidata item.
# Below the priority assignment process:
# * If the article has a Wikidata image (not a flag, as this is likely a duplicate), give it priority 1
# * Choose up to 3 images among the ones from related Wikipedia articles in other languages, using the `select_image_language` function, and give priority 2.x where `x` is a ranking given by the number of languages using that image
# * If the article has an associated Commons category, call the `select_image_category` function, randomly selecting up to 3 images from that category
#
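# Rating sketch (an illustrative summary of the scheme implemented below):
#   1        -> image taken directly from the Wikidata item (non-flag)
#   2.0-2.2  -> up to three cross-language images, ranked by how many
#               languages use them
#   3        -> up to three images drawn from the linked Commons category
# Lower ratings mean higher confidence.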
# In[ ]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, MapType, StringType
# In[ ]:
def image_language_checks(iname):
#list of substrings to check for
substring_list=['.svg','flag','noantimage','no_free_image','image_manquante',
'replace_this_image','disambig','regions','map','default',
'defaut','falta_imagem_','imageNA','noimage','noenzyimage']
iname=iname.lower()
if any(map(iname.__contains__, substring_list)):
return False
else:
return True
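# Quick sanity checks (hypothetical file names):
#   image_language_checks('Flag_of_France.svg') -> False ('flag' and '.svg' match)
#   image_language_checks('Eiffel_Tower.jpg')   -> True  (no blocklisted substring)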
# Rewrite helper functions as UDFs
def select_image_language(imagelist):
@udf(returnType=MapType(StringType(), StringType()))
def select_image_language_udf(imagelist):
counts={} #contains counts of image occurrences across languages
languages={} #contains which languages cover a given image
#for each image
for image in imagelist:
data=image.strip().split(' ')#this contains the language and image name data
###
if len(data)==2: #if we actually have 2 fields
iname=data[1].strip()
lan=data[0].strip()[:-1]
if str(imagelist)!='None':
for image in imagelist:
data=image.strip().split(' ')#this contains the language and image name data
###
if iname not in counts: #if this image does not exist in our counts yet, initialize counts
if not image_language_checks(iname): #if the image name is not valid
continue
# urll = 'https://commons.wikimedia.org/wiki/File:'+iname.replace(' ','_')+'?uselang='+language
#page = requests.get(urll)
#if page.status_code == 404:
# print (urll)
# continue
counts[iname]=1
languages[iname]=[]
else:
counts[iname]+=1
languages[iname].append(lan)
if len(data)==2: #if we actually have 2 fields
iname=data[1].strip()
lan=data[0].strip()[:-1]
###
if iname not in counts: #if this image does not exist in our counts yet, initialize counts
substring_list=['.svg','flag','noantimage','no_free_image','image_manquante',
'replace_this_image','disambig','regions','map','default',
'defaut','falta_imagem_','imageNA','noimage','noenzyimage']
if any(map(iname.lower().__contains__, substring_list)): #if the image name is not valid
continue
# urll = 'https://commons.wikimedia.org/wiki/File:'+iname.replace(' ','_')+'?uselang='+language
#page = requests.get(urll)
#if page.status_code == 404:
# print (urll)
# continue
counts[iname]=1
languages[iname]=[]
else:
counts[iname]+=1
languages[iname].append(lan)
return languages
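# Worked example (input shape inferred from the parsing above, where each
# entry looks like '<lang>: <file name>'):
#   imagelist = ['en: Tower.jpg', 'fr: Tower.jpg', 'de: Flag.svg']
#   -> 'Flag.svg' is filtered out, 'Tower.jpg' is counted twice, and the
#      UDF returns {'Tower.jpg': ['fr']} (the first sighting only
#      initialises the language list, so 'en' is never recorded).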
def select_image_category(imagelist):
counts={}
languages={}
data=list(imagelist.strip().split(';'))
data=[d for d in data if d.find('.')!=-1]
return random.choice(data)
# In[ ]:
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def select_commons_images_udf(commons):
commons_images=[]
random.seed(snapshot)
def select_image_category(imagelist):
data=list(imagelist.strip().split(';'))
data.sort()
data=[d for d in data if d.find('.')!=-1]
return random.choice(data)
if str(commons)!='None':
for i in range(min(len(list(commons.strip().split(';'))),3)):
image=select_image_category(commons)
rating=3
note='image was found in the Commons category linked in the Wikidata item'
commons_images.append({'image':image,'rating':rating,'note':note})
return commons_images
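# Output sketch (hypothetical category listing):
#   commons = 'Tower_at_night.jpg;Tower_plan.png'
#   -> up to two maps like {'image': 'Tower_plan.png', 'rating': 3,
#      'note': 'image was found in the Commons category ...'}
#   Each loop iteration re-draws from the full list, so the same file can be
#   picked twice; seeding random with the snapshot keeps runs reproducible.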
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def select_wikipedia_images_udf(wikipedia):
wikipedia_images=[]
wikipedia = dict(sorted(wikipedia.items(), key=lambda x: x[0]))
index=np.argsort([len(l) for l in list(wikipedia.values())])
if str(wikipedia)!='None' and len(wikipedia)!=0:
for i in range(min(len(wikipedia),3)):
image=list(wikipedia.keys())[index[-(i+1)]]
rating=2+(float(i)/10)
found_in = wikipedia[image]
found_in.sort()
note='image was found in the following Wikis: '+', '.join(found_in)
wikipedia_images.append({'image':image,'rating':rating,'note':note})
return wikipedia_images
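# Ranking sketch: given a map from select_image_language_udf such as
#   {'A.jpg': ['fr', 'de'], 'B.jpg': ['fr']}
# the most widely used image gets rating 2.0, the next 2.1, then 2.2, and
# each note lists the wikis the image was found in ('A.jpg' first here).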
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def select_wikidata_images_udf(wikidata):
wikidata_images=[]
if str(wikidata)!='None' and wikidata.lower().find('flag') ==-1:
image=wikidata[1:-1]
rating=1
note='image was in the Wikidata item'
wikidata_images.append({'image':image,'rating':rating,'note':note})
return wikidata_images
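# Sketch: wikidata[1:-1] strips one wrapping character from each end of the
# raw property value, so a hypothetical input like '[Tower.jpg]' yields
#   [{'image': 'Tower.jpg', 'rating': 1, 'note': 'image was in the Wikidata item'}]
# while null values and anything containing 'flag' return an empty list.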
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def select_top_candidates_udf(wikidata_images, wikipedia_images, commons_images):
top_candidates=[]
for image in wikidata_images:
if len(top_candidates) < 3 and image:
top_candidates.append(image.copy())
else:
break
for image in wikipedia_images:
if len(top_candidates) < 3 and image:
top_candidates.append(image.copy())
else:
break
stats={}
data_small={}
####
for wiki in languages:
selected=[] #stores selected images for each article
notes=[] #stores information about the source where the candidate image was drawn from
wikis=[]
data_small[wiki]=allimages[wiki].sample(len(allimages[wiki]))
language=wiki.replace('wiki','')
#rtl=direction[wiki] #right to left -> rtl; left to right -> ltr
for wikipedia in data_small[wiki]['lan_images']:
if str(wikipedia)!='None':
lg=select_image_language(wikipedia)
if len(lg)==0:
lg=None
wikis.append(lg)
for image in commons_images:
if len(top_candidates) < 3 and image:
top_candidates.append(image.copy())
else:
wikis.append(None)
data_small[wiki]['wikipedia_imagelist']=wikis
for wikidata,commons,wikipedia,jdata in zip(data_small[wiki]['hasimage'],data_small[wiki]['category_imagelist'],data_small[wiki]['wikipedia_imagelist'],data_small[wiki]['instanceof']):
if jdata is not None:
qid=json.loads(jdata)["numeric-id"]
if qid in [4167410,577,13406463]:
selected.append(None)
notes.append(None)
continue
image=None
tier={}
note={}
if str(commons)!='None':
for i in range(min(len(list(commons.strip().split(';'))),3)):
image=select_image_category(commons)
tier[image]=3
note[image]='image was found in the Commons category linked in the Wikidata item'
###
if str(wikipedia) !='None':
index=np.argsort([len(l) for l in list(wikipedia.values())])
#print(wikipedia)
for i in range(min(len(wikipedia),3)):
image=list(wikipedia.keys())[index[-(i+1)]]
tier[image]=2+(float(i)/10)
note[image]='image was found in the following Wikis: '+', '.join(wikipedia[image])
if str(wikidata)!='None' and wikidata.lower().find('flag') ==-1:
image=wikidata[1:-1]
tier[image]=1
note[image]='image was in the Wikidata item'
selected.append(tier if len(tier)>0 else None)
notes.append(note if len(note)>0 else None)
# if image is not None:
# properties.append(get_properties(image,language,rtl,page))
# else:
# properties.append([None,None,None,None,None,None,None,None,None])
#updating table
data_small[wiki]['selected']=selected
data_small[wiki]['notes']=notes
data_small[wiki]['good_interlinks']=wikis
#TODO(REMOVE FROM repo) data_small[wiki]=data_small[wiki][data_small[wiki]['selected'].astype(str).ne('None')]
#print("total number of articles: "+str(total[wiki]))
#print("number of unillustrated articles: "+str(len(qids_and_properties[wiki])))
#print("number of articles with at least 1 recommendation: "+str(len(data_small[wiki])))
#stats[wiki]=[total[wiki],len(qids_and_properties[wiki]),len(data_small[wiki]),len(all3images),len(hasimage),len(commonscategory),len(lanimages)]
break
return top_candidates
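# Merge-order sketch: candidates are appended Wikidata image first (rating 1),
# then cross-language picks (2.x), then Commons picks (3), and the combined
# list is capped at three entries, so higher-confidence sources win.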
# In[ ]:
#the final selection process: select up to 3 images per candidate and their relative confidence score (1=high, 2=medium, 3=low)
#based on the priorities assigned earlier
username = getpass.getuser()
for wiki in languages:
top_candidates=[]
for selected,notes in zip (data_small[wiki]['selected'],data_small[wiki]['notes']):
if selected is not None:
index=np.argsort([l for l in list(selected.values())])
candidates=[]
#print(wikipedia)
for i in range(min(len(index),3)):
image=list(selected.keys())[index[i]]
rating=selected[image]
note=notes[image]
candidates.append({'image':image,'rating':rating,'note':note})
top_candidates.append(candidates)
else:
top_candidates.append(None)
data_small[wiki]['top_candidates']=top_candidates
data_small[wiki][['item_id','page_id','page_title','top_candidates', 'instanceof']].to_csv(output_dir+'/'+wiki+'_'+snapshot+'_wd_image_candidates.tsv',sep='\t')
qids_and_properties[wiki].withColumn(
"wikipedia_imagelist", select_image_language_udf(F.col("lan_images"))
).withColumn(
"commons_images", select_commons_images_udf(F.col("category_imagelist"))
).withColumn(
"wikipedia_images", select_wikipedia_images_udf(F.col("wikipedia_imagelist"))
).withColumn(
"wikidata_images", select_wikidata_images_udf(F.col("hasimage"))
).withColumn(
"top_candidates", select_top_candidates_udf(F.col("wikidata_images"), F.col("wikipedia_images"), F.col("commons_images"))
).withColumn("wiki_db", F.lit(wiki)
).withColumn("snapshot", F.lit(short_snapshot)
).withColumnRenamed(
"instanceof", "instance_of"
).select(
"item_id",
"page_id",
"page_title",
"instance_of",
"top_candidates",
"wiki_db",
"snapshot"
).write.partitionBy("wiki_db", "snapshot").mode("overwrite").parquet("imagerec")
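# Reading the result back (a sketch; path and columns as written above):
#   df = spark.read.parquet("imagerec")
#   df.where(F.col("wiki_db") == wiki).select("page_title", "top_candidates").show(5, truncate=False)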
# In[ ]: