Commit 7712d9f4 authored by Clarakosi's avatar Clarakosi Committed by GitHub
Browse files

Implement parsing of “instance of” fields in ImageMatching production datasets (#9)

* Update transform.py to parse "instance of" json blob

* Update tests and fix transform.py schema changes

* Simplify parsing logic, add metrics, and update tests

* Updates based on code review
parent 292c864a
...@@ -12,7 +12,7 @@ def raw_data(spark_session): ...@@ -12,7 +12,7 @@ def raw_data(spark_session):
"44444", "44444",
"Some page with suggestions", "Some page with suggestions",
'[{"image": "image1.jpg", "rating": 2.0, "note": "image was found in the following Wikis: ruwiki"}]', '[{"image": "image1.jpg", "rating": 2.0, "note": "image was found in the following Wikis: ruwiki"}]',
"", None,
"arwiki", "arwiki",
"2020-12", "2020-12",
), ),
...@@ -22,7 +22,7 @@ def raw_data(spark_session): ...@@ -22,7 +22,7 @@ def raw_data(spark_session):
"55555", "55555",
"Some page with no suggestion", "Some page with no suggestion",
None, None,
"", None,
"arwiki", "arwiki",
"2020-12", "2020-12",
), ),
...@@ -32,11 +32,12 @@ def raw_data(spark_session): ...@@ -32,11 +32,12 @@ def raw_data(spark_session):
"523523", "523523",
"Some page with 3 suggestions", "Some page with 3 suggestions",
'[' '['
' {"image": "image2.jpg", "rating": 2.0, "note": "image was found in the following Wikis: ruwiki"}, ' '{"image": "image2.jpg", "rating": 2.0, "note": "image was found in the following Wikis: ruwiki"}, '
'{"image": "image3.jpg", "rating": 1, "note": "image was in the Wikidata item"}, ' '{"image": "image3.jpg", "rating": 1, "note": "image was in the Wikidata item"}, '
'{"image": "image4.jpg", "rating": 3.0, "note": "image was found in the Commons category linked in the Wikidata item"}' '{"image": "image4.jpg", "rating": 3.0, "note": "image was found in the Commons category linked in '
'the Wikidata item"} '
']', ']',
"", '{"entity-type":"item","numeric-id":577,"id":"Q577"}',
"enwiki", "enwiki",
"2020-12", "2020-12",
), ),
......
%% Cell type:code id:requested-karaoke tags: %% Cell type:code id:impressed-fourth tags:
``` python ``` python
import pyspark.sql import pyspark.sql
import pandas as pd import pandas as pd
``` ```
%% Cell type:code id:provincial-southeast tags:parameters %% Cell type:code id:deluxe-mailman tags:parameters
``` python ``` python
# Create output directory # Create output directory
output_dir = "Data_Metrics_Output" output_dir = "Data_Metrics_Output"
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
snapshot = "2021-01" snapshot = "2021-01"
``` ```
%% Cell type:markdown id:incorporate-registration tags: %% Cell type:markdown id:improving-jonathan tags:
### Total number of records (per wiki) ### Total number of records (per wiki)
%% Cell type:code id:ranking-gibraltar tags: %% Cell type:code id:engaged-inflation tags:
``` python ``` python
query = """SELECT wiki AS Wiki, snapshot, COUNT(*) as `Number of Records` query = """SELECT wiki AS Wiki, snapshot, COUNT(*) as `Number of Records`
FROM gmodena.imagerec_prod FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""' WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, snapshot GROUP BY wiki, snapshot
ORDER BY wiki""" ORDER BY wiki"""
total_number_of_records = spark.sql(query).toPandas() total_number_of_records = spark.sql(query).toPandas()
``` ```
%% Cell type:code id:dangerous-conservative tags: %% Cell type:code id:lucky-vocabulary tags:
``` python ``` python
total_number_of_records total_number_of_records
``` ```
%% Cell type:code id:standard-special tags: %% Cell type:code id:activated-worker tags:
``` python ``` python
total_number_of_records.to_csv(output_dir+"/"+"Total number of records") total_number_of_records.to_csv(output_dir+"/"+"Total number of records")
``` ```
%% Cell type:markdown id:romance-superintendent tags: %% Cell type:markdown id:intimate-penny tags:
### Population statistics ### Population statistics
%% Cell type:code id:freelance-florence tags: %% Cell type:code id:arabic-casting tags:
``` python ``` python
population_stat = total_number_of_records['Number of Records'].describe() population_stat = total_number_of_records['Number of Records'].describe()
population_stat.to_csv(output_dir+"/"+"Population statistics") population_stat.to_csv(output_dir+"/"+"Population statistics")
population_stat population_stat
``` ```
%% Cell type:code id:hispanic-standard tags: %% Cell type:code id:friendly-leonard tags:
``` python ``` python
total_number_of_records.boxplot(column=['Number of Records']) total_number_of_records.boxplot(column=['Number of Records'])
``` ```
%% Cell type:code id:patent-scale tags: %% Cell type:code id:loose-throw tags:
``` python ``` python
pop_stat_median = pd.DataFrame(data={"Median": [total_number_of_records["Number of Records"].median()]}) pop_stat_median = pd.DataFrame(data={"Median": [total_number_of_records["Number of Records"].median()]})
pop_stat_median.to_csv(output_dir+"/"+"Population statistics median") pop_stat_median.to_csv(output_dir+"/"+"Population statistics median")
pop_stat_median pop_stat_median
``` ```
%% Cell type:code id:metropolitan-keeping tags: %% Cell type:code id:neither-coating tags:
``` python ``` python
pop_stat_mode = total_number_of_records['Number of Records'].mode() pop_stat_mode = total_number_of_records['Number of Records'].mode()
pop_stat_mode.to_csv(output_dir+"/"+"Population statistics mode") pop_stat_mode.to_csv(output_dir+"/"+"Population statistics mode")
pop_stat_mode pop_stat_mode
``` ```
%% Cell type:markdown id:middle-hamilton tags: %% Cell type:markdown id:banner-criticism tags:
### Total number of images per page ### Total number of images per page
%% Cell type:code id:distinguished-stranger tags: %% Cell type:code id:lesbian-angel tags:
``` python ``` python
query = """SELECT wiki AS Wiki, page_id as `Page ID`, COUNT(*) as `Number of Images` query = """SELECT wiki AS Wiki, page_id as `Page ID`, COUNT(*) as `Number of Images`
FROM gmodena.imagerec_prod FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""' WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id GROUP BY wiki, page_id
ORDER BY wiki, page_id""" ORDER BY wiki, page_id"""
total_number_of_images_per_page = spark.sql(query).toPandas() total_number_of_images_per_page = spark.sql(query).toPandas()
``` ```
%% Cell type:code id:adopted-mexican tags: %% Cell type:code id:polar-click tags:
``` python ``` python
total_number_of_images_per_page.to_csv(output_dir+"/"+"Total number of images per page") total_number_of_images_per_page.to_csv(output_dir+"/"+"Total number of images per page")
total_number_of_images_per_page total_number_of_images_per_page
``` ```
%% Cell type:markdown id:fifty-motel tags: %% Cell type:markdown id:front-ratio tags:
#### Breakdown of the number of images being suggested for each page #### Breakdown of the number of images being suggested for each page
%% Cell type:markdown id:deluxe-father tags: %% Cell type:markdown id:awful-stuart tags:
Keep in mind that pages without an image suggestion will appear as 1. Keep in mind that pages without an image suggestion will appear as 1.
%% Cell type:code id:accomplished-leather tags: %% Cell type:code id:neither-emphasis tags:
``` python ``` python
query = """SELECT number_of_images AS `Image Suggestions`, count(*) AS `Pages` query = """SELECT number_of_images AS `Image Suggestions`, count(*) AS `Pages`
FROM ( FROM (
SELECT wiki, page_id, COUNT(*) as number_of_images SELECT wiki, page_id, COUNT(*) as number_of_images
FROM gmodena.imagerec_prod FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""' WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id GROUP BY wiki, page_id
) AS expr_qry ) AS expr_qry
GROUP BY number_of_images GROUP BY number_of_images
ORDER BY number_of_images""" ORDER BY number_of_images"""
breakdown_of_image_sug_per_page = spark.sql(query).toPandas() breakdown_of_image_sug_per_page = spark.sql(query).toPandas()
``` ```
%% Cell type:code id:undefined-childhood tags: %% Cell type:code id:assisted-startup tags:
``` python ``` python
breakdown_of_image_sug_per_page.set_index('Image Suggestions', inplace=True) breakdown_of_image_sug_per_page.set_index('Image Suggestions', inplace=True)
breakdown_of_image_sug_per_page.to_csv(output_dir+"/"+"Breakdown of image sug per page") breakdown_of_image_sug_per_page.to_csv(output_dir+"/"+"Breakdown of image sug per page")
breakdown_of_image_sug_per_page breakdown_of_image_sug_per_page
``` ```
%% Cell type:code id:dynamic-jacket tags: %% Cell type:code id:complicated-delay tags:
``` python ``` python
breakdown_of_image_sug_per_page.plot(y="Pages", breakdown_of_image_sug_per_page.plot(y="Pages",
title="Breakdown of Images Suggestion Per Page", title="Breakdown of Images Suggestion Per Page",
autopct="%.2f", autopct="%.2f",
figsize=(6, 6), figsize=(6, 6),
kind="pie"); kind="pie");
``` ```
%% Cell type:markdown id:excessive-intelligence tags: %% Cell type:markdown id:downtown-manner tags:
Breakdown of image suggestion data by confidence rating. Breakdown of image suggestion data by confidence rating.
A rating of None indicates that the page has no image suggestion. A rating of None indicates that the page has no image suggestion.
%% Cell type:code id:filled-dutch tags: %% Cell type:code id:generic-priority tags:
``` python ``` python
query = """SELECT wiki AS Wiki, confidence_rating AS `Confidence Rating`, COUNT(*) AS `Image Suggestions` query = """SELECT wiki AS Wiki, confidence_rating AS `Confidence Rating`, COUNT(*) AS `Image Suggestions`
FROM gmodena.imagerec_prod FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""' WHERE snapshot='"""+snapshot+"""'
GROUP BY Wiki, `Confidence Rating` GROUP BY Wiki, `Confidence Rating`
ORDER BY Wiki, `Confidence Rating`""" ORDER BY Wiki, `Confidence Rating`"""
breakdown_of_image_sug_by_confidence_score = spark.sql(query).toPandas() breakdown_of_image_sug_by_confidence_score = spark.sql(query).toPandas()
``` ```
%% Cell type:code id:effective-thomson tags: %% Cell type:code id:impressive-failure tags:
``` python ``` python
breakdown_of_image_sug_by_confidence_score.to_csv(output_dir+"/"+"Breakdown of image sug by conf rating") breakdown_of_image_sug_by_confidence_score.to_csv(output_dir+"/"+"Breakdown of image sug by conf rating")
breakdown_of_image_sug_by_confidence_score breakdown_of_image_sug_by_confidence_score
``` ```
%% Cell type:markdown id:cultural-defeat tags: %% Cell type:markdown id:executive-theory tags:
#### Get articles with more than 3 image suggestions #### Get articles with more than 3 image suggestions
Assuming no errors, this table should be empty. Assuming no errors, this table should be empty.
%% Cell type:code id:compressed-brooks tags: %% Cell type:code id:fiscal-poverty tags:
``` python ``` python
query = """WITH large_image_sug AS query = """WITH large_image_sug AS
(SELECT wiki, page_id, COUNT(*) (SELECT wiki, page_id, COUNT(*)
FROM gmodena.imagerec_prod FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""' WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id GROUP BY wiki, page_id
HAVING COUNT(*) > 3) HAVING COUNT(*) > 3)
SELECT p.* SELECT p.*
FROM gmodena.imagerec_prod p FROM gmodena.imagerec_prod p
JOIN large_image_sug JOIN large_image_sug
ON large_image_sug.wiki = p.wiki ON large_image_sug.wiki = p.wiki
AND large_image_sug.page_id = p.page_id AND large_image_sug.page_id = p.page_id
AND p.snapshot='"""+snapshot+"""' AND p.snapshot='"""+snapshot+"""'
ORDER BY p.wiki, p.page_id, p.image_id""" ORDER BY p.wiki, p.page_id, p.image_id"""
articles_with_more_image_sug = spark.sql(query).toPandas() articles_with_more_image_sug = spark.sql(query).toPandas()
``` ```
%% Cell type:code id:happy-navigator tags: %% Cell type:code id:metallic-visibility tags:
``` python ``` python
articles_with_more_image_sug.to_csv(output_dir+"/"+"Articles with more than 3 sug") articles_with_more_image_sug.to_csv(output_dir+"/"+"Articles with more than 3 sug")
articles_with_more_image_sug articles_with_more_image_sug
``` ```
%% Cell type:markdown id:dental-bennett tags: %% Cell type:markdown id:invalid-trader tags:
###