Commit e37946cc authored by Clarakosi's avatar Clarakosi Committed by GitHub
Browse files

Add a list of instances to filter (#14)

* Add a list of instances to filter

* Update ddl/export_prod_data.hql changelog

* Add metrics and move filter list to enum class

* Update column name to is_article_page
parent 4ac2c22c
%% Cell type:code id:impressed-fourth tags:
``` python
import pyspark.sql
import pandas as pd
import os
import getpass
%% Cell type:code id:deluxe-mailman tags:parameters
``` python
# Notebook parameters (this cell is tagged `parameters`).
# Create output directory. `exist_ok=True` makes the call a no-op when the
# directory is already there, so no separate existence check is needed and
# re-running the cell is safe.
output_dir = "Data_Metrics_Output"
os.makedirs(output_dir, exist_ok=True)
# Snapshot whose data the metrics below are computed for.
snapshot = "2021-01"
# Name of the user running the notebook; used as the database name in the
# queries that read from `{username}.<table>`.
username = getpass.getuser()
%% Cell type:markdown id:improving-jonathan tags:
### Total number of records (per wiki)
%% Cell type:code id:engaged-inflation tags:
``` python
# Count the records of the selected snapshot for every wiki.
query = "\n".join([
    "SELECT wiki AS Wiki, snapshot, COUNT(*) as `Number of Records`",
    "FROM gmodena.imagerec_prod",
    "WHERE snapshot='" + snapshot + "'",
    "GROUP BY wiki, snapshot",
    "ORDER BY wiki",
])
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
total_number_of_records = spark.sql(query).toPandas()
%% Cell type:code id:lucky-vocabulary tags:
``` python
%% Cell type:code id:activated-worker tags:
``` python
# Export the per-wiki record counts to the output directory.
total_number_of_records.to_csv(f"{output_dir}/Total number of records")
%% Cell type:markdown id:intimate-penny tags:
### Population statistics
%% Cell type:code id:arabic-casting tags:
``` python
# Descriptive statistics of the per-wiki record counts, exported as CSV.
population_stat = total_number_of_records["Number of Records"].describe()
population_stat.to_csv(f"{output_dir}/Population statistics")
%% Cell type:code id:friendly-leonard tags:
``` python
# Box plot of the per-wiki record counts.
total_number_of_records.boxplot(column=['Number of Records'])
%% Cell type:code id:loose-throw tags:
``` python
# Median of the per-wiki record counts, stored as a one-row table and exported.
pop_stat_median = pd.DataFrame(
    {"Median": [total_number_of_records["Number of Records"].median()]}
)
pop_stat_median.to_csv(f"{output_dir}/Population statistics median")
%% Cell type:code id:neither-coating tags:
``` python
# Most frequent per-wiki record count(s), exported as CSV.
pop_stat_mode = total_number_of_records["Number of Records"].mode()
pop_stat_mode.to_csv(f"{output_dir}/Population statistics mode")
%% Cell type:markdown id:banner-criticism tags:
### Total number of images per page
%% Cell type:code id:lesbian-angel tags:
``` python
# Count the rows of the selected snapshot for every (wiki, page) pair.
query = "\n".join([
    "SELECT wiki AS Wiki, page_id as `Page ID`, COUNT(*) as `Number of Images`",
    "FROM gmodena.imagerec_prod",
    "WHERE snapshot='" + snapshot + "'",
    "GROUP BY wiki, page_id",
    "ORDER BY wiki, page_id",
])
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
total_number_of_images_per_page = spark.sql(query).toPandas()
%% Cell type:code id:polar-click tags:
``` python
# Export the per-page image counts to the output directory.
total_number_of_images_per_page.to_csv(f"{output_dir}/Total number of images per page")
%% Cell type:markdown id:front-ratio tags:
#### Breakdown of the number of images being suggested for each page
%% Cell type:markdown id:awful-stuart tags:
Keep in mind that pages without an image suggestion will appear as 1.
%% Cell type:code id:neither-emphasis tags:
``` python
# Distribution of suggestion counts: the inner query counts the rows of each
# (wiki, page_id) pair in the selected snapshot, and the outer query counts
# how many pages share each of those per-page counts.
# The inner query has to be opened with `FROM (` to match the closing
# `) AS expr_qry`; without it the statement is not valid SQL.
query = """SELECT number_of_images AS `Image Suggestions`, count(*) AS `Pages`
FROM (
SELECT wiki, page_id, COUNT(*) as number_of_images
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id
) AS expr_qry
GROUP BY number_of_images
ORDER BY number_of_images"""
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
breakdown_of_image_sug_per_page = spark.sql(query).toPandas()
%% Cell type:code id:assisted-startup tags:
``` python
# Index the breakdown by suggestion count, then export it.
breakdown_of_image_sug_per_page = breakdown_of_image_sug_per_page.set_index("Image Suggestions")
breakdown_of_image_sug_per_page.to_csv(f"{output_dir}/Breakdown of image sug per page")
%% Cell type:code id:complicated-delay tags:
``` python
title="Breakdown of Images Suggestion Per Page",
figsize=(6, 6),
%% Cell type:markdown id:downtown-manner tags:
Breakdown of image suggestion data by confidence rating.
A rating of None indicates that the page has no image suggestion.
%% Cell type:code id:generic-priority tags:
``` python
# Count the rows of the selected snapshot for each wiki and confidence rating.
query = "\n".join([
    "SELECT wiki AS Wiki, confidence_rating AS `Confidence Rating`, COUNT(*) AS `Image Suggestions`",
    "FROM gmodena.imagerec_prod",
    "WHERE snapshot='" + snapshot + "'",
    "GROUP BY Wiki, `Confidence Rating`",
    "ORDER BY Wiki, `Confidence Rating`",
])
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
breakdown_of_image_sug_by_confidence_score = spark.sql(query).toPandas()
%% Cell type:code id:impressive-failure tags:
``` python
# Export the per-confidence-rating breakdown to the output directory.
breakdown_of_image_sug_by_confidence_score.to_csv(f"{output_dir}/Breakdown of image sug by conf rating")
%% Cell type:markdown id:executive-theory tags:
#### Get articles with more than 3 image suggestions
Assuming no errors, this table should be empty.
%% Cell type:code id:fiscal-poverty tags:
``` python
query = """WITH large_image_sug AS
(SELECT wiki, page_id, COUNT(*)
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id
FROM gmodena.imagerec_prod p
JOIN large_image_sug
ON =
AND large_image_sug.page_id = p.page_id
AND p.snapshot='"""+snapshot+"""'
ORDER BY, p.page_id, p.image_id"""
articles_with_more_image_sug = spark.sql(query).toPandas()
%% Cell type:code id:metallic-visibility tags:
``` python
# Export the articles that have more than 3 suggestions to the output directory.
articles_with_more_image_sug.to_csv(f"{output_dir}/Articles with more than 3 sug")
%% Cell type:markdown id:invalid-trader tags:
### Size and counts of intermediate and final datasets
%% Cell type:code id:integrated-spell tags:
``` python
# Count the records of the selected snapshot for every wiki in the
# gmodena.imagerec table.
query = "\n".join([
    "SELECT wiki_db AS `Wiki`, snapshot, COUNT(*) AS `Raw Number of Records`",
    "FROM gmodena.imagerec",
    "WHERE snapshot='" + snapshot + "'",
    "GROUP BY wiki_db, snapshot",
    "ORDER BY wiki_db",
])
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
raw_total_number_of_records = spark.sql(query).toPandas()
%% Cell type:code id:apparent-marble tags:
``` python
%% Cell type:code id:aquatic-selling tags:
``` python
# Rename the count column of the final dataset so it can sit next to the
# raw count after the join.
total_number_of_records = total_number_of_records.rename(
    columns={"Number of Records": "Final Number of Records"}
)
# Join raw and final counts on wiki and snapshot (default inner join).
result = raw_total_number_of_records.merge(
    total_number_of_records, on=["Wiki", "snapshot"]
)
%% Cell type:code id:supreme-monday tags:
``` python
# Export the raw-vs-final record counts to the output directory.
result.to_csv(f"{output_dir}/Counts of intermediate and final datasets")
%% Cell type:code id:green-intellectual tags:
``` python
y=["Raw Number of Records", "Final Number of Records"],
title="Comparison of intermediate and final number of records",
figsize=(6, 6),
%% Cell type:markdown id:supposed-nigeria tags:
### Number of articles with and without valid "instance of"
Todo: Update snapshot and table name to be passed in parameters
%% Cell type:code id:regulation-rental tags:
``` python
# Per wiki: number of rows with a non-null instance_of (COUNT skips NULLs)
# and number of rows where instance_of is NULL.
# The snapshot is taken from the notebook's `snapshot` variable, like in the
# other queries, instead of a hard-coded '2021-01' literal, so this metric
# follows the snapshot the rest of the notebook is run for.
query = """SELECT wiki_db, snapshot,
COUNT(instance_of) AS with_instance_of,
SUM(CASE WHEN instance_of IS NULL then 1 ELSE 0 END) AS without_instance_of
FROM gmodena.imagerec_parquet
WHERE snapshot = '"""+snapshot+"""'
GROUP BY wiki_db, snapshot
ORDER BY wiki_db"""
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
instance_of_metrics = spark.sql(query).toPandas()
%% Cell type:code id:offensive-underwear tags:
``` python
# Export the instance_of metrics to the output directory.
instance_of_metrics.to_csv(f"{output_dir}/Number of articles with and without valid instance_of")
%% Cell type:code id:chronic-clothing tags:
``` python
### Number of redirect articles
Validate that no "page redirects" are present in the dataset.
%% Cell type:code id:taken-ordinary tags:
``` python
# Count, per snapshot, the rows of {username}.imagerec whose page is flagged
# as a redirect (page_is_redirect = 1) in wmf_raw.mediawiki_page. Rows are
# matched on wiki_db, page_id (both sides cast to string) and snapshot; rows
# with an empty wiki_db are ignored.
# NOTE(review): the snapshot filter is `>=`, so every snapshot from the
# selected one onwards is included, whereas the other queries in this
# notebook use `=` — confirm this is intended.
query = f"""
select im.snapshot, count(*) as page_redirect from {username}.imagerec im
join wmf_raw.mediawiki_page as mp
where im.wiki_db = mp.wiki_db
and cast(im.page_id as string) = cast(mp.page_id as string)
and im.snapshot = mp.snapshot
and mp.page_is_redirect = 1
and im.wiki_db != '' and im.snapshot >= "{snapshot}"
group by im.snapshot"""
# Run the query on Spark and collect the result as a pandas DataFrame.
page_redirect = spark.sql(query).toPandas()
# Export the redirect counts to the output directory.
page_redirect.to_csv(os.path.join(output_dir, "Page redirects"))
%% Cell type:markdown id: tags:
### Number of records filtered out
%% Cell type:code id: tags:
``` python
# Per wiki and snapshot: number of rows flagged as article pages (kept) and
# number of rows not flagged as article pages (filtered out).
# The query must be an f-string so that {username} is substituted; as a
# plain string the literal text "{username}" would be sent to Spark.
# The column aliases contain spaces and are therefore quoted with backticks,
# as in the other queries of this notebook.
query = f"""SELECT wiki, snapshot,
SUM(CASE WHEN is_article_page = True THEN 1 ELSE 0 END ) as `Final number of records`,
SUM(CASE WHEN is_article_page = False THEN 1 ELSE 0 END ) as `Number of records filtered out`
FROM {username}.imagerec_prod
GROUP BY wiki, snapshot"""
# Run the aggregation on Spark and collect the result as a pandas DataFrame.
filtered_out_records = spark.sql(query).toPandas()
# Export the counts to the output directory.
filtered_out_records.to_csv(output_dir+"/"+"Number of records filtered out")
......@@ -13,6 +13,7 @@
-- Changelog:
-- * 2021-03-08: schema and format freeze.
-- * 2021-03-25: add is_article_page to where clause
use ${hiveconf:username};
set hivevar:null_value="";
......@@ -26,4 +27,4 @@ select page_id,
from imagerec_prod
where wiki = '${hiveconf:wiki}' and snapshot='${hiveconf:snapshot}'
where wiki = '${hiveconf:wiki}' and snapshot='${hiveconf:snapshot}' and is_article_page=true
......@@ -16,22 +16,14 @@ CREATE EXTERNAL TABLE IF NOT EXISTS `imagerec_prod`(
`image_id` string,
`confidence_rating` string,
`source` string,
`instance_of` string,
`is_article_page` boolean,
`dataset_id` string,
`insertion_ts` float)
`insertion_ts` double)
PARTITIONED BY (`wiki` string, `snapshot` string)
-- Update partition metadata
MSCK REPAIR TABLE `imagerec_prod`;
from enum import Enum
class InstancesToFilter(Enum):
    """Identifiers of the `instance_of` values that should be filtered."""

    YEAR = "Q577"
    LIST = "Q13406463"

    @classmethod
    def list(cls):
        """Return the value of every member, in definition order.

        Declared as a classmethod so it can be called on the class itself
        (``InstancesToFilter.list()``) without needing a member instance.
        """
        return [member.value for member in cls]
......@@ -3,6 +3,7 @@ from pyspark.sql import Column, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from schema import RawDataset
from instances_to_filter import InstancesToFilter
import argparse
import uuid
......@@ -41,6 +42,14 @@ class ImageRecommendation:
is_article_page: Column = (
def __init__(self, dataFrame: DataFrame):
self.dataFrame = dataFrame
if not dataFrame.schema == RawDataset.schema:
......@@ -89,7 +98,9 @@ class ImageRecommendation:
return with_recommendations.union(without_recommendations).withColumn("instance_of", self.instance_of)
return with_recommendations.union(without_recommendations)\
.withColumn("instance_of", self.instance_of)\
.withColumn("is_article_page", self.is_article_page)
def parse_args():
......@@ -111,7 +111,7 @@ echo "Generating production data"
spark2-submit --properties-file ${spark_config} --files etl/ etl/ \
--snapshot ${monthly_snapshot} \
--source ${hdfs_imagerec} \
......@@ -17,6 +17,7 @@ def test_etl(raw_data):
......@@ -59,3 +60,14 @@ def test_etl(raw_data):
assert len(rows) == 1
assert rows[0]["instance_of"] == expected_instance_of
# Pages are correctly marked for filtering
expected_page_id = "523523"
filter_out_rows = (
assert len(filter_out_rows) == 1
assert filter_out_rows[0]["page_id"] == expected_page_id
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment