Commit e4163f38 authored by Clarakosi's avatar Clarakosi Committed by GitHub
Browse files

T275165 dataset metrics (#8)

* Add initial dataset metrics

* Update draft dataset metrics with updated datasets

* Add dataset metrics script and comparison of intermediate & final data

* Add initial dataset metrics

* Update draft dataset metrics with updated datasets

* Add dataset metrics script and comparison of intermediate & final data

* Changes based on code review

* Add initial dataset metrics

* Update draft dataset metrics with updated datasets

* Add dataset metrics script and comparison of intermediate & final data

* Add initial dataset metrics

* Update draft dataset metrics with updated datasets

* Add dataset metrics script and comparison of intermediate & final data

* Changes based on code review

* Update dataset_metrics_runner
parent 05888e6a
......@@ -15,9 +15,9 @@ clean_spark:
flake8: venv
# stop the build if there are Python syntax errors or undefined names in *.py file
. venv/bin/activate; flake8 *.py etl/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics
. venv/bin/activate; flake8 *.py dataset_metrics/ etl/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
. venv/bin/activate; flake8 *.py etl/ tests/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
. venv/bin/activate; flake8 *.py dataset_metrics/ etl/ tests/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
test: venv
. venv/bin/activate; pytest --cov etl tests/
......@@ -73,3 +73,10 @@ python3 algorunner.py 2020-12-28 hywiki Output
Will submit the `algorunner` job, with additional instrumentation.
For more information refer to https://spark.apache.org/docs/latest/monitoring.html.
### Get dataset Metrics
To get the dataset metrics run the `dataset_metrics_runner.py` script. The script expects the **snapshot** (required)
and **output directory** (defaults to Output).
```shell
cd dataset_metrics/
python3 dataset_metrics_runner.py 2021-01 Output
```
\ No newline at end of file
%% Cell type:code id:requested-karaoke tags:
``` python
import pyspark.sql
import pandas as pd
```
%% Cell type:code id:provincial-southeast tags:parameters
``` python
# Create the output directory that all CSV exports below are written into.
# NOTE: `os` was used here without being imported anywhere in this notebook,
# so this cell raised NameError; import it locally to keep the cell self-contained.
import os

output_dir = "Data_Metrics_Output"
# exist_ok avoids the check-then-create race and a crash on re-runs
os.makedirs(output_dir, exist_ok=True)

# Snapshot (YYYY-MM) of the imagerec tables this report covers
snapshot = "2021-01"
```
%% Cell type:markdown id:incorporate-registration tags:
### Total number of records (per wiki)
%% Cell type:code id:ranking-gibraltar tags:
``` python
# Count of production image-recommendation records per wiki for the snapshot.
# NOTE(review): `spark` is assumed to be a live SparkSession supplied by the
# notebook environment — it is not created anywhere in this notebook.
query = """SELECT wiki AS Wiki, snapshot, COUNT(*) as `Number of Records`
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, snapshot
ORDER BY wiki"""
total_number_of_records = spark.sql(query).toPandas()
```
%% Cell type:code id:dangerous-conservative tags:
``` python
# Display the per-wiki record counts computed above
total_number_of_records
```
%% Cell type:code id:standard-special tags:
``` python
# Persist the per-wiki record counts as CSV in the output directory
total_number_of_records.to_csv(f"{output_dir}/Total number of records")
```
%% Cell type:markdown id:romance-superintendent tags:
### Population statistics
%% Cell type:code id:freelance-florence tags:
``` python
# Summary statistics (count, mean, std, quartiles) over per-wiki record counts
population_stat = total_number_of_records["Number of Records"].describe()
population_stat.to_csv(f"{output_dir}/Population statistics")
population_stat
```
%% Cell type:code id:hispanic-standard tags:
``` python
# Box plot of per-wiki record counts to visualize spread and outliers
total_number_of_records.boxplot(column=['Number of Records'])
```
%% Cell type:code id:patent-scale tags:
``` python
# Median of the per-wiki record counts, saved as a one-cell table
median_value = total_number_of_records["Number of Records"].median()
pop_stat_median = pd.DataFrame(data={"Median": [median_value]})
pop_stat_median.to_csv(f"{output_dir}/Population statistics median")
pop_stat_median
```
%% Cell type:code id:metropolitan-keeping tags:
``` python
# Mode (most frequent per-wiki record count); may contain several values
pop_stat_mode = total_number_of_records["Number of Records"].mode()
pop_stat_mode.to_csv(f"{output_dir}/Population statistics mode")
pop_stat_mode
```
%% Cell type:markdown id:middle-hamilton tags:
### Total number of images per page
%% Cell type:code id:distinguished-stranger tags:
``` python
# One row per (wiki, page) with the number of suggested images for that page.
# `spark` is assumed to be a SparkSession provided by the notebook environment.
query = """SELECT wiki AS Wiki, page_id as `Page ID`, COUNT(*) as `Number of Images`
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id
ORDER BY wiki, page_id"""
total_number_of_images_per_page = spark.sql(query).toPandas()
```
%% Cell type:code id:adopted-mexican tags:
``` python
# Save the per-page image counts, then display them
total_number_of_images_per_page.to_csv(f"{output_dir}/Total number of images per page")
total_number_of_images_per_page
```
%% Cell type:markdown id:fifty-motel tags:
#### Breakdown of the number of images being suggested for each page
%% Cell type:markdown id:deluxe-father tags:
Keep in mind that pages without an image suggestion will still appear with a count of 1.
%% Cell type:code id:accomplished-leather tags:
``` python
# Distribution: for each suggestion count, how many pages have exactly that
# many suggestions in this snapshot (inner query counts per page, outer
# query groups pages by that count).
query = """SELECT number_of_images AS `Image Suggestions`, count(*) AS `Pages`
FROM (
SELECT wiki, page_id, COUNT(*) as number_of_images
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id
) AS expr_qry
GROUP BY number_of_images
ORDER BY number_of_images"""
breakdown_of_image_sug_per_page = spark.sql(query).toPandas()
```
%% Cell type:code id:undefined-childhood tags:
``` python
# Index the distribution by suggestion count, save it, and display it
breakdown_of_image_sug_per_page.set_index("Image Suggestions", inplace=True)
breakdown_of_image_sug_per_page.to_csv(f"{output_dir}/Breakdown of image sug per page")
breakdown_of_image_sug_per_page
```
%% Cell type:code id:dynamic-jacket tags:
``` python
# Pie chart of how many pages receive 1, 2, 3, ... suggestions.
# The trailing semicolon suppresses the matplotlib object echo in the notebook.
breakdown_of_image_sug_per_page.plot(y="Pages",
title="Breakdown of Images Suggestion Per Page",
autopct="%.2f",
figsize=(6, 6),
kind="pie");
```
%% Cell type:markdown id:excessive-intelligence tags:
Breakdown of image suggestion data by confidence rating.
A rating of None indicates that the page has no image suggestion
%% Cell type:code id:filled-dutch tags:
``` python
# Suggestions per wiki broken down by confidence rating; per the markdown
# above, a rating of None means the page has no image suggestion.
query = """SELECT wiki AS Wiki, confidence_rating AS `Confidence Rating`, COUNT(*) AS `Image Suggestions`
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY Wiki, `Confidence Rating`
ORDER BY Wiki, `Confidence Rating`"""
breakdown_of_image_sug_by_confidence_score = spark.sql(query).toPandas()
```
%% Cell type:code id:effective-thomson tags:
``` python
# Save the confidence-rating breakdown, then display it
breakdown_of_image_sug_by_confidence_score.to_csv(f"{output_dir}/Breakdown of image sug by conf rating")
breakdown_of_image_sug_by_confidence_score
```
%% Cell type:markdown id:cultural-defeat tags:
#### Get articles with more than 3 image suggestions
Assuming no errors, this table should be empty.
%% Cell type:code id:compressed-brooks tags:
``` python
# Full rows for any (wiki, page) carrying more than 3 suggestions in this
# snapshot; per the note above this is expected to be empty (error check).
query = """WITH large_image_sug AS
(SELECT wiki, page_id, COUNT(*)
FROM gmodena.imagerec_prod
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki, page_id
HAVING COUNT(*) > 3)
SELECT p.*
FROM gmodena.imagerec_prod p
JOIN large_image_sug
ON large_image_sug.wiki = p.wiki
AND large_image_sug.page_id = p.page_id
AND p.snapshot='"""+snapshot+"""'
ORDER BY p.wiki, p.page_id, p.image_id"""
articles_with_more_image_sug = spark.sql(query).toPandas()
```
%% Cell type:code id:happy-navigator tags:
``` python
# Save the (expected-empty) over-limit table, then display it
articles_with_more_image_sug.to_csv(f"{output_dir}/Articles with more than 3 sug")
articles_with_more_image_sug
```
%% Cell type:markdown id:dental-bennett tags:
### Size and counts of intermediate and final datasets
%% Cell type:code id:complete-glossary tags:
``` python
# Per-wiki record counts from the intermediate (raw) table, for comparison
# against the final production counts computed earlier. Note the raw table
# keys wikis by `wiki_db` rather than `wiki`.
query = """SELECT wiki_db AS `Wiki`, snapshot, COUNT(*) AS `Raw Number of Records`
FROM gmodena.imagerec
WHERE snapshot='"""+snapshot+"""'
GROUP BY wiki_db, snapshot
ORDER BY wiki_db"""
raw_total_number_of_records = spark.sql(query).toPandas()
```
%% Cell type:code id:numerical-bryan tags:
``` python
# Display the intermediate (raw) per-wiki counts
raw_total_number_of_records
```
%% Cell type:code id:packed-counter tags:
``` python
# Rename the final count column to distinguish it, then join raw
# (intermediate) and final counts on wiki and snapshot
total_number_of_records = total_number_of_records.rename(columns={"Number of Records": "Final Number of Records"})
result = raw_total_number_of_records.merge(total_number_of_records, on=["Wiki", "snapshot"])
```
%% Cell type:code id:instrumental-species tags:
``` python
# Save the raw-vs-final comparison table, then display it
result.to_csv(f"{output_dir}/Counts of intermediate and final datasets")
result
```
%% Cell type:code id:modern-productivity tags:
``` python
# Grouped bar chart comparing intermediate vs final record counts per wiki
result.plot(x="Wiki",
y=["Raw Number of Records", "Final Number of Records"],
title="Comparison of intermediate and final number of records",
figsize=(6, 6),
kind="bar")
```
%% Cell type:code id:worse-fleece tags:
``` python
```
import argparse
import papermill as pm
import os
class DatasetMetricsRunner:
def __init__(self, snapshot, output_dir):
    """
    Capture the configuration for a dataset-metrics run.

    :param str snapshot: Snapshot date (e.g. "2021-01")
    :param str output_dir: Directory to place output .ipynb and .csv files
    """
    self.output_dir = output_dir
    self.snapshot = snapshot
    print(f"Initializing with snapshot={self.snapshot} output_dir={self.output_dir}")
def run(self):
"""