major initial refactoring
- move `hql` folder to root
- rename Spark jobs
- merge `section_topics.py` into `topic_slis.py`, i.e., section topics-based suggestions
- rename `commons.get_commons_data` to `commons.build_weighted_tags`
- remove `unillustratable.get_overused_images` wrapper function, directly use `overused_images.get` instead
- update tests
Bug: T378005
Airflow test run
The test generated 10160 fewer ALIS than the production run. I guess that's probably fine, due to the nondeterministic nature of the pipeline.
# Total suggestions
prod = spark.read.table('analytics_platform_eng.image_suggestions_suggestions').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_suggestions').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(25664138, 25653978)
# ALIS
prod.where('section_index is null').count(), dev.where('section_index is null').count()
(24413747, 24403587)
# SLIS
prod.where('section_index is not null').count(), dev.where('section_index is not null').count()
(1250391, 1250391)
# Wikidata data
prod = spark.read.table('analytics_platform_eng.image_suggestions_wikidata_data').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_wikidata_data').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(110915735, 110915735)
# Lead image data
prod = spark.read.table('analytics_platform_eng.image_suggestions_lead_image_data').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_lead_image_data').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(8381808, 8381808)
# Search index full
prod = spark.read.table('analytics_platform_eng.image_suggestions_search_index_full').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_search_index_full').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(77978152, 77926993)
# Search index delta
prod = spark.read.table('analytics_platform_eng.image_suggestions_search_index_delta').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_search_index_delta').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(199608, 203136)
# Title cache
prod = spark.read.table('analytics_platform_eng.image_suggestions_title_cache').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_title_cache').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(4655331, 4651845)
# Instance-of cache
prod = spark.read.table('analytics_platform_eng.image_suggestions_instanceof_cache').where('snapshot="2024-12-23"')
dev = spark.read.table('refactoring.image_suggestions_instanceof_cache').where('snapshot="2024-12-23"')
prod.count(), dev.count()
(4702646, 4699160)
Edited by Marco Fossati