From 41e85c2a811c82942c7c581af9a5b928689dcf20 Mon Sep 17 00:00:00 2001 From: Joseph Allemandou Date: Tue, 27 Sep 2022 09:21:41 +0200 Subject: [PATCH 1/5] Remove log gathering from wikidata metrics skein Two tasks of wikidata metrics job (special-entity-data and reliability metrics) generate too much logs for skein to gather. This patch adds the configuration to the tasks to prevent skein to gather the logs (they still will be visible from HDFS). --- .../wikidata_metrics_to_graphite_daily_dag.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/analytics/dags/wikidata/wikidata_metrics_to_graphite_daily_dag.py b/analytics/dags/wikidata/wikidata_metrics_to_graphite_daily_dag.py index c66d722..d522104 100644 --- a/analytics/dags/wikidata/wikidata_metrics_to_graphite_daily_dag.py +++ b/analytics/dags/wikidata/wikidata_metrics_to_graphite_daily_dag.py @@ -12,7 +12,7 @@ ### Metrics include: * ArticlePlaceholder metrics - - Metrics can be viewed in Graphite under the daily.wikidata.articleplaceholder namespace, + Metrics can be viewed in Graphite under the daily.wikidata.articleplaceholder namespace, in varnish_requests.abouttopic.* folder * Reliability metrics - metrics for the Wikidata reliability graphs @@ -66,13 +66,13 @@ with DAG( #define common arguments for the 3 etl tasks hive_to_graphite_common_args = [ # Graphite parameters - '--graphite_host', 'graphite-in.eqiad.wmnet', + '--graphite_host', 'graphite-in.eqiad.wmnet', '--graphite_port', 2003, - # HQL parameters - '-d', f'webrequest_table={source_table}', - '-d', f'year={year}', - '-d', f'month={month}', - '-d', f'day={day}', + # HQL parameters + '-d', f'webrequest_table={source_table}', + '-d', f'year={year}', + '-d', f'month={month}', + '-d', f'day={day}', '-d', 'coalesce_partitions=4' ] @@ -94,9 +94,12 @@ with DAG( application=var_props.get('refinery_job_jar', dag_config.artifact('refinery-job-0.2.1-shaded.jar')), java_class='org.wikimedia.analytics.refinery.job.HiveToGraphite', + # This job generates too much YARN logs for the skein client. + # Preventing skein to collect logs avoids issues. + skein_app_log_collection_enabled=False, application_args=[ '-f', var_props.get('query_file2', f'{dag_config.hql_directory}/wikidata/wikidata_reliability_metrics.hql'), - '--metric_prefix', var_props.get('metric_prefix2', 'daily.wikidata.reliability_metrics'), + '--metric_prefix', var_props.get('metric_prefix2', 'daily.wikidata.reliability_metrics'), ]+hive_to_graphite_common_args, conf={'spark.dynamicAllocation.maxExecutors': 128}, sla=timedelta(hours=6) @@ -107,6 +110,9 @@ with DAG( application=var_props.get('refinery_job_jar', dag_config.artifact('refinery-job-0.2.1-shaded.jar')), java_class='org.wikimedia.analytics.refinery.job.HiveToGraphite', + # This job generates too much YARN logs for the skein client. + # Preventing skein to collect logs avoids issues. + skein_app_log_collection_enabled=False, application_args=[ '-f', var_props.get('query_file3', f'{dag_config.hql_directory}/wikidata/wikidata_specialentity_data_metrics.hql'), '--metric_prefix', var_props.get('metric_prefix3', 'daily.wikidata.entitydata'), -- GitLab From b3914c3ed8f3d0aa0d125aa5c5954df9f874ddbd Mon Sep 17 00:00:00 2001 From: AQ Date: Tue, 27 Sep 2022 10:01:52 +0200 Subject: [PATCH 2/5] Add missing hdfds-tools artifact --- analytics/config/artifacts.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analytics/config/artifacts.yaml b/analytics/config/artifacts.yaml index 31fab8e..ee16677 100644 --- a/analytics/config/artifacts.yaml +++ b/analytics/config/artifacts.yaml @@ -47,6 +47,10 @@ artifacts: id: datahub:cli:tgz:0.8.38 source: wmf_archiva_python + hdfs-tools-0.0.6-shaded.jar: + id: org.wikimedia.analytics.hdfstools:hdfs-tools:jar:shaded:0.0.6 + source: wmf_archiva_releases + # HACK # This jar has been manually loaded to archiva, it contains a patch to a bug # preventing cassandra quoted-columns to be accessible in spark-sql. -- GitLab From 9307a97e99597dfd40e0ea33825a81d01b06dbbf Mon Sep 17 00:00:00 2001 From: AQ Date: Tue, 27 Sep 2022 10:23:02 +0200 Subject: [PATCH 3/5] Linting --- wmf_airflow_common/operators/hdfs.py | 2 +- wmf_airflow_common/operators/skein.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/wmf_airflow_common/operators/hdfs.py b/wmf_airflow_common/operators/hdfs.py index b06cbad..966a401 100644 --- a/wmf_airflow_common/operators/hdfs.py +++ b/wmf_airflow_common/operators/hdfs.py @@ -12,7 +12,7 @@ class HDFSArchiveOperator(SimpleSkeinOperator): - moves the source file to the archive file - deletes the source directory """ - template_fields = SimpleSkeinOperator.template_fields + ( + template_fields: tuple = SimpleSkeinOperator.template_fields + ( 'source_directory', 'archive_file', 'hdfsarchiver_job_shaded_jar_path',) diff --git a/wmf_airflow_common/operators/skein.py b/wmf_airflow_common/operators/skein.py index 398a4d6..5dcb0e2 100644 --- a/wmf_airflow_common/operators/skein.py +++ b/wmf_airflow_common/operators/skein.py @@ -48,12 +48,12 @@ class SimpleSkeinOperator(PythonOperator): except that the script will be run in YARN instead of locally. Uses wmf_airflow_common.hooks.skein.SkeinHookBuilder - to aide in building a simple SkeinHook. + to aid in building a simple SkeinHook. """ - # Need to ignore mypy incompatible type assignmnet warning here. + # Need to ignore mypy incompatible type assignment warning here. # PythonOperator defines this as a 3 tuple, but we want to override it. - template_fields = PythonOperator.template_fields + ( + template_fields: tuple = PythonOperator.template_fields + ( '_name', # type: ignore[assignment] '_queue', '_principal', @@ -144,7 +144,7 @@ class SimpleSkeinOperator(PythonOperator): **kwargs ) - def hook_submit(self, script): + def hook_submit(self, script) -> None: builder = SkeinHookBuilder() -- GitLab From 0ffdca13af0a7ef836d22d42a57e97da348cc12c Mon Sep 17 00:00:00 2001 From: AQ Date: Tue, 27 Sep 2022 10:43:45 +0200 Subject: [PATCH 4/5] More linting --- wmf_airflow_common/operators/skein.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wmf_airflow_common/operators/skein.py b/wmf_airflow_common/operators/skein.py index 5dcb0e2..28f9d17 100644 --- a/wmf_airflow_common/operators/skein.py +++ b/wmf_airflow_common/operators/skein.py @@ -54,7 +54,7 @@ class SimpleSkeinOperator(PythonOperator): # Need to ignore mypy incompatible type assignment warning here. # PythonOperator defines this as a 3 tuple, but we want to override it. template_fields: tuple = PythonOperator.template_fields + ( - '_name', # type: ignore[assignment] + '_name', '_queue', '_principal', '_keytab', @@ -144,7 +144,7 @@ class SimpleSkeinOperator(PythonOperator): **kwargs ) - def hook_submit(self, script) -> None: + def hook_submit(self, script: str) -> None: builder = SkeinHookBuilder() -- GitLab From 08432830ba82ba7824688d2e60bcce990ee361a8 Mon Sep 17 00:00:00 2001 From: AQ Date: Tue, 27 Sep 2022 12:55:31 +0200 Subject: [PATCH 5/5] Revert unrelated work to this MR Revert "More linting" This reverts commit 0ffdca13af0a7ef836d22d42a57e97da348cc12c. Revert "Linting" This reverts commit 9307a97e99597dfd40e0ea33825a81d01b06dbbf. Revert "Add missing hdfds-tools artifact" This reverts commit b3914c3ed8f3d0aa0d125aa5c5954df9f874ddbd. --- analytics/config/artifacts.yaml | 4 ---- wmf_airflow_common/operators/hdfs.py | 2 +- wmf_airflow_common/operators/skein.py | 10 +++++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/analytics/config/artifacts.yaml b/analytics/config/artifacts.yaml index ee16677..31fab8e 100644 --- a/analytics/config/artifacts.yaml +++ b/analytics/config/artifacts.yaml @@ -47,10 +47,6 @@ artifacts: id: datahub:cli:tgz:0.8.38 source: wmf_archiva_python - hdfs-tools-0.0.6-shaded.jar: - id: org.wikimedia.analytics.hdfstools:hdfs-tools:jar:shaded:0.0.6 - source: wmf_archiva_releases - # HACK # This jar has been manually loaded to archiva, it contains a patch to a bug # preventing cassandra quoted-columns to be accessible in spark-sql. diff --git a/wmf_airflow_common/operators/hdfs.py b/wmf_airflow_common/operators/hdfs.py index 966a401..b06cbad 100644 --- a/wmf_airflow_common/operators/hdfs.py +++ b/wmf_airflow_common/operators/hdfs.py @@ -12,7 +12,7 @@ class HDFSArchiveOperator(SimpleSkeinOperator): - moves the source file to the archive file - deletes the source directory """ - template_fields: tuple = SimpleSkeinOperator.template_fields + ( + template_fields = SimpleSkeinOperator.template_fields + ( 'source_directory', 'archive_file', 'hdfsarchiver_job_shaded_jar_path',) diff --git a/wmf_airflow_common/operators/skein.py b/wmf_airflow_common/operators/skein.py index 28f9d17..398a4d6 100644 --- a/wmf_airflow_common/operators/skein.py +++ b/wmf_airflow_common/operators/skein.py @@ -48,13 +48,13 @@ class SimpleSkeinOperator(PythonOperator): except that the script will be run in YARN instead of locally. Uses wmf_airflow_common.hooks.skein.SkeinHookBuilder - to aid in building a simple SkeinHook. + to aide in building a simple SkeinHook. """ - # Need to ignore mypy incompatible type assignment warning here. + # Need to ignore mypy incompatible type assignmnet warning here. # PythonOperator defines this as a 3 tuple, but we want to override it. - template_fields: tuple = PythonOperator.template_fields + ( - '_name', + template_fields = PythonOperator.template_fields + ( + '_name', # type: ignore[assignment] '_queue', '_principal', '_keytab', @@ -144,7 +144,7 @@ class SimpleSkeinOperator(PythonOperator): **kwargs ) - def hook_submit(self, script: str) -> None: + def hook_submit(self, script): builder = SkeinHookBuilder() -- GitLab