Commit f144ae45 authored by Gmodena's avatar Gmodena
Browse files

Merge branch 'T292741-implement-ci-checks-w-tox' into 'multi-project-dags-repo'

T292741 implement ci checks w tox

See merge request gmodena/platform-airflow-dags!9
parents 90ec8494 a550daf6
...@@ -10,18 +10,21 @@ jobs: ...@@ -10,18 +10,21 @@ jobs:
strategy: strategy:
max-parallel: 4 max-parallel: 4
matrix: matrix:
python-version: [3.7, ] python-version: ["3.7", ]
pet-data-pipeline: ["image-matching", ] pet-data-pipeline: ["image-matching", ]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- uses: conda-incubator/setup-miniconda@v2 - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with: with:
activate-environment: venv python-version: ${{ matrix.python-version }}
python-version: ${{ matrix.python-version }} - name: Install tox
auto-activate-base: false run: pip install tox
- name: Lint with flake8 - name: Lint with flake8
run: cd ${{ }}; make lint SKIP_DOCKER=true run: cd ${{ }}; make lint SKIP_DOCKER=true
- name: Type check with mypy - name: Type check with mypy
if: always()
run: cd ${{ }}; make mypy SKIP_DOCKER=true run: cd ${{ }}; make mypy SKIP_DOCKER=true
- name: Test with pytest - name: Test with pytest
if: always()
run: cd ${{ }}; make test SKIP_DOCKER=true run: cd ${{ }}; make test SKIP_DOCKER=true
FROM continuumio/miniconda3
RUN conda install python=3.7
RUN pip install tox
FROM continuumio/miniconda3
RUN conda install python=3.7
RUN pip install tox
...@@ -10,16 +10,25 @@ extra_pypi := ...@@ -10,16 +10,25 @@ extra_pypi :=
CONDA_CMD := conda config --set pip_interop_enabled True; conda create -n ${venv} python=${conda_python_version}; conda init bash; source ~/.bashrc && conda activate ${venv} CONDA_CMD := conda config --set pip_interop_enabled True; conda create -n ${venv} python=${conda_python_version}; conda init bash; source ~/.bashrc && conda activate ${venv}
DOCKER_IMG := platform/miniconda3
DOCKERFILE := ../Dockerfile.conda
ifneq ($(SKIP_DOCKER),true) ifneq ($(SKIP_DOCKER),true)
CURRENT_DIR := $(shell pwd) CURRENT_DIR := $(shell pwd)
DOCKER_IMG := continuumio/miniconda3
DOCKER_CMD := docker run -it \ DOCKER_CMD := docker run -it \
--rm \ --rm \
-v ${CURRENT_DIR}:/root \ -v ${CURRENT_DIR}:/root \
-e SKIP_DOCKER=true \ -e SKIP_DOCKER=true \
-w /root ${DOCKER_IMG} -w /root ${DOCKER_IMG}
lint: docker-conda
test: docker-conda
mypy: docker-conda
venv: docker-conda
endif endif
docker build -t ${DOCKER_IMG} -f ${DOCKERFILE} .
venv: ${pip_requirements} venv: ${pip_requirements}
${DOCKER_CMD} bash -c "export CONDA_ALWAYS_YES=true; ${CONDA_CMD}; \ ${DOCKER_CMD} bash -c "export CONDA_ALWAYS_YES=true; ${CONDA_CMD}; \
pip install --extra-index-url ${extra_pypi} -r ${pip_requirements}; \ pip install --extra-index-url ${extra_pypi} -r ${pip_requirements}; \
...@@ -27,8 +36,11 @@ venv: ${pip_requirements} ...@@ -27,8 +36,11 @@ venv: ${pip_requirements}
conda install conda-pack; \ conda install conda-pack; \
conda-pack -n ${venv} --format ${venv_archive_format}" conda-pack -n ${venv} --format ${venv_archive_format}"
test: ${pip_requirements_test} mypy: ${pip_requirements_test}
${DOCKER_CMD} bash -c "export CONDA_ALWAYS_YES=true; ${CONDA_CMD}; \ ${DOCKER_CMD} bash -c "tox -e mypy"
pip install -r ${pip_requirements_test}; \
python -m pytest tests/"
lint: ${pip_requirements_test}
${DOCKER_CMD} bash -c "tox -e flake8"
test: ${pip_requirements_test}
${DOCKER_CMD} bash -c "tox -e pytest"
[![Project Status: Concept – Minimal or no implementation has been done yet, or the repository is only intended to be a limited example, demo, or proof-of-concept.](](
[![build](]( [![build](](
# wmf-platform-airflow-dags
Experiments with airflow repo and code structure.
# platform-airflow-dags
This repo contains data pipelines operationalised by the Generated Datasets Platform team.
You can reach out to us at
* <Add wikitech url>
* <Add irc channel?>
* Slack: `#data-platform-value-stream`.
# Data pipelines
> […] a pipeline, also known as a data pipeline, is a set of data processing elements connected in series, where the output of one element is the input of the next one. The elements of a pipeline are often executed in parallel or in time-sliced fashion. […] >
A Generated Datasets Platform pipeline is made up of two components:
1. Project-specific tasks and data transformations that operate on input (sources) and produce output (sink). We depend on Apache Spark for elastic compute.
2. An [Airflow DAG](, that is a thin orchestration layer that composes and executes tasks
Data pipelines are executed on Hadoop. Elastic compute is provided by Spark (jobs are deployed in cluster mode). Scheduling and orchestration are delegated to Apache Airflow. Currently we support Python-based projects. Scala support is planned.
## Repo layout
This repository follows a [monorepo]( strategy. Its structure matches the layout of `AIRFLOW_HOME` on the [an-airflow1003.eqiad.wmnet]( airflow instance.
* `dags` contains [Airflow dags]( for all projects. Each DAG schedules a data pipeline. No business logic is contained in the dag.
* `tests/` contains the `dags` validation test suite. Project-specific tests are implemented under `<project-name>`.
* `<project-name>` directories contain tasks and data transformations. For an example, see `image-matching`.
## Deployment
DAGs are currently deployed and scheduled on [an-airflow1003.eqiad.wmnet]( This service has no SLO and is meant for development and experimentation use.
The following command will run code checks and deploy data pipelines:
make deploy-local-build
# CI & code checks
We favour test-driven development with `pytest`, lint with `flake8` and type check with `mypy`. We encourage, but do not yet enforce, the use of `isort` and `black` for formatting code. We log errors and information messages with the Python logging library.
## Code checks
We enforce code checks at the DAG and project level.
### Dag validation
DAG validation tests live under the toplevel `tests` directory. They can be triggered with
`make test_dags`.
### Project checks
The following commands can be executed at top level (they'll be invoked for all projects),
or inside a single project directory (they'll be triggered for that project only):
* `make lint` triggers project linting.
* `make mypy` triggers type checking.
* `make test` triggers unit/integration tests.
All targets are configured with [tox](
By default, code checks are executed inside a docker container that provides a [Conda
Python]( distribution. They can be run "natively" by passing `SKIP_DOCKER=true`. For example:
make test SKIP_DOCKER=true
## CI
This project does not currently have GitLab runners available. As an ad interim solution,
we mirror to GitHub and run CI atop a `build` Action. `build` is triggered on every push to any branch.
SHELL := /bin/bash SHELL := /bin/bash
include ../Makefile.conda include ../Makefile.python
mypy: ${pip_requirements_test}
${DOCKER_CMD} bash -c "export CONDA_ALWAYS_YES=true; ${CONDA_CMD}; \
pip install -r ${pip_requirements_test}; \
mypy spark"
# TODO(gmodena, 2021-11-01): this conflicts with Makefile layout changes in
lint: ${pip_requirements_test}
# check for syntax errors or undefined names in ${lint_targets} files.
# exit-zero treats all errors as warnings.
# the GitHub editor is 127 chars wide; set that as max-line-length.
${DOCKER_CMD} bash -c "pip install -r ${pip_requirements_test}; ${CONDA_CMD}; \
flake8 ${lint_targets} \
--count \
--max-complexity=10 \
--max-line-length=80 \
--select=E9,F63,F7,F82 \
--show-source \
test: ${pip_requirements_test}
${DOCKER_CMD} bash -c "export CONDA_ALWAYS_YES=true; ${CONDA_CMD}; \
conda install openjdk pyspark==${pyspark_version}; \
pip install -r ${pip_requirements_test}; \
PYTHONPATH=${PYTHONPATH}:spark/ pytest --cov spark tests/"
requires = ["setuptools ~= 58.0", "cython ~= 0.29.0"]
build-backend = "setuptools.build_meta"
pytest==6.2.2 pytest==6.2.2
pytest-spark==0.6.0 pytest-spark==0.6.0
pytest-cov==2.10.1 pytest-cov==2.10.1
flake8==3.8.4 mypy==0.910
mypy==0.910 mypy-extensions==0.4.3
pyspark-stubs==2.4.0post10 typed-ast
requires = tox-conda
envlist = flake8, mypy, pytest
skipsdist = True
setenv =
max-complexity = 10
max-line-length = 127
show-source = true
statistics = true
select = E9,F63,F7,F82
python_version = 3.7
flake8 spark
python -m pytest --cov spark tests/
python -m mypy spark
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment