	ssh ${airflow_host} "sudo -u ${airflow_user} tar xzf ${gitlab_package_archive} -C ${airflow_home}";
	for target in $(shell echo ${TARGETS}); do \
		ssh ${airflow_host} "sudo -u ${airflow_user} tar xvzf ${airflow_home}/$$target/${venv_archive} -C ${airflow_home}/$$target/venv"; \
	done
	ssh ${airflow_host} "rm ${gitlab_package_archive}"
## Code checks
# Run linting on all projects
lint-all:
	cd ${ima_home}; make lint
	for target in $(shell echo ${TARGETS}); do \
		make lint -C $$target; \
	done
# Run the test suite on all projects
test-all:
	for target in $(shell echo ${TARGETS}); do \
		make test -C $$target; \
	done
# Run static type checks on all projects.
mypy-all:
	for target in $(shell echo ${TARGETS}); do \
		make mypy -C $$target; \
	done
# Run the top level airflow dags test suite
test-dags: ${pip_requirements_test}
	${DOCKER_CMD} bash -c "tox -e dags"
...
...
@@ -47,13 +69,16 @@ test_dags:
echo"WARNING: deprecated. Use make test-dags instead"
make test-dags
test-all:
	cd ${ima_home}; make test
## Package dags and project dependencies.
archive: ima-venv
	tar cvz --exclude='.[^/]*' --exclude='__pycache__' --exclude='venv/' -f platform-airflow-dags.tar.gz *
archive:
# Build a virtual environment for a datapipeline project.
	for target in $(shell echo ${TARGETS}); do \
		rm -f $$target/${venv_archive}; \
		make venv -C $$target; \
	done
# Archive the projects and virtual environments.
# This is the artifact that will be deployed on ${airflow_host}.
	tar cvz --exclude='.[^/]*' --exclude='datapipeline-scaffold/*' --exclude='__pycache__' --exclude='venv/*' --exclude=${gitlab_package_archive} -f ${gitlab_package_archive} dags $(shell echo ${TARGETS})
# Publish an artifact to the GitLab generic package registry using a private token.
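# A hedged sketch of what such a recipe could look like. The target name and the
# GITLAB_API_URL, GITLAB_PROJECT_ID, GITLAB_PRIVATE_TOKEN and package_version
# variables are illustrative assumptions, not part of this Makefile. GitLab's
# generic package upload endpoint is:
#   PUT /projects/:id/packages/generic/:package_name/:package_version/:file_name
publish:
	curl --fail --header "PRIVATE-TOKEN: ${GITLAB_PRIVATE_TOKEN}" \
		--upload-file ${gitlab_package_archive} \
		"${GITLAB_API_URL}/projects/${GITLAB_PROJECT_ID}/packages/generic/platform-airflow-dags/${package_version}/${gitlab_package_archive}"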
Tools provided by this repository require [Docker](https://www.docker.com/).
# Data pipelines
> […] a pipeline, also known as a data pipeline, is a set of data processing elements connected in series, where the output of one element is the input of the next one. The elements of a pipeline are often executed in parallel or in time-sliced fashion. […]
>
> https://en.wikipedia.org/wiki/Pipeline_(computing)
...
...
@@ -22,6 +26,32 @@ A Generated Datasets Platform pipeline is made up by two components:
Data pipelines are executed on Hadoop. Elastic compute is provided by Spark (jobs are deployed in cluster mode). Scheduling and orchestration are delegated to Apache Airflow. Currently we support Python-based projects; Scala support is planned.
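New pipeline projects are scaffolded from the template shipped in this repository. Below is a hedged sketch of the scaffolding step, assuming it is driven by [cookiecutter](https://cookiecutter.readthedocs.io/) (the template variables in the generated README hint at this) and that the template lives in the `datapipeline-scaffold` directory the `Makefile` excludes from archives; the exact command and prompts may differ.

```bash
# Scaffold a new data pipeline project from the repository template,
# answering the interactive prompts (e.g. pipeline name and owner).
cookiecutter datapipeline-scaffold
```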
This will generate a new directory for pipeline code under:
```bash
your_data_pipeline
```
It will also install an Airflow DAG template under:
```bash
dags/your_data_pipeline_dag.py
```
## Repo layout
This repository follows a [monorepo](https://en.wikipedia.org/wiki/Monorepo) strategy. Its structure matches the layout of `AIRFLOW_HOME` on the [an-airflow1003.eqiad.wmnet](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Airflow#platform_eng) airflow instance.
...
...
@@ -38,6 +68,16 @@ The following command will run code checks and deploy data pipelines:
```
make deploy-local-build
```
### Deploy a new pipeline
Deployment pipelines are declared in the `TARGETS` variable in `Makefile`.
To deploy a new pipeline, append its project directory name to `TARGETS`.
For example, if a new pipeline has been created as `my_new_datapipeline`, its
directory name must be added to the `TARGETS` declaration, as sketched below.
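A minimal sketch of the resulting declaration, assuming `TARGETS` previously listed a single, hypothetical project called `existing_datapipeline` (the assignment operator and the existing entries in the real `Makefile` may differ):

```
# Projects picked up by lint-all, test-all, mypy-all, archive and the deploy targets.
TARGETS := existing_datapipeline my_new_datapipeline
```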
> A boilerplate README.md for your Generated Datasets Platform pipeline.
Take a look at our [documentation]() before getting started.
This pipeline is owned by {{cookiecutter.pipeline_owner}}.
# Guidelines
In order to get the best out of the template:
* Don't modify the Makefiles and Dockerfiles we provide.
* Don't remove any lines from the tox.ini file we provide.
* Don't commit data to git.
* Don't commit any credentials or local configuration files.
* Convert Jupyter Notebooks you'd like to schedule to a script with `jupyter nbconvert --to script notebook.ipynb`.
* Install Docker or Docker Desktop on your development machine.
You can read more about our guidelines, code checks and contribution model
in our [documentation]().
# Content
- `conf` contains Spark job-specific config files. `spark.properties` lets you define your cluster topology and desired resources. We default to a [yarn-regular](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Spark#Spark_Resource_Settings) sized cluster; a sketch of this file follows the list.
- `pyspark` contains Spark-based data processing tasks.
- `sql` contains SQL/HQL-based data processing tasks.
- `test` contains the project's test suite.
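For reference, a hedged sketch of the kind of settings `conf/spark.properties` can carry; the keys below are standard Spark configuration properties, but the actual defaults shipped with the scaffold are assumptions:

```
# conf/spark.properties (illustrative values, not the scaffold defaults)
spark.master                          yarn
spark.submit.deployMode               cluster
spark.executor.memory                 4g
spark.executor.cores                  2
spark.dynamicAllocation.enabled       true
spark.dynamicAllocation.maxExecutors  64
```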
An Airflow DAG template has been created in the monorepo's top-level `dags` directory.