Commit 6c6a519e authored by Gmodena's avatar Gmodena
Browse files

Add base docker compose config

parent 6f16bf2a
# Download and prepare the imagerec_prod dataset for loading into Cassandra.
#
# Targets:
#   download  fetch the dataset archive unless it is already present
#   data      unpack the archive and build a single shuffled matches.tsv
#   clean     remove the unpacked dataset directory and the archive

dataset_variant := api
dataset := imagerec_prod
dataset_archive := ${dataset}.tar.bz2
dataset_url := https://analytics.wikimedia.org/published/datasets/one-off/platform-imagematching/${dataset_variant}/${dataset_archive}

# None of these targets name a file they produce.
.PHONY: download data clean

# `test -f` checks that the archive file exists. A bare `test STRING` is true
# for any non-empty string, which would skip the download every time.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as the archive.
download:
	test -f ${dataset_archive} || curl --fail --location -o ${dataset_archive} ${dataset_url}

# Unpack into ${dataset}, concatenate the prod* files in random order into
# matches.tsv, then delete the per-file inputs.
data: download
	mkdir -p ${dataset}
	tar xvjf ${dataset_archive} -C ${dataset}
	cat ${dataset}/prod* | shuf > ${dataset}/matches.tsv
	rm ${dataset}/prod*

# -f keeps `make clean` from failing when there is nothing to remove.
clean:
	rm -rf ${dataset} ${dataset_archive}
# wmf-cassandra-imagematching
A Docker Compose configuration for testing/developing Cassandra ingestion of IMA data.
# Requirements
You will need Docker Engine and Docker Compose. On non-Linux systems you will also need to install
`coreutils`, which provides the `shuf` command used by `make data`.
# Data preparation
Run
```
$ make data
```
The command will download the latest available `imagerec_prod` tarball, combine the wiki files into a single dataset,
and shuffle the records. Output will be available under `imagerec_prod`.
# Running
```
$ docker-compose <up|down> [--build] cassandra-load-imagerec
```
-- Schema and bulk load for the imagerec keyspace.
-- Based on @eevans example at https://gist.github.com/eevans/d2e06b8163e05f7e2fbd6691ac7caccb

-- Start from a clean slate: any existing imagerec keyspace (and its data) is dropped.
DROP KEYSPACE IF EXISTS imagerec;

CREATE KEYSPACE imagerec
    WITH replication = {
        'class': 'SimpleStrategy',
        'replication_factor': 1
    };

-- Rows are partitioned by (wiki, page_id) and clustered by image_id.
CREATE TABLE imagerec.matches (
    page_id           text,
    page_title        text,
    image_id          text,
    confidence_rating text,
    source            text,
    dataset_id        text,
    creation_time     float,
    wiki              text,
    found_on          text,
    PRIMARY KEY ((wiki, page_id), image_id)
);

-- Import the tab-separated dataset; empty fields are loaded as NULL.
-- NOTE(review): the path is relative, so it presumably resolves against the
-- directory cqlsh is started from - confirm against the loader's working directory.
COPY imagerec.matches (
    page_id,
    page_title,
    image_id,
    confidence_rating,
    source,
    dataset_id,
    creation_time,
    wiki,
    found_on
)
    FROM 'imagerec_prod/matches.tsv'
    WITH HEADER = true AND DELIMITER = '\t' AND NULL = '';
# Local Cassandra node plus a one-shot loader container that runs
# /imagerec.cql (schema creation and import of imagerec_prod/matches.tsv).
version: "3.8"

services:
  # Single Cassandra node; its data directory is kept in ./cassandra_storage.
  cassandra:
    image: cassandra:latest
    container_name: cassandra
    restart: always
    environment:
      - HEAP_NEWSIZE=128M
      - MAX_HEAP_SIZE=1024M
    volumes:
      - ./cassandra_storage:/var/lib/cassandra
    ports:
      # Quoted so HOST:CONTAINER pairs are always parsed as strings.
      - "7000:7000"
      - "9042:9042"

  # Runs the CQL script against the `cassandra` service, then exits.
  cassandra-load-imagerec:
    container_name: cassandra-load-imagerec
    image: cassandra:latest
    environment:
      - HEAP_NEWSIZE=128M
      - MAX_HEAP_SIZE=1024M
    depends_on:
      - cassandra
    volumes:
      - ./ddl/imagerec.cql:/imagerec.cql:ro
      - ./imagerec_prod:/imagerec_prod:ro
    # The CQL script reads 'imagerec_prod/matches.tsv' via a relative path;
    # pin the working directory so it resolves to the mounted /imagerec_prod.
    working_dir: /
    # depends_on only orders container start; it does not wait for Cassandra
    # to accept connections. Poll with cqlsh until the node answers instead of
    # relying on a fixed sleep, then run the script.
    command:
      - /bin/bash
      - "-c"
      - |
        until cqlsh cassandra -e 'DESCRIBE KEYSPACES' > /dev/null 2>&1; do
          sleep 5
        done
        cqlsh cassandra -f /imagerec.cql
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment