external_imagerec_prod.hql 1.16 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
-- DDL to create an external table that exposes samples of the 
-- production dataset.
-- The default HDFS location and Hive database are relative to a developer's
-- username. Example: hdfs://analytics-hadoop/user/gmodena/imagerec_prod/data.
--
-- The dataset will be available at https://superset.wikimedia.org/superset/sqllab via the 
-- `presto_analytics` database.
--
-- Execution
-- hive -hiveconf username=<username> -f external_imagerec_prod.hql

-- Run the following statements in the Hive database named by the `username`
-- variable passed on the command line with -hiveconf.
USE ${hiveconf:username};

-- External, tab-delimited text table over the files under the developer's
-- imagerec_prod/data HDFS directory. The statement is guarded by
-- IF NOT EXISTS, so re-running the script leaves an existing table
-- definition untouched.
--
-- Every column is read as a string except insertion_ts, which is DOUBLE:
-- a 4-byte FLOAT has a 24-bit significand and cannot hold present-day epoch
-- values exactly (NOTE(review): assumes insertion_ts is a Unix epoch
-- timestamp - confirm against the producer of the data).
-- A table previously created with the FLOAT column keeps that type until it
-- is altered or dropped and recreated.
--
-- SerDe properties: fields are separated by a tab, and a field whose text is
-- exactly "" (two double-quote characters) is read as NULL.
CREATE EXTERNAL TABLE IF NOT EXISTS `imagerec_prod` (
  `wiki` string,
  `page_id` string,
  `page_title` string,
  `image_id` string,
  `confidence_rating` string,
  `source` string,
  `dataset_id` string,
  `insertion_ts` double
)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'='\t',
  'serialization.format'='\t',
  'serialization.null.format'='""'
)
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://analytics-hadoop/user/${hiveconf:username}/imagerec_prod/data';

Gmodena's avatar
Gmodena committed
36

37
38
-- Update partition metadata.
-- NOTE(review): the CREATE TABLE statement in this file declares no
-- PARTITIONED BY clause, so there are presumably no partitions for this
-- command to register - confirm whether it is still needed.
MSCK REPAIR TABLE `imagerec_prod`;