Skip to content

Instantly share code, notes, and snippets.

@davidgin
Forked from tovbinm/query.hive.template
Created January 18, 2013 19:59
Show Gist options
  • Save davidgin/4567963 to your computer and use it in GitHub Desktop.
Save davidgin/4567963 to your computer and use it in GitHub Desktop.
ADD JAR s3://<s3-bucket>/jars/hive_contrib-0.5.jar;
CREATE TEMPORARY FUNCTION now as 'com.mt.utils.udf.Now';
CREATE TEMPORARY FUNCTION user_agent_f as 'com.mt.utils.UserAgent';
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.size.per.task=500000000;
CREATE EXTERNAL TABLE data
(
ip STRING,
user_id STRING,
time_stamp STRING,
hit_url STRING,
http_response_code STRING,
referrer STRING,
user_agent STRING,
country_code STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t\n]*)?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s")
LOCATION "s3://<s3-bucket>/data/$dt";
INSERT OVERWRITE DIRECTORY "s3://<s3-bucket>/results/<my-query-name>/"
SELECT
from_unixtime(cast(time_stamp as BIGINT), 'yyyyMMdd') as date_id,
from_unixtime(cast(time_stamp as BIGINT), 'HH') as hour_id,
parse_url(concat("http://bla.com",hit_url), 'QUERY', 't') as container_id,
regexp_extract(user_agent_f('BROWSER',user_agent),"(\\D*)",1) as browser,
user_agent_f('BR_VERSION',user_agent) as browser_version,
country_code,
count(1) as page_views_count
FROM data
GROUP BY
from_unixtime(cast(time_stamp as BIGINT), 'yyyyMMdd') ,
from_unixtime(cast(time_stamp as BIGINT), 'HH'),
parse_url(concat("http://bla.com",hit_url), 'QUERY', 't'),
regexp_extract(user_agent_f('BROWSER',user_agent),"(\\D*)",1) , user_agent_f('BR_VERSION',user_agent),
country_code;
EXIT;
#!/bin/bash
fldr=$1
bucket=$2
num_of_instances=$3 #10
alias ssh='ssh -o StrictHostKeyChecking=no'
#dt=20110727
dt=`date --date='1 days ago' +%Y%m%d`
sed 's/$dt/'"${dt}"'/g' $fldr/query.hive.template > $fldr/query.hive
#upload hive query and jars
s3cmd -c $fldr/credentials/s3cmd.cfg put $fldr/query.hive s3://$bucket/hive-queries/
s3cmd -c $fldr/credentials/s3cmd.cfg put $fldr/jars/hive_contrib-0.5.jar s3://$bucket/jars/
#execute elastic MR hive query
$fldr/elastic-mapreduce-cli/elastic-mapreduce \
--create --name "Query (by $USER, created at: $(date +%H:%M:%S-%e/%m/%Y))" \
--num-instances $num_of_instances \
--instance-type m1.large \
--credentials "$fldr/credentials/elastic.json" \
--hive-script "s3://$bucket/hive-queries/query.hive" \
--hive-versions 0.7.1 \
# --logs
# --verbose \
# --trace \
# --debug
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment