Created
March 15, 2012 21:46
-
-
Save tovbinm/2047198 to your computer and use it in GitHub Desktop.
EMR & S3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ADD JAR s3://<s3-bucket>/jars/hive_contrib-0.5.jar; | |
CREATE TEMPORARY FUNCTION now as 'com.mt.utils.udf.Now'; | |
CREATE TEMPORARY FUNCTION user_agent_f as 'com.mt.utils.UserAgent'; | |
set hive.merge.mapredfiles=true; | |
set hive.merge.mapfiles=true; | |
set hive.merge.size.per.task=500000000; | |
CREATE EXTERNAL TABLE data | |
( | |
ip STRING, | |
user_id STRING, | |
time_stamp STRING, | |
hit_url STRING, | |
http_response_code STRING, | |
referrer STRING, | |
user_agent STRING, | |
country_code STRING | |
) | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' | |
WITH SERDEPROPERTIES ( | |
"input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t\n]*)?", | |
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s") | |
LOCATION "s3://<s3-bucket>/data/$dt"; | |
INSERT OVERWRITE DIRECTORY "s3://<s3-bucket>/results/<my-query-name>/" | |
SELECT | |
from_unixtime(cast(time_stamp as BIGINT), 'yyyyMMdd') as date_id, | |
from_unixtime(cast(time_stamp as BIGINT), 'HH') as hour_id, | |
parse_url(concat("http://bla.com",hit_url), 'QUERY', 't') as container_id, | |
regexp_extract(user_agent_f('BROWSER',user_agent),"(\\D*)",1) as browser, | |
user_agent_f('BR_VERSION',user_agent) as browser_version, | |
country_code, | |
count(1) as page_views_count | |
FROM data | |
GROUP BY | |
from_unixtime(cast(time_stamp as BIGINT), 'yyyyMMdd') , | |
from_unixtime(cast(time_stamp as BIGINT), 'HH'), | |
parse_url(concat("http://bla.com",hit_url), 'QUERY', 't'), | |
regexp_extract(user_agent_f('BROWSER',user_agent),"(\\D*)",1) , user_agent_f('BR_VERSION',user_agent), | |
country_code; | |
EXIT; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
fldr=$1 | |
bucket=$2 | |
num_of_instances=$3 #10 | |
alias ssh='ssh -o StrictHostKeyChecking=no' | |
#dt=20110727 | |
dt=`date --date='1 days ago' +%Y%m%d` | |
sed 's/$dt/'"${dt}"'/g' $fldr/query.hive.template > $fldr/query.hive | |
#upload hive query and jars | |
s3cmd -c $fldr/credentials/s3cmd.cfg put $fldr/query.hive s3://$bucket/hive-queries/ | |
s3cmd -c $fldr/credentials/s3cmd.cfg put $fldr/jars/hive_contrib-0.5.jar s3://$bucket/jars/ | |
#execute elastic MR hive query | |
$fldr/elastic-mapreduce-cli/elastic-mapreduce \ | |
--create --name "Query (by $USER, created at: $(date +%H:%M:%S-%e/%m/%Y))" \ | |
--num-instances $num_of_instances \ | |
--instance-type m1.large \ | |
--credentials "$fldr/credentials/elastic.json" \ | |
--hive-script "s3://$bucket/hive-queries/query.hive" \ | |
--hive-versions 0.7.1 \ | |
# --logs | |
# --verbose \ | |
# --trace \ | |
# --debug |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment