Ex06
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT hashtag.object, COUNT(*) as amount
FROM
${hiveconf:table} t1
JOIN
${hiveconf:table} t2
ON
t1.subject = t2.subject
JOIN
${hiveconf:table} hashtag
ON
t2.subject = hashtag.subject
WHERE
t1.predicate == 'a' AND
t1.object == 'sib:Post' AND
t2.predicate == 'dc:created' AND
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
hashtag.predicate == 'sib:hashtag'
GROUP BY
hashtag.object
ORDER BY
amount DESC
LIMIT 10;
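-- For reference, the regexp_extract above strips the date portion out of a typed
-- literal. A minimal sketch, assuming dc:created objects of this shape:
--   regexp_extract('"2010-12-24T18:25:43"^^xsd:dateTime',
--                  '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1)   => '2010-12-24'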
-- This is just for the file output
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- First select all posts, then join on the creation date to keep only posts between the given dates.
-- Then join each post with its content (any content) and, via a left outer join, with all of its hashtags.
-- If either the content or the hashtag (or both) matches the keyword, include the post in the output
-- (see the sketch of the LIKE semantics after the query).
-- GROUP BY subject so that each post id is output only once.
SELECT t1.subject
FROM
${hiveconf:table} t1
JOIN
${hiveconf:table} t2
ON
t1.subject = t2.subject
JOIN
${hiveconf:table} content
ON
t2.subject = content.subject
LEFT OUTER JOIN
(SELECT * FROM ${hiveconf:table} WHERE predicate == 'sib:hashtag') hashtag
ON
t2.subject = hashtag.subject
WHERE
t1.predicate == 'a' AND
t1.object == 'sib:Post' AND
t2.predicate == 'dc:created' AND
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
content.predicate == 'sioc:content' AND
(hashtag.object LIKE '%${hiveconf:hashtag}' OR content.object LIKE '%${hiveconf:hashtag}%')
GROUP BY
t1.subject;
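-- A sketch of the LIKE semantics used above (hypothetical values, assuming
-- hashtag=Tunisia and literals of the shapes shown):
--   'dbp:Tunisia' LIKE '%Tunisia'                    => true (hashtag ends with the keyword)
--   '"Protests in Tunisia today"' LIKE '%Tunisia%'   => true (keyword anywhere in the content)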
-- This is just for the file output
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- We go the other way round: we select all users that are NOT IN the active-user set;
-- that set is determined by the posts created within the given date range.
-- (An equivalent anti-join formulation is sketched after the query.)
SELECT t1.subject
FROM ${hiveconf:table} t1
WHERE object == 'sioc:user' AND t1.subject
NOT IN
(SELECT
t12.subject
FROM
${hiveconf:table} t11
JOIN
${hiveconf:table} t12
ON
t11.subject = t12.object
JOIN
${hiveconf:table} t_date
ON
t11.subject = t_date.subject
JOIN
${hiveconf:table} registration
ON
t12.subject = registration.subject
WHERE
t11.predicate == 'a' AND
t11.object == 'sib:Post' AND
t12.predicate == 'sioc:creator_of' AND
t_date.predicate == 'dc:created' AND
regexp_extract(t_date.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
regexp_extract(registration.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) < '${hiveconf:start_date}'
GROUP BY
t12.subject);
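-- An alternative sketch (not from the original): on Hive versions without NOT IN
-- subquery support, the same anti-join can be written as a LEFT OUTER JOIN that
-- keeps only the unmatched users, along these lines:
--   SELECT t1.subject
--   FROM ${hiveconf:table} t1
--   LEFT OUTER JOIN (/* active-user subquery from above */) active
--   ON t1.subject = active.subject
--   WHERE t1.object == 'sioc:user' AND active.subject IS NULL;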
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT * FROM (
SELECT
t_friend.subject, COUNT(*) as amount_in
FROM
${hiveconf:table} t_user
JOIN
${hiveconf:table} t_friend
ON
t_user.subject = t_friend.subject
WHERE
t_user.predicate == 'a' AND
t_user.object == 'sib:User' AND
t_friend.predicate == 'foaf:knows' AND
t_friend.object IN (
SELECT
t_grp.object
FROM
${hiveconf:table} t_grp
WHERE
subject == '${hiveconf:group}' AND
predicate == 'sioc:has_member')
GROUP BY
t_friend.subject
HAVING
COUNT(*) >= 3
) as res
WHERE
res.subject NOT IN (
SELECT
object
FROM
${hiveconf:table}
WHERE
subject == '${hiveconf:group}' AND
predicate == 'sioc:has_member'
);
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT
likes.object, likes.amount, comments.amount, (likes.amount/comments.amount) as ratio
FROM
(SELECT likes.object, COUNT(*) as amount FROM ${hiveconf:table} likes WHERE predicate == 'sib:like' GROUP BY likes.object) as likes
JOIN
(SELECT subject, COUNT(*) as amount FROM ${hiveconf:table} WHERE predicate == 'sioc:container_of' GROUP BY subject) as comments
ON
likes.object = comments.subject
ORDER BY
ratio ASC
LIMIT 50;
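-- Sanity check on the ratio arithmetic: COUNT(*) yields integers, and dividing
-- them with / in Hive produces a double, so a post with 1 like and 19 comments
-- gets ratio 1/19 ≈ 0.0526, matching the first rows of the sample output below.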
REGISTER RDFStorage.jar ;
-- /home/cloudera/Downloads/input/sibdataset200.nt
indata = LOAD '$input_file' USING RDFStorage() AS (s:chararray,p:chararray,o:chararray) ;
likes = FILTER indata BY p == 'sib:like';
likes_group = GROUP likes BY o;
likes_grouped_count = FOREACH likes_group GENERATE group AS object, COUNT(likes) AS amount;
comments = FILTER indata BY p == 'sioc:container_of';
comments_grouped = GROUP comments BY s;
comments_grouped_count = FOREACH comments_grouped GENERATE group AS subject, COUNT(comments) AS amount;
combined = JOIN likes_grouped_count BY object, comments_grouped_count BY subject;
combined_ratio = FOREACH combined GENERATE likes_grouped_count::object AS post, (float)likes_grouped_count::amount/(float)comments_grouped_count::amount AS ratio;
combined_ratio_sorted = ORDER combined_ratio BY ratio ASC;
outdata = LIMIT combined_ratio_sorted 50;
STORE outdata INTO '$output_file' USING PigStorage(',') ;
CREATE EXTERNAL TABLE ${hiveconf:table} (subject STRING, predicate STRING, object STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '${hiveconf:tab_file}';
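-- A quick sanity check after loading (a sketch, not part of the original pipeline):
-- SELECT * FROM ${hiveconf:table} LIMIT 5;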
REGISTER RDFStorage.jar ;
indata = LOAD '$input_file' USING RDFStorage() AS (s,p,o) ;
STORE indata INTO '$output_file' USING PigStorage('\t') ;
Note: date range chosen between '2010-10-14' and '2012-12-31'.
hashtag.object,amount
dbp:Battle,2484
dbp:album,1964
dbp:Adam,1500
dbp:Levine,1500
dbp:The,682
dbp:War,616
dbp:Chris,595
dbp:Antonacci,463
dbp:Biagio,463
dbp:Kirkpatrick,433
t1.subject
sibpo:po7529
sibpo:po7581
sibpo:po7588
sibpo:po7595
WARNING: Results updated; the results below still need a COUNT(*).
Below are just the raw results.
t1.subject
==========
sibu:u139
sibu:u109
sibu:u189
sibu:u105
sibu:u178
sibu:u42
sibu:u128
sibu:u184
sibu:u196
sibu:u92
sibu:u70
sibu:u108
sibu:u174
Note: Used group "g1" from sibdataset400.nt
res.subject,res.amount_in
sibu:u0,6
sibu:u1,3
sibu:u10,3
sibu:u100,7
sibu:u101,7
sibu:u102,5
sibu:u103,3
sibu:u104,7
sibu:u105,5
sibu:u106,4
sibu:u107,5
sibu:u108,7
sibu:u110,4
sibu:u111,7
sibu:u112,3
sibu:u114,3
sibu:u115,10
sibu:u117,11
sibu:u118,3
sibu:u120,3
sibu:u122,7
sibu:u123,9
sibu:u127,4
sibu:u13,4
sibu:u131,6
sibu:u133,4
sibu:u134,3
sibu:u135,9
sibu:u139,4
sibu:u140,7
sibu:u146,3
sibu:u149,3
sibu:u152,4
sibu:u153,3
sibu:u157,5
sibu:u158,5
sibu:u16,3
sibu:u162,4
sibu:u165,3
sibu:u168,6
sibu:u169,5
sibu:u170,3
sibu:u172,4
sibu:u174,6
sibu:u176,3
sibu:u179,3
sibu:u181,4
sibu:u186,6
sibu:u191,3
sibu:u194,5
sibu:u20,3
sibu:u204,3
sibu:u205,3
sibu:u207,4
sibu:u209,5
sibu:u21,4
sibu:u210,5
sibu:u211,3
sibu:u212,5
sibu:u213,4
sibu:u214,4
sibu:u219,3
sibu:u22,3
sibu:u225,3
sibu:u226,3
sibu:u227,7
sibu:u231,3
sibu:u236,3
sibu:u237,3
sibu:u238,3
sibu:u24,4
sibu:u241,4
sibu:u244,3
sibu:u251,4
sibu:u252,5
sibu:u254,3
sibu:u265,5
sibu:u271,5
sibu:u273,3
sibu:u277,4
sibu:u28,4
sibu:u280,4
sibu:u287,4
sibu:u29,7
sibu:u291,4
sibu:u295,3
sibu:u296,3
sibu:u299,3
sibu:u30,3
sibu:u305,5
sibu:u307,3
sibu:u310,5
sibu:u318,3
sibu:u320,3
sibu:u324,5
sibu:u325,6
sibu:u326,5
sibu:u327,4
sibu:u328,4
sibu:u334,3
sibu:u336,3
sibu:u338,3
sibu:u34,8
sibu:u340,3
sibu:u341,3
sibu:u345,4
sibu:u350,5
sibu:u351,3
sibu:u352,7
sibu:u353,4
sibu:u357,3
sibu:u36,3
sibu:u360,3
sibu:u362,3
sibu:u364,3
sibu:u375,3
sibu:u378,4
sibu:u387,6
sibu:u39,3
sibu:u392,3
sibu:u394,3
sibu:u398,3
sibu:u40,7
sibu:u44,3
sibu:u48,3
sibu:u5,4
sibu:u51,3
sibu:u54,3
sibu:u58,4
sibu:u59,3
sibu:u6,5
sibu:u60,3
sibu:u67,4
sibu:u69,3
sibu:u70,4
sibu:u71,6
sibu:u72,7
sibu:u73,6
sibu:u74,6
sibu:u76,4
sibu:u79,7
sibu:u81,6
sibu:u82,11
sibu:u84,4
sibu:u85,5
sibu:u88,30
sibu:u89,7
sibu:u91,10
sibu:u92,4
sibu:u93,4
sibu:u94,4
sibu:u96,5
sibu:u98,9
likes.object likes.amount comments.amount ratio
sibpo:po10978 1 19 0.0526315789474
sibpo:po7126 1 19 0.0526315789474
sibpo:po15828 1 19 0.0526315789474
sibpo:po20321 1 19 0.0526315789474
sibpo:po28088 1 19 0.0526315789474
sibpo:po9725 1 19 0.0526315789474
sibpo:po9717 1 19 0.0526315789474
sibpo:po22738 1 19 0.0526315789474
sibpo:po4385 1 19 0.0526315789474
sibpo:po13308 1 19 0.0526315789474
sibpo:po12674 1 18 0.0555555555556
sibpo:po15728 1 18 0.0555555555556
sibpo:po22771 1 18 0.0555555555556
sibpo:po9562 1 18 0.0555555555556
sibpo:po6463 1 18 0.0555555555556
sibpo:po27731 1 18 0.0555555555556
sibpo:po24746 1 18 0.0555555555556
sibpo:po22710 1 18 0.0555555555556
sibpo:po11611 1 18 0.0555555555556
sibpo:po15973 1 18 0.0555555555556
sibpo:po7489 1 18 0.0555555555556
sibpo:po7482 1 18 0.0555555555556
sibpo:po8815 1 18 0.0555555555556
sibpo:po8380 1 17 0.0588235294118
sibpo:po19688 1 17 0.0588235294118
sibpo:po28094 1 17 0.0588235294118
sibpo:po8986 1 17 0.0588235294118
sibpo:po11409 1 17 0.0588235294118
sibpo:po24323 1 17 0.0588235294118
sibpo:po30107 1 17 0.0588235294118
sibpo:po26033 1 17 0.0588235294118
sibpo:po8313 1 17 0.0588235294118
sibpo:po3769 1 17 0.0588235294118
sibpo:po15757 1 17 0.0588235294118
sibpo:po21688 1 17 0.0588235294118
sibpo:po5376 1 17 0.0588235294118
sibpo:po23437 1 17 0.0588235294118
sibpo:po6001 1 17 0.0588235294118
sibpo:po28991 1 17 0.0588235294118
sibpo:po18156 1 17 0.0588235294118
sibpo:po8749 1 17 0.0588235294118
sibpo:po6246 1 17 0.0588235294118
sibpo:po21917 1 17 0.0588235294118
sibpo:po20952 1 17 0.0588235294118
sibpo:po21876 1 17 0.0588235294118
sibpo:po16522 1 17 0.0588235294118
sibpo:po23509 1 17 0.0588235294118
sibpo:po14344 1 17 0.0588235294118
sibpo:po9447 1 16 0.0625
sibpo:po7747 1 16 0.0625
#!/bin/sh
# Full path to input file or dir
INPUT_FILE='/home/cloudera/Downloads/input/sibdataset200.nt'
INPUT_FILE400='/home/cloudera/Downloads/input/sibdataset400.nt'
# Output path for tab dir (from which we move to hdfs)
TAB_DIR='/home/cloudera/Downloads/ex6/'
# Name of the tab file
TAB_FILE='sibdataset_tab'
TAB_FILE400='sibdataset_tab400'
# Directory on HDFS where to write and read the tab file from
HDFS_DIR='/user/cloudera/'
# Name of the hive table
TABLE='ewaldf_sibdataset'
TABLE400='ewaldf_sibdataset400'
# Prepare the tabbed output (only run this once).
# To repeat the run, first "DROP TABLE $TABLE" and delete the output files
# (see the cleanup sketch at the end of this script) :-)
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR$TAB_FILE prepare.pig
hadoop fs -put $TAB_DIR$TAB_FILE/ $HDFS_DIR
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE -hiveconf table=$TABLE -f prepare.hive
# Prepare the 400er output
pig -x local -param input_file=$INPUT_FILE400 -param output_file=$TAB_DIR$TAB_FILE400 prepare.pig
hadoop fs -put $TAB_DIR$TAB_FILE400/ $HDFS_DIR
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE400 -hiveconf table=$TABLE400 -f prepare.hive
# Task 6_1
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_1 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_1.hive
# Task 6_2
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_2 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -hiveconf hashtag=Tunisia -f 6_2.hive
# Task 6_3
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_3 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_3.hive
# Task 6_4
hive -hiveconf table=$TABLE400 -hiveconf output_file=$TAB_DIR/6_4 -hiveconf group=sibg:g1 -f 6_4.hive
# Task 6_5_1
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_5_1 -f 6_5_1.hive
# Task 6_5_2
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR/6_5_2 6_5_2.pig
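# Hypothetical cleanup for a re-run (a sketch using the variables above; not part
# of the original script). Uncomment to drop the tables and remove generated files:
# hive -e "DROP TABLE $TABLE; DROP TABLE $TABLE400"
# hadoop fs -rm -r $HDFS_DIR$TAB_FILE $HDFS_DIR$TAB_FILE400
# rm -rf $TAB_DIR$TAB_FILE $TAB_DIR$TAB_FILE400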