Ex06
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT hashtag.object, COUNT(*) as amount
FROM
${hiveconf:table} t1
JOIN
${hiveconf:table} t2
ON
t1.subject = t2.subject
JOIN
${hiveconf:table} hashtag
ON
t2.subject = hashtag.subject
WHERE
t1.predicate == 'a' AND
t1.object == 'sib:Post' AND
t2.predicate == 'dc:created' AND
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
hashtag.predicate == 'sib:hashtag'
GROUP BY
hashtag.object
ORDER BY
amount DESC
LIMIT 10;
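-- For reference, the regexp_extract above strips the date portion out of a typed
-- literal. A minimal sketch, assuming dc:created objects of this shape:
--   regexp_extract('"2010-12-24T18:25:43"^^xsd:dateTime',
--                  '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1)   => '2010-12-24'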
-- This is just for the file output
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- First select all posts, then join on the creation date to keep only posts between the given dates.
-- Then join each post with its content (any content) and, via a left outer join, with all of its hashtags.
-- If either the content or the hashtag (or both) matches the keyword, include the post in the output
-- (see the sketch of the LIKE semantics after the query).
-- GROUP BY subject so that each post id is output only once.
SELECT t1.subject
FROM
${hiveconf:table} t1
JOIN
${hiveconf:table} t2
ON
t1.subject = t2.subject
JOIN
${hiveconf:table} content
ON
t2.subject = content.subject
LEFT OUTER JOIN
(SELECT * FROM ${hiveconf:table} WHERE predicate == 'sib:hashtag') hashtag
ON
t2.subject = hashtag.subject
WHERE
t1.predicate == 'a' AND
t1.object == 'sib:Post' AND
t2.predicate == 'dc:created' AND
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
content.predicate == 'sioc:content' AND
(hashtag.object LIKE '%${hiveconf:hashtag}' OR content.object LIKE '%${hiveconf:hashtag}%')
GROUP BY
t1.subject;
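-- A sketch of the LIKE semantics used above (hypothetical values, assuming
-- hashtag=Tunisia and literals of the shapes shown):
--   'dbp:Tunisia' LIKE '%Tunisia'                    => true (hashtag ends with the keyword)
--   '"Protests in Tunisia today"' LIKE '%Tunisia%'   => true (keyword anywhere in the content)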
-- This is just for the file output
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- We go the other way round: we select all users that are NOT IN the active-user set;
-- that set is determined by the posts created within the given date range.
-- (An equivalent anti-join formulation is sketched after the query.)
SELECT t1.subject
FROM ${hiveconf:table} t1
WHERE object == 'sioc:user' AND t1.subject
NOT IN
(SELECT
t12.subject
FROM
${hiveconf:table} t11
JOIN
${hiveconf:table} t12
ON
t11.subject = t12.object
JOIN
${hiveconf:table} t_date
ON
t11.subject = t_date.subject
JOIN
${hiveconf:table} registration
ON
t12.subject = registration.subject
WHERE
t11.predicate == 'a' AND
t11.object == 'sib:Post' AND
t12.predicate == 'sioc:creator_of' AND
t_date.predicate == 'dc:created' AND
regexp_extract(t_date.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND
regexp_extract(registration.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) < '${hiveconf:start_date}'
GROUP BY
t12.subject);
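-- An alternative sketch (not from the original): on Hive versions without NOT IN
-- subquery support, the same anti-join can be written as a LEFT OUTER JOIN that
-- keeps only the unmatched users, along these lines:
--   SELECT t1.subject
--   FROM ${hiveconf:table} t1
--   LEFT OUTER JOIN (/* active-user subquery from above */) active
--   ON t1.subject = active.subject
--   WHERE t1.object == 'sioc:user' AND active.subject IS NULL;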
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT * FROM (
SELECT
t_friend.subject, COUNT(*) as amount_in
FROM
${hiveconf:table} t_user
JOIN
${hiveconf:table} t_friend
ON
t_user.subject = t_friend.subject
WHERE
t_user.predicate == 'a' AND
t_user.object == 'sib:User' AND
t_friend.predicate == 'foaf:knows' AND
t_friend.object IN (
SELECT
t_grp.object
FROM
${hiveconf:table} t_grp
WHERE
subject == '${hiveconf:group}' AND
predicate == 'sioc:has_member')
GROUP BY
t_friend.subject
HAVING
COUNT(*) >= 3
) as res
WHERE
res.subject NOT IN (
SELECT
object
FROM
${hiveconf:table}
WHERE
subject == '${hiveconf:group}' AND
predicate == 'sioc:has_member'
);
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
SELECT
likes.object, likes.amount, comments.amount, (likes.amount/comments.amount) as ratio
FROM
(SELECT likes.object, COUNT(*) as amount FROM ${hiveconf:table} likes WHERE predicate == 'sib:like' GROUP BY likes.object) as likes
JOIN
(SELECT subject, COUNT(*) as amount FROM ${hiveconf:table} WHERE predicate == 'sioc:container_of' GROUP BY subject) as comments
ON
likes.object = comments.subject
ORDER BY
ratio ASC
LIMIT 50;
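-- Sanity check on the ratio arithmetic: COUNT(*) yields integers, and dividing
-- them with / in Hive produces a double, so a post with 1 like and 19 comments
-- gets ratio 1/19 ≈ 0.0526, matching the first rows of the sample output below.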
REGISTER RDFStorage.jar ;
-- /home/cloudera/Downloads/input/sibdataset200.nt
indata = LOAD '$input_file' USING RDFStorage() AS (s:chararray,p:chararray,o:chararray) ;
likes = FILTER indata BY p == 'sib:like';
likes_group = GROUP likes BY o;
likes_grouped_count = FOREACH likes_group GENERATE group AS object, COUNT(likes) AS amount;
comments = FILTER indata BY p == 'sioc:container_of';
comments_grouped = GROUP comments BY s;
comments_grouped_count = FOREACH comments_grouped GENERATE group AS subject, COUNT(comments) AS amount;
combined = JOIN likes_grouped_count BY object, comments_grouped_count BY subject;
combined_ratio = FOREACH combined GENERATE likes_grouped_count::object AS post, (float)likes_grouped_count::amount/(float)comments_grouped_count::amount AS ratio;
combined_ratio_sorted = ORDER combined_ratio BY ratio ASC;
outdata = LIMIT combined_ratio_sorted 50;
STORE outdata INTO '$output_file' USING PigStorage(',') ;
CREATE EXTERNAL TABLE ${hiveconf:table} (subject STRING, predicate STRING, object STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '${hiveconf:tab_file}';
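-- A quick sanity check after loading (a sketch, not part of the original pipeline):
-- SELECT * FROM ${hiveconf:table} LIMIT 5;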
REGISTER RDFStorage.jar ;
indata = LOAD '$input_file' USING RDFStorage() AS (s,p,o) ;
STORE indata INTO '$output_file' USING PigStorage('\t') ;
Note: date range chosen between '2010-10-14' and '2012-12-31'.
hashtag.object,amount
dbp:Battle,2484
dbp:album,1964
dbp:Adam,1500
dbp:Levine,1500
dbp:The,682
dbp:War,616
dbp:Chris,595
dbp:Antonacci,463
dbp:Biagio,463
dbp:Kirkpatrick,433
t1.subject
sibpo:po7529
sibpo:po7581
sibpo:po7588
sibpo:po7595
WARNING: Results updated; the results below still need a COUNT(*).
Below are just the raw results.
t1.subject
==========
sibu:u139
sibu:u109
sibu:u189
sibu:u105
sibu:u178
sibu:u42
sibu:u128
sibu:u184
sibu:u196
sibu:u92
sibu:u70
sibu:u108
sibu:u174
Note: Used group "g1" from sibdataset400.nt
res.subject,res.amount_in
sibu:u0,6
sibu:u1,3
sibu:u10,3
sibu:u100,7
sibu:u101,7
sibu:u102,5
sibu:u103,3
sibu:u104,7
sibu:u105,5
sibu:u106,4
sibu:u107,5
sibu:u108,7
sibu:u110,4
sibu:u111,7
sibu:u112,3
sibu:u114,3
sibu:u115,10
sibu:u117,11
sibu:u118,3
sibu:u120,3
sibu:u122,7
sibu:u123,9
sibu:u127,4
sibu:u13,4
sibu:u131,6
sibu:u133,4
sibu:u134,3
sibu:u135,9
sibu:u139,4
sibu:u140,7
sibu:u146,3
sibu:u149,3
sibu:u152,4
sibu:u153,3
sibu:u157,5
sibu:u158,5
sibu:u16,3
sibu:u162,4
sibu:u165,3
sibu:u168,6
sibu:u169,5
sibu:u170,3
sibu:u172,4
sibu:u174,6
sibu:u176,3
sibu:u179,3
sibu:u181,4
sibu:u186,6
sibu:u191,3
sibu:u194,5
sibu:u20,3
sibu:u204,3
sibu:u205,3
sibu:u207,4
sibu:u209,5
sibu:u21,4
sibu:u210,5
sibu:u211,3
sibu:u212,5
sibu:u213,4
sibu:u214,4
sibu:u219,3
sibu:u22,3
sibu:u225,3
sibu:u226,3
sibu:u227,7
sibu:u231,3
sibu:u236,3
sibu:u237,3
sibu:u238,3
sibu:u24,4
sibu:u241,4
sibu:u244,3
sibu:u251,4
sibu:u252,5
sibu:u254,3
sibu:u265,5
sibu:u271,5
sibu:u273,3
sibu:u277,4
sibu:u28,4
sibu:u280,4
sibu:u287,4
sibu:u29,7
sibu:u291,4
sibu:u295,3
sibu:u296,3
sibu:u299,3
sibu:u30,3
sibu:u305,5
sibu:u307,3
sibu:u310,5
sibu:u318,3
sibu:u320,3
sibu:u324,5
sibu:u325,6
sibu:u326,5
sibu:u327,4
sibu:u328,4
sibu:u334,3
sibu:u336,3
sibu:u338,3
sibu:u34,8
sibu:u340,3
sibu:u341,3
sibu:u345,4
sibu:u350,5
sibu:u351,3
sibu:u352,7
sibu:u353,4
sibu:u357,3
sibu:u36,3
sibu:u360,3
sibu:u362,3
sibu:u364,3
sibu:u375,3
sibu:u378,4
sibu:u387,6
sibu:u39,3
sibu:u392,3
sibu:u394,3
sibu:u398,3
sibu:u40,7
sibu:u44,3
sibu:u48,3
sibu:u5,4
sibu:u51,3
sibu:u54,3
sibu:u58,4
sibu:u59,3
sibu:u6,5
sibu:u60,3
sibu:u67,4
sibu:u69,3
sibu:u70,4
sibu:u71,6
sibu:u72,7
sibu:u73,6
sibu:u74,6
sibu:u76,4
sibu:u79,7
sibu:u81,6
sibu:u82,11
sibu:u84,4
sibu:u85,5
sibu:u88,30
sibu:u89,7
sibu:u91,10
sibu:u92,4
sibu:u93,4
sibu:u94,4
sibu:u96,5
sibu:u98,9
likes.object likes.amount comments.amount ratio
sibpo:po10978 1 19 0.0526315789474
sibpo:po7126 1 19 0.0526315789474
sibpo:po15828 1 19 0.0526315789474
sibpo:po20321 1 19 0.0526315789474
sibpo:po28088 1 19 0.0526315789474
sibpo:po9725 1 19 0.0526315789474
sibpo:po9717 1 19 0.0526315789474
sibpo:po22738 1 19 0.0526315789474
sibpo:po4385 1 19 0.0526315789474
sibpo:po13308 1 19 0.0526315789474
sibpo:po12674 1 18 0.0555555555556
sibpo:po15728 1 18 0.0555555555556
sibpo:po22771 1 18 0.0555555555556
sibpo:po9562 1 18 0.0555555555556
sibpo:po6463 1 18 0.0555555555556
sibpo:po27731 1 18 0.0555555555556
sibpo:po24746 1 18 0.0555555555556
sibpo:po22710 1 18 0.0555555555556
sibpo:po11611 1 18 0.0555555555556
sibpo:po15973 1 18 0.0555555555556
sibpo:po7489 1 18 0.0555555555556
sibpo:po7482 1 18 0.0555555555556
sibpo:po8815 1 18 0.0555555555556
sibpo:po8380 1 17 0.0588235294118
sibpo:po19688 1 17 0.0588235294118
sibpo:po28094 1 17 0.0588235294118
sibpo:po8986 1 17 0.0588235294118
sibpo:po11409 1 17 0.0588235294118
sibpo:po24323 1 17 0.0588235294118
sibpo:po30107 1 17 0.0588235294118
sibpo:po26033 1 17 0.0588235294118
sibpo:po8313 1 17 0.0588235294118
sibpo:po3769 1 17 0.0588235294118
sibpo:po15757 1 17 0.0588235294118
sibpo:po21688 1 17 0.0588235294118
sibpo:po5376 1 17 0.0588235294118
sibpo:po23437 1 17 0.0588235294118
sibpo:po6001 1 17 0.0588235294118
sibpo:po28991 1 17 0.0588235294118
sibpo:po18156 1 17 0.0588235294118
sibpo:po8749 1 17 0.0588235294118
sibpo:po6246 1 17 0.0588235294118
sibpo:po21917 1 17 0.0588235294118
sibpo:po20952 1 17 0.0588235294118
sibpo:po21876 1 17 0.0588235294118
sibpo:po16522 1 17 0.0588235294118
sibpo:po23509 1 17 0.0588235294118
sibpo:po14344 1 17 0.0588235294118
sibpo:po9447 1 16 0.0625
sibpo:po7747 1 16 0.0625
#!/bin/sh
# Full path to input file or dir
INPUT_FILE='/home/cloudera/Downloads/input/sibdataset200.nt'
INPUT_FILE400='/home/cloudera/Downloads/input/sibdataset400.nt'
# Output path for tab dir (from which we move to hdfs)
TAB_DIR='/home/cloudera/Downloads/ex6/'
# Name of the tab file
TAB_FILE='sibdataset_tab'
TAB_FILE400='sibdataset_tab400'
# Directory on HDFS where to write and read the tab file from
HDFS_DIR='/user/cloudera/'
# Name of the hive table
TABLE='ewaldf_sibdataset'
TABLE400='ewaldf_sibdataset400'
# Prepare the tabbed output (only run this once).
# To repeat the run, first "DROP TABLE $TABLE" and delete the output files
# (see the cleanup sketch at the end of this script) :-)
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR$TAB_FILE prepare.pig
hadoop fs -put $TAB_DIR$TAB_FILE/ $HDFS_DIR
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE -hiveconf table=$TABLE -f prepare.hive
# Prepare the 400er output
pig -x local -param input_file=$INPUT_FILE400 -param output_file=$TAB_DIR$TAB_FILE400 prepare.pig
hadoop fs -put $TAB_DIR$TAB_FILE400/ $HDFS_DIR
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE400 -hiveconf table=$TABLE400 -f prepare.hive
# Task 6_1
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_1 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_1.hive
# Task 6_2
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_2 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -hiveconf hashtag=Tunisia -f 6_2.hive
# Task 6_3
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_3 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_3.hive
# Task 6_4
hive -hiveconf table=$TABLE400 -hiveconf output_file=$TAB_DIR/6_4 -hiveconf group=sibg:g1 -f 6_4.hive
# Task 6_5_1
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_5_1 -f 6_5_1.hive
# Task 6_5_2
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR/6_5_2 6_5_2.pig
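# Hypothetical cleanup for a re-run (a sketch using the variables above; not part
# of the original script). Uncomment to drop the tables and remove generated files:
# hive -e "DROP TABLE $TABLE; DROP TABLE $TABLE400"
# hadoop fs -rm -r $HDFS_DIR$TAB_FILE $HDFS_DIR$TAB_FILE400
# rm -rf $TAB_DIR$TAB_FILE $TAB_DIR$TAB_FILE400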