Skip to content

Instantly share code, notes, and snippets.

@skyer9
Last active August 11, 2019 04:50
Show Gist options
  • Save skyer9/afd34970912106ac2b7c1f87557771c9 to your computer and use it in GitHub Desktop.
Save skyer9/afd34970912106ac2b7c1f87557771c9 to your computer and use it in GitHub Desktop.
TF IDF with amazon review dataset
# -*- coding: utf-8 -*-
import pyspark
from pyspark.sql import SQLContext
# +--------------+---------+------------------+
# | doc_id| token| tfidf|
# +--------------+---------+------------------+
# |R13E0LFYMKV62T| |12.613611576441143|
# |R1QYQ8ZJ9LS8L9| high|11.634883586257445|
# |R1QYQ8ZJ9LS8L9| flies| 9.942558014017578|
# |R162SXU36VVE3N| toogoo| 9.942558014017578|
# |R1QYQ8ZJ9LS8L9| so| 9.656627474604601|
# |R2HZQ7EE1JKASF| br| 9.591581091193483|
# |R2E07B9J2KB60P| only| 8.726162689693084|
# |R2TCCMZE9ME0JS|perfectly| 7.863116472337741|
# |R2TCCMZE9ME0JS| as|7.2404344315850055|
# | RN1CRPKNIX49D| ive| 7.193685818395112|
# |R3V50MBK65GGKT| line| 6.818992368953701|
# |R3E4XGTZ5NC4QC| all| 6.646721148013247|
# |R3MK80M8WDXAX5| without| 6.646721148013247|
# |R13E0LFYMKV62T| across|6.6283720093450516|
# |R3GRH2J37F8FE7| company|6.6283720093450516|
# |R205FXUV0AWV3H| each|6.6283720093450516|
# |R2RCEVGMYOW5D7| havnt|6.6283720093450516|
# |R2TCCMZE9ME0JS| her|6.6283720093450516|
# |R1QYQ8ZJ9LS8L9| highbr|6.6283720093450516|
# |R2HZQ7EE1JKASF| metal|6.6283720093450516|
# +--------------+---------+------------------+
if __name__ == '__main__':
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
# df = sqlContext.createDataFrame([
# (0, 'bbb ccc1 fff, ggg. hhh?'),
# (1, 'aa bbb bbb ccc ddd, d12.'),
# (2, 'aaa bb eee'),
# ], ['doc_id', 'doc_str'])
# df.createOrReplaceTempView('tmp_view')
# sql = """
# SELECT
# regexp_replace(lower(doc_str), '[^a-z ]', '') as doc_str
# FROM
# tmp_view
# """
# doc = sqlContext.sql(sql)
# doc.show()
# 출처 : https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
df = sqlContext.read.load('hdfs://172.31.29.43:9000/amazon_reviews_us_Toys_v1_00.tsv.gz', format='csv', sep='\t', header='true')
df.createOrReplaceTempView('tmp_reviewdata')
sqls = """
CREATE OR REPLACE TEMPORARY VIEW tmp_ratingdata AS
SELECT
review_id as doc_id, regexp_replace(lower(review_body), '[^a-z ]', '') as doc_str
FROM
tmp_reviewdata
WHERE
product_parent = '100005735';
CREATE OR REPLACE TEMPORARY VIEW tmp_rawdata AS
SELECT doc_id, doc_str, explode(split(doc_str, ' ')) as token
FROM tmp_ratingdata;
CREATE OR REPLACE TEMPORARY VIEW tmp_tf AS
SELECT doc_id, token, count(*) as tf
FROM tmp_rawdata
GROUP BY doc_id, token;
CREATE OR REPLACE TEMPORARY VIEW tmp_df AS
SELECT
token,
count(distinct doc_id) as df,
(
SELECT count(distinct doc_id) as d
FROM tmp_rawdata
) d
FROM
tmp_rawdata R
GROUP BY token, d;
CREATE OR REPLACE TEMPORARY VIEW tmp_idf AS
SELECT
token,
log((d + 1) / (df + 1)) as idf
FROM
tmp_df;
SELECT
T.doc_id,
T.token,
(T.tf * I.idf) as tfidf
FROM
tmp_tf T
JOIN tmp_idf I ON T.token = I.token
ORDER BY
tfidf DESC, T.token, T.doc_id;
"""
sqlarray = sqls.split(';')
result = None
for i in range(len(sqlarray)):
sql = sqlarray[i].rstrip()
if sql != '':
result = sqlContext.sql(sql)
result.show()
sc.stop()
@skyer9
Copy link
Author

skyer9 commented Aug 10, 2019

temp table, multi query 기능 부재로 저렇게 지저분하게 코딩했는데...
조만간 기능추가가 되지 않을까 합니다.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment