Created
April 12, 2018 03:29
-
-
Save irfanandratama/d8ffb396ac752f1dae0b46f5397bd59c to your computer and use it in GitHub Desktop.
Representasi TF-IDF dengan Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The text must already be word-tokenized before it is passed to these functions.
def tf(sudahDiTokenize, *, normalized=False):  # Term Frequency
    """Count how often each term occurs across all tokenized documents.

    Args:
        sudahDiTokenize: Sequence of documents, each one an iterable of
            hashable tokens (the text must already be word-tokenized).
        normalized: When true, every raw count is divided by the total
            number of tokens in the whole corpus, giving a relative
            frequency. Defaults to False (raw counts).

    Returns:
        A dict mapping each term to its count (or relative frequency when
        ``normalized`` is true), keyed in order of first occurrence.

    The resulting dict is also printed to stdout.
    """
    wordfreq = {}
    jumkata = 0  # total number of tokens seen across every document
    for document in sudahDiTokenize:
        for term in document:
            wordfreq[term] = wordfreq.get(term, 0) + 1
            jumkata += 1
    if normalized:
        # An empty corpus yields an empty dict, so jumkata == 0 never divides.
        wordfreq = {term: count / jumkata for term, count in wordfreq.items()}
    print(wordfreq)
    return wordfreq
def idf(sudahDiTokenize):  # Inverse Document Frequency
    """Compute the inverse document frequency of every term in the corpus.

    For each distinct term the value is ``log10(N / df)``, where ``N`` is the
    number of documents and ``df`` is the number of documents that contain
    the term at least once.

    Args:
        sudahDiTokenize: Sequence (must support ``len``) of documents, each
            one an iterable of hashable tokens.

    Returns:
        A dict mapping each distinct term to its IDF value. An empty corpus
        yields an empty dict.

    The set of distinct terms and the resulting dict are printed to stdout.
    """
    # Imported locally so the function works even though the file has no
    # module-level ``import math``.
    import math

    jumdok = len(sudahDiTokenize)
    all_tokens_set = set(item for sublist in sudahDiTokenize for item in sublist)
    print(all_tokens_set)

    # Count, in a single pass, how many documents contain each term instead
    # of rescanning every document once per term.
    doc_freq = {}
    for doc in sudahDiTokenize:
        for tkn in set(doc):  # set(): a term counts once per document
            doc_freq[tkn] = doc_freq.get(tkn, 0) + 1

    # Every term in all_tokens_set occurs in at least one document, so
    # doc_freq[tkn] >= 1 and the division is always defined.
    idf_values = {tkn: math.log10(jumdok / doc_freq[tkn]) for tkn in all_tokens_set}
    print(idf_values)
    return idf_values
def tfxidf(tf, idf):
    """Multiply term frequencies by inverse document frequencies.

    Args:
        tf: Dict mapping each term to its term frequency.
        idf: Dict mapping each term to its inverse document frequency. It
            must contain every key present in ``tf``.

    Returns:
        A dict mapping each term in ``tf`` to ``tf[term] * idf[term]``.

    Raises:
        KeyError: If a term in ``tf`` is missing from ``idf``.

    The resulting dict is also printed to stdout.
    """
    # NOTE: the parameter names shadow the module-level tf() and idf()
    # functions inside this body; they are kept for caller compatibility.
    hasil = {k: tf[k] * idf[k] for k in tf}
    print(hasil)
    # Return the weights so callers can use them, not just see them printed.
    return hasil
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment