-
-
Save guenodz/d5add59b31114a3a3c66 to your computer and use it in GitHub Desktop.
package com.guendouz.textclustering.preprocessing; | |
import java.util.Arrays; | |
import java.util.List; | |
/**
 * Computes term frequency (TF), inverse document frequency (IDF) and their
 * product (TF-IDF) for tokenized documents.
 *
 * <p>A document is a list of word tokens and a corpus is a list of documents.
 * Terms are matched against tokens case-insensitively.
 *
 * @author Mohamed Guendouz
 */
public class TFIDFCalculator {

    /**
     * Computes the relative frequency of a term within one document.
     *
     * @param doc  list of strings (the tokens of the document)
     * @param term String represents a term
     * @return number of case-insensitive occurrences of {@code term} divided by
     *         the number of tokens in {@code doc}; {@code 0.0} when {@code doc}
     *         is empty
     */
    public double tf(List<String> doc, String term) {
        // An empty document would otherwise evaluate 0.0 / 0 and yield NaN.
        if (doc.isEmpty()) {
            return 0.0;
        }
        double occurrences = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return occurrences / doc.size();
    }

    /**
     * Computes the inverse document frequency of a term over a corpus, using
     * the natural logarithm: {@code ln(docs.size() / n)}, where {@code n} is
     * the number of documents containing the term.
     *
     * @param docs list of list of strings represents the dataset
     * @param term String represents a term
     * @return the inverse document frequency of {@code term} in {@code docs};
     *         {@code 0.0} when no document contains the term (this also covers
     *         an empty corpus)
     */
    public double idf(List<List<String>> docs, String term) {
        double containing = 0;
        for (List<String> doc : docs) {
            if (containsTerm(doc, term)) {
                containing++;
            }
        }
        // Dividing by zero would produce Infinity (or NaN for an empty corpus),
        // which turns tf * idf into NaN for terms absent from the corpus.
        if (containing == 0) {
            return 0.0;
        }
        return Math.log(docs.size() / containing);
    }

    /**
     * Computes the TF-IDF weight of a term for one document of a corpus.
     *
     * @param doc  a text document
     * @param docs all documents
     * @param term term
     * @return {@code tf(doc, term) * idf(docs, term)}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /** Returns whether any token of {@code doc} equals {@code term}, ignoring case. */
    private static boolean containsTerm(List<String> doc, String term) {
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Small demonstration: prints the TF-IDF of "ipsum" for the first of three
     * sample documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("Lorem", "ipsum", "dolor", "ipsum", "sit", "ipsum");
        List<String> doc2 = Arrays.asList("Vituperata", "incorrupte", "at", "ipsum", "pro", "quo");
        List<String> doc3 = Arrays.asList("Has", "persius", "disputationi", "id", "simul");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3);

        TFIDFCalculator calculator = new TFIDFCalculator();
        double tfidf = calculator.tfIdf(doc1, documents, "ipsum");
        System.out.println("TF-IDF (ipsum) = " + tfidf);
    }
}
Thanks. Just starting on this topic and this is a great help.
C:\Users\M\Desktop>javac TFIDFCalculator.java
C:\Users\M\Desktop>java TFIDFCalculator
Error: Could not find or load main class TFIDFCalculator
Sir, I am getting this error. Can you help me sort it out?
K75: It sounds like you have %CLASSPATH% defined, given no other information. Try:
javac TFIDFCalculator.java
java -cp . TFIDFCalculator
In the idf method you should handle the case where "term" doesn't exist in any document, because you can't divide by 0.
return Math.log(docs.size() / n);
cast it..
return Math.log((double) docs.size() / (double) n);
Everything is all right.
Thanks.
This is a great demo, thanks for putting this up
Thank you, just getting into text mining and this is very helpful.
Hi,
I'm currently looking into TF-IDF for the first time.
One detail question:
In line 39 you are using Math.log, which returns the natural logarithm (base e) (https://docs.oracle.com/javase/7/docs/api/java/lang/Math.html).
The wikipedia article (https://en.wikipedia.org/wiki/Tf%E2%80%93idf) states that base 10 logarithm should be used, so shouldn't this be changed to:
Math.log10(docs.size() / n)
Kind regards,
Michael
I think you will get exception in line return Math.log(docs.size() / n); in case n is 0.
Thanks mate, couldn't figure out how to compute this efficiently enough !