-
-
Save leomelzer/3075236 to your computer and use it in GitHub Desktop.
package analyse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
/* | |
* (Really simple-dumb) Sentiment analysis for a lucene index of 1 million Tweets! | |
* Based on http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/ | |
* | |
*/ | |
public class Analyse { | |
// path to lucene index | |
private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/"; | |
// path to language profiles for classifier | |
private static String langProfileDirectory = "./src/profiles/"; | |
// lucene queryParser for saving | |
private static QueryParser queryParser; | |
// used to store positive and negative words for scoring | |
static List<String> posWords = new ArrayList<String>(); | |
static List<String> negWords = new ArrayList<String>(); | |
// keep some stats! [-1 / 0 / 1 / not english / foursquare / no text to | |
// classify] | |
static int[] stats = new int[6]; | |
/** | |
* @param args | |
* @throws IOException | |
* @throws LangDetectException | |
*/ | |
public static void main(String[] args) throws IOException, | |
LangDetectException { | |
// huh, how long? | |
long startTime = System.currentTimeMillis(); | |
// open lucene index | |
Directory dir; | |
IndexReader docReader = null; | |
try { | |
dir = FSDirectory.open(new File(indexPath)); | |
docReader = IndexReader.open(dir, true); | |
} catch (IOException e1) { | |
e1.printStackTrace(); | |
} | |
System.out.println("START: reading file list"); | |
// source: www.cs.uic.edu/~liub/FBS/sentiment-analysis.html | |
BufferedReader negReader = new BufferedReader(new FileReader(new File( | |
"./src/negative-words.txt"))); | |
BufferedReader posReader = new BufferedReader(new FileReader(new File( | |
"./src/positive-words.txt"))); | |
// currently read word | |
String word; | |
// add words to comparison list | |
while ((word = negReader.readLine()) != null) { | |
negWords.add(word); | |
} | |
while ((word = posReader.readLine()) != null) { | |
posWords.add(word); | |
} | |
// cleanup | |
negReader.close(); | |
posReader.close(); | |
System.out.println("FINISH: reading file list"); | |
// ---------------------------------------------- | |
System.out.println("START: calculating sentiment"); | |
// prepare language classifier | |
DetectorFactory.loadProfile(langProfileDirectory); | |
// store different languages | |
Map<String, Integer> langHitList = new HashMap<String, Integer>(); | |
// detect language, using http://code.google.com/p/language-detection/ | |
// has 99% accuracy | |
Detector detector; | |
// current tweet | |
Document tweet; | |
// current score | |
int score = 0; | |
// current text | |
String text; | |
// maximum number of documents | |
int max = docReader.maxDoc(); | |
// used to give some feedback during processing the 1 million tweets | |
int j = 0; | |
// do we want to skip saving that document? | |
boolean skipSave = false; | |
for (int i = 0; i < max; i++) { // | |
if (i % 100000 == 0) { | |
System.out.println("PROCESSING: " + j * 100000 + " of " | |
+ max + " tweets processed..."); | |
j++; | |
} | |
// reset, most of the times we want that. | |
skipSave = false; | |
try { | |
// read it! | |
tweet = docReader.document(i); | |
text = tweet.get("text"); | |
// we need a new instance every time unfortunately... | |
detector = DetectorFactory.create(); | |
detector.append(text); | |
// classify language! | |
String detectedLanguage = detector.detect(); | |
// if it is not english... | |
if (detectedLanguage.equals("en") == false) { | |
stats[3]++; | |
// we can't classify non-english tweets, so just keep them | |
// neutral | |
score = 0; | |
} else if (text.startsWith("I'm at") | |
|| text.startsWith("I just became the mayor") | |
|| text.startsWith("I just ousted")) { | |
// all your foursquare updates are belong to us. | |
stats[4]++; | |
// and we don't save them. yo. | |
skipSave = true; | |
} else { | |
// finally! retrieve sentiment score. | |
score = getSentimentScore(tweet.get("text")); | |
// ++ index so we won't have -1 and stuff... | |
stats[score + 1]++; | |
// wanna see what neutral tweets look like? uncomment. | |
// if (score == 0) { | |
// System.out.println("Score: " + score + " for Tweet (" + | |
// tweet.get("ID") + "):"+ tweet.get("text")); | |
// } | |
} | |
// so now for the saving... | |
if (skipSave == false) { | |
Integer currentCount = langHitList.get(detectedLanguage); | |
// ...save the detected language for some stats | |
langHitList.put(detectedLanguage, | |
(currentCount == null) ? 1 : currentCount + 1); | |
// tweet.set("language", detectedLanguage) | |
// tweet.set("sentiment", score); | |
// tweet.get("ID"); | |
} | |
} catch (LangDetectException e) { | |
// thrown by the language classifier when tweets are like :D or | |
// :3 or ????????? | |
// count how many times there is no valid input, plus we won't | |
// save it as it's in the catch clause... | |
stats[5]++; | |
} catch (Exception e) { | |
// something went wrong, ouuups! | |
e.printStackTrace(); | |
System.err.println("Doc at " + i + " does not exist"); | |
} | |
} | |
System.out.println("FINISH: calculating sentiment"); | |
// ---------------------------------------------- | |
long endTime = System.currentTimeMillis(); | |
long totalTime = endTime - startTime; | |
System.out.println("----------------------------------------------"); | |
System.out.println("STATS - TIME: Analysis took " | |
+ TimeUnit.SECONDS.convert(totalTime, TimeUnit.MILLISECONDS) | |
+ " seconds"); | |
// ---------------------------------------------- | |
// get me some info! | |
System.out.println("STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]"); | |
System.out.println("STATS - COUNTS: " + java.util.Arrays.toString(stats)); | |
System.out.println("STATS - LANGUAGE: " + langHitList.toString()); | |
// cleanup | |
docReader.close(); | |
} | |
/** | |
* does some string mangling and then calculates occurrences in positive / | |
* negative word list and finally the delta | |
* | |
* | |
* @param input | |
* String: the text to classify | |
* @return score int: if < 0 then -1, if > 0 then 1 otherwise 0 - we don't | |
* care about the actual delta | |
*/ | |
private static int getSentimentScore(String input) { | |
// normalize! | |
input = input.toLowerCase(); | |
input = input.trim(); | |
// remove all non alpha-numeric non whitespace chars | |
input = input.replaceAll("[^a-zA-Z0-9\\s]", ""); | |
int negCounter = 0; | |
int posCounter = 0; | |
// so what we got? | |
String[] words = input.split(" "); | |
// check if the current word appears in our reference lists... | |
for (int i = 0; i < words.length; i++) { | |
if (posWords.contains(words[i])) { | |
posCounter++; | |
} | |
if (negWords.contains(words[i])) { | |
negCounter++; | |
} | |
} | |
// positive matches MINUS negative matches | |
int result = (posCounter - negCounter); | |
// negative? | |
if (result < 0) { | |
return -1; | |
// or positive? | |
} else if (result > 0) { | |
return 1; | |
} | |
// neutral to the rescue! | |
return 0; | |
} | |
} |
START: reading file list | |
FINISH: reading file list | |
START: calculating sentiment | |
PROCESSING: 0 of 1057001 tweets processed... | |
PROCESSING: 100000 of 1057001 tweets processed... | |
PROCESSING: 200000 of 1057001 tweets processed... | |
PROCESSING: 300000 of 1057001 tweets processed... | |
PROCESSING: 400000 of 1057001 tweets processed... | |
PROCESSING: 500000 of 1057001 tweets processed... | |
PROCESSING: 600000 of 1057001 tweets processed... | |
PROCESSING: 700000 of 1057001 tweets processed... | |
PROCESSING: 800000 of 1057001 tweets processed... | |
PROCESSING: 900000 of 1057001 tweets processed... | |
PROCESSING: 1000000 of 1057001 tweets processed... | |
FINISH: calculating sentiment | |
---------------------------------------------- | |
STATS - TIME: Analysis took 569 seconds | |
STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify] | |
STATS - COUNTS: [89309, 248062, 130849, 560431, 23063, 5287] | |
STATS - LANGUAGE: {tl=12767, tr=14695, no=8690, th=3268, bn=671, fi=10503, ta=5, sv=6037, fr=19364, bg=454, sw=4527, sl=5516, sk=2467, da=5461, so=24462, sq=1955, ko=3151, he=92, cs=1186, kn=3, pa=1, pl=4483, ru=4920, hr=3802, ro=5077, vi=2981, hu=3411, lv=1713, lt=3106, hi=4, id=34223, de=17254, zh-tw=476, mk=238, uk=160, it=23536, zh-cn=761, ur=70, fa=297, ar=6145, el=1071, ne=2, pt=188253, en=468220, et=12870, es=84303, ja=9758, nl=14863, af=11379} |
It may not work well for all tweets. Suppose I have something like "the product is too good to be ignored", and in your "./src/negative-words.txt" file you have a word called "ignored", and in the file "./src/positive-words.txt" you have "good". In this case the result will be zero, and hence the statement will be deemed neutral — but in fact the review is truly positive.
@rks0191, well, that's why the header of the program says "(Really simple-dumb) Sentiment analysis" 😉
what is private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";
wat this path contains? i tried but i am getting following error
START: reading file list
org.apache.lucene.index.IndexNotFoundException: no segments* file found in org.apache.lucene.store.SimpleFSDirectory@C:\vinay\apache-lucene\org\apache\lucene\index lockFactory=org.apache.lucene.store.NativeFSLockFactory@ed1f14: files: [BufferedDeletes$Num.class, BufferedDeletes.class, ByteBlockPool$Allocator.class, ByteBlockPool.class, ByteSliceReader.class, ByteSliceWriter.class, CharBlockPool.class, CheckIndex$MySegmentTermDocs.class, CheckIndex$Status$SegmentInfoStatus.class, CheckIndex$Status.class, CheckIndex.class, CompoundFileReader$1.class, CompoundFileReader$CSIndexInput.class, CompoundFileReader$FileEntry.class, CompoundFileReader.class, CompoundFileWriter$1.class, CompoundFileWriter$FileEntry.class, CompoundFileWriter.class, ConcurrentMergeScheduler$MergeThread.class, ConcurrentMergeScheduler.class, CorruptIndexException.class, DefaultSkipListReader.class, DefaultSkipListWriter.class, DirectoryIndexReader$1.class, DirectoryIndexReader$2.class, DirectoryIndexReader$ReaderCommit.class, DirectoryIndexReader.class, DocConsumer.class, DocConsumerPerThread.class, DocFieldConsumer.class, DocFieldConsumerPerField.class, DocFieldConsumerPerThread.class, DocFieldConsumers$PerDoc.class, DocFieldConsumers.class, DocFieldConsumersPerField.class, DocFieldConsumersPerThread.class, DocFieldProcessor.class, DocFieldProcessorPerField.class, DocFieldProcessorPerThread.class, DocInverter$FieldInvertState.class, DocInverter.class, DocInverterPerField.class, DocInverterPerThread.class, DocumentsWriter$1.class, DocumentsWriter$ByteBlockAllocator.class, DocumentsWriter$DocState.class, DocumentsWriter$DocWriter.class, DocumentsWriter$FlushState.class, DocumentsWriter$SkipDocWriter.class, DocumentsWriter$WaitQueue.class, DocumentsWriter.class, DocumentsWriterThreadState.class, FieldInfo.class, FieldInfos.class, FieldReaderException.class, FieldSortedTermVectorMapper.class, FieldsReader$FieldForMerge.class, FieldsReader$LazyField.class, FieldsReader.class, FieldsWriter.class, 
FilterIndexReader$FilterTermDocs.class, FilterIndexReader$FilterTermEnum.class, FilterIndexReader$FilterTermPositions.class, FilterIndexReader.class, FreqProxFieldMergeState.class, FreqProxTermsWriter$PostingList.class, FreqProxTermsWriter.class, FreqProxTermsWriterPerField.class, FreqProxTermsWriterPerThread.class, IndexCommit.class, IndexCommitPoint.class, IndexDeletionPolicy.class, IndexFileDeleter$1.class, IndexFileDeleter$CommitPoint.class, IndexFileDeleter$RefCount.class, IndexFileDeleter.class, IndexFileNameFilter.class, IndexFileNames.class, IndexModifier.class, IndexReader$1.class, IndexReader$2.class, IndexReader$FieldOption.class, IndexReader.class, IndexWriter$MaxFieldLength.class, IndexWriter.class, IntBlockPool.class, InvertedDocConsumer.class, InvertedDocConsumerPerField.class, InvertedDocConsumerPerThread.class, InvertedDocEndConsumer.class, InvertedDocEndConsumerPerField.class, InvertedDocEndConsumerPerThread.class, KeepOnlyLastCommitDeletionPolicy.class, LogByteSizeMergePolicy.class, LogDocMergePolicy.class, LogMergePolicy.class, MergeDocIDRemapper.class, MergePolicy$MergeAbortedException.class, MergePolicy$MergeException.class, MergePolicy$MergeSpecification.class, MergePolicy$OneMerge.class, MergePolicy.class, MergeScheduler.class, MultiLevelSkipListReader$SkipBuffer.class, MultiLevelSkipListReader.class, MultiLevelSkipListWriter.class, MultipleTermPositions$1.class, MultipleTermPositions$IntQueue.class, MultipleTermPositions$TermPositionsQueue.class, MultipleTermPositions.class, MultiReader.class, MultiSegmentReader$MultiTermDocs.class, MultiSegmentReader$MultiTermEnum.class, MultiSegmentReader$MultiTermPositions.class, MultiSegmentReader.class, NormsWriter.class, NormsWriterPerField.class, NormsWriterPerThread.class, ParallelArrayTermVectorMapper.class, ParallelReader$ParallelTermDocs.class, ParallelReader$ParallelTermEnum.class, ParallelReader$ParallelTermPositions.class, ParallelReader.class, Payload.class, 
PositionBasedTermVectorMapper$TVPositionInfo.class, PositionBasedTermVectorMapper.class, RawPostingList.class, ReadOnlyMultiSegmentReader.class, ReadOnlySegmentReader.class, ReusableStringReader.class, SegmentInfo.class, SegmentInfos$1.class, SegmentInfos$2.class, SegmentInfos$FindSegmentsFile.class, SegmentInfos.class, SegmentMergeInfo.class, SegmentMergeQueue.class, SegmentMerger$1.class, SegmentMerger$CheckAbort.class, SegmentMerger.class, SegmentReader$Norm.class, SegmentReader.class, SegmentTermDocs.class, SegmentTermEnum.class, SegmentTermPositions.class, SegmentTermPositionVector.class, SegmentTermVector.class, SerialMergeScheduler.class, SnapshotDeletionPolicy$MyCommitPoint.class, SnapshotDeletionPolicy.class, SortedTermVectorMapper.class, StaleReaderException.class, StoredFieldsWriter$PerDoc.class, StoredFieldsWriter.class, StoredFieldsWriterPerField.class, StoredFieldsWriterPerThread.class, Term.class, TermBuffer.class, TermDocs.class, TermEnum.class, TermFreqVector.class, TermInfo.class, TermInfosReader$1.class, TermInfosReader$ThreadResources.class, TermInfosReader.class, TermInfosWriter.class, TermPositions.class, TermPositionVector.class, TermsHash.class, TermsHashConsumer.class, TermsHashConsumerPerField.class, TermsHashConsumerPerThread.class, TermsHashPerField.class, TermsHashPerThread.class, TermVectorEntry.class, TermVectorEntryFreqSortedComparator.class, TermVectorMapper.class, TermVectorOffsetInfo.class, TermVectorsReader.class, TermVectorsTermsWriter$PerDoc.class, TermVectorsTermsWriter$PostingList.class, TermVectorsTermsWriter.class, TermVectorsTermsWriterPerField.class, TermVectorsTermsWriterPerThread.class, TermVectorsWriter.class]
at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:741)
at org.apache.lucene.index.StandardDirectoryReader.open(StandardDirectoryReader.java:52)
at org.apache.lucene.index.DirectoryReader.open(DirectoryReader.java:65)
at org.apache.lucene.index.IndexReader.open(IndexReader.java:291)
at vinay.Analyse.main(Analyse.java:57)
FINISH: reading file list
START: calculating sentiment
Exception in thread "main" java.lang.NoClassDefFoundError: net/arnx/jsonic/JSONException
at vinay.Analyse.main(Analyse.java:91)
Caused by: java.lang.ClassNotFoundException: net.arnx.jsonic.JSONException
at java.net.URLClassLoader$1.run(Unknown Source)
at java.net.URLClassLoader$1.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
... 1 more
hello sir,
i m new to Sentiment analysis can any one help me out for +ve and -ve word list ..?
i need to detect sarcasm and non sarcasm from twitter data
Thank you in Advance...
Can you please attach the sample files being used in the code?
I am very new to sentiment analysis, so could you please share the input files? That would be very helpful to me. Thanks in advance.
Hi, what s the format of your input tweet? how have you denoted your documents inside "/Users/leomelzer/Downloads/Tweets/"?