-
-
Save leomelzer/3075236 to your computer and use it in GitHub Desktop.
package analyse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
/* | |
* (Really simple-dumb) Sentiment analysis for a lucene index of 1 million Tweets! | |
* Based on http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/ | |
* | |
*/ | |
public class Analyse { | |
// path to lucene index | |
private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/"; | |
// path to language profiles for classifier | |
private static String langProfileDirectory = "./src/profiles/"; | |
// lucene queryParser for saving | |
private static QueryParser queryParser; | |
// used to store positive and negative words for scoring | |
static List<String> posWords = new ArrayList<String>(); | |
static List<String> negWords = new ArrayList<String>(); | |
// keep some stats! [-1 / 0 / 1 / not english / foursquare / no text to | |
// classify] | |
static int[] stats = new int[6]; | |
/** | |
* @param args | |
* @throws IOException | |
* @throws LangDetectException | |
*/ | |
public static void main(String[] args) throws IOException, | |
LangDetectException { | |
// huh, how long? | |
long startTime = System.currentTimeMillis(); | |
// open lucene index | |
Directory dir; | |
IndexReader docReader = null; | |
try { | |
dir = FSDirectory.open(new File(indexPath)); | |
docReader = IndexReader.open(dir, true); | |
} catch (IOException e1) { | |
e1.printStackTrace(); | |
} | |
System.out.println("START: reading file list"); | |
// source: www.cs.uic.edu/~liub/FBS/sentiment-analysis.html | |
BufferedReader negReader = new BufferedReader(new FileReader(new File( | |
"./src/negative-words.txt"))); | |
BufferedReader posReader = new BufferedReader(new FileReader(new File( | |
"./src/positive-words.txt"))); | |
// currently read word | |
String word; | |
// add words to comparison list | |
while ((word = negReader.readLine()) != null) { | |
negWords.add(word); | |
} | |
while ((word = posReader.readLine()) != null) { | |
posWords.add(word); | |
} | |
// cleanup | |
negReader.close(); | |
posReader.close(); | |
System.out.println("FINISH: reading file list"); | |
// ---------------------------------------------- | |
System.out.println("START: calculating sentiment"); | |
// prepare language classifier | |
DetectorFactory.loadProfile(langProfileDirectory); | |
// store different languages | |
Map<String, Integer> langHitList = new HashMap<String, Integer>(); | |
// detect language, using http://code.google.com/p/language-detection/ | |
// has 99% accuracy | |
Detector detector; | |
// current tweet | |
Document tweet; | |
// current score | |
int score = 0; | |
// current text | |
String text; | |
// maximum number of documents | |
int max = docReader.maxDoc(); | |
// used to give some feedback during processing the 1 million tweets | |
int j = 0; | |
// do we want to skip saving that document? | |
boolean skipSave = false; | |
for (int i = 0; i < max; i++) { // | |
if (i % 100000 == 0) { | |
System.out.println("PROCESSING: " + j * 100000 + " of " | |
+ max + " tweets processed..."); | |
j++; | |
} | |
// reset, most of the times we want that. | |
skipSave = false; | |
try { | |
// read it! | |
tweet = docReader.document(i); | |
text = tweet.get("text"); | |
// we need a new instance every time unfortunately... | |
detector = DetectorFactory.create(); | |
detector.append(text); | |
// classify language! | |
String detectedLanguage = detector.detect(); | |
// if it is not english... | |
if (detectedLanguage.equals("en") == false) { | |
stats[3]++; | |
// we can't classify non-english tweets, so just keep them | |
// neutral | |
score = 0; | |
} else if (text.startsWith("I'm at") | |
|| text.startsWith("I just became the mayor") | |
|| text.startsWith("I just ousted")) { | |
// all your foursquare updates are belong to us. | |
stats[4]++; | |
// and we don't save them. yo. | |
skipSave = true; | |
} else { | |
// finally! retrieve sentiment score. | |
score = getSentimentScore(tweet.get("text")); | |
// ++ index so we won't have -1 and stuff... | |
stats[score + 1]++; | |
// wanna see what neutral tweets look like? uncomment. | |
// if (score == 0) { | |
// System.out.println("Score: " + score + " for Tweet (" + | |
// tweet.get("ID") + "):"+ tweet.get("text")); | |
// } | |
} | |
// so now for the saving... | |
if (skipSave == false) { | |
Integer currentCount = langHitList.get(detectedLanguage); | |
// ...save the detected language for some stats | |
langHitList.put(detectedLanguage, | |
(currentCount == null) ? 1 : currentCount + 1); | |
// tweet.set("language", detectedLanguage) | |
// tweet.set("sentiment", score); | |
// tweet.get("ID"); | |
} | |
} catch (LangDetectException e) { | |
// thrown by the language classifier when tweets are like :D or | |
// :3 or ????????? | |
// count how many times there is no valid input, plus we won't | |
// save it as it's in the catch clause... | |
stats[5]++; | |
} catch (Exception e) { | |
// something went wrong, ouuups! | |
e.printStackTrace(); | |
System.err.println("Doc at " + i + " does not exist"); | |
} | |
} | |
System.out.println("FINISH: calculating sentiment"); | |
// ---------------------------------------------- | |
long endTime = System.currentTimeMillis(); | |
long totalTime = endTime - startTime; | |
System.out.println("----------------------------------------------"); | |
System.out.println("STATS - TIME: Analysis took " | |
+ TimeUnit.SECONDS.convert(totalTime, TimeUnit.MILLISECONDS) | |
+ " seconds"); | |
// ---------------------------------------------- | |
// get me some info! | |
System.out.println("STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]"); | |
System.out.println("STATS - COUNTS: " + java.util.Arrays.toString(stats)); | |
System.out.println("STATS - LANGUAGE: " + langHitList.toString()); | |
// cleanup | |
docReader.close(); | |
} | |
/** | |
* does some string mangling and then calculates occurrences in positive / | |
* negative word list and finally the delta | |
* | |
* | |
* @param input | |
* String: the text to classify | |
* @return score int: if < 0 then -1, if > 0 then 1 otherwise 0 - we don't | |
* care about the actual delta | |
*/ | |
private static int getSentimentScore(String input) { | |
// normalize! | |
input = input.toLowerCase(); | |
input = input.trim(); | |
// remove all non alpha-numeric non whitespace chars | |
input = input.replaceAll("[^a-zA-Z0-9\\s]", ""); | |
int negCounter = 0; | |
int posCounter = 0; | |
// so what we got? | |
String[] words = input.split(" "); | |
// check if the current word appears in our reference lists... | |
for (int i = 0; i < words.length; i++) { | |
if (posWords.contains(words[i])) { | |
posCounter++; | |
} | |
if (negWords.contains(words[i])) { | |
negCounter++; | |
} | |
} | |
// positive matches MINUS negative matches | |
int result = (posCounter - negCounter); | |
// negative? | |
if (result < 0) { | |
return -1; | |
// or positive? | |
} else if (result > 0) { | |
return 1; | |
} | |
// neutral to the rescue! | |
return 0; | |
} | |
} |
START: reading file list | |
FINISH: reading file list | |
START: calculating sentiment | |
PROCESSING: 0 of 1057001 tweets processed... | |
PROCESSING: 100000 of 1057001 tweets processed... | |
PROCESSING: 200000 of 1057001 tweets processed... | |
PROCESSING: 300000 of 1057001 tweets processed... | |
PROCESSING: 400000 of 1057001 tweets processed... | |
PROCESSING: 500000 of 1057001 tweets processed... | |
PROCESSING: 600000 of 1057001 tweets processed... | |
PROCESSING: 700000 of 1057001 tweets processed... | |
PROCESSING: 800000 of 1057001 tweets processed... | |
PROCESSING: 900000 of 1057001 tweets processed... | |
PROCESSING: 1000000 of 1057001 tweets processed... | |
FINISH: calculating sentiment | |
---------------------------------------------- | |
STATS - TIME: Analysis took 569 seconds | |
STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify] | |
STATS - COUNTS: [89309, 248062, 130849, 560431, 23063, 5287] | |
STATS - LANGUAGE: {tl=12767, tr=14695, no=8690, th=3268, bn=671, fi=10503, ta=5, sv=6037, fr=19364, bg=454, sw=4527, sl=5516, sk=2467, da=5461, so=24462, sq=1955, ko=3151, he=92, cs=1186, kn=3, pa=1, pl=4483, ru=4920, hr=3802, ro=5077, vi=2981, hu=3411, lv=1713, lt=3106, hi=4, id=34223, de=17254, zh-tw=476, mk=238, uk=160, it=23536, zh-cn=761, ur=70, fa=297, ar=6145, el=1071, ne=2, pt=188253, en=468220, et=12870, es=84303, ja=9758, nl=14863, af=11379} |
It may not work well for all tweets. Suppose I have something like "the product is too good to be ignored", and in your "./src/negative-words.txt" file you have a word called "ignored", and in the file "./src/positive-words.txt" you have "good". In this case the result will be zero, and hence the statement will be deemed neutral — but in fact the review is truly positive.
@rks0191, well, that's why the header of the program says "(Really simple-dumb) Sentiment analysis" 😉
what is private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";
wat this path contains? i tried but i am getting following error
START: reading file list
org.apache.lucene.index.IndexNotFoundException: no segments* file found in org.apache.lucene.store.SimpleFSDirectory@C:\vinay\apache-lucene\org\apache\lucene\index lockFactory=org.apache.lucene.store.NativeFSLockFactory@ed1f14: files: [BufferedDeletes$Num.class, BufferedDeletes.class, ByteBlockPool$Allocator.class, ByteBlockPool.class, ByteSliceReader.class, ByteSliceWriter.class, CharBlockPool.class, CheckIndex$MySegmentTermDocs.class, CheckIndex$Status$SegmentInfoStatus.class, CheckIndex$Status.class, CheckIndex.class, CompoundFileReader$1.class, CompoundFileReader$CSIndexInput.class, CompoundFileReader$FileEntry.class, CompoundFileReader.class, CompoundFileWriter$1.class, CompoundFileWriter$FileEntry.class, CompoundFileWriter.class, ConcurrentMergeScheduler$MergeThread.class, ConcurrentMergeScheduler.class, CorruptIndexException.class, DefaultSkipListReader.class, DefaultSkipListWriter.class, DirectoryIndexReader$1.class, DirectoryIndexReader$2.class, DirectoryIndexReader$ReaderCommit.class, DirectoryIndexReader.class, DocConsumer.class, DocConsumerPerThread.class, DocFieldConsumer.class, DocFieldConsumerPerField.class, DocFieldConsumerPerThread.class, DocFieldConsumers$PerDoc.class, DocFieldConsumers.class, DocFieldConsumersPerField.class, DocFieldConsumersPerThread.class, DocFieldProcessor.class, DocFieldProcessorPerField.class, DocFieldProcessorPerThread.class, DocInverter$FieldInvertState.class, DocInverter.class, DocInverterPerField.class, DocInverterPerThread.class, DocumentsWriter$1.class, DocumentsWriter$ByteBlockAllocator.class, DocumentsWriter$DocState.class, DocumentsWriter$DocWriter.class, DocumentsWriter$FlushState.class, DocumentsWriter$SkipDocWriter.class, DocumentsWriter$WaitQueue.class, DocumentsWriter.class, DocumentsWriterThreadState.class, FieldInfo.class, FieldInfos.class, FieldReaderException.class, FieldSortedTermVectorMapper.class, FieldsReader$FieldForMerge.class, FieldsReader$LazyField.class, FieldsReader.class, FieldsWriter.class, 
FilterIndexReader$FilterTermDocs.class, FilterIndexReader$FilterTermEnum.class, FilterIndexReader$FilterTermPositions.class, FilterIndexReader.class, FreqProxFieldMergeState.class, FreqProxTermsWriter$PostingList.class, FreqProxTermsWriter.class, FreqProxTermsWriterPerField.class, FreqProxTermsWriterPerThread.class, IndexCommit.class, IndexCommitPoint.class, IndexDeletionPolicy.class, IndexFileDeleter$1.class, IndexFileDeleter$CommitPoint.class, IndexFileDeleter$RefCount.class, IndexFileDeleter.class, IndexFileNameFilter.class, IndexFileNames.class, IndexModifier.class, IndexReader$1.class, IndexReader$2.class, IndexReader$FieldOption.class, IndexReader.class, IndexWriter$MaxFieldLength.class, IndexWriter.class, IntBlockPool.class, InvertedDocConsumer.class, InvertedDocConsumerPerField.class, InvertedDocConsumerPerThread.class, InvertedDocEndConsumer.class, InvertedDocEndConsumerPerField.class, InvertedDocEndConsumerPerThread.class, KeepOnlyLastCommitDeletionPolicy.class, LogByteSizeMergePolicy.class, LogDocMergePolicy.class, LogMergePolicy.class, MergeDocIDRemapper.class, MergePolicy$MergeAbortedException.class, MergePolicy$MergeException.class, MergePolicy$MergeSpecification.class, MergePolicy$OneMerge.class, MergePolicy.class, MergeScheduler.class, MultiLevelSkipListReader$SkipBuffer.class, MultiLevelSkipListReader.class, MultiLevelSkipListWriter.class, MultipleTermPositions$1.class, MultipleTermPositions$IntQueue.class, MultipleTermPositions$TermPositionsQueue.class, MultipleTermPositions.class, MultiReader.class, MultiSegmentReader$MultiTermDocs.class, MultiSegmentReader$MultiTermEnum.class, MultiSegmentReader$MultiTermPositions.class, MultiSegmentReader.class, NormsWriter.class, NormsWriterPerField.class, NormsWriterPerThread.class, ParallelArrayTermVectorMapper.class, ParallelReader$ParallelTermDocs.class, ParallelReader$ParallelTermEnum.class, ParallelReader$ParallelTermPositions.class, ParallelReader.class, Payload.class, 
PositionBasedTermVectorMapper$TVPositionInfo.class, PositionBasedTermVectorMapper.class, RawPostingList.class, ReadOnlyMultiSegmentReader.class, ReadOnlySegmentReader.class, ReusableStringReader.class, SegmentInfo.class, SegmentInfos$1.class, SegmentInfos$2.class, SegmentInfos$FindSegmentsFile.class, SegmentInfos.class, SegmentMergeInfo.class, SegmentMergeQueue.class, SegmentMerger$1.class, SegmentMerger$CheckAbort.class, SegmentMerger.class, SegmentReader$Norm.class, SegmentReader.class, SegmentTermDocs.class, SegmentTermEnum.class, SegmentTermPositions.class, SegmentTermPositionVector.class, SegmentTermVector.class, SerialMergeScheduler.class, SnapshotDeletionPolicy$MyCommitPoint.class, SnapshotDeletionPolicy.class, SortedTermVectorMapper.class, StaleReaderException.class, StoredFieldsWriter$PerDoc.class, StoredFieldsWriter.class, StoredFieldsWriterPerField.class, StoredFieldsWriterPerThread.class, Term.class, TermBuffer.class, TermDocs.class, TermEnum.class, TermFreqVector.class, TermInfo.class, TermInfosReader$1.class, TermInfosReader$ThreadResources.class, TermInfosReader.class, TermInfosWriter.class, TermPositions.class, TermPositionVector.class, TermsHash.class, TermsHashConsumer.class, TermsHashConsumerPerField.class, TermsHashConsumerPerThread.class, TermsHashPerField.class, TermsHashPerThread.class, TermVectorEntry.class, TermVectorEntryFreqSortedComparator.class, TermVectorMapper.class, TermVectorOffsetInfo.class, TermVectorsReader.class, TermVectorsTermsWriter$PerDoc.class, TermVectorsTermsWriter$PostingList.class, TermVectorsTermsWriter.class, TermVectorsTermsWriterPerField.class, TermVectorsTermsWriterPerThread.class, TermVectorsWriter.class]
at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:741)
at org.apache.lucene.index.StandardDirectoryReader.open(StandardDirectoryReader.java:52)
at org.apache.lucene.index.DirectoryReader.open(DirectoryReader.java:65)
at org.apache.lucene.index.IndexReader.open(IndexReader.java:291)
at vinay.Analyse.main(Analyse.java:57)
FINISH: reading file list
START: calculating sentiment
Exception in thread "main" java.lang.NoClassDefFoundError: net/arnx/jsonic/JSONException
at vinay.Analyse.main(Analyse.java:91)
Caused by: java.lang.ClassNotFoundException: net.arnx.jsonic.JSONException
at java.net.URLClassLoader$1.run(Unknown Source)
at java.net.URLClassLoader$1.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
... 1 more
hello sir,
i m new to Sentiment analysis can any one help me out for +ve and -ve word list ..?
i need to detect sarcasm and non sarcasm from twitter data
Thank you in Advance...
Can you please attach the sample files being used in the code?
I am very new to sentiment analysis, so could you please share the input files? That would be very helpful to me. Thanks in advance.
Hi, what s the format of your input tweet? how have you denoted your documents inside "/Users/leomelzer/Downloads/Tweets/"?