Created
April 4, 2012 21:01
-
-
Save soopercorp/2305624 to your computer and use it in GitHub Desktop.
nlp-hw2-extract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pprint | |
import string | |
import cPickle as pickle | |
import sys | |
from nltk.corpus import wordnet as wn | |
# pretty printer (kept for ad-hoc debugging of the feature rows)
pp = pprint.PrettyPrinter(indent=4)

# command-line arguments:
#   argv[1] - input file, one token per line (whitespace-separated fields)
#   argv[2] - ARFF file whose header is patched and to which rows are appended
testFile = sys.argv[1]
arffFile = sys.argv[2]

# gazetteer data
funcFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/prep.txt"
countriesFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/countries.txt"
datesFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/dates.txt"

# NOTE(review): pickle.load can execute arbitrary code; only point these
# paths at pickle files you produced yourself.
# `with` guarantees the handles are closed even if unpickling fails.
with open('/home/hr/study/nlp/2012/nlp-hw2/hasFamilyName.pkl', 'rb') as pkl_file:
    names = pickle.load(pkl_file)

with open('/home/hr/study/nlp/2012/nlp-hw2/hasPopulation.pkl', 'rb') as places_file:
    places = pickle.load(places_file)

# tokens (and POS tags) that are treated as punctuation
puncs = ['.', ',', '!', '(', ')', '-', ':', ';', '~', '--', '"', '?', '$', "''"]

# Each text resource is read line by line with surrounding whitespace
# (including the trailing newline) stripped.

# input tokens
with open(testFile) as fd:
    contents = [line.strip() for line in fd]

# functional words (looked up with the lower-cased token)
with open(funcFile) as fd:
    funcs = [line.strip() for line in fd]

# countries (looked up with the title-cased token)
with open(countriesFile) as fd:
    countries = [line.strip() for line in fd]

# date words (looked up with the title-cased token)
with open(datesFile) as fd:
    dates = [line.strip() for line in fd]

# features -- one row per token, columns in this order:
#   [posTag, isInitial, isFuncWord, isAllCaps, firstLetterCaps, wordLength,
#    isPunctuation, containsDots, wnLex, isCountry, inYagoNames,
#    inYagoPlaces, isDate, prevPOSTag, class]
features = []

# distinct WordNet lexnames emitted in the wnLex column; used later to build
# the nominal attribute declaration in the ARFF header
wnset = set()
def _flag(condition):
    """Render a truth value as the string '1' or '0'."""
    return str(int(bool(condition)))


def extractFeature(entity):
    """Build the feature row for one token and append it to ``features``.

    ``entity`` is the list of whitespace-separated fields of one input line:
    ``entity[0]`` is the word, ``entity[1]`` its POS tag and ``entity[2]`` the
    class label. An empty list (blank line) is ignored.

    Reads the module-level ``puncs``, ``funcs``, ``countries``, ``names``,
    ``places``, ``dates`` and ``wn``; appends the finished row to ``features``
    and records any emitted WordNet lexname in ``wnset``. Returns ``None``.

    Raises:
        IndexError: if a non-empty line has fewer than three fields.
    """
    if not entity:
        return

    word = entity[0]
    posTag = entity[1]
    titled = word.title()  # gazetteers are matched against the title-cased word
    isFunc = word.lower() in funcs

    feature = []
    # posTag (punctuation tags are collapsed into a single 'punc' value)
    feature.append('punc' if posTag in puncs else posTag)
    # isInitial: two characters, one of them a dot (e.g. "J.")
    feature.append(_flag(len(word) == 2 and '.' in word))
    # functional word = determiner, conjunction, preposition
    feature.append(_flag(isFunc))
    # allCaps - not considering func words
    feature.append(_flag(word.isupper() and not isFunc))
    # firstCaps
    feature.append(_flag(word[0].isupper()))
    # word length
    feature.append(str(len(word)))
    # punctuation mark
    feature.append(_flag(word in puncs))
    # contains dots (a lone '.' token does not count)
    feature.append(_flag(word != '.' and '.' in word))

    # wordnet lexical info: lexname of the first synset, kept only when it
    # mentions "person" or "location"; otherwise the ARFF missing value '?'
    lexFeature = '?'
    synsets = wn.synsets(word)
    if synsets:
        lexname = synsets[0].lexname
        # NLTK 3 turned Synset.lexname into a method; older releases exposed
        # it as a plain string attribute. Accept both.
        if callable(lexname):
            lexname = lexname()
        if "person" in lexname or "location" in lexname:
            lexFeature = lexname
            wnset.add(lexname)
    feature.append(lexFeature)

    # isCountry
    feature.append(_flag(titled in countries))
    # inYagoNames
    feature.append(_flag(titled in names))
    # inYagoPlaces
    feature.append(_flag(titled in places))

    # isDate - a date word also overrides the POS tag column with "date"
    if titled in dates:
        feature.append('1')
        feature[0] = "date"
    else:
        feature.append('0')

    # prevPOSTag: first column of the previously emitted row (which may
    # itself have been rewritten to 'punc' or "date"); '?' for the first row
    feature.append(features[-1][0] if features else '?')

    # class
    feature.append(entity[2])

    features.append(feature)
# extract features for every input line (blank lines yield an empty field
# list, which extractFeature ignores)
for line in contents:
    entity = line.rstrip().split()
    extractFeature(entity)

# The existing ARFF file is expected to already contain the header; its line
# at index 10 is the wnLex attribute declaration, which is regenerated here
# from the lexnames actually seen.
wnLexLine = 10

with open(arffFile, 'r') as arff:
    # read the header as a list of lines
    data = arff.readlines()

if len(data) <= wnLexLine:
    raise IndexError(
        "%s has only %d line(s); expected the wnLex @attribute at line index %d"
        % (arffFile, len(data), wnLexLine))

# join() yields a well-formed '{}' when no lexname was collected (slicing off
# the "trailing comma" would otherwise eat the opening brace); sorting keeps
# the declaration stable between runs.
data[wnLexLine] = '@attribute wnLex {' + ','.join(sorted(wnset)) + '}\n'

# make sure the first data row starts on its own line
if not data[-1].endswith('\n'):
    data[-1] += '\n'

# rewrite the header, then append one comma-separated row per token
with open(arffFile, 'w') as arff:
    arff.writelines(data)
    for row in features:
        arff.write(','.join(row) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment