Skip to content

Instantly share code, notes, and snippets.

@soopercorp
Created April 4, 2012 21:01
Show Gist options
  • Save soopercorp/2305624 to your computer and use it in GitHub Desktop.
Save soopercorp/2305624 to your computer and use it in GitHub Desktop.
nlp-hw2-extract
#!/usr/bin/env python
import pprint
import string
import cPickle as pickle
import sys
from nltk.corpus import wordnet as wn
#pretty print
pp = pprint.PrettyPrinter(indent=4)
#point to files
testFile = sys.argv[1]
arffFile = sys.argv[2]
#gazetteer data
funcFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/prep.txt"
countriesFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/countries.txt"
datesFile = "/home/hr/study/nlp/2012/nlp-hw2/gazetteers/dates.txt"
pkl_file = open('/home/hr/study/nlp/2012/nlp-hw2/hasFamilyName.pkl', 'rb')
names = pickle.load(pkl_file)
pkl_file.close()
places_file = open('/home/hr/study/nlp/2012/nlp-hw2/hasPopulation.pkl', 'rb')
places = pickle.load(places_file)
places_file.close()
#punctuations
puncs = ['.',',','!','(',')','-',':',';','~','--','"','?','$',"''"]
#open testfile
fd = open(testFile)
contents = fd.readlines()
fd.close()
#open functional word file
fd = open(funcFile)
funcs = fd.readlines()
fd.close()
#countries file
fd = open(countriesFile)
countries = fd.readlines()
fd.close()
#dates file
fd = open(datesFile)
dates = fd.readlines()
fd.close()
#strip \n
contents = map(string.strip,contents)
funcs = map(string.strip,funcs)
countries = map(string.strip,countries)
dates = map(string.strip,dates)
# features
# [posTag,isAllCaps,firstLetterCaps,wordLength,isFuncWord,isPunctuation
# contatinsDots, ]
features = []
wnset = set()
def extractFeature(entity):
if not entity:
return
feature = []
#posTag
if entity[1] not in puncs:
feature.append(entity[1])
else:
feature.append('punc')
#isInitial
feature.append(str(int((len(entity[0]) == 2 and '.' in entity[0]))))
#functional word = determiner,conjunction,preposition
feature.append(str(int(entity[0].lower() in funcs)))
#allCaps - not considering func words
if entity[0].isupper() and feature[-1] == '0':
feature.append('1')
else:
feature.append('0')
#firstCaps
if(entity[0][0].isupper()):
feature.append('1')
else:
feature.append('0')
#word length
feature.append(str(len(entity[0])))
#punctuation mark
feature.append(str(int(entity[0] in puncs)))
#contains dots
if(entity[0] != '.'):
feature.append(str(int('.' in entity[0])))
else:
feature.append('0')
#wordnet lexical info
wnres = wn.synsets(entity[0])
if wnres and ("person" in wnres[0].lexname or "location" in wnres[0].lexname):
feature.append(wnres[0].lexname)
wnset.add(wnres[0].lexname)
else:
feature.append('?')
#isCountry
feature.append(str(int(entity[0].title() in countries)))
#inYagoNames
feature.append(str(int(entity[0].title() in names)))
#inYagoPlaces
feature.append(str(int(entity[0].title() in places)))
#isDate
if entity[0].title() in dates:
feature.append('1')
feature[0] = "date"
else:
feature.append('0')
#prevPOSTag
if features:
feature.append(features[-1][0])
else:
feature.append('?')
#class
feature.append(entity[2])
features.append(feature)
# strip empty lines and extract features
for line in contents:
entity = line.rstrip().split()
extractFeature(entity)
# write wordNet @attribute line
with open(arffFile, 'r') as file:
# read a list of lines into data
data = file.readlines()
data[10] = '@attribute wnLex {'
for lex in wnset:
data[10]+=lex+','
data[10] = data[10][:-1] # remove comma
data[10]+='}\n'
with open(arffFile, 'w') as file:
file.writelines(data)
arff = open(arffFile,"a")
# join and write to file
for line in features:
arff.write((','.join(line))+'\n')
arff.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment