Created
July 21, 2013 02:06
-
-
Save tuhinc/6047206 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Sat Jul 20 16:34:41 2013 | |
@author: gregory | |
""" | |
# PANDAS USAGE | |
import pandas as pd | |
import numpy as np | |
import nltk | |
# PANDAS USAGE | |
train = pd.read_csv('questionone.csv') | |
features = [] # make new empty list | |
for row in range(train.shape[0]): # for every row in the data | |
# data we need to create features | |
question_tokens = nltk.word_tokenize(train.ix[1,0]) # all tokens (words, punc, etc) | |
question_tags = nltk.pos_tag(question_tokens) # all tokens and parts of speech | |
question_first_word_tag = question_tags[0] | |
question_nouns = [] | |
question_verbs = [] | |
question_adjectives = [] | |
count = 0 | |
correct_capitalization = 0 | |
hasPNoun = False | |
for i in range(len(question_tokens)): | |
if question_tokens[i] == ".": | |
if question_tokens[i + 1][0] and question_tokens[i + 1][0].isupper(): | |
correct_capitalization += 1 | |
for tag in question_tags: | |
if tag[1] == "NNP" or tag[1] == "NNPS": | |
hasPNoun = True | |
for tag in question_tags: #separate parts of speech | |
if tag[1][0] == "N": | |
question_nouns.append(tag[0]) # all noun types | |
if tag[1][0] == "V": | |
question_verbs.append(tag[0]) # all verb types | |
if tag[1][0] == "J": | |
question_adjectives.append(tag[0]) | |
for tokens in question_tokens: | |
count += len(tokens) | |
question_avlength = count/len(question_tokens) | |
# create features | |
features.append([]) # append a list to features for each row in data | |
features[row].append(train.ix[row,1])# # of followers in context topic | |
features[row].append(len(question_tokens))# # of words in the question | |
features[row].append()# # of topics | |
# PANDAS USAGE | |
features[row].append(train.ix[1][5])# sum of followers in topics | |
features[row].append()# not anon | |
features[row].append()# # of common nouns between question text and context topic | |
features[row].append()# # of common nouns between question text and topics | |
features[row].append()# Is it a yes or no question? (Is..will..can..do..does..are..) | |
features[row].append()# What kind of question is it? (Who? What? Where? When? Why? How?) | |
features[row].append()# no additional topics | |
features[row].append()# question text count > 50 | |
features[row].append()# # of sentences | |
features[row].append()# ends with a question mark | |
features[row].append()# freq of punctuation | |
features[row].append()# ratio of extraneous pronouns | |
features[row].append(len(question_verbs)/len(question_tags))# ratio of verbs | |
features[row].append(len(question_adjectives)/len(question_tags))# ratio of adjectives | |
features[row].append(question_avlength)# What is the average length of word? | |
features[row].append(correct_capitalization)# if words are capitalized after a period | |
features[row].append(hasPNoun)# Does the question have a proper noun in it? | |
features[row].append()# Does the question have a name in it? | |
features[row].append()# Does the question have a name of someone famous in it? (list of celebrities) | |
features[row].append()# Is the question related to technology? | |
for tag in question_tags: | |
if tag[1] == "NNP": | |
print "Proper Noun" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment