Created
March 10, 2016 16:07
-
-
Save jeshuamaxey/bee85740917274a4d0a6 to your computer and use it in GitHub Desktop.
Expects a document gold data file to exist. Breaks the content of the document gold data into sentences and saves them to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from textblob import TextBlob | |
import sys | |
# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding (site.py
# deletes it at interpreter start-up) so that UTF-8 becomes the implicit
# str/unicode codec. Python 3 has no reload() builtin and no
# setdefaultencoding, so the hack is skipped there instead of raising
# NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Path of the document-level gold data CSV that is read.
# NOTE(review): the "absolute/path/to" prefix looks like a placeholder -
# confirm and edit before running.
INFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: document_gold.csv"
# Path of the sentence-level gold data CSV that is written.
OUTFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: sentence_gold.csv"
def _to_ascii_text(value):
    """Return *value* as text with every non-ASCII character dropped.

    Accepts either bytes (what the Python 2 csv reader yields) or text
    (what the Python 3 csv reader yields), so the result no longer depends
    on the interpreter-wide default encoding.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8', 'ignore')
    return value.encode('ascii', 'ignore').decode('ascii')


def get_sentences_from_document_goldfile(infile_name):
    """Split every positively labelled document into sentence samples.

    Reads the CSV at *infile_name* with ``csv.DictReader``. For each row
    whose ``label_gold`` column equals ``'yes'``, the ``content`` column is
    reduced to ASCII and split into sentences with TextBlob; one sample
    dict is produced per sentence.

    Args:
        infile_name: Path of the document gold CSV. The rows must provide
            the ``label_gold``, ``content`` and ``model_id`` columns.

    Returns:
        A list of dicts with the keys ``content``, ``document_id``,
        ``source_id``, ``model_id``, ``text_type``, ``label_gold``,
        ``label_gold_reason`` and ``_golden``. ``content`` holds the
        sentence object yielded by ``TextBlob.sentences``.
    """
    sentences = []
    with open(infile_name) as infile:
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether the document counter advanced for every row or only for
        # rows labelled 'yes'. This version numbers every row (0-based
        # position in the input file) - confirm against the intended use.
        for doc_id, row in enumerate(csv.DictReader(infile)):
            if row['label_gold'] != 'yes':
                continue
            blob = TextBlob(_to_ascii_text(row['content']))
            for sentence in blob.sentences:
                # Built under its own name: the original rebound the loop
                # variable here, so later sentences read 'model_id' from
                # the previous output dict instead of the CSV row.
                sentences.append({
                    'content': sentence,
                    'document_id': doc_id,
                    'source_id': 0,
                    'model_id': row['model_id'],
                    'text_type': 'sentence',
                    'label_gold': '',
                    'label_gold_reason': '',
                    '_golden': 'TRUE',
                })
    return sentences
def write_sentence_samples_to_file(samples, outfile_name):
    """Write sentence samples to a CSV file and report how many were written.

    Args:
        samples: Sized sequence of dicts keyed by the column names listed
            in ``fieldnames`` below.
        outfile_name: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: Propagated from ``csv.DictWriter`` if a sample contains
            a key that is not one of the output columns.
    """
    fieldnames = ['document_id', 'source_id', 'model_id', 'text_type', 'content', 'label_gold', 'label_gold_reason', '_golden']
    # The csv module wants an untranslated stream: binary mode on Python 2,
    # newline='' on Python 3. A plain 'w' produces \r\r\n line endings on
    # Windows.
    if sys.version_info[0] >= 3:
        outfile = open(outfile_name, 'w', newline='')
    else:
        outfile = open(outfile_name, 'wb')
    with outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(samples)
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('Written {} samples to {}'.format(len(samples), outfile_name))
def main():
    """Convert the document gold file into the sentence gold file.

    Reads INFILE_NAME, splits its documents into sentence samples and
    writes those samples to OUTFILE_NAME.
    """
    write_sentence_samples_to_file(
        get_sentences_from_document_goldfile(INFILE_NAME),
        OUTFILE_NAME,
    )


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment