Created
November 24, 2015 18:27
-
-
Save statsmaths/70c40dfd75ab48a019d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" Parses a directory of XML files and saves delimited parsed records

Set the DIR_IN variable to the location where you have a set of XML
records. This script will then save a pipe separated file for each
input file. There is one row per token in the input file, and three
columns of data: the raw token, the lemmatized form of the token, and
a part of speech tag. The part of speech tags are only the basic universal
ones (VERB, NOUN, DET, etc.), not the full Penn Treebank codes. The benefit
of using spacy as a library is that it has a much greater speed when compared
to other methods.

This can be used in other generic applications where you want to apply
a basic part of speech tagger to a corpus of files. Simply do not call the
application specific replace_date function and you will be ready to go.
"""
from __future__ import print_function | |
import os | |
import re | |
import spacy.en | |
# Directory that is listed for input files.
DIR_IN = 'blogs'
# Directory that receives one ".csv" output file per input file.
DIR_OUT = 'blogs_out'
# When true, main() prints how many input files it is about to process.
VERBOSE = True

# Shared spaCy English pipeline, constructed once at import time and reused
# for every file that gets parsed.
NLP = spacy.en.English()
def get_files(dir_in=None, dir_out=None):
    """Build the parallel lists of input paths and output paths.

    Args:
        dir_in: directory to list. Defaults to the module-level DIR_IN.
        dir_out: directory the output paths point into. Defaults to the
            module-level DIR_OUT. It is neither created nor checked here.

    Returns:
        A ``(files_in, files_out)`` tuple of equal-length lists. Entry ``i``
        of ``files_out`` is the name of entry ``i`` of ``files_in``, placed
        in ``dir_out`` with ".csv" appended.
    """
    if dir_in is None:
        dir_in = DIR_IN
    if dir_out is None:
        dir_out = DIR_OUT
    # List the directory exactly once so both lists come from the same
    # snapshot; two separate os.listdir() calls could disagree if the
    # directory changes between them, mis-pairing inputs and outputs.
    names = os.listdir(dir_in)
    files_in = [os.path.join(dir_in, name) for name in names]
    files_out = [os.path.join(dir_out, name + ".csv") for name in names]
    return files_in, files_out
def parse_this_file(fin_name, fout_name):
    """Tag one input file with spaCy and write one row per token.

    Reads all of ``fin_name``, replaces the date tags with BPOST markers,
    strips the remaining markup with clean_string(), runs the module-level
    NLP pipeline over the result, and writes one ``token|lemma|pos`` line
    per token to ``fout_name``.

    Args:
        fin_name: path of the file to read.
        fout_name: path of the pipe-delimited file to create or overwrite.
    """
    # Context managers close the handles even when reading or parsing raises.
    with open(fin_name, mode='r') as fin:
        text = fin.read()

    text = replace_date(text)
    text = clean_string(text)
    tokens = NLP(text, tag=True, parse=False)

    # The output is opened only after parsing succeeded, so a failure above
    # does not leave behind an empty or truncated output file.
    with open(fout_name, mode='w') as fout:
        for tok in tokens:
            row = tok.text + '|' + tok.lemma_ + '|' + tok.pos_ + '\n'
            # Characters that cannot be encoded as ASCII are dropped.
            fout.write(row.encode('ascii', errors='ignore'))
def clean_string(text):
    """Strip XML markup from ``text`` and return it as unicode text.

    Steps, in order:
      * a byte string is decoded as ASCII, with undecodable bytes replaced
        by the Unicode replacement character; text that is already unicode
        is used as is;
      * each <date>...</date> element becomes the marker BPOST;
      * every remaining tag is removed;
      * pipe characters are removed, since the pipe is the output delimiter;
      * each run of spaces, tabs, carriage returns and newlines collapses
        to a single space.

    Args:
        text: the raw file contents, as a byte string or unicode text.

    Returns:
        The cleaned unicode text.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii', 'replace')
    text = re.sub(u'<date>[^>]+</date>', u'BPOST', text)
    # Exclude both angle brackets inside a tag so that a literal '>' in the
    # running text cannot make the match swallow the words before it.
    text = re.sub(u'<[^<>]+>', u'', text)
    text = re.sub(u'\\|', u'', text)
    # Line breaks and tabs become a space rather than nothing; deleting them
    # would glue the last word of one line onto the first word of the next.
    text = re.sub(u'[ \t\r\n]+', u' ', text)
    return text
def replace_date(text):
    """Swap each <date>...</date> element in ``text`` for the marker BPOST.

    The marker flags the boundary of a post in the resulting text.
    """
    date_element = re.compile('<date>[^>]+</date>')
    return date_element.sub("BPOST", text)
def main(start=0):
    """Parse the files found in DIR_IN into pipe-delimited files in DIR_OUT.

    Args:
        start: number of leading (input, output) pairs to skip, for example
            to resume an interrupted run. Defaults to 0, which processes
            every file; previously a fixed offset of 18208 was baked in,
            silently skipping that many files.
    """
    files_in, files_out = get_files()
    if not os.path.exists(DIR_OUT):
        os.mkdir(DIR_OUT)
    # Materialize the pairs before slicing: zip() may return a lazy iterator
    # that does not support slicing.
    pairs = list(zip(files_in, files_out))[start:]
    if VERBOSE:
        # Report the number of files actually processed, after skipping.
        print("Processing " + str(len(pairs)) + " input files.")
    for fin_name, fout_name in pairs:
        parse_this_file(fin_name, fout_name)
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment