statsmaths · November 24, 2015 18:27
diff --git a/parse_xml_pos.py b/parse_xml_pos.py
 #!/usr/bin/env python
 """ Parses a directory of XML files and saves delimited parsed records

 Set the DIR_IN variable to the location where you have a set of XML
 records. This script will then save a pipe separated file for each
 input file. There is one row per token in the input file, and three
 columns of data: the raw token, the lemmatized form of the token, and
 a part of speech tag. The part of speech tags are only the basic universal
 ones (VERB, NOUN, DET, etc.), not the full Penn TreeBank codes. The benefit
 of using spacy as a library is that it has a much greater speed when compared
 to other methods.

 This can be used in other generic applications where you want to apply
 a basic part of speech tagger to a corpus of files. Simply do not call the
 application specific replace_date function and you will be ready to go.
 """

 from __future__ import print_function

 import os
 import re
 import spacy.en

 # spaCy English pipeline; built once at import time and reused for every file
 NLP = spacy.en.English()
 # directory whose entries are listed as the input files
 DIR_IN = "blogs"
 # directory that receives one ".csv" file per input file (created by main())
 DIR_OUT = "blogs_out"
 # when True, main() prints the number of input files before processing
 VERBOSE = True

 def get_files():
    """ get aligned lists of input and output file paths

    DIR_IN is listed exactly once so that both lists are built from the
    same snapshot of the directory, and the names are sorted because
    os.listdir returns entries in arbitrary order.

    Returns a tuple (files_in, files_out) where files_in[i] is a path
    inside DIR_IN and files_out[i] is the matching path inside DIR_OUT
    with ".csv" appended to the file name.
    """
    names = sorted(os.listdir(DIR_IN))
    files_in = [os.path.join(DIR_IN, name) for name in names]
    files_out = [os.path.join(DIR_OUT, name + ".csv") for name in names]
    return files_in, files_out


 def parse_this_file(fin_name, fout_name):
    """ reads data from fin_name, parses it using spacy, and saves in fout_name

    The input text is passed through replace_date and clean_string and is
    then tagged with NLP. One "token|lemma|pos" line is written per token;
    characters that cannot be encoded as ASCII are dropped from the line.

    fin_name: path of the file to read
    fout_name: path of the pipe separated file to write (overwritten)
    """
    # the context manager closes the input even if reading raises
    with open(fin_name, mode='r') as fin:
        text = fin.read()

    text = replace_date(text)
    text = clean_string(text)
    tokens = NLP(text, tag=True, parse=False)

    # The output is opened only after tagging succeeded, so a failure above
    # does not leave an empty output file behind. Binary mode is used
    # because each row is written as ASCII-encoded bytes.
    with open(fout_name, mode='wb') as fout:
        for tok in tokens:
            row = tok.text + '|' + tok.lemma_ + '|' + tok.pos_ + '\n'
            fout.write(row.encode('ascii', errors='ignore'))


 def clean_string(text):
    """ given a string object, cleans XML and returns unicode text

    Steps, in order: decode byte strings as ASCII (undecodable bytes become
    U+FFFD), replace <date>...</date> elements with BPOST, strip the
    remaining tags, delete newlines, tabs, carriage returns and pipes, and
    collapse runs of spaces into a single space.

    text: byte string or already-decoded text
    returns: the cleaned text
    """
    # Only byte strings need decoding; text that is already decoded is
    # passed through unchanged instead of raising.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'replace')

    # NOTE(review): this repeats the substitution done by replace_date, so
    # skipping replace_date in a caller does not disable date handling.
    text = re.sub(u'<date>[^>]+</date>', u"BPOST", text)
    text = re.sub(u'<[^<]+>', u"", text)
    # Pipes are removed because the pipe is the output column delimiter.
    # NOTE(review): line breaks are deleted rather than replaced by a space,
    # which joins words across lines -- confirm this is intended.
    text = re.sub(u'[\n\t\r|]', u'', text)
    text = re.sub(u'[ ]+', u' ', text)
    return text


 def replace_date(text):
    """ swaps each <date>...</date> element for BPOST, the post boundary marker """
    date_element = '<date>[^>]+</date>'
    boundary_marker = "BPOST"
    return re.sub(date_element, boundary_marker, text)


 def main(start=0):
    """ parses the files in DIR_IN and writes the results to DIR_OUT

    start: number of leading (input, output) pairs to skip, e.g. to resume
        an interrupted run; the default of 0 processes every file
    """
    files_in, files_out = get_files()
    if not os.path.exists(DIR_OUT):
        os.mkdir(DIR_OUT)

    # slice the lists themselves rather than the zip() result, which cannot
    # be indexed on Python 3
    todo_in = files_in[start:]
    todo_out = files_out[start:]

    if VERBOSE:
        print("Processing " + str(len(todo_in)) + " input files.")

    for fin_name, fout_name in zip(todo_in, todo_out):
        parse_this_file(fin_name, fout_name)


 # run the batch job only when executed as a script, not when imported
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	""" Parses a directory of XML files and saves delimited parsed records

	Set the DIR_IN variable to the location where you have a set of XML
	records. This script will then save a pipe separated file for each
	input file. There is one row per token in the input file, and three
	columns of data: the raw token, the lemmatized form of the token, and
	a part of speech tag. The part of speech tags are only the basic universal
	ones (VERB, NOUN, DET, etc.), not the full Penn TreeBank codes. The benefit
	of using spacy as a library is that it has a much greater speed when compared
	to other methods.

	This can be used in other generic applications where you want to apply
	a basic part of speech tagger to a corpus of files. Simply do not call the
	application specific replace_date function and you will be ready to go.
	"""

	from __future__ import print_function

	import os
	import re
	import spacy.en

	# spaCy English pipeline; built once at import time and reused for every file
	NLP = spacy.en.English()
	# directory whose entries are listed as the input files
	DIR_IN = "blogs"
	# directory that receives one ".csv" file per input file (created by main())
	DIR_OUT = "blogs_out"
	# when True, main() prints the number of input files before processing
	VERBOSE = True

	def get_files():
	    """ get aligned lists of input and output file paths

	    DIR_IN is listed exactly once so that both lists are built from the
	    same snapshot of the directory, and the names are sorted because
	    os.listdir returns entries in arbitrary order.

	    Returns a tuple (files_in, files_out) where files_in[i] is a path
	    inside DIR_IN and files_out[i] is the matching path inside DIR_OUT
	    with ".csv" appended to the file name.
	    """
	    names = sorted(os.listdir(DIR_IN))
	    files_in = [os.path.join(DIR_IN, name) for name in names]
	    files_out = [os.path.join(DIR_OUT, name + ".csv") for name in names]
	    return files_in, files_out


	def parse_this_file(fin_name, fout_name):
	    """ reads data from fin_name, parses it using spacy, and saves in fout_name

	    The input text is passed through replace_date and clean_string and is
	    then tagged with NLP. One "token|lemma|pos" line is written per token;
	    characters that cannot be encoded as ASCII are dropped from the line.

	    fin_name: path of the file to read
	    fout_name: path of the pipe separated file to write (overwritten)
	    """
	    # the context manager closes the input even if reading raises
	    with open(fin_name, mode='r') as fin:
	        text = fin.read()

	    text = replace_date(text)
	    text = clean_string(text)
	    tokens = NLP(text, tag=True, parse=False)

	    # The output is opened only after tagging succeeded, so a failure above
	    # does not leave an empty output file behind. Binary mode is used
	    # because each row is written as ASCII-encoded bytes.
	    with open(fout_name, mode='wb') as fout:
	        for tok in tokens:
	            # the column delimiter is a bare pipe, with no backslash
	            row = tok.text + '|' + tok.lemma_ + '|' + tok.pos_ + '\n'
	            fout.write(row.encode('ascii', errors='ignore'))


	def clean_string(text):
	    """ given a string object, cleans XML and returns unicode text

	    Steps, in order: decode byte strings as ASCII (undecodable bytes become
	    U+FFFD), replace <date>...</date> elements with BPOST, strip the
	    remaining tags, delete newlines, tabs, carriage returns and pipes, and
	    collapse runs of spaces into a single space.

	    text: byte string or already-decoded text
	    returns: the cleaned text
	    """
	    # Only byte strings need decoding; text that is already decoded is
	    # passed through unchanged instead of raising.
	    if isinstance(text, bytes):
	        text = text.decode('ascii', 'replace')

	    # NOTE(review): this repeats the substitution done by replace_date, so
	    # skipping replace_date in a caller does not disable date handling.
	    text = re.sub(u'<date>[^>]+</date>', u"BPOST", text)
	    text = re.sub(u'<[^<]+>', u"", text)
	    # Pipes are removed because the pipe is the output column delimiter;
	    # inside a character class the pipe is literal and needs no escaping.
	    # NOTE(review): line breaks are deleted rather than replaced by a space,
	    # which joins words across lines -- confirm this is intended.
	    text = re.sub(u'[\n\t\r|]', u'', text)
	    text = re.sub(u'[ ]+', u' ', text)
	    return text


	def replace_date(text):
	    """ replaces each date XML element with BPOST; the boundary of the post

	    text: string that may contain <date>...</date> elements
	    returns: text with every such element replaced by the literal BPOST
	    """
	    return re.sub('<date>[^>]+</date>', "BPOST", text)


	def main(start=0):
	    """ parses the files in DIR_IN and writes the results to DIR_OUT

	    start: number of leading (input, output) pairs to skip, e.g. to resume
	        an interrupted run; the default of 0 processes every file
	    """
	    files_in, files_out = get_files()
	    if not os.path.exists(DIR_OUT):
	        os.mkdir(DIR_OUT)

	    # slice the lists themselves rather than the zip() result, which cannot
	    # be indexed on Python 3
	    todo_in = files_in[start:]
	    todo_out = files_out[start:]

	    if VERBOSE:
	        print("Processing " + str(len(todo_in)) + " input files.")

	    for fin_name, fout_name in zip(todo_in, todo_out):
	        parse_this_file(fin_name, fout_name)


	# run the batch job only when executed as a script, not when imported
	if __name__ == '__main__':
	    main()