rickcrawford · October 23, 2018 05:08
diff --git a/download.sh b/download.sh
 # Downloads the consumer-complaint export from the Consumer Financial
 # Protection Bureau (unless a local copy already exists), splits it into
 # per-product CSVs with process.py, and appends the last 1000 rows of each
 # per-product file to training_set.csv.
 #
 # NOTE: training_set.csv is appended to, never truncated; delete it first if
 # a fresh training set is wanted.
 set -eu

 URL="https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD"
 FILE=complaints.csv

 if [ ! -f "$FILE" ]; then
   echo "-- Downloading file --"
   # --fail makes HTTP errors exit non-zero instead of saving the error page.
   # The download goes to a temporary name first so that an interrupted
   # transfer is not mistaken for a complete cached file on the next run.
   curl --fail -L "$URL" --output "$FILE.part"
   mv "$FILE.part" "$FILE"
 fi

 python process.py "$FILE"

 for filename in output/*.csv; do
   # An unmatched glob is left as the literal pattern; skip it.
   [ -e "$filename" ] || continue
   tail -n 1000 "$filename" >> training_set.csv
 done

 rm -r output/
diff --git a/process.py b/process.py
 import sys
 import csv
 import re
 import md5
 import logging
 import os

 # Classes: optional filter on which complaints are exported. Each entry is a
 # (Product, Sub-product) tuple that process_csv compares against the raw CSV
 # columns; an empty list disables the filter so every product is written.

 OUTPUT_PRODUCTS = []
 # Example of restricting the export to specific product/sub-product pairs:
 #OUTPUT_PRODUCTS = [('Credit card',''),('Credit reporting',''),('Debt collection','Credit card'),('Consumer Loan','Vehicle loan')]


 class OutputWriter:
     """Routes (value, tag) rows into one CSV file per tag.

     Each distinct tag gets its own file inside ``output_dir``, named after the
     lower-cased tag with every run of characters outside ``[a-z0-9]`` replaced
     by ``_``. Rows are written as ``value,tag``. When ``unique_values`` is
     true a value is written at most once across all tags (the first tag that
     presents it wins).

     Parameters:
         max_values_per_tag: stored on the instance; not enforced by this class.
         min_values_per_tag: files that received fewer rows than this are
             deleted by ``close()``.
         max_values: stored on the instance; not enforced by this class.
         unique_values: drop values whose MD5 digest has already been seen.
         output_dir: directory the per-tag files are created in; it must
             already exist.
     """

     def __init__(self, max_values_per_tag=1000, min_values_per_tag=6000,
                  max_values=100000, unique_values=True, output_dir="output"):
         self.files = {}      # tag -> open file object
         self.counters = {}   # tag -> number of rows written for that tag
         self.values = set()  # MD5 hex digests of the values already written
         self.total = 0       # kept for compatibility; never updated here
         self.unique_values = unique_values
         self.max_values_per_tag = max_values_per_tag
         self.max_values = max_values
         self.min_values_per_tag = min_values_per_tag
         self.output_dir = output_dir
         self._writers = {}   # tag -> csv.writer bound to self.files[tag]

     def _get_writer(self, tag):
         """Return the csv.writer for ``tag``, creating its file on first use."""
         writer = self._writers.get(tag)
         if writer is None:
             if tag not in self.files:
                 filename = re.sub(r'[^a-z0-9]+', '_', tag.lower()) + ".csv"
                 self.files[tag] = open(os.path.join(self.output_dir, filename), 'w')
                 logging.debug("creating new file: %s" % filename)
             # One writer per file is enough; cache it instead of rebuilding
             # it for every row.
             writer = csv.writer(self.files[tag], delimiter=',', quotechar='"')
             self._writers[tag] = writer
         return writer

     def write(self, tag, value):
         """Append ``value`` to the file for ``tag``.

         Both arguments are stripped first. The row is skipped when either is
         empty, or when ``unique_values`` is set and the same value has
         already been written.
         """
         # Imported here because the module header only imports the legacy
         # ``md5`` module, which hashlib supersedes.
         import hashlib

         value = value.strip()
         tag = tag.strip()
         if not value or not tag:
             return

         if self.unique_values:
             # hashlib needs bytes; text values are encoded, byte strings are
             # hashed as they are.
             raw = value if isinstance(value, bytes) else value.encode('utf-8')
             value_key = hashlib.md5(raw).hexdigest()
             if value_key in self.values:
                 return
             self.values.add(value_key)

         self._get_writer(tag).writerow([value, tag])
         self.counters[tag] = self.counters.get(tag, 0) + 1

     def close(self):
         """Close every open file and delete the ones that are too small.

         A file is deleted when its row count is below ``min_values_per_tag``;
         the 'Credit_card_or_prepaid_card' file is always deleted. Calling
         ``close()`` a second time is a no-op.
         """
         for tag, f in list(self.files.items()):
             count = self.counters.get(tag, 0)
             logging.debug("closing file: %s, count:%d" % (f.name, count))
             # Close before unlinking so buffered rows are flushed and the
             # handle is released.
             f.close()
             if count < self.min_values_per_tag or tag == 'Credit_card_or_prepaid_card':
                 os.unlink(f.name)
         self.files.clear()
         self._writers.clear()

 def clean_tag(tag):
     """Turn a product label into an identifier-like tag.

     Only the text before the first comma is kept; every run of characters
     outside ``[A-Za-z0-9]`` in it is then replaced by a single underscore.
     """
     head = tag.partition(',')[0]
     return re.sub(r'[^A-Za-z0-9]+', '_', head)


 def clean_value(value):
     """Scrub a complaint narrative so it is safe to store as one CSV field.

     Newlines, quotes, commas, slashes, braces, brackets and runs of two or
     more ``X`` are each replaced by a space, after which every whitespace run
     is collapsed to a single space. Leading and trailing spaces are kept.
     """
     scrubbed = re.sub(r'[\n\r\'\",\/\{\}\[\]]|X{2,}', ' ', value)
     return re.sub(r'\s+', ' ', scrubbed)


 ###############
 # process_csv - process the csv file for complaint data.
 #
 # The file has the following headers:
 #  * Date received
 #  * Product
 #  * Sub-product
 #  * Issue
 #  * Sub-issue
 #  * Consumer complaint narrative
 #  * Company public response
 #  * Company
 #  * State
 #  * ZIP code
 #  * Tags
 #  * Consumer consent provided?
 #  * Submitted via
 #  * Date sent to company
 #  * Company response to consumer
 #  * Timely response?
 #  * Consumer disputed?
 #  * Complaint ID

 def process_csv(incoming):
     """Split the complaints CSV at ``incoming`` into per-product files.

     The first row is treated as the header and skipped. Every other row that
     has a consumer narrative (column 5) has that narrative cleaned with
     ``clean_value``; it is kept only when it is 10 to 150 words long and is
     then written through an ``OutputWriter`` under the tag derived from the
     Product column (column 1). When ``OUTPUT_PRODUCTS`` is non-empty, only
     rows whose raw (Product, Sub-product) pair is listed there are written.
     """
     writers = OutputWriter()
     # 'rb' is the mode Python 2's csv module expects for its input file.
     with open(incoming, 'rb') as csvfile:
         reader = csv.reader(csvfile, delimiter=',', quotechar='"')
         next(reader, None)  # skip the header row; harmless on an empty file
         for row in reader:
             # csv.reader yields [] for blank lines and short lists for
             # truncated rows; neither can carry a narrative.
             if len(row) < 6 or not row[5]:
                 continue

             if OUTPUT_PRODUCTS and (row[1], row[2]) not in OUTPUT_PRODUCTS:
                 continue

             value = clean_value(row[5])

             # str.split() ignores leading/trailing whitespace, so padding left
             # behind by clean_value is not counted as extra words.
             word_count = len(value.split())
             if word_count < 10 or word_count > 150:
                 continue

             writers.write(clean_tag(row[1]), value)

     writers.close()


 # main function
 def main():
     """Command-line entry point: ``process.py <input csv>``.

     Configures DEBUG-level logging, makes sure the ``output`` directory
     exists and runs ``process_csv`` on the file named by the first argument.
     Exits with status 1 when that argument is missing.
     """
     if len(sys.argv) < 2:
         print("Usage: process.py [input csv]")
         # sys.exit instead of the bare exit() helper, which is injected by the
         # site module and is not available under ``python -S``.
         sys.exit(1)

     logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
     incoming = sys.argv[1]

     if not os.path.isdir("output"):
         os.makedirs("output")

     logging.info("Parsing file: %s" % incoming)
     process_csv(incoming)


 if __name__ == "__main__":
     main()
	# Downloads the consumer-complaint export from the Consumer Financial
	# Protection Bureau (unless a local copy already exists), splits it into
	# per-product CSVs with process.py, and appends the last 1000 rows of each
	# per-product file to training_set.csv.
	#
	# NOTE: training_set.csv is appended to, never truncated; delete it first if
	# a fresh training set is wanted.
	set -eu

	URL="https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD"
	FILE=complaints.csv

	if [ ! -f "$FILE" ]; then
		echo "-- Downloading file --"
		# --fail makes HTTP errors exit non-zero instead of saving the error page.
		# The download goes to a temporary name first so that an interrupted
		# transfer is not mistaken for a complete cached file on the next run.
		curl --fail -L "$URL" --output "$FILE.part"
		mv "$FILE.part" "$FILE"
	fi

	python process.py "$FILE"

	for filename in output/*.csv; do
		# An unmatched glob is left as the literal pattern; skip it.
		[ -e "$filename" ] || continue
		tail -n 1000 "$filename" >> training_set.csv
	done

	rm -r output/
	import sys
	import csv
	import re
	import md5
	import logging
	import os

	# Classes: optional filter on which complaints are exported. Each entry is a
	# (Product, Sub-product) tuple that process_csv compares against the raw CSV
	# columns; an empty list disables the filter so every product is written.

	OUTPUT_PRODUCTS = []
	# Example of restricting the export to specific product/sub-product pairs:
	#OUTPUT_PRODUCTS = [('Credit card',''),('Credit reporting',''),('Debt collection','Credit card'),('Consumer Loan','Vehicle loan')]


	class OutputWriter:
		"""Routes (value, tag) rows into one CSV file per tag.

		Each distinct tag gets its own file inside ``output_dir``, named after the
		lower-cased tag with every run of characters outside ``[a-z0-9]`` replaced
		by ``_``. Rows are written as ``value,tag``. When ``unique_values`` is
		true a value is written at most once across all tags (the first tag that
		presents it wins).

		Parameters:
			max_values_per_tag: stored on the instance; not enforced by this class.
			min_values_per_tag: files that received fewer rows than this are
				deleted by ``close()``.
			max_values: stored on the instance; not enforced by this class.
			unique_values: drop values whose MD5 digest has already been seen.
			output_dir: directory the per-tag files are created in; it must
				already exist.
		"""

		def __init__(self, max_values_per_tag=1000, min_values_per_tag=6000,
				max_values=100000, unique_values=True, output_dir="output"):
			self.files = {}      # tag -> open file object
			self.counters = {}   # tag -> number of rows written for that tag
			self.values = set()  # MD5 hex digests of the values already written
			self.total = 0       # kept for compatibility; never updated here
			self.unique_values = unique_values
			self.max_values_per_tag = max_values_per_tag
			self.max_values = max_values
			self.min_values_per_tag = min_values_per_tag
			self.output_dir = output_dir
			self._writers = {}   # tag -> csv.writer bound to self.files[tag]

		def _get_writer(self, tag):
			"""Return the csv.writer for ``tag``, creating its file on first use."""
			writer = self._writers.get(tag)
			if writer is None:
				if tag not in self.files:
					filename = re.sub(r'[^a-z0-9]+', '_', tag.lower()) + ".csv"
					self.files[tag] = open(os.path.join(self.output_dir, filename), 'w')
					logging.debug("creating new file: %s" % filename)
				# One writer per file is enough; cache it instead of rebuilding
				# it for every row.
				writer = csv.writer(self.files[tag], delimiter=',', quotechar='"')
				self._writers[tag] = writer
			return writer

		def write(self, tag, value):
			"""Append ``value`` to the file for ``tag``.

			Both arguments are stripped first. The row is skipped when either is
			empty, or when ``unique_values`` is set and the same value has
			already been written.
			"""
			# Imported here because the module header only imports the legacy
			# ``md5`` module, which hashlib supersedes.
			import hashlib

			value = value.strip()
			tag = tag.strip()
			if not value or not tag:
				return

			if self.unique_values:
				# hashlib needs bytes; text values are encoded, byte strings are
				# hashed as they are.
				raw = value if isinstance(value, bytes) else value.encode('utf-8')
				value_key = hashlib.md5(raw).hexdigest()
				if value_key in self.values:
					return
				self.values.add(value_key)

			self._get_writer(tag).writerow([value, tag])
			self.counters[tag] = self.counters.get(tag, 0) + 1

		def close(self):
			"""Close every open file and delete the ones that are too small.

			A file is deleted when its row count is below ``min_values_per_tag``;
			the 'Credit_card_or_prepaid_card' file is always deleted. Calling
			``close()`` a second time is a no-op.
			"""
			for tag, f in list(self.files.items()):
				count = self.counters.get(tag, 0)
				logging.debug("closing file: %s, count:%d" % (f.name, count))
				# Close before unlinking so buffered rows are flushed and the
				# handle is released.
				f.close()
				if count < self.min_values_per_tag or tag == 'Credit_card_or_prepaid_card':
					os.unlink(f.name)
			self.files.clear()
			self._writers.clear()

	def clean_tag(tag):
		"""Turn a product label into an identifier-like tag.

		Only the text before the first comma is kept; every run of characters
		outside ``[A-Za-z0-9]`` in it is then replaced by a single underscore.
		"""
		head = tag.split(',')[0]
		return re.sub(r'[^A-Za-z0-9]+', '_', head)


	def clean_value(value):
		"""Scrub a complaint narrative so it is safe to store as one CSV field.

		Newlines, quotes, commas, slashes, braces, brackets and runs of two or
		more ``X`` are each replaced by a space, after which every whitespace run
		is collapsed to a single space. Leading and trailing spaces are kept.
		"""
		# The ``|`` must be an unescaped alternation: "one of these characters"
		# OR "a run of X". Escaped, it only matches a literal pipe and nothing
		# in a normal narrative is ever removed.
		scrubbed = re.sub(r'[\n\r\'\",\/\{\}\[\]]|X{2,}', ' ', value)
		return re.sub(r'\s+', ' ', scrubbed)


	###############
	# process_csv - process the csv file for complaint data.
	#
	# The file has the following headers:
	# * Date received
	# * Product
	# * Sub-product
	# * Issue
	# * Sub-issue
	# * Consumer complaint narrative
	# * Company public response
	# * Company
	# * State
	# * ZIP code
	# * Tags
	# * Consumer consent provided?
	# * Submitted via
	# * Date sent to company
	# * Company response to consumer
	# * Timely response?
	# * Consumer disputed?
	# * Complaint ID

	def process_csv(incoming):
		"""Split the complaints CSV at ``incoming`` into per-product files.

		The first row is treated as the header and skipped. Every other row that
		has a consumer narrative (column 5) has that narrative cleaned with
		``clean_value``; it is kept only when it is 10 to 150 words long and is
		then written through an ``OutputWriter`` under the tag derived from the
		Product column (column 1). When ``OUTPUT_PRODUCTS`` is non-empty, only
		rows whose raw (Product, Sub-product) pair is listed there are written.
		"""
		writers = OutputWriter()
		# 'rb' is the mode Python 2's csv module expects for its input file.
		with open(incoming, 'rb') as csvfile:
			reader = csv.reader(csvfile, delimiter=',', quotechar='"')
			next(reader, None)  # skip the header row; harmless on an empty file
			for row in reader:
				# csv.reader yields [] for blank lines and short lists for
				# truncated rows; neither can carry a narrative.
				if len(row) < 6 or not row[5]:
					continue

				if OUTPUT_PRODUCTS and (row[1], row[2]) not in OUTPUT_PRODUCTS:
					continue

				value = clean_value(row[5])

				# str.split() ignores leading/trailing whitespace, so padding left
				# behind by clean_value is not counted as extra words.
				word_count = len(value.split())
				if word_count < 10 or word_count > 150:
					continue

				writers.write(clean_tag(row[1]), value)

		writers.close()


	# main function
	def main():
		"""Command-line entry point: ``process.py <input csv>``.

		Configures DEBUG-level logging, makes sure the ``output`` directory
		exists and runs ``process_csv`` on the file named by the first argument.
		Exits with status 1 when that argument is missing.
		"""
		if len(sys.argv) < 2:
			print("Usage: process.py [input csv]")
			# sys.exit instead of the bare exit() helper, which is injected by the
			# site module and is not available under ``python -S``.
			sys.exit(1)

		logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
		incoming = sys.argv[1]

		if not os.path.isdir("output"):
			os.makedirs("output")

		logging.info("Parsing file: %s" % incoming)
		process_csv(incoming)


	if __name__ == "__main__":
		main()