Created
October 23, 2018 05:08
-
-
Save rickcrawford/0b47bb082f869517226e6710982e7cd5 to your computer and use it in GitHub Desktop.
Complaint data processing scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the consumer complaint data published by the Consumer Financial
# Protection Bureau (CFPB), split it into per-product CSV files with
# process.py, and append the last 1000 lines of each of those files to
# training_set.csv.

# Abort on the first failing command so that a failed download or a failed
# process.py run never reaches the "rm -r output/" cleanup step.
set -e

URL="https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD"
FILE=complaints.csv

# Reuse a previously downloaded copy if one is already present.
if [ ! -f "$FILE" ]; then
    echo "-- Downloading file --"
    # --fail makes curl exit non-zero on an HTTP error response.
    curl -L --fail "$URL" --output "$FILE"
fi

python process.py "$FILE"

for filename in output/*.csv; do
    # If process.py kept no file, the glob stays a literal string; skip it.
    [ -e "$filename" ] || continue
    tail -n 1000 "$filename" >> training_set.csv
done

rm -r output/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import hashlib
import logging
import md5
import os
import re
import sys
# Optional whitelist of (Product, Sub-product) pairs, matched against the raw
# "Product" and "Sub-product" CSV columns by process_csv. An empty list
# disables the filter, so complaints for every product are written.
OUTPUT_PRODUCTS = []
# Example whitelist:
#OUTPUT_PRODUCTS = [('Credit card',''),('Credit reporting',''),('Debt collection','Credit card'),('Consumer Loan','Vehicle loan')]
class OutputWriter:
    """Write ``(value, tag)`` rows into one CSV file per tag under ``output/``.

    A file named after the lower-cased tag (every run of characters other
    than ``a-z0-9`` replaced by ``_``) is created lazily on the first row
    written for that tag. The ``output`` directory must already exist.

    Args:
        max_values_per_tag: stored on the instance; not used by any method
            of this class.
        min_values_per_tag: files that received fewer rows than this are
            deleted by ``close()``.
        max_values: stored on the instance; not used by any method of this
            class.
        unique_values: when true, a value that was already written (under
            any tag) is silently skipped.
    """

    def __init__(self, max_values_per_tag=1000, min_values_per_tag=6000, max_values=100000, unique_values=True):
        self.files = {}        # tag -> open file object
        self._writers = {}     # tag -> csv.writer bound to self.files[tag]
        self.counters = {}     # tag -> number of rows written
        self.values = set()    # MD5 hex digests of the values written so far
        self.total = 0         # initialised here; not updated by this class
        self.unique_values = unique_values
        self.max_values_per_tag = max_values_per_tag
        self.max_values = max_values
        self.min_values_per_tag = min_values_per_tag

    @staticmethod
    def _fingerprint(value):
        """Return the MD5 hex digest used to detect duplicate values."""
        # hashlib needs bytes; text is encoded explicitly so that non-ASCII
        # content cannot trigger an implicit ASCII conversion error.
        data = value if isinstance(value, bytes) else value.encode('utf-8')
        return hashlib.md5(data).hexdigest()

    def _get_writer(self, tag):
        """Return the csv writer for ``tag``, creating its file on first use."""
        if tag not in self.files:
            filename = re.sub(r'[^a-z0-9]+', '_', tag.lower()) + ".csv"
            self.files[tag] = open(os.path.join("output", filename), 'w')
            logging.debug("creating new file: %s", filename)
        if tag not in self._writers:
            # One writer per file is enough; reuse it for every later row.
            self._writers[tag] = csv.writer(self.files[tag], delimiter=',', quotechar='"')
        return self._writers[tag]

    def write(self, tag, value):
        """Append the row ``[value, tag]`` to the file belonging to ``tag``.

        Both arguments are stripped of surrounding whitespace first. Nothing
        is written when either is empty afterwards, or when ``unique_values``
        is set and the same value has been written before.
        """
        value = value.strip()
        tag = tag.strip()
        if not value or not tag:
            return
        if self.unique_values:
            value_key = self._fingerprint(value)
            if value_key in self.values:
                return
            self.values.add(value_key)
        self._get_writer(tag).writerow([value, tag])
        self.counters[tag] = self.counters.get(tag, 0) + 1

    def close(self):
        """Close every output file and delete the ones that are not kept.

        A file is deleted when it holds fewer than ``min_values_per_tag``
        rows or when its tag is ``'Credit_card_or_prepaid_card'``.
        """
        for tag, f in self.files.items():
            count = self.counters.get(tag, 0)
            logging.debug("closing file: %s, count:%d", f.name, count)
            # Close before unlinking: this flushes buffered rows of the files
            # that are kept and releases the handle of the ones removed.
            f.close()
            if count < self.min_values_per_tag or tag == 'Credit_card_or_prepaid_card':
                os.unlink(f.name)
        # Forget the closed handles so that a second close() is a no-op.
        self.files.clear()
        self._writers.clear()
def clean_tag(tag):
    """Return a compact label built from the part of ``tag`` before its first comma.

    Every run of characters other than ASCII letters and digits in that
    leading part is replaced by a single underscore.
    """
    leading_part = tag.split(',')[0]
    return re.sub(r'[^A-Za-z0-9]+', '_', leading_part)
def clean_value(value):
    """Return ``value`` with punctuation noise and redaction marks blanked out.

    Newlines, carriage returns, quotes, commas, slashes, braces, square
    brackets and every run of two or more ``X`` characters are each replaced
    by a space; afterwards every run of whitespace is collapsed into a single
    space. Leading or trailing whitespace is collapsed but not removed.
    """
    blanked = re.sub(r'[\n\r\'\",\/\{\}\[\]]|X{2,}', ' ', value)
    return re.sub(r'\s+', ' ', blanked)
def process_csv(incoming):
    """Split the complaint narratives of a CSV export into per-product files.

    The file has the following headers:

    * Date received
    * Product
    * Sub-product
    * Issue
    * Sub-issue
    * Consumer complaint narrative
    * Company public response
    * Company
    * State
    * ZIP code
    * Tags
    * Consumer consent provided?
    * Submitted via
    * Date sent to company
    * Company response to consumer
    * Timely response?
    * Consumer disputed?
    * Complaint ID

    The first row is treated as the header and skipped. For every other row
    the narrative (column 5) is cleaned with ``clean_value`` and, if it has
    between 10 and 150 words, handed to an ``OutputWriter`` tagged with the
    cleaned product name (column 1). When ``OUTPUT_PRODUCTS`` is non-empty,
    only rows whose raw (Product, Sub-product) pair is listed there are
    written.

    Args:
        incoming: path of the CSV file to read.
    """
    writers = OutputWriter()
    try:
        # Binary mode is what the Python 2 csv module expects; under
        # Python 3 this would have to be text mode with newline=''.
        with open(incoming, 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader, None)  # skip the header row
            for row in reader:
                # Blank or truncated rows have no narrative column.
                if len(row) < 6 or not row[5]:
                    continue
                value = clean_value(row[5])
                # str.split() ignores leading/trailing whitespace, so empty
                # tokens do not inflate the word count.
                word_count = len(value.split())
                if word_count < 10 or word_count > 150:
                    continue
                if not OUTPUT_PRODUCTS or (row[1], row[2]) in OUTPUT_PRODUCTS:
                    writers.write(clean_tag(row[1]), value)
    finally:
        # Run the writer's cleanup even when reading fails part-way.
        writers.close()
def main():
    """Command-line entry point: ``process.py [input csv]``.

    Prints a usage line and exits with status 1 when no input path is given.
    Otherwise configures DEBUG-level logging, creates the ``output``
    directory if it does not exist, and processes the given CSV file.
    """
    if len(sys.argv) < 2:
        print("Usage: process.py [input csv]")
        # sys.exit rather than the bare exit() helper, which is only
        # installed by the site module.
        sys.exit(1)
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
    incoming = sys.argv[1]
    if not os.path.isdir("output"):
        os.makedirs("output")
    logging.info("Parsing file: %s", incoming)
    process_csv(incoming)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment