Skip to content

Instantly share code, notes, and snippets.

@coreyhermanson
Created May 10, 2017 15:04
Show Gist options
  • Save coreyhermanson/3732ada84ebd349d386e3add311acf42 to your computer and use it in GitHub Desktop.
Save coreyhermanson/3732ada84ebd349d386e3add311acf42 to your computer and use it in GitHub Desktop.
Strip company indicators from company terms
#!/usr/bin/env python
import pyperclip
import re
from list_clipboard_manipulations import list_to_clipboard
# Module-level state shared by the script body below.
delete_counter = 0   # number of terms that were changed by the cleaning step
good_list = []       # cleaned terms, in input order
sort_alpha = False   # set to True to alphabetize the input before cleaning
# Functions
def clean_text(regex_list, word):
    """
    Remove every match of each pattern in a regex list from a term.

    The patterns are applied one after another, in list order, so each
    pattern operates on the output of the previous substitution.

    :param regex_list: iterable of regex patterns whose matches are deleted
    :param word: the term to clean
    :return: the term after all patterns have been substituted away
    """
    cleaned = word
    for pattern in regex_list:
        # Replace every match of this pattern with the empty string.
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
# Known company-suffix endings. Each pattern starts with a space and is
# anchored with `$`, so it only matches a suffix at the end of a term.
# Raw strings are used so the backslash in `\.` reaches the regex engine
# instead of being treated as an (invalid) string escape sequence.
regexes = [
    r" Ltd\.?$",
    r" Inc\.?$",
    r" Corp\.?$",
    r" Corporation$",
    r" Limited$",
    r" Incorporated$",
    r" [Pp][Ll][Cc]$",
]
# All suffix patterns combined into one alternation; currently unused
regex_string = "(" + ")|(".join(regexes) + ")"

# Grab text from the clipboard and coerce it to a string
text = str(pyperclip.paste())

# Split into one term per line on any line ending (\r\n, \n or \r) so that
# clipboard text with non-Windows newlines is also split, then drop blanks
input_list = [line for line in text.splitlines() if line]
if sort_alpha:
    input_list.sort()

# Strip known company suffixes from each term, counting how many terms changed
for term in input_list:
    new_term = clean_text(regexes, term)
    good_list.append(new_term)
    if new_term != term:
        delete_counter += 1

# Hand the cleaned terms to the clipboard helper and report the edit count
# NOTE(review): no emptiness check happens here; presumably list_to_clipboard
# copes with an empty list - confirm against its implementation
list_to_clipboard(good_list)
print(str(delete_counter) + " terms were edited.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment