Created
May 10, 2017 15:04
-
-
Save coreyhermanson/3732ada84ebd349d386e3add311acf42 to your computer and use it in GitHub Desktop.
Strip company indicators from company terms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pyperclip | |
import re | |
from list_clipboard_manipulations import list_to_clipboard | |
# When True, the input terms are alphabetized before being cleaned
sort_alpha = False
# Cleaned terms, collected in processing order
good_list = []
# Number of terms whose text was changed by cleaning
delete_counter = 0
# Functions
def clean_text(regex_list, word):
    """
    Remove every match of each pattern in a regex list from a term.

    The patterns are applied one after another, in list order, each one
    operating on the result left by the previous pattern.

    :param regex_list: iterable of regular-expression patterns to strip
    :param word: the term to clean
    :return: the term with all pattern matches replaced by an empty string
    """
    cleaned = word
    for pattern in regex_list:
        # Substitute each match with nothing, i.e. delete it from the term
        cleaned = re.compile(pattern).sub('', cleaned)
    return cleaned
# Known company-suffix endings. Every pattern is anchored with `$`, so only a
# suffix at the very end of a term is removed. Raw strings keep `\.` as a
# regex escape instead of an invalid string escape sequence.
regexes = [
    r" Ltd\.?$",
    r" Inc\.?$",
    r" Corp\.?$",
    r" Corporation$",
    r" Limited$",
    r" Incorporated$",
    r" [Pp][Ll][Cc]$",
]

# All patterns combined into one alternation string (currently not used)
regex_string = "(" + ")|(".join(regexes) + ")"

# Grab text from the clipboard and convert it to a string
text = str(pyperclip.paste())

# Split into one term per line. splitlines() handles \r\n, \n and \r, so the
# script also works when the clipboard content does not use Windows line
# endings. Blank lines are dropped; the terms are alphabetized on request.
input_list = [line for line in text.splitlines() if line]
if sort_alpha:
    input_list.sort()

# Strip the company endings from each term and count how many terms changed
for term in input_list:
    new_term = clean_text(regexes, term)
    good_list.append(new_term)
    if new_term != term:
        delete_counter += 1

# Hand the cleaned terms to the imported list_to_clipboard helper, then report
list_to_clipboard(good_list)
print(str(delete_counter) + " terms were edited.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment