Created
August 28, 2022 14:47
-
-
Save gamingflexer/44b22a186680201a5cc178c9c043fa01 to your computer and use it in GitHub Desktop.
Bunch of Cleaning Functions | ML & Backend Dev
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os,re,string,json,emoji,csv | |
import numpy as np | |
import pandas as pd | |
def clean_text(text):
    '''Normalise raw text: drop emoji, markup, links and digit-bearing words.

    The steps run in this order:

    1. Emoji are converted to ``:alias:`` form and every ``:...:`` span is
       then deleted (ordinary text sitting between two colons is deleted
       as well).
    2. The text is lowercased.
    3. Anything inside square brackets is removed.
    4. HTML is stripped, then links (``http://``, ``https://``, ``www.``)
       and any remaining ``<...>`` tags are removed.
    5. Newlines are deleted and words containing a digit are removed.
    6. Every run of characters other than ASCII letters and
       ``? . ! , ¿ '`` is replaced by a single space, so these punctuation
       marks are kept rather than removed.

    Requires the third-party ``emoji``, ``beautifulsoup4`` and ``lxml``
    packages.
    '''
    # Imported here because the module-level imports never bring
    # BeautifulSoup into scope; without this the call below is a NameError.
    from bs4 import BeautifulSoup

    text = emoji.demojize(text)
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Parse as HTML and keep only the text nodes.
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # NOTE(review): newlines are deleted, not replaced by a space, so the
    # last word of one line is glued to the first word of the next.
    # Confirm this is intended before changing it.
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    # Keep only letters and the punctuation marks ? . ! , ¿ '
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text
def clean_contractions(text, mapping):
    '''Expand contractions, then strip ASCII punctuation and tidy spaces.

    Args:
        text: The string to clean.
        mapping: Dict of ``contraction -> expansion``. Each pair is applied
            as a plain substring replacement, in the dict's iteration
            order.

    Returns:
        The cleaned string. All characters in ``string.punctuation`` are
        removed, ``¿`` is surrounded by spaces, and runs of spaces are
        collapsed to one.
    '''
    # Fold apostrophe look-alikes into "'" before the mapping is applied.
    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")

    for contraction, expansion in mapping.items():
        text = text.replace(contraction, expansion)

    # Delete every ASCII punctuation character in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Pad sentence punctuation with spaces. Because ? . ! , were deleted
    # just above, "¿" (not part of string.punctuation) is the only
    # character this can still match.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse runs of spaces left behind by the steps above.
    text = re.sub(r" +", " ", text)
    return text
def clean_special_chars(text, punct, mapping):
    '''Replace special characters and put spaces around punctuation.

    Args:
        text: The string to clean.
        punct: Iterable of punctuation strings; each occurrence is wrapped
            in a space on both sides.
        mapping: Dict of ``character -> replacement`` applied first, in
            iteration order.

    Returns:
        The cleaned string. After the two passes above, a fixed set of
        leftovers is handled: U+200B becomes a space, the ellipsis
        character becomes `` ... ``, and U+FEFF and two Hindi words are
        deleted.
    '''
    for original, replacement in mapping.items():
        text = text.replace(original, replacement)

    for symbol in punct:
        text = text.replace(symbol, f' {symbol} ')

    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for target, substitute in leftovers:
        text = text.replace(target, substitute)
    return text
def correct_spelling(x, dic):
    '''Apply each ``misspelling -> correction`` pair in ``dic`` to ``x``.

    Replacements are plain substring substitutions performed one after
    another in the dict's iteration order, so a later pair also sees the
    output of earlier ones.
    '''
    corrected = x
    for wrong, right in dic.items():
        corrected = corrected.replace(wrong, right)
    return corrected
def remove_space(text):
    '''Trim the ends and squeeze every whitespace run to a single space.'''
    # split() with no arguments already discards leading and trailing
    # whitespace and treats any run of whitespace as one separator.
    return " ".join(text.split())
def text_preprocessing_pipeline(text):
    '''Run the full cleaning chain on ``text`` and return the result.

    Stages, in order: ``clean_text``, ``clean_contractions``,
    ``clean_special_chars``, ``correct_spelling``, ``remove_space``.

    NOTE(review): relies on the module-level names ``contraction_mapping``,
    ``punct``, ``punct_mapping`` and ``mispell_dict``, which are not
    defined in the visible part of this file. Confirm they are provided
    before this function is called.
    '''
    normalised = clean_text(text)
    expanded = clean_contractions(normalised, contraction_mapping)
    spaced = clean_special_chars(expanded, punct, punct_mapping)
    respelled = correct_spelling(spaced, mispell_dict)
    return remove_space(respelled)
# hexcode
def hexcode_clean(text):
    '''Return ``text`` with every non-ASCII character removed.

    Any character whose code point is above 0x7F is deleted; ASCII
    characters are left untouched.
    '''
    # The original assigned a (pattern, replacement, text) tuple here
    # instead of calling re.sub, so callers got a tuple back.
    return re.sub(r'[^\x00-\x7f]', r'', text)
#clean temp dir for files
def cleandir(path):
    '''Delete every .json, .csv and .png file found anywhere under ``path``.

    The directory tree is walked recursively; the extension match is
    case-insensitive. Directories and files with other extensions are
    left in place.
    '''
    removable = ('.json', '.csv', '.png')
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            if not filename.lower().endswith(removable):
                continue
            os.remove(os.path.join(folder, filename))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment