Last active
March 18, 2020 09:22
-
-
Save frenzy2106/8b30f699f09d4bc0797cde31c9afa1f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A t.co short link standing alone as a whitespace-delimited token.  The
# trailing lookahead leaves the following whitespace unconsumed, so several
# links in a row (or a tweet that is nothing but a link) are all matched.
_TCO_LINK = re.compile(r"(?:^|\s+)https://t\.co/[a-zA-Z0-9]*(?=\s|$)")

# (pattern, replacement) pairs applied, in this order, to the lower-cased
# tweet.  The two slang abbreviations are anchored with \b so they are only
# expanded as whole words and never inside words such as "your" or "hurt".
_EXPANSIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"can't", "can not"),
        (r"\bhv\b", "have"),
        (r"\bur\b", "your"),
        (r"ain't", "is not"),
        (r"don't", "do not"),
        (r"couldn't", "could not"),
        (r"shouldn't", "should not"),
        (r"won't", "will not"),
        (r"there's", "there is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"where's", "where is"),
        (r"who's", "who is"),
    )
)

_NON_WORD = re.compile(r"\W")
_DIGIT = re.compile(r"\d")
# Word characters that \W does not catch (accented letters, fraction signs,
# the underscore, ...).  NOTE(review): presumably residue of mis-decoded
# emoji / UTF-8 in the source data -- confirm against the dataset.
_JUNK_CHARS = re.compile(r"[ðâï¼½³ªãºæååçæåä¹µó¾_ëìêè]")

# Stand-alone tokens that are dropped from the cleaned text.
_SINGLE_LETTERS = frozenset("abcdefghijklmnopqrstuvwxyz")


def clean_corpus(text):
    """Normalise a collection of tweets into lower-case, space-separated words.

    Each item is converted with ``str()`` and then:

    1. stand-alone ``https://t.co/...`` links are removed;
    2. the text is lower-cased;
    3. a fixed set of contractions / abbreviations is expanded
       (``can't`` -> ``can not``, ``ur`` -> ``your``, ...);
    4. non-word characters, digits and a fixed set of leftover symbols are
       replaced by spaces;
    5. tokens consisting of a single ASCII letter are dropped;
    6. whitespace is collapsed to single spaces and trimmed.

    Args:
        text: Any iterable of tweets (list, tuple, pandas Series, generator,
            ...).  Items are iterated directly rather than indexed, so a
            0..n-1 positional index is not required.

    Returns:
        list[str]: one cleaned string per input item, in input order.
    """
    corpus = []
    for raw in text:
        tweet = _TCO_LINK.sub(" ", str(raw)).lower()

        # Expand contractions before punctuation is stripped, while the
        # apostrophes they rely on are still present.
        for pattern, replacement in _EXPANSIONS:
            tweet = pattern.sub(replacement, tweet)

        tweet = _NON_WORD.sub(" ", tweet)
        tweet = _DIGIT.sub(" ", tweet)
        tweet = _JUNK_CHARS.sub(" ", tweet)

        # Splitting on whitespace and re-joining drops every single-letter
        # token (including consecutive and trailing ones) and collapses and
        # trims the whitespace in a single step.
        words = [word for word in tweet.split() if word not in _SINGLE_LETTERS]
        corpus.append(" ".join(words))
    return corpus
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment