Last active
July 29, 2021 22:32
-
-
Save timothyrenner/dd487b9fd8081530509c to your computer and use it in GitHub Desktop.
Python Utilities for Tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
import string | |
from nltk.stem.lancaster import LancasterStemmer | |
from nltk.corpus import stopwords | |
#Gets the tweet time. | |
def get_time(tweet):
    """Return the tweet's creation time as a naive ``datetime``.

    ``tweet`` is a mapping with a ``'created_at'`` string such as
    ``"Wed Aug 27 13:08:45 +0000 2008"``. The ``+0000`` offset is matched
    literally by the format string, so the result carries no ``tzinfo``
    and any other offset raises ``ValueError``.
    """
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
#Gets all hashtags. | |
def get_hashtags(tweet):
    """Return the ``'text'`` of every hashtag entity in the tweet.

    ``tweet`` is a mapping with an ``'entities'`` mapping. If that mapping
    has no ``'hashtags'`` list, an empty list is returned, matching how
    ``get_text_cleaned`` treats absent entity lists.
    """
    return [tag['text'] for tag in tweet['entities'].get('hashtags', [])]
#Gets the screen names of any user mentions. | |
def get_user_mentions(tweet):
    """Return the ``'screen_name'`` of every user-mention entity in the tweet.

    ``tweet`` is a mapping with an ``'entities'`` mapping. If that mapping
    has no ``'user_mentions'`` list, an empty list is returned, matching
    how ``get_text_cleaned`` treats absent entity lists.
    """
    return [
        mention['screen_name']
        for mention in tweet['entities'].get('user_mentions', [])
    ]
#Gets the text, sans links, hashtags, mentions, media, and symbols. | |
def get_text_cleaned(tweet):
    """Return the tweet text with all entity spans cut out.

    Removes the character ranges given by the ``'indices'`` pair of every
    entry under ``tweet['entities']`` for the kinds ``urls``, ``hashtags``,
    ``user_mentions``, ``media`` and ``symbols``. Kinds that are absent
    from ``tweet['entities']`` are skipped. Surrounding whitespace is left
    untouched, so removed entities can leave double spaces behind.

    ``tweet`` must be a mapping with ``'text'`` and ``'entities'`` keys;
    passing a bare string raises ``TypeError``.
    """
    text = tweet['text']
    entities = tweet['entities']

    # Collect every (start, stop) range that has to be removed.
    spans = []
    for kind in ('urls', 'hashtags', 'user_mentions', 'media', 'symbols'):
        if kind in entities:
            for entity in entities[kind]:
                spans.append((entity['indices'][0], entity['indices'][1]))

    # Walk the ranges left to right and keep only the text between them.
    # Tracking a cursor means duplicate or overlapping ranges (e.g. the
    # same span listed under two entity kinds) are removed once, instead
    # of eating text that follows the span.
    spans.sort()
    kept = []
    cursor = 0
    for start, stop in spans:
        if start > cursor:
            kept.append(text[cursor:start])
        cursor = max(cursor, stop)
    kept.append(text[cursor:])
    return ''.join(kept)
#Sanitizes the text by removing front and end punctuation, | |
#making words lower case, and removing any empty strings. | |
def get_text_sanitized(tweet):
    """Return the cleaned tweet text as lower-case, space-joined words.

    The text from ``get_text_cleaned`` is split on whitespace. Each word
    is lower-cased and has leading and trailing ``string.punctuation``
    characters stripped. Words that consist only of punctuation are
    dropped. Punctuation inside a word is kept.
    """
    # split() already yields whitespace-free tokens, so only punctuation
    # needs stripping; do it once and filter on the stripped result.
    words = (
        word.lower().strip(string.punctuation)
        for word in get_text_cleaned(tweet).split()
    )
    return ' '.join(word for word in words if word)
#Gets the text, clean it, make it lower case, stem the words, and split | |
#into a vector. Also, remove stop words. | |
def get_text_normalized(tweet):
    """Return the tweet as a list of stemmed, stop-word-free tokens.

    The text is sanitized with ``get_text_sanitized``, split on
    whitespace, filtered against NLTK's English stop-word list, and each
    remaining token is stemmed with ``LancasterStemmer``.
    """
    tokens = get_text_sanitized(tweet).split()

    # Load the stop-word list once and use a set, rather than re-reading
    # the list for every token.
    stop_words = set(stopwords.words('english'))

    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in tokens if token not in stop_words]
@rahulsaini In the script, the tweet variable is a Python object (a dict) with a key for each attribute; the keys correspond to the tweet fields described in the Tweet field reference documentation. In your case you are reading from a CSV file, so a tweet object with that structure is not available. You either have to pass in a raw tweet object as input, or change the code to fit your data. Hope this helps!
A tweet's structure is not always the same: there can be retweets and extended tweets, for which your script doesn't work.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Timothy, I am using your script TweetUtils.py to clean Twitter text data. I am a beginner at Python, and I have my tweets as a CSV file (format shown below, with one example data record):
"text","favorited","favoriteCount","replyToSN","created","truncated","replyToSID","id","replyToUID","statusSource","screenName","retweetCount","isRetweet","retweeted","longitude","latitude"
"#SkinTags are associated with Type 2 Diabetes Mellitus, and are a common sign of prediabetes.",FALSE,0,NA,2016-08-09 15:23:11,FALSE,NA,"763032885519605764",NA,"<a href=""http://www.mrsaactionuk.net"" rel=""nofollow"">MrsaActionApp","NoMoreMoles",0,FALSE,FALSE,NA,NA
How do I run this Python script on my CSV file? That is, how do I invoke it for a tweet's text?
If I keep just "text" of tweet like
"#SkinTags are associated with Type 2 Diabetes Mellitus, and are a common sign of prediabetes." , do I invoke with resultOutput as below
I tried this in the IntelliJ PyCharm editor with Python 3.4.2, but when I run it I get an error like this:
Traceback (most recent call last):
File "C:/DataScienceWorks/SocialMediaDataAnalysis/TweetUtils.py", line 89, in
outputTweet = get_text_normalized('RT @MargaretLarsenT: Is type 2 diabetes associated with osteoarthritis? - Medivizor https://t.co/TwfzRWdIDl')
File "C:/DataScienceWorks/SocialMediaDataAnalysis/TweetUtils.py", line 77, in get_text_normalized
text = get_text_sanitized(tweet).split()
File "C:/DataScienceWorks/SocialMediaDataAnalysis/TweetUtils.py", line 69, in get_text_sanitized
for w in get_text_cleaned(tweet).split()
File "C:/DataScienceWorks/SocialMediaDataAnalysis/TweetUtils.py", line 26, in get_text_cleaned
text = tweet['text']
TypeError: string indices must be integers
Kindly help. Thanks.