Xiaoting Cai cxtadment

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

cxtadment / removeTopic.py

Created April 18, 2016 00:17

	def removeTopic(content):
		"""Remove ``#topic#`` hashtag spans from a microblog text.

		A topic is an opening ``#``, one or more characters that are not ``#``,
		and a closing ``#``. The previous character class (U+0000-U+9FFF)
		contained ``#`` and whitespace, so a greedy match ran from the first
		``#`` to the last one and deleted the ordinary text between two topics.

		Args:
			content: The text to clean.

		Returns:
			The text with every ``#...#`` span removed. A lone ``#`` with no
			closing partner is left untouched.
		"""
		# Excluding '#' from the body makes each match stop at its own closing
		# marker; a single re.sub pass also avoids re-scanning per match.
		return re.sub(r"#[^#]+#", "", content)

cxtadment / bag_of_words.py

Last active April 18, 2016 17:55

	# Builds the list of distinct words seen across all microblogs of one type.
	# NOTE(review): this is a truncated listing preview with indentation lost;
	# the name suggests the result is pickled, but no pickle call or return
	# statement is visible in this excerpt.
	def pickle_words_features(microblogType):
	# Fetch the Microblog records whose microblogType matches the argument
	# (presumably a MongoEngine-style query set -- confirm against the model).
	microblogs = Microblog.objects(microblogType=microblogType)

	# Flatten the `words` attribute of every microblog into one list.
	all_words = []
	for microblog in microblogs:
	all_words.extend(microblog.words)
	# Rebind the name to an nltk frequency distribution built from that list.
	all_words = nltk.FreqDist(all_words)

	# The keys of the distribution are the distinct words.
	words_features = list(all_words.keys())

cxtadment / polarity_count.py

Created April 18, 2016 00:13

	# Segments a microblog text and merges tokens found in ESCAPE_WORDS into
	# the token that follows them.
	# NOTE(review): truncated listing preview with indentation lost; whatever
	# the function does with seg_list afterwards is not visible here.
	def polarity_count(self, microblog_text):

	# Materialize jieba's segmentation as a list so it can be edited in place.
	seg_list = list(jieba.cut(microblog_text))
	t = 0
	# Stop one short of the end because the body reads seg_list[t + 1].
	while t < len(seg_list) - 1:
	# A token listed in ESCAPE_WORDS is prepended to its successor ...
	if seg_list[t] in ESCAPE_WORDS:
	seg_list[t + 1] = seg_list[t] + seg_list[t + 1]
	# ... and then dropped, so the merged token now sits at index t.
	seg_list.pop(t)
	# NOTE(review): presumably at loop level (not inside the `if`), otherwise
	# the loop would never advance past ordinary tokens -- confirm nesting.
	t += 1

cxtadment / pos_tagging.py

Created April 18, 2016 00:10

	def pos_tagging(self, microblog_text):
		"""Segment and tag a text, keeping only pairs accepted by seg_filter.

		Args:
			microblog_text: The text handed to ``pseg.cut``.

		Returns:
			A ``(words, taggings)`` tuple of two parallel lists holding the
			word and the tag of every pair that ``self.seg_filter`` accepted,
			in segmentation order.
		"""
		accepted = [
			(token, tag)
			for token, tag in pseg.cut(microblog_text)
			if self.seg_filter(token, tag)
		]
		# Split the accepted pairs back into the two parallel lists.
		words = [token for token, _ in accepted]
		taggings = [tag for _, tag in accepted]
		return words, taggings

cxtadment / seg_filter.py

Created April 18, 2016 00:08

	# Decides whether a segmented word should be rejected; each visible check
	# returns False for a word that must be dropped.
	# NOTE(review): truncated listing preview with indentation lost; `tagging`
	# is not used in the visible lines and the final return is not shown.
	def seg_filter(self, word, tagging):
	# filter stop words including punctuation
	if word in self.stopwords:
	return False
	# filter element containing number
	# (the lookahead succeeds when a digit appears anywhere in the word's first line)
	if re.match('^(?=.*\\d)', word):
	return False
	# if the word is in the topics
	if word in self.topics:
	return False

cxtadment / removeBracket.py

Last active April 18, 2016 00:02

	def removeBracket(content):
		"""Remove parenthesised asides from a microblog text.

		Full-width brackets are first normalised to ASCII ``(`` and ``)``; that
		normalisation stays in the returned text even where nothing is removed.
		Then every ``(...)`` group with at least one character inside is
		deleted. The previous character class (U+0000-U+9FFF) contained the
		brackets themselves, so a greedy match ran from the first ``(`` to the
		last ``)`` and deleted the text between two separate groups.

		Args:
			content: The text to clean.

		Returns:
			The text with bracketed groups removed. An empty ``()`` is kept,
			as before.
		"""
		content = content.replace('（', '(').replace('）', ')')
		# Each pass removes the innermost groups; repeat until nothing is left
		# so nested groups such as "f(g(x))" are removed completely.
		removed = 1
		while removed:
			content, removed = re.subn(r"\([^()]+\)", "", content)
		return content

cxtadment / removeForward.py

Created April 17, 2016 23:58

	def removeForward(content):
		"""Return the part of ``content`` that precedes the first ``//``.

		When ``//`` does not occur, the text is returned unchanged.
		"""
		# partition yields the whole string as the head when the separator is
		# absent, which covers the "no marker" case without a branch.
		head, _marker, _forwarded = content.partition("//")
		return head

cxtadment / removePrivate.py

Last active April 17, 2016 23:57

	def removePrivate(content):
		"""Remove ``@username`` mentions from a microblog text.

		A mention is ``@`` followed by one or more word characters, CJK
		ideographs (U+4E00-U+9FFF) or hyphens. The previous character class
		started at U+0000 instead of U+4E00, so it matched spaces and
		punctuation too and deleted everything after the first ``@``.

		Args:
			content: The text to clean.

		Returns:
			The text with every mention removed; surrounding whitespace and
			punctuation are left in place.
		"""
		# One re.sub pass replaces the findall + str.replace loop, which could
		# corrupt a later mention that has an earlier one as a prefix
		# (removing "@ab" everywhere turned "@abc" into "c").
		return re.sub(r"@[\w\u4e00-\u9fff-]+", "", content)

cxtadment / convertPun.py

Created April 17, 2016 23:54

	def convertPun(content):
		"""Normalise punctuation in a microblog text.

		Sentence-level marks become ``.``, the opening lenticular bracket
		becomes a space, and book-title marks and quotation marks are removed.
		Replacements run in the listed order, which matters because earlier
		substitutions can produce runs of dots that a later entry collapses.
		"""
		period_marks = ('，', '。', '？', '！', '……', ':', '「', '」', '.....', '】', '：', '、')
		dropped_marks = ('《', '》', '“', '”', '"', '"')

		for mark in period_marks:
			content = content.replace(mark, '.')

		content = content.replace('【', ' ')

		for mark in dropped_marks:
			content = content.replace(mark, '')

		return content

cxtadment / removeLinks.py

Created April 17, 2016 23:39

	def removeLink(content):
		"""Remove ``http://`` and ``https://`` URLs from a microblog text.

		The previous pattern wrote its alternation bars as ``\\|``, which a
		regular expression reads as a literal pipe character, so it could only
		match text containing actual ``|`` characters and never a real URL.
		The bars are now plain alternation.

		Args:
			content: The text to clean.

		Returns:
			The text with every URL removed. A URL ends at the first character
			outside the allowed set, such as whitespace or a CJK character.
		"""
		url_pattern = (
			r'http[s]?://'
			r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
		)
		# A single re.sub pass also avoids the findall + str.replace problem
		# where removing a short URL first corrupts a longer one sharing its prefix.
		return re.sub(url_pattern, '', content)

Newer Older