Last active
March 12, 2018 07:39
-
-
Save wannaphong/857d564c6dbc549f29f698564f2a07f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm.

As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.

Thai language support by Mr. Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com>
"""
import string | |
from collections import defaultdict, Counter | |
from itertools import chain, groupby, product | |
import pythainlp | |
class Rake(object):
    """Rapid Automatic Keyword Extraction (RAKE) using pythainlp tokenizers.

    Text is split into sentences and words, stopwords and punctuation act as
    phrase delimiters, and every remaining run of words becomes a contender
    phrase.  Each word is scored as ``degree / frequency`` and a phrase's
    score is the sum of its word scores.

    Results are stored on the instance and are ``None`` until one of the
    ``extract_keywords_from_*`` methods has been called.
    """

    def __init__(self, stopwords=None, punctuations=None, language='thai'):
        """Constructor.

        :param stopwords: Iterable of words to be ignored for keyword
                          extraction. When ``None``,
                          ``pythainlp.corpus.stopwords.words(language)`` is
                          used.
        :param punctuations: Iterable of punctuation tokens to be ignored for
                             keyword extraction. When ``None``,
                             ``string.punctuation`` is used (each character is
                             one token).
        :param language: Language name passed to the pythainlp stopword
                         corpus; only consulted when `stopwords` is ``None``.
        """
        # If stopwords are not provided we use language stopwords by default.
        self.stopwords = stopwords
        if self.stopwords is None:
            self.stopwords = pythainlp.corpus.stopwords.words(language)

        # If punctuations are not provided we ignore all punctuation symbols.
        self.punctuations = punctuations
        if self.punctuations is None:
            self.punctuations = string.punctuation

        # All tokens which act as phrase breaks during keyword extraction.
        # A set gives O(1) membership tests while grouping words.
        self.to_ignore = set(chain(self.stopwords, self.punctuations))

        # Results extracted from the provided text; filled in by the
        # ``extract_keywords_from_*`` methods.
        self.frequency_dist = None
        self.degree = None
        self.rank_list = None
        self.ranked_phrases = None

    def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.

        The text is split with ``pythainlp.tokenize.sent_tokenize`` and the
        resulting sentences are handed to
        :meth:`extract_keywords_from_sentences`.

        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = pythainlp.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences)

    def extract_keywords_from_sentences(self, sentences):
        """Method to extract keywords from the list of sentences provided.

        Rebuilds the frequency distribution, the word degrees and the ranked
        phrase list, replacing any results from a previous call.

        :param sentences: Text to extract keywords from, provided as a list
                          of strings, where each string is a sentence.
        """
        phrase_list = self._generate_phrases(sentences)
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

    def get_ranked_phrases(self):
        """Method to fetch ranked keyword strings.

        :return: List of strings where each string represents an extracted
                 keyword string, highest score first; ``None`` if nothing has
                 been extracted yet.
        """
        return self.ranked_phrases

    def get_ranked_phrases_with_scores(self):
        """Method to fetch ranked keyword strings along with their scores.

        :return: List of tuples where each tuple is formed of a score and the
                 extracted keyword string, highest score first.
                 Ex: (4.0, 'red apples'). ``None`` if nothing has been
                 extracted yet.
        """
        return self.rank_list

    def get_word_frequency_distribution(self):
        """Method to fetch the word frequency distribution in the given text.

        :return: Dictionary (Counter) of the format `word -> frequency`;
                 ``None`` if nothing has been extracted yet.
        """
        return self.frequency_dist

    def get_word_degrees(self):
        """Method to fetch the degree of words in the given text.

        Degree is the sum of co-occurrences of the word with the words of the
        phrases it appears in (the word itself included).

        :return: Dictionary (defaultdict) of the format `word -> degree`;
                 ``None`` if nothing has been extracted yet.
        """
        return self.degree

    def _build_frequency_dist(self, phrase_list):
        """Builds frequency distribution of the words in the given phrases.

        :param phrase_list: Iterable of word tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))

    def _build_word_co_occurance_graph(self, phrase_list):
        """Builds the co-occurrence graph of words in the given phrases to
        compute the degree of each word.

        :param phrase_list: Iterable of word tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        co_occurance_graph = defaultdict(lambda: defaultdict(int))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurrences of the
            # word with other words in the phrase (every ordered pair,
            # including a word paired with itself).
            #
            # Note: Keep the co-occurrence graph as is, to help facilitate its
            # use in other creative ways if required later.
            for (word, coword) in product(phrase, phrase):
                co_occurance_graph[word][coword] += 1

        # A word's degree is the total weight of its row in the graph.
        self.degree = defaultdict(int)
        for key in co_occurance_graph:
            self.degree[key] = sum(co_occurance_graph[key].values())

    def _build_ranklist(self, phrase_list):
        """Method to rank each contender phrase using the formula

              phrase_score = sum of scores of words in the phrase.
              word_score = d(w)/f(w) where d is degree and f is frequency.

        Requires the frequency distribution and word degrees to have been
        built from the same `phrase_list` beforehand.

        :param phrase_list: Iterable of word tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        self.rank_list = []
        for phrase in phrase_list:
            rank = 0.0
            for word in phrase:
                rank += 1.0 * self.degree[word] / self.frequency_dist[word]
            self.rank_list.append((rank, ' '.join(phrase)))
        # Tuples sort by score first, then by phrase text; highest first.
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]

    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the
        text document.

        Words are produced by ``pythainlp.word_tokenize`` and lower-cased.

        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase. Because a set is used,
                 a phrase that occurs several times is kept only once.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [
                word.lower() for word in pythainlp.word_tokenize(sentence)
            ]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list

    def _get_phrase_list_from_words(self, word_list):
        """Method to create contender phrases from the list of words that form
        a sentence by dropping stopwords and punctuations and grouping the
        remaining words into phrases. Ex:

        Sentence: Red apples, are good in flavour.
        List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
        List after dropping punctuations and stopwords.
        List of words: ['red', 'apples', *, *, good, *, 'flavour']
        List of phrases: [('red', 'apples'), ('good',), ('flavour',)]

        :param word_list: List of words which form a sentence when joined in
                          the same order.
        :return: List of contender phrases (tuples of words) that are formed
                 after dropping stopwords and punctuations.
        """
        # NOTE(review): tokens are only compared against `to_ignore`; if the
        # tokenizer emits whitespace tokens they stay inside phrases - confirm
        # whether that is intended.
        #
        # groupby splits the sentence into alternating runs of kept / ignored
        # tokens; only the kept runs (key is True) become phrases.
        groups = groupby(word_list, lambda x: x not in self.to_ignore)
        return [tuple(group[1]) for group in groups if group[0]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment