Skip to content

Instantly share code, notes, and snippets.

@ccd97
Created October 3, 2017 16:34
Show Gist options
  • Save ccd97/768f7fd030b3cab8a71f3712679a5c4f to your computer and use it in GitHub Desktop.
Save ccd97/768f7fd030b3cab8a71f3712679a5c4f to your computer and use it in GitHub Desktop.
Get glossary from text
import requests
# pip install numpy
import numpy as np
# pip install pandas
import pandas as pd
# pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# ------------------- User-editable settings -------------------
# Oxford Dictionaries API credentials: put your own id and key here.
app_id = "ffffffff"
app_key = "ffffffffffffffffffffffffffffffff"

# Text file to read, and the file the glossary lines are written to.
input_file_name = "report.txt"
output_file_name = "glossary.txt"

# The lookup loop stops once this many definitions have been written.
max_word = 20
# ---------------------------------------------------------------
# Read the whole input document. The context manager closes the file even if
# reading fails; ``in_file`` stays bound afterwards, so a later
# ``in_file.close()`` is a harmless no-op.
with open(input_file_name, 'r') as in_file:
    data = in_file.read()

# Fit TF-IDF on the single document, then collapse the resulting matrix into
# a flat Python list holding one weight per vocabulary term.
vectorize = TfidfVectorizer()
tvec_weights = vectorize.fit_transform([data])
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back on old releases.
if hasattr(vectorize, 'get_feature_names_out'):
    terms = vectorize.get_feature_names_out()
else:
    terms = vectorize.get_feature_names()

weights_df = pd.DataFrame({'term': terms, 'weight': weights})
# Ascending order puts the lowest-weight terms first.
# NOTE(review): presumably intentional so that rare terms are looked up
# first -- confirm; switch to ascending=False to favour frequent terms.
weights_df = weights_df.sort_values(by='weight', ascending=True)

# Open the output only after the analysis succeeded, so a failure above does
# not truncate an existing glossary or leave a dangling handle.
out_file = open(output_file_name, 'w')
def get_defination(word):
    """Look up the first dictionary definition of *word*.

    Sends a GET request for the lower-cased word to the Oxford Dictionaries
    entries endpoint, authenticating with the module-level ``app_id`` and
    ``app_key`` headers.

    Args:
        word: The term to look up.

    Returns:
        The first definition of the first sense of the first entry, or
        ``None`` when the response body is not JSON or does not contain
        that path.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout). Network failures are deliberately not
            swallowed so they are not mistaken for "no definition".
    """
    url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/en/' + word.lower()
    # A timeout keeps one unresponsive request from hanging the whole run.
    r = requests.get(
        url,
        headers={'app_id': app_id, 'app_key': app_key},
        timeout=10,
    )
    try:
        r_dict = r.json()
        return r_dict['results'][0]['lexicalEntries'][0]['entries'][0]['senses'][0]['definitions'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not valid JSON (e.g. an error page).
        # KeyError / IndexError / TypeError: the JSON lacks the expected
        # nesting. Either way there is no usable definition; anything else
        # (KeyboardInterrupt included) is allowed to propagate.
        return None
# Walk the ranked terms, writing "<term> - <definition>" lines until
# ``max_word`` definitions have been collected or the terms run out.
i = 0  # number of definitions written so far
try:
    for w in weights_df['term']:
        # Checking the limit before the lookup means max_word <= 0 writes
        # nothing, and no request is made once the quota is reached.
        if i >= max_word:
            break
        print("Getting defination for " + w)
        defination = get_defination(w)
        if defination is None:
            print("Cannot find defination of " + w)
            continue
        i += 1
        print("Defination of " + w + " -> " + defination)
        out_file.write("%s - %s\n" % (w, defination))
finally:
    # Always release the file handles, even when a lookup raises mid-loop,
    # so whatever was already written reaches the disk.
    in_file.close()
    out_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment