Last active
April 26, 2017 02:52
-
-
Save SarahTaylorProject/2a3a9723b31c7c26e3f046c3f8ffa7e9 to your computer and use it in GitHub Desktop.
"call the olden days"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TROVE API TEST | |
# Updates April 25th 2017 | |
# Searches Trove (National Library of Australia) for historical articles matching input town name and search term | |
# Writes results to csv file and then proceeds to read them aloud with the "say_something" method | |
# CHANGES | |
# This version takes two search terms DIRECT FROM USER (one should be a town name) | |
# It does not (as in previous versions) require a csv of town names... | |
# Note that the csv is likely to be useful in future, but for live purposes it is too time consuming | |
# The "say_something" method will now work for Mac or Linux | |
# This version has more methods | |
# It has more direct user input for more flexibility | |
# In particular, it lets the user CURATE articles first, before proceeding to the reading | |
# (this approach won't necessarily work better in the long term, but for live use, flexibility and speed is important) | |
# LIMITATIONS | |
# Does not deal with the full result list from Trove, only the first 100 results per search | |
# STILL TO DO: | |
# Search could be more effective for comprehensiveness | |
# e.g. sorting results differently (to avoid repetition), or possibly fetching the whole article rather than the snippet | |
# Clean up more code, it is still a bit messy and some features should be in methods | |
require "csv" | |
require "json" | |
require "net/http" | |
require "nokogiri" | |
require "date" | |
require "rbconfig" | |
require "os" | |
# GENERAL METHODS | |
# Prints +text+ and, where supported, speaks it aloud with the platform's
# command-line speech tool.
#
# text  - the text to print and speak.
# speed - speaking rate handed to the speech tool (default 135).
#
# The tool is picked from the value returned by operating_system:
# "linux" runs espeak, "mac" runs say, anything else prints a notice.
# The text is always printed, whether or not it can be spoken.
#
# The text is passed to the tool as its own argument (no shell is involved),
# so quotes, backticks or "$" inside article text can neither break the
# command line nor inject extra commands into it.
#
# Returns whatever Kernel#system returns (true/false/nil) on supported
# platforms; callers in this file ignore the return value.
def say_something(text, speed = 135)
  puts(text)
  case operating_system
  when "linux"
    system("espeak", "-s", speed.to_s, text.to_s)
  when "mac"
    system("say", "-r", speed.to_s, text.to_s)
  else
    puts "say_something does not yet support this operating system"
  end
end
# Prompts the user and returns one line of their input.
#
# prompt_text - text printed before reading (default "\nPlease enter value").
#
# Reads from $stdin so input can be redirected. At end-of-input (for example
# Ctrl-D or an exhausted pipe) gets returns nil; that is treated as an empty
# answer instead of raising NoMethodError.
#
# Returns the entered line without its trailing newline ("" on end-of-input).
def get_user_input(prompt_text = "\nPlease enter value")
  puts prompt_text
  $stdin.gets.to_s.chomp
end
# "Clears" the terminal by scrolling old content away with blank lines.
#
# line_count - how many blank lines to print (default 25, the value that was
#              previously hard-coded).
def clear_screen(line_count = 25)
  line_count.times { puts "\n" }
end
# Returns a short name for the operating system the script is running on.
#
# Asks the OS module (loaded by the file's `require "os"`) a series of
# predicate questions, in the order listed below, and returns the name of the
# first one that answers with a truthy value.
#
# A predicate only has to be truthy, not literally `true`, so a predicate that
# answers with some other truthy object is still recognised.
#
# Returns "linux", "mac", "windows", "java" or "bsd"; "unknown" when no
# predicate matches.
def operating_system
  # Order matters: it mirrors the precedence of the original if/elsif chain.
  checks = {
    "linux"   => :linux?,
    "mac"     => :mac?,
    "windows" => :windows?,
    "java"    => :java?,
    "bsd"     => :bsd?
  }
  match = checks.find { |_os_name, predicate| OS.public_send(predicate) }
  match ? match.first : "unknown"
end
# Reformats a numeric date string for reading aloud.
#
# text - date parts separated by "/" or "-", in the order Date.new expects
#        (year, month, day), e.g. "1917-04-25".
#
# Returns the date formatted as "<year> <day> <month name>",
# e.g. "1917 25 April". Date.new raises if the parts do not form a valid date.
def convert_date(text)
  pieces = text.split(/\/|\-/)
  numbers = pieces.map { |piece| piece.to_i }
  Date.new(*numbers).strftime("%Y %d %B")
end
# TROVE-SPECIFIC METHODS | |
# Percent-encodes +text+ for use inside the Trove query string.
# Letters, digits and "-._~" are kept; "+" is also kept so a user can still
# type 'this+AND+that'. Whitespace becomes "%20", and every other character
# is encoded byte by byte.
def trove_query_escape(text)
  text.to_s.gsub(/[^A-Za-z0-9\-._~+]/) do |char|
    if char =~ /\s/
      "%20"
    else
      char.bytes.map { |byte| format("%%%02X", byte) }.join
    end
  end
end

# Builds and sends a single newspaper-zone search request to the Trove API.
#
# current_search_town - town name to search for (plain string).
# current_search_term - search term (plain string; may contain "+AND+").
# trove_key           - the Trove API key.
#
# The query sent is "<term>+AND+<town>". Both values are percent-encoded, and
# curl is started with an argument array rather than through a shell, so
# characters such as quotes, "&", ";" or backticks in user input can neither
# corrupt the URL nor be run as shell commands.
#
# Returns the parsed XML document on success, or 0 if an error is raised
# while fetching or parsing (for example curl is not installed).
# Note: an empty or result-less response is NOT treated as an error.
def fetch_trove_results(current_search_town, current_search_term, trove_key)
  encoded_term = trove_query_escape(current_search_term)
  encoded_town = trove_query_escape(current_search_town)
  trove_api_request = "http://api.trove.nla.gov.au/result?key=#{trove_key}" \
                      "&zone=newspaper&encoding&q=#{encoded_term}+AND+#{encoded_town}"
  begin
    # Array form: curl is exec'd directly and the URL is a single argument.
    raw_xml = IO.popen(["curl", trove_api_request], &:read)
    Nokogiri::XML.parse(raw_xml)
  rescue StandardError
    puts "Error getting API results"
    0
  end
end
# Writes Trove search results to a csv file, one row per article.
#
# trove_api_results - parsed XML results; each '//article' node is one result.
#                     Anything that does not respond to xpath (such as the 0
#                     that fetch_trove_results returns on error) is treated as
#                     "no articles".
# output_file_name  - path of the csv file to (over)write.
# search_term       - written into every row to help assess results later.
# search_town       - written into every row to help assess results later.
#
# The file always starts with a header row. Column 6 holds the article date
# and is labelled "trove_article_date" (the label previously lacked "date").
# <strong> tags are removed from the snippet.
#
# Returns the number of article rows written.
def write_trove_results(trove_api_results, output_file_name, search_term, search_town)
  header = %w[
    search_term search_town result_number trove_url trove_article_heading
    trove_article_title trove_article_date trove_article_page trove_article_snippet
  ]
  articles = trove_api_results.respond_to?(:xpath) ? trove_api_results.xpath('//article') : []

  result_count = 0
  CSV.open(output_file_name, 'w') do |csv|
    csv << header
    articles.each do |trove_article|
      result_count += 1
      field = ->(name) { trove_article.xpath(name).text }
      csv << [
        search_term,
        search_town,
        result_count,
        field.call('troveUrl'),
        field.call('heading'),
        field.call('title'),
        field.call('date'),
        field.call('page'),
        field.call('snippet').gsub(/<strong>|<\/strong>/, "")
      ]
    end
  end
  result_count
end
# Lets the user choose which articles should later be read aloud.
#
# input_trove_file - csv written by write_trove_results (first row = header).
# num_articles     - maximum number of ARTICLES to offer to the user. The
#                    header row no longer counts against this limit, and a
#                    negative value is treated as 0.
#
# Only the fields used for reading are kept from each csv row: heading
# (column 4), date (column 6) and snippet (column 8). Duplicate
# heading/date/snippet combinations are offered once.
#
# For every article the user is prompted: answering 'n' (any case, surrounding
# spaces ignored) skips it, anything else keeps it.
#
# Returns an array of [heading, date, snippet, status] rows, with status 1 =
# read aloud and 0 = skip. As before, the first element is the csv header row
# with status 0, so article N sits at index N. Returns [] for an empty file.
def curate_trove_results(input_trove_file, num_articles)
  puts "\nCURATING ARTICLES ******"
  puts "Input from: #{input_trove_file}"

  rows = CSV.read(input_trove_file).map { |row| [row[4], row[6], row[8]] }.uniq
  return [] if rows.empty?

  header_row, *article_rows = rows
  puts "(skipping header row)"

  prompt_suffix = "\n\t'n' to skip this article for reading " \
                  "\n\tcarriage return or 'y' to keep this article for reading"
  limit = [num_articles.to_i, 0].max

  curated_articles = article_rows.first(limit).each_with_index.map do |(str_heading, str_date, str_snippet), index|
    article_number = index + 1
    status = 0
    begin
      clear_screen
      puts "\nArticle #{article_number}"
      puts "Heading: #{str_heading}"
      puts "Date: #{str_date}"
      puts "Snippet:\n#{str_snippet}"
      response = get_user_input("\n\tContinue with article #{article_number}?" + prompt_suffix)
      status = 1 unless response.to_s.strip.downcase == "n"
    rescue StandardError
      # StandardError only: Ctrl-C (Interrupt) must still be able to stop the script.
      puts "Error at record #{article_number}"
    end
    [str_heading, str_date, str_snippet, status]
  end

  [[*header_row, 0]] + curated_articles
end
# Reads the curated Trove articles aloud.
#
# curated_trove - array of [heading, date, snippet, status] rows as returned
#                 by curate_trove_results. Only rows whose status is 1 are
#                 read; the index printed is the row's position in the array.
#
# For each selected article the date is reformatted with convert_date, every
# occurrence of the heading is removed from the snippet (the snippet usually
# starts by repeating it), and runs of two or more dots, common in Trove
# snippets, are each replaced by a single space. The date, heading and
# snippet are then passed to say_something; an empty heading or snippet is
# printed but not spoken.
#
# A missing heading or snippet (empty csv cells are read back as nil) is
# treated as empty text rather than aborting the article. Any other error
# (e.g. an unparseable date) is reported as "Error at record N" and the loop
# continues. Only StandardError is rescued, so Ctrl-C still stops the script.
#
# Returns nothing of interest.
def read_curated_trove_results(curated_trove)
  puts "\nREADING CURATED ARTICLES ******"
  curated_trove.each_with_index do |(str_heading, str_date, str_snippet, status), i|
    next unless status == 1

    begin
      new_date = convert_date(str_date)
      heading = str_heading.to_s
      snippet = str_snippet.to_s
      snippet = snippet.gsub(heading, "") unless heading.empty?
      snippet = snippet.gsub(/\.{2,}/, " ")

      puts "\nReading Article #{i}"
      puts "\t#{new_date}"
      puts "\t#{heading}"
      puts "\t#{snippet}"
      say_something("date #{new_date}")
      say_something(heading) unless heading.empty?
      say_something(snippet) unless snippet.empty?
    rescue StandardError
      puts "Error at record #{i}"
    end
  end
end
# TROVE MAIN PROCEDURE
# Asks for a town, a search term and an article limit, fetches matching
# newspaper articles from Trove, writes them to a csv file, lets the user
# curate them and finally reads the selected ones aloud.
puts "\nSTART TROVE EXPERIMENT ******\n"

# Trove API key and default searches.
# NOTE(review): the fallback key below is committed in source. Set the
# TROVE_API_KEY environment variable to use a different key without editing
# this file, and consider revoking/rotating the committed one.
my_trove_key = ENV.fetch("TROVE_API_KEY", 'lop9t29sfm35vfkq')
default_town = 'Elmore'
default_search = 'tragedy'

# Get search town from user input, use default value if no (or blank) answer
say_something("Please enter a search town")
search_town = get_user_input("Please enter a search town (will default to '#{default_town}')").to_s.strip
search_town = default_town if search_town.empty?

# Get search term from user input, use default value if no (or blank) answer
# Note: use 'this+AND+that' for multiple terms in term
say_something("Please enter a search term")
search_term = get_user_input("Please enter a search term (will default to '#{default_search}')").to_s.strip
search_term = default_search if search_term.empty?

# Ask how many articles to curate. The first run of digits in the answer is
# used (so "5" and "5 please" both give 5); an answer without digits falls
# back to the default.
default_num_articles = 10
say_something("Do you want to limit the number of articles to curate?")
response = get_user_input("\nDo you want to limit number of articles to curate? (will default to #{default_num_articles})")
requested_digits = response.to_s[/\d+/]
num_articles = requested_digits ? requested_digits.to_i : default_num_articles
puts "\nLimiting to #{num_articles} articles"

say_something("Thankyou. Calling the olden days, please wait.")
say_something("Connecting to Trove database now.")
trove_api_results = fetch_trove_results(search_town, search_term, my_trove_key)

# Whitespace and path separators in the user's input are replaced, so the
# name always refers to a file in the current directory.
output_file_name = "trove_result_#{search_town}_#{search_term}.csv".gsub(/[\s\/\\]/, "_")

# fetch_trove_results returns 0 (not a document) when the request failed;
# only try to write results when there is a document to read from.
results_fetched = trove_api_results.respond_to?(:xpath)
result_count = 0
if results_fetched
  say_something("Writing results to file now.")
  result_count = write_trove_results(trove_api_results, output_file_name, search_term, search_town)
end

puts "\nSearch town: \n\t#{search_town}"
puts "Search term: \n\t#{search_term}"
puts "Result count: \n\t#{result_count}"
puts "Results written to: \n\t#{output_file_name}" if results_fetched

keep_going = result_count > 0
if keep_going
  say_something("#{result_count} articles available about #{search_town} #{search_term}")
else
  say_something("Sorry, no results to read.")
end

if keep_going
  # pause before continuing, give user the chance to exit
  say_something("Press enter to continue to curating articles")
  response = get_user_input("\nPress enter to continue to CURATING articles ******\n\t('exit' to escape)")
  keep_going = false if response.to_s.strip.downcase == "exit"
end

curated_trove_results = curate_trove_results(output_file_name, num_articles) if keep_going

if keep_going
  # pause before continuing
  say_something("Press enter to continue to reading articles aloud")
  response = get_user_input("\nPress enter to continue to READING articles ******\n\t('exit' to escape)")
  keep_going = false if response.to_s.strip.downcase == "exit"
end

read_curated_trove_results(curated_trove_results) if keep_going

puts "\nEND TROVE EXPERIMENT ******\n"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment