Skip to content

Instantly share code, notes, and snippets.

@SarahTaylorProject
Created April 25, 2017 18:47
Show Gist options
  • Save SarahTaylorProject/bda9e917ab256877b3ec9af1c1f49fdf to your computer and use it in GitHub Desktop.
Save SarahTaylorProject/bda9e917ab256877b3ec9af1c1f49fdf to your computer and use it in GitHub Desktop.
Trove update
# TROVE API TEST
# Updates April 25th 2017
# Searches Trove (National Library of Australia) for historical articles matching input town name and search term
# Writes results to csv file and then proceeds to read them aloud with the "say_something" method
# CHANGES
# This version takes two search terms DIRECT FROM USER (one should be a town name)
# It does not (as in previous versions) require a csv of town names...
# Note that the csv is likely to be useful in future, but for live purposes it is too time consuming
# The "say_something" method will now work for Mac or Linux
# This version has more methods
# It has more direct user input for more flexibility
# In particular, it lets the user CURATE articles first, before proceeding to the reading
# (this approach won't necessarily work better in the long term, but for live use, flexibility and speed is important)
# LIMITATIONS
# Does not deal with the full result list from Trove, only the first 100 results per search
# STILL TO DO:
# Search could be more effective for comprehensiveness
# e.g. sorting results differently (to avoid repetition), or possibly fetching the whole article rather than the snippet
# Clean up more code, it is still a bit messy and some features should be in methods
require "csv"
require "json"
require "net/http"
require "nokogiri"
require "date"
require "rbconfig"
require "os"
# GENERAL METHODS
def say_something(text, speed = 135)
  # Speaks the given text aloud using the platform's command-line speech tool.
  # Input: text to speak, and a speed value passed to the tool
  #        ("espeak -s" on linux, "say -r" on mac)
  # Returns: nothing useful; prints a message if the operating system is not
  #          supported or the speech tool could not be started
  # The speech tool is started WITHOUT a shell (argument arrays), so quotes,
  # backticks or "$(...)" inside article text are spoken rather than executed.
  case operating_system()
  when "linux"
    begin
      # espeak reads the text to speak from standard input
      IO.popen(["espeak", "-s", speed.to_s], "w") { |espeak| espeak.puts(text.to_s) }
    rescue SystemCallError => e
      # e.g. espeak is not installed, or it exited before reading its input
      puts "say_something could not run the speech command: #{e.message}"
    end
  when "mac"
    # system returns nil when the command could not be executed at all
    started = system("say", "-r", speed.to_s, text.to_s)
    puts "say_something could not run the speech command: say" if started.nil?
  else
    puts "say_something does not yet support this operating system"
  end
end
def get_user_input(prompt_text = "\nPlease enter value")
  # Prints a prompt and reads one line of direct input from the user.
  # Input: the prompt text to display
  # Returns: the line typed by the user, without its trailing newline;
  #          an empty string if standard input is already at end-of-file
  puts prompt_text
  # STDIN.gets returns nil at end-of-file (e.g. Ctrl-D or exhausted piped input);
  # to_s turns that into "" instead of raising NoMethodError on chomp
  STDIN.gets.to_s.chomp
end
def operating_system()
  # Reports a short name for the operating system this script is running on.
  # Relies on the 'os' gem's predicate methods (OS.linux?, OS.mac?, ...)
  # Checks are tried in the listed order and the first one that answers
  # exactly true wins.
  # Returns: "linux", "mac", "windows", "java" or "bsd";
  #          "unknown" when none of the checks answers true
  candidates = [
    [:linux?, "linux"],
    [:mac?, "mac"],
    [:windows?, "windows"],
    [:java?, "java"],
    [:bsd?, "bsd"]
  ]
  # find stops at the first match, so later predicates are not called
  matched = candidates.find { |predicate, _label| OS.public_send(predicate) == true }
  return "unknown" if matched.nil?
  matched.last
end
def convert_date(text)
  # Re-formats a date string for reading aloud.
  # Input: date text whose parts are separated by "/" or "-"; the parts are
  #        converted to integers and handed to Date.new in the order given
  # Returns: the date formatted as "%Y %d %B"
  #          (year, two-digit day, full month name), e.g. "1925 14 March"
  date_parts = text.split(/\/|\-/).map { |piece| piece.to_i }
  parsed_date = Date.new(*date_parts)
  return(parsed_date.strftime("%Y %d %B"))
end
# TROVE-SPECIFIC METHODS
def fetch_trove_results(current_search_town, current_search_term, trove_key)
  # This method constructs and sends a single search request to Trove (of a very specific format!)
  # Input: two search parameters (town name, and search term) and the API key
  # Return: parsed XML of results (if successful) or 0 if an error was encountered
  # Note: will not necessarily fail if no results are returned
  # The search town and search term are both just passed as strings
  #
  # The request is made with Net::HTTP rather than by shelling out to curl, so
  # user-typed text can never be interpreted by a shell, and network failures
  # raise an exception that the rescue below turns into the 0 return value.

  # Escape a value for use inside the query string. Spaces become "+", and a
  # "+" typed by the user is kept as "+" so that the 'this+AND+that' style of
  # multi-term input keeps working; everything else unsafe is percent-encoded.
  escape = lambda { |value| URI.encode_www_form_component(value.to_s).gsub("%2B", "+") }

  trove_api_request = "http://api.trove.nla.gov.au/result?key=#{escape.call(trove_key)}"
  trove_api_request += "&zone=newspaper&encoding&q=#{escape.call(current_search_term)}+AND+#{escape.call(current_search_town)}"

  begin
    response_body = Net::HTTP.get(URI(trove_api_request))
    trove_api_results = Nokogiri::XML.parse(response_body)
  rescue StandardError
    puts "Error getting API results"
    return(0)
  end
  return(trove_api_results)
end
def write_trove_results(trove_api_results, output_file_name, search_term, search_town)
  # This method writes the Trove XML results to a csv file, one article per row
  # Input: XML results, output file name, search term and search town
  # (the latter two are just written to the csv to help assess results later)
  # Each '//article' element in the XML counts as one result
  # Returns: the result count after writing the file
  # If the results object cannot be queried (for example the 0 that
  # fetch_trove_results returns on error), only the header row is written
  # and 0 is returned.
  header = ["search_term", "search_town", "result_number", "trove_url", "trove_article_heading",
            "trove_article_title", "trove_article_date", "trove_article_page", "trove_article_snippet"]
  result_count = 0
  CSV.open(output_file_name, 'w') do |csv|
    csv << header
    # guard against the error sentinel, which has no xpath method
    next unless trove_api_results.respond_to?(:xpath)
    trove_api_results.xpath('//article').each do |trove_article|
      result_count += 1
      # Trove wraps the matched words in <strong> tags; strip them from the snippet
      snippet = trove_article.xpath('snippet').text.gsub(/<strong>|<\/strong>/, "")
      csv << [
        search_term,
        search_town,
        result_count,
        trove_article.xpath('troveUrl').text,
        trove_article.xpath('heading').text,
        trove_article.xpath('title').text,
        trove_article.xpath('date').text,
        trove_article.xpath('page').text,
        snippet
      ]
    end#of article
  end#of writing csv
  return(result_count)
end
def curate_trove_results(input_trove_file)
  # This method helps the user to select the articles they want read aloud
  # Input: a csv of Trove search results, as written by 'write_trove_results'
  # Note: only takes in the more interesting parts of each row:
  #       heading (field 4), date (field 6), snippet (field 8)
  # Return: an array of [heading, date, snippet, status] entries, where status
  #         is 1 (read aloud) or 0 (skip). The first entry is always the csv
  #         header row with status 0, so an entry's index is its article number.
  puts "\nCURATING ARTICLES ******"
  puts "Input from: #{input_trove_file}"

  # user input on whether to limit the number of articles;
  # anything that is not a plain whole number falls back to the default
  default_limit = 10
  response = get_user_input("\nLimit number of articles to curate? (will default to #{default_limit})")
  limit = (response =~ /\A\s*\d+\s*\z/) ? response.to_i : default_limit
  puts "\nLimiting to #{limit} articles"

  # take only the fields of interest for reading aloud, dropping exact duplicates
  input_trove = CSV.read(input_trove_file).map { |row| [row[4], row[6], row[8]] }.uniq

  prompt_suffix = "\n\t'n' to skip this article for reading "
  prompt_suffix += "\n\tcarriage return or 'y' to keep this article for reading"

  # the header row does not count towards the limit, hence limit + 1 rows
  rows_to_curate = input_trove.first(limit + 1)

  # loop through and ask the user if they want to mark each article for reading
  curated_trove = rows_to_curate.each_with_index.map { |(str_heading, str_date, str_snippet), i|
    status = 0
    begin#error handling
      if i == 0
        puts "(skipping header row)"
      else
        puts "\nArticle #{i}"
        puts "Heading: #{str_heading}"
        puts "Date: #{str_date}"
        puts "Snippet:\n#{str_snippet}"
        response = get_user_input("\n\tContinue with article #{i}?" + prompt_suffix)
        # anything other than 'n' (including just pressing enter) keeps the article
        status = 1 if response.downcase != "n"
      end#of skipping header row
    rescue StandardError
      # StandardError only, so that Ctrl-C (Interrupt) still stops the script
      puts "Error at record #{i}"
    end#of error handling
    [str_heading, str_date, str_snippet, status]
  }
  return(curated_trove)
end
def read_curated_trove_results(curated_trove)
  # This method reads the curated Trove results aloud
  # It only reads aloud entries with a status of 1
  # Input: curated Trove list of [heading, date, snippet, status] entries
  #        (having passed through 'curate_trove_results')
  # Returns: nothing useful, just tries to read each selected article aloud;
  #          an article that fails (e.g. an unparseable date) is reported and skipped
  puts "\nREADING CURATED ARTICLES ******"
  curated_trove.each_with_index do |(str_heading, str_date, str_snippet, status), i|
    # only proceed with the extra formatting if this article is to be said aloud
    next unless status == 1
    begin#error handling
      # fancy date format
      new_date = convert_date(str_date)
      # a missing heading or snippet is treated as empty text rather than an error
      str_heading = str_heading.to_s
      str_snippet = str_snippet.to_s
      # remove the first part of the snippet, which is the same as the headline
      str_snippet = str_snippet.gsub(str_heading, "") unless str_heading.empty?
      # replace each run of two or more dots (common in Trove snippets) with a single space
      str_snippet = str_snippet.gsub(/\.{2,}/, " ")
      # say the three items aloud
      puts "\nReading Article #{i}"
      puts "\t#{new_date}"
      puts "\t#{str_heading}"
      puts "\t#{str_snippet}"
      say_something("date #{new_date}")
      say_something(str_heading)
      say_something("#{str_snippet}")
    rescue StandardError
      # StandardError only, so that Ctrl-C (Interrupt) still stops the reading
      puts "Error at record #{i}"
    end#of error handling
  end#of reading through curated_trove input
end
# TROVE MAIN PROCEDURE
# Interactive flow: ask for a town and a search term, fetch matching Trove
# articles, write them to a csv, let the user curate them, then read the
# selected ones aloud. The user can type 'exit' before curating or reading.
puts "\nSTART TROVE EXPERIMENT ******\n"

# Trove API key and default searches
# The key is taken from the TROVE_API_KEY environment variable when it is set;
# otherwise the key previously hard-coded in this script is used.
# NOTE(review): an API key committed in source is visible to anyone who can read the file.
my_trove_key = ENV.fetch("TROVE_API_KEY", 'lop9t29sfm35vfkq')
default_town = 'Elmore'
default_search = 'tragedy'

# Get search town from user input, use default value if no answer
search_town = get_user_input("Please enter a search town (will default to '#{default_town}')")
search_town = default_town if search_town.length == 0

# Get search term from user input, use default value if no answer
# Note: use 'this+AND+that' for multiple terms in term
search_term = get_user_input("Please enter a search term (will default to '#{default_search}')")
search_term = default_search if search_term.length == 0

trove_api_results = fetch_trove_results(search_town, search_term, my_trove_key)

# whitespace and path separators are replaced so the name is always a single file name
output_file_name = "trove_result_#{search_town}_#{search_term}.csv".gsub(/[\s\/\\]/, "_")

# fetch_trove_results returns 0 on error; only a real result document can be written out
results_fetched = trove_api_results.respond_to?(:xpath)
result_count = 0
if results_fetched
  result_count = write_trove_results(trove_api_results, output_file_name, search_term, search_town)
end

puts "\nSearch town: \n\t#{search_town}"
puts "Search term: \n\t#{search_term}"
puts "Result count: \n\t#{result_count}"
puts "Results written to: \n\t#{output_file_name}" if results_fetched

# pause before each stage, giving the user the chance to exit
# (answers true unless the user typed 'exit', in any letter case)
proceed_to = lambda { |stage|
  response = get_user_input("\nPress enter to continue to #{stage} articles ******\n\t('exit' to escape)")
  response.downcase != "exit"
}

if result_count > 0
  if proceed_to.call("CURATING")
    curated_trove_results = curate_trove_results(output_file_name)
    read_curated_trove_results(curated_trove_results) if proceed_to.call("READING")
  end
else
  puts "Sorry, no results to read."
end

puts "\nEND TROVE EXPERIMENT ******\n"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment