Skip to content

Instantly share code, notes, and snippets.

@LukeB42
Last active August 29, 2015 14:21
Show Gist options
  • Save LukeB42/cee5415d03a7337ef1d4 to your computer and use it in GitHub Desktop.
Save LukeB42/cee5415d03a7337ef1d4 to your computer and use it in GitHub Desktop.
Extract the text of an article and read it in the system pager
#!/usr/bin/env python
import sys
from goose import Goose
import requests
from subprocess import Popen, PIPE
import errno
# Turn off urllib3's warnings for the whole process. fetch() below sends its
# request with verify=False; presumably this is here to silence the warning
# urllib3 emits for unverified HTTPS requests -- confirm before removing.
requests.packages.urllib3.disable_warnings()
def fetch(url, timeout=30):
    """Download *url* and return the response body as text.

    The request carries a fixed ``aread 0.01`` User-Agent and is sent with
    TLS certificate verification disabled (``verify=False``).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up, so an
            unresponsive host cannot hang the program indefinitely.

    Returns:
        The decoded response body (``response.text``), or ``None`` if the
        download failed. On failure the error is reported on stderr.
    """
    headers = {"User-Agent": "aread 0.01"}
    try:
        # NOTE(review): verify=False skips certificate validation; kept
        # because the script disables the matching warnings on purpose.
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout
        )
        return response.text
    except Exception as e:
        # Best-effort boundary: any failure is reported and turned into a
        # None result rather than a traceback. Format the exception itself;
        # the previous `e.message[0]` printed only the first character of
        # the message and could raise on an empty one.
        sys.stderr.write("Error: %s\n" % e)
        return None
def extract(html):
    """Run Goose's article extraction over a raw HTML string.

    Args:
        html: Page markup, handed to Goose as ``raw_html``.

    Returns:
        Whatever ``Goose().extract`` returns for that markup. The script's
        entry point reads the ``title`` and ``cleaned_text`` attributes of
        this object.
    """
    return Goose().extract(raw_html=html)
def _less_prompt(title):
    """Build a ``less`` option that shows *title* verbatim as the prompt.

    In a ``less`` prompt string the characters ``? : . %`` and backslash are
    metacharacters, and the first character after ``-P`` may select which
    prompt is being set. Each metacharacter is therefore backslash-escaped
    and the short prompt is selected explicitly with ``s``.
    """
    escaped = title
    # Backslash goes first so the escapes added for the others stay intact.
    for special in ("\\", "?", ":", ".", "%"):
        escaped = escaped.replace(special, "\\" + special)
    return "-Ps" + escaped


def main(argv=None):
    """Fetch the article named on the command line and show it in ``less``.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 after printing the usage text or after the
        pager has exited, 1 when the page could not be downloaded.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) <= 1:
        sys.stdout.write("Usage: aread <url>\n")
        return 0

    html = fetch(argv[1])
    if html is None:
        # fetch() has already reported the failure on stderr; there is
        # nothing to extract.
        return 1

    article = extract(html)
    title = article.title or ""
    text = article.cleaned_text or ""

    pager = Popen(["less", _less_prompt(title)], stdin=PIPE)
    try:
        pager.stdin.write(text.encode("utf-8", "ignore"))
    except IOError as e:
        if e.errno not in (errno.EPIPE, errno.EINVAL):
            raise
        sys.stderr.write("Error writing to pipe.\n")
    finally:
        # Always close our end and reap the pager, even when re-raising.
        try:
            pager.stdin.close()
        except IOError:
            # The pager has already gone away; nothing is left to flush.
            pass
        pager.wait()

    sys.stdout.write("%s: %i lines.\n" % (title, len(text.split("\n"))))
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment