# Last active January 17, 2016 22:52
# Based on: (link to the original script is missing from this copy)
# Modified to work on Windows by: Sergey Feldman
# Jan 17, 2016
# Requirements: pdflatex, bibtex
import getpass
import glob
import os, os.path
import re
import shutil
import smtplib
import subprocess
import tarfile
import urllib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

import lxml.html as html
import requests
# --- user-specific settings: edit these before running ---
temp_dir = '\\temp'  # intermediate files are stored under this directory
your_gmail = 'YOUR_EMAIL@EMAIL.COM'  # used below as SMTP login and sender
kindle_email = ''  # recipient address of the generated PDF
query = ''  # parsed further down for an arXiv identifier
# prompt for the password at start-up so it is never written into this file
gmailpass = getpass.getpass()
# paper settings (decrease width/height to increase font)
landscape = True
width = "6in"
height = "4in"
margin = "0.1in"
# settings for latex geometry package:
# In landscape mode the page is `width` wide and `height` tall; otherwise the
# two dimensions are swapped. Without the `else` the second assignment always
# overwrote the first and the `landscape` switch had no effect.
if landscape:
    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
else:
    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)
# Extract the arXiv identifier (e.g. 1234.56789 or 1234.5678v2) from `query`,
# which may be a bare id or an http(s) link ending in one.
id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
if id_match is None:
    # fail with a readable message instead of AttributeError on None
    raise ValueError('could not find an arXiv id in query: %r' % (query,))
arxiv_id = id_match.group('id')
arxiv_abs = 'https://arxiv.org/abs/' + arxiv_id
arxiv_pdf = 'https://arxiv.org/pdf/' + arxiv_id
# Fetch the abstract page and read its <title> element.
abs_page = requests.get(arxiv_abs)
abs_page.raise_for_status()
arxiv_pgtitle = html.fromstring(abs_page.text.encode('utf8')).xpath('/html/head/title/text()')[0]
# Drop the leading "[...]" tag and collapse every whitespace run to one space.
# NOTE: re.DOTALL used to be passed here as the 4th positional argument of
# re.sub, which is `count`, not `flags`; it silently capped the number of
# replacements at 16. Neither pattern contains '.', so no flag is needed.
arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
# File-system friendly version of the title: anything outside [-_A-Za-z0-9]
# becomes an underscore.
arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)
# make a temporary directory to store the tex files and download
# tar.gz of the source
d = temp_dir + '\\' + arxiv_id
cwd = os.getcwd()
tar_fn = arxiv_id + '.tar.gz'
url = 'https://arxiv.org/e-print/' + arxiv_id
if not os.path.isdir(d):
    os.makedirs(d)
# Download with requests (already used by this script) so that an HTTP error
# raises instead of being saved to disk as if it were the archive.
response = requests.get(url, stream=True)
response.raise_for_status()
with open(tar_fn, 'wb') as tar_out:
    for chunk in response.iter_content(chunk_size=65536):
        tar_out.write(chunk)
# Unpack every member into d.
with tarfile.open(tar_fn, 'r:gz') as tar:
    dest_root = os.path.abspath(d)
    for item in tar:
        # The archive comes from the network: refuse members whose path
        # would land outside the destination directory.
        target = os.path.abspath(os.path.join(dest_root, item.name))
        if target != dest_root and not target.startswith(dest_root + os.sep):
            print('skipping unsafe archive member: ' + item.name)
            continue
        tar.extract(item, d)
# find the files with .tex
# and get the main
# The main file is taken to be the first *.tex in d whose first line
# mentions documentclass; the loop stops there so that `texfile` and `src`
# keep referring to it.
for texfile in glob.glob(os.path.join(d, '*.tex')):
    with open(texfile, 'r') as f:
        src = f.readlines()
    if src and 'documentclass' in src[0]:
        print('correct file: ' + texfile)
        break
else:
    raise RuntimeError('no .tex file starting with \\documentclass found in ' + d)

# filter comments/newlines for easier debugging:
src = [line for line in src if not line.startswith('%') and line.strip()]

# strip font size, column stuff, and paper size stuff in documentclass line:
src[0] = re.sub(r'\b\d+pt\b', '', src[0])
src[0] = re.sub(r'\b\w+column\b', '', src[0])
src[0] = re.sub(r'\b\w+paper\b', '', src[0])
src[0] = re.sub(r'(?<=\[),', '', src[0])  # remove extraneous starting commas
src[0] = re.sub(r',(?=[\],])', '', src[0])  # remove extraneous middle/ending commas

# find begin{document}:
begindocs = [i for i, line in enumerate(src) if line.startswith(r'\begin{document}')]
if len(begindocs) != 1:
    # explicit error instead of assert, which is stripped under -O
    raise RuntimeError('expected exactly one \\begin{document} in %s, found %d'
                       % (texfile, len(begindocs)))

# Extra preamble lines, spliced in right before \begin{document}. The order
# matches what the successive insert() calls at the same index used to yield.
preamble = []
if landscape:
    preamble.append('\\usepackage{pdflscape}\n')
preamble.extend([
    '\\usepackage{epstopdf}\n',  # so eps will work with pdflatex
    '\\pagestyle{empty}\n',
    '\\usepackage{times}\n',
    '\\usepackage[' + ','.join(k + '=' + v for k, v in geom_settings.items()) + ']{geometry}\n',
])
src[begindocs[0]:begindocs[0]] = preamble

# shrink figures to be at most the size of the page, now that it's landscape
_FIGURE_WIDTH_RE = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')


def _fit_figure_to_page(match):
    """Rebuild an includegraphics option list so the figure also fits the page height."""
    mul = match.group(1)
    unit = match.group(2)
    # Keep the requested width, bound the height by the same fraction of the
    # text height, and let LaTeX preserve the aspect ratio.
    return ('\\includegraphics[width=' + mul + '\\' + unit + 'width,height='
            + mul + '\\textheight,keepaspectratio]')


src = [_FIGURE_WIDTH_RE.sub(_fit_figure_to_page, line) for line in src]

# write updated tex (the untouched original is kept next to it as .bak)
shutil.copyfile(texfile, texfile + '.bak')
with open(texfile, 'w') as f:
    f.writelines(src)

# compile -> nonstopmode keeps pdflatex from waiting for keyboard input when
# the texfile is poorly written. Everything runs inside d so that relative
# figure/bibliography paths resolve; bibtex takes the job name, not the .tex
# file name.
tex_name = os.path.basename(texfile)
job_name = os.path.splitext(tex_name)[0]
pdflatex_cmd = ['pdflatex', '-interaction=nonstopmode', tex_name]
subprocess.call(pdflatex_cmd, cwd=d)
subprocess.call(['bibtex', job_name], cwd=d)  # may fail when there is no bibliography
subprocess.call(pdflatex_cmd, cwd=d)
subprocess.call(pdflatex_cmd, cwd=d)

built_pdf = os.path.join(d, job_name + '.pdf')
if not os.path.isfile(built_pdf):
    raise RuntimeError('pdflatex did not produce ' + built_pdf)

# Give the PDF a readable name derived from the paper title.
file_name = arxiv_title_scrubbed + ".pdf"
shutil.copyfile(built_pdf, file_name)

# send the email
msg = MIMEMultipart()
msg['From'] = your_gmail
msg['To'] = kindle_email
with open(file_name, 'rb') as f:
    pdf_part = MIMEApplication(f.read(), _subtype='pdf')
pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
msg.attach(pdf_part)  # without this the message goes out with no attachment

# Gmail's submission endpoint; the session must be upgraded with STARTTLS
# before logging in.
server = smtplib.SMTP('smtp.gmail.com', 587)
try:
    server.starttls()
    server.login(your_gmail, gmailpass)
    server.sendmail(your_gmail, kindle_email, msg.as_string())
finally:
    server.quit()

# clean up - delete the directory and its files
shutil.rmtree(d, ignore_errors=True)
