platinhom · November 28, 2023 07:32 · YourCrush · Apr 11, 2016 · philliid · Jul 6, 2016
diff --git a/0-EndnoteProcess.md b/0-EndnoteProcess.md
diff --git a/addPDF.py b/addPDF.py
 #! /usr/bin/env python
 # -*- coding: utf8 -*-
 import os,sys

 predoi="10.1021/"
 pdfdir="Done/"

 if (__name__ == '__main__'):
 	# Copy the XML file named by argv[1] to "<name>_new<ext>", inserting a
 	# <pdf-urls> element before each "</urls>" for which a matching PDF
 	# (Done/<prefix>_<suffix>.pdf) exists. The PDF is moved into
 	# Done/<suffix>/ at the same time.
 	fname=sys.argv[1]
 	base,ext=os.path.splitext(fname)
 	fwname=base+"_new"+ext
 	with open(fname) as fr:
 		text=fr.read()

 	urls_end="</urls>"
 	doi_end="</style></electronic-resource-num>"
 	pos2=0

 	with open(fwname,'w') as fw:
 		while True:
 			pos1=text.find(urls_end,pos2)
 			# '==' rather than 'is': identity of ints is an implementation detail
 			if pos1 == -1:
 				break
 			fw.write(text[pos2:pos1])

 			doi_close=text.find(doi_end,pos1)
 			if doi_close == -1:
 				# No DOI element follows this </urls>: nothing to link,
 				# the remainder is copied verbatim after the loop.
 				pos2=pos1
 				break
 			pos2=doi_close

 			# Clamp the look-behind window: a negative start would be
 			# interpreted relative to the end of the string.
 			if text.find("pdf-urls>",max(0,pos1-50),pos1) == -1:
 				pd=text.find(predoi,pos1,pos2)
 				doii=text[pd:pos2].split('/') if pd != -1 else []
 				if len(doii) >= 2:
 					pdfname=doii[0]+"_"+doii[1]+".pdf"
 					if os.path.exists(pdfdir+pdfname):
 						try:
 							if (not os.path.exists(pdfdir+doii[1])): os.mkdir(pdfdir+doii[1])
 							os.renames(pdfdir+pdfname,pdfdir+doii[1]+os.sep+pdfname)
 						except OSError as err:
 							# Best effort: report and leave this record without a link
 							sys.stderr.write("Cannot move "+pdfname+": "+str(err)+"\n")
 						else:
 							fw.write("<pdf-urls><url>internal-pdf://"+doii[1]+"/"+pdfname+"</url></pdf-urls>")
 			fw.write(text[pos1:pos2])
 		#last part
 		fw.write(text[pos2:])
diff --git a/checkdone.sh b/checkdone.sh
 #! /bin/bash
 # Check doi list.. generate not.txt for files not done
 # Need doi list input file
 # Normalise line endings of the DOI list in place before reading it
 dos2unix "$1"
 # Truncate the report; ':' writes nothing, whereas 'echo' would leave a blank first line
 : > not.txt
 # Read line by line; the '|| [ -n ... ]' part keeps a final line that lacks a newline
 while read -r line || [ -n "$line" ]
 do
 if [ -z "$line" ];then
 	continue
 fi
 # Split at the first '/', so registrant prefixes of any length work
 pre=${line%%/*}
 post=${line#*/}
 if [ ! -f "Done/${pre}_${post}.pdf" ];then
 	echo "${pre}/${post}" >> not.txt
 fi
 done < "$1"
diff --git a/getfiledoi.py b/getfiledoi.py
 #! /usr/bin/env python
 # Author: Hom, 2015.12.20
 # Purpose: To find the doi number in first page of pdf
 # Usage: python script.py pdffile [pdffile2 pdffile3 ...]
 #
 # Require pdfminer module 
 #    To install pdfminer: pip install pdfminer

 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice, TagExtractor
 from pdfminer.pdfpage import PDFPage
 from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
 from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 from pdfminer.image import ImageWriter

 import sys,os,re

 class stdmodel(object):
 	'''In-memory stand-in for a file object.

 	Everything passed to write() is appended to one string, which can be
 	fetched with get(), read() or str(). reset(), open() and close() all
 	empty that string.'''

 	# accumulated text (class-level default; instances rebind it on first change)
 	_str=""

 	def __str__(self):
 		return self.get()

 	def get(self):
 		'''Return everything written so far'''
 		return self._str

 	def read(self):
 		'''File-like alias of get()'''
 		return self.get()

 	def write(self,line):
 		'''Append line to the stored text'''
 		self._str=self._str+line

 	def reset(self):
 		'''Discard the stored text'''
 		self._str=""

 	def open(self,*args):
 		'''File-like open: ignores its arguments and empties the stored text'''
 		self.reset()

 	def close(self):
 		'''File-like close: empties the stored text'''
 		self.reset()

 #	def writeline(self,lines):
 #		pass
 #	def readline(self):
 #		pass
 #	def readlines(self):
 #		pass

 ####### Setup for pdfminer ############

 # debug option
 # The same value is stored as the `debug` attribute of every pdfminer class listed below.
 debug = 0
 PDFDocument.debug = debug
 PDFParser.debug = debug
 CMapDB.debug = debug
 PDFResourceManager.debug = debug
 PDFPageInterpreter.debug = debug
 PDFDevice.debug = debug

 #only first page
 # pagenos is handed to PDFPage.get_pages() in GetFirstPage.
 pagenos=set([0])
 # NOTE(review): pageno is not referenced anywhere else in the visible code.
 pageno = 1

 #outfp = sys.stdout
 # Collect the converter output in memory (stdmodel) instead of printing it.
 outfp = stdmodel()

 codec = 'utf-8'
 # NOTE(review): showpageno and scale are not referenced elsewhere in the visible code.
 showpageno = True
 scale = 1
 # password and maxpages are forwarded to PDFPage.get_pages() in GetFirstPage;
 # rotation is added to each page's rotate value there.
 password = ''
 maxpages = 0
 rotation = 0
 imagewriter = None
 laparams = LAParams()

 # ResourceManager facilitates reuse of shared resources
 # such as fonts and images so that large objects are not
 # allocated multiple times.
 #### This will cause some problem when set to default True.
 caching = False
 rsrcmgr = PDFResourceManager(caching=caching)

 # Important Main converter for pdf file
 # One converter is created at import time, writes into outfp, and is shared by
 # every GetFirstPage() call.
 device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)

 ####### Functions for read doi ############

 def GetFirstPage(fname):
 	'''Get First Page contents of PDF, return string

 	fname: path of the PDF file to read.
 	Returns the text that the module-level converter (device) wrote into
 	outfp while the pages selected by pagenos were processed.
 	Exceptions from opening or parsing the file propagate to the caller.'''
 	interpreter = PDFPageInterpreter(rsrcmgr, device)
 	try:
 		# open() replaces the Python-2-only file() builtin; 'with' also closes on error
 		with open(fname, 'rb') as fp:
 			for page in PDFPage.get_pages(fp, pagenos,
 			                              maxpages=maxpages, password=password,
 			                              caching=caching, check_extractable=True):
 				page.rotate = (page.rotate+rotation) % 360
 				interpreter.process_page(page)
 		return outfp.get()
 	finally:
 		# Always empty the shared buffer, so text from a failed parse
 		# cannot leak into the result of the next call.
 		outfp.reset()

 # avoid repeat generate doipattern
 # Raw string: "\S" in a normal literal is an invalid escape sequence. The
 # compiled pattern text is the same as before.
 doipattern=re.compile(r'''\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&'<>])\S)+)\b''')

 def getdoi(instr):
 	'''Get DOI number of input string

 	instr: text to search; None or "" is accepted.
 	Returns the first substring matching doipattern, or "" if there is none.'''
 	if not instr:
 		return ""
 	match=doipattern.search(instr)
 	return match.group() if match else ""

 def getfiledoi(fname):
 	'''Return the DOI found in the first-page text of the PDF fname.

 	The text comes from GetFirstPage() and is searched with getdoi();
 	"" is returned when no DOI is found.'''
 	return getdoi(GetFirstPage(fname))

 def doirenamefile(fname, doi):
 	'''Rename file based on doi number

 	fname: path of the file to rename.
 	doi:   text containing a DOI; it is re-validated through getdoi().
 	The file stays in its directory and becomes "<prefix>@<suffix>.pdf",
 	where prefix and suffix are the DOI parts around the first "/".
 	Nothing happens when no DOI is recognised.'''
 	realdoi=getdoi(doi)
 	# Truthiness instead of 'is not ""': identity of string literals is not guaranteed
 	if not realdoi:
 		#else don't rename it
 		return
 	folder=os.path.dirname(os.path.abspath(fname))
 	prefix,suffix=realdoi.split('/',1)
 	# NOTE(review): a suffix that itself contains "/" makes os.renames
 	# create sub-directories - confirm that this is wanted.
 	os.renames(fname, os.path.join(folder, prefix+"@"+suffix+".pdf"))

 def mainusage():
 	'''Print usage and terminate the process with exit status 100'''
 	# One parenthesised argument prints the same text whether print is a statement or a function
 	print('usage: %s [-r] [-d] pdffile ...' % sys.argv[0])
 	# sys.exit instead of exit(): the bare name is only a helper injected by the site module
 	sys.exit(100)

 if __name__=="__main__":
 	# Command line: [-r] [-d] pdffile ...
 	import getopt
 	try:
 		opts, args = getopt.getopt(sys.argv[1:], 'rd')
 	except getopt.GetoptError:
 		mainusage()
 	if not args:
 		mainusage()

 	flags=[k for (k, v) in opts]
 	# -r : rename file
 	rename_ = '-r' in flags
 	# -d : only output doi name
 	onlydoi_ = '-d' in flags

 	# Perform for each file
 	for fname in args:
 		doi=getfiledoi(fname)
 		if rename_:
 			doirenamefile(fname,doi)
 		if onlydoi_:
 			print(doi)
 		elif not rename_:
 			# neither flag given: report file name together with the DOI
 			print(fname+" "+"Found: "+doi)
diff --git a/getPDF.py b/getPDF.py
 #! /usr/bin/env python
 import sys,os,shutil
 import urllib2 as ul2
 import random

 #doi="10.1021/ci960138u"
 #my_headers = ['Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
 #    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
 #    'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
 #    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
 #    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)']

 import subprocess

 def _find_pdf_link(page_url):
 	'''Return the PDF URL found in the first matching <iframe> line of page_url.

 	Returns "" when no line contains "<iframe src", "sci-hub.io" and ".pdf".'''
 	web=ul2.urlopen(page_url)
 	try:
 		for line in web:
 			if "<iframe src" in line and "sci-hub.io" in line and ".pdf" in line:
 				if ("http" in line):
 					i=line.index("http")
 					scheme=""
 				else:
 					# link written without scheme: start at the host name
 					i=line.index("sci-hub.io")
 					scheme="http://"
 				j=line.index(".pdf")
 				return scheme+line[i:j+4]
 	finally:
 		web.close()
 	return ""

 # argv[1]: text file with one DOI per line. Each DOI not yet present locally
 # is looked up and downloaded to "<prefix>_<suffix>.pdf" with wget.
 with open(sys.argv[1]) as f:
 	for l in f:
 		doi=l.strip().strip('/')
 		doisplit=doi.split('/')
 		doiout="_".join(doisplit)
 		if (len(doisplit)<2):
 			print("Error DOI:"+doi)
 			continue
 		# Skip DOIs whose PDF (or per-DOI folder) is already present
 		known=["./"+doiout+".pdf",
 			"Done/"+doiout+".pdf",
 			"Accept/"+doiout+".pdf",
 			"Done/"+doi.split('/',1)[1]+"/"]
 		if any(os.path.exists(p) for p in known):
 			continue
 		try:
 			pdflink=_find_pdf_link("http://sci-hub.io/"+doi)
 			# Another store link..
 			if (len(pdflink)<5):
 				pdflink=_find_pdf_link("http://pubs.acs.org.sci-hub.io/doi/abs/"+doi)

 			if (len(pdflink)<5):
 				print(doi+" can't find!!!!")
 			else:
 				# Argument list, no shell: the link is scraped from a web
 				# page and must not be interpreted by a shell.
 				subprocess.call(["wget",pdflink,"-O",doiout+".pdf"])
 		except Exception as err:
 			# Best effort per DOI: report and go on with the next one.
 			# 'Exception' (not a bare except) lets Ctrl-C stop the run.
 			print(doi+" failed: "+str(err))
diff --git a/modifyXML.py b/modifyXML.py
 #! /usr/bin/env python
 # -*- coding: utf8 -*-
 import os,sys

 predoi="10.1021/"

 # Opening tag of a notes element and its length
 pos1str='<notes><style face="normal" font="default" size="100%">'
 pos1len=len(pos1str)
 # Marker searched for after each notes opening tag
 pos2str='Times Cited'
 pos2len=len(pos2str)
 # Opening tag of the DOI element and its length
 doistr='<electronic-resource-num><style face="normal" font="default" size="100%">'
 doistrlen=len(doistr)

 # Replacement text written after each notes tag by the main loop; False writes nothing
 substr=False

 def processdoi(stri):
 	'''Normalise the first DOI element found in stri.

 	The text between the first doistr tag and the following
 	"</style></electronic-resource-num>" is cut down to the part starting
 	at "10.", lower-cased and stripped. stri is returned unchanged when
 	the element or the "10." marker is missing.'''
 	pos1=stri.find(doistr)
 	if pos1 == -1:
 		return stri
 	start=pos1+doistrlen
 	# Search the closing tag after the opening one, so an earlier closing
 	# tag in the same chunk cannot be paired with it.
 	pos2=stri.find('</style></electronic-resource-num>',start)
 	if pos2 == -1:
 		return stri
 	dois=stri[start:pos2]
 	pos3=dois.find("10.")
 	if pos3 == -1:
 		return stri
 	return stri[:start]+dois[pos3:].lower().strip()+stri[pos2:]
 	

 if (__name__ == '__main__'):
 	# Copy the XML file named by argv[1] to "<name>_new<ext>". The text
 	# between each notes opening tag and the next 'Times Cited' marker is
 	# dropped, and every copied chunk is passed through processdoi().
 	fname=sys.argv[1]
 	base,ext=os.path.splitext(fname)
 	fwname=base+"_new"+ext
 	with open(fname) as fr:
 		text=fr.read()

 	prepos1=0; pos1=0; pos2=0

 	with open(fwname,'w') as fw:
 		while True:
 			prepos1=pos1
 			pos1=text.find(pos1str,pos2)
 			# '==' rather than 'is': identity of ints is an implementation detail
 			if pos1 == -1:
 				break
 			# NOTE(review): when the next notes tag lies within 50 characters
 			# of pos2 the chunk starts at the previous notes tag instead;
 			# kept as in the original - confirm the intent.
 			if (pos1-pos2)>50:
 				chunk_start=pos2
 			else:
 				chunk_start=prepos1
 			fw.write(processdoi(text[chunk_start:pos1+pos1len]))

 			marker=text.find(pos2str,pos1)
 			if marker == -1:
 				# No marker after this notes tag: keep the rest verbatim.
 				# Carrying -1 on as pos2 would drop the remainder of the file.
 				pos2=pos1+pos1len
 				break
 			pos2=marker
 			if (substr):
 				fw.write(substr)
 		#last part
 		fw.write(processdoi(text[pos2:]))
diff --git a/prepareDOI.py b/prepareDOI.py
 #! /usr/bin/env python
 # -*- coding: utf8 -*-
 import os,sys

 # Marker at which a DOI starts
 predoi="10."

 if (__name__ == '__main__'):
 	# Copy argv[1] to "<name>_new<ext>", keeping from each line only the
 	# part starting at predoi, lower-cased and stripped.
 	fname=sys.argv[1]
 	base,ext=os.path.splitext(fname)
 	fwname=base+"_new"+ext
 	with open(fname) as fr:
 		with open(fwname,'w') as fw:
 			for line in fr:
 				start=line.find(predoi)
 				if start == -1:
 					# No DOI on this line: emit an empty line. Slicing with -1
 					# would copy the last character of an unterminated line.
 					fw.write("\n")
 				else:
 					fw.write(line[start:].lower().strip()+"\n")
	#! /usr/bin/env python
	# -- coding: utf8 --
	import os,sys

	predoi="10.1021/"
	pdfdir="Done/"

	if (__name__ == '__main__'):
		# Copy the XML file named by argv[1] to "<name>_new<ext>", inserting a
		# <pdf-urls> element before each "</urls>" for which a matching PDF
		# (Done/<prefix>_<suffix>.pdf) exists. The PDF is moved into
		# Done/<suffix>/ at the same time.
		# (Block structure restored: this copy had lost all its indentation.)
		fname=sys.argv[1]
		base,ext=os.path.splitext(fname)
		fwname=base+"_new"+ext
		with open(fname) as fr:
			text=fr.read()

		urls_end="</urls>"
		doi_end="</style></electronic-resource-num>"
		pos2=0

		with open(fwname,'w') as fw:
			while True:
				pos1=text.find(urls_end,pos2)
				# '==' rather than 'is': identity of ints is an implementation detail
				if pos1 == -1:
					break
				fw.write(text[pos2:pos1])

				doi_close=text.find(doi_end,pos1)
				if doi_close == -1:
					# No DOI element follows this </urls>: nothing to link,
					# the remainder is copied verbatim after the loop.
					pos2=pos1
					break
				pos2=doi_close

				# Clamp the look-behind window: a negative start would be
				# interpreted relative to the end of the string.
				if text.find("pdf-urls>",max(0,pos1-50),pos1) == -1:
					pd=text.find(predoi,pos1,pos2)
					doii=text[pd:pos2].split('/') if pd != -1 else []
					if len(doii) >= 2:
						pdfname=doii[0]+"_"+doii[1]+".pdf"
						if os.path.exists(pdfdir+pdfname):
							try:
								if (not os.path.exists(pdfdir+doii[1])): os.mkdir(pdfdir+doii[1])
								os.renames(pdfdir+pdfname,pdfdir+doii[1]+os.sep+pdfname)
							except OSError as err:
								# Best effort: report and leave this record without a link
								sys.stderr.write("Cannot move "+pdfname+": "+str(err)+"\n")
							else:
								fw.write("<pdf-urls><url>internal-pdf://"+doii[1]+"/"+pdfname+"</url></pdf-urls>")
				fw.write(text[pos1:pos2])
			#last part
			fw.write(text[pos2:])
	#! /bin/bash
	# Check doi list.. generate not.txt for files not done
	# Need doi list input file
	# Normalise line endings of the DOI list in place before reading it
	dos2unix "$1"
	# Truncate the report; ':' writes nothing, whereas 'echo' would leave a blank first line
	: > not.txt
	# Read line by line; the '|| [ -n ... ]' part keeps a final line that lacks a newline
	while read -r line || [ -n "$line" ]
	do
		if [ -z "$line" ];then
			continue
		fi
		# Split at the first '/', so registrant prefixes of any length work
		pre=${line%%/*}
		post=${line#*/}
		if [ ! -f "Done/${pre}_${post}.pdf" ];then
			echo "${pre}/${post}" >> not.txt
		fi
	done < "$1"
	#! /usr/bin/env python
	# Author: Hom, 2015.12.20
	# Purpose: To find the doi number in first page of pdf
	# Usage: python script.py pdffile [pdffile2 pdffile3 ...]
	#
	# Require pdfminer module
	# To install pdfminer: pip install pdfminer

	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.pdfdevice import PDFDevice, TagExtractor
	from pdfminer.pdfpage import PDFPage
	from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
	from pdfminer.cmapdb import CMapDB
	from pdfminer.layout import LAParams
	from pdfminer.image import ImageWriter

	import sys,os,re

	class stdmodel(object):
		'''a class to model stdout for pdfminer file parameter
		Can get context use get() method

		(Class body re-indented: this copy had lost its indentation and
		could not be parsed.)'''

		# saved string (class-level default; instances rebind it on first change)
		_str=""

		def __str__(self):
			return self._str

		def reset(self):
			'''Reset the saved string'''
			self._str=""

		def get(self):
			'''Get the saved string'''
			return self._str

		def write(self,line):
			'''model write method of file: append line to the saved string'''
			self._str+=line

		def open(self,*args):
			'''model open method of file: arguments are ignored, saved string is emptied'''
			self._str=""

		def close(self):
			'''model close method of file: saved string is emptied'''
			self._str=""

		def read(self):
			'''model read method of file: return the saved string'''
			return self._str

	# def writeline(self,lines):
	# pass
	# def readline(self):
	# pass
	# def readlines(self):
	# pass

	####### Setup for pdfminer ############

	# debug option
	# The same value is stored as the `debug` attribute of every pdfminer class listed below.
	debug = 0
	PDFDocument.debug = debug
	PDFParser.debug = debug
	CMapDB.debug = debug
	PDFResourceManager.debug = debug
	PDFPageInterpreter.debug = debug
	PDFDevice.debug = debug

	#only first page
	# pagenos is handed to PDFPage.get_pages() in GetFirstPage.
	pagenos=set([0])
	# NOTE(review): pageno is not referenced anywhere else in the visible code.
	pageno = 1

	#outfp = sys.stdout
	# Collect the converter output in memory (stdmodel) instead of printing it.
	outfp = stdmodel()

	codec = 'utf-8'
	# NOTE(review): showpageno and scale are not referenced elsewhere in the visible code.
	showpageno = True
	scale = 1
	# password and maxpages are forwarded to PDFPage.get_pages() in GetFirstPage;
	# rotation is added to each page's rotate value there.
	password = ''
	maxpages = 0
	rotation = 0
	imagewriter = None
	laparams = LAParams()

	# ResourceManager facilitates reuse of shared resources
	# such as fonts and images so that large objects are not
	# allocated multiple times.
	#### This will cause some problem when set to default True.
	caching = False
	rsrcmgr = PDFResourceManager(caching=caching)

	# Important Main converter for pdf file
	# One converter is created at import time, writes into outfp, and is shared by
	# every GetFirstPage() call.
	device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
	imagewriter=imagewriter)

	####### Functions for read doi ############

	def GetFirstPage(fname):
		'''Get First Page contents of PDF, return string

		fname: path of the PDF file to read.
		Returns the text that the module-level converter (device) wrote into
		outfp while the pages selected by pagenos were processed.
		Exceptions from opening or parsing the file propagate to the caller.'''
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		try:
			# open() replaces the Python-2-only file() builtin; 'with' also closes on error
			with open(fname, 'rb') as fp:
				for page in PDFPage.get_pages(fp, pagenos,
				                              maxpages=maxpages, password=password,
				                              caching=caching, check_extractable=True):
					page.rotate = (page.rotate+rotation) % 360
					interpreter.process_page(page)
			return outfp.get()
		finally:
			# Always empty the shared buffer, so text from a failed parse
			# cannot leak into the result of the next call.
			outfp.reset()

	# avoid repeat generate doipattern
	# Raw string: "\S" in a normal literal is an invalid escape sequence. The
	# compiled pattern text is the same as before.
	doipattern=re.compile(r'''\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&'<>])\S)+)\b''')

	def getdoi(instr):
		'''Get DOI number of input string

		instr: text to search; None or "" is accepted.
		Returns the first substring matching doipattern, or "" if there is none.'''
		if not instr:
			return ""
		match=doipattern.search(instr)
		return match.group() if match else ""

	def getfiledoi(fname):
		'''Get DOI number from first page of PDF.
		If not found, return ""

		(Body re-indented: this copy had lost its indentation.)'''
		outs=GetFirstPage(fname)
		return getdoi(outs)

	def doirenamefile(fname, doi):
		'''Rename file based on doi number

		fname: path of the file to rename.
		doi:   text containing a DOI; it is re-validated through getdoi().
		The file stays in its directory and becomes "<prefix>@<suffix>.pdf",
		where prefix and suffix are the DOI parts around the first "/".
		Nothing happens when no DOI is recognised.'''
		realdoi=getdoi(doi)
		# Truthiness instead of 'is not ""': identity of string literals is not guaranteed
		if not realdoi:
			#else don't rename it
			return
		folder=os.path.dirname(os.path.abspath(fname))
		prefix,suffix=realdoi.split('/',1)
		# NOTE(review): a suffix that itself contains "/" makes os.renames
		# create sub-directories - confirm that this is wanted.
		os.renames(fname, os.path.join(folder, prefix+"@"+suffix+".pdf"))

	def mainusage():
		'''Print usage and terminate the process with exit status 100'''
		# One parenthesised argument prints the same text whether print is a statement or a function
		print('usage: %s [-r] [-d] pdffile ...' % sys.argv[0])
		# sys.exit instead of exit(): the bare name is only a helper injected by the site module
		sys.exit(100)

	if __name__=="__main__":
		# Command line: [-r] [-d] pdffile ...
		# (Block structure restored: this copy had lost all its indentation.)
		import getopt
		try:
			(opts, args) = getopt.getopt(sys.argv[1:], 'rd')
		except getopt.GetoptError:
			mainusage()
		if not args: mainusage()

		# -r : rename file
		rename_=False
		# -d : only output doi name
		onlydoi_=False
		for (k, v) in opts:
			if k == '-r': rename_=True
			if k == '-d': onlydoi_=True

		# Perform for each file
		for fname in args:
			doi=getfiledoi(fname)
			if (rename_):
				doirenamefile(fname,doi)
				if (onlydoi_):
					print(doi)
			else:
				if (onlydoi_):
					print(doi)
				else:
					# neither flag given: report file name together with the DOI
					print(fname+" "+"Found: "+doi)
	#! /usr/bin/env python
	import sys,os,shutil
	import urllib2 as ul2
	import random

	#doi="10.1021/ci960138u"
	#my_headers = ['Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
	# 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
	# 'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
	# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
	# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)']

	import subprocess

	def _find_pdf_link(page_url):
		'''Return the PDF URL found in the first matching <iframe> line of page_url.

		Returns "" when no line contains "<iframe src", "sci-hub.io" and ".pdf".'''
		web=ul2.urlopen(page_url)
		try:
			for line in web:
				if "<iframe src" in line and "sci-hub.io" in line and ".pdf" in line:
					if ("http" in line):
						i=line.index("http")
						scheme=""
					else:
						# link written without scheme: start at the host name
						i=line.index("sci-hub.io")
						scheme="http://"
					j=line.index(".pdf")
					return scheme+line[i:j+4]
		finally:
			web.close()
		return ""

	# argv[1]: text file with one DOI per line. Each DOI not yet present locally
	# is looked up and downloaded to "<prefix>_<suffix>.pdf" with wget.
	# (Block structure restored: this copy had lost all its indentation.)
	with open(sys.argv[1]) as f:
		for l in f:
			doi=l.strip().strip('/')
			doisplit=doi.split('/')
			doiout="_".join(doisplit)
			if (len(doisplit)<2):
				print("Error DOI:"+doi)
				continue
			# Skip DOIs whose PDF (or per-DOI folder) is already present
			known=["./"+doiout+".pdf",
				"Done/"+doiout+".pdf",
				"Accept/"+doiout+".pdf",
				"Done/"+doi.split('/',1)[1]+"/"]
			if any(os.path.exists(p) for p in known):
				continue
			try:
				pdflink=_find_pdf_link("http://sci-hub.io/"+doi)
				# Another store link..
				if (len(pdflink)<5):
					pdflink=_find_pdf_link("http://pubs.acs.org.sci-hub.io/doi/abs/"+doi)

				if (len(pdflink)<5):
					print(doi+" can't find!!!!")
				else:
					# Argument list, no shell: the link is scraped from a web
					# page and must not be interpreted by a shell.
					subprocess.call(["wget",pdflink,"-O",doiout+".pdf"])
			except Exception as err:
				# Best effort per DOI: report and go on with the next one.
				# 'Exception' (not a bare except) lets Ctrl-C stop the run.
				print(doi+" failed: "+str(err))