neurojojo · March 14, 2022 18:42
diff --git a/pdfimages_to_text.py b/pdfimages_to_text.py
 # Variables (expected to be defined before this code runs):
 # (1) pdf_as_text
 # a list with one entry per PDF, index-aligned with `files`; some
 # entries are empty because an attempt was made to read in a PDF
 # that is a scanned image (not typed text)
 # (2) files
 # a list of file names and paths to the PDFs

 !apt-get install poppler-utils
 !pip install pdf2image

 from pdf2image import convert_from_path, convert_from_bytes
 from IPython.display import display, Image

 !pip install ocrmypdf
 !pip install opencv-python
 !pip3 install PIL
 !pip3 install pytesseract
 !pip3 install pdf2image
 !sudo apt-get install tesseract-ocr

 import pytesseract

 def convertImagePDF( inputfile ):
     """Run OCR over a PDF file and return the recognised text.

     Args:
         inputfile: Path of the PDF to read. The file is opened in binary
             mode and its bytes are handed to ``convert_from_bytes``.

     Returns:
         A list of strings, one for each image that ``convert_from_bytes``
         returns for the PDF, in the same order.
     """
     # Read inside a context manager so the file handle is closed as soon
     # as the bytes are in memory, rather than whenever it gets collected.
     with open( inputfile , 'rb') as pdf_file:
         pdf_bytes = pdf_file.read()

     inputimages = convert_from_bytes(pdf_bytes)

     # str() is kept from the original so every element is a plain string.
     return [ str(pytesseract.image_to_string(inputimage)) for inputimage in inputimages ]

 # Collect the paths of the PDFs whose extracted text came back empty.
 # The position of each empty entry in pdf_as_text is used to look up the
 # matching entry in files.
 problem_pdfs = [
     files[position]
     for position, extracted in enumerate(pdf_as_text)
     if len( extracted )==0
 ]

 # OCR each of those PDFs; every element is the list convertImagePDF returns.
 image_pdf_text = list(map(convertImagePDF, problem_pdfs))
	# Variables (expected to be defined before this code runs):
	# (1) pdf_as_text
	# a list with one entry per PDF, index-aligned with `files`; some
	# entries are empty because an attempt was made to read in a PDF
	# that is a scanned image (not typed text)
	# (2) files
	# a list of file names and paths to the PDFs

	!apt-get install poppler-utils
	!pip install pdf2image

	from pdf2image import convert_from_path, convert_from_bytes
	from IPython.display import display, Image

	!pip install ocrmypdf
	!pip install opencv-python
	!pip3 install PIL
	!pip3 install pytesseract
	!pip3 install pdf2image
	!sudo apt-get install tesseract-ocr

	import pytesseract

	def convertImagePDF( inputfile ):
		"""Run OCR over a PDF file and return the recognised text.

		Args:
			inputfile: Path of the PDF to read. The file is opened in binary
				mode and its bytes are handed to ``convert_from_bytes``.

		Returns:
			A list of strings, one for each image that ``convert_from_bytes``
			returns for the PDF, in the same order.
		"""
		# The body is indented under the def; at the def's own level the
		# function would have no body and the code would not parse.
		# Read inside a context manager so the file handle is closed as soon
		# as the bytes are in memory, rather than whenever it gets collected.
		with open( inputfile , 'rb') as pdf_file:
			pdf_bytes = pdf_file.read()

		inputimages = convert_from_bytes(pdf_bytes)

		# str() is kept from the original so every element is a plain string.
		return [ str(pytesseract.image_to_string(inputimage)) for inputimage in inputimages ]

	# Collect the paths of the PDFs whose extracted text came back empty.
	# The position of each empty entry in pdf_as_text is used to look up the
	# matching entry in files.
	problem_pdfs = [
		files[position]
		for position, extracted in enumerate(pdf_as_text)
		if len( extracted )==0
	]

	# OCR each of those PDFs; every element is the list convertImagePDF returns.
	image_pdf_text = list(map(convertImagePDF, problem_pdfs))