-
-
Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileReader | |
from pprint import pprint | |
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect the fonts used and the fonts embedded beneath obj.

    If there is a key called '/BaseFont', that is a font that is used in the
    document. If there is a key called '/FontName' and another key in the same
    dictionary object that is called '/FontFile', '/FontFile2' or '/FontFile3',
    then that font name is embedded.

    obj   -- any value from the object tree; only mappings (anything with a
             keys() method) and lists/tuples are descended into.
    fnt   -- set receiving the used font names (modified in place).
    emb   -- set receiving the embedded font names (modified in place).
    _seen -- internal bookkeeping for the recursion; callers leave it unset.

    Returns the tuple (fnt, emb): the same two sets that were passed in, so
    the result can always be fed to set.union() by the caller.
    '''
    if _seen is None:
        _seen = {}

    # Items stored inside arrays may be indirect references rather than the
    # objects themselves; resolve them when the object offers a resolver.
    # NOTE(review): get_object / getObject are the PyPDF2 spellings -- confirm
    # against the installed version.
    resolver = getattr(obj, 'get_object', None) or getattr(obj, 'getObject', None)
    if callable(resolver):
        obj = resolver()

    # Object graphs can contain back-references (e.g. '/Parent'); visiting a
    # container twice would recurse forever.  The object is stored as the
    # value so its id() cannot be reused while the walk is running.
    if id(obj) in _seen:
        return fnt, emb

    if hasattr(obj, 'keys'):
        _seen[id(obj)] = obj
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        if '/FontName' in obj:
            # Embedded only if a font program sits in the same dictionary.
            fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
            if any(key in obj for key in fontkeys):
                emb.add(obj['/FontName'])
        for k in obj.keys():
            walk(obj[k], fnt, emb, _seen)
    elif isinstance(obj, (list, tuple)):
        # Arrays (such as '/DescendantFonts' of composite fonts) can hold
        # further dictionaries that describe fonts.
        _seen[id(obj)] = obj
        for item in obj:
            walk(item, fnt, emb, _seen)

    return fnt, emb
if __name__ == '__main__':
    # Report every font used in the PDF and list the ones that are not
    # embedded.
    fname = 'myfile.pdf'
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        #
        # walk() adds to the two sets in place, so its return value is not
        # needed here.
        # NOTE(review): assumes PyPDF2's ArrayObject is a list subclass, which
        # lets this check work without importing the PyPDF2 package name.
        if isinstance(obj, list):
            for item in obj:
                if hasattr(item, 'keys'):
                    walk(item, fonts, embedded)
        elif '/Resources' in obj:
            # A page may carry no '/Resources' entry of its own; skip it
            # rather than fail with a KeyError.
            walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    # The parenthesised single-argument form prints the same text under
    # both Python 2 and Python 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
That's very handy! Is it also possible to get the text which is written in bold?
First, install PyPDF2 (pprint is part of the Python standard library and does not need to be installed):
pip install PyPDF2
Next, if using Python3, edit the following lines:
print('Font List')
pprint(sorted(list(fonts)))
…
print('\nUnembedded Fonts')
pprint(unembedded)
Hi - FYI I tried to use this script to find unembedded fonts in PDFs. I downloaded a PDF from Google docs with just Arial, and Adobe Reader says the font is embedded, but this script says it's not.
thanks for the info. This works for the PDFs I've come across, but there are so many different structures possible inside a PDF. I would definitely believe Adobe. If you want further confirmation, pdffonts is a command line tool you might be interested in. https://www.xpdfreader.com/pdffonts-man.html
Thanks - I'll try to reverse engineer what they have done. In the meantime I've asked on Stack Overflow: https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs
Figured it out! You need to modify the script to handle lists as well. I put an example in the stackoverflow answer:
https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
I updated the code as best I could. Untested though. Thanks for the info!
That's very handy! Is it also possible to get the text which is written in bold?
@tiarno any way we can do this as well
hi, can someone help a rookie out... this is a function that will return the names of the various fonts within a pdf... correct?
so i should feed it a pdf ( i assume this is the object param of the function) but what are the other 2 params? it seems like its asking me for 2 fonts..
quick explain... thanks
That's very handy! Is it also possible to get the text which is written in bold?
@tiarno any way we can do this as well
Indeed this would be really useful!
We've been using this code for quite a while to detect unembedded fonts.
I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b
We've been using this code for quite a while to detect unembedded fonts.
I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b
Very nice, I didn't even know about Type0 fonts. Thanks for the comment.
Thank you for this! It helped me out. I've gotten outputs for PDFs I've tested as follows:
{'SymbolMT', 'ArialMT', 'BCDEEE', 'BCDGEE', 'BCDFEE'}
Is there a reason why it shows 'ArialMT' instead of just 'Arial' and what exactly is the 'BCDGEE'? Is there a way to get rid of the 'MT' in 'ArialMT' and the 'BCDGEE'?
I would love this in python 3.
@mteam88 here:
from PyPDF2 import PdfReader
from pprint import pprint
import PyPDF2
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect the fonts used and the fonts embedded beneath obj.

    If there is a key called '/BaseFont', that is a font that is used in the
    document. If there is a key called '/FontName' and another key in the same
    dictionary object that is called '/FontFile', '/FontFile2' or '/FontFile3',
    then that font name is embedded.

    obj   -- any value from the object tree; only mappings (anything with a
             keys() method) and lists/tuples are descended into.
    fnt   -- set receiving the used font names (modified in place).
    emb   -- set receiving the embedded font names (modified in place).
    _seen -- internal bookkeeping for the recursion; callers leave it unset.

    Returns the tuple (fnt, emb): the same two sets that were passed in, so
    the result can always be fed to set.union() by the caller.
    '''
    if _seen is None:
        _seen = {}

    # Items stored inside arrays may be indirect references rather than the
    # objects themselves; resolve them when the object offers a resolver.
    # NOTE(review): get_object / getObject are the PyPDF2 spellings -- confirm
    # against the installed version.
    resolver = getattr(obj, 'get_object', None) or getattr(obj, 'getObject', None)
    if callable(resolver):
        obj = resolver()

    # Object graphs can contain back-references (e.g. '/Parent'); visiting a
    # container twice would recurse forever.  The object is stored as the
    # value so its id() cannot be reused while the walk is running.
    if id(obj) in _seen:
        return fnt, emb

    if hasattr(obj, 'keys'):
        _seen[id(obj)] = obj
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        if '/FontName' in obj:
            # Embedded only if a font program sits in the same dictionary.
            fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
            if any(key in obj for key in fontkeys):
                emb.add(obj['/FontName'])
        for k in obj.keys():
            walk(obj[k], fnt, emb, _seen)
    elif isinstance(obj, (list, tuple)):
        # Arrays (such as '/DescendantFonts' of composite fonts) can hold
        # further dictionaries that describe fonts.
        _seen[id(obj)] = obj
        for item in obj:
            walk(item, fnt, emb, _seen)

    return fnt, emb
if __name__ == '__main__':
    # Report every font used in the PDF and list the ones that are not
    # embedded.
    fname = 'myfile.pdf'
    pdf = PdfReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.get_object()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        #
        # walk() adds to the two sets in place, so its return value is not
        # needed here.
        if isinstance(obj, PyPDF2.generic.ArrayObject):
            for item in obj:
                if hasattr(item, 'keys'):
                    walk(item, fonts, embedded)
        elif '/Resources' in obj:
            # A page may carry no '/Resources' entry of its own; skip it
            # rather than fail with a KeyError.
            walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
Hi! Is there a way to get bold words or bold phrases inside a page containing information about font used?
Thank you very much for the script - used it to get the fonts of a pdf that I couldn't extract by other means!