erwan-lemonnier · February 18, 2021 17:15
diff --git a/pdf2emails-image-2-vision.py b/pdf2emails-image-2-vision.py
        # Use the Vision API to extract all the text in the image stored at
        # gs://<bucket_name>/<IMG_NAME>.
        response = gvc.annotate_image({
            'image': {
                'source': {
                    'image_uri': 'gs://%s/%s' % (bucket_name, IMG_NAME),
                },
            },
            'features': [
                {
                    'type_': vision.Feature.Type.DOCUMENT_TEXT_DETECTION,
                },
            ],
        })

        # The first annotation gathers ALL the text parsed in the image, so it
        # is the only one that needs to be read. The previous filter
        # (`description.count('@') > 1`) skipped it whenever the page held a
        # single address, silently dropping that address, and could re-read a
        # later annotation that happened to contain several '@'.
        annotations = response.text_annotations
        if annotations:
            # Here we are assuming that every line containing an '@' is an
            # email address. You'll have to tweak that if your pages don't
            # only contain email addresses.
            for line in annotations[0].description.split('\n'):
                if '@' in line:
                    # Normalise: trim, lowercase and remove inner spaces.
                    emails.append(line.strip().lower().replace(' ', ''))
	# Use the Vision API to extract all the text in the image stored at
	# gs://<bucket_name>/<IMG_NAME>.
	response = gvc.annotate_image({
		'image': {
			'source': {
				'image_uri': 'gs://%s/%s' % (bucket_name, IMG_NAME),
			},
		},
		'features': [
			{
				'type_': vision.Feature.Type.DOCUMENT_TEXT_DETECTION,
			},
		],
	})

	# The first annotation gathers ALL the text parsed in the image, so it
	# is the only one that needs to be read. Filtering annotations on
	# `description.count('@') > 1` would skip it whenever the page held a
	# single address, silently dropping that address.
	annotations = response.text_annotations
	if annotations:
		# Here we are assuming that every line containing an '@' is an
		# email address. You'll have to tweak that if your pages don't
		# only contain email addresses.
		for line in annotations[0].description.split('\n'):
			if '@' in line:
				# Normalise: trim, lowercase and remove inner spaces.
				emails.append(line.strip().lower().replace(' ', ''))