daveebbelaar · July 13, 2024 07:08 · RemoSande · Jul 29, 2024 · daveebbelaar · Jul 29, 2024
diff --git a/document_intelligence.py b/document_intelligence.py
 import logging
 import requests
 import time
 from typing import Union, Dict
 from config.settings import get_settings


 class DocumentIntelligenceService:
    """
    A service class for interacting with the Azure Document Intelligence REST API.

    Analysis is asynchronous: a document is first submitted to a model's
    ``:analyze`` endpoint, then the model's ``analyzeResults`` endpoint is polled
    until the service reports a terminal status.

    The class-level constants below bound how long the service may block; override
    them on a subclass or an instance to tune the behaviour.
    """

    # Seconds allowed for each single HTTP request. Without an explicit timeout,
    # ``requests`` waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Seconds slept before every poll of the result endpoint.
    POLL_INTERVAL = 2
    # Upper bound, in seconds, on the total time spent polling one analysis.
    MAX_WAIT = 600
    # Statuses that end the polling loop.
    TERMINAL_STATUSES = ("succeeded", "failed")

    def __init__(self):
        """
        Initialize the DocumentIntelligenceService with API credentials and endpoint.

        The API key and endpoint are read from ``get_settings().document_intelligence``.
        """
        settings = get_settings()
        self.key = settings.document_intelligence.api_key
        self.endpoint = settings.document_intelligence.endpoint
        self.api_version = "2024-02-29-preview"  # Currently only available in East US, West US2, and West Europe

    def analyze(
        self,
        source: Union[str, bytes],
        is_url: bool = True,
        model_id: str = "prebuilt-layout",
    ) -> Dict:
        """
        Analyze a document using Azure Document Intelligence.

        Submits the document, then blocks until the service reports a terminal
        status. A result whose status is "failed" is returned to the caller, not
        raised, so callers should inspect ``result["status"]``.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            Dict: The analysis results.

        Raises:
            ValueError: If the submit response has no Operation-Location header, or
                if a bytes ``source`` is not ASCII (i.e. not base64 text).
            TimeoutError: If no terminal status is reported within ``MAX_WAIT`` seconds.
            requests.HTTPError: If the API request fails.
        """
        result_id = self._submit_analysis(source, is_url, model_id)
        return self._get_analysis_results(result_id, model_id)

    def _build_url(self, path: str) -> str:
        """
        Build a ``documentModels`` URL for ``path`` with the standard query string.

        Trailing slashes on the configured endpoint are stripped so that an
        endpoint such as ``https://host/`` does not produce a ``//`` in the path.
        """
        base = str(self.endpoint).rstrip("/")
        return (
            f"{base}/documentintelligence/documentModels/{path}"
            f"?api-version={self.api_version}&outputContentFormat=markdown"
        )

    @staticmethod
    def _build_payload(source: Union[str, bytes], is_url: bool) -> Dict:
        """
        Build the JSON request body for the ``:analyze`` call.

        Bytes passed as base64 content are decoded to ``str`` because the JSON
        encoder cannot serialize ``bytes``.

        Raises:
            ValueError: If a bytes ``source`` contains non-ASCII data.
        """
        if is_url:
            return {"urlSource": source}
        if isinstance(source, (bytes, bytearray)):
            try:
                source = bytes(source).decode("ascii")
            except UnicodeDecodeError as exc:
                # Non-ASCII bytes cannot be base64 text; most likely raw document
                # bytes were passed without being encoded first.
                raise ValueError(
                    "base64 source bytes must be ASCII; base64-encode raw document bytes first."
                ) from exc
        return {"base64Source": source}

    @staticmethod
    def _extract_result_id(operation_location: str) -> str:
        """Return the last path segment of ``operation_location``, without its query string."""
        return operation_location.split("/")[-1].split("?")[0]

    def _submit_analysis(
        self, source: Union[str, bytes], is_url: bool, model_id: str
    ) -> str:
        """
        Submit a document for analysis to Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            str: The result ID for the submitted analysis.

        Raises:
            ValueError: If the Operation-Location header is missing in the response,
                or if a bytes ``source`` is not ASCII.
            requests.HTTPError: If the API request fails.
        """
        url = self._build_url(f"{model_id}:analyze")
        headers = {
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": self.key,
        }
        data = self._build_payload(source, is_url)

        logging.info("Submitting document for analysis")
        response = requests.post(
            url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()

        # The result ID is only available via the Operation-Location header.
        operation_location = response.headers.get("Operation-Location")
        if not operation_location:
            raise ValueError("Operation-Location header is missing in the response.")

        return self._extract_result_id(operation_location)

    def _get_analysis_results(self, result_id: str, model_id: str) -> Dict:
        """
        Retrieve the analysis results from Azure Document Intelligence.

        Polls the result endpoint every ``POLL_INTERVAL`` seconds until the status
        is one of ``TERMINAL_STATUSES`` or ``MAX_WAIT`` seconds have elapsed.

        Args:
            result_id (str): The ID of the analysis result to retrieve.
            model_id (str): The ID of the model used for analysis.

        Returns:
            Dict: The analysis results (status "succeeded" or "failed").

        Raises:
            TimeoutError: If no terminal status is reported within ``MAX_WAIT`` seconds.
            requests.HTTPError: If the API request fails.
        """
        url = self._build_url(f"{model_id}/analyzeResults/{result_id}")
        headers = {"Ocp-Apim-Subscription-Key": self.key}

        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + self.MAX_WAIT
        while True:
            logging.info("Waiting for analysis to complete.")
            time.sleep(self.POLL_INTERVAL)
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            if data.get("status") in self.TERMINAL_STATUSES:
                return data

            # Checked after the poll so at least one request is always made.
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Analysis {result_id} did not complete within {self.MAX_WAIT} seconds "
                    f"(last status: {data.get('status')!r})."
                )

 if __name__ == "__main__":
    # Example usage of the DocumentIntelligenceService: analyze a public PDF by
    # URL with the default model and print the parts of the response.
    client = DocumentIntelligenceService()
    analysis_results = client.analyze(
        source="https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf"
    )
    print(analysis_results.keys())

    # analyze() also returns the payload when the status is "failed"; guard the
    # lookup so a response without "analyzeResult" is reported, not a KeyError.
    analyze_result = analysis_results.get("analyzeResult")
    if analyze_result is None:
        print(
            f"Analysis did not produce a result: status={analysis_results.get('status')!r}, "
            f"error={analysis_results.get('error')!r}"
        )
    else:
        print(analyze_result.keys())
        print(analyze_result["content"])
        # NOTE(review): "tables" is presumably omitted when the document has none — confirm.
        print(analyze_result.get("tables", []))
	import logging
	import requests
	import time
	from typing import Union, Dict
	from config.settings import get_settings


	class DocumentIntelligenceService:
	    """
	    A service class for interacting with the Azure Document Intelligence REST API.

	    Analysis is asynchronous: a document is first submitted to a model's
	    ``:analyze`` endpoint, then the model's ``analyzeResults`` endpoint is polled
	    until the service reports a terminal status.

	    The class-level constants below bound how long the service may block; override
	    them on a subclass or an instance to tune the behaviour.
	    """

	    # Seconds allowed for each single HTTP request. Without an explicit timeout,
	    # ``requests`` waits indefinitely on a stalled connection.
	    REQUEST_TIMEOUT = 30
	    # Seconds slept before every poll of the result endpoint.
	    POLL_INTERVAL = 2
	    # Upper bound, in seconds, on the total time spent polling one analysis.
	    MAX_WAIT = 600
	    # Statuses that end the polling loop.
	    TERMINAL_STATUSES = ("succeeded", "failed")

	    def __init__(self):
	        """
	        Initialize the DocumentIntelligenceService with API credentials and endpoint.

	        The API key and endpoint are read from ``get_settings().document_intelligence``.
	        """
	        settings = get_settings()
	        self.key = settings.document_intelligence.api_key
	        self.endpoint = settings.document_intelligence.endpoint
	        self.api_version = "2024-02-29-preview"  # Currently only available in East US, West US2, and West Europe

	    def analyze(
	        self,
	        source: Union[str, bytes],
	        is_url: bool = True,
	        model_id: str = "prebuilt-layout",
	    ) -> Dict:
	        """
	        Analyze a document using Azure Document Intelligence.

	        Submits the document, then blocks until the service reports a terminal
	        status. A result whose status is "failed" is returned to the caller, not
	        raised, so callers should inspect ``result["status"]``.

	        Args:
	            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
	            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
	            model_id (str): The ID of the model to use for analysis.

	        Returns:
	            Dict: The analysis results.

	        Raises:
	            ValueError: If the submit response has no Operation-Location header, or
	                if a bytes ``source`` is not ASCII (i.e. not base64 text).
	            TimeoutError: If no terminal status is reported within ``MAX_WAIT`` seconds.
	            requests.HTTPError: If the API request fails.
	        """
	        result_id = self._submit_analysis(source, is_url, model_id)
	        return self._get_analysis_results(result_id, model_id)

	    def _build_url(self, path: str) -> str:
	        """
	        Build a ``documentModels`` URL for ``path`` with the standard query string.

	        Trailing slashes on the configured endpoint are stripped so that an
	        endpoint such as ``https://host/`` does not produce a ``//`` in the path.
	        """
	        base = str(self.endpoint).rstrip("/")
	        return (
	            f"{base}/documentintelligence/documentModels/{path}"
	            f"?api-version={self.api_version}&outputContentFormat=markdown"
	        )

	    @staticmethod
	    def _build_payload(source: Union[str, bytes], is_url: bool) -> Dict:
	        """
	        Build the JSON request body for the ``:analyze`` call.

	        Bytes passed as base64 content are decoded to ``str`` because the JSON
	        encoder cannot serialize ``bytes``.

	        Raises:
	            ValueError: If a bytes ``source`` contains non-ASCII data.
	        """
	        if is_url:
	            return {"urlSource": source}
	        if isinstance(source, (bytes, bytearray)):
	            try:
	                source = bytes(source).decode("ascii")
	            except UnicodeDecodeError as exc:
	                # Non-ASCII bytes cannot be base64 text; most likely raw document
	                # bytes were passed without being encoded first.
	                raise ValueError(
	                    "base64 source bytes must be ASCII; base64-encode raw document bytes first."
	                ) from exc
	        return {"base64Source": source}

	    @staticmethod
	    def _extract_result_id(operation_location: str) -> str:
	        """Return the last path segment of ``operation_location``, without its query string."""
	        return operation_location.split("/")[-1].split("?")[0]

	    def _submit_analysis(
	        self, source: Union[str, bytes], is_url: bool, model_id: str
	    ) -> str:
	        """
	        Submit a document for analysis to Azure Document Intelligence.

	        Args:
	            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
	            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
	            model_id (str): The ID of the model to use for analysis.

	        Returns:
	            str: The result ID for the submitted analysis.

	        Raises:
	            ValueError: If the Operation-Location header is missing in the response,
	                or if a bytes ``source`` is not ASCII.
	            requests.HTTPError: If the API request fails.
	        """
	        url = self._build_url(f"{model_id}:analyze")
	        headers = {
	            "Content-Type": "application/json",
	            "Ocp-Apim-Subscription-Key": self.key,
	        }
	        data = self._build_payload(source, is_url)

	        logging.info("Submitting document for analysis")
	        response = requests.post(
	            url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT
	        )
	        response.raise_for_status()

	        # The result ID is only available via the Operation-Location header.
	        operation_location = response.headers.get("Operation-Location")
	        if not operation_location:
	            raise ValueError("Operation-Location header is missing in the response.")

	        return self._extract_result_id(operation_location)

	    def _get_analysis_results(self, result_id: str, model_id: str) -> Dict:
	        """
	        Retrieve the analysis results from Azure Document Intelligence.

	        Polls the result endpoint every ``POLL_INTERVAL`` seconds until the status
	        is one of ``TERMINAL_STATUSES`` or ``MAX_WAIT`` seconds have elapsed.

	        Args:
	            result_id (str): The ID of the analysis result to retrieve.
	            model_id (str): The ID of the model used for analysis.

	        Returns:
	            Dict: The analysis results (status "succeeded" or "failed").

	        Raises:
	            TimeoutError: If no terminal status is reported within ``MAX_WAIT`` seconds.
	            requests.HTTPError: If the API request fails.
	        """
	        url = self._build_url(f"{model_id}/analyzeResults/{result_id}")
	        headers = {"Ocp-Apim-Subscription-Key": self.key}

	        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
	        deadline = time.monotonic() + self.MAX_WAIT
	        while True:
	            logging.info("Waiting for analysis to complete.")
	            time.sleep(self.POLL_INTERVAL)
	            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
	            response.raise_for_status()
	            data = response.json()

	            if data.get("status") in self.TERMINAL_STATUSES:
	                return data

	            # Checked after the poll so at least one request is always made.
	            if time.monotonic() >= deadline:
	                raise TimeoutError(
	                    f"Analysis {result_id} did not complete within {self.MAX_WAIT} seconds "
	                    f"(last status: {data.get('status')!r})."
	                )


	if __name__ == "__main__":
	    # Example usage of the DocumentIntelligenceService: analyze a public PDF by
	    # URL with the default model and print the parts of the response.
	    client = DocumentIntelligenceService()
	    analysis_results = client.analyze(
	        source="https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf"
	    )
	    print(analysis_results.keys())

	    # analyze() also returns the payload when the status is "failed"; guard the
	    # lookup so a response without "analyzeResult" is reported, not a KeyError.
	    analyze_result = analysis_results.get("analyzeResult")
	    if analyze_result is None:
	        print(
	            f"Analysis did not produce a result: status={analysis_results.get('status')!r}, "
	            f"error={analysis_results.get('error')!r}"
	        )
	    else:
	        print(analyze_result.keys())
	        print(analyze_result["content"])
	        # NOTE(review): "tables" is presumably omitted when the document has none — confirm.
	        print(analyze_result.get("tables", []))