SdkException: Request could not be completed (requests.exceptions.ConnectionError)

Report · Sep 17, 2024

Hello community,

I consistently run into the same error when I try to extract text from a PDF. The first one or two attempts to get the job result fail with the exception below, so I have to retry (up to three times) before the program can continue; the extraction then succeeds after a couple of tries. Is this expected, or am I doing something wrong?

SdkException: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'dcplatformstorageservice-prod-us-east-1.s3-accelerate.amazonaws.com\', port=443): Max retries exceeded with url: /...%40techacct.adobe.com/2bfe418e-cb4...?X-Amz-Security-Token=some long token. (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fb05cc3c040>: Failed to resolve \'dcplatformstorageservice-prod-us-east-1.s3-accelerate.amazonaws.com\' ([Errno -3] Temporary failure in name resolution)"))')), <traceback object at 0x7fb05cc12d40>)

- I'm using pdfservices-sdk 4.0.0 for Python.
This is my facade class for accessing the Adobe Extract services.

class ExtractTextInfoFromPDF:
    """Facade over the Adobe PDF Services SDK for extracting text from a PDF.

    Intended call order: ``start_pdf_services()`` -> ``upload_input_stream()``
    -> ``extract_text()``. ``change_configs()`` may be called before
    ``start_pdf_services()`` to override timeouts and retry behaviour.

    Credentials are read from the ``PDF_SERVICES_CLIENT_ID`` and
    ``PDF_SERVICES_CLIENT_SECRET`` environment variables.
    """

    def __init__(self):
        # Nothing is uploaded and no client exists until the caller asks.
        self._asset = None
        self._pdf_services = None
        self._credentials_setup()
        self._default_configs()

    def _credentials_setup(self):
        """Build the service-principal credentials from environment variables."""
        self._credentials = ServicePrincipalCredentials(
            client_id=os.getenv("PDF_SERVICES_CLIENT_ID"),
            client_secret=os.getenv("PDF_SERVICES_CLIENT_SECRET"),
        )

    def _default_configs(self):
        """Install the default client timeouts and retry policy."""
        self._configs = ClientConfig(read_timeout=10000, connect_timeout=4000)
        self._max_retries = 3
        # Base delay (seconds) between retries; doubled after each failure.
        self._retry_backoff_s = 0.5

    def _create_pdf_job(self) -> ExtractPDFJob:
        """Create a text-only extract job for the previously uploaded asset.

        Raises:
            SdkException: If no asset has been uploaded yet.
        """
        if not self._asset:
            raise SdkException(
                "You must `upload_input_stream` before creating a pdf job."
            )
        params = ExtractPDFParams(
            elements_to_extract=[ExtractElementType.TEXT],
        )
        return ExtractPDFJob(input_asset=self._asset, extract_pdf_params=params)

    def change_configs(
        self,
        read_timeout_ms: int = 10000,
        connect_timeout_ms: int = 4000,
        max_retries: int = 3,
        retry_backoff_ms: int = 500,
    ) -> "ExtractTextInfoFromPDF":
        """
        Change default configs.

        Every argument that is omitted falls back to its default, i.e. a call
        replaces the whole configuration rather than patching single values.
        The new client config only takes effect on the next call to
        `start_pdf_services`; an already created client keeps its old one.

        Args:
            read_timeout_ms (int): The number of milliseconds the client will wait for the server to send a response.
            connect_timeout_ms (int): The number of milliseconds Requests will wait for the client to establish a connection to Server.
            max_retries (int): The max retries to get the `extract_text` job.
                Values below 1 are treated as a single attempt.
            retry_backoff_ms (int): Base delay in milliseconds before the
                second attempt; the delay doubles after each further failure.
                Negative values are treated as 0.

        Returns:
            This instance, to allow call chaining.
        """
        self._configs = ClientConfig(
            read_timeout=read_timeout_ms, connect_timeout=connect_timeout_ms
        )
        self._max_retries = max_retries
        self._retry_backoff_s = max(0, retry_backoff_ms) / 1000
        return self

    def start_pdf_services(self) -> "ExtractTextInfoFromPDF":
        """
        Start adobe pdf services using configs previously defined.

        Returns:
            This instance, to allow call chaining.
        """
        self._pdf_services = PDFServices(
            credentials=self._credentials, client_config=self._configs
        )
        return self

    def upload_input_stream(self, input_stream: bytes) -> "ExtractTextInfoFromPDF":
        """
        Upload file to use by `extract_text` job.

        Returns:
            This instance, to allow call chaining (consistent with
            `change_configs` and `start_pdf_services`).

        Raises:
            SdkException: If `start_pdf_services` has not been called.
            Exception: Whatever the SDK upload raises is propagated unchanged;
                be aware of using it without try.
        """
        if not self._pdf_services:
            raise SdkException(
                "You must initialize services calling `start_pdf_services`."
            )
        self._asset = self._pdf_services.upload(
            input_stream=input_stream, mime_type=PDFServicesMediaType.PDF
        )
        return self

    def extract_text(self) -> bytes | None:
        """
        Extract text from the provided PDF input stream.

        The job is submitted once. If fetching the result fails with an
        `SdkException` (e.g. a transient connection / DNS error), the fetch is
        retried against the same polling URL with exponential backoff instead
        of submitting a new job for every attempt.

        Returns:
            The value of `ExtractPDFResult.get_content_json()` for the job.

        Raises:
            SdkException: If `start_pdf_services` or `upload_input_stream`
                has not been called first (raised immediately, not retried).
            ServiceApiException, ServiceUsageException: Propagated unchanged
                on first occurrence; these are not retried.
            Exception: After the last attempt failed with an `SdkException`
                (the original error is chained as `__cause__`).
        """
        # Function-scope import keeps this block self-contained.
        import time

        if not self._pdf_services:
            raise SdkException(
                "You must initialize services calling `start_pdf_services`."
            )

        # Build the job outside the retry loop: a missing upload is a caller
        # error, and retrying it would only hide the real message.
        extract_pdf_job = self._create_pdf_job()

        # Always make at least one attempt, even for a non-positive setting,
        # so the method never silently returns None without doing anything.
        attempts = max(1, self._max_retries)
        polling_url = None

        for attempt in range(attempts):
            try:
                # Submit only until it succeeds once; later attempts reuse the
                # polling URL so a failed fetch does not create another job.
                if polling_url is None:
                    polling_url = self._pdf_services.submit(extract_pdf_job)
                job_response = self._pdf_services.get_job_result(
                    polling_url=polling_url, result_type=ExtractPDFResult
                )
                return job_response.get_result().get_content_json()

            except (ServiceApiException, ServiceUsageException):
                # Listed first so these are never retried, whatever their
                # relationship to SdkException in the SDK's class hierarchy.
                raise

            except SdkException as e:
                logger.error(
                    "Attempt %d failed due to SdkException: %s", attempt + 1, e
                )
                if attempt == attempts - 1:
                    # Raise the exception after the final attempt
                    raise Exception(
                        "Max retries exceeded while trying to extract text from PDF due to SdkException."
                    ) from e
                # Give transient network problems time to clear before retrying.
                time.sleep(self._retry_backoff_s * (2 ** attempt))