Copy link to clipboard
Copied
Hello community,
I always encounter the same error when I try to extract text from a PDF. The error occurs during the first one or two attempts to get the job result, so I have to retry up to three times before the program can continue; the extraction then succeeds after a couple of tries. Is this expected, or am I doing something wrong?
SdkException: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'dcplatformstorageservice-prod-us-east-1.s3-accelerate.amazonaws.com\', port=443): Max retries exceeded with url: /...%40techacct.adobe.com/2bfe418e-cb4...?X-Amz-Security-Token=some long token. (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fb05cc3c040>: Failed to resolve \'dcplatformstorageservice-prod-us-east-1.s3-accelerate.amazonaws.com\' ([Errno -3] Temporary failure in name resolution)"))')), <traceback object at 0x7fb05cc12d40>)
- I'm using pdfservices-sdk 4.0.0 for Python.
This is my facade class for accessing the Adobe PDF Extract service.
class ExtractTextInfoFromPDF:
    """Facade over the Adobe PDF Services SDK for extracting text from a PDF.

    Intended call order: construct, optionally `change_configs`, then
    `start_pdf_services`, `upload_input_stream` and finally `extract_text`.
    The setup methods return `self` so the calls can be chained.
    """

    def __init__(self):
        self._credentials_setup()
        self._default_configs()
        # Both are populated later by `upload_input_stream` and
        # `start_pdf_services` respectively.
        self._asset = None
        self._pdf_services = None

    def _credentials_setup(self):
        """Build the service credentials from environment variables.

        Reads `PDF_SERVICES_CLIENT_ID` and `PDF_SERVICES_CLIENT_SECRET`.
        """
        self._credentials = ServicePrincipalCredentials(
            client_id=os.getenv("PDF_SERVICES_CLIENT_ID"),
            client_secret=os.getenv("PDF_SERVICES_CLIENT_SECRET"),
        )

    def _default_configs(self):
        """Install the default client timeouts and retry policy."""
        self._configs = ClientConfig(read_timeout=10000, connect_timeout=4000)
        self._max_retries = 3
        # Base delay (seconds) between retries; doubled after each failure.
        self._retry_backoff_s = 1.0

    def _create_pdf_job(self) -> ExtractPDFJob:
        """Create a text-only extract job for the uploaded asset.

        Raises:
            SdkException: if no asset has been uploaded yet.
        """
        if not self._asset:
            raise SdkException(
                "You must `upload_input_stream` before creating a pdf job."
            )
        params = ExtractPDFParams(
            elements_to_extract=[ExtractElementType.TEXT],
        )
        return ExtractPDFJob(input_asset=self._asset, extract_pdf_params=params)

    def change_configs(
        self,
        read_timeout_ms: int = 10000,
        connect_timeout_ms: int = 4000,
        max_retries: int = 3,
        retry_backoff_s: float = 1.0,
    ) -> "ExtractTextInfoFromPDF":
        """
        Change default configs.

        Must be called before `start_pdf_services` for the timeouts to take
        effect, because the client is built from the configs at that point.

        Args:
            read_timeout_ms (int): The number of milliseconds the client will wait for the server to send a response.
            connect_timeout_ms (int): The number of milliseconds Requests will wait for the client to establish a connection to Server.
            max_retries (int): The max retries to get the `extract_text` job.
            retry_backoff_s (float): Base delay in seconds between retries; it is
                doubled after every failed attempt. Negative values are treated as 0.

        Returns:
            This instance, to allow call chaining.
        """
        self._configs = ClientConfig(
            read_timeout=read_timeout_ms, connect_timeout=connect_timeout_ms
        )
        self._max_retries = max_retries
        # `time.sleep` rejects negative durations, so clamp here once.
        self._retry_backoff_s = max(0.0, retry_backoff_s)
        return self

    def start_pdf_services(self) -> "ExtractTextInfoFromPDF":
        """
        Start adobe pdf services using configs previously defined.

        Returns:
            This instance, to allow call chaining.
        """
        self._pdf_services = PDFServices(
            credentials=self._credentials, client_config=self._configs
        )
        return self

    def upload_input_stream(self, input_stream: bytes) -> "ExtractTextInfoFromPDF":
        """
        Upload file to use by `extract_text` job.

        Returns:
            This instance, to allow call chaining (consistent with
            `change_configs` and `start_pdf_services`).

        Raises:
            SdkException: if `start_pdf_services` has not been called.
            It could raise other `Exceptions` from the SDK upload call, be
            aware of using it without try.
        """
        if not self._pdf_services:
            raise SdkException(
                "You must initialize services calling `start_pdf_services`."
            )
        self._asset = self._pdf_services.upload(
            input_stream=input_stream, mime_type=PDFServicesMediaType.PDF
        )
        return self

    def extract_text(self) -> bytes | None:
        """
        Extract text from the provided PDF input stream.

        The extract job is submitted at most once. When fetching the result
        fails with `SdkException`, only the submit/fetch step that failed is
        retried against the same polling URL, with an exponentially growing
        delay between attempts, instead of creating a brand-new job each time.

        Returns:
            The value of `ExtractPDFResult.get_content_json()`.

        Raises:
            SdkException: if `start_pdf_services` or `upload_input_stream`
                has not been called (raised immediately, never retried).
            ServiceApiException, ServiceUsageException: propagated unchanged
                and never retried.
            Exception: when every attempt failed with `SdkException`; the last
                `SdkException` is attached as `__cause__`.
        """
        if not self._pdf_services:
            raise SdkException(
                "You must initialize services calling `start_pdf_services`."
            )

        # Local import keeps this block self-contained regardless of the
        # module's import section.
        import time

        # Built outside the retry loop: a missing upload is a usage error and
        # retrying it can never succeed.
        extract_pdf_job = self._create_pdf_job()

        # Always try at least once, even if `max_retries` was set to 0 or less;
        # otherwise the method would silently return None without doing work.
        attempts = max(1, self._max_retries)
        polling_url = None

        for attempt in range(attempts):
            try:
                if polling_url is None:
                    polling_url = self._pdf_services.submit(extract_pdf_job)
                job_response: PDFServicesResponse = self._pdf_services.get_job_result(
                    polling_url=polling_url, result_type=ExtractPDFResult
                )
                result: ExtractPDFResult = job_response.get_result()
                return result.get_content_json()
            except (ServiceApiException, ServiceUsageException):
                # Listed first so these are never retried, whatever their
                # relationship to SdkException in the installed SDK version.
                raise
            except SdkException as e:
                # Log the exception if you have a logging mechanism
                logger.error(f"Attempt {attempt + 1} failed due to SdkException: {e}")
                if attempt == attempts - 1:
                    # Raise the exception after the final attempt
                    raise Exception(
                        "Max retries exceeded while trying to extract text from PDF due to SdkException."
                    ) from e
                # Give transient network problems (e.g. DNS resolution
                # failures) time to clear before the next attempt.
                time.sleep(self._retry_backoff_s * (2 ** attempt))
Have something to add?