Unable to Extract Any Data from PDFs
- November 16, 2021
- 2 replies
- 11629 views
I'm facing some Adobe services exceptions while running the Python SDK of Adobe PDF Extract API Service.
The confusing part is that I hit this exception only when I use one of my own PDF data sets. It works successfully for the sample PDF that ships with the SDK, "extractPdfInput.pdf"; with that file I'm able to generate the JSON structure from all the .py files inside src
1) .py script with my PDF data set : (AnalogDialogue.pdf)
import logging
import os.path
import zipfile
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
# Extract the text elements of a local PDF with the Adobe PDF Extract API,
# save the resulting ZIP under <base_path>/output, and unpack
# structuredData.json from it into the current working directory.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

try:
    # Base path: walk up three levels from this file's absolute path.
    base_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    # Initial setup, create credentials instance.
    credentials = Credentials.service_account_credentials_builder() \
        .from_file(base_path + "/pdfservices-api-credentials.json") \
        .build()

    # Create an ExecutionContext using credentials and create a new operation instance.
    execution_context = ExecutionContext.create(credentials)
    extract_pdf_operation = ExtractPDFOperation.create_new()

    # Set operation input from a source file.
    source = FileRef.create_from_local_file(base_path + "/resources/AnalogDialogue.pdf")
    extract_pdf_operation.set_input(source)

    # Build ExtractPDF options (text elements only) and set them into the operation.
    extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
        .with_element_to_extract(ExtractElementType.TEXT) \
        .build()
    extract_pdf_operation.set_options(extract_pdf_options)

    # Execute the operation.
    # NOTE(review): the accompanying log shows a socket write timeout raised
    # from this call. The SDK documentation describes a ClientConfig with
    # custom connect/read timeouts that can be supplied when creating the
    # ExecutionContext -- confirm against the SDK docs before relying on it.
    result: FileRef = extract_pdf_operation.execute(execution_context)

    # Save the result to the specified location.
    output_zip = base_path + "/output/ExtractTextInfoFromPDF.zip"
    result.save_as(output_zip)

    # Extract the JSON member from the result archive. The target is a
    # relative path, so it is written to the current working directory.
    file_to_extract = "structuredData.json"
    with zipfile.ZipFile(output_zip) as z:
        with open(file_to_extract, 'wb') as f:
            f.write(z.read(file_to_extract))
    print("Extracted", file_to_extract)
except (ServiceApiException, ServiceUsageException, SdkException):
    # Log the full traceback; the exception is not re-raised.
    logging.exception("Exception encountered while executing operation")
- Terminal log while running "adobe-pdf-extract/src/extractpdf/extract_txt_from_pdf.py"
python3 src/extractpdf/extract_txt_table_info_with_figure_tables_rendition_from_pdf.py
INFO:adobe.pdfservices.operation.pdfops.extract_pdf_operation:All validations successfully done. Beginning ExtractPDF operation execution
INFO:adobe.pdfservices.operation.pdfops.extract_pdf_operation:Extract Operation Successful - Transaction ID: lUFDE1p1OC3oxgDtCeIdW6HeWmVc14Ry
INFO:adobe.pdfservices.operation.internal.io.file_ref_impl:Moving file at /var/folders/z_/hrr9wxg135x30vrj32b868100000gp/T/extractSdkResult/b22cc67a46ab11ec9955b88d120e91a8.zip to target /Users/achal/Downloads/PDFServices/adobe-pdf-extract/output/ExtractTextTableWithFigureTableRendition.zip
admins-MacBook-Air-3:adobe-pdf-extract achal$ python3 src/extractpdf/extract_txt_table_info_with_figure_tables_rendition_from_pdf.py
INFO:adobe.pdfservices.operation.pdfops.extract_pdf_operation:All validations successfully done. Beginning ExtractPDF operation execution
ERROR:root:Exception encountered while executing operation
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 706, in urlopen
chunked=chunked,
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1262, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1308, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1257, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1067, in _send_output
self.send(chunk)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 989, in send
self.sock.sendall(data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 1034, in sendall
v = self.send(byte_view[count:])
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 1003, in send
return self._sslobj.write(data)
socket.timeout: The write operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 756, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/retry.py", line 531, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/packages/six.py", line 734, in reraise
raise value.with_traceback(tb)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 706, in urlopen
chunked=chunked,
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1262, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1308, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1257, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1067, in _send_output
self.send(chunk)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 989, in send
self.sock.sendall(data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 1034, in sendall
v = self.send(byte_view[count:])
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 1003, in send
return self._sslobj.write(data)
urllib3.exceptions.ProtocolError: ('Connection aborted.', timeout('The write operation timed out'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/internal/http/http_client.py", line 73, in _execute_request
timeout=timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/api.py", line 119, in post
return request('post', url, data=data, json=json, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', timeout('The write operation timed out'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "src/extractpdf/extract_txt_table_info_with_figure_tables_rendition_from_pdf.py", line 53, in <module>
result: FileRef = extract_pdf_operation.execute(execution_context)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py", line 131, in execute
location = ExtractPDFAPI.extract_pdf(execution_context, self._source_file_ref, self._extract_pdf_options)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py", line 43, in extract_pdf
ServiceConstants.EXTRACT_OPERATION_NAME)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py", line 65, in cpf_create_ops_api
error_response_handler=CPFApi.handle_error_response)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/internal/http/http_client.py", line 41, in process_request
response = _execute_request(http_request)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/adobe/pdfservices/operation/internal/http/http_client.py", line 81, in _execute_request
raise SdkException("Request could not be completed. Possible cause attached!", sys.exc_info())
adobe.pdfservices.operation.exception.exceptions.SdkException: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x10ff36500>)
