Question
pdf Extractor Not able to infer an input media type
Hello,
I have the following short code to extract tables in csv from a pdf. But it seems I did not understand something in the call (error message after the code snippet)
import requests
import json
import pandas as pd
def get_access_token(credentials):
auth_url = "https://ims-na1.adobelogin.com/ims/token/v1"
payload = {
"grant_type": "client_credentials",
"client_id": credentials["client_id"],
"client_secret": credentials["client_secret"],
"scope": "openid,creative_sdk"
}
response = requests.post(auth_url, data=payload)
response.raise_for_status()
return response.json()["access_token"]
def get_uploadURI_and_asset(access_token, client_id):
initiate_upload_url = "https://pdf-services.adobe.io/assets"
headers = {
"Authorization": f"Bearer {access_token}",
"x-api-key": client_id,
"Content-Type": "application/json"
}
payload = {
"mediaType": "application/pdf"
}
response = requests.post(initiate_upload_url, headers=headers, json=payload)
response.raise_for_status()
upload_info = response.json()
return upload_info["uploadUri"], upload_info["assetID"]
def get_downloadURI(access_token, client_id, asset_id):
asset_url = f"https://pdf-services.adobe.io/assets/{asset_id}"
headers = {
"Authorization": f"Bearer {access_token}",
"x-api-key": client_id
}
response = requests.get(asset_url, headers=headers)
response.raise_for_status()
upload_info = response.json()
return upload_info["downloadUri"]
def upload_pdf(upload_url, pdf_path):
with open(pdf_path, 'rb') as f:
response = requests.put(upload_url, data=f, headers={'Content-Type': 'application/pdf'})
response.raise_for_status() # Raise an exception for HTTP errors
#NOT WORKING FUNCTION
def extract_tables(access_token, client_id, input_uri, output_uri):
extract_url = "https://pdf-services.adobe.io/operation/extractpdf"
payload = {
"input": {
"uri": input_uri
},
"output": {
"uri": output_uri
},
"params": {
"elementsToExtract": ["text","tables"],
"tableOutputFormat": "csv"
}
}
headers = {
"Authorization": f"Bearer {access_token}",
"x-api-key": client_id,
"Content-Type": "application/json"
}
# Debugging prints
print("Extract URL:", extract_url)
print("Headers:", headers)
print("Payload:", json.dumps(payload, indent=2))
response = requests.post(extract_url, headers=headers, json=payload)
print("Response Status Code:", response.status_code)
print("Response Content:", response.content)
response.raise_for_status() # Raise an exception for HTTP errors
return response.json()
#main process
def extract_tables_from_pdf(pdf_path, credentials_path):
# Read the credentials from the JSON file
with open(credentials_path) as f:
credentials = json.load(f)
# Get access token
access_token = get_access_token(credentials)
# Initiate the file upload and get the upload URL and asset ID
upload_url, asset_id = initiate_upload(access_token, credentials["client_id"])
# Upload the PDF file to the pre-signed URL
upload_pdf(upload_url, pdf_path)
# Get the download URI for the asset
download_uri = get_downloadURI(access_token, credentials["client_id"], asset_id)
# Extract tables from the uploaded PDF
# THIS IS WHERE IT DOESNT WORK
extract_data = extract_tables(access_token, credentials["client_id"], upload_url, download_uri)
print(extract_data)
table_list = []
# Iterate through the extracted data to find tables
for element in extract_data['elements']:
if element['type'] == 'table':
page_number = element['page_number']
table_data = element['data']
df = pd.DataFrame(table_data)
table_list.append({
'page': page_number,
'dataframe': df
})
return table_list
local_pdf_path = 'test.pdf'
credentials_path = "adobe_credentials.json" #json file with client secret and client id
tables = extract_tables_from_pdf(local_pdf_path, credentials_path)
Extract URL: https://pdf-services.adobe.io/operation/extractpdf Headers: {'Authorization': 'Bearer <result of get_access_token()>',
'x-api-key': '<client_id from my cred file>',
'Content-Type': 'application/json'} Payload: { "input": { "uri": "<uri from get_uploadURI_and_asset()>" }, "output": { "uri": "<uri from get_downloadURI()>" }, "params": { "elementsToExtract": [ "tables" ], "tableOutputFormat": "csv" } } Response Status Code: 400 Response Content: b'{"error":{"code":"BAD_PARAMS","message":"Not able to infer an input media type. Check the provided input and try again."}}'
--------------------------------------------------------------------------- HTTPError Traceback (most recent call last) Cell In[2], line 122 120 local_pdf_path = 'test.pdf' 121 credentials_path = "adobe_credentials.json" --> 122 tables = extract_tables_from_pdf(local_pdf_path, credentials_path) 124 # Displaying the tables extracted 125 for table in tables: Cell In[2], line 101, in extract_tables_from_pdf(pdf_path, credentials_path) 97 download_uri = get_downloadURI(access_token, credentials["client_id"], asset_id) 100 # Extract tables from the uploaded PDF --> 101 extract_data = extract_tables(access_token, credentials["client_id"], upload_url, download_uri) 103 print(extract_data) 104 table_list = [] Cell In[2], line 78, in extract_tables(access_token, client_id, input_uri, output_uri) 75 print("Response Status Code:", response.status_code) 76 print("Response Content:", response.content) ---> 78 response.raise_for_status() # Raise an exception for HTTP errors 79 return response.json() File /opt/conda/lib/python3.10/site-packages/requests/models.py:1021, in Response.raise_for_status(self) 1016 http_error_msg = ( 1017 f"{self.status_code} Server Error: {reason} for url: {self.url}" 1018 ) 1020 if http_error_msg: -> 1021 raise HTTPError(http_error_msg, response=self) HTTPError: 400 Client Error: Bad Request for url: https://pdf-services.adobe.io/operation/extractpdf
Can someone help me ? I did copy the documentation for the params but something went obviously wrong, but in the doc there is no mention to add media type anywhere
Thanks !
