PDF Extract API: "Not able to infer an input media type"

Question

Hello,I have the following short code to extract tables in csv from a pdf. But it seems I did not understand something in the call (error message after the code snippet) import requests import json import pandas as pd def get_access_token(credentials): auth_url = "https://ims-na1.adobelogin.com/ims/token/v1" payload = { "grant_type": "client_credentials", "client_id": credentials["client_id"], "client_secret": credentials["client_secret"], "scope": "openid,creative_sdk" } response = requests.post(auth_url, data=payload) response.raise_for_status() return response.json()["access_token"] def get_uploadURI_and_asset(access_token, client_id): initiate_upload_url = "https://pdf-services.adobe.io/assets" headers = { "Authorization": f"Bearer {access_token}", "x-api-key": client_id, "Content-Type": "application/json" } payload = { "mediaType": "application/pdf" } response = requests.post(initiate_upload_url, headers=headers, json=payload) response.raise_for_status() upload_info = response.json() return upload_info["uploadUri"], upload_info["assetID"] def get_downloadURI(access_token, client_id, asset_id): asset_url = f"https://pdf-services.adobe.io/assets/{asset_id}" headers = { "Authorization": f"Bearer {access_token}", "x-api-key": client_id } response = requests.get(asset_url, headers=headers) response.raise_for_status() upload_info = response.json() return upload_info["downloadUri"] def upload_pdf(upload_url, pdf_path): with open(pdf_path, 'rb') as f: response = requests.put(upload_url, data=f, headers={'Content-Type': 'application/pdf'}) response.raise_for_status() # Raise an exception for HTTP errors #NOT WORKING FUNCTION def extract_tables(access_token, client_id, input_uri, output_uri): extract_url = "https://pdf-services.adobe.io/operation/extractpdf" payload = { "input": { "uri": input_uri }, "output": { "uri": output_uri }, "params": { "elementsToExtract": ["text","tables"], "tableOutputFormat": "csv" } } headers = { "Authorization": f"Bearer {access_token}", 
"x-api-key": client_id, "Content-Type": "application/json" } # Debugging prints print("Extract URL:", extract_url) print("Headers:", headers) print("Payload:", json.dumps(payload, indent=2)) response = requests.post(extract_url, headers=headers, json=payload) print("Response Status Code:", response.status_code) print("Response Content:", response.content) response.raise_for_status() # Raise an exception for HTTP errors return response.json() #main process def extract_tables_from_pdf(pdf_path, credentials_path): # Read the credentials from the JSON file with open(credentials_path) as f: credentials = json.load(f) # Get access token access_token = get_access_token(credentials) # Initiate the file upload and get the upload URL and asset ID upload_url, asset_id = initiate_upload(access_token, credentials["client_id"]) # Upload the PDF file to the pre-signed URL upload_pdf(upload_url, pdf_path) # Get the download URI for the asset download_uri = get_downloadURI(access_token, credentials["client_id"], asset_id) # Extract tables from the uploaded PDF # THIS IS WHERE IT DOESNT WORK extract_data = extract_tables(access_token, credentials["client_id"], upload_url, download_uri) print(extract_data) table_list = [] # Iterate through the extracted data to find tables for element in extract_data['elements']: if element['type'] == 'table': page_number = element['page_number'] table_data = element['data'] df = pd.DataFrame(table_data) table_list.append({ 'page': page_number, 'dataframe': df }) return table_list local_pdf_path = 'test.pdf' credentials_path = "adobe_credentials.json" #json file with client secret and client id tables = extract_tables_from_pdf(local_pdf_path, credentials_path) Extract URL: https://pdf-services.adobe.io/operation/extractpdf Headers: {'Authorization': 'Bearer ', 'x-api-key': '', 'Content-Type': 'application/json'} Payload: { "input": { "uri": "" }, "output": { "uri": "" }, "params": { "elementsToExtract": [ "tables" ], "tableOutputFormat": "csv" } } 
Response Status Code: 400 Response Content: b'{"error":{"code":"BAD_PARAMS","message":"Not able to infer an input media type. Check the provided input and try again."}}' --------------------------------------------------------------------------- HTTPError Traceback (most recent call last) Cell In[2], line 122 120 local_pdf_path = 'test.pdf' 121 credentials_path = "adobe_credentials.json" --> 122 tables = extract_tables_from_pdf(local_pdf_path, credentials_path) 124 # Displaying the tables extracted 125 for table in tables: Cell In[2], line 101, in extract_tables_from_pdf(pdf_path, credentials_path) 97 download_uri = get_downloadURI(access_token, credentials["client_id"], asset_id) 100 # Extract tables from the uploaded PDF --> 101 extract_data = extract_tables(access_token, credentials["client_id"], upload_url, download_uri) 103 print(extract_data) 104 table_list = [] Cell In[2], line 78, in extract_tables(access_token, client_id, input_uri, output_uri) 75 print("Response Status Code:", response.status_code) 76 print("Response Content:", response.content) ---> 78 response.raise_for_status() # Raise an exception for HTTP errors 79 return response.json() File /opt/conda/lib/python3.10/site-packages/requests/models.py:1021, in Response.raise_for_status(self) 1016 http_error_msg = ( 1017 f"{self.status_code} Server Error: {reason} for url: {self.url}" 1018 ) 1020 if http_error_msg: -> 1021 raise HTTPError(http_error_msg, response=self) HTTPError: 400 Client Error: Bad Request for url: https://pdf-services.adobe.io/operation/extractpdf Can someone help me? I copied the params from the documentation, but something obviously went wrong, and the docs make no mention of adding a media type anywhere. Thanks!

Raymond Camden · Answer

Your logic is a bit off there. You can't get the download URL until after the job is complete. The logic should be: create the job (and do not pass that download URI), poll for completion, and when it's done, it gives you a URL to download the bits.

Now, you CAN have our APIs automatically write to cloud storage, and in _that_ instance, you pass both an input and output URL, but that's not what you are doing here.

Sign up

To post, reply, or follow discussions, please sign in with your Adobe ID.

Sign in to Adobe Community

To post, reply, or follow discussions, please sign in with your Adobe ID.

Scanning file for viruses.

This file cannot be downloaded