Skip to main content
Participant
July 27, 2024
Question

pdf Extractor Not able to infer an input media type

  • July 27, 2024
  • 1 reply
  • 456 views

Hello,
I have the following short code to extract tables in csv from a pdf. But it seems I did not understand something in the call (error message after the code snippet) 

 

import requests
import json
import pandas as pd

def get_access_token(credentials):
    """Request an access token with the client-credentials grant.

    Args:
        credentials: Mapping that provides the ``client_id`` and
            ``client_secret`` entries.

    Returns:
        The ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an HTTP
            error status.
    """
    token_endpoint = "https://ims-na1.adobelogin.com/ims/token/v1"
    # Sent as form data (``data=``), not as a JSON body.
    form_fields = dict(
        grant_type="client_credentials",
        client_id=credentials["client_id"],
        client_secret=credentials["client_secret"],
        scope="openid,creative_sdk",
    )

    token_response = requests.post(token_endpoint, data=form_fields)
    token_response.raise_for_status()
    token_body = token_response.json()
    return token_body["access_token"]

def get_uploadURI_and_asset(access_token, client_id):
    """Register a new PDF asset and return where to upload it.

    Posts a JSON body declaring the ``application/pdf`` media type to the
    assets endpoint.

    Args:
        access_token: Bearer token placed in the ``Authorization`` header.
        client_id: Value sent as the ``x-api-key`` header.

    Returns:
        A ``(uploadUri, assetID)`` tuple read from the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error
            status.
    """
    request_headers = {
        "Authorization": f"Bearer {access_token}",
        "x-api-key": client_id,
        "Content-Type": "application/json",
    }
    asset_response = requests.post(
        "https://pdf-services.adobe.io/assets",
        headers=request_headers,
        json={"mediaType": "application/pdf"},
    )
    asset_response.raise_for_status()

    asset_info = asset_response.json()
    upload_uri = asset_info["uploadUri"]
    asset_id = asset_info["assetID"]
    return upload_uri, asset_id

def get_downloadURI(access_token, client_id, asset_id):
    """Look up an asset and return its download URI.

    Args:
        access_token: Bearer token placed in the ``Authorization`` header.
        client_id: Value sent as the ``x-api-key`` header.
        asset_id: Identifier appended to the assets endpoint URL.

    Returns:
        The ``downloadUri`` field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error
            status.
    """
    request_headers = {
        "Authorization": f"Bearer {access_token}",
        "x-api-key": client_id,
    }
    asset_response = requests.get(
        f"https://pdf-services.adobe.io/assets/{asset_id}",
        headers=request_headers,
    )
    asset_response.raise_for_status()
    asset_info = asset_response.json()
    return asset_info["downloadUri"]

def upload_pdf(upload_url, pdf_path):
    """Upload the file at *pdf_path* to *upload_url* with an HTTP PUT.

    The file is opened in binary mode and handed to ``requests`` as the
    request body, with an ``application/pdf`` content type.

    Raises:
        requests.HTTPError: If the upload answers with an HTTP error status.
    """
    pdf_headers = {'Content-Type': 'application/pdf'}
    with open(pdf_path, 'rb') as pdf_file:
        put_response = requests.put(
            upload_url,
            data=pdf_file,
            headers=pdf_headers,
        )
    # Surface HTTP errors as exceptions once the file handle is closed.
    put_response.raise_for_status()

#NOT WORKING FUNCTION
def extract_tables(access_token, client_id, input_uri, output_uri):
    """Submit an Extract PDF request and return the decoded JSON response.

    Builds a payload with ``input``/``output`` URIs and extraction params
    (text and tables, tables as CSV), prints debugging information, and
    POSTs it to the ``extractpdf`` operation endpoint.

    Args:
        access_token: Bearer token placed in the ``Authorization`` header.
        client_id: Value sent as the ``x-api-key`` header.
        input_uri: URI placed under ``input.uri`` in the payload.
        output_uri: URI placed under ``output.uri`` in the payload.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error
            status.

    NOTE(review): this request shape is reported to be rejected with
    ``400 BAD_PARAMS`` ("Not able to infer an input media type"). The job
    presumably has to be created from the uploaded asset without an output
    URI and then polled for completion -- confirm against the Extract PDF
    API reference before relying on this function.
    """
    extract_url = "https://pdf-services.adobe.io/operation/extractpdf"
    payload = {
        "input": {
            "uri": input_uri
        },
        "output": {
            "uri": output_uri
        },
        "params": {
            "elementsToExtract": ["text", "tables"],
            "tableOutputFormat": "csv"
        }
    }
    headers = {
        "Authorization": f"Bearer {access_token}",
        "x-api-key": client_id,
        "Content-Type": "application/json"
    }

    # Debugging prints. The bearer token is masked in the printed copy so
    # the credential does not leak into notebook output or logs; the real
    # headers are still the ones sent with the request below.
    printable_headers = {**headers, "Authorization": "Bearer <redacted>"}
    print("Extract URL:", extract_url)
    print("Headers:", printable_headers)
    print("Payload:", json.dumps(payload, indent=2))

    response = requests.post(extract_url, headers=headers, json=payload)
    print("Response Status Code:", response.status_code)
    print("Response Content:", response.content)

    response.raise_for_status()  # Raise an exception for HTTP errors
    return response.json()


#main process
def extract_tables_from_pdf(pdf_path, credentials_path):
    """Upload a PDF, run table extraction, and collect tables as DataFrames.

    Args:
        pdf_path: Path of the local PDF file to upload.
        credentials_path: Path of a JSON file providing ``client_id`` and
            ``client_secret``.

    Returns:
        A list of dicts, one per extracted table, each with a ``'page'``
        entry (the element's ``page_number``) and a ``'dataframe'`` entry
        (a ``pandas.DataFrame`` built from the element's ``data``).

    Raises:
        requests.HTTPError: Propagated from any of the HTTP helper calls.
        KeyError: If the credentials or the extraction result lack an
            expected key.

    NOTE(review): the loop below expects the extraction result to carry an
    ``'elements'`` list whose table entries have ``'type'``,
    ``'page_number'`` and ``'data'`` keys -- verify this against the
    actual service response.
    """
    # Read the credentials from the JSON file
    with open(credentials_path, encoding="utf-8") as f:
        credentials = json.load(f)

    # Get access token
    access_token = get_access_token(credentials)
    client_id = credentials["client_id"]

    # Initiate the file upload and get the upload URL and asset ID.
    # The helper defined in this file is get_uploadURI_and_asset; the
    # previously called name `initiate_upload` is not defined anywhere here.
    upload_url, asset_id = get_uploadURI_and_asset(access_token, client_id)

    # Upload the PDF file to the pre-signed URL
    upload_pdf(upload_url, pdf_path)

    # Get the download URI for the asset
    download_uri = get_downloadURI(access_token, client_id, asset_id)

    # Extract tables from the uploaded PDF
    # THIS IS WHERE IT DOESN'T WORK
    extract_data = extract_tables(access_token, client_id, upload_url, download_uri)

    print(extract_data)

    # Keep only the table elements, pairing each page number with a
    # DataFrame built from the element's data.
    return [
        {
            'page': element['page_number'],
            'dataframe': pd.DataFrame(element['data']),
        }
        for element in extract_data['elements']
        if element['type'] == 'table'
    ]


# Script entry: run the extraction pipeline on a local sample PDF.
local_pdf_path = 'test.pdf'
credentials_path = "adobe_credentials.json" # JSON file holding the client_id and client_secret
tables = extract_tables_from_pdf(local_pdf_path, credentials_path)

 

 

Extract URL: https://pdf-services.adobe.io/operation/extractpdf
Headers: {'Authorization': 'Bearer <result of get_access_token()>', 
'x-api-key': '<client_id from my cred file>',
'Content-Type': 'application/json'} Payload: { "input": { "uri": "<uri from get_uploadURI_and_asset()>" }, "output": { "uri": "<uri from get_downloadURI()>" }, "params": { "elementsToExtract": [ "tables" ], "tableOutputFormat": "csv" } } Response Status Code: 400 Response Content: b'{"error":{"code":"BAD_PARAMS","message":"Not able to infer an input media type. Check the provided input and try again."}}'
 
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
Cell In[2], line 122
    120 local_pdf_path = 'test.pdf'
    121 credentials_path = "adobe_credentials.json"
--> 122 tables = extract_tables_from_pdf(local_pdf_path, credentials_path)
    124 # Displaying the tables extracted
    125 for table in tables:

Cell In[2], line 101, in extract_tables_from_pdf(pdf_path, credentials_path)
     97 download_uri = get_downloadURI(access_token, credentials["client_id"], asset_id)
    100 # Extract tables from the uploaded PDF
--> 101 extract_data = extract_tables(access_token, credentials["client_id"], upload_url, download_uri)
    103 print(extract_data)
    104 table_list = []

Cell In[2], line 78, in extract_tables(access_token, client_id, input_uri, output_uri)
     75 print("Response Status Code:", response.status_code)
     76 print("Response Content:", response.content)
---> 78 response.raise_for_status()  # Raise an exception for HTTP errors
     79 return response.json()

File /opt/conda/lib/python3.10/site-packages/requests/models.py:1021, in Response.raise_for_status(self)
   1016     http_error_msg = (
   1017         f"{self.status_code} Server Error: {reason} for url: {self.url}"
   1018     )
   1020 if http_error_msg:
-> 1021     raise HTTPError(http_error_msg, response=self)

HTTPError: 400 Client Error: Bad Request for url: https://pdf-services.adobe.io/operation/extractpdf



Can someone help me? I copied the params from the documentation, but something obviously went wrong, and the documentation does not mention adding a media type anywhere.

Thanks !

This topic has been closed for replies.

1 reply

Raymond Camden
Community Manager
Community Manager
July 29, 2024

Your logic is a bit off there. You can't get the download url until after the job is complete. The logic should be - create the job (and do not pass that download uri), poll for completion, and when it's done, it gives you a URL to download the bits.

 

Now, you CAN have our APIs automatically write to cloud storage, and in _that_ instance, you pass both an input and output URL, but that's not what you are doing here.

Participant
July 29, 2024

Ok ! 🙂 
Then I do not understand.
In the documentation https://developer.adobe.com/document-services/docs/apis/#tag/Extract-PDF
a JSON payload example is provided:
{

  • "input": {},
  • "output": {},
  • "params": {
    • "getCharBounds": false,
    • "includeStyling": false,
    • "elementsToExtract": [
      • "text",
      • "tables"
      ],
    • "tableOutputFormat": "xlsx",
    • "renditionsToExtract": [
      • "tables",
      • "figures"
      ],
    • "includeHeaderFooter": false,
    • "tagEncapsulatedText": [
      • "Figure"
      ]
    },
  • "notifiers": []

}

 

so what are the input and output uri I should give ?

I tried with 

 

payload = {
                "input": {
                        "uri": input_uri
                },
                "params": {
                        "elementsToExtract": ["tables"],
                        "tableOutputFormat": "csv"
                }
    }
    headers = {
        "Authorization": f"Bearer {access_token}",
        "x-api-key": client_id,
        "Content-Type": "application/json"
    }

 

It doesn't work either (same error).