Skip to main content
Participant
January 19, 2025
Question

Getting stuck in the middle of uploading with the PDF Extract API

  • January 19, 2025
  • 0 replies
  • 123 views

Here is my implementation: 

import {
    ServicePrincipalCredentials,
    PDFServices,
    MimeType,
    ExtractPDFParams,
    ExtractElementType,
    ExtractPDFJob,
    ExtractPDFResult,
    TableStructureType,
    ExtractRenditionsElementType
  } from "@adobe/pdfservices-node-sdk";
  import * as fs from "fs";
  import AdmZip from "adm-zip"
  import * as dotenv from "dotenv"
  import { parentPort, workerData } from "worker_threads";
  import { PassThrough,Readable } from "stream";
  import crypto,{Hash} from 'crypto';
 
  dotenv.config()

  const workerPassThrough = new PassThrough();


  parentPort?.on('message', (message) => {
    if (message.type === 'chunk') {
        workerPassThrough.write(message.data);
    } else if (message.type === 'end') {
        workerPassThrough.end();
        (async () => {
            try {
                const inputStream: Readable = workerPassThrough;
                const filename: string = workerData.filename;
                const response: string = await processPDF(inputStream, filename);
                parentPort?.postMessage(response);
            } catch (error:any) {
                parentPort?.postMessage({ error: error.message });
            }
        })();
    }
});

workerPassThrough.on('data', (chunk) => {
  console.log('Worker processing chunk:',chunk.length);
});


  workerPassThrough.on('end', () => {
      parentPort?.postMessage('Stream processing complete!');
  });


  const processPDF = async(inputStream:Readable,filename:string):Promise<string>=>{

    const credentials = new ServicePrincipalCredentials({
      clientId: process.env.PDF_SERVICES_CLIENT_ID as string,
      clientSecret: process.env.PDF_SERVICES_CLIENT_SECRET as string
    });
   
    const pdfServices = new PDFServices({credentials});
    let readStream:Readable | undefined = inputStream
    const inputAsset = await pdfServices.upload({
      readStream,
      mimeType: MimeType.PDF
    });
    const params = new ExtractPDFParams({
      elementsToExtract: [ExtractElementType.TEXT],
      addCharInfo:true,
      getStylingInfo:true
    });
   
 
    const job = new ExtractPDFJob({inputAsset, params});

    const pollingURL = await pdfServices.submit({job});
    const pdfServicesResponse = await pdfServices.getJobResult({
      pollingURL,
      resultType: ExtractPDFResult
    });
    if(!pdfServicesResponse.result)throw new Error("pdfServicesResponse must have a property 'result'")
   
    const resultAsset = pdfServicesResponse.result.resource;
    const streamAsset = await pdfServices.getContent({asset: resultAsset});

   
    const outputFilePath = "./ExtractTextInfoFromPDF.zip";
    console.log(`Saving asset at ${outputFilePath}`);

    const writeStream = fs.createWriteStream(outputFilePath);
    streamAsset.readStream.pipe(writeStream);

    let zip = new AdmZip(outputFilePath)
    let jsondata= zip.readAsText("structuredData.json")

   return jsondata
  }
 
Log output:

listening to PORT:3000
No logging configuration. Using default config
Worker processing chunk: 65010
Worker processing chunk: 44235
2025-01-19T15:30:19.833:[INFO]: Started uploading asset