Question
Getting stuck in the middle of uploading in pdf extract api
Here is my implementation:
import {
ServicePrincipalCredentials,
PDFServices,
MimeType,
ExtractPDFParams,
ExtractElementType,
ExtractPDFJob,
ExtractPDFResult,
TableStructureType,
ExtractRenditionsElementType
} from "@adobe/pdfservices-node-sdk";
import * as fs from "fs";
import AdmZip from "adm-zip"
import * as dotenv from "dotenv"
import { parentPort, workerData } from "worker_threads";
import { PassThrough,Readable } from "stream";
import crypto,{Hash} from 'crypto';
dotenv.config()
const workerPassThrough = new PassThrough();
parentPort?.on('message', (message) => {
if (message.type === 'chunk') {
workerPassThrough.write(message.data);
} else if (message.type === 'end') {
workerPassThrough.end();
(async () => {
try {
const inputStream: Readable = workerPassThrough;
const filename: string = workerData.filename;
const response: string = await processPDF(inputStream, filename);
parentPort?.postMessage(response);
} catch (error:any) {
parentPort?.postMessage({ error: error.message });
}
})();
}
});
workerPassThrough.on('data', (chunk) => {
console.log('Worker processing chunk:',chunk.length);
});
workerPassThrough.on('end', () => {
parentPort?.postMessage('Stream processing complete!');
});
const processPDF = async(inputStream:Readable,filename:string):Promise<string>=>{
const credentials = new ServicePrincipalCredentials({
clientId: process.env.PDF_SERVICES_CLIENT_ID as string,
clientSecret: process.env.PDF_SERVICES_CLIENT_SECRET as string
});
const pdfServices = new PDFServices({credentials});
let readStream:Readable | undefined = inputStream
const inputAsset = await pdfServices.upload({
readStream,
mimeType: MimeType.PDF
});
const params = new ExtractPDFParams({
elementsToExtract: [ExtractElementType.TEXT],
addCharInfo:true,
getStylingInfo:true
});
const job = new ExtractPDFJob({inputAsset, params});
const pollingURL = await pdfServices.submit({job});
const pdfServicesResponse = await pdfServices.getJobResult({
pollingURL,
resultType: ExtractPDFResult
});
if(!pdfServicesResponse.result)throw new Error("pdfServicesResponse must have a property 'result'")
const resultAsset = pdfServicesResponse.result.resource;
const streamAsset = await pdfServices.getContent({asset: resultAsset});
const outputFilePath = "./ExtractTextInfoFromPDF.zip";
console.log(`Saving asset at ${outputFilePath}`);
const writeStream = fs.createWriteStream(outputFilePath);
streamAsset.readStream.pipe(writeStream);
let zip = new AdmZip(outputFilePath)
let jsondata= zip.readAsText("structuredData.json")
return jsondata
}
listening to PORT:3000
No logging configuration. Using default config
Worker processing chunk: 65010
Worker processing chunk: 44235
2025-01-19T15:30:19.833:[INFO]: Started uploading asset
