Skip to main content
Participant
March 27, 2023
Answered

Want to Automate PDF to Excel Extraction in Adobe Acrobat

  • March 27, 2023
  • 2 replies
  • 1056 views

I have a PDF catalogue. I want to extract the product name and the product size into different columns. Is there any way to do this in Adobe Acrobat or any other application? The PDF is thousands of pages long.

This topic has been closed for replies.
Correct answer bebarth

Hi,

Sorry for answering almost a week late; I hope this answer does not come too late!

I discovered this post only very recently and you are lucky because today the weather is not nice... 😉
Here is a script adapted to your file. You can run it from the console window or as an Action Wizard step:

 

// Acrobat JavaScript, meant to be run from the console or an Action Wizard step,
// where "this" is expected to be the open document.
//
// For every page, each word "TROPHIE" starts a record: the words that follow it,
// up to the first word beginning with "Heigh", form the reference; the words after
// that, up to the next "TROPHIE" or the end of the page, form the height.
// The records are written to a semicolon-separated CSV attached to the PDF, and the
// PDF is saved under a new name next to the original.
var trophies=[];
for (var p=0; p<this.numPages; p++) {
	var numWords=this.getPageNumWords(p);
	for (var i=0; i<numWords; i++) {
		if (this.getPageNthWord(p, i, true)!="TROPHIE") continue;
		var j=1;
		var reference="";
		// The index is bounded by numWords so that a "TROPHIE" with no following
		// "Heigh..." word cannot make the loop read past the end of the page.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true).indexOf("Heigh")!=0) {
			// Third argument false: the word is read unstripped so the text keeps its spacing/punctuation.
			reference+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		j++; // step over the "Heigh..." word itself
		var height="";
		// Bound is tested first so getPageNthWord is never called with an out-of-range index.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true)!="TROPHIE") {
			height+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		// Height: keep everything up to and including the first double quote, drop colons, trim.
		// Both fields are trimmed of ALL leading/trailing whitespace.
		trophies.push([reference.replace(/^\s+|\s+$/g,""),height.substr(0,height.indexOf("\"")+1).replace(/:/g,"").replace(/^\s+|\s+$/g,"")]);
		// Resume scanning at the word that ended this record (the loop's i++ lands on index i+j).
		i+=(j-1);
	}
}
// Build the CSV text and store it as a file attachment of the document.
this.createDataObject("References & Heights.csv", "");
var theText="REFERENCE;HEIGHT";
for (var i=0; i<trophies.length; i++) theText+="\r"+trophies[i][0]+";"+trophies[i][1];
var oFile=util.streamFromString(theText);
this.setDataObjectContents("References & Heights.csv", oFile);
// The dot is escaped so only a real ".pdf" extension is replaced.
this.saveAs(this.path.replace(/\.pdf$/i,"_With Attached \"References & Heights.csv\" File.pdf"));
// NOTE(review): overViewMode 7 presumably shows the attachments pane - confirm against the viewState documentation.
this.viewState={overViewMode:7};

 

For a document with a huge number of pages such as yours, I recommend displaying the progress so you can check that Acrobat has not crashed. So, use this one instead:

 

// Acrobat JavaScript, meant to be run from the console or an Action Wizard step,
// where "this" is expected to be the open document.
//
// Same extraction as the basic script, with progress shown in the console while it
// runs and a summary (start, end, duration, number of references) at the end.
// For every page, each word "TROPHIE" starts a record: the words that follow it,
// up to the first word beginning with "Heigh", form the reference; the words after
// that, up to the next "TROPHIE" or the end of the page, form the height.
var d0=new Date();
var starting=util.printd("dd/mm/yyyy – HH:MM:ss",d0);
console.show();
console.clear();
var trophies=[];
var nbRef=0;
var plurial="";
for (var p=0; p<this.numPages; p++) {
	// Refresh the progress display at the start of every page.
	console.clear();
	console.println("Process starting: "+starting);
	console.println("Processing the page #"+(p+1)+"/"+this.numPages);
	if (nbRef>1) plurial="s";
	console.println(nbRef+" reference"+plurial+" found");
	var numWords=this.getPageNumWords(p);
	for (var i=0; i<numWords; i++) {
		if (this.getPageNthWord(p, i, true)!="TROPHIE") continue;
		var j=1;
		var reference="";
		// The index is bounded by numWords so that a "TROPHIE" with no following
		// "Heigh..." word cannot make the loop read past the end of the page.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true).indexOf("Heigh")!=0) {
			// Third argument false: the word is read unstripped so the text keeps its spacing/punctuation.
			reference+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		j++; // step over the "Heigh..." word itself
		var height="";
		// Bound is tested first so getPageNthWord is never called with an out-of-range index.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true)!="TROPHIE") {
			height+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		// Height: keep everything up to and including the first double quote, drop colons, trim.
		// Both fields are trimmed of ALL leading/trailing whitespace.
		trophies.push([reference.replace(/^\s+|\s+$/g,""),height.substr(0,height.indexOf("\"")+1).replace(/:/g,"").replace(/^\s+|\s+$/g,"")]);
		nbRef++;
		// Refresh the progress display after every reference found.
		console.clear();
		console.println("Process starting: "+starting);
		console.println("Processing the page #"+(p+1)+"/"+this.numPages);
		if (nbRef>1) plurial="s";
		console.println(nbRef+" reference"+plurial+" found");
		// Resume scanning at the word that ended this record (the loop's i++ lands on index i+j).
		i+=(j-1);
	}
}
// Build the CSV text and store it as a file attachment of the document.
this.createDataObject("References & Heights.csv", "");
var theText="REFERENCE;HEIGHT";
for (var i=0; i<trophies.length; i++) theText+="\r"+trophies[i][0]+";"+trophies[i][1];
var oFile=util.streamFromString(theText);
this.setDataObjectContents("References & Heights.csv", oFile);
// The dot is escaped so only a real ".pdf" extension is replaced.
this.saveAs(this.path.replace(/\.pdf$/i,"_With Attached \"References & Heights.csv\" File.pdf"));
// NOTE(review): overViewMode 7 presumably shows the attachments pane - confirm against the viewState documentation.
this.viewState={overViewMode:7};
var df=new Date();
var ending=util.printd("dd/mm/yyyy – HH:MM:ss",df);
// Elapsed time in minutes (fractional).
var temps=(df.valueOf()-d0.valueOf())/1000/60;
// Math.floor instead of parseInt: parseInt converts the number to a string first,
// so a tiny value in exponent notation (e.g. 2.6e-13) would be read as 2.
var theMinutes=Math.floor(temps);
// Remaining seconds, truncated to one decimal.
var theSeconds=Math.floor((temps-theMinutes)*60*10)/10;
var theTime="";
if (theMinutes>0) {
	if (theMinutes==1) theTime="1 minute";
	else theTime=theMinutes+" minutes";
}
if (theSeconds>0) {
	if (theSeconds<2) theTime=theTime+" "+theSeconds+" second";
	else theTime=theTime+" "+theSeconds+" seconds";
}
theTime=theTime.replace(/^\s+|\s+$/gm,"");
// Avoid an empty duration when the whole run took less than a tenth of a second.
if (theTime=="") theTime="less than 0.1 second";
var txt="Process starting: "+starting+"\rProcess ending: "+ending+"\rProcess duration: "+theTime+"\r\r"+nbRef+" reference"+plurial+" extracted.";
console.clear();
console.println(txt);
app.alert(txt,3);

Here is the result for your example file:

 

@+

 

2 replies

bebarth
Community Expert
bebarthCommunity ExpertCorrect answer
Community Expert
April 2, 2023

Hi,

Sorry for answering almost a week late; I hope this answer does not come too late!

I discovered this post only very recently and you are lucky because today the weather is not nice... 😉
Here is a script adapted to your file. You can run it from the console window or as an Action Wizard step:

 

// Acrobat JavaScript, meant to be run from the console or an Action Wizard step,
// where "this" is expected to be the open document.
//
// For every page, each word "TROPHIE" starts a record: the words that follow it,
// up to the first word beginning with "Heigh", form the reference; the words after
// that, up to the next "TROPHIE" or the end of the page, form the height.
// The records are written to a semicolon-separated CSV attached to the PDF, and the
// PDF is saved under a new name next to the original.
var trophies=[];
for (var p=0; p<this.numPages; p++) {
	var numWords=this.getPageNumWords(p);
	for (var i=0; i<numWords; i++) {
		if (this.getPageNthWord(p, i, true)!="TROPHIE") continue;
		var j=1;
		var reference="";
		// The index is bounded by numWords so that a "TROPHIE" with no following
		// "Heigh..." word cannot make the loop read past the end of the page.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true).indexOf("Heigh")!=0) {
			// Third argument false: the word is read unstripped so the text keeps its spacing/punctuation.
			reference+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		j++; // step over the "Heigh..." word itself
		var height="";
		// Bound is tested first so getPageNthWord is never called with an out-of-range index.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true)!="TROPHIE") {
			height+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		// Height: keep everything up to and including the first double quote, drop colons, trim.
		// Both fields are trimmed of ALL leading/trailing whitespace.
		trophies.push([reference.replace(/^\s+|\s+$/g,""),height.substr(0,height.indexOf("\"")+1).replace(/:/g,"").replace(/^\s+|\s+$/g,"")]);
		// Resume scanning at the word that ended this record (the loop's i++ lands on index i+j).
		i+=(j-1);
	}
}
// Build the CSV text and store it as a file attachment of the document.
this.createDataObject("References & Heights.csv", "");
var theText="REFERENCE;HEIGHT";
for (var i=0; i<trophies.length; i++) theText+="\r"+trophies[i][0]+";"+trophies[i][1];
var oFile=util.streamFromString(theText);
this.setDataObjectContents("References & Heights.csv", oFile);
// The dot is escaped so only a real ".pdf" extension is replaced.
this.saveAs(this.path.replace(/\.pdf$/i,"_With Attached \"References & Heights.csv\" File.pdf"));
// NOTE(review): overViewMode 7 presumably shows the attachments pane - confirm against the viewState documentation.
this.viewState={overViewMode:7};

 

For a document with a huge number of pages such as yours, I recommend displaying the progress so you can check that Acrobat has not crashed. So, use this one instead:

 

// Acrobat JavaScript, meant to be run from the console or an Action Wizard step,
// where "this" is expected to be the open document.
//
// Same extraction as the basic script, with progress shown in the console while it
// runs and a summary (start, end, duration, number of references) at the end.
// For every page, each word "TROPHIE" starts a record: the words that follow it,
// up to the first word beginning with "Heigh", form the reference; the words after
// that, up to the next "TROPHIE" or the end of the page, form the height.
var d0=new Date();
var starting=util.printd("dd/mm/yyyy – HH:MM:ss",d0);
console.show();
console.clear();
var trophies=[];
var nbRef=0;
var plurial="";
for (var p=0; p<this.numPages; p++) {
	// Refresh the progress display at the start of every page.
	console.clear();
	console.println("Process starting: "+starting);
	console.println("Processing the page #"+(p+1)+"/"+this.numPages);
	if (nbRef>1) plurial="s";
	console.println(nbRef+" reference"+plurial+" found");
	var numWords=this.getPageNumWords(p);
	for (var i=0; i<numWords; i++) {
		if (this.getPageNthWord(p, i, true)!="TROPHIE") continue;
		var j=1;
		var reference="";
		// The index is bounded by numWords so that a "TROPHIE" with no following
		// "Heigh..." word cannot make the loop read past the end of the page.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true).indexOf("Heigh")!=0) {
			// Third argument false: the word is read unstripped so the text keeps its spacing/punctuation.
			reference+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		j++; // step over the "Heigh..." word itself
		var height="";
		// Bound is tested first so getPageNthWord is never called with an out-of-range index.
		while ((i+j)<numWords && this.getPageNthWord(p, (i+j), true)!="TROPHIE") {
			height+=this.getPageNthWord(p, (i+j), false);
			j++;
		}
		// Height: keep everything up to and including the first double quote, drop colons, trim.
		// Both fields are trimmed of ALL leading/trailing whitespace.
		trophies.push([reference.replace(/^\s+|\s+$/g,""),height.substr(0,height.indexOf("\"")+1).replace(/:/g,"").replace(/^\s+|\s+$/g,"")]);
		nbRef++;
		// Refresh the progress display after every reference found.
		console.clear();
		console.println("Process starting: "+starting);
		console.println("Processing the page #"+(p+1)+"/"+this.numPages);
		if (nbRef>1) plurial="s";
		console.println(nbRef+" reference"+plurial+" found");
		// Resume scanning at the word that ended this record (the loop's i++ lands on index i+j).
		i+=(j-1);
	}
}
// Build the CSV text and store it as a file attachment of the document.
this.createDataObject("References & Heights.csv", "");
var theText="REFERENCE;HEIGHT";
for (var i=0; i<trophies.length; i++) theText+="\r"+trophies[i][0]+";"+trophies[i][1];
var oFile=util.streamFromString(theText);
this.setDataObjectContents("References & Heights.csv", oFile);
// The dot is escaped so only a real ".pdf" extension is replaced.
this.saveAs(this.path.replace(/\.pdf$/i,"_With Attached \"References & Heights.csv\" File.pdf"));
// NOTE(review): overViewMode 7 presumably shows the attachments pane - confirm against the viewState documentation.
this.viewState={overViewMode:7};
var df=new Date();
var ending=util.printd("dd/mm/yyyy – HH:MM:ss",df);
// Elapsed time in minutes (fractional).
var temps=(df.valueOf()-d0.valueOf())/1000/60;
// Math.floor instead of parseInt: parseInt converts the number to a string first,
// so a tiny value in exponent notation (e.g. 2.6e-13) would be read as 2.
var theMinutes=Math.floor(temps);
// Remaining seconds, truncated to one decimal.
var theSeconds=Math.floor((temps-theMinutes)*60*10)/10;
var theTime="";
if (theMinutes>0) {
	if (theMinutes==1) theTime="1 minute";
	else theTime=theMinutes+" minutes";
}
if (theSeconds>0) {
	if (theSeconds<2) theTime=theTime+" "+theSeconds+" second";
	else theTime=theTime+" "+theSeconds+" seconds";
}
theTime=theTime.replace(/^\s+|\s+$/gm,"");
// Avoid an empty duration when the whole run took less than a tenth of a second.
if (theTime=="") theTime="less than 0.1 second";
var txt="Process starting: "+starting+"\rProcess ending: "+ending+"\rProcess duration: "+theTime+"\r\r"+nbRef+" reference"+plurial+" extracted.";
console.clear();
console.println(txt);
app.alert(txt,3);

Here is the result for your example file:

 

@+

 

Karl Heinz  Kremer
Community Expert
Community Expert
March 27, 2023

Not without some custom programming. Depending on the quality of your file, exporting to Excel may yield a result that can be processed further by using VBA in Excel to generate the desired output. If that does not work, you can write JavaScript that will look for text in a certain pattern (in this case, you are looking for a line that contains a product name followed by a line that starts with "Height:"). If you find that, you will then take the product name and the information after "Height:" and format it as a line in a CSV file, which, when finished and saved, can be opened in Excel as a spreadsheet. In JavaScript, you only have access to text in the contents of a PDF file using the doc.getPageNthWord() function. By using  

doc.getPageNthWordQuads(), you get the box around the text and can therefore make decisions about proximity.