Copy link to clipboard
Copied
I have Adobe Acrobat Pro Adobe Continuous Release version 2023.006.20360 and Adobe Creative Cloud. I'm trying in either platform to create an Action Wizard that will take a PDF and edit the bookmarks to promote them all to Primary in order to split the PDF into multiple PDFs. The PDF itself is about 160+ pages and has 60+ bookmarks but the bookmarks are grouped into a hierarchy so I have to get the bookmarks updated to primary in order to split.
I have 2 issues here, I can't seem to find in my version a way to add an update bookmarks into the wizard, nor do I see a way to add split PDFs. I did find Split PDF as a custom tool but I don't see a way to add the custom tool into the action wizard.
I also tried using a javascript to update the bookmarks, but even though it says it runs successfully it doesn't change anything in the document.
Please help
Copy link to clipboard
Copied
OK, I'll write a script for you this evening.
@+
Copy link to clipboard
Copied
Try this script:
/**
 * Acrobat script: split the current document into one PDF per invoice.
 *
 * A page starts a new invoice when it contains a word beginning with "No."
 * immediately followed by a six-digit number. An invoice runs up to the page
 * before the next invoice start (or to the last page of the document) and is
 * extracted to "<number>.pdf" inside a "New Documents/" folder located next
 * to the source file.
 * NOTE(review): nothing here creates the "New Documents" folder; presumably
 * it has to exist before the script is run - confirm.
 */

/** Returns true when `word` consists of exactly six decimal digits. */
function isInvoiceNumber(word) {
    // A strict pattern rejects tokens such as "12.345" or "0x1A2B", which are
    // six characters long and numeric but are not invoice numbers.
    return /^\d{6}$/.test(String(word));
}

/**
 * Returns the invoice number announced on `page`, or null when there is none.
 * Only the first "No." + six-digit pair on the page is considered.
 */
function findInvoiceNumberOnPage(doc, page) {
    var nbWords = doc.getPageNumWords(page);
    for (var i = 0; i < nbWords - 1; i++) {
        // Unstripped word, so that the trailing "." of "No." is still there.
        if (doc.getPageNthWord(page, i, false).indexOf("No.") !== 0) {
            continue;
        }
        // The following word is fetched only once the label matched.
        var candidate = doc.getPageNthWord(page, i + 1, true);
        if (isInvoiceNumber(candidate)) {
            return candidate;
        }
    }
    return null;
}

/**
 * Finds the page range of every invoice in `doc`.
 * @param {Object} doc Object exposing numPages, getPageNumWords and getPageNthWord.
 * @returns {Array} List of [firstPage, lastPage, invoiceNumber]; pages are 0-based.
 */
function findInvoices(doc) {
    var invoices = [];
    for (var p = 0; p < doc.numPages; p++) {
        var number = findInvoiceNumberOnPage(doc, p);
        if (number === null) {
            continue;
        }
        var previous = invoices.length > 0 ? invoices[invoices.length - 1] : null;
        if (previous !== null && previous[2] === number) {
            // Same number as the running invoice: this is a continuation page.
            // Opening a new range would write a second file with the same name.
            continue;
        }
        if (previous !== null) {
            previous[1] = p - 1;
        }
        invoices.push([p, doc.numPages - 1, number]);
    }
    return invoices;
}

/** Returns the path of the "New Documents/" folder beside the file of `doc`. */
function outputFolder(doc) {
    // Cut the file name off the END of the path; replacing its first
    // occurrence would hit a folder whose name contains the file name.
    var at = doc.path.lastIndexOf(doc.documentFileName);
    var folder = at < 0 ? doc.path : doc.path.slice(0, at);
    return folder + "New Documents/";
}

/**
 * Extracts every invoice of `doc` to its own PDF file.
 * @returns {Array} The [firstPage, lastPage, invoiceNumber] ranges that were extracted.
 */
function splitInvoices(doc) {
    var invoices = findInvoices(doc);
    var newPath = outputFolder(doc);
    // Files Extracting
    for (var i = 0; i < invoices.length; i++) {
        doc.extractPages({
            nStart: invoices[i][0],
            nEnd: invoices[i][1],
            cPath: newPath + invoices[i][2] + ".pdf"
        });
    }
    return invoices;
}

// The script treats `this` as the document to split; do nothing when no
// document is bound to it.
if (this && typeof this.numPages === "number") {
    splitInvoices(this);
}
If there is a lot of text on each page, the process can take quite a long time. So here is the same script, which also displays its progress in the console window:
/**
 * Acrobat script: split the current document into one PDF per invoice and
 * report the progress in the JavaScript console.
 *
 * A page starts a new invoice when it contains a word beginning with "No."
 * immediately followed by a six-digit number. An invoice runs up to the page
 * before the next invoice start (or to the last page of the document) and is
 * extracted to "<number>.pdf" inside a "New Documents/" folder located next
 * to the source file.
 * NOTE(review): nothing here creates the "New Documents" folder; presumably
 * it has to exist before the script is run - confirm.
 */

/** Returns true when `word` consists of exactly six decimal digits. */
function isInvoiceNumber(word) {
    // A strict pattern rejects tokens such as "12.345" or "0x1A2B", which are
    // six characters long and numeric but are not invoice numbers.
    return /^\d{6}$/.test(String(word));
}

/**
 * Returns the invoice number announced on `page`, or null when there is none.
 * Only the first "No." + six-digit pair on the page is considered.
 */
function findInvoiceNumberOnPage(doc, page) {
    var nbWords = doc.getPageNumWords(page);
    for (var i = 0; i < nbWords - 1; i++) {
        // Unstripped word, so that the trailing "." of "No." is still there.
        if (doc.getPageNthWord(page, i, false).indexOf("No.") !== 0) {
            continue;
        }
        // The following word is fetched only once the label matched.
        var candidate = doc.getPageNthWord(page, i + 1, true);
        if (isInvoiceNumber(candidate)) {
            return candidate;
        }
    }
    return null;
}

/**
 * Finds the page range of every invoice in `doc`.
 * @param {Object} doc Object exposing numPages, getPageNumWords and getPageNthWord.
 * @param {Function} [onPage] Called as onPage(pageIndex, foundSoFar) before each page is scanned.
 * @returns {Array} List of [firstPage, lastPage, invoiceNumber]; pages are 0-based.
 */
function findInvoices(doc, onPage) {
    var invoices = [];
    for (var p = 0; p < doc.numPages; p++) {
        if (typeof onPage === "function") {
            onPage(p, invoices.length);
        }
        var number = findInvoiceNumberOnPage(doc, p);
        if (number === null) {
            continue;
        }
        var previous = invoices.length > 0 ? invoices[invoices.length - 1] : null;
        if (previous !== null && previous[2] === number) {
            // Same number as the running invoice: this is a continuation page.
            // Opening a new range would write a second file with the same name.
            continue;
        }
        if (previous !== null) {
            previous[1] = p - 1;
        }
        invoices.push([p, doc.numPages - 1, number]);
    }
    return invoices;
}

/** Returns the path of the "New Documents/" folder beside the file of `doc`. */
function outputFolder(doc) {
    // Cut the file name off the END of the path; replacing its first
    // occurrence would hit a folder whose name contains the file name.
    var at = doc.path.lastIndexOf(doc.documentFileName);
    var folder = at < 0 ? doc.path : doc.path.slice(0, at);
    return folder + "New Documents/";
}

/**
 * Formats a duration given in milliseconds, e.g. "2 minutes 5.3 seconds".
 * Seconds are truncated to one decimal; a zero duration yields "0 seconds".
 */
function formatDuration(elapsedMs) {
    // Integer arithmetic on milliseconds avoids the rounding noise of going
    // through a fractional number of minutes.
    var minutes = Math.floor(elapsedMs / 60000);
    var seconds = Math.floor((elapsedMs - minutes * 60000) / 100) / 10;
    var parts = [];
    if (minutes > 0) {
        parts.push(minutes === 1 ? "1 minute" : minutes + " minutes");
    }
    if (seconds > 0 || parts.length === 0) {
        parts.push(seconds === 1 ? "1 second" : seconds + " seconds");
    }
    return parts.join(" ");
}

/**
 * Extracts every invoice of `doc` to its own PDF file while printing the
 * progress, then a final summary, to the console.
 * @returns {Array} The [firstPage, lastPage, invoiceNumber] ranges that were extracted.
 */
function splitInvoicesWithProgress(doc) {
    var separator = "––––––––––––––";
    console.show();
    console.clear();
    var startDate = new Date();
    var startLabel = util.printd("dd-mm HH:MM", startDate);

    // Each progress step replaces the previous console content.
    function printHeader() {
        console.clear();
        console.println("Process Starting: " + startLabel);
        console.println(separator);
    }

    var invoices = findInvoices(doc, function (page) {
        printHeader();
        console.println("Invoice number searching on page " + (page + 1));
    });

    var newPath = outputFolder(doc);
    for (var i = 0; i < invoices.length; i++) {
        printHeader();
        console.println("Extracting invoice number " + invoices[i][2]);
        doc.extractPages({
            nStart: invoices[i][0],
            nEnd: invoices[i][1],
            cPath: newPath + invoices[i][2] + ".pdf"
        });
    }

    var endDate = new Date();
    var endLabel = util.printd("dd-mm HH:MM", endDate);
    // The end time is printed after the last clear so that it stays visible
    // in the final summary.
    console.clear();
    console.println("Process Starting: " + startLabel);
    console.println("Process Ending: " + endLabel);
    console.println(separator);
    console.println("Process Duration: " + formatDuration(endDate.valueOf() - startDate.valueOf()));
    console.println(invoices.length + " invoices extracted.");
    return invoices;
}

// The script treats `this` as the document to split; do nothing when no
// document is bound to it.
if (this && typeof this.numPages === "number") {
    splitInvoicesWithProgress(this);
}
Let me know.
@+
Copy link to clipboard
Copied
How long should this take to run? My pdf is 179 pages and should be divided into 61 files based on 60 different invoice numbers plus the page 1 summary page. Hmmm, it ran for about 7-8 minutes, then I get this error:
Process Starting: 09-11 17:13
––––––––––––––
Extracting invoice number 421074
RaiseError: The file may be read-only, or another user may have it open. Please save the document with a different name or in a different folder.
Doc.extractPages:32:Batch undefined:Exec
===> The file may be read-only, or another user may have it open. Please save the document with a different name or in a different folder.
It only found 1 of the 60 invoices and it's the Acrobat application that has the file open so not sure why it's doing that.
Copy link to clipboard
Copied
Hi,
It's quite difficult to know why that doesn't work for you. I ran a test with a dummy file and it worked fine.
Is it possible for you to share the file, or an extract of it, by PM? I'll have a look.
@+
Copy link to clipboard
Copied
Let me redact some data and I'll send a sample.
Copy link to clipboard
Copied
Well it worked to remove the summary bookmarks but it's not working for the Split PDF by Top-level bookmark. For some reason it still doesn't think the bookmarks are top level.
Copy link to clipboard
Copied
That was regarding the JavaScript that updated the bookmarks to remove the summary bookmarks so that they were all on the same level, not your JS for splitting. I went back to trying by bookmarks while waiting to see if I could get the JS to work for splitting by text.
Copy link to clipboard
Copied
Hi,
In fact, you have to run the OCR before running the script.
On your invoices, sometimes there is a space between "No." and the ":" symbol and sometimes there is no space.
I modified the script a bit to catch both configurations, and it found 60 separate invoices in your original example. Please check whether this number is correct!
Here is the script:
/**
 * Acrobat script: split the current document into one PDF per invoice and
 * report the progress in the JavaScript console.
 *
 * A page starts a new invoice when it contains either
 *   - a word beginning with "No.:" immediately followed by a six-digit number, or
 *   - a word beginning with "No.", then a word beginning with ":", then a
 *     six-digit number.
 * An invoice runs up to the page before the next invoice start (or to the
 * last page of the document) and is extracted to "<number>.pdf" inside a
 * "New Documents/" folder located next to the source file.
 * NOTE(review): nothing here creates the "New Documents" folder; presumably
 * it has to exist before the script is run - confirm.
 */

/** Returns true when `word` consists of exactly six decimal digits. */
function isInvoiceNumber(word) {
    // A strict pattern rejects tokens such as "12.345" or "0x1A2B", which are
    // six characters long and numeric but are not invoice numbers.
    return /^\d{6}$/.test(String(word));
}

/**
 * Returns the invoice number announced on `page` after a "No.:" label (with
 * or without a separate ":" word), or null when the page has none.
 */
function findInvoiceNumberAfterColon(doc, page) {
    var nbWords = doc.getPageNumWords(page);
    for (var i = 0; i < nbWords - 1; i++) {
        // Each word is read once and reused, instead of once per test.
        var label = doc.getPageNthWord(page, i, false);
        if (label.indexOf("No.") !== 0) {
            continue;
        }
        if (label.indexOf("No.:") === 0) {
            // "No.:" written as a single word: the number is the next word.
            var direct = doc.getPageNthWord(page, i + 1, true);
            if (isInvoiceNumber(direct)) {
                return direct;
            }
        }
        // "No." and ":" as two words: the number is two words further, which
        // must still be inside the page (the loop only guarantees i + 1).
        if (i + 2 < nbWords && doc.getPageNthWord(page, i + 1, false).indexOf(":") === 0) {
            var afterColon = doc.getPageNthWord(page, i + 2, true);
            if (isInvoiceNumber(afterColon)) {
                return afterColon;
            }
        }
    }
    return null;
}

/**
 * Finds the page range of every invoice in `doc`.
 * @param {Object} doc Object exposing numPages, getPageNumWords and getPageNthWord.
 * @param {Function} [onPage] Called as onPage(pageIndex, foundSoFar) before each page is scanned.
 * @returns {Array} List of [firstPage, lastPage, invoiceNumber]; pages are 0-based.
 */
function findInvoicesAfterColon(doc, onPage) {
    var invoices = [];
    for (var p = 0; p < doc.numPages; p++) {
        if (typeof onPage === "function") {
            onPage(p, invoices.length);
        }
        var number = findInvoiceNumberAfterColon(doc, p);
        if (number === null) {
            continue;
        }
        var previous = invoices.length > 0 ? invoices[invoices.length - 1] : null;
        if (previous !== null && previous[2] === number) {
            // Same number as the running invoice: this is a continuation page.
            // Opening a new range would write a second file with the same name.
            continue;
        }
        if (previous !== null) {
            previous[1] = p - 1;
        }
        invoices.push([p, doc.numPages - 1, number]);
    }
    return invoices;
}

/** Returns the path of the "New Documents/" folder beside the file of `doc`. */
function outputFolder(doc) {
    // Cut the file name off the END of the path; replacing its first
    // occurrence would hit a folder whose name contains the file name.
    var at = doc.path.lastIndexOf(doc.documentFileName);
    var folder = at < 0 ? doc.path : doc.path.slice(0, at);
    return folder + "New Documents/";
}

/**
 * Formats a duration given in milliseconds, e.g. "2 minutes 5.3 seconds".
 * Seconds are truncated to one decimal; a zero duration yields "0 seconds".
 */
function formatDuration(elapsedMs) {
    // Integer arithmetic on milliseconds avoids the rounding noise of going
    // through a fractional number of minutes.
    var minutes = Math.floor(elapsedMs / 60000);
    var seconds = Math.floor((elapsedMs - minutes * 60000) / 100) / 10;
    var parts = [];
    if (minutes > 0) {
        parts.push(minutes === 1 ? "1 minute" : minutes + " minutes");
    }
    if (seconds > 0 || parts.length === 0) {
        parts.push(seconds === 1 ? "1 second" : seconds + " seconds");
    }
    return parts.join(" ");
}

/**
 * Extracts every invoice of `doc` to its own PDF file while printing the
 * progress, then a final summary, to the console.
 * @returns {Array} The [firstPage, lastPage, invoiceNumber] ranges that were extracted.
 */
function splitInvoicesAfterColon(doc) {
    var separator = "––––––––––––––";
    console.show();
    console.clear();
    var startDate = new Date();
    var startLabel = util.printd("dd-mm HH:MM", startDate);

    // Each progress step replaces the previous console content.
    function printHeader() {
        console.clear();
        console.println("Process Starting: " + startLabel);
        console.println(separator);
    }

    var invoices = findInvoicesAfterColon(doc, function (page, found) {
        printHeader();
        console.println("Invoice number searching on page " + (page + 1));
        console.println(found + " invoices found");
    });

    var newPath = outputFolder(doc);
    for (var i = 0; i < invoices.length; i++) {
        printHeader();
        console.println("Extracting invoice number " + invoices[i][2]);
        doc.extractPages({
            nStart: invoices[i][0],
            nEnd: invoices[i][1],
            cPath: newPath + invoices[i][2] + ".pdf"
        });
    }

    var endDate = new Date();
    var endLabel = util.printd("dd-mm HH:MM", endDate);
    console.clear();
    console.println("Process Starting: " + startLabel);
    console.println("Process Ending: " + endLabel);
    console.println(separator);
    console.println("Process Duration: " + formatDuration(endDate.valueOf() - startDate.valueOf()));
    console.println(invoices.length + " invoices extracted.");
    return invoices;
}

// The script treats `this` as the document to split; do nothing when no
// document is bound to it.
if (this && typeof this.numPages === "number") {
    splitInvoicesAfterColon(this);
}
I'll e-mail you the file I worked with, i.e. the one produced after running the OCR...
@+
Copy link to clipboard
Copied
@bebarth what configuration did you use in OCR? I seem to be able to get the script to work from your version of the file after OCR, but when I run the original invoice, non-redacted, through OCR and try it the script still hangs up after reading 1 invoice number.
Copy link to clipboard
Copied
Hi,
I used the standard settings...
@+
Copy link to clipboard
Copied
Ugh I swear I'm cursed. I've tried both Enhance PDF tool which does OCR and just OCR. The script will run through the pages and finds all the invoices but as soon as it starts to extract the first invoice I get the spinning wheel of death. The JavaScript Debugger console and Adobe are (Not Responding). Then after 12 minutes I get this:
Which naturally I have no idea what it means.