ExtendScript to find string matching regex and replace with substring using backref ($n)

Report · Oct 06, 2023

This is what I want to accomplish:

1. Find zero or more chars inside straight or curly double quotes

2. Replace with the text inside and without the quotes

3. Make the replacement text bold

I used this as a starting point (very helpful, thanks to @Russ Ward and the community):

Solved: Re: Replace Text After Find - Adobe Support Community - 13151348

I can accomplish 1 and 2 using the GUI:

I can accomplish 1 and 3 using this script (notice I had to encode the regex as UTF-16 hex):

#target framemaker
// Search pattern: zero or more characters enclosed in straight or curly double quotes.
//   readable form:  ["“](.*?)["”]
//   UTF-16 escapes: \u005b\u0022\u201c\u005d\u0028\u002e\u002a\u003f\u0029\u005b\u0022\u201d\u005d
// Back-reference to the first capturing group.
//   readable form:  $1
//   UTF-16 escapes: \u0024\u0031

// Earlier attempts using regex literals, kept for reference:
// var regex = /\u005b\u0022\u201c\u005d\u0028\u002e\u002a\u003f\u0029\u005b\u0022\u201d\u005d/;
// var unquoted = /\u0024\u0031/;

// The pattern is passed as a plain string of UTF-16 escapes; the replacement argument is left empty.
findAndReplace("\u005b\u0022\u201c\u005d\u0028\u002e\u002a\u003f\u0029\u005b\u0022\u201d\u005d", "");

function findAndReplace(findString, replaceString)
{
    // Finds every regex match of findString from the current insertion point to the
    // end of the document (no wrap-around) and applies the "Bold" character format
    // to each match.
    //
    // findString     Perl-flavour regular expression, passed as a string.
    // replaceString  Accepted for interface compatibility but NOT applied: the Find
    //                API is only used to locate text here, so no replacement is made.
    //
    // Returns nothing. Alerts and returns early when there is no usable document
    // or insertion point.
    var noContextMsg = "No insertion point or active document. Cannot continue.";

    // Validate the document before touching its properties.
    var doc = app.ActiveDoc;
    if(!doc.ObjectValid())
    {
      alert(noContextMsg);
      return;
    }

    var textRange = doc.TextSelection;
    if(!textRange.beg.obj.ObjectValid())
    {
      alert(noContextMsg);
      return;
    }

    // Set up the find parameters
    var findParams = AllocatePropVals (4);
    findParams[0].propIdent.num = Constants.FS_FindText;
    findParams[0].propVal.valType = Constants.FT_String;
    findParams[0].propVal.sval = findString;

    findParams[1].propIdent.num = Constants.FS_FindWrap;
    findParams[1].propVal.valType = Constants.FT_Integer;
    findParams[1].propVal.ival = 0;    // integer-typed value: 0 = do not wrap

    findParams[2].propIdent.num = Constants.FS_FindCustomizationFlags;
    findParams[2].propVal.valType = Constants.FT_Integer;
    findParams[2].propVal.ival = Constants.FF_FIND_USE_REGEX;

    findParams[3].propIdent.num = Constants.FS_RegexFlavour;
    findParams[3].propVal.valType = Constants.FT_Integer;
    findParams[3].propVal.ival = Constants.FR_USE_PERL;

    // Start the search
    textRange = doc.Find(textRange.beg, findParams);

    while(textRange.beg.obj.ObjectValid() && FA_errno == Constants.FE_Success)
    {
      // Format the range Find returned rather than whatever happens to be selected.
      // Stop at the first failure: a missing character format would otherwise be
      // reported again for every remaining match.
      if (!ApplyCharFmt (doc, textRange, "Bold"))
      {
        break;
      }

      textRange = doc.Find(textRange.beg, findParams);
    }
}

function ApplyCharFmt (oDoc, oTR, sFormat) { // ===========================================
/*            Apply character format to text range
Arguments     oDoc            Document to be handled, not necessarily the current document
              oTR             Text range to receive the format
              sFormat         Name of character format
Returns       true if the format was found in the catalogue and applied, false otherwise
Reference     Rick Quatro
Used in       FMnotes etc.
History       2016-02-03
              2022-06-16 check for catalogged name
Notes         All working variables are local (nothing leaks into the global scope).
              The error is reported through KLD_Z.Message when that library is loaded;
              otherwise a plain alert is used. The alert text assumes %01 is the
              placeholder KLD_Z.Message fills from its argument list - TODO confirm.
*/
  var oCharFmt, msg_01;
  msg_01 = "Character format «%01» is not in catalogue - cannot be applied";

  // Walk the catalogue; check validity before reading Name so an empty catalogue
  // (or the end of the list) ends the loop cleanly.
  oCharFmt = oDoc.FirstCharFmtInDoc;
  while (oCharFmt.ObjectValid() && oCharFmt.Name !== sFormat) {
    oCharFmt = oCharFmt.NextCharFmtInDoc;
  }

  if (!oCharFmt.ObjectValid()) {                  // it's not catalogued
    if (typeof KLD_Z !== "undefined") {
      KLD_Z.Message ("E", msg_01, "ApplyCharFmt", [sFormat]);
    } else {
      alert (msg_01.replace ("%01", sFormat));
    }
    return false;
  }

  // The catalogue entry found above is the named format; fetch its properties once.
  oDoc.SetTextProps (oTR, oCharFmt.GetProps());
  return true;
} // --- end ApplyCharFmt -------------------------------------------------------------------------

How can I accomplish all three with ES? No matter how I pass the "$1" to the script it does a literal replace.

Thanks in advance for any insights.

Report · Oct 06, 2023

Less, I think you need to exclude the curly braces from the capture group

["“](.*?)["”]

But I don't have the details at hand for how to do it...

Report · Oct 06, 2023

i think you need to exclude the curly braces from the capture group

By @K.Daube

Thank you, @K.Daube.

My challenge is that the doc I'm working on is fraught with the curly double-quotes/aka braces. I want to get rid of all of them. Passing the regex to the findAndReplace function as Unicode seems to work fine. What I can't figure out is what to provide as the second argument to findAndReplace so that it replaces "some text" or “some text” with some text...is this even possible?

Report · Oct 06, 2023

Less, i think you need to exclude the curly braces from the capture group

["“](.*?)["”]

By @K.Daube

I just re-read your message and now realize that you mean replacing this:

(.*?)

with something like:

([^“”]*)

I'll try that when I have time. Thanks again.

Report · Oct 10, 2023

I have uploaded a series of videos to YouTube that explain how I do finds/changes with FrameMaker ExtendScript. The videos aren't edited or polished, but they will give you useful information. Search for FindChange with FrameMaker ExtendScript on YouTube.

Report · Oct 17, 2023

@frameexpert , Thanks to you I can finally claim victory! Your videos told me what I needed to know about how to work with capture groups. My script now does all three of the things I needed to accomplish. This is what I ended up with:

#target framemaker
// ApplyCharFmt() is called below but not defined in this script; presumably this include provides it.
#include "ApplyCharFmt.jsx"

// Script entry point.
main ();

function main () {
    // Entry point: hands the active document to processDoc when it is valid.
    // Always returns 0.
    var activeDoc = app.ActiveDoc;

    if (activeDoc.ObjectValid () !== 1) {
        return 0;
    }

    processDoc (activeDoc);
    return 0;
}

function processDoc (doc) {
    // Walks every paragraph of the document's main flow, collects the quoted
    // strings in each one and passes them to processPgf.
    //
    // doc    Document to process.
    //
    // Screen display is switched off while working and switched back on (with a
    // redisplay) afterwards, even if processing throws.
    var regex, pgf, text, matches;

//    regex for zero or more chars inside straight or curly double quotes
//    \u201c = “    \u201d = ”     g = global (find every match, not just the first)
//    (.*?) is the lazy capture of the text between the quotes
    regex = /["\u201c](.*?)["\u201d]/g;

    if (app.Displaying === 1) {
        app.Displaying = 0;
    }

    try {
        pgf = doc.MainFlowInDoc.FirstTextFrameInFlow.FirstPgf;
        while (pgf.ObjectValid () === 1) {
            text = getPgfText (pgf);
            // Do not pre-check with regex.test(): on a global regex it advances
            // lastIndex, so the exec loop that follows would start after the
            // first match and miss it.
            matches = getPgfRegexMatches (regex, text);
            if (matches.length > 0) {
                processPgf (pgf, matches, doc);
            }
            pgf = pgf.NextPgfInFlow;
        }
    } finally {
        if (app.Displaying === 0) {
            app.Displaying = 1;
            doc.Redisplay();
        }
    }
}

function getPgfText(pgf) {
    // Returns the paragraph's text as one string by concatenating the sdata of
    // every item GetText returns for Constants.FTI_String.
    var items = pgf.GetText (Constants.FTI_String);
    var total = items.length;
    var joined = "";
    var idx = 0;

    while (idx < total) {
        joined = joined + items[idx].sdata;
        idx += 1;
    }
    return joined;
}

function getPgfRegexMatches (regex, text) {
    // Collects every match of regex in text.
    //
    // regex   RegExp to apply. With the global flag every match is returned;
    //         without it only the first match is returned.
    // text    String to search.
    //
    // Returns an array of objects, in document order, each with:
    //   value        the full matched text
    //   beginOffset  index of the first matched character
    //   endOffset    index just past the last matched character
    //   submatches   array of the capture-group values (groups 1..n)
    // regex.lastIndex is 0 on return.
    var matches, match, matchObj;

    function getSubmatches (match) {
        var submatches, count, i;
        submatches = [];
        count = match.length;
        for (i = 1; i < count; i += 1) {
            submatches.push (match[i]);
        }
        return submatches;
    }

    matches = [];
    // Always scan from the start: a previous test()/exec() on the same global
    // regex may have left lastIndex pointing past the first match.
    regex.lastIndex = 0;

    while ((match = regex.exec (text))) {
        matchObj = {};
        matchObj.value = match[0];
        matchObj.beginOffset = match.index;
        // Derived from the match itself so it is also right for non-global regexes.
        matchObj.endOffset = match.index + match[0].length;
        matchObj.submatches = getSubmatches (match);
        matches.push (matchObj);

        if (!regex.global) {
            break;      // exec would return the same match forever
        }
        if (match[0].length === 0) {
            // An empty match does not advance lastIndex; step past it manually.
            regex.lastIndex = matchObj.endOffset + 1;
        }
    }
    regex.lastIndex = 0;

    return matches;
}

function processPgf (pgf, matches, doc) {
    // Replaces each match in the paragraph with its joined capture groups and
    // applies the "Bold" character format to the replacement text.
    //
    // pgf      Paragraph containing the matches.
    // matches  Match objects with value, beginOffset and submatches properties.
    // doc      Document that owns the paragraph.
    //
    // Each match is re-located with doc.Find starting at its string offset, and
    // the range Find returns is what gets replaced and formatted.
    var i, matchObj, textLoc, findParams, textRange, foundPgf, newTxt, newTxtBeg, newTxtEnd;

    // getFindParams runs the search in regex mode, so the literal matched text
    // must have its regex metacharacters escaped before being used as a pattern.
    function escapeForRegexFind (literal) {
        return literal.replace (/([\\\^\$\.\|\?\*\+\(\)\[\]\{\}])/g, "\\$1");
    }

    // Work from the last match to the first so that edits do not shift the
    // offsets of matches that are still to be processed.
    for (i = matches.length - 1; i >= 0; i -= 1) {
        matchObj = matches[i];
        textLoc = new TextLoc (pgf, matchObj.beginOffset);
        doc.TextSelection = new TextRange (textLoc, textLoc);
        findParams = getFindParams (escapeForRegexFind (matchObj.value));
        textRange = doc.Find (textLoc, findParams);
        foundPgf = textRange.beg.obj;

        if ((foundPgf.ObjectValid () !== 1) || (foundPgf.id !== pgf.id)) {
            // This match could not be located in this paragraph; the remaining
            // (earlier) matches are independent, so keep going.
            continue;
        }

        newTxt = matchObj.submatches.join ("");
        doFindChange (textRange, newTxt, doc);

        if (newTxt.length === 0) {
            continue;       // nothing was inserted, so nothing to format
        }

        // The new text was inserted where the found range began; build the range
        // from that location and the inserted length.
        newTxtBeg = new TextLoc (foundPgf, textRange.beg.offset);
        newTxtEnd = new TextLoc (foundPgf, textRange.beg.offset + newTxt.length);
        ApplyCharFmt (doc, new TextRange (newTxtBeg, newTxtEnd), "Bold");
    }
}

function getFindParams (text) {
    // Builds the four-entry property list for doc.Find: search for `text` as a
    // Perl-flavour regular expression, without wrapping.
    var params = AllocatePropVals (4);

    // Fills one slot: property id, value type, then the value in the named field.
    function fill (slot, ident, valType, field, value) {
        params[slot].propIdent.num = ident;
        params[slot].propVal.valType = valType;
        params[slot].propVal[field] = value;
    }

    fill (0, Constants.FS_FindText, Constants.FT_String, "sval", text);
    fill (1, Constants.FS_FindWrap, Constants.FT_Integer, "ival", false);
    fill (2, Constants.FS_FindCustomizationFlags, Constants.FT_Integer, "ival", Constants.FF_FIND_USE_REGEX);
    fill (3, Constants.FS_RegexFlavour, Constants.FT_Integer, "ival", Constants.FR_USE_PERL);

    return params;
}

function doFindChange (textRange, change, doc) {
    // Deletes the text in textRange, then inserts `change` at the start of that
    // range. Nothing is inserted when `change` is empty.
    doc.DeleteText (textRange);

    if (!change) {
        return;
    }
    doc.AddText (textRange.beg, change);
}

Report · Oct 17, 2023

Great work sifting through the content on the videos and putting it all together. It may seem like a lot of code, but remember that once a function is written and tested, it becomes a "black box" that you can reuse in other scripts. I routinely use these in new scripts, even though I have forgotten a lot of the details of how they work.

The main benefit to this approach is that you have total control over which paragraphs are processed and in which order. You may have noticed that this loop

    pgf = doc.MainFlowInDoc.FirstTextFrameInFlow.FirstPgf;
    while (pgf.ObjectValid () === 1) {
        text = getPgfText (pgf);
        if (regex.test (text) === true) {
            matches = getPgfRegexMatches (regex, text);
            processPgf (pgf, matches, doc);
        }
        pgf = pgf.NextPgfInFlow;    
    }

only processes the paragraphs in the document's main flow and will skip any paragraphs in tables. If this is an issue for you, please start a new post asking how to process body page paragraphs in document order and I will post some code. Thanks.

ExtendScript to find string matching regex and replace with substring using backref ($n)

2 Correct answers