Skip to main content
frameexpert
Community Expert
Community Expert
March 31, 2020
Question

Text extraction algorithm

  • March 31, 2020
  • 2 replies
  • 895 views

Hi All,

I need help with a text extraction algorithm. Given a string of text inside a larger string, I need to extract a string of a certain number of characters with my given string in the middle. I am going to illustrate it with some nonsense paragraphs, but hopefully it will illustrate what I am trying to do. Here are three paragraphs with the given text "zzzz":

123456789 zzzz 1234567890
1 zzzz 12345 6789 1234567
1234 6789 12345678 zzzz 1

I want to extract 10 characters with the "zzzz" in the middle. If the "zzzz" can't be in the middle (as in the 2nd and 3rd paragraphs), I still want 10 characters. So, here is the result I want:

89 zzzz 12
1 zzzz 123
678 zzzz 1

Of course, there could be instances where the overall string contains fewer characters, but I want to set the maximum extraction to a specific number of characters (10 in this case). So, given the overall length of a container string, the length of the target string and its position in the container string, and the number of total characters to extract, I am looking for a general algorithm to extract the characters, keeping the target string as close to the center as possible.

 

I am using ExtendScript but even pseudocode would be helpful. Any ideas or pointers would be appreciated. Thank you very much. -Rick

This topic has been closed for replies.

2 replies

Participating Frequently
April 24, 2020
var sourceString = '1234 6789 12345678 zzzz 1';
var searchString = 'zzzz';
var resultLength = 10;

var result;

/**
 * Extracts at most maxLength characters from source, keeping the first
 * occurrence of target as close to the center of the extract as possible.
 *
 * Plain index arithmetic is used instead of a dynamically built regular
 * expression, so target may contain regex metacharacters, an odd amount of
 * padding cannot produce a fractional quantifier, and a source shorter than
 * maxLength cannot cause an endless loop.
 *
 * @param {String} source    The container string.
 * @param {String} target    The text to center the extract on.
 * @param {Number} maxLength Maximum number of characters to return.
 * @returns {String} The extract, or '' when target does not occur in source.
 */
function extractAround(source, target, maxLength) {
  var targetIndex = source.indexOf(target);
  var padBefore, start, lastStart;

  if (targetIndex === -1) {
    return '';
  }

  // Characters wanted in front of the target; with an odd remainder the
  // extra character goes behind the target.
  padBefore = Math.floor((maxLength - target.length) / 2);
  start = targetIndex - padBefore;

  // Slide the window back inside the source when the target sits near
  // either end, so the extract still has maxLength characters if possible.
  lastStart = source.length - maxLength;
  if (start > lastStart) {
    start = lastStart;
  }
  if (start < 0) {
    start = 0;
  }

  return source.substr(start, maxLength);
}

result = extractAround(sourceString, searchString, resultLength);

alert('result: ' + result);
K.Daube
Community Expert
Community Expert
March 31, 2020

Dear Rick,

At least for the test cases this provides correct results:

main ();

/**
 * Demo driver: shows the extract produced by GetMiddlePart for each of the
 * sample paragraphs, one alert per paragraph.
 */
function main() {
  var values = ["123456789 zzzz 1234567890", "1 zzzz 12345 6789 1234567", "1234 6789 12345678 zzzz 1"];
  var sConst = "zzzz";
  var j; // declared locally so the loop index does not leak as a global
  // Iterate over however many samples there are rather than a fixed count.
  for (j = 0; j < values.length; j++) {
    alert (GetMiddlePart(values[j], sConst));
  }
} // --- end main

/**
 * Returns a window of at most lBox characters taken from string, positioned
 * so that the first occurrence of sConst is as close to the center of the
 * window as possible.
 *
 * The window is centered on the target itself (not on the middle of the
 * container) and is then shifted back inside the container when the target
 * lies near either end. If sConst does not occur, the leading lBox characters
 * are returned.
 *
 * @param {String} string  The container string.
 * @param {String} sConst  The target text to center on.
 * @param {Number} [lBox]  Maximum window length; defaults to 10.
 * @returns {String} The extracted window.
 */
function GetMiddlePart (string, sConst, lBox) {
  var box, iLoc, iStart, lastStart, lead;
  box = (typeof lBox === "undefined") ? 10 : lBox;
  iLoc = string.indexOf(sConst);
  // Characters wanted in front of the target; any odd leftover goes behind it.
  lead = Math.floor((box - sConst.length) / 2);
  iStart = (iLoc < 0) ? 0 : iLoc - lead;
  lastStart = string.length - box;
  if (iStart > lastStart) {   // target towards the end
    iStart = lastStart;
  }
  if (iStart < 0) {           // target towards the beginning, or short string
    iStart = 0;
  }
  return string.substr(iStart, box);
} // --- end GetMiddlePart

k

K.Daube
Community Expert
Community Expert
April 1, 2020

This may be more general:

 

main ();

/**
 * Demo driver: for each sample paragraph, shows the paragraph followed by
 * the extract that GetMiddlePart produces for it, one alert per paragraph.
 */
function main() {
  var values = ["123456789 zzzz 1234567890", "1 zzzz 12345 6789 1234567"
              , "1234 6789 12345678 zzzz 1", "a zzzz q", "ab zzzz yz"];
  var sCore = "zzzz", lBox = 10;
  var j; // declared locally so the loop index does not leak as a global
  for (j = 0; j < values.length; j++) {
    alert (values[j] + "\n" + GetMiddlePart(values[j], sCore, lBox));
  }
} // --- end main

/**
 * Returns a window of at most lBox characters taken from string, positioned
 * so that the first occurrence of sCore is as close to the center of the
 * window as possible.
 *
 * The window is centered on the target itself (not on the middle of the
 * container) and is then shifted back inside the container when the target
 * lies near either end. If sCore does not occur, the leading lBox characters
 * are returned.
 *
 * @param {String} string The container string.
 * @param {String} sCore  The target text to center on.
 * @param {Number} lBox   Maximum window length.
 * @returns {String} The extracted window, or the whole string when it is no
 *                   longer than lBox.
 */
function GetMiddlePart (string, sCore, lBox) {
  var iLoc, iStart, lString, padLeft;
  lString = string.length;
  if (lBox >= lString) {
    return string;
  }
  iLoc = string.indexOf(sCore);
  if (iLoc < 0) {
    return string.substr(0, lBox);
  }
  // Characters wanted in front of the target; any odd leftover goes behind it.
  padLeft = Math.floor((lBox - sCore.length) / 2);
  // Clamp the start so the window stays inside the string at both ends.
  iStart = Math.max(0, Math.min(iLoc - padLeft, lString - lBox));
  return string.substr(iStart, lBox);
} // --- end GetMiddlePart