Ideas for speeding up stemming code?

Question

Just in case anybody has experience with this...I'm wondering if anybody might know how to speed this up? I found this javascript implementation. I've made a few little AS changes, but generally this is untouched.

Right now I'm testing on a list of about 23,000 unique words and it is taking about 3600 ms. But in my use I'm planning to use it on words that will be repeated many times. Maybe I could keep an object of "known word-stem" paris and do a lookup. That might be much faster....hmm...any other ideas?

public static function porterStem(w:String):String {
          var stem:String;
          var suffix:String;
          var firstch:String;
          var re,re2,re3,re4:RegExp;
          var fp:Array;
          var step2list:Object = {
          "ational" : "ate",
          "tional" : "tion",
          "enci" : "ence",
          "anci" : "ance",
          "izer" : "ize",
          "bli" : "ble",
          "alli" : "al",
          "entli" : "ent",
          "eli" : "e",
          "ousli" : "ous",
          "ization" : "ize",
          "ation" : "ate",
          "ator" : "ate",
          "alism" : "al",
          "iveness" : "ive",
          "fulness" : "ful",
          "ousness" : "ous",
          "aliti" : "al",
          "iviti" : "ive",
          "biliti" : "ble",
          "logi" : "log"
          };
          var step3list:Object = {
          "icate" : "ic",
          "ative" : "",
          "alize" : "al",
          "iciti" : "ic",
          "ical" : "ic",
          "ful" : "",
          "ness" : ""
          };
          var c:String="[^aeiou]";                                        // consonant
          var v:String="[aeiouy]";                                        // vowel
          var C:String=c+"[^aeiouy]*";                                   // consonant sequence
          var V:String=v+"[aeiou]*";                                   // vowel sequence
          var mgr0:String="^("+C+")?"+V+C;                              // VC... is m>0
          var meq1:String="^("+C+")?"+V+C+"("+V+")?$";          // VC is m=1
          var mgr1:String="^("+C+")?"+V+C+V+C;                         // VCVC... is m>1
          var s_v:String="^("+C+")?"+v;                                   // vowel in stem
          var origword:String=w;
  
          if (w.length<3) {
               return w;
          }
          firstch=w.substr(0,1);
          if (firstch=="y") {
               w=firstch.toUpperCase()+w.substr(1);
          }
          // Step 1a
          re=/^(.+?)(ss|i)es$/;
          re2=/^(.+?)([^s])s$/;
          if (re.test(w)) {
               w=w.replace(re,"$1$2");
          } else if (re2.test(w)) {
               w=w.replace(re2,"$1$2");
          }
          // Step 1b
          re=/^(.+?)eed$/;
          re2=/^(.+?)(ed|ing)$/;
          if (re.test(w)) {
               fp=re.exec(w);
               re=new RegExp(mgr0);
               if (re.test(fp[1])) {
                         re=/.$/;
                         w=w.replace(re,"");
               }
          } else if (re2.test(w)) {
               fp=re2.exec(w);
               stem=fp[1];
               re2=new RegExp(s_v);
               if (re2.test(stem)) {
                         w=stem;
                         re2=/(at|bl|iz)$/;
                         re3=new RegExp("([^aeiouylsz])\\1$");
                         re4=new RegExp("^"+C+v+"[^aeiouwxy]$");
                         if (re2.test(w)) {
                                   w=w+"e";
                         } else if (re3.test(w)) {
                                   re=/.$/;
                                   w=w.replace(re,"");
                         } else if (re4.test(w)) {
                                   w=w+"e";
                         }
               }
          }
          // Step 1c
          re=/^(.+?)y$/;
          if (re.test(w)) {
                    fp=re.exec(w);
                    stem=fp[1];
                    re=new RegExp(s_v);
                    if (re.test(stem)) {
                              w=stem+"i";
                    }
          }
          // Step 2
          re=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
          if (re.test(w)) {
                    fp=re.exec(w);
                    stem=fp[1];
                    suffix=fp[2];
                    re=new RegExp(mgr0);
                    if (re.test(stem)) {
                              w=stem+step2list[suffix];
                    }
          }
          // Step 3
          re=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
          if (re.test(w)) {
                    fp=re.exec(w);
                    stem=fp[1];
                    suffix=fp[2];
                    re=new RegExp(mgr0);
                    if (re.test(stem)) {
                              w=stem+step3list[suffix];
                    }
          }
          // Step 4
          re=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
          re2=/^(.+?)(s|t)(ion)$/;
          if (re.test(w)) {
                    fp=re.exec(w);
                    stem=fp[1];
                    re=new RegExp(mgr1);
                    if (re.test(stem)) {
                              w=stem;
                    }
          } else if (re2.test(w)) {
                    fp=re2.exec(w);
                    stem=fp[1]+fp[2];
                    re2=new RegExp(mgr1);
                    if (re2.test(stem)) {
                              w=stem;
                    }
          }
          // Step 5
          re=/^(.+?)e$/;
          if (re.test(w)) {
                    fp=re.exec(w);
                    stem=fp[1];
                    re=new RegExp(mgr1);
                    re2=new RegExp(meq1);
                    re3=new RegExp("^"+C+v+"[^aeiouwxy]$");
                    if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
                              w=stem;
                    }
          }
          re=/ll$/;
          re2=new RegExp(mgr1);
          if (re.test(w)&&re2.test(w)) {
                    re=/.$/;
                    w=w.replace(re,"");
          }
          // and turn initial Y back to y
          if (firstch=="y") {
                    w=firstch.toLowerCase()+w.substr(1);
          }
          return w;
}

Rothrock · Answer

D'oh! That solution helped with the real-world use—by a lot. I feel so silly for not having thought of it before. I guess that is why I come to the forums and ask questions.

I was using this in a little search engine. I was asynchronously indexing around 150 xml "documents" and it was taking around 4900 ms before the index was ready for searching. Keeping a look up object allowed me to cut that to around 2600 ms. And it looks like it doesn't use up that much more memory 56.52 MB vs 56.67 MB.

Still if anybody has other suggestions that would be awesome.

Sign up

To post, reply, or follow discussions, please sign in with your Adobe ID.

Sign in to Adobe Community

To post, reply, or follow discussions, please sign in with your Adobe ID.

Scanning file for viruses.

This file cannot be downloaded