Skip to main content
Inspiring
January 23, 2013
Question

Ideas for speeding up stemming code?

  • January 23, 2013
  • 1 reply
  • 500 views

Just in case anybody has experience with this...I'm wondering if anybody might know how to speed this up? I found this javascript implementation. I've made a few little AS changes, but generally this is untouched.

Right now I'm testing on a list of about 23,000 unique words and it is taking about 3600 ms. But in my use I'm planning to use it on words that will be repeated many times. Maybe I could keep an object of "known word-stem" paris and do a lookup. That might be much faster....hmm...any other ideas?

public static function porterStem(w:String):String {

          var stem:String;

          var suffix:String;

          var firstch:String;

          var re,re2,re3,re4:RegExp;

          var fp:Array;

          var step2list:Object = {

          "ational" : "ate",

          "tional" : "tion",

          "enci" : "ence",

          "anci" : "ance",

          "izer" : "ize",

          "bli" : "ble",

          "alli" : "al",

          "entli" : "ent",

          "eli" : "e",

          "ousli" : "ous",

          "ization" : "ize",

          "ation" : "ate",

          "ator" : "ate",

          "alism" : "al",

          "iveness" : "ive",

          "fulness" : "ful",

          "ousness" : "ous",

          "aliti" : "al",

          "iviti" : "ive",

          "biliti" : "ble",

          "logi" : "log"

          };

          var step3list:Object = {

          "icate" : "ic",

          "ative" : "",

          "alize" : "al",

          "iciti" : "ic",

          "ical" : "ic",

          "ful" : "",

          "ness" : ""

          };

          var c:String="[^aeiou]";                                        // consonant

          var v:String="[aeiouy]";                                        // vowel

          var C:String=c+"[^aeiouy]*";                                   // consonant sequence

          var V:String=v+"[aeiou]*";                                   // vowel sequence

          var mgr0:String="^("+C+")?"+V+C;                              // VC... is m>0

          var meq1:String="^("+C+")?"+V+C+"("+V+")?$";          // VC is m=1

          var mgr1:String="^("+C+")?"+V+C+V+C;                         // VCVC... is m>1

          var s_v:String="^("+C+")?"+v;                                   // vowel in stem

          var origword:String=w;

          if (w.length<3) {

               return w;

          }

          firstch=w.substr(0,1);

          if (firstch=="y") {

               w=firstch.toUpperCase()+w.substr(1);

          }

          // Step 1a

          re=/^(.+?)(ss|i)es$/;

          re2=/^(.+?)([^s])s$/;

          if (re.test(w)) {

               w=w.replace(re,"$1$2");

          } else if (re2.test(w)) {

               w=w.replace(re2,"$1$2");

          }

          // Step 1b

          re=/^(.+?)eed$/;

          re2=/^(.+?)(ed|ing)$/;

          if (re.test(w)) {

               fp=re.exec(w);

               re=new RegExp(mgr0);

               if (re.test(fp[1])) {

                         re=/.$/;

                         w=w.replace(re,"");

               }

          } else if (re2.test(w)) {

               fp=re2.exec(w);

               stem=fp[1];

               re2=new RegExp(s_v);

               if (re2.test(stem)) {

                         w=stem;

                         re2=/(at|bl|iz)$/;

                         re3=new RegExp("([^aeiouylsz])\\1$");

                         re4=new RegExp("^"+C+v+"[^aeiouwxy]$");

                         if (re2.test(w)) {

                                   w=w+"e";

                         } else if (re3.test(w)) {

                                   re=/.$/;

                                   w=w.replace(re,"");

                         } else if (re4.test(w)) {

                                   w=w+"e";

                         }

               }

          }

          // Step 1c

          re=/^(.+?)y$/;

          if (re.test(w)) {

                    fp=re.exec(w);

                    stem=fp[1];

                    re=new RegExp(s_v);

                    if (re.test(stem)) {

                              w=stem+"i";

                    }

          }

          // Step 2

          re=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;

          if (re.test(w)) {

                    fp=re.exec(w);

                    stem=fp[1];

                    suffix=fp[2];

                    re=new RegExp(mgr0);

                    if (re.test(stem)) {

                              w=stem+step2list[suffix];

                    }

          }

          // Step 3

          re=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;

          if (re.test(w)) {

                    fp=re.exec(w);

                    stem=fp[1];

                    suffix=fp[2];

                    re=new RegExp(mgr0);

                    if (re.test(stem)) {

                              w=stem+step3list[suffix];

                    }

          }

          // Step 4

          re=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;

          re2=/^(.+?)(s|t)(ion)$/;

          if (re.test(w)) {

                    fp=re.exec(w);

                    stem=fp[1];

                    re=new RegExp(mgr1);

                    if (re.test(stem)) {

                              w=stem;

                    }

          } else if (re2.test(w)) {

                    fp=re2.exec(w);

                    stem=fp[1]+fp[2];

                    re2=new RegExp(mgr1);

                    if (re2.test(stem)) {

                              w=stem;

                    }

          }

          // Step 5

          re=/^(.+?)e$/;

          if (re.test(w)) {

                    fp=re.exec(w);

                    stem=fp[1];

                    re=new RegExp(mgr1);

                    re2=new RegExp(meq1);

                    re3=new RegExp("^"+C+v+"[^aeiouwxy]$");

                    if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {

                              w=stem;

                    }

          }

          re=/ll$/;

          re2=new RegExp(mgr1);

          if (re.test(w)&&re2.test(w)) {

                    re=/.$/;

                    w=w.replace(re,"");

          }

          // and turn initial Y back to y

          if (firstch=="y") {

                    w=firstch.toLowerCase()+w.substr(1);

          }

          return w;

}

This topic has been closed for replies.

1 reply

RothrockAuthor
Inspiring
January 23, 2013

D'oh! That solution helped with the real-world use—by a lot. I feel so silly for not having thought of it before. I guess that is why I come to the forums and ask questions.

I was using this in a little search engine. I was asynchronously indexing around 150 xml "documents" and it was taking around 4900 ms before the index was ready for searching. Keeping a look up object allowed me to cut that to around 2600 ms. And it looks like it doesn't use up that much more memory 56.52 MB vs 56.67 MB.

Still if anybody has other suggestions that would be awesome.