Ideas for speeding up stemming code?
Just in case anybody has experience with this...I'm wondering if anybody might know how to speed this up? I found this javascript implementation. I've made a few little AS changes, but generally this is untouched.
Right now I'm testing on a list of about 23,000 unique words and it is taking about 3600 ms. But in my use I'm planning to use it on words that will be repeated many times. Maybe I could keep an object of "known word-stem" paris and do a lookup. That might be much faster....hmm...any other ideas?
public static function porterStem(w:String):String {
var stem:String;
var suffix:String;
var firstch:String;
var re,re2,re3,re4:RegExp;
var fp:Array;
var step2list:Object = {
"ational" : "ate",
"tional" : "tion",
"enci" : "ence",
"anci" : "ance",
"izer" : "ize",
"bli" : "ble",
"alli" : "al",
"entli" : "ent",
"eli" : "e",
"ousli" : "ous",
"ization" : "ize",
"ation" : "ate",
"ator" : "ate",
"alism" : "al",
"iveness" : "ive",
"fulness" : "ful",
"ousness" : "ous",
"aliti" : "al",
"iviti" : "ive",
"biliti" : "ble",
"logi" : "log"
};
var step3list:Object = {
"icate" : "ic",
"ative" : "",
"alize" : "al",
"iciti" : "ic",
"ical" : "ic",
"ful" : "",
"ness" : ""
};
var c:String="[^aeiou]"; // consonant
var v:String="[aeiouy]"; // vowel
var C:String=c+"[^aeiouy]*"; // consonant sequence
var V:String=v+"[aeiou]*"; // vowel sequence
var mgr0:String="^("+C+")?"+V+C; //
VC... is m>0 var meq1:String="^("+C+")?"+V+C+"("+V+")?$"; //
VC is m=1 var mgr1:String="^("+C+")?"+V+C+V+C; //
VCVC... is m>1 var s_v:String="^("+C+")?"+v; // vowel in stem
var origword:String=w;
if (w.length<3) {
return w;
}
firstch=w.substr(0,1);
if (firstch=="y") {
w=firstch.toUpperCase()+w.substr(1);
}
// Step 1a
re=/^(.+?)(ss|i)es$/;
re2=/^(.+?)([^s])s$/;
if (re.test(w)) {
w=w.replace(re,"$1$2");
} else if (re2.test(w)) {
w=w.replace(re2,"$1$2");
}
// Step 1b
re=/^(.+?)eed$/;
re2=/^(.+?)(ed|ing)$/;
if (re.test(w)) {
fp=re.exec(w);
re=new RegExp(mgr0);
if (re.test(fp[1])) {
re=/.$/;
w=w.replace(re,"");
}
} else if (re2.test(w)) {
fp=re2.exec(w);
stem=fp[1];
re2=new RegExp(s_v);
if (re2.test(stem)) {
w=stem;
re2=/(at|bl|iz)$/;
re3=new RegExp("([^aeiouylsz])\\1$");
re4=new RegExp("^"+C+v+"[^aeiouwxy]$");
if (re2.test(w)) {
w=w+"e";
} else if (re3.test(w)) {
re=/.$/;
w=w.replace(re,"");
} else if (re4.test(w)) {
w=w+"e";
}
}
}
// Step 1c
re=/^(.+?)y$/;
if (re.test(w)) {
fp=re.exec(w);
stem=fp[1];
re=new RegExp(s_v);
if (re.test(stem)) {
w=stem+"i";
}
}
// Step 2
re=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
fp=re.exec(w);
stem=fp[1];
suffix=fp[2];
re=new RegExp(mgr0);
if (re.test(stem)) {
w=stem+step2list[suffix];
}
}
// Step 3
re=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
fp=re.exec(w);
stem=fp[1];
suffix=fp[2];
re=new RegExp(mgr0);
if (re.test(stem)) {
w=stem+step3list[suffix];
}
}
// Step 4
re=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2=/^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
fp=re.exec(w);
stem=fp[1];
re=new RegExp(mgr1);
if (re.test(stem)) {
w=stem;
}
} else if (re2.test(w)) {
fp=re2.exec(w);
stem=fp[1]+fp[2];
re2=new RegExp(mgr1);
if (re2.test(stem)) {
w=stem;
}
}
// Step 5
re=/^(.+?)e$/;
if (re.test(w)) {
fp=re.exec(w);
stem=fp[1];
re=new RegExp(mgr1);
re2=new RegExp(meq1);
re3=new RegExp("^"+C+v+"[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
w=stem;
}
}
re=/ll$/;
re2=new RegExp(mgr1);
if (re.test(w)&&re2.test(w)) {
re=/.$/;
w=w.replace(re,"");
}
// and turn initial Y back to y
if (firstch=="y") {
w=firstch.toLowerCase()+w.substr(1);
}
return w;
}
