| | |
| | |
| | |
| |
|
| |
|
// Common English words, all lowercase. The name suggests these are words to
// skip when processing text; nothing in this section reads the list, so
// confirm the exact use against the consuming code.
var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
|
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| |
|
/**
 * English suffix-stripping stemmer. The sequence of rewrite passes below
 * appears to follow the Porter stemming algorithm.
 *
 * Use as a constructor: `new Stemmer().stemWord("running")`.
 * All patterns are compiled once per instance and none of them use the
 * `g`/`y` flags, so `stemWord` keeps no state between calls.
 */
var Stemmer = function() {
  // Suffix -> replacement table used by the first table-driven pass.
  var step2list = {
    ational: 'ate',
    tional: 'tion',
    enci: 'ence',
    anci: 'ance',
    izer: 'ize',
    bli: 'ble',
    alli: 'al',
    entli: 'ent',
    eli: 'e',
    ousli: 'ous',
    ization: 'ize',
    ation: 'ate',
    ator: 'ate',
    alism: 'al',
    iveness: 'ive',
    fulness: 'ful',
    ousness: 'ous',
    aliti: 'al',
    iviti: 'ive',
    biliti: 'ble',
    logi: 'log'
  };

  // Suffix -> replacement table used by the second table-driven pass.
  var step3list = {
    icate: 'ic',
    ative: '',
    alize: 'al',
    iciti: 'ic',
    ical: 'ic',
    ful: '',
    ness: ''
  };

  // Building blocks for the stem-shape patterns.
  var c = "[^aeiou]";          // one consonant (y counts as a consonant here)
  var v = "[aeiouy]";          // one vowel (y counts as a vowel here)
  var C = c + "[^aeiouy]*";    // run of consonants
  var V = v + "[aeiou]*";      // run of vowels

  // Stem-shape tests, compiled once instead of on every stemWord call.
  var mgr0 = new RegExp("^(" + C + ")?" + V + C);                    // [C]VC...
  var meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");  // exactly [C]VC[V]
  var mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);            // [C]VCVC...
  var s_v = new RegExp("^(" + C + ")?" + v);                         // stem contains a vowel
  var cvcEnding = new RegExp("^" + C + v + "[^aeiouwxy]$");          // C-v-c, last not w/x/y
  var doubledConsonant = /([^aeiouylsz])\1$/;                        // doubled letter other than l/s/z
  var lastChar = /.$/;

  // Suffix matchers for each pass.
  var pluralSsesIes = /^(.+?)(ss|i)es$/;
  var pluralS = /^(.+?)([^s])s$/;
  var suffixEed = /^(.+?)eed$/;
  var suffixEdIng = /^(.+?)(ed|ing)$/;
  var restoreE = /(at|bl|iz)$/;
  var suffixY = /^(.+?)y$/;
  var step2Suffix = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
  var step3Suffix = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
  var step4Suffix = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
  var step4Ion = /^(.+?)(s|t)(ion)$/;
  var suffixE = /^(.+?)e$/;
  var doubleL = /ll$/;

  /**
   * Reduce a word to its stem.
   *
   * @param {string} w - Word to stem. The patterns only recognise lowercase
   *   letters, so callers are presumably expected to lowercase first.
   * @returns {string} The stem; words shorter than 3 characters are
   *   returned unchanged.
   */
  this.stemWord = function (w) {
    var stem;
    var suffix;
    var match;

    if (w.length < 3) {
      return w;
    }

    // An initial "y" is upper-cased so the vowel class [aeiouy] does not
    // match it while stemming; it is lower-cased again before returning.
    var firstch = w.charAt(0);
    if (firstch === "y") {
      w = firstch.toUpperCase() + w.slice(1);
    }

    // Plurals: "sses" -> "ss", "ies" -> "i", trailing "s" after a non-s.
    if (pluralSsesIes.test(w)) {
      w = w.replace(pluralSsesIes, "$1$2");
    } else if (pluralS.test(w)) {
      w = w.replace(pluralS, "$1$2");
    }

    // "eed" -> "ee"; otherwise strip "ed"/"ing" and tidy up the stem.
    match = suffixEed.exec(w);
    if (match) {
      if (mgr0.test(match[1])) {
        w = w.replace(lastChar, "");
      }
    } else {
      match = suffixEdIng.exec(w);
      if (match) {
        stem = match[1];
        if (s_v.test(stem)) {
          w = stem;
          if (restoreE.test(w)) {
            w = w + "e";
          } else if (doubledConsonant.test(w)) {
            w = w.replace(lastChar, "");
          } else if (cvcEnding.test(w)) {
            w = w + "e";
          }
        }
      }
    }

    // Trailing "y" -> "i" when the rest of the word contains a vowel.
    match = suffixY.exec(w);
    if (match) {
      stem = match[1];
      if (s_v.test(stem)) {
        w = stem + "i";
      }
    }

    // First table-driven suffix replacement (step2list).
    match = step2Suffix.exec(w);
    if (match) {
      stem = match[1];
      suffix = match[2];
      if (mgr0.test(stem)) {
        w = stem + step2list[suffix];
      }
    }

    // Second table-driven suffix replacement (step3list).
    match = step3Suffix.exec(w);
    if (match) {
      stem = match[1];
      suffix = match[2];
      if (mgr0.test(stem)) {
        w = stem + step3list[suffix];
      }
    }

    // Drop a residual suffix when the remaining stem is long enough;
    // "ion" is only dropped after "s" or "t", which is kept.
    match = step4Suffix.exec(w);
    if (match) {
      stem = match[1];
      if (mgr1.test(stem)) {
        w = stem;
      }
    } else {
      match = step4Ion.exec(w);
      if (match) {
        stem = match[1] + match[2];
        if (mgr1.test(stem)) {
          w = stem;
        }
      }
    }

    // Drop a final "e", then reduce a final "ll" to "l".
    match = suffixE.exec(w);
    if (match) {
      stem = match[1];
      if (mgr1.test(stem) || (meq1.test(stem) && !cvcEnding.test(stem))) {
        w = stem;
      }
    }
    if (doubleL.test(w) && mgr1.test(w)) {
      w = w.replace(lastChar, "");
    }

    // Undo the initial-"y" protection applied above.
    if (firstch === "y") {
      w = firstch.toLowerCase() + w.slice(1);
    }
    return w;
  };
};
|
| |
|
| |
|