Sometimes it may be useful to remove stop words from strings, when parsing data in JavaScript. I found myself in that situation with a recent project. I looked around for a String method to do this, but didn’t have much luck, so I wrote my own.
Stop words are very common words such as conjunctions, prepositions, and articles. A developer typically removes them from a string in order to extract its keywords. I found a stop word list after a quick search, and used it in my method. The method I wrote actually extends the JavaScript String data type, so it can be applied to any string. You just have to be sure to define the prototype method before calling it.
Once you’ve defined it, you just call the method like this:
foo = "here is a string that has some stop words in it"; bar = foo.removeStopWords(); // The value of bar will be "string stop words"
The method works by first defining a very large array with stop words in it. The string itself is split on whitespace, creating an array of words (runs of contiguous non-whitespace characters). There are then two loops for processing the words.
The outer loop cycles through all of the words in the string, and the inner loop cycles through all of the stop words. The inner loop performs a comparison between the current word and current stopword. If the word matches the stop word, all instances of it are removed from the main string.
Here’s the code:
/**
 * String method to remove stop words.
 * Written by GeekLad http://geeklad.com
 * Stop words obtained from http://www.lextek.com/manuals/onix/stopwords1.html
 *
 * Usage:  string_variable.removeStopWords();
 * Output: the original string with stop words removed and leading/trailing
 *         whitespace trimmed.
 *
 * A word is a run of non-whitespace characters. It is removed only when the
 * whole word equals a stop word, ignoring case, so "The" is removed but
 * "the," and "it's" are kept. Whitespace between two surviving neighbours is
 * preserved as written; wherever one or more stop words were removed between
 * two surviving words, the gap becomes a single space.
 *
 * The empty string and whitespace-only strings yield "".
 *
 * @this {String} The text to clean.
 * @returns {string} The text without stop words.
 */
String.prototype.removeStopWords = (function () {
  var STOP_WORDS = [
    'a', 'about', 'above', 'across', 'after', 'again', 'against', 'all',
    'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
    'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything',
    'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked',
    'asking', 'asks', 'at', 'away',
    'b', 'back', 'backed', 'backing', 'backs', 'be', 'became', 'because',
    'become', 'becomes', 'been', 'before', 'began', 'behind', 'being',
    'beings', 'best', 'better', 'between', 'big', 'both', 'but', 'by',
    'c', 'came', 'can', 'cannot', 'case', 'cases', 'certain', 'certainly',
    'clear', 'clearly', 'come', 'could',
    'd', 'did', 'differ', 'different', 'differently', 'do', 'does', 'done',
    'down', 'downed', 'downing', 'downs', 'during',
    'e', 'each', 'early', 'either', 'end', 'ended', 'ending', 'ends',
    'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone',
    'everything', 'everywhere',
    'f', 'face', 'faces', 'fact', 'facts', 'far', 'felt', 'few', 'find',
    'finds', 'first', 'for', 'four', 'from', 'full', 'fully', 'further',
    'furthered', 'furthering', 'furthers',
    'g', 'gave', 'general', 'generally', 'get', 'gets', 'give', 'given',
    'gives', 'go', 'going', 'good', 'goods', 'got', 'great', 'greater',
    'greatest', 'group', 'grouped', 'grouping', 'groups',
    'h', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'herself',
    'high', 'higher', 'highest', 'him', 'himself', 'his', 'how', 'however',
    'i', 'if', 'important', 'in', 'interest', 'interested', 'interesting',
    'interests', 'into', 'is', 'it', 'its', 'itself',
    'j', 'just',
    'k', 'keep', 'keeps', 'kind', 'knew', 'know', 'known', 'knows',
    'l', 'large', 'largely', 'last', 'later', 'latest', 'least', 'less',
    'let', 'lets', 'like', 'likely', 'long', 'longer', 'longest',
    'm', 'made', 'make', 'making', 'man', 'many', 'may', 'me', 'member',
    'members', 'men', 'might', 'more', 'most', 'mostly', 'mr', 'mrs', 'much',
    'must', 'my', 'myself',
    'n', 'necessary', 'need', 'needed', 'needing', 'needs', 'never', 'new',
    'newer', 'newest', 'next', 'no', 'nobody', 'non', 'noone', 'not',
    'nothing', 'now', 'nowhere', 'number', 'numbers',
    'o', 'of', 'off', 'often', 'old', 'older', 'oldest', 'on', 'once', 'one',
    'only', 'open', 'opened', 'opening', 'opens', 'or', 'order', 'ordered',
    'ordering', 'orders', 'other', 'others', 'our', 'out', 'over',
    'p', 'part', 'parted', 'parting', 'parts', 'per', 'perhaps', 'place',
    'places', 'point', 'pointed', 'pointing', 'points', 'possible', 'present',
    'presented', 'presenting', 'presents', 'problem', 'problems', 'put',
    'puts',
    'q', 'quite',
    'r', 'rather', 'really', 'right', 'room', 'rooms',
    's', 'said', 'same', 'saw', 'say', 'says', 'second', 'seconds', 'see',
    'seem', 'seemed', 'seeming', 'seems', 'sees', 'several', 'shall', 'she',
    'should', 'show', 'showed', 'showing', 'shows', 'side', 'sides', 'since',
    'small', 'smaller', 'smallest', 'so', 'some', 'somebody', 'someone',
    'something', 'somewhere', 'state', 'states', 'still', 'such', 'sure',
    't', 'take', 'taken', 'than', 'that', 'the', 'their', 'them', 'then',
    'there', 'therefore', 'these', 'they', 'thing', 'things', 'think',
    'thinks', 'this', 'those', 'though', 'thought', 'thoughts', 'three',
    'through', 'thus', 'to', 'today', 'together', 'too', 'took', 'toward',
    'turn', 'turned', 'turning', 'turns', 'two',
    'u', 'under', 'until', 'up', 'upon', 'us', 'use', 'used', 'uses',
    'v', 'very',
    'w', 'want', 'wanted', 'wanting', 'wants', 'was', 'way', 'ways', 'we',
    'well', 'wells', 'went', 'were', 'what', 'when', 'where', 'whether',
    'which', 'while', 'who', 'whole', 'whose', 'why', 'will', 'with',
    'within', 'without', 'work', 'worked', 'working', 'works', 'would',
    'x',
    'y', 'year', 'years', 'yet', 'you', 'young', 'younger', 'youngest',
    'your', 'yours',
    'z'
  ];

  // Build the lookup table once, when the method is defined, instead of
  // scanning the whole list for every word of every call. A null-prototype
  // object is used so that words such as "constructor" or "toString" cannot
  // collide with inherited Object properties.
  var isStopWord = Object.create(null);
  for (var s = 0; s < STOP_WORDS.length; s++) {
    isStopWord[STOP_WORDS[s]] = true;
  }

  return function () {
    // Splitting on a capturing group keeps the separators: words land on
    // even indices and the whitespace runs between them on odd indices.
    // Unlike match(), split() never returns null, so "" is handled too.
    var pieces = String(this).split(/(\s+)/);
    var kept = "";
    var droppedSinceLastKept = false;

    for (var p = 0; p < pieces.length; p += 2) {
      var token = pieces[p];

      // Empty tokens only appear when the text starts or ends with whitespace
      if (token === "") {
        continue;
      }

      if (isStopWord[token.toLowerCase()]) {
        droppedSinceLastKept = true;
        continue;
      }

      // No separator before the first kept word (this trims the start);
      // trailing separators are never appended (this trims the end).
      if (kept !== "") {
        kept += droppedSinceLastKept ? " " : pieces[p - 1];
      }
      kept += token;
      droppedSinceLastKept = false;
    }

    return kept;
  };
})();
© ® 2013 by kumar and john
// text box and search buttons used by us for testing
//script to be added
/**
 * Highlights a search term inside a string of HTML by wrapping every
 * occurrence in a start tag and an end tag. Matching is case-insensitive
 * and the original casing of the matched text is kept in the output.
 *
 * A plain index scan is used instead of a regular expression because
 * occurrences that fall inside an HTML tag (e.g. in an attribute value) or
 * inside a <script> block must be left untouched.
 *
 * @param {string} bodyText            HTML text to search.
 * @param {string} searchTerm          Text to highlight. An empty term
 *                                     returns bodyText unchanged.
 * @param {string} [highlightStartTag] Markup inserted before each match.
 * @param {string} [highlightEndTag]   Markup inserted after each match.
 *        If either tag is omitted or empty, <mark> and </mark> are used.
 * @returns {string} bodyText with every eligible occurrence wrapped.
 */
function doHighlight(bodyText, searchTerm, highlightStartTag, highlightEndTag)
{
  // the highlightStartTag and highlightEndTag parameters are optional
  var startTag = highlightStartTag;
  var endTag = highlightEndTag;
  if ((!startTag) || (!endTag)) {
    startTag = "<mark>";
    endTag = "</mark>";
  }

  // An empty term "matches" at every index without consuming any text, so
  // the loop below would never shrink the remaining text and never end.
  if (!searchTerm) {
    return bodyText;
  }

  var newText = "";
  var remaining = bodyText;
  var lcRemaining = remaining.toLowerCase();
  var lcSearchTerm = searchTerm.toLowerCase();
  var i = -1;

  while (remaining.length > 0) {
    i = lcRemaining.indexOf(lcSearchTerm, i + 1);
    if (i < 0) {
      // no further occurrences: copy the rest through and finish
      newText += remaining;
      remaining = "";
    } else if (remaining.lastIndexOf(">", i) >= remaining.lastIndexOf("<", i)) {
      // The nearest "<" before the match has been closed by a ">", so the
      // match is not inside an HTML tag. Apply the same test to make sure
      // it is not inside a <script> block either.
      if (lcRemaining.lastIndexOf("/script>", i) >= lcRemaining.lastIndexOf("<script", i)) {
        newText += remaining.substring(0, i) + startTag +
          remaining.substr(i, searchTerm.length) + endTag;
        // continue with the text after the match, restarting the scan
        remaining = remaining.substr(i + searchTerm.length);
        lcRemaining = remaining.toLowerCase();
        i = -1;
      }
    }
    // Matches inside a tag or script are skipped: i is left where it is so
    // the next indexOf() resumes just past the rejected occurrence.
  }

  return newText;
}
/**
 * Wrapper around doHighlight that applies highlighting to the current web
 * page by rewriting document.body.innerHTML.
 *
 * Only the "searchText" parameter is required; all other parameters are
 * optional and can be omitted.
 *
 * @param {string}  searchText          Phrase or space-separated words.
 * @param {boolean} [treatAsPhrase]     When true, searchText is highlighted
 *                                      as one phrase; otherwise each word is
 *                                      highlighted individually.
 * @param {boolean} [warnOnFailure]     When true, an alert is shown if the
 *                                      page text is unavailable.
 * @param {string}  [highlightStartTag] Passed through to doHighlight.
 * @param {string}  [highlightEndTag]   Passed through to doHighlight.
 * @returns {boolean} false when the page body is unavailable, true
 *          otherwise (including when there was nothing to highlight).
 */
function highlightSearchTerms(searchText, treatAsPhrase, warnOnFailure, highlightStartTag, highlightEndTag)
{
  // Declared locally: the bare assignment used previously leaked a global
  // and is a ReferenceError in strict mode.
  var searchArray = treatAsPhrase ? [searchText] : searchText.split(" ");

  // Consecutive, leading or trailing spaces produce empty terms. An empty
  // term matches everywhere, so it must never reach the highlighter.
  searchArray = searchArray.filter(function (term) {
    return Boolean(term);
  });

  if (!document.body || typeof(document.body.innerHTML) == "undefined") {
    if (warnOnFailure) {
      alert("Sorry, for some reason the text of this page is unavailable. Searching will not work.");
    }
    return false;
  }

  // Nothing to highlight: leave the DOM alone, because assigning innerHTML
  // rebuilds every element in the body.
  if (searchArray.length === 0) {
    return true;
  }

  var bodyText = document.body.innerHTML;
  for (var i = 0; i < searchArray.length; i++) {
    bodyText = doHighlight(bodyText, searchArray[i], highlightStartTag, highlightEndTag);
  }
  document.body.innerHTML = bodyText;
  return true;
}
/**
 * Checks whether a search keyword is acceptable, i.e. is NOT a stop word.
 *
 * The comparison ignores case and surrounding whitespace, and it tests the
 * text as a whole: a multi-word phrase is never reported as a stop word.
 *
 * @param {string} defaultText The keyword to check.
 * @returns {boolean} true when the text is not a stop word (including when
 *          it is missing or empty), false when it is one.
 */
function stopwordSearch(defaultText)
{
  // Missing or empty input is not a stop word; the caller decides how to
  // report "nothing entered".
  if (!defaultText) {
    return true;
  }

  var my_stop_words = ["a","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","all",
    "allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another",
    "any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are",
    "around","as","aside","ask","asking","associated","at","available","away","awfully","be","became","because","become","becomes",
    "becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between",
    "beyond","both","brief","but","by","cs","came","can","cant","cannot","cause","causes","certain","certainly","changes",
    "clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains",
    "corresponding","could","couldnt","course","currently","definitely","described","despite","did","didnt","different","do",
    "does","doesnt","doing","dont","done","down","downwards","during","each","edu","eg","eight","either","else","elsewhere",
    "enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex",
    "exactly","example","except","far","few","fifth","first","five","followed","following","follows","for","former","formerly",
    "forth","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got",
    "gotten","greetings","had","hadnt","happens","hardly","has","hasnt","have","havent","having","he","hes","hello","help","hence",
    "her","here","heres","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully",
    "how","howbeit","however","id","ill","im","ive","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate",
    "indicated","indicates","inner","insofar","instead","into","inward","is","isnt","it","itd","itll","its","itself","just",
    "keep","keeps","kept","know","knows","known","last","lately","later","latter","latterly","least","less","lest","let","lets",
    "like","liked","likely","little","look","looking","looks","ltd","mainly","many","may","maybe","me","mean","meanwhile","merely",
    "might","more","moreover","most","mostly","much","must","my","myself","name","namely","nd","near","nearly","necessary","need",
    "needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not",
    "nothing","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only",
    "onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","particular",
    "particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","que","quite","qv",
    "rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","said","same",
    "saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves",
    "sensible","sent","serious","seriously","seven","several","shall","she","should","shouldnt","since","six","so","some",
    "somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify",
    "specifying","still","sub","such","sup","sure","ts","take","taken","tell","tends","th","than","thank","thanks","thanx","that",
    "thats","the","their","theirs","them","themselves","then","thence","there","theres","thereafter","thereby","therefore",
    "therein","thereupon","these","they","theyd","theyll","theyre","theyve","think","third","this","thorough","thoroughly",
    "those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries",
    "truly","try","trying","twice","two","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use",
    "used","useful","uses","using","usually","value","various","very","via","viz","vs","want","wants","was","wasnt","way","we",
    "wed","well","were","weve","welcome","went","werent","what","whats","whatever","when","whence","whenever",
    "where","wheres","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither",
    "who","whos","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","wont","wonder",
    "would","wouldnt","yes","yet","you","youd","youll","youre","youve","your","yours","yourself","yourselves","zero"];

  var text = String(defaultText).trim().toLowerCase();

  // indexOf() reports "not found" as -1. Index 0 is a real hit (the first
  // entry, "a"), so the comparison must be against -1 and not "<= 0".
  return my_stop_words.indexOf(text) === -1;
}
/**
 * Validates a search keyword and, when it is acceptable, highlights it on
 * the current page through highlightSearchTerms.
 *
 * The interactive prompt() dialog is disabled; defaultText is used directly
 * as the search text.
 *
 * @param {string}  defaultText     The text to search for.
 * @param {boolean} [treatAsPhrase] When true the text is highlighted as one
 *                                  phrase, otherwise word by word.
 * @param {string}  [textColor]     CSS colour for the highlighted text.
 * @param {string}  [bgColor]       CSS background colour for the highlight.
 *        Custom colours are applied only when both are given; otherwise
 *        empty tags are passed on to highlightSearchTerms.
 * @returns {boolean} The result of highlightSearchTerms, or false when no
 *          text was supplied or the text is a stop word.
 */
function searchPrompt(defaultText, treatAsPhrase, textColor, bgColor)
{
  // Check for missing input before the stop-word lookup, so that an
  // undefined argument produces the message below rather than an exception.
  if (!defaultText) {
    alert("No search terms were entered. Exiting function.");
    return false;
  }

  // reject the keyword when it is a stop word
  if (!stopwordSearch(defaultText)) {
    alert("keyword not valid try again");
    return false;
  }

  // we can optionally use our own highlight tag values
  var highlightStartTag = "";
  var highlightEndTag = "";
  if (textColor && bgColor) {
    highlightStartTag = "<span style=\"color:" + textColor + "; background-color:" + bgColor + ";\">";
    highlightEndTag = "</span>";
  }

  return highlightSearchTerms(defaultText, treatAsPhrase, true, highlightStartTag, highlightEndTag);
}