diff options
Diffstat (limited to 'webapp/django/utils/stopwords.py')
-rw-r--r-- | webapp/django/utils/stopwords.py | 42 |
1 files changed, 42 insertions, 0 deletions
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

# Lower-case words that strip_stopwords() drops. Kept as a plain list
# (see the performance note above); one word per line so that the
# whitespace split below yields one entry per word.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()


def strip_stopwords(sentence):
    """Remove stopwords from *sentence* and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lower-cased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are joined with single spaces.
    Kept words retain their original casing. Matching is on whole
    whitespace-delimited tokens, so a token with attached punctuation
    (e.g. ``"the,"``) is not treated as a stopword.

    Returns the filtered sentence; an empty string if nothing remains.
    """
    # Build a new list instead of rebinding the ``sentence`` parameter.
    kept = [word for word in sentence.split() if word.lower() not in stopwords]
    return u' '.join(kept)