summaryrefslogtreecommitdiffstats
path: root/webapp/django/utils/stopwords.py
diff options
context:
space:
mode:
Diffstat (limited to 'webapp/django/utils/stopwords.py')
-rw-r--r--webapp/django/utils/stopwords.py42
1 files changed, 42 insertions, 0 deletions
diff --git a/webapp/django/utils/stopwords.py b/webapp/django/utils/stopwords.py
new file mode 100644
index 0000000000..18aeb7f5d3
--- /dev/null
+++ b/webapp/django/utils/stopwords.py
@@ -0,0 +1,42 @@
+# Performance note: I benchmarked this code using a set instead of
+# a list for the stopwords and was surprised to find that the list
+# performed /better/ than the set - maybe because it's only a small
+# list.
+
# Words that strip_stopwords() drops; input words are lower-cased before
# being compared against these entries.  Deliberately kept as a list.
stopwords = [
    'i',
    'a',
    'an',
    'are',
    'as',
    'at',
    'be',
    'by',
    'for',
    'from',
    'how',
    'in',
    'is',
    'it',
    'of',
    'on',
    'or',
    'that',
    'the',
    'this',
    'to',
    'was',
    'what',
    'when',
    'where',
]
+
def strip_stopwords(sentence):
    """Remove stopwords from ``sentence`` and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lower-cased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are joined with single spaces.

    Matching is done on whole whitespace-delimited tokens, so a stopword
    with punctuation attached (e.g. ``"the,"``) is kept.  Words that are
    kept retain their original case.

    Returns the filtered sentence as a text string; the result is empty
    when every word is a stopword or the input contains no words.
    """
    # Collect into a separate list instead of rebinding the ``sentence``
    # argument to a value of a different type.
    kept = [
        word for word in sentence.split()
        if word.lower() not in stopwords
    ]
    return u' '.join(kept)
+