Spaces:

samarthagarwal23
/

QuestionAnswering_on_annual_reports

Runtime error

samarthagarwal23 committed on Jan 29, 2022

Commit

d1391ee

•

1 Parent(s): 981b258

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -29,11 +29,12 @@ def read_pdf(file):
   # We use BM25 as the retriever, which does the first round of candidate filtering based on word-level matching
 def bm25_tokenizer(text):
     tokenized_doc = []
     for token in text.lower().split():
         token = token.strip(string.punctuation)
-        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
             tokenized_doc.append(token)
     return tokenized_doc

   # We use BM25 as the retriever, which does the first round of candidate filtering based on word-level matching
 def bm25_tokenizer(text):
+    stop_w = ['a', 'the', 'am', 'is' , 'are', 'who', 'how', 'where', 'when', 'why']
     tokenized_doc = []
     for token in text.lower().split():
         token = token.strip(string.punctuation)
+        if len(token) > 0 and token not in stop_w:
             tokenized_doc.append(token)
     return tokenized_doc