Pennywise881 committed on
Commit
9f23e0b
1 Parent(s): 0b52499

uploaded code files

Files changed (4)
  1. Article.py +46 -0
  2. QueryProcessor.py +98 -0
  3. QuestionAnswer.py +129 -0
  4. app.py +84 -0
Article.py ADDED
@@ -0,0 +1,46 @@
import wikipediaapi


class Article:

    def __init__(self, article_name):
        self.article_data = {}
        self.article = wikipediaapi.Wikipedia('en').page(article_name)

    def article_exists(self):
        # exists() hits the network and can raise; treat any failure as "not found"
        try:
            return self.article.exists()
        except Exception:
            return False

    def get_sections_and_texts(self, sections):
        # store the summary once, then walk the section tree recursively
        if 'Summary' not in self.article_data:
            self.article_data['Summary'] = ''
            if self.article.summary:
                self.article_data['Summary'] = self.article.summary.lower().split('\n')

        for section in sections:
            if section.text:
                self.article_data[section.title] = section.text.lower().split('\n')
            if len(section.sections) > 0:
                self.get_sections_and_texts(section.sections)

    def remove_empty_sections(self):
        # rebuild each list rather than removing items while iterating over it
        for section, docs in self.article_data.items():
            self.article_data[section] = [d for d in docs if len(d) > 0]

    def get_article_data(self):
        self.get_sections_and_texts(self.article.sections)
        self.remove_empty_sections()

        # corpus statistics consumed by the BM25 scorer: document count and average length
        num_docs = sum(len(docs) for _, docs in self.article_data.items())
        avg_doc_len = sum(len(doc.split()) for _, docs in self.article_data.items() for doc in docs) / num_docs

        return {
            'article_data': self.article_data,
            'num_docs': num_docs,
            'avg_doc_len': avg_doc_len
        }
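
A minimal standalone sketch of how this class is meant to be driven (not part of the commit; the article title below is only an illustration). It shows the dictionary returned by get_article_data, whose num_docs and avg_doc_len fields feed the BM25 scorer in QueryProcessor.py.

    from Article import Article

    article = Article(article_name='Alan Turing')  # illustrative title; any Wikipedia page works
    if article.article_exists():
        data = article.get_article_data()
        print(data['num_docs'], data['avg_doc_len'])  # corpus statistics used by BM25
        print(list(data['article_data'].keys()))      # section titles, starting with 'Summary'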
QueryProcessor.py ADDED
@@ -0,0 +1,98 @@
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


class QueryProcessor:

    def __init__(self, question, section_texts, N, avg_doc_len):
        self.section_texts = section_texts
        self.N = N
        self.avg_doc_len = avg_doc_len

        self.query_items = self.set_query(question)
        self.section_document_idx = None

    def set_query(self, question):
        # lower-case the question, drop punctuation and English stop words
        punct_regex = RegexpTokenizer(r'\w+')
        return [q for q in punct_regex.tokenize(question.lower()) if q not in stopwords.words('english')]

    def get_query(self):
        return self.query_items

    def bm25(self, word, paragraph, k=1.2, b=0.75):
        # frequency of the word in this document (paragraph)
        freq = paragraph.split().count(word)

        # length-normalised term frequency
        tf = (freq * (k + 1)) / (freq + k * (1 - b + b * len(paragraph.split()) / self.avg_doc_len))

        # number of documents that contain the word
        N_q = sum(1 for _, docs in self.section_texts.items() for doc in docs if word in doc.split())

        # inverse document frequency
        idf = np.log(((self.N - N_q + 0.5) / (N_q + 0.5)) + 1)

        return round(tf * idf, 4)

    def get_bm25_scores(self):
        # score every document against every query term, keeping only non-zero scores
        bm25_scores = {}

        for query in self.query_items:
            bm25_scores[query] = {}
            for section, docs in self.section_texts.items():
                bm25_scores[query][section] = {}
                for doc_index in range(len(docs)):
                    score = self.bm25(query, docs[doc_index])
                    if score > 0.0:
                        bm25_scores[query][section][doc_index] = score

                if len(bm25_scores[query][section]) <= 0:
                    del bm25_scores[query][section]

        return bm25_scores

    def filter_bad_documents(self, bm25_scores):
        # keep only documents whose BM25 score clears the fixed 0.5 threshold
        section_document_idx = {}

        for sec_docs in bm25_scores.values():
            for sec, doc_scores in sec_docs.items():
                if sec not in section_document_idx:
                    section_document_idx[sec] = []
                for doc_idx, score in doc_scores.items():
                    if score > 0.5 and doc_idx not in section_document_idx[sec]:
                        section_document_idx[sec].append(doc_idx)

                if len(section_document_idx[sec]) <= 0:
                    del section_document_idx[sec]

        return section_document_idx

    def get_context(self):
        bm25_scores = self.get_bm25_scores()
        self.section_document_idx = self.filter_bad_documents(bm25_scores)

        # concatenate all retained documents into a single context string for the reader model
        context = ' '.join([self.section_texts[section][d_id]
                            for section, doc_ids in self.section_document_idx.items()
                            for d_id in doc_ids])

        return context

    def match_section_with_answer_text(self, text):
        # return the section titles whose retained documents contain the answer text
        sections = []
        for sec, doc_ids in self.section_document_idx.items():
            for d_id in doc_ids:
                if self.section_texts[sec][d_id].find(text) > -1:
                    sections.append(sec)

        return sections
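
A toy retrieval sketch (not part of the commit; the section texts and question are invented). It illustrates what the class computes: set_query strips punctuation and NLTK English stop words, bm25 scores each document with tf * idf where tf = f(k+1) / (f + k(1 - b + b * |d| / avgdl)) and idf = ln((N - n + 0.5) / (n + 0.5) + 1) using k = 1.2 and b = 0.75, and get_context keeps only documents scoring above 0.5.

    import nltk
    nltk.download('stopwords', quiet=True)  # set_query relies on the NLTK stop-word list

    from QueryProcessor import QueryProcessor

    # two lower-cased "documents" in one section, mirroring Article.get_article_data() output
    section_texts = {
        'Summary': ['alan turing was a mathematician and computer scientist',
                    'he worked at bletchley park during the war'],
    }
    num_docs = 2
    avg_doc_len = sum(len(d.split()) for docs in section_texts.values() for d in docs) / num_docs

    qp = QueryProcessor('Where did Turing work?', section_texts, num_docs, avg_doc_len)
    print(qp.get_query())    # ['turing', 'work'] after stop-word removal
    print(qp.get_context())  # only documents whose BM25 score exceeds the 0.5 threshold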
QuestionAnswer.py ADDED
@@ -0,0 +1,129 @@
import torch
import numpy as np


class QuestionAnswer:

    def __init__(self, data, model, tokenizer, torch_device):
        self.max_length = 384
        self.doc_stride = 128

        self.tokenizer = tokenizer
        self.model = model
        self.data = data
        self.torch_device = torch_device

        self.output = None
        self.features = None
        self.results = None

    def get_output_from_model(self):
        # tokenize the question/context pair (long contexts overflow into strided chunks)
        # and run the model without tracking gradients
        with torch.no_grad():
            tokenized_data = self.tokenizer(
                self.data['question'],
                self.data['context'],
                truncation='only_second',
                max_length=self.max_length,
                stride=self.doc_stride,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding='max_length',
                return_tensors='pt'
            ).to(self.torch_device)

            output = self.model(tokenized_data['input_ids'], tokenized_data['attention_mask'])

        return output

    def prepare_features(self, example):
        # re-tokenize without tensors to keep the character offset mappings for post-processing
        tokenized_example = self.tokenizer(
            example['question'],
            example['context'],
            truncation='only_second',
            max_length=self.max_length,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding='max_length',
        )

        for i in range(len(tokenized_example['input_ids'])):
            sequence_ids = tokenized_example.sequence_ids(i)
            context_index = 1

            # keep offsets only for context tokens; question and special tokens get None
            tokenized_example["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_example["offset_mapping"][i])
            ]

        return tokenized_example

    def postprocess_qa_predictions(self, data, features, raw_predictions, top_n_answers=5, max_answer_length=30):
        all_start_logits, all_end_logits = raw_predictions.start_logits, raw_predictions.end_logits

        results = []
        context = data['context']

        for i in range(len(features['input_ids'])):
            start_logits = all_start_logits[i].cpu().numpy()
            end_logits = all_end_logits[i].cpu().numpy()

            offset_mapping = features['offset_mapping'][i]

            # indices of the top_n_answers highest start and end logits
            start_indices = np.argsort(start_logits)[-1: -top_n_answers - 1: -1].tolist()
            end_indices = np.argsort(end_logits)[-1: -top_n_answers - 1: -1].tolist()

            for start_index in start_indices:
                for end_index in end_indices:
                    # skip spans outside the context, reversed spans, and spans that are too long
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                        or end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    score = start_logits[start_index] + end_logits[end_index]
                    results.append(
                        {
                            'score': float('%.*g' % (3, score)),
                            'text': context[start_char: end_char]
                        }
                    )

        results = sorted(results, key=lambda x: x["score"], reverse=True)[:top_n_answers]
        return results

    def get_results(self):
        self.output = self.get_output_from_model()
        self.features = self.prepare_features(self.data)
        self.results = self.postprocess_qa_predictions(self.data, self.features, self.output)

        return self.results
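
A standalone sketch of the reader stage (not part of the commit; the question and context strings are invented, and the checkpoint name is the one app.py loads). get_results tokenizes the pair, runs the model, and returns up to top_n_answers candidate spans sorted by the summed start and end logits.

    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    from QuestionAnswer import QuestionAnswer

    checkpoint = 'Pennywise881/distilbert-base-uncased-finetuned-squad-v2'  # same model as app.py
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

    data = {
        'question': 'Who proposed the imitation game?',
        'context': 'the imitation game, later known as the turing test, was proposed by alan turing in 1950.',
    }
    qa = QuestionAnswer(data, model, tokenizer, 'cpu')
    for result in qa.get_results():  # highest-scoring spans first
        print(result['score'], result['text'])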
app.py ADDED
@@ -0,0 +1,84 @@
import streamlit as st
import wikipediaapi
from Article import Article
from QueryProcessor import QueryProcessor
from QuestionAnswer import QuestionAnswer

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')
tokenizer = AutoTokenizer.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')

st.write("""
# Wiki Q & A
""")

placeholder = st.empty()
wiki_wiki = wikipediaapi.Wikipedia('en')

if "found_article" not in st.session_state:
    st.session_state.page = 0
    st.session_state.found_article = False
    st.session_state.article = ''
    st.session_state.conversation = []
    st.session_state.article_data = {}


def get_article():
    article_name = placeholder.text_input('Enter the name of a Wikipedia article', '')

    if article_name:
        page = wiki_wiki.page(article_name)
        if page.exists():
            st.session_state.found_article = True
            st.session_state.article = article_name

            article = Article(article_name=article_name)
            st.session_state.article_data = article.get_article_data()

            ask_questions()
        else:
            st.write(f'Sorry, could not find Wikipedia article: {article_name}')


def ask_questions():
    question = placeholder.text_input(f"Ask questions about {st.session_state.article}", '')
    st.header("Questions and Answers:")

    if question:
        # retrieve the most relevant passages from the article with BM25
        query_processor = QueryProcessor(
            question=question,
            section_texts=st.session_state.article_data['article_data'],
            N=st.session_state.article_data['num_docs'],
            avg_doc_len=st.session_state.article_data['avg_doc_len']
        )

        context = query_processor.get_context()

        data = {
            'question': question,
            'context': context
        }

        # extract answer spans from the retrieved context
        qa = QuestionAnswer(data, model, tokenizer, 'cpu')
        results = qa.get_results()

        # join the top answers into a single comma-separated string
        answer = ''
        for r in results:
            answer += r['text'] + ", "

        answer = answer[:len(answer) - 2]
        st.session_state.conversation.append({'question': question, 'answer': answer})
        st.session_state.conversation.reverse()

    if len(st.session_state.conversation) > 0:
        for data in st.session_state.conversation:
            st.text("Question: " + data['question'] + "\n" + "Answer: " + data['answer'])


if st.session_state.found_article == False:
    get_article()
else:
    ask_questions()
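
To try the app locally, the standard Streamlit entry point (streamlit run app.py) should work once streamlit, wikipedia-api, nltk (with its stopwords corpus downloaded), transformers, and torch are installed; this commit does not include a requirements file, so the exact dependency versions are an assumption.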