profoz committed
Commit 3523908 · 1 Parent(s): 66ea0bf

better parsed body

Files changed (1)
  1. app.py +27 -22
app.py CHANGED
@@ -1,30 +1,30 @@
-import streamlit as st
-from transformers import pipeline
-from sentence_transformers import CrossEncoder
+import openai
 import requests
+import streamlit as st
 from bs4 import BeautifulSoup
-from functools import reduce
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelWithLMHead, pipeline
-import openai
+from sentence_transformers import CrossEncoder
+from transformers import pipeline
 
 all_documents = {}
 
+
 def qa_gpt3(question, context):
     print(question, context)
     openai.api_key = st.secrets["openai_key"]
 
     response = openai.Completion.create(
-      model="text-davinci-002",
-      prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
-      temperature=0.7,
-      max_tokens=256,
-      top_p=1,
-      frequency_penalty=0,
-      presence_penalty=0
+        model="text-davinci-002",
+        prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
+        temperature=0.7,
+        max_tokens=256,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0
     )
     print(response)
     return {'answer': response['choices'][0]['text'].strip()}
 
+
 st.title('Document Question Answering System')
 
 qa_model = None
@@ -32,13 +32,14 @@ qa_model = None
 crawl_urls = st.checkbox('Crawl?', value=False)
 
 document_text = st.text_area(
-  label="Links (Comma separated)", height=100,
-  value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
+    label="Links (Comma separated)", height=100,
+    value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
 )
 query = st.text_input("Query")
 
 qa_option = st.selectbox('Q/A Answerer', ('gpt3', 'a-ware/bart-squadv2'))
-tokenizing = st.selectbox('How to Tokenize', ("Don't (use entire body as document)", 'Newline (split by newline character)', 'Combo'))
+tokenizing = st.selectbox('How to Tokenize',
+                          ("Don't (use entire body as document)", 'Newline (split by newline character)', 'Combo'))
 
 if qa_option == 'gpt3':
     qa_model = qa_gpt3
@@ -48,6 +49,7 @@ st.write(f'Using {qa_option} as the Q/A model')
 
 encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
 
+
 def get_relevent_passage(question, documents):
     query_paragraph_list = [(question, para) for para in list(documents.keys()) if len(para.strip()) > 0]
 
@@ -76,12 +78,15 @@ def get_documents(document_text, crawl=crawl_urls):
             st.write('Give me a sec, crawling..')
             import re
 
-            more_urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)
-            more_urls = list(set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
+            more_urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
+                                   html)
+            more_urls = list(
+                set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
             for more_url in more_urls:
                 all_documents.update(get_documents(more_url, crawl=False))
 
-        body = soup.get_text()
+        body = "\n".join([x for x in soup.body.get_text().split('\n') if len(x) > 10])
+        print(body)
 
         if tokenizing == "Don't (use entire body as document)":
            document_paragraphs = [body]
@@ -109,6 +114,6 @@ if len(document_text.strip()) > 0 and len(query.strip()) > 0 and qa_model and en
     relevant_url = documents[context]
 
     st.write('Check the answer below...with reference text')
-    st.header("ANSWER: "+answer)
-    st.subheader("REFERENCE: "+context)
-    st.subheader("REFERENCE URL: "+relevant_url)
+    st.header("ANSWER: " + answer)
+    st.subheader("REFERENCE: " + context)
+    st.subheader("REFERENCE URL: " + relevant_url)
 