profoz committed
Commit 3523908 · 1 Parent(s): 66ea0bf

better parsed body

Files changed (1)
  1. app.py +27 -22
app.py CHANGED
@@ -1,30 +1,30 @@
-import streamlit as st
-from transformers import pipeline
-from sentence_transformers import CrossEncoder
+import openai
 import requests
+import streamlit as st
 from bs4 import BeautifulSoup
-from functools import reduce
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelWithLMHead, pipeline
-import openai
+from sentence_transformers import CrossEncoder
+from transformers import pipeline
 
 all_documents = {}
 
+
 def qa_gpt3(question, context):
     print(question, context)
     openai.api_key = st.secrets["openai_key"]
 
     response = openai.Completion.create(
-      model="text-davinci-002",
-      prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
-      temperature=0.7,
-      max_tokens=256,
-      top_p=1,
-      frequency_penalty=0,
-      presence_penalty=0
+        model="text-davinci-002",
+        prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
+        temperature=0.7,
+        max_tokens=256,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0
     )
     print(response)
     return {'answer': response['choices'][0]['text'].strip()}
 
+
 st.title('Document Question Answering System')
 
 qa_model = None
@@ -32,13 +32,14 @@ qa_model = None
 crawl_urls = st.checkbox('Crawl?', value=False)
 
 document_text = st.text_area(
-  label="Links (Comma separated)", height=100,
-  value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
+    label="Links (Comma separated)", height=100,
+    value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
 )
 query = st.text_input("Query")
 
 qa_option = st.selectbox('Q/A Answerer', ('gpt3', 'a-ware/bart-squadv2'))
-tokenizing = st.selectbox('How to Tokenize', ("Don't (use entire body as document)", 'Newline (split by newline character)', 'Combo'))
+tokenizing = st.selectbox('How to Tokenize',
+                          ("Don't (use entire body as document)", 'Newline (split by newline character)', 'Combo'))
 
 if qa_option == 'gpt3':
     qa_model = qa_gpt3
@@ -48,6 +49,7 @@ st.write(f'Using {qa_option} as the Q/A model')
 
 encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
 
+
 def get_relevent_passage(question, documents):
     query_paragraph_list = [(question, para) for para in list(documents.keys()) if len(para.strip()) > 0]
 
@@ -76,12 +78,15 @@ def get_documents(document_text, crawl=crawl_urls):
             st.write('Give me a sec, crawling..')
             import re
 
-            more_urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)
-            more_urls = list(set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
+            more_urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
+                                   html)
+            more_urls = list(
+                set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
             for more_url in more_urls:
                 all_documents.update(get_documents(more_url, crawl=False))
 
-        body = soup.get_text()
+        body = "\n".join([x for x in soup.body.get_text().split('\n') if len(x) > 10])
+        print(body)
 
         if tokenizing == "Don't (use entire body as document)":
            document_paragraphs = [body]
@@ -109,6 +114,6 @@ if len(document_text.strip()) > 0 and len(query.strip()) > 0 and qa_model and en
     relevant_url = documents[context]
 
     st.write('Check the answer below...with reference text')
-    st.header("ANSWER: "+answer)
-    st.subheader("REFERENCE: "+context)
-    st.subheader("REFERENCE URL: "+relevant_url)
+    st.header("ANSWER: " + answer)
+    st.subheader("REFERENCE: " + context)
+    st.subheader("REFERENCE URL: " + relevant_url)
 