qmaruf committed
Commit 234623e
1 Parent(s): f0e70d4

Feature added to get data from URL

Files changed (2):
  1. app.py +26 -11
  2. utils.py +46 -2
app.py CHANGED
@@ -12,11 +12,13 @@ from loguru import logger
 from config import Config
 from utils import create_vectordb
 from utils import get_qa_chain
+from utils import load_file
+from utils import load_url
+from utils import save_file_locally
 load_dotenv()
 
 openai.api_key = os.environ['OPENAI_API_KEY']
 
-
 if 'messages' not in st.session_state:
     st.session_state.messages = []
 
@@ -25,34 +27,47 @@ for message in st.session_state.messages:
     st.markdown(message['content'])
 
 uploaded_file = st.sidebar.file_uploader('Upload a file', type=['pdf', 'txt'])
+doc_url = st.sidebar.text_input('Or enter a URL to a document')
+
+if uploaded_file is not None and doc_url != '':
+    st.sidebar.error('Please choose one or the other')
+    st.stop()
 
 
 def set_status():
     if uploaded_file is None:
-        Path(Config.vectorstore_path).unlink(missing_ok=True)
+        # Path(Config.vectorstore_path).unlink(missing_ok=True)
         st.sidebar.info('Upload a file to start a conversation')
     else:
         st.sidebar.info(f"Let's talk to {Path(uploaded_file.name)}")
 
 
-def process_uploaded_file(uploaded_file):
+def process_data(data, data_type):
     if 'context' not in st.session_state:
-        logger.info(f'file uploaded {uploaded_file}')
-        upath = f'docs/{uploaded_file.name}'
-        logger.info(f'file saved to {upath}')
+        if data_type == 'file':
+            upath = f'docs/{uploaded_file.name}'
+            save_file_locally(data, upath)
+            load_file(upath)
+        else:
+            load_url(data)
+        st.session_state['context'] = True
 
-        with open(upath, 'wb') as hndl:
-            hndl.write(uploaded_file.getbuffer())
 
-        create_vectordb(upath)
+def process_uploaded_doc():
+    if 'context' not in st.session_state:
+        loader = Uns
         st.session_state['context'] = True
 
 
 set_status()
 
 
-if uploaded_file is not None:
-    process_uploaded_file(uploaded_file)
+if uploaded_file is not None or doc_url != '':
+    if uploaded_file is not None:
+        process_data(uploaded_file, data_type='file')
+    else:
+        process_data(doc_url, data_type='url')
+
 qr_chain = get_qa_chain()
 
 if prompt := st.chat_input('Send a message'):
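Two things stand out in the new app.py code: the file branch of `process_data` builds `upath` from the module-level `uploaded_file` rather than from its own `data` argument, and `process_uploaded_doc` is an unfinished leftover (`loader = Uns`) that nothing calls. A minimal self-contained sketch of the same branch, assuming the `save_file_locally`, `load_file`, and `load_url` helpers this commit adds to utils.py:

import streamlit as st

from utils import load_file
from utils import load_url
from utils import save_file_locally


def process_data(data, data_type):
    # Build the session's retrieval context once, whichever input was given.
    if 'context' not in st.session_state:
        if data_type == 'file':
            # `data` is the Streamlit UploadedFile here, so derive the
            # destination from it instead of the module-level `uploaded_file`.
            upath = f'docs/{data.name}'
            save_file_locally(data, upath)
            load_file(upath)
        else:
            load_url(data)
        st.session_state['context'] = True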
utils.py CHANGED
@@ -7,6 +7,7 @@ from dotenv import load_dotenv
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
 from langchain.document_loaders import UnstructuredFileLoader
+from langchain.document_loaders import UnstructuredURLLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
@@ -29,19 +30,62 @@ def get_prompt():
     """
     This function creates a prompt template that will be used to generate the prompt for the model.
     """
-    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
+    template = """Only use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If you find an answer, explain the reasoning behind it. Don't make up new terms which are not available in the context.
     ---
     Context: {context}
     Question: {question}
     Answer:"""
     qa_prompt = PromptTemplate(
         template=template, input_variables=[
-            'question', 'context', 'chat_history',
+            'question', 'context',
         ],
     )
     return qa_prompt
 
 
+# def process_data():
+
+
+def save_file_locally(uploaded_file, dest):
+    with open(dest, 'wb') as hndl:
+        hndl.write(uploaded_file.getbuffer())
+
+
+def get_text_splitter():
+    """
+    This function creates a text splitter.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=Config.chunk_size,
+        chunk_overlap=Config.chunk_overlap,
+        length_function=len,
+    )
+    return text_splitter
+
+
+def create_vectorstore(data):
+    text_splitter = get_text_splitter()
+    documents = text_splitter.split_documents(data)
+    embeddings = OpenAIEmbeddings()
+    vectorstore = FAISS.from_documents(documents, embeddings)
+    with open(Config.vectorstore_path, 'wb') as f:
+        pickle.dump(vectorstore, f)
+
+
+def load_url(url):
+    loader = UnstructuredURLLoader(urls=[url])
+    data = loader.load()
+    import pdb
+    pdb.set_trace()
+    create_vectorstore(data)
+
+
+def load_file(file):
+    loader = UnstructuredFileLoader(file)
+    data = loader.load()
+    create_vectorstore(data)
+
+
 def create_vectordb(file_path):
     """
     This function creates a vectorstore from a file.
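The committed `load_url` still contains an `import pdb` / `pdb.set_trace()` breakpoint, which would freeze the Streamlit process on every URL ingest, and the hunk never shows the imports `create_vectorstore` relies on (`RecursiveCharacterTextSplitter`, `FAISS`, and `pickle` presumably live near the top of utils.py, outside this diff). A cleaned-up sketch of the URL path, assuming the classic langchain import locations for those names:

import pickle

from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from config import Config


def create_vectorstore(data):
    # Split, embed, and persist the documents, mirroring the commit;
    # chunking parameters come from Config as in the diff.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.chunk_size,
        chunk_overlap=Config.chunk_overlap,
        length_function=len,
    )
    documents = text_splitter.split_documents(data)
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)
    with open(Config.vectorstore_path, 'wb') as f:
        pickle.dump(vectorstore, f)


def load_url(url):
    # Same flow as the committed version, minus the leftover breakpoint.
    loader = UnstructuredURLLoader(urls=[url])
    data = loader.load()
    create_vectorstore(data)

With this in place, the URL branch in app.py (`process_data(doc_url, data_type='url')`) ingests the page and writes the pickled index that `get_qa_chain` presumably reads back from `Config.vectorstore_path`.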