Geraldine committed on
Commit
901aafa
1 Parent(s): 560daf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -55
app.py CHANGED
@@ -1,38 +1,36 @@
1
  import os
2
  import time
3
- from transformers import pipeline
4
- from langchain.embeddings import HuggingFaceEmbeddings
5
- from huggingface_hub import InferenceClient
6
  import langchain
7
- from langchain import HuggingFaceHub
8
- from langchain.cache import InMemoryCache
9
  from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
10
- from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
 
 
 
 
11
  from langchain.vectorstores import Chroma
 
 
 
 
 
 
12
  from langchain.chains import RetrievalQA
13
  import gradio as gr
14
 
15
- def define_embeddings_llm(openai_key):
 
 
16
  if openai_key != "":
17
- embeddings = OpenAIEmbeddings(openai_api_key=openai_key)
18
- llm = OpenAI(
19
- temperature=0, model_name="gpt-3.5-turbo-16k", openai_api_key=openai_key, verbose=False
20
- )
21
  else:
22
- HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
23
  embeddings = HuggingFaceEmbeddings(
24
- model_name="sentence-transformers/all-MiniLM-L6-v2"
25
  )
26
- llm = HuggingFaceHub(repo_id="MBZUAI/LaMini-Flan-T5-248M" ,
27
- model_kwargs={"max_length":2048,"do_sample":True,
28
- "temperature":0.2})
29
- langchain.llm_cache = InMemoryCache()
30
- return embeddings,llm
31
-
32
- def build_context(openai_key,files,urls):
33
- embeddings, llm = define_embeddings_llm(openai_key)
34
  documents = []
35
  if files is not None:
 
36
  for idx, file in enumerate(files):
37
  if file.name.endswith('.pdf'):
38
  loader = PyPDFLoader(file.name)
@@ -44,63 +42,71 @@ def build_context(openai_key,files,urls):
44
  loader = UnstructuredPowerPointLoader(file.name)
45
  documents.extend(loader.load())
46
  if urls != "":
 
47
  list_urls = urls.split(sep=",")
48
- for url in list_urls:
49
  loader = OnlinePDFLoader(url)
50
  documents.extend(loader.load())
51
- #text_splitter = RecursiveCharacterTextSplitter(chunk_size=400,chunk_overlap=20,length_function=len,separators=["\n\n", "\n", " ", ""])
52
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
53
  chunked_documents = text_splitter.split_documents(documents)
54
  global vectordb
55
  vectordb = Chroma.from_documents(
56
  documents=chunked_documents,
57
- embedding=embeddings
58
- )
59
- global qa_chain
60
- qa_chain = RetrievalQA.from_chain_type(
61
- llm=llm,
62
- retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
63
- chain_type="stuff",
64
- #return_source_documents=True
65
- )
66
- return "ready"
67
-
68
- def loading():
69
- return "Loading..."
70
 
71
- def respond(message, chat_history):
72
- result = qa_chain({"query": message})["result"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  chat_history.append((message, result))
74
  time.sleep(2)
75
  return "", chat_history
 
 
 
76
 
77
  def clear_chromadb():
78
  ids = vectordb.get()["ids"]
79
  for id in ids:
80
  vectordb._collection.delete(ids=id)
81
 
82
- with gr.Blocks() as demo:
83
  with gr.Row():
84
  openai_key = gr.Textbox(label="Enter your OpenAI API Key if you want to use the gpt-3.5-turbo-16k model. If not, the open source LaMini-Flan-T5-248M is used")
85
  with gr.Row():
86
- with gr.Column():
87
- pdf_docs = gr.Files(label="Load pdf/docx/ppt/pptx files", file_types=['.pdf','.docx','.ppt','.pptx'], type="file")
88
- with gr.Column():
89
- urls = gr.Textbox(label="Enter one of multiple online pdf urls (comma separated if multiple)", value=None)
90
  with gr.Row():
91
- load_context = gr.Button("Load documents and urls")
 
 
 
92
  with gr.Row():
93
- loading_status = gr.Textbox(label="Status", placeholder="", interactive=False)
94
- with gr.Row():
95
- with gr.Column():
96
- hg_chatbot = gr.Chatbot()
97
  msg = gr.Textbox(label="User message")
98
- clear = gr.ClearButton([msg, hg_chatbot])
99
- cleardb = gr.Button(value="Réinitialiser le contexte")
100
- load_context.click(loading, None, loading_status, queue=False)
101
- load_context.click(build_context, inputs=[openai_key,pdf_docs, urls], outputs=[loading_status], queue=False)
102
- msg.submit(respond, [msg, hg_chatbot], [msg, hg_chatbot])
 
 
103
  cleardb.click(clear_chromadb)
104
-
105
  demo.queue(concurrency_count=3)
106
  demo.launch()
 
1
  import os
2
  import time
3
+
 
 
4
  import langchain
5
+ # loaders
 
6
  from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
7
+ # splits
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ # embeddings
10
+ from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
11
+ # vector stores
12
  from langchain.vectorstores import Chroma
13
+ # huggingface hub
14
+ from huggingface_hub import InferenceClient
15
+ from langchain import HuggingFaceHub
16
+ # models
17
+ from langchain.llms import OpenAI
18
+ # retrievers
19
  from langchain.chains import RetrievalQA
20
  import gradio as gr
21
 
22
+ HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
23
+
24
+ def build_context(openai_key,files,urls):
25
  if openai_key != "":
26
+ embeddings = OpenAIEmbeddings(model_name="text-embedding-ada-002", openai_api_key=openai_key)
 
 
 
27
  else:
 
28
  embeddings = HuggingFaceEmbeddings(
29
+ model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}
30
  )
 
 
 
 
 
 
 
 
31
  documents = []
32
  if files is not None:
33
+ print("files not none")
34
  for idx, file in enumerate(files):
35
  if file.name.endswith('.pdf'):
36
  loader = PyPDFLoader(file.name)
 
42
  loader = UnstructuredPowerPointLoader(file.name)
43
  documents.extend(loader.load())
44
  if urls != "":
45
+ print("urls not none")
46
  list_urls = urls.split(sep=",")
47
+ for url in list_urls:
48
  loader = OnlinePDFLoader(url)
49
  documents.extend(loader.load())
50
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=800,chunk_overlap=0,length_function=len,separators=["\n\n", "\n", " ", ""])
 
51
  chunked_documents = text_splitter.split_documents(documents)
52
  global vectordb
53
  vectordb = Chroma.from_documents(
54
  documents=chunked_documents,
55
+ embedding=embeddings,
56
+ )
57
+ return "loaded"
 
 
 
 
 
 
 
 
 
 
58
 
59
def llm_response(openai_key, message, chat_history):
    """Answer a user message with a RetrievalQA chain over the loaded documents.

    Uses OpenAI gpt-3.5-turbo when an API key is supplied, otherwise falls
    back to the free LaMini-Flan-T5-248M model on the Hugging Face Hub.

    Args:
        openai_key: OpenAI API key; empty string selects the Hub model.
        message: the user's question.
        chat_history: list of (user, bot) tuples maintained by gr.Chatbot.

    Returns:
        ("", chat_history): the empty string clears the input textbox; the
        history now includes the new (message, answer) pair.
    """
    # Guard against chatting before any documents were loaded: `vectordb` is a
    # module-level global created by build_context(); without it the retriever
    # construction below would raise NameError and crash the Gradio callback.
    if "vectordb" not in globals():
        chat_history.append((message, "Please load documents or URLs first."))
        return "", chat_history
    if openai_key != "":
        llm = OpenAI(
            temperature=0, openai_api_key=openai_key, model_name="gpt-3.5-turbo", verbose=False
        )
    else:
        llm = HuggingFaceHub(repo_id='MBZUAI/LaMini-Flan-T5-248M',
                             huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                             model_kwargs={"max_length": 512, "do_sample": True,
                                           "temperature": 0.2})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # alternatives: map_reduce, map_rerank, refine
        retriever=vectordb.as_retriever(search_kwargs={"k": 10}),
        # chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=False,
        verbose=True,
    )
    result = qa_chain(message)["result"]
    chat_history.append((message, result))
    time.sleep(2)  # brief pause so the chat UI update feels less abrupt
    return "", chat_history
79
+
80
def loading():
    """Return the placeholder text shown in the status box while documents load."""
    return "Loading..."
82
 
83
def clear_chromadb():
    """Remove every document from the global Chroma store so new files can be loaded.

    NOTE(review): relies on the private `_collection` attribute because this
    langchain Chroma wrapper exposes no public bulk-delete API — confirm on
    upgrade.
    """
    doc_ids = vectordb.get()["ids"]
    # Single batched delete instead of one round-trip per document; also stops
    # shadowing the builtin `id`, which the original loop variable did.
    if doc_ids:  # Chroma rejects an empty ids list
        vectordb._collection.delete(ids=doc_ids)
87
 
88
# Gradio UI: key entry, document/URL loading, and a retrieval-augmented chat.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        openai_key = gr.Textbox(label="Enter your OpenAI API Key if you want to use the gpt-3.5-turbo-16k model. If not, the open source LaMini-Flan-T5-248M is used")
    with gr.Row():
        pdf_docs = gr.Files(label="Load pdf files", file_types=['.pdf','.docx','.ppt','.pptx'], type="file")
        urls = gr.Textbox(label="Enter one of multiple online pdf urls (comma separated if multiple)")
    with gr.Row():
        load_docs = gr.Button("Load documents and urls", variant="primary", scale=1)
        loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
    # BUG FIX: the original `if loading_status == "loaded": gr.Info(...)`
    # compared the Textbox *component object* to a string at UI-build time —
    # always False, so the branch was dead code (and gr.Info only has effect
    # inside event handlers). The status textbox already displays "loaded",
    # so the dead branch is removed.
    with gr.Row():
        with gr.Column(scale=1):
            msg = gr.Textbox(label="User message")
            chatbot = gr.Chatbot()
            with gr.Row():
                clearchat = gr.ClearButton([msg, chatbot], value="New chat")
                cleardb = gr.Button(value="Reset context (for loading new documents)", variant="secondary")
    # Wiring: show "Loading..." immediately, then run the (slow) indexing job
    # which overwrites the status with build_context's return value.
    load_docs.click(loading, None, loading_status, queue=False)
    load_docs.click(build_context, inputs=[openai_key, pdf_docs, urls], outputs=[loading_status], queue=False)
    msg.submit(llm_response, [openai_key, msg, chatbot], [msg, chatbot])
    cleardb.click(clear_chromadb)

demo.queue(concurrency_count=3)
demo.launch()