Mahadih534 committed on
Commit b7011f1
1 Parent(s): a14dfbd

initial commit

Files changed (1)
  1. app.py +316 -0
app.py ADDED
@@ -0,0 +1,316 @@
+ from typing import Any, ClassVar, Dict
+ from llama_index.llms.huggingface import HuggingFaceInferenceAPI
+
+ from huggingface_hub import InferenceClient
+ from llama_index.core.base.llms.types import (
+     CompletionResponseGen,
+     CompletionResponse,
+ )
+
+
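+ # Custom LLM wrapper: HuggingFaceInferenceAPI handles the llama-index plumbing,
+ # while a bare huggingface_hub InferenceClient provides token-level streaming
+ # (details=True) for the chat UI defined further down.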
+ class CustomLLMInferenceWrapper(HuggingFaceInferenceAPI):
+
+     # Default text-generation parameters shared by every streamed completion.
+     # Annotated as ClassVar so the pydantic base class treats it as a constant,
+     # not a model field.
+     kwa: ClassVar[Dict[str, Any]] = dict(
+         temperature=0.2,
+         max_new_tokens=512,
+         top_p=0.95,
+         repetition_penalty=0.93,
+         do_sample=True,
+         seed=42,
+     )
+
+     def __init__(self, **kwargs: Any):
+         super().__init__(**kwargs)
+         model_name = kwargs.get("model_name")
+         # Replace the parent's client with one pointed straight at the model repo.
+         self._sync_client = InferenceClient(model=model_name)
+
+     def stream_complete(
+         self, prompt: str, formatted: bool = False, **kwargs: Any
+     ) -> CompletionResponseGen:
+         """Streaming completion endpoint."""
+         def gen() -> CompletionResponseGen:
+             for response in self._sync_client.text_generation(
+                 prompt, **self.kwa, stream=True, details=True, return_full_text=False
+             ):
+                 yield CompletionResponse(text=response.token.text, delta=response.token.text)
+         return gen()
+
+     def complete(
+         self, prompt: str, formatted: bool = False, **kwargs: Any
+     ) -> CompletionResponse:
+         return CompletionResponse(
+             text=self._sync_client.text_generation(
+                 prompt, **{"max_new_tokens": self.num_output, **kwargs}
+             )
+         )
+
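+ # Point the wrapper at Mixtral-8x7B-Instruct served via the HF Inference API.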
+ llm = CustomLLMInferenceWrapper(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
+
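+ # --- Embeddings: cache-backed MiniLM, bridged into llama-index ---
+ # CacheBackedEmbeddings memoises each chunk's vector in the LocalFileStore
+ # (keyed by namespace), so re-indexing the same page or PDF skips the model;
+ # LangchainEmbedding adapts the LangChain embedder to the llama-index interface.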
+ from langchain.storage import LocalFileStore
+ from langchain.embeddings import CacheBackedEmbeddings
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ from llama_index.core import VectorStoreIndex
+ from llama_index.embeddings.langchain import LangchainEmbedding
+ from torch import cuda
+
+
+ # On-disk cache for computed embedding vectors.
+ store = LocalFileStore("./CacheBackedEmbeddings/")
+
+ embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
+ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+ embed_model = HuggingFaceEmbeddings(
+     model_name=embed_model_id,
+     model_kwargs={'device': device},
+     encode_kwargs={'device': device, 'batch_size': 32}
+ )
+
+ cached_embedder = CacheBackedEmbeddings.from_bytes_store(
+     embed_model, store, namespace=embed_model_id)
+
+ emb_model = LangchainEmbedding(cached_embedder)
+
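+ # --- Global llama-index configuration ---
+ # Settings is the llama-index 0.10+ replacement for the older ServiceContext;
+ # both are populated here with the same chunking and context-window values.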
+ from llama_index.core import Settings
+ from llama_index.core.node_parser import SentenceSplitter
+ from llama_index.core import ServiceContext
+
+ service_context = ServiceContext.from_defaults(
+     llm=llm,
+     embed_model=emb_model,
+     node_parser=SentenceSplitter(chunk_size=512, chunk_overlap=20),
+     num_output=512,
+     context_window=3900,
+ )
+
+ Settings.embed_model = emb_model
+ Settings.llm = llm
+ Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+ Settings.num_output = 512
+ Settings.context_window = 3900
+
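+ # --- Prompt templates ---
+ # text_qa_template drives the first-pass answer; refine_template is applied when
+ # the query engine revisits an answer with more retrieved context. The
+ # [INST] ... [/INST] markers are the Mixtral-Instruct turn format.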
+ from llama_index.core import PromptTemplate
+
+ text_qa_template_str = (
+     "[INST] Context information is"
+     " below.\n---------------------\n{context_str}\n---------------------\nUsing"
+     " both the context information and also using your own knowledge, answer"
+     " the question: {query_str}\nIf the context isn't helpful, you can also"
+     " answer the question on your own.\n"
+     " Your name is DocuGenie, a large language model (LLM) based chatbot"
+     " assistant developed by Mahadi Hassan.\n"
+     " Tell every user that you can answer questions based on the content of"
+     " web pages and PDFs, acting as a helpful assistant.\n"
+     " Share your developer Mahadi Hassan's LinkedIn URL only when the user asks"
+     " about you, not in every answer:"
+     " https://www.linkedin.com/in/mahadise01 [/INST]"
+ )
+ text_qa_template = PromptTemplate(text_qa_template_str)
+
+ refine_template_str = (
+     "[INST] The original question is as follows: {query_str}\nWe have provided an"
+     " existing answer: {existing_answer}\nWe have the opportunity to refine"
+     " the existing answer (only if needed) with some more context"
+     " below.\n------------\n{context_msg}\n------------\nUsing both the new"
+     " context and your own knowledge, update or repeat the existing answer;"
+     " give an answer even if the context is not helpful.\n"
+     " Your name is DocuGenie, a large language model (LLM) based chatbot"
+     " assistant developed by Mahadi Hassan.\n"
+     " Tell every user that you can answer questions based on the content of"
+     " web pages and PDFs, acting as a helpful assistant.\n"
+     " Share your developer Mahadi Hassan's LinkedIn URL only when the user asks"
+     " about you, not in every answer:"
+     " https://www.linkedin.com/in/mahadise01 [/INST]"
+ )
+ refine_template = PromptTemplate(refine_template_str)
+
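+ # --- Ingestion: turn a URL or an uploaded PDF into a vector index ---
+ # Each submission builds a fresh in-memory VectorStoreIndex; nothing is
+ # persisted across restarts.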
+ import urllib.parse as urlParse
+ from llama_index.readers.web import SimpleWebPageReader
+ from llama_index.core import Document
+ from llama_index.readers.file import PDFReader
+ from pathlib import Path
+
+ def is_url(url):
+     return urlParse.urlparse(url).scheme != ""
+
+ def store_vector(fileOrLink):
+     new_docs = []
+     if is_url(fileOrLink):
+         # Web page: fetch and convert the HTML to plain text.
+         reader = SimpleWebPageReader(html_to_text=True)
+         docs = reader.load_data(urls=[fileOrLink])
+     else:
+         # Local upload: extract text from the PDF, one Document per page.
+         loader = PDFReader()
+         docs = loader.load_data(file=Path(fileOrLink))
+
+     # Re-wrap so only text and metadata are carried into the index.
+     for doc in docs:
+         new_docs.append(Document(text=doc.text, metadata=doc.metadata))
+
+     index = VectorStoreIndex.from_documents(new_docs, embed_model=emb_model)
+     return index
+
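+ # Minimal sketch of what the chat handler below does with such an index
+ # (commented out; "https://example.com" is a placeholder URL):
+ #
+ #     index = store_vector("https://example.com")
+ #     engine = index.as_query_engine(text_qa_template=text_qa_template,
+ #                                    refine_template=refine_template)
+ #     print(engine.query("What is this page about?"))
+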
+ title = "<span id='logo'></span>DocuGenie"
+
+ css = """
+ .gradio-container {
+     background: rgb(131,58,180);
+     background: linear-gradient(90deg, rgba(131,58,180,1) 0%, rgba(253,29,29,1) 50%, rgba(252,176,69,1) 100%);
+ }
+ #logo {
+     content: url('https://i.ibb.co/6vz9WjL/chat-bot.png');
+     width: 42px;
+     height: 42px;
+     margin-right: 10px;
+     margin-top: 3px;
+     display: inline-block;
+ }
+ #link {
+     color: #fff;
+     background-color: transparent;
+ }
+ """
+
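+ # --- Gradio UI: streaming chat panel plus a data-source panel (web or PDF) ---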
+ import gradio as gr
+ import urllib.request as urllib2
+ from bs4 import BeautifulSoup
+ from PIL import Image
+ import fitz  # PyMuPDF, used to render a PDF preview image
+ import uuid
+
+ # Per-session vector indexes, keyed by the session UUID created below.
+ qa_chain_store = {}
+
+
+ def predict(message, history, session_info):
+     session_id = session_info["session_id"]
+     index = qa_chain_store.get(session_id)
+     if index is None:
+         yield ("Hello, I am your helpful assistant. Please upload a PDF file or"
+                " insert a web link to start chatting with me.")
+         return
+     if len(message) == 0:
+         yield "Please ask a question related to your data."
+         return
+     query_engine = index.as_query_engine(streaming=True, text_qa_template=text_qa_template,
+                                          refine_template=refine_template, similarity_top_k=1)
+     streaming_response = query_engine.query(message)
+     partial_message = ""
+     # Accumulate tokens so the chat bubble grows as the answer streams in.
+     for text in streaming_response.response_gen:
+         partial_message += text
+         yield partial_message
+
+
+ def test(text):
+     gr.Info(text)  # gr.Info is shown as a toast; unlike gr.Error it is not raised
+
+
+ def processData(fileOrLink, session_info):
+     session_id = session_info["session_id"]
+     if is_url(fileOrLink):
+         index = store_vector(fileOrLink)
+         qa_chain_store[session_id] = index
+         return "Web page data split, embedded, and ready to be searched. Your session ID is " + session_id
+     else:
+         index = store_vector(fileOrLink.name)
+         qa_chain_store[session_id] = index
+         return "File split, embedded, and ready to be searched. Your session ID is " + session_id
+
+
+ def generatePdf_Image(file):
+     """Render the first PDF page to a PNG preview, or None if rendering fails."""
+     try:
+         doc = fitz.open(file.name)
+         pix = doc[0].get_pixmap(matrix=fitz.Identity, dpi=None,
+                                 colorspace=fitz.csRGB, clip=None, alpha=True, annots=True)
+         pix.save("samplepdfimag.png")
+         return Image.open("samplepdfimag.png")
+     except Exception:
+         return None
+
+
+ def getWebImage(link):
+     """Fetch the site's favicon as a thumbnail; fall back to a stock icon."""
+     try:
+         page = urllib2.urlopen(link)
+         soup = BeautifulSoup(page.read(), "html.parser")
+         icon_link = soup.find("link", rel="icon")
+         # Resolve relative favicon paths against the page URL.
+         icon_url = urlParse.urljoin(link, icon_link['href'])
+         icon = urllib2.urlopen(icon_url)
+         with open("test.ico", "wb") as f:
+             f.write(icon.read())
+         img = Image.open('test.ico')
+         img.save("test.png")
+         return img
+     except Exception:
+         urllib2.urlretrieve("https://cdn-icons-png.flaticon.com/512/5909/5909151.png", "icon.png")
+         return Image.open("icon.png")
+
+
+ def create_session_id():
+     return str(uuid.uuid4())
+
+ def addText(link):
+     return link
+
+ def submit_data(Section_text, text, raw_file, session_info):
+     if Section_text == "Chat With WEB":
+         return processData(text, session_info)
+     else:
+         return processData(raw_file, session_info)
+
+
+ def toggle(val):
+     if val == "Chat With WEB":
+         return {webPanel: gr.Column(visible=True),
+                 filePanel: gr.Column(visible=False)}
+     elif val == "Chat With .Pdf":
+         return {filePanel: gr.Column(visible=True),
+                 webPanel: gr.Column(visible=False)}
+
+ chatbot = gr.Chatbot(avatar_images=["https://i.ibb.co/kGd6XrM/user.png", "https://i.ibb.co/6vz9WjL/chat-bot.png"],
+                      bubble_full_width=False, show_label=False, show_copy_button=True, likeable=True)
+
+ with gr.Blocks(theme="soft", css=css) as demo:
+     session_info = gr.State(value={"session_id": create_session_id()})
+     with gr.Row():
+         with gr.Column(scale=1, min_width=800):
+             chatui = gr.ChatInterface(
+                 predict,
+                 title=title,
+                 chatbot=chatbot,
+                 additional_inputs=[session_info],
+                 submit_btn="Send")
+         with gr.Column(scale=1, min_width=400):
+             select = gr.Radio(["Chat With WEB", "Chat With .Pdf"], info="You can chat with a web page or a PDF file",
+                               label="Please Select a Data Source")
+             with gr.Column(visible=False) as webPanel:
+                 with gr.Row(equal_height=True, variant='compact'):
+                     text = gr.Textbox(scale=2, placeholder="Enter Website link")
+                     btnAdd = gr.Button("+ Add Link", scale=1)
+                 show = gr.Textbox(label="Your Selected Web Link", show_copy_button=True)
+                 imgWeb = gr.Image(interactive=False, height=80, width=100)
+
+             with gr.Column(visible=False) as filePanel:
+                 imgFile = gr.Image(interactive=False)
+                 raw_file = gr.File(label="Your PDFs")
+
+             clearBtn = gr.ClearButton(components=[imgFile, raw_file, show, imgWeb, text])
+             submit = gr.Button("Submit Data to ChatBot")
+             outT = gr.Textbox()
+
+     # Wire up the panel toggle, link preview, PDF preview, and indexing.
+     select.change(fn=toggle, inputs=[select], outputs=[webPanel, filePanel])
+     btnAdd.click(fn=addText, inputs=[text], outputs=[show]).success(fn=getWebImage, inputs=[text], outputs=[imgWeb])
+     raw_file.change(fn=generatePdf_Image, inputs=[raw_file], outputs=[imgFile])
+     submit.click(fn=submit_data, inputs=[select, text, raw_file, session_info], outputs=[outT])
+
+ if __name__ == "__main__":
+     demo.queue().launch(debug=True)  # launch the app