Shreyas094 committed
Commit
462aa5d
1 Parent(s): c538f5c

Update app.py

Files changed (1)
  1. app.py +110 -470
app.py CHANGED
Old file (removed lines marked with -):
@@ -6,183 +6,39 @@ import pandas as pd
import requests
import random
import urllib.parse
- import spacy
- from sklearn.metrics.pairwise import cosine_similarity
- import numpy as np
- from typing import List, Dict
from tempfile import NamedTemporaryFile
from bs4 import BeautifulSoup
- from langchain.prompts import PromptTemplate
- from langchain.chains import LLMChain
- from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
- from langchain_core.output_parsers import StrOutputParser
from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.llms import HuggingFaceHub
from langchain_core.documents import Document
- from sentence_transformers import SentenceTransformer
- from llama_parse import LlamaParse
- from llama_cpp import Llama
- from llama_cpp_agent.llm_agent import LlamaCppAgent
- from llama_cpp_agent.messages_formatter import MessagesFormatterType
- from llama_cpp_agent.providers.llama_cpp_endpoint_provider import LlamaCppEndpointSettings
-

huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
- llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
-
- # Load SentenceTransformer model
- sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-
- def load_spacy_model():
-     try:
-         # Try to load the model
-         return spacy.load("en_core_web_sm")
-     except OSError:
-         # If loading fails, download the model
-         os.system("python -m spacy download en_core_web_sm")
-         # Try loading again
-         return spacy.load("en_core_web_sm")
-
- # Load spaCy model
- nlp = load_spacy_model()
-
- class EnhancedContextDrivenChatbot:
-     def __init__(self, history_size: int = 10, max_history_chars: int = 5000):
-         self.history = []
-         self.history_size = history_size
-         self.max_history_chars = max_history_chars
-         self.entity_tracker = {}
-         self.conversation_context = ""
-         self.model = None
-         self.last_instructions = None
-
-     def add_to_history(self, text: str):
-         self.history.append(text)
-         while len(' '.join(self.history)) > self.max_history_chars or len(self.history) > self.history_size:
-             self.history.pop(0)
-
-         # Update entity tracker
-         doc = nlp(text)
-         for ent in doc.ents:
-             if ent.label_ not in self.entity_tracker:
-                 self.entity_tracker[ent.label_] = set()
-             self.entity_tracker[ent.label_].add(ent.text)
-
-         # Update conversation context
-         self.conversation_context += f" {text}"
-         self.conversation_context = ' '.join(self.conversation_context.split()[-100:])  # Keep last 100 words
-
-     def get_context(self):
-         return self.conversation_context
-
-     def is_follow_up_question(self, question):
-         doc = nlp(question.lower())
-         follow_up_indicators = set(['it', 'this', 'that', 'these', 'those', 'he', 'she', 'they', 'them'])
-         return any(token.text in follow_up_indicators for token in doc) or question.strip().startswith("What about")

-     def extract_topics(self, text):
-         doc = nlp(text)
-         return [chunk.text for chunk in doc.noun_chunks]
-
-     def extract_instructions(self, text):
-         instruction_patterns = [
-             r"(.*?),?\s*(?:please\s+)?(provide\s+(?:me\s+)?a\s+.*?|give\s+(?:me\s+)?a\s+.*?|create\s+a\s+.*?)$",
-             r"(.*?),?\s*(?:please\s+)?(summarize|analyze|explain|describe|elaborate\s+on).*$",
-             r"(.*?),?\s*(?:please\s+)?(in\s+detail|briefly|concisely).*$",
-         ]
-
-         for pattern in instruction_patterns:
-             match = re.match(pattern, text, re.IGNORECASE)
-             if match:
-                 return match.group(1).strip(), match.group(2).strip()
-
-         return text, None
-
-     def get_most_relevant_context(self, question):
-         if not self.history:
-             return question
-
-         # Create a combined context from history
-         combined_context = self.get_context()
-
-         # Get embeddings
-         context_embedding = sentence_model.encode([combined_context])[0]
-         question_embedding = sentence_model.encode([question])[0]
-
-         # Calculate similarity
-         similarity = cosine_similarity([context_embedding], [question_embedding])[0][0]
-
-         # If similarity is high, it's likely a follow-up question
-         if similarity > 0.5:  # This threshold can be adjusted
-             return f"{combined_context} {question}"
-
-         # Otherwise, it might be a new topic
-         return question
-
-     def rephrase_query(self, question, instructions=None):
-         if not self.model:
-             return question  # Return original question if no model is available
-
-         instruction_prompt = f"Instructions: {instructions}\n" if instructions else ""
-
-         prompt = f"""
-         Given the conversation context, the current question, and any provided instructions, rephrase the question to include relevant context and rephrase it to more search-engine-friendly query:
-
-         Conversation context: {self.get_context()}
-         Current question: {question}
-         {instruction_prompt}
-         Rephrased question:
-         """
-
-         rephrased_question = generate_chunked_response(self.model, prompt)
-
-         return rephrased_question.strip()
-
-     def process_question(self, question):
-         core_question, instructions = self.extract_instructions(question)
-
-         if self.is_follow_up_question(core_question):
-             contextualized_question = self.get_most_relevant_context(core_question)
-             contextualized_question = self.rephrase_query(contextualized_question, instructions)
-         else:
-             contextualized_question = core_question
-
-         topics = self.extract_topics(contextualized_question)
-
-         self.add_to_history(question)
-         self.last_instructions = instructions
-
-         return contextualized_question, topics, self.entity_tracker, instructions
-
- # Initialize LlamaParse
- llama_parser = LlamaParse(
-     api_key=llama_cloud_api_key,
-     result_type="markdown",
-     num_workers=4,
-     verbose=True,
-     language="en",
- )

- def load_document(file: NamedTemporaryFile, parser: str = "pypdf") -> List[Document]:
-     """Loads and splits the document into pages."""
-     if parser == "pypdf":
-         loader = PyPDFLoader(file.name)
-         return loader.load_and_split()
-     elif parser == "llamaparse":
-         try:
-             documents = llama_parser.load_data(file.name)
-             return [Document(page_content=doc.text, metadata={"source": file.name}) for doc in documents]
-         except Exception as e:
-             print(f"Error using Llama Parse: {str(e)}")
-             print("Falling back to PyPDF parser")
-             loader = PyPDFLoader(file.name)
-             return loader.load_and_split()
-     else:
-         raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")

- def update_vectors(files, parser):
    if not files:
        return "Please upload at least one PDF file."
@@ -191,7 +47,7 @@ def update_vectors(files, parser):

    all_data = []
    for file in files:
-         data = load_document(file, parser)
        all_data.extend(data)
        total_chunks += len(data)

@@ -203,7 +59,7 @@

    database.save_local("faiss_database")

-     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."

def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
@@ -215,55 +71,6 @@ def clear_cache():
    else:
        return "No cache to clear."

- def get_model(temperature, top_p, repetition_penalty):
-     return HuggingFaceHub(
-         repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-         model_kwargs={
-             "temperature": temperature,
-             "top_p": top_p,
-             "repetition_penalty": repetition_penalty,
-             "max_length": 800
-         },
-         huggingfacehub_api_token=huggingface_token
-     )
-
- MAX_PROMPT_CHARS = 20000  # Adjust based on your model's limitations
-
- def chunk_text(text: str, max_chunk_size: int = 800) -> List[str]:
-     chunks = []
-     current_chunk = ""
-     for sentence in re.split(r'(?<=[.!?])\s+', text):
-         if len(current_chunk) + len(sentence) > max_chunk_size:
-             chunks.append(current_chunk.strip())
-             current_chunk = sentence
-         else:
-             current_chunk += " " + sentence
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-     return chunks
-
- def get_most_relevant_chunks(question: str, chunks: List[str], top_k: int = 3) -> List[str]:
-     question_embedding = sentence_model.encode([question])[0]
-     chunk_embeddings = sentence_model.encode(chunks)
-     similarities = cosine_similarity([question_embedding], chunk_embeddings)[0]
-     top_indices = np.argsort(similarities)[-top_k:]
-     return [chunks[i] for i in top_indices]
-
- def generate_chunked_response(model, prompt, max_tokens=800, max_chunks=5):
-     full_response = ""
-     for i in range(max_chunks):
-         try:
-             chunk = model(prompt + full_response, max_new_tokens=max_tokens)
-             chunk = chunk.strip()
-             if chunk.endswith((".", "!", "?")):
-                 full_response += chunk
-                 break
-             full_response += chunk
-         except Exception as e:
-             print(f"Error in generate_chunked_response: {e}")
-             break
-     return full_response.strip()
-
def extract_text_from_webpage(html):
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
@@ -289,8 +96,6 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
    all_results = []
    max_chars_per_page = 8000

-     print(f"Starting Google search for term: '{term}'")
-
    with requests.Session() as session:
        while start < num_results:
            try:
@@ -312,23 +117,19 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                    verify=ssl_verify,
                )
                resp.raise_for_status()
-                 print(f"Successfully retrieved search results page (start={start})")
            except requests.exceptions.RequestException as e:
                print(f"Error retrieving search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
-             result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
-                 print("No results found on this page")
                break

-             print(f"Found {len(result_block)} results on this page")
            for result in result_block:
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
-                     print(f"Processing link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
@@ -336,291 +137,130 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                        if len(visible_text) > max_chars_per_page:
                            visible_text = visible_text[:max_chars_per_page] + "..."
                        all_results.append({"link": link, "text": visible_text})
-                         print(f"Successfully extracted text from {link}")
                    except requests.exceptions.RequestException as e:
                        print(f"Error retrieving webpage content: {e}")
                        all_results.append({"link": link, "text": None})
                else:
-                     print("No link found for this result")
                    all_results.append({"link": None, "text": None})
            start += len(result_block)

-     print(f"Search completed. Total results: {len(all_results)}")
-
    if not all_results:
-         print("No search results found. Returning a default message.")
        return [{"link": None, "text": "No information found in the web search results."}]

    return all_results

- def estimate_tokens(text):
-     return len(text.split())
-
- def truncate_text(text, max_tokens):
-     words = text.split()
-     if len(words) <= max_tokens:
-         return text
-     return ' '.join(words[:max_tokens])
-
- def rerank_documents(query: str, documents: List[Document], top_k: int = 5) -> List[Document]:
-     query_embedding = sentence_model.encode([query])[0]
-     doc_embeddings = sentence_model.encode([doc.page_content for doc in documents])
-
-     similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
-
-     ranked_indices = similarities.argsort()[::-1][:top_k]
-     return [documents[i] for i in ranked_indices]
-
- def prepare_context(query: str, documents: List[Document], max_tokens: int) -> str:
-     reranked_docs = rerank_documents(query, documents)
-
-     context = ""
-     for doc in reranked_docs:
-         doc_content = f"Source: {doc.metadata.get('source', 'Unknown')}\nContent: {doc.page_content}\n\n"
-         if estimate_tokens(context + doc_content) > max_tokens:
-             break
-         context += doc_content
-
-     return truncate_text(context, max_tokens)
-
- # Initialize LlamaCppAgent
- def initialize_llama_cpp_agent():
-     main_model = LlamaCppEndpointSettings(
-         completions_endpoint_url="http://127.0.0.1:8080/completion"
-     )
-     llama_cpp_agent = LlamaCppAgent(
-         main_model,
-         debug_output=False,
-         system_prompt="You are an AI assistant designed to help with RAG tasks.",
-         predefined_messages_formatter_type=MessagesFormatterType.CHATML
-     )
-     return llama_cpp_agent
-
- # Modify the ask_question function to use LlamaCppAgent
- def ask_question(question, temperature, top_p, repetition_penalty, web_search, chatbot, user_instructions):
-     if not question:
-         return "Please enter a question."
-
-     llama_cpp_agent = initialize_llama_cpp_agent()
    model = get_model(temperature, top_p, repetition_penalty)
-
-     # Update the chatbot's model
-     chatbot.model = model

-     embed = get_embeddings()

    if os.path.exists("faiss_database"):
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
    else:
-         database = None
-
-     max_attempts = 3
-     max_input_tokens = 20000
-     max_output_tokens = 800
-
-     if web_search:
-         contextualized_question, topics, entity_tracker, _ = chatbot.process_question(question)

-         try:
-             search_results = google_search(contextualized_question, num_results=5)
-         except Exception as e:
-             print(f"Error in web search: {e}")
-             return f"I apologize, but I encountered an error while searching for information: {str(e)}"
-
-         all_answers = []
-
-         for attempt in range(max_attempts):
-             try:
-                 web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
-
-                 if not web_docs:
-                     return "I'm sorry, but I couldn't find any relevant information from the web search."
-
-                 if database is None:
-                     database = FAISS.from_documents(web_docs, embed)
-                 else:
-                     database.add_documents(web_docs)
-
-                 database.save_local("faiss_database")
-
-                 context_str = prepare_context(contextualized_question, web_docs, max_input_tokens // 2)
-
-                 instruction_prompt = f"User Instructions: {user_instructions}\n" if user_instructions else ""
-
-                 prompt_template = f"""
-                 Answer the question based on the following web search results, conversation context, entity information, and user instructions:
-                 Web Search Results:
-                 {{context}}
-                 Conversation Context: {{conv_context}}
-                 Current Question: {{question}}
-                 Topics: {{topics}}
-                 Entity Information: {{entities}}
-                 {instruction_prompt}
-                 Provide a concise and relevant answer to the question.
-                 """
-
-                 current_conv_context = truncate_text(chatbot.get_context(), max_input_tokens // 4)
-                 current_topics = topics[:5]
-                 current_entities = {k: list(v)[:3] for k, v in entity_tracker.items()}
-
-                 formatted_prompt = prompt_template.format(
-                     context=context_str,
-                     conv_context=current_conv_context,
-                     question=question,
-                     topics=", ".join(current_topics),
-                     entities=json.dumps(current_entities)
-                 )
-
-                 if estimate_tokens(formatted_prompt) > max_input_tokens:
-                     formatted_prompt = truncate_text(formatted_prompt, max_input_tokens)
-
-                 try:
-                     # Use LlamaCppAgent for initial response generation
-                     initial_response = llama_cpp_agent.get_chat_response(formatted_prompt, temperature=temperature)
-
-                     # Use generate_chunked_response for further refinement if needed
-                     full_response = generate_chunked_response(model, initial_response, max_tokens=max_output_tokens)
-
-                     answer = extract_answer(full_response, user_instructions)
-                     all_answers.append(answer)
-                     break
-                 except Exception as e:
-                     print(f"Error in response generation: {e}")
-                     if attempt == max_attempts - 1:
-                         all_answers.append(f"I apologize, but I encountered an error while generating the response. Please try again with a simpler question.")
-
-             except Exception as e:
-                 print(f"Error in ask_question (attempt {attempt + 1}): {e}")
-                 if attempt == max_attempts - 1:
-                     all_answers.append(f"I apologize, but an unexpected error occurred. Please try again with a different question or check your internet connection.")
-
-         answer = "\n\n".join(all_answers)
-         sources = set(doc.metadata['source'] for doc in web_docs)
-         sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
-         answer += sources_section
-
-         chatbot.add_to_history(answer)
-
-         return answer
-
-     else:  # PDF document chat
-         for attempt in range(max_attempts):
-             try:
-                 if database is None:
-                     return "No documents available. Please upload PDF documents to answer questions."
-
-                 retriever = database.as_retriever(search_kwargs={"k": 5})
-                 relevant_docs = retriever.get_relevant_documents(question)
-
-                 context_str = prepare_context(question, relevant_docs, max_input_tokens // 2)
-
-                 instruction_prompt = f"User Instructions: {user_instructions}\n" if user_instructions else ""
-
-                 prompt_template = f"""
-                 Answer the question based on the following context from the PDF document:
-                 Context:
-                 {{context}}
-                 Question: {{question}}
-                 {instruction_prompt}
-                 Provide a summarized and direct answer to the question.
-                 """
-
-                 formatted_prompt = prompt_template.format(context=context_str, question=question)
-
-                 if estimate_tokens(formatted_prompt) > max_input_tokens:
-                     formatted_prompt = truncate_text(formatted_prompt, max_input_tokens)
-
-                 try:
-                     # Use LlamaCppAgent for initial response generation
-                     initial_response = llama_cpp_agent.get_chat_response(formatted_prompt, temperature=temperature)
-
-                     # Use generate_chunked_response for further refinement if needed
-                     full_response = generate_chunked_response(model, initial_response, max_tokens=max_output_tokens)
-
-                     answer = extract_answer(full_response, user_instructions)
-                     return answer
-                 except Exception as e:
-                     print(f"Error in response generation: {e}")
-                     if attempt == max_attempts - 1:
-                         return f"I apologize, but I encountered an error while generating the response. Please try again with a simpler question."
-
-             except Exception as e:
-                 print(f"Error in ask_question (attempt {attempt + 1}): {e}")
-                 if attempt == max_attempts - 1:
-                     return f"I apologize, but an unexpected error occurred. Please try again with a different question."

-     return "An unexpected error occurred. Please try again later."


- def extract_answer(full_response, instructions=None):
-     answer_patterns = [
-         r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
-         r"Provide a concise and direct answer to the question:",
-         r"Provide a concise and relevant answer to the question.",
-         r"Answer:",
-         r"Provide a summarized and direct answer to the question.",
-         r"If the context doesn't contain relevant information, state that the information is not available in the document.",
-         r"Provide a summarized and direct answer to the original question without mentioning the web search or these instructions:",
-         r"Do not include any source information in your answer."
-     ]

-     for pattern in answer_patterns:
-         match = re.split(pattern, full_response, flags=re.IGNORECASE)
-         if len(match) > 1:
-             full_response = match[-1].strip()
-             break
-
-     # Remove any remaining instruction-like phrases
-     cleanup_patterns = [
-         r"without mentioning the web search or these instructions\.",
-         r"Do not include any source information in your answer\.",
-         r"If the context doesn't contain relevant information, state that the information is not available in the document\."
-     ]

-     for pattern in cleanup_patterns:
-         full_response = re.sub(pattern, "", full_response, flags=re.IGNORECASE).strip()
-
-     # Remove the user instructions if present
-     if instructions:
-         instruction_pattern = rf"User Instructions:\s*{re.escape(instructions)}.*?\n"
-         full_response = re.sub(instruction_pattern, "", full_response, flags=re.IGNORECASE | re.DOTALL)
-
-     return full_response.strip()

# Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# Enhanced PDF Document Chat and Web Search")

    with gr.Row():
        file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
-         parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="pypdf")
        update_button = gr.Button("Upload PDF")

    update_output = gr.Textbox(label="Update Status")
-     update_button.click(update_vectors, inputs=[file_input, parser_dropdown], outputs=update_output)

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
-             question_input = gr.Textbox(label="Ask a question")
-             instructions_input = gr.Textbox(label="Instructions for response (optional)", placeholder="Enter any specific instructions for the response here")
            submit_button = gr.Button("Submit")
        with gr.Column(scale=1):
-             temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
-             top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
-             repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
-             web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
-
-     enhanced_context_driven_chatbot = EnhancedContextDrivenChatbot()
-
-     # Update the chat function to use the modified ask_question function
-     def chat(question, history, temperature, top_p, repetition_penalty, web_search, user_instructions):
-         answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, enhanced_context_driven_chatbot, user_instructions)
-         history.append((question, answer))
-         return "", history

-     submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, instructions_input], outputs=[question_input, chatbot])
-
    clear_button = gr.Button("Clear Cache")
    clear_output = gr.Textbox(label="Cache Status")
    clear_button.click(clear_cache, inputs=[], outputs=clear_output)
New file (added lines marked with +, unchanged regions elided with ...):
import requests
import random
import urllib.parse
from tempfile import NamedTemporaryFile
+ from typing import List
from bs4 import BeautifulSoup
+ import logging
+
+ from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
+ from langchain.chains import LLMChain
+ from langchain.prompts import PromptTemplate

+ # Global variables
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

+ def get_model(temperature, top_p, repetition_penalty):
+     return HuggingFaceHub(
+         repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+         model_kwargs={
+             "temperature": temperature,
+             "top_p": top_p,
+             "repetition_penalty": repetition_penalty,
+             "max_length": 1000
+         },
+         huggingfacehub_api_token=huggingface_token
+     )

+ def load_document(file: NamedTemporaryFile) -> List[Document]:
+     loader = PyPDFLoader(file.name)
+     return loader.load_and_split()

+ def update_vectors(files):
    if not files:
        return "Please upload at least one PDF file."

...

    all_data = []
    for file in files:
+         data = load_document(file)
        all_data.extend(data)
        total_chunks += len(data)

...

    database.save_local("faiss_database")

+     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."

def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

...

    else:
        return "No cache to clear."

def extract_text_from_webpage(html):
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):

...

    all_results = []
    max_chars_per_page = 8000

    with requests.Session() as session:
        while start < num_results:
            try:

...

                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error retrieving search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
+             result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                break

            for result in result_block:
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()

...

                        if len(visible_text) > max_chars_per_page:
                            visible_text = visible_text[:max_chars_per_page] + "..."
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error retrieving webpage content: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    all_results.append({"link": None, "text": None})
            start += len(result_block)

    if not all_results:
        return [{"link": None, "text": "No information found in the web search results."}]

    return all_results

+ def duckduckgo_search(query):
+     # Implement DuckDuckGo search here
+     # This is a placeholder. You'll need to implement the actual DuckDuckGo search functionality
+     return [{"link": "https://example.com", "text": "Example search result from DuckDuckGo"}]
+
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     temperature,
+     top_p,
+     repetition_penalty,
+     max_tokens,
+     search_engine
+ ):
    model = get_model(temperature, top_p, repetition_penalty)

+     # Perform web search
+     if search_engine == "Google":
+         search_results = google_search(message)
+     else:
+         search_results = duckduckgo_search(message)

+     # Check if we have a FAISS database
    if os.path.exists("faiss_database"):
+         embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+         retriever = database.as_retriever()
+         relevant_docs = retriever.get_relevant_documents(message)
+         context_str = "\n".join([doc.page_content for doc in relevant_docs])
+
+         # Use the context in the prompt
+         prompt_template = f"""
+         Answer the question based on the following context and web search results:
+         Context from documents:
+         {context_str}
+
+         Web Search Results:
+         {{search_results}}
+
+         Question: {{message}}
+
+         If the context and web search results don't contain relevant information, state that the information is not available.
+         Provide a concise and direct answer to the question.
+         """
    else:
+         prompt_template = """
+         Answer the question based on the following web search results:
+         Web Search Results:
+         {search_results}

+         Question: {message}
+
+         If the web search results don't contain relevant information, state that the information is not available.
+         Provide a concise and direct answer to the question.
+         """

+     prompt = PromptTemplate(
+         input_variables=["search_results", "message"],
+         template=prompt_template
+     )

+     chain = LLMChain(llm=model, prompt=prompt)

+     search_results_text = "\n".join([f"- {result['text']}" for result in search_results if result['text']])
+     response = chain.run(search_results=search_results_text, message=message)
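LLMChain.run works here, but newer LangChain releases deprecate it in favor of runnable composition. An equivalent sketch under that API (an editor's note, not part of this commit; assumes a LangChain version where prompts and LLMs support the | operator):

# Editor's sketch: the same prompt/model call via runnable composition,
# which newer LangChain versions recommend over LLMChain.run.
chain = prompt | model
response = chain.invoke({
    "search_results": search_results_text,
    "message": message,
})  # for a plain LLM such as HuggingFaceHub, invoke() returns a string

Since invoke() returns the completion string directly, the sources section below could be appended the same way.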
 
 
 
 
 
 
 
 
 
+     # Add sources
+     sources = set(result["link"] for result in search_results if result["link"])
+     sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
+     response += sources_section

+     return response

# Gradio interface
+ demo = gr.Blocks()
+
+ with demo:
+     gr.Markdown("# Chat with your PDF documents and Web Search")

    with gr.Row():
        file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
        update_button = gr.Button("Upload PDF")

    update_output = gr.Textbox(label="Update Status")
+     update_button.click(update_vectors, inputs=[file_input], outputs=update_output)

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
+             message_input = gr.Textbox(label="Enter your message")
            submit_button = gr.Button("Submit")
        with gr.Column(scale=1):
+             temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
+             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
+             repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty")
+             max_tokens = gr.Slider(minimum=1, maximum=1000, value=500, step=1, label="Max tokens")
+             search_engine = gr.Dropdown(["DuckDuckGo", "Google"], value="DuckDuckGo", label="Search Engine")
+
+     submit_button.click(
+         respond,
+         inputs=[
+             message_input,
+             chatbot,
+             temperature,
+             top_p,
+             repetition_penalty,
+             max_tokens,
+             search_engine
+         ],
+         outputs=chatbot
+     )

    clear_button = gr.Button("Clear Cache")
    clear_output = gr.Textbox(label="Cache Status")
    clear_button.click(clear_cache, inputs=[], outputs=clear_output)
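One wiring detail worth noting: respond() returns a plain string, but submit_button.click sends that output to the gr.Chatbot component, which in this tuple-based Gradio format expects the full list of (user, assistant) message pairs. A small adapter in the spirit of the chat() helper that the old version used would restore that contract; the function below is an editor's sketch, not part of the commit:

# Editor's sketch: adapt respond()'s string return value to the
# list-of-(user, assistant)-tuples format that gr.Chatbot renders.
def chat(message, history, temperature, top_p, repetition_penalty, max_tokens, search_engine):
    answer = respond(message, history, temperature, top_p, repetition_penalty, max_tokens, search_engine)
    return history + [(message, answer)]

# and wire it up in place of respond:
# submit_button.click(chat, inputs=[message_input, chatbot, temperature, top_p,
#                                   repetition_penalty, max_tokens, search_engine],
#                     outputs=chatbot)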