pratikshahp committed
Commit 114aa22 · verified · 1 Parent(s): d99cffd

Update app.py

Files changed (1): app.py +155 -38
app.py CHANGED
@@ -1,55 +1,172 @@
- #Running fine:)
+ #https://medium.com/@csakash03/hybrid-search-is-a-method-to-optimize-rag-implementation-98d9d0911341
+ #https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6
+
  import gradio as gr
+ import zipfile
  import os
- from langchain_huggingface import HuggingFaceEndpoint
+ import re
+ from pathlib import Path
+ import chromadb
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
+ from langchain_chroma import Chroma
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ import hashlib
+ import nltk
+ from rank_bm25 import BM25Okapi
+ import numpy as np
+ from langchain.schema import Document
  from dotenv import load_dotenv
- from langchain_community.document_loaders import WhatsAppChatLoader
- from typing import List

- # Load environment variables
- load_dotenv()
+ # Download the required NLTK data
+ nltk.download('punkt')

- # Get Hugging Face API token
+ # Define embeddings using Hugging Face models
+ embeddings = HuggingFaceEmbeddings()
+ load_dotenv()
  HF_TOKEN = os.getenv("HF_TOKEN")

- # Initialize the HuggingFace model
+ # Initialize Chroma vector store
+ persist_directory = "./chroma_langchain_db"
+ client = chromadb.PersistentClient()
+ collection = client.get_or_create_collection("whatsapp_collection")
+
+ vector_store = Chroma(
+     collection_name="whatsapp_collection",
+     embedding_function=embeddings,
+     persist_directory=persist_directory,
+ )
+
+ # Define global variables
+ bm25 = None
+ all_texts = []
+ processed_files = {}  # Dictionary to store hashes of processed files
+
  llm = HuggingFaceEndpoint(
      repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-     huggingfacehub_api_token=HF_TOKEN,
+     huggingfacehub_api_token=HF_TOKEN.strip(),
      temperature=0.1,
-     max_new_tokens=300
+     max_new_tokens=200
  )

- # Load and process chat content
- def load_chat_content(file) -> str:
-     # Initialize the WhatsAppChatLoader with the uploaded file
-     loader = WhatsAppChatLoader(path=file.name)
-     raw_messages = loader.lazy_load()
-     messages = list(raw_messages)
-
-     # Combine all messages into a single string
-     chat_content = "\n".join([doc.page_content for doc in messages])
-     return chat_content
-
- def answer_question(file, question: str) -> str:
-     # Load the chat content from the uploaded file
-     chat_content = load_chat_content(file)
-     #prompt="Your task is to generate answer according to {question} based on the given {chat_content}"
-     # Generate a response using the Hugging Face model
-     response = llm(chat_content + "\n\n" + question)
-     #response = llm(prompt)
-     return response
-
- # Define the Gradio interface
+ # Function to remove emojis and clean the text
+ def clean_text(text):
+     # Remove emojis
+     text = re.sub(r'[^\x00-\x7F]+', '', text)
+     # Additional cleaning if necessary
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ # Function to compute a file hash for identifying duplicates
+ def compute_file_hash(file_path):
+     hasher = hashlib.md5()
+     with open(file_path, 'rb') as f:
+         buf = f.read()
+         hasher.update(buf)
+     return hasher.hexdigest()
+
+ # Function to process and upload the zip file to Chroma
+ def process_and_upload_zip(zip_file):
+     global bm25, all_texts, processed_files
+
+     temp_dir = Path("temp")
+     temp_dir.mkdir(exist_ok=True)
+
+     # Compute hash to check if file has been processed
+     zip_file_hash = compute_file_hash(zip_file.name)
+
+     # If the file has been processed before, skip re-uploading
+     if zip_file_hash in processed_files:
+         return f"File '{zip_file.name}' already processed. Using existing Chroma storage."
+
+     # Extract the zip file
+     with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+         zip_ref.extractall(temp_dir)
+
+     # Load and clean the chat text
+     chat_files = list(temp_dir.glob("*.txt"))
+     metadata = []
+     all_texts = []
+
+     for chat_file in chat_files:
+         with open(chat_file, 'r', encoding='utf-8') as file:
+             page_content = file.read()
+
+         # Clean the text
+         clean_content = clean_text(page_content)
+
+         # Split the clean_content into chunks of 2500 characters with 200 overlap
+         chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)
+         chunks = chunk_splitter.split_text(clean_content)
+
+         for chunk_index, chunk in enumerate(chunks):
+             metadata.append({
+                 "context": chunk,
+                 "document_id": chat_file.stem,
+                 "chunk_index": chunk_index
+             })
+             all_texts.append(chunk)
+
+     # Initialize BM25 for sparse retrieval
+     bm25 = BM25Okapi([doc.split() for doc in all_texts])
+
+     # Create dense embeddings and store in Chroma
+     chunk_embeddings = embeddings.embed_documents(all_texts)
+     ids = [f"{m['document_id']}_chunk_{m['chunk_index']}" for m in metadata]
+
+     documents = [Document(page_content=m["context"], metadata=m) for m in metadata]
+     vector_store.add_documents(documents=documents, ids=ids)
+
+     # Store the hash of the processed file to avoid reprocessing
+     processed_files[zip_file_hash] = zip_file.name
+
+     return "Data uploaded and stored in Chroma successfully."
+
+ def hybrid_search(query):
+     global bm25, all_texts
+
+     # BM25 Sparse Retrieval
+     query_terms = query.split()
+     bm25_scores = bm25.get_scores(query_terms)
+     bm25_top_n_indices = np.argsort(bm25_scores)[::-1][:5]  # Top 5 results
+
+     sparse_results = [all_texts[i] for i in bm25_top_n_indices]
+
+     # Dense Retrieval using Chroma
+     dense_results = vector_store.similarity_search(query, k=5)
+
+     # Combine the results (you can enhance the combination logic here)
+     combined_results = sparse_results + [result.page_content for result in dense_results]
+
+     response = ""
+     for result in combined_results:
+         response += f"{result}\n\n"
+
+     return f"Hybrid Search Results:\n\n{response}"
+
+ # Gradio Interface for uploading and querying
+ def query_interface(zip_file, query):
+     upload_status = process_and_upload_zip(zip_file)
+     search_results = hybrid_search(query)
+     prompt = (f"Here is a summary of WhatsApp chat contents based on the search for the query: '{query}'. "
+               f"The chat content includes important messages:\n\n"
+               f"{search_results}\n\n"
+               f"Now, based on this chat content, answer the following question as an expert. "
+               f"Please provide a complete and precise answer in **100 words**.\n\n"
+               f"Question: {query}")
+     response = llm.invoke(prompt)
+
+     # Generate answer using the LLM
+     return f"{upload_status}\n\n{search_results}", response
+
  interface = gr.Interface(
-     fn=answer_question,
-     inputs=[
-         gr.File(label="Upload WhatsApp Chat File"),
-         gr.Textbox(label="Ask a Question", placeholder="Enter your question here...")
-     ],
-     outputs="text",
-     title="WhatsApp Chat Q&A",
-     description="Upload a WhatsApp chat file and ask questions related to the chat content.",
+     fn=query_interface,
+     inputs=[gr.File(label="Upload WhatsApp Chat Zip File"), gr.Textbox(label="Enter your query")],
+     outputs=[
+         gr.Textbox(label="Chat Content"),     # To display the chat content
+         gr.Textbox(label="Generated Answer")  # To display the generated answer
+     ],
+     title="WhatsApp Chat Upload and Hybrid Search",
+     description="Upload a zip file containing WhatsApp chat data. This app processes the data and performs hybrid search with BM25 + Chroma."
  )

  if __name__ == "__main__":
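
A note on the embedding setup: the commit constructs HuggingFaceEmbeddings() with no arguments, so the embedding model is whatever the library defaults to. A minimal sketch of pinning it explicitly; the model name below is an illustrative choice, not something this commit specifies:

from langchain_huggingface import HuggingFaceEmbeddings

# Illustrative pin; the commit's bare HuggingFaceEmbeddings() falls back
# to the library's default embedding model instead.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")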
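A note on compute_file_hash: it reads the entire upload into memory before hashing. A chunked variant keeps memory flat for large zips; this is a sketch, not part of the commit, and the name compute_file_hash_chunked and the 64 KiB block size are illustrative:

import hashlib

def compute_file_hash_chunked(file_path, block_size=65536):
    # Hash the file in fixed-size blocks instead of one full read
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()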
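A note on the BM25 indexing: the commit downloads the NLTK 'punkt' model but then tokenizes with plain str.split(), so 'punkt' is never actually exercised. A self-contained sketch of how word_tokenize (which is what needs 'punkt') would plug into BM25Okapi; the two-message corpus is made up for illustration:

import nltk
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

nltk.download('punkt')

# Toy stand-in for the chat chunks (illustrative data only)
corpus = ["Alice: lunch at noon?", "Bob: noon works, see you there!"]
bm25 = BM25Okapi([word_tokenize(doc.lower()) for doc in corpus])

# Score a query with the same tokenizer
print(bm25.get_scores(word_tokenize("when is lunch?")))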
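A note on hybrid_search: it merges the two retrievers by simple concatenation, and the in-code comment says the combination logic can be enhanced. One common enhancement is reciprocal rank fusion. A minimal sketch, assuming both inputs are ranked lists of chunk strings; rrf_merge and the smoothing constant k=60 are illustrative choices, not from the commit:

def rrf_merge(sparse_results, dense_results, k=60, top_n=5):
    # Reciprocal rank fusion: each list contributes 1/(k + rank + 1) per
    # document, so chunks ranked high in both lists float to the top.
    scores = {}
    for ranked_list in (sparse_results, dense_results):
        for rank, doc in enumerate(ranked_list):
            scores[doc] = scores.get(doc, 0.0) + 1.0 / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

Inside hybrid_search this could replace the concatenation line with rrf_merge(sparse_results, [r.page_content for r in dense_results]), which also deduplicates chunks that both retrievers return.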