Spaces
Runtime error
Update app.py
app.py CHANGED
@@ -1,55 +1,172 @@
-#
import gradio as gr
import os
-
from dotenv import load_dotenv
-from langchain_community.document_loaders import WhatsAppChatLoader
-from typing import List

-#
-

-#
HF_TOKEN = os.getenv("HF_TOKEN")

-# Initialize
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.1,
-    max_new_tokens=
)

-#
-def
-#
-
-
-

-#
-
-
-
-def answer_question(file, question: str) -> str:
-    # Load the chat content from the uploaded file
-    chat_content = load_chat_content(file)
-    #prompt="Your task is to generate answer according to {question} based on the given {chat_content}"
-    # Generate a response using the Hugging Face model
-    response = llm(chat_content + "\n\n" + question)
-    #response = llm(prompt)
-    return response
-
-# Define the Gradio interface
interface = gr.Interface(
-    fn=
-    inputs=[
-
-
-
-
-    title="WhatsApp Chat
-    description="Upload a WhatsApp chat
)

if __name__ == "__main__":
+#https://medium.com/@csakash03/hybrid-search-is-a-method-to-optimize-rag-implementation-98d9d0911341
+#https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6
+
import gradio as gr
+import zipfile
import os
+import re
+from pathlib import Path
+import chromadb
+from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
+from langchain_chroma import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+import hashlib
+import nltk
+from rank_bm25 import BM25Okapi
+import numpy as np
+from langchain.schema import Document
from dotenv import load_dotenv

+# Download the required NLTK data
+nltk.download('punkt')

+# Define embeddings using Hugging Face models
+embeddings = HuggingFaceEmbeddings()
+load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

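HuggingFaceEmbeddings() with no arguments falls back to the library's default sentence-transformers model, which is downloaded at startup. If a specific model is intended, it can be pinned explicitly; a minimal sketch (the model name here is illustrative, not taken from this commit):

from langchain_huggingface import HuggingFaceEmbeddings

# Hypothetical: pin the embedding model instead of relying on the library default
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")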
+# Initialize Chroma vector store
+persist_directory = "./chroma_langchain_db"
+client = chromadb.PersistentClient()
+collection = client.get_or_create_collection("whatsapp_collection")
+
+vector_store = Chroma(
+    collection_name="whatsapp_collection",
+    embedding_function=embeddings,
+    persist_directory=persist_directory,
+)
+
+# Define global variables
+bm25 = None
+all_texts = []
+processed_files = {}  # Dictionary to store hashes of processed files
+
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+    huggingfacehub_api_token=HF_TOKEN.strip(),
    temperature=0.1,
+    max_new_tokens=200
)

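Note that HF_TOKEN.strip() above raises AttributeError when the HF_TOKEN secret is missing, since os.getenv returns None; that is one plausible source of the Space's "Runtime error" status. A minimal guard, assuming the same variable name:

import os

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Fail early with a clear message instead of crashing on HF_TOKEN.strip()
    raise RuntimeError("HF_TOKEN is not set; add it as a secret in the Space settings.")
HF_TOKEN = HF_TOKEN.strip()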
+# Function to remove emojis and clean the text
+def clean_text(text):
+    # Remove emojis
+    text = re.sub(r'[^\x00-\x7F]+', '', text)
+    # Additional cleaning if necessary
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+# Function to compute a file hash for identifying duplicates
+def compute_file_hash(file_path):
+    hasher = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        buf = f.read()
+        hasher.update(buf)
+    return hasher.hexdigest()
+
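The emoji regex in clean_text removes every non-ASCII character, so it also drops non-Latin scripts, not just emojis. A quick usage sketch of the function as committed:

# Usage sketch: the ASCII filter drops emojis, but also any non-Latin script
print(clean_text("Hey 👋 are we meeting tomorrow? 😀"))  # -> "Hey are we meeting tomorrow?"
print(clean_text("नमस्ते दुनिया"))                        # -> "" (the whole message is removed)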
+# Function to process and upload the zip file to Chroma
+def process_and_upload_zip(zip_file):
+    global bm25, all_texts, processed_files
+
+    temp_dir = Path("temp")
+    temp_dir.mkdir(exist_ok=True)
+
+    # Compute hash to check if file has been processed
+    zip_file_hash = compute_file_hash(zip_file.name)
+
+    # If the file has been processed before, skip re-uploading
+    if zip_file_hash in processed_files:
+        return f"File '{zip_file.name}' already processed. Using existing Chroma storage."
+
+    # Extract the zip file
+    with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+        zip_ref.extractall(temp_dir)
+
+    # Load and clean the chat text
+    chat_files = list(temp_dir.glob("*.txt"))
+    metadata = []
+    all_texts = []
+
+    for chat_file in chat_files:
+        with open(chat_file, 'r', encoding='utf-8') as file:
+            page_content = file.read()
+
+        # Clean the text
+        clean_content = clean_text(page_content)
+
+        # Split the clean_content into chunks of 2500 characters with 200 overlap
+        chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)
+        chunks = chunk_splitter.split_text(clean_content)
+
+        for chunk_index, chunk in enumerate(chunks):
+            metadata.append({
+                "context": chunk,
+                "document_id": chat_file.stem,
+                "chunk_index": chunk_index
+            })
+            all_texts.append(chunk)
+
+    # Initialize BM25 for sparse retrieval
+    bm25 = BM25Okapi([doc.split() for doc in all_texts])
+
+    # Create dense embeddings and store in Chroma
+    chunk_embeddings = embeddings.embed_documents(all_texts)
+    ids = [f"{m['document_id']}_chunk_{m['chunk_index']}" for m in metadata]
+
+    documents = [Document(page_content=m["context"], metadata=m) for m in metadata]
+    vector_store.add_documents(documents=documents, ids=ids)
+
+    # Store the hash of the processed file to avoid reprocessing
+    processed_files[zip_file_hash] = zip_file.name
+
+    return "Data uploaded and stored in Chroma successfully."
+
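In process_and_upload_zip, chunk_embeddings is computed but never used: Chroma.add_documents re-embeds the documents through embedding_function. If the precomputed vectors were meant to be stored directly, one hedged alternative is to write them into the raw chromadb collection created above; this assumes that collection and the LangChain vector_store are meant to hold the same data, which the current code does not guarantee (they use different persist locations):

# Hypothetical alternative inside process_and_upload_zip, in place of add_documents:
collection.add(
    ids=ids,
    documents=all_texts,
    embeddings=chunk_embeddings,
    metadatas=metadata,
)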
+def hybrid_search(query):
+    global bm25, all_texts
+
+    # BM25 Sparse Retrieval
+    query_terms = query.split()
+    bm25_scores = bm25.get_scores(query_terms)
+    bm25_top_n_indices = np.argsort(bm25_scores)[::-1][:5]  # Top 5 results
+
+    sparse_results = [all_texts[i] for i in bm25_top_n_indices]
+
+    # Dense Retrieval using Chroma
+    dense_results = vector_store.similarity_search(query, k=5)
+
+    # Combine the results (you can enhance the combination logic here)
+    combined_results = sparse_results + [result.page_content for result in dense_results]
+
+    response = ""
+    for result in combined_results:
+        response += f"{result}\n\n"
+
+    return f"Hybrid Search Results:\n\n{response}"
+
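hybrid_search combines the two retrievers by simple concatenation, so duplicates are kept and the two rankings are never reconciled. A common refinement is reciprocal rank fusion (RRF); a minimal sketch, assuming the same sparse_results and dense_results produced above:

def rrf_fuse(sparse_results, dense_results, k=60, top_n=5):
    # Reciprocal rank fusion: each chunk scores 1/(k + rank) in every list it appears in
    scores = {}
    for ranked in (sparse_results, [doc.page_content for doc in dense_results]):
        for rank, text in enumerate(ranked):
            scores[text] = scores.get(text, 0.0) + 1.0 / (k + rank + 1)
    # Highest fused score first; duplicates collapse naturally on the text key
    return [text for text, _ in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)][:top_n]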
+# Gradio Interface for uploading and querying
+def query_interface(zip_file, query):
+    upload_status = process_and_upload_zip(zip_file)
+    search_results = hybrid_search(query)
+    prompt = (f"Here is a summary of WhatsApp chat contents based on the search for the query: '{query}'. "
+              f"The chat content includes important messages:\n\n"
+              f"{search_results}\n\n"
+              f"Now, based on this chat content, answer the following question as an expert. "
+              f"Please provide a complete and precise answer in **100 words**.\n\n"
+              f"Question: {query}")
+    response = llm.invoke(prompt)

+    # Generate answer using the LLM
+    return f"{upload_status}\n\n{search_results}", response
+
interface = gr.Interface(
+    fn=query_interface,
+    inputs=[gr.File(label="Upload WhatsApp Chat Zip File"), gr.Textbox(label="Enter your query")],
+    outputs=[
+        gr.Textbox(label="Chat Content"),  # To display the chat content
+        gr.Textbox(label="Generated Answer")  # To display the generated answer
+    ],
+    title="WhatsApp Chat Upload and Hybrid Search",
+    description="Upload a zip file containing WhatsApp chat data. This app processes the data and performs hybrid search with BM25 + Chroma."
)

if __name__ == "__main__":
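The hunk ends on the unchanged if __name__ == "__main__": context line; the launch call itself sits outside the diff. The conventional ending for a Gradio Space, shown only as an assumption about what follows:

if __name__ == "__main__":
    # Assumed continuation (not part of the shown hunk): start the Gradio app
    interface.launch()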