# Extracted from a Hugging Face Space file view (Space status: Sleeping).
# File size: 6,101 bytes. Commits touching this file: e7cf3ff, 7842e72, 93949bf.
#https://medium.com/@csakash03/hybrid-search-is-a-method-to-optimize-rag-implementation-98d9d0911341
#https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6
import gradio as gr
import zipfile
import os
import re
from pathlib import Path
import chromadb
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_chroma import Chroma
# from langchain.textsplitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
import hashlib
import nltk
from rank_bm25 import BM25Okapi
import numpy as np
from langchain.schema import Document
from dotenv import load_dotenv
# Import-time setup: every statement below runs once when the module is loaded.
# Download the required NLTK data
# NOTE(review): no NLTK tokenizer call is visible in this file (the BM25 corpus
# is tokenized with str.split) — confirm this download is still needed.
nltk.download('punkt')
# Define embeddings using Hugging Face models
# No model name is passed, so the library's default configuration is used.
embeddings = HuggingFaceEmbeddings()
# Load variables from a .env file (if any) before HF_TOKEN is read below.
load_dotenv()
# Hugging Face API token; os.getenv returns None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Initialize Chroma vector store
persist_directory = "./chroma_langchain_db"
# NOTE(review): this client is created without a path, while the Chroma wrapper
# below is given persist_directory — confirm both are meant to use the same
# on-disk store. `collection` is not referenced again in the visible code.
client = chromadb.PersistentClient()
collection = client.get_or_create_collection("whatsapp_collection")
# LangChain wrapper used for adding documents and for dense similarity search.
vector_store = Chroma(
collection_name="whatsapp_collection",
embedding_function=embeddings,
persist_directory=persist_directory,
)
# Define global variables
# bm25: sparse index, rebuilt by process_and_upload_zip; None until the first upload.
bm25 = None
# all_texts: chunk strings in the same order as the BM25 corpus.
all_texts = []
# processed_files: maps an uploaded zip's MD5 hex digest to its file name.
processed_files = {} # Dictionary to store hashes of processed files
# Fail early with an actionable message: os.getenv returns None when HF_TOKEN
# is unset, and calling .strip() on None would raise a cryptic AttributeError.
if HF_TOKEN is None:
    raise RuntimeError(
        "HF_TOKEN is not set. Define it in the environment or in a .env file."
    )

# Hosted text-generation endpoint used to answer questions over the retrieved
# chat chunks. The token is stripped because values read from the environment
# may carry stray surrounding whitespace.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=HF_TOKEN.strip(),
    temperature=0.1,
    max_new_tokens=200,
)
# Function to remove emojis and clean the text
def clean_text(text):
    """Return *text* reduced to ASCII with whitespace collapsed.

    Every character outside the 7-bit ASCII range is dropped. This removes
    emojis, but also accented letters and any non-Latin script. Runs of
    whitespace are then squeezed to a single space and leading/trailing
    whitespace is removed.
    """
    # Encoding with errors="ignore" silently discards every non-ASCII character.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    # split() with no argument breaks on any whitespace run and yields no empty
    # edge pieces, so re-joining with one space both collapses and trims.
    return " ".join(ascii_only.split())
# Function to compute a file hash for identifying duplicates
def compute_file_hash(file_path, chunk_size=1 << 20):
    """Return the MD5 hex digest of the file at *file_path*.

    The digest is only used to recognise an archive that was already
    processed; it is not a security measure.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. The file is streamed
            in pieces of this size so large uploads are never held in memory
            in full.

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops on the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
# Function to process and upload the zip file to Chroma
def process_and_upload_zip(zip_file):
    """Index the chat transcripts of an uploaded zip for hybrid search.

    The archive is extracted into ``temp``. Every top-level ``*.txt`` file
    there is cleaned with ``clean_text`` and split into overlapping chunks.
    The chunks are used to rebuild the module-level BM25 index and are added
    to the Chroma vector store.

    Args:
        zip_file: The uploaded archive. Either an object whose ``name``
            attribute is the file path, or the path itself as a string.
            ``None`` means nothing was uploaded.

    Returns:
        A human-readable status message.

    Side effects:
        Rebinds the globals ``bm25`` and ``all_texts`` and records the
        archive's hash in ``processed_files``.
    """
    global bm25, all_texts, processed_files

    # Nothing uploaded: report it instead of failing on ``None.name``.
    if zip_file is None:
        return "No zip file provided."

    # Accept both an object exposing ``.name`` and a plain path string.
    zip_path = getattr(zip_file, "name", zip_file)

    temp_dir = Path("temp")
    temp_dir.mkdir(exist_ok=True)

    # Identical archives hash to the same digest and are skipped.
    zip_file_hash = compute_file_hash(zip_path)
    if zip_file_hash in processed_files:
        return f"File '{zip_path}' already processed. Using existing Chroma storage."

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # NOTE(review): ``temp`` is never emptied in the visible code, so *.txt
    # files extracted by earlier uploads are matched by the glob below too.
    # The splitter is built once and reused for every file.
    chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)
    metadata = []
    for chat_file in temp_dir.glob("*.txt"):
        with open(chat_file, 'r', encoding='utf-8') as file:
            clean_content = clean_text(file.read())
        metadata.extend(
            {
                "context": chunk,
                "document_id": chat_file.stem,
                "chunk_index": chunk_index,
            }
            for chunk_index, chunk in enumerate(chunk_splitter.split_text(clean_content))
        )

    # BM25Okapi divides by the corpus size, so an empty corpus would raise
    # ZeroDivisionError. Bail out and leave any existing index untouched.
    if not metadata:
        return "No chat text found in the uploaded zip file."

    # Sparse index: whitespace-tokenized chunks, in the same order as all_texts.
    all_texts = [m["context"] for m in metadata]
    bm25 = BM25Okapi([doc.split() for doc in all_texts])

    # Dense index: the vector store computes the embeddings itself when the
    # documents are added, so no separate embedding pass is needed here.
    ids = [f"{m['document_id']}_chunk_{m['chunk_index']}" for m in metadata]
    documents = [Document(page_content=m["context"], metadata=m) for m in metadata]
    vector_store.add_documents(documents=documents, ids=ids)

    # Remember the archive only after indexing succeeded.
    processed_files[zip_file_hash] = zip_path
    return "Data uploaded and stored in Chroma successfully."
def hybrid_search(query, top_k=5):
    """Retrieve chat chunks for *query* with BM25 and dense similarity search.

    Args:
        query: The user's search string; split on whitespace for BM25.
        top_k: Maximum number of chunks taken from each retriever.

    Returns:
        A string starting with ``"Hybrid Search Results:"`` followed by the
        retrieved chunks, each terminated by a blank line. BM25 hits come
        first, then dense hits; a chunk found by both is listed once.
    """
    # BM25 sparse retrieval. The index only exists after an upload has been
    # processed; without it, fall back to dense results alone.
    sparse_results = []
    if bm25 is not None and all_texts:
        bm25_scores = bm25.get_scores(query.split())
        # argsort is ascending, so reverse it to take the best scores first.
        top_indices = np.argsort(bm25_scores)[::-1][:top_k]
        sparse_results = [all_texts[i] for i in top_indices]

    # Dense retrieval using Chroma.
    dense_results = vector_store.similarity_search(query, k=top_k)
    dense_texts = [result.page_content for result in dense_results]

    # dict.fromkeys keeps first-seen order while dropping chunks that both
    # retrievers returned, so the prompt is not padded with repeated text.
    combined_results = list(dict.fromkeys(sparse_results + dense_texts))

    response = "".join(f"{result}\n\n" for result in combined_results)
    return f"Hybrid Search Results:\n\n{response}"
# Gradio Interface for uploading and querying
def query_interface(zip_file, query):
    """Gradio callback: index the upload, retrieve context, and ask the LLM.

    Args:
        zip_file: The uploaded archive, forwarded to ``process_and_upload_zip``.
        query: The user's question.

    Returns:
        A 2-tuple ``(chat_panel, answer)``: the upload status followed by the
        retrieved chunks, and the value returned by ``llm.invoke``.
    """
    status_message = process_and_upload_zip(zip_file)
    retrieved_context = hybrid_search(query)

    # Assemble the prompt from its pieces: framing, retrieved text, then the
    # instruction and the question itself.
    prompt_parts = [
        f"Here is a summary of WhatsApp chat contents based on the search for the query: '{query}'. ",
        f"The chat content includes important messages:\n\n",
        f"{retrieved_context}\n\n",
        f"Now, based on this chat content, answer the following question as an expert. ",
        f"Please provide a complete and precise answer in **100 words**.\n\n",
        f"Question: {query}",
    ]

    # Generate answer using the LLM
    llm_answer = llm.invoke("".join(prompt_parts))

    chat_panel = f"{status_message}\n\n{retrieved_context}"
    return chat_panel, llm_answer
# Gradio UI: a file upload and a query box in, two text panels out, all wired
# to query_interface.
interface = gr.Interface(
    fn=query_interface,
    inputs=[
        gr.File(label="Upload WhatsApp Chat Zip File"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs=[
        gr.Textbox(label="Chat Content"),  # To display the chat content
        gr.Textbox(label="Generated Answer"),  # To display the generated answer
    ],
    title="WhatsApp Chat Upload and Hybrid Search",
    description="Upload a zip file containing WhatsApp chat data. This app processes the data and performs hybrid search with BM25 + Chroma.",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()