Update app.py

app.py CHANGED
@@ -16,6 +16,7 @@ from langchain.chains import RetrievalQA
 from langchain_google_genai import ChatGoogleGenerativeAI
 from PyPDF2 import PdfReader
 from gtts import gTTS
+from sentence_transformers import SentenceTransformer
 
 temp_file_map = {}
 
@@ -43,7 +44,13 @@ vector_store = None
 file_id_map = {}
 temp_dir = "./temp_downloads"
 os.makedirs(temp_dir, exist_ok=True)
-
+
+# ✅ Define Sentence-Transformers for both models
+def get_embedding_model(file_size_category):
+    if file_size_category in ["small", "medium"]:
+        return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    else:
+        return SentenceTransformer('sentence-transformers/all-MiniLM-L3-v2')
 
 # ✅ Get list of files from Google Drive
 def get_files_from_drive():
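[Review note] As written, `get_embedding_model` returns a loaded `SentenceTransformer` instance, but the `@@ -113,6 +123,10 @@` hunk below passes its result to `HuggingFaceEmbeddings(model_name=...)`, which expects a checkpoint name string rather than a model object. A minimal string-returning sketch that keeps the PR's selection logic (the `paraphrase-MiniLM-L3-v2` fallback is an assumption, suggested only because an `all-MiniLM-L3-v2` checkpoint may not be available on the Hub):

    def get_embedding_model(file_size_category):
        # Return the checkpoint *name*; HuggingFaceEmbeddings loads it itself.
        if file_size_category in ("small", "medium"):
            return "sentence-transformers/all-MiniLM-L6-v2"
        # Assumption: a lighter 3-layer MiniLM for large corpora.
        return "sentence-transformers/paraphrase-MiniLM-L3-v2"

With this variant the `SentenceTransformer` import added in the first hunk is no longer needed.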
@@ -102,10 +109,13 @@ def process_documents(selected_files):
     # ✅ Dynamically adjust chunk size for efficiency
     if total_words < 1000:
         chunk_size, chunk_overlap = 500, 50  # Small
+        file_size_category = "small"
     elif total_words < 5000:
         chunk_size, chunk_overlap = 1000, 100  # Medium
+        file_size_category = "medium"
     else:
         chunk_size, chunk_overlap = 2000, 200  # Large
+        file_size_category = "large"
 
     logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")
 
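[Review note] The thresholds count words while `RecursiveCharacterTextSplitter` measures chunks in characters, so the buckets are heuristic rather than exact; overlap stays at 10% of chunk size in every branch, and the exhaustive if/elif/else guarantees `file_size_category` is always bound before it is used. A quick self-contained check of the bucketing (the sample word counts are made up for illustration):

    def size_bucket(total_words):
        # Mirrors the if/elif/else above: (chunk_size, chunk_overlap, category).
        if total_words < 1000:
            return 500, 50, "small"
        elif total_words < 5000:
            return 1000, 100, "medium"
        return 2000, 200, "large"

    assert size_bucket(800) == (500, 50, "small")
    assert size_bucket(3200) == (1000, 100, "medium")
    assert size_bucket(12000) == (2000, 200, "large")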
@@ -113,6 +123,10 @@ def process_documents(selected_files):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     split_docs = text_splitter.split_documents(docs)
 
+    # ✅ Choose embedding model based on file size category
+    embedding_model = get_embedding_model(file_size_category)
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
     # ✅ Store efficiently in vector database
     vector_store = Chroma.from_documents(split_docs, embeddings)
 
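[Review note] With the string-returning variant sketched earlier, this call site works unchanged, since `HuggingFaceEmbeddings(model_name=...)` takes the checkpoint name and loads the encoder itself. Because `process_documents` can run once per user action, caching the loaded embeddings avoids re-initializing the model on every call; a sketch under the assumption that the Space uses the community LangChain package (adjust the import to whatever version the Space pins):

    from functools import lru_cache
    from langchain_community.embeddings import HuggingFaceEmbeddings

    @lru_cache(maxsize=2)  # at most one instance per checkpoint name
    def load_embeddings(model_name):
        return HuggingFaceEmbeddings(model_name=model_name)

    # Usage inside process_documents:
    #     embeddings = load_embeddings(get_embedding_model(file_size_category))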
@@ -120,10 +134,6 @@ def process_documents(selected_files):
 
 
 # ✅ Query document
-
-
-
-
 # ✅ Ensure temp_file_map exists
 temp_file_map = {}
 
@@ -215,4 +225,3 @@ with gr.Blocks() as demo:
     submit_button.click(query_document, inputs=user_input, outputs=[response_output, audio_output])
 
 demo.launch()
-