sreesh2804 committed on
Commit
c6b59c0
·
verified ·
1 Parent(s): 3af1614

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -16,6 +16,7 @@ from langchain.chains import RetrievalQA
16
  from langchain_google_genai import ChatGoogleGenerativeAI
17
  from PyPDF2 import PdfReader
18
  from gtts import gTTS
 
19
 
20
  temp_file_map = {}
21
 
@@ -43,7 +44,13 @@ vector_store = None
43
  file_id_map = {}
44
  temp_dir = "./temp_downloads"
45
  os.makedirs(temp_dir, exist_ok=True)
46
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
47
 
48
  # ✅ Get list of files from Google Drive
49
  def get_files_from_drive():
@@ -102,10 +109,13 @@ def process_documents(selected_files):
102
  # ✅ Dynamically adjust chunk size for efficiency
103
  if total_words < 1000:
104
  chunk_size, chunk_overlap = 500, 50 # Small
 
105
  elif total_words < 5000:
106
  chunk_size, chunk_overlap = 1000, 100 # Medium
 
107
  else:
108
  chunk_size, chunk_overlap = 2000, 200 # Large
 
109
 
110
  logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")
111
 
@@ -113,6 +123,10 @@ def process_documents(selected_files):
113
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
114
  split_docs = text_splitter.split_documents(docs)
115
 
 
 
 
 
116
  # ✅ Store efficiently in vector database
117
  vector_store = Chroma.from_documents(split_docs, embeddings)
118
 
@@ -120,10 +134,6 @@ def process_documents(selected_files):
120
 
121
 
122
  # ✅ Query document
123
-
124
-
125
-
126
-
127
  # ✅ Ensure temp_file_map exists
128
  temp_file_map = {}
129
 
@@ -215,4 +225,3 @@ with gr.Blocks() as demo:
215
  submit_button.click(query_document, inputs=user_input, outputs=[response_output, audio_output])
216
 
217
  demo.launch()
218
-
 
16
  from langchain_google_genai import ChatGoogleGenerativeAI
17
  from PyPDF2 import PdfReader
18
  from gtts import gTTS
19
+ from sentence_transformers import SentenceTransformer
20
 
21
  temp_file_map = {}
22
 
 
44
  file_id_map = {}
45
  temp_dir = "./temp_downloads"
46
  os.makedirs(temp_dir, exist_ok=True)
47
+
48
# ✅ Pick a sentence-transformers model by document-size category
def get_embedding_model(file_size_category):
    """Return the sentence-transformers model *name* for a size category.

    Args:
        file_size_category: One of "small", "medium", or "large" (anything
            else falls through to the large/fast model).

    Returns:
        str: A Hugging Face model id. The caller feeds this into
        HuggingFaceEmbeddings(model_name=...), which expects a string and
        loads the model itself — returning a loaded SentenceTransformer
        instance here would pass the wrong type downstream and load the
        model twice.
    """
    if file_size_category in ("small", "medium"):
        return "sentence-transformers/all-MiniLM-L6-v2"
    # NOTE(review): "all-MiniLM-L3-v2" may not exist on the Hub — the usual
    # 3-layer MiniLM is "paraphrase-MiniLM-L3-v2"; verify before shipping.
    return "sentence-transformers/all-MiniLM-L3-v2"
54
 
55
  # ✅ Get list of files from Google Drive
56
  def get_files_from_drive():
 
109
  # ✅ Dynamically adjust chunk size for efficiency
110
  if total_words < 1000:
111
  chunk_size, chunk_overlap = 500, 50 # Small
112
+ file_size_category = "small"
113
  elif total_words < 5000:
114
  chunk_size, chunk_overlap = 1000, 100 # Medium
115
+ file_size_category = "medium"
116
  else:
117
  chunk_size, chunk_overlap = 2000, 200 # Large
118
+ file_size_category = "large"
119
 
120
  logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")
121
 
 
123
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
124
  split_docs = text_splitter.split_documents(docs)
125
 
126
+ # ✅ Choose embedding model based on file size category
127
+ embedding_model = get_embedding_model(file_size_category)
128
+ embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
129
+
130
  # ✅ Store efficiently in vector database
131
  vector_store = Chroma.from_documents(split_docs, embeddings)
132
 
 
134
 
135
 
136
  # ✅ Query document
 
 
 
 
137
  # ✅ Ensure temp_file_map exists
138
  temp_file_map = {}
139
 
 
225
  submit_button.click(query_document, inputs=user_input, outputs=[response_output, audio_output])
226
 
227
  demo.launch()