Update app.py
Browse files
app.py
CHANGED
@@ -107,6 +107,15 @@ class ModelManager:
|
|
107 |
|
108 |
model_manager = ModelManager()
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
# File Handling
|
111 |
class FileHandler:
|
112 |
@staticmethod
|
@@ -118,6 +127,14 @@ class FileHandler:
|
|
118 |
return FileHandler._extract_from_docx(file_path)
|
119 |
elif ext == '.txt':
|
120 |
return FileHandler._extract_from_txt(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
else:
|
122 |
raise ValueError(f"Unsupported file type: {ext}")
|
123 |
|
@@ -136,6 +153,46 @@ class FileHandler:
|
|
136 |
with open(file_path, 'r', encoding='utf-8') as f:
|
137 |
return f.read()
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# Text Processing
|
140 |
def simple_tokenize(text):
|
141 |
return text.split()
|
@@ -210,7 +267,9 @@ def optimize_query(
|
|
210 |
#print(llm.invoke('Hello'))
|
211 |
# Limit max time or set a timeout for LLM to avoid endless execution
|
212 |
optimized_queries = multi_query_retriever.invoke(query, max_time=30) # Timeout in seconds
|
213 |
-
|
|
|
|
|
214 |
return optimized_queries
|
215 |
|
216 |
|
@@ -1436,8 +1495,8 @@ def run_automated_tests_and_analyze(*args):
|
|
1436 |
'model_type': model_types,
|
1437 |
'model_name': [name.strip() for name in model_names.split(',')],
|
1438 |
'split_strategy': split_strategies,
|
1439 |
-
'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',')],
|
1440 |
-
'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',')],
|
1441 |
'vector_store_type': vector_store_types,
|
1442 |
'search_type': search_types,
|
1443 |
'top_k': [int(k.strip()) for k in top_k_values.split(',')],
|
|
|
107 |
|
108 |
model_manager = ModelManager()
|
109 |
|
110 |
+
# File Handling
|
111 |
+
import os
|
112 |
+
import json
|
113 |
+
import csv
|
114 |
+
import xml.etree.ElementTree as ET
|
115 |
+
import openpyxl # for handling .xlsx files
|
116 |
+
import pdfplumber
|
117 |
+
import docx
|
118 |
+
|
119 |
# File Handling
|
120 |
class FileHandler:
|
121 |
@staticmethod
|
|
|
127 |
return FileHandler._extract_from_docx(file_path)
|
128 |
elif ext == '.txt':
|
129 |
return FileHandler._extract_from_txt(file_path)
|
130 |
+
elif ext == '.xml':
|
131 |
+
return FileHandler._extract_from_xml(file_path)
|
132 |
+
elif ext == '.json':
|
133 |
+
return FileHandler._extract_from_json(file_path)
|
134 |
+
elif ext == '.xlsx':
|
135 |
+
return FileHandler._extract_from_xlsx(file_path)
|
136 |
+
elif ext == '.csv':
|
137 |
+
return FileHandler._extract_from_csv(file_path)
|
138 |
else:
|
139 |
raise ValueError(f"Unsupported file type: {ext}")
|
140 |
|
|
|
153 |
with open(file_path, 'r', encoding='utf-8') as f:
|
154 |
return f.read()
|
155 |
|
156 |
+
@staticmethod
|
157 |
+
def _extract_from_xml(file_path):
|
158 |
+
tree = ET.parse(file_path)
|
159 |
+
root = tree.getroot()
|
160 |
+
return FileHandler._extract_xml_text(root)
|
161 |
+
|
162 |
+
@staticmethod
|
163 |
+
def _extract_xml_text(element):
|
164 |
+
# Recursively extract text from XML elements
|
165 |
+
text = element.text or ""
|
166 |
+
for child in element:
|
167 |
+
text += FileHandler._extract_xml_text(child)
|
168 |
+
return text.strip()
|
169 |
+
|
170 |
+
@staticmethod
|
171 |
+
def _extract_from_json(file_path):
|
172 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
173 |
+
data = json.load(f)
|
174 |
+
return json.dumps(data, indent=4) # Pretty print JSON for readability
|
175 |
+
|
176 |
+
@staticmethod
def _extract_from_xlsx(file_path):
    """Extract text from the active sheet of an .xlsx workbook.

    Each row becomes one tab-joined line; cells that are None are
    skipped entirely (note: this shifts later cells left rather than
    leaving an empty column). Rows are joined with newlines.
    """
    sheet = openpyxl.load_workbook(file_path).active
    lines = [
        '\t'.join(str(cell) for cell in row if cell is not None)
        for row in sheet.iter_rows(values_only=True)
    ]
    return '\n'.join(lines)
|
184 |
+
|
185 |
+
@staticmethod
|
186 |
+
def _extract_from_csv(file_path):
|
187 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
188 |
+
reader = csv.reader(f)
|
189 |
+
data = []
|
190 |
+
for row in reader:
|
191 |
+
data.append(','.join(row))
|
192 |
+
return '\n'.join(data)
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
# Text Processing
|
197 |
def simple_tokenize(text):
    """Tokenize *text* by splitting on runs of whitespace.

    Returns a list of tokens; an empty or all-whitespace string
    yields an empty list (default str.split semantics).
    """
    tokens = text.split()
    return tokens
|
|
|
267 |
#print(llm.invoke('Hello'))
|
268 |
# Limit max time or set a timeout for LLM to avoid endless execution
|
269 |
optimized_queries = multi_query_retriever.invoke(query, max_time=30) # Timeout in seconds
|
270 |
+
print(optimized_queries)
|
271 |
+
print('---- optimize query 5 ----')
|
272 |
+
|
273 |
return optimized_queries
|
274 |
|
275 |
|
|
|
1495 |
'model_type': model_types,
|
1496 |
'model_name': [name.strip() for name in model_names.split(',')],
|
1497 |
'split_strategy': split_strategies,
|
1498 |
+
'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',') if size.strip()],
|
1499 |
+
'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',') if size.strip()],
|
1500 |
'vector_store_type': vector_store_types,
|
1501 |
'search_type': search_types,
|
1502 |
'top_k': [int(k.strip()) for k in top_k_values.split(',')],
|