Chris4K committed on
Commit c900cf0
1 Parent(s): 1ea5254

Update app.py

Files changed (1)
  1. app.py +62 -3
app.py CHANGED
@@ -107,6 +107,15 @@ class ModelManager:
 
 model_manager = ModelManager()
 
+# File Handling
+import os
+import json
+import csv
+import xml.etree.ElementTree as ET
+import openpyxl  # for handling .xlsx files
+import pdfplumber
+import docx
+
 # File Handling
 class FileHandler:
     @staticmethod
@@ -118,6 +127,14 @@ class FileHandler:
             return FileHandler._extract_from_docx(file_path)
         elif ext == '.txt':
             return FileHandler._extract_from_txt(file_path)
+        elif ext == '.xml':
+            return FileHandler._extract_from_xml(file_path)
+        elif ext == '.json':
+            return FileHandler._extract_from_json(file_path)
+        elif ext == '.xlsx':
+            return FileHandler._extract_from_xlsx(file_path)
+        elif ext == '.csv':
+            return FileHandler._extract_from_csv(file_path)
         else:
             raise ValueError(f"Unsupported file type: {ext}")
 
@@ -136,6 +153,46 @@ class FileHandler:
         with open(file_path, 'r', encoding='utf-8') as f:
             return f.read()
 
+    @staticmethod
+    def _extract_from_xml(file_path):
+        tree = ET.parse(file_path)
+        root = tree.getroot()
+        return FileHandler._extract_xml_text(root)
+
+    @staticmethod
+    def _extract_xml_text(element):
+        # Recursively extract text from XML elements
+        text = element.text or ""
+        for child in element:
+            text += FileHandler._extract_xml_text(child)
+        return text.strip()
+
+    @staticmethod
+    def _extract_from_json(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return json.dumps(data, indent=4)  # Pretty print JSON for readability
+
+    @staticmethod
+    def _extract_from_xlsx(file_path):
+        workbook = openpyxl.load_workbook(file_path)
+        sheet = workbook.active
+        data = []
+        for row in sheet.iter_rows(values_only=True):
+            data.append('\t'.join([str(cell) for cell in row if cell is not None]))
+        return '\n'.join(data)
+
+    @staticmethod
+    def _extract_from_csv(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            reader = csv.reader(f)
+            data = []
+            for row in reader:
+                data.append(','.join(row))
+        return '\n'.join(data)
+
+
+
 # Text Processing
 def simple_tokenize(text):
     return text.split()
@@ -210,7 +267,9 @@ def optimize_query(
     #print(llm.invoke('Hello'))
     # Limit max time or set a timeout for LLM to avoid endless execution
     optimized_queries = multi_query_retriever.invoke(query, max_time=30)  # Timeout in seconds
-
+    print(optimized_queries)
+    print('---- optimize query 5 ----')
+
     return optimized_queries
 
 
@@ -1436,8 +1495,8 @@ def run_automated_tests_and_analyze(*args):
         'model_type': model_types,
         'model_name': [name.strip() for name in model_names.split(',')],
         'split_strategy': split_strategies,
-        'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',')],
-        'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',')],
+        'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',') if size.strip()],
+        'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',') if size.strip()],
         'vector_store_type': vector_store_types,
         'search_type': search_types,
         'top_k': [int(k.strip()) for k in top_k_values.split(',')],
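
Of the new extraction helpers, _extract_xml_text is the least obvious: it recursively concatenates each element's text but does not collect tail text that follows a child's closing tag. A minimal standalone sketch of the same walk; the module-level function name and the sample XML document are made up for illustration and only the standard library is needed:

import xml.etree.ElementTree as ET

def extract_xml_text(element):
    # Concatenate this element's text with the text of all nested children.
    # Note: tail text (text appearing after a child's closing tag) is not collected,
    # mirroring the behavior of FileHandler._extract_xml_text in this commit.
    text = element.text or ""
    for child in element:
        text += extract_xml_text(child)
    return text.strip()

root = ET.fromstring("<doc><title>RAG notes</title><body>Chunking <b>matters</b></body></doc>")
print(extract_xml_text(root))  # -> RAG notesChunking matters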