Upload folder using huggingface_hub
- .gitattributes +1 -0
- __pycache__/advanced_rag.cpython-311.pyc +0 -0
- __pycache__/test_filename_generation.cpython-311.pyc +0 -0
- advanced_rag.py +110 -10
- advanced_rag_updated.py +245 -0
- temp_import.py +2 -0
- test_filename_generation.py +92 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+__pycache__/advanced_rag.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
__pycache__/advanced_rag.cpython-311.pyc CHANGED
Binary files a/__pycache__/advanced_rag.cpython-311.pyc and b/__pycache__/advanced_rag.cpython-311.pyc differ
__pycache__/test_filename_generation.cpython-311.pyc ADDED
Binary file (5.14 kB)
advanced_rag.py CHANGED
@@ -105,8 +105,13 @@ def process_batch_query(query, model_choice, max_tokens, param_configs, slider_v
                             "Progress": f"Query {current}/{total_combinations}"
                         })
 
-    # Format results with CSV file links
-    formatted_results, csv_path = format_batch_result_files(
+    # Format results with CSV file links - UPDATED TO PASS ADDITIONAL PARAMETERS
+    formatted_results, csv_path = format_batch_result_files(
+        results, job_id,
+        embedding_model=getattr(rag_chain, 'embedding_model', 'unknown'),
+        llm_model=model_choice,
+        param_variations=param_configs
+    )
 
     return (
         formatted_results,
@@ -1388,10 +1393,64 @@ def format_response(response: str) -> str:
 
 def reset_app_updated():
     global rag_chain
+
+    # Properly clean up the existing vector database components
+    if hasattr(rag_chain, 'vector_store'):
+        try:
+            del rag_chain.vector_store
+        except:
+            pass
+
+    if hasattr(rag_chain, 'faiss_retriever'):
+        try:
+            del rag_chain.faiss_retriever
+        except:
+            pass
+
+    if hasattr(rag_chain, 'bm25_retriever'):
+        try:
+            del rag_chain.bm25_retriever
+        except:
+            pass
+
+    if hasattr(rag_chain, 'ensemble_retriever'):
+        try:
+            del rag_chain.ensemble_retriever
+        except:
+            pass
+
+    # Clear data references
+    if hasattr(rag_chain, 'raw_data'):
+        rag_chain.raw_data = None
+    if hasattr(rag_chain, 'split_data'):
+        rag_chain.split_data = None
+    if hasattr(rag_chain, 'context'):
+        rag_chain.context = ""
+    if hasattr(rag_chain, 'conversation_history'):
+        rag_chain.conversation_history = []
+
+    # Clear other components
+    if hasattr(rag_chain, 'text_splitter'):
+        try:
+            del rag_chain.text_splitter
+        except:
+            pass
+
+    if hasattr(rag_chain, 'elevated_rag_chain'):
+        try:
+            del rag_chain.elevated_rag_chain
+        except:
+            pass
+
+    # Create a new instance
     rag_chain = ElevatedRagChain()
-
+
+    # Force garbage collection to free memory
+    gc.collect()
+
+    debug_print("App reset successfully. Vector database and all components cleaned up.")
     return (
-        "App reset successfully. You can now load new files",
+        "App reset successfully. Vector database and all components cleaned up. You can now load new files",
         "",
         "Model used: Not selected"
     )
@@ -2172,12 +2231,51 @@ https://www.gutenberg.org/ebooks/8438.txt.utf-8
         outputs=[csv_download_html_batch, csv_download_file_batch, csv_file_info_df_batch]
     )
 
-def create_csv_from_batch_results(results: List[Dict], job_id: str) -> str:
+def create_csv_from_batch_results(results: List[Dict], job_id: str,
+                                  embedding_model: str = None, llm_model: str = None,
+                                  param_variations: Dict = None) -> str:
     """Create a CSV file from batch query results and return the file path"""
     # Save CSV files in the current directory for HuggingFace Spaces compatibility
-
+
+    # Create a descriptive filename
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
+
+    # Extract short names for filename
+    def get_short_name(full_name, prefix_length=2):
+        """Extract short name from full model name"""
+        if not full_name:
+            return "unknown"
+        # Remove emojis and get the actual model name
+        clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
+        # Get first few characters and last few characters
+        if len(clean_name) > 8:
+            return clean_name[:4] + clean_name[-4:]
+        return clean_name
+
+    def get_param_variation_name(param_configs):
+        """Get the parameter that was varied"""
+        if not param_configs:
+            return "const"
+
+        varied_params = []
+        for param, config in param_configs.items():
+            if config != "Constant":
+                # Extract the number from "Whole range X values"
+                if "values" in config:
+                    num_values = config.split()[2] if len(config.split()) > 2 else "X"
+                    varied_params.append(f"{param}_{num_values}")
+
+        if not varied_params:
+            return "const"
+        return "_".join(varied_params)
+
+    # Build filename components
+    embedding_short = get_short_name(embedding_model) if embedding_model else "emb"
+    llm_short = get_short_name(llm_model) if llm_model else "llm"
+    param_short = get_param_variation_name(param_variations) if param_variations else "const"
+
+    # Create filename: batch_embedding_llm_params_timestamp.csv
+    csv_filename = f"batch_{embedding_short}_{llm_short}_{param_short}_{timestamp}.csv"
     csv_path = os.path.abspath(csv_filename)
 
     # Extract parameters and responses
@@ -2233,10 +2331,12 @@ def create_csv_from_batch_results(results: List[Dict], job_id: str) -> str:
 
     return csv_path
 
-def format_batch_result_files(results: List[Dict], job_id: str
+def format_batch_result_files(results: List[Dict], job_id: str,
+                              embedding_model: str = None, llm_model: str = None,
+                              param_variations: Dict = None) -> Tuple[str, str]:
     """Format batch results with links to CSV files"""
-    # Create CSV file
-    csv_path = create_csv_from_batch_results(results, job_id)
+    # Create CSV file with improved filename
+    csv_path = create_csv_from_batch_results(results, job_id, embedding_model, llm_model, param_variations)
 
     # Format the results
     formatted_results = "### Batch Query Results\n\n"
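For reference, the filename scheme introduced in create_csv_from_batch_results can be exercised on its own. The sketch below is a condensed standalone copy of the two nested helpers from the hunk above, applied to made-up inputs; the model names and parameter configs here are illustrative only, not taken from the Space's option lists.

import datetime

def get_short_name(full_name):
    # Condensed copy of the helper nested in create_csv_from_batch_results above.
    if not full_name:
        return "unknown"
    clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
    return (clean_name[:4] + clean_name[-4:]) if len(clean_name) > 8 else clean_name

def get_param_variation_name(param_configs):
    # Condensed copy: names every parameter whose config is not "Constant".
    if not param_configs:
        return "const"
    varied = []
    for param, config in param_configs.items():
        if config != "Constant" and "values" in config:
            parts = config.split()
            varied.append(f"{param}_{parts[2] if len(parts) > 2 else 'X'}")
    return "_".join(varied) or "const"

# Illustrative inputs only
embedding_model = "BAAI/bge-base-en-v1.5"
llm_model = "mistral-small-latest"
param_configs = {"temperature": "Whole range 3 values", "top_p": "Constant"}

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"batch_{get_short_name(embedding_model)}_{get_short_name(llm_model)}_{get_param_variation_name(param_configs)}_{timestamp}.csv"
print(csv_filename)  # e.g. batch_BAAIv1.5_misttest_temperature_3_20250101_120000.csv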
advanced_rag_updated.py ADDED (new file, 245 lines)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import datetime
import functools
import traceback
from typing import List, Optional, Any, Dict, Tuple
import csv
import pandas as pd
import tempfile
import shutil
import glob

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

# Other LangChain and community imports
from langchain_community.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.embeddings.base import Embeddings
from langchain.retrievers import EnsembleRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser, Document
from langchain_core.runnables import RunnableParallel, RunnableLambda
from transformers.quantizers.auto import AutoQuantizationConfig
import gradio as gr
from pydantic import PrivateAttr
import pydantic

from langchain.llms.base import LLM
from typing import Any, Optional, List
import typing
import time
import re
import requests
from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader  # Updated loader
import tempfile
import mimetypes
import gc

# Add batch processing helper functions
def generate_parameter_values(min_val, max_val, num_values):
    """Generate evenly spaced values between min and max"""
    if num_values == 1:
        return [min_val]
    step = (max_val - min_val) / (num_values - 1)
    return [min_val + (step * i) for i in range(num_values)]

def process_batch_query(query, model_choice, max_tokens, param_configs, slider_values, job_id, use_history=True):
    """Process a batch of queries with different parameter combinations"""
    results = []

    # Generate all parameter combinations
    temp_values = [slider_values['temperature']] if param_configs['temperature'] == "Constant" else generate_parameter_values(0.1, 1.0, int(param_configs['temperature'].split()[2]))
    top_p_values = [slider_values['top_p']] if param_configs['top_p'] == "Constant" else generate_parameter_values(0.1, 0.99, int(param_configs['top_p'].split()[2]))
    top_k_values = [slider_values['top_k']] if param_configs['top_k'] == "Constant" else generate_parameter_values(1, 100, int(param_configs['top_k'].split()[2]))
    bm25_values = [slider_values['bm25']] if param_configs['bm25'] == "Constant" else generate_parameter_values(0.0, 1.0, int(param_configs['bm25'].split()[2]))

    total_combinations = len(temp_values) * len(top_p_values) * len(top_k_values) * len(bm25_values)
    current = 0

    for temp in temp_values:
        for top_p in top_p_values:
            for top_k in top_k_values:
                for bm25 in bm25_values:
                    current += 1
                    try:
                        # Update parameters
                        rag_chain.temperature = temp
                        rag_chain.top_p = top_p
                        rag_chain.top_k = top_k
                        rag_chain.bm25_weight = bm25
                        rag_chain.faiss_weight = 1.0 - bm25

                        # Update ensemble retriever
                        rag_chain.ensemble_retriever = EnsembleRetriever(
                            retrievers=[rag_chain.bm25_retriever, rag_chain.faiss_retriever],
                            weights=[rag_chain.bm25_weight, rag_chain.faiss_weight]
                        )

                        # Process query
                        response = rag_chain.elevated_rag_chain.invoke({"question": query})

                        # Store response in history if enabled
                        if use_history:
                            trimmed_response = response[:1000] + ("..." if len(response) > 1000 else "")
                            rag_chain.conversation_history.append({"query": query, "response": trimmed_response})

                        # Format result
                        result = {
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": response,
                            "Progress": f"Query {current}/{total_combinations}"
                        }
                        results.append(result)

                    except Exception as e:
                        results.append({
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": f"Error: {str(e)}",
                            "Progress": f"Query {current}/{total_combinations}"
                        })

    # Format results with CSV file links - UPDATED TO PASS ADDITIONAL PARAMETERS
    formatted_results, csv_path = format_batch_result_files(
        results, job_id,
        embedding_model=getattr(rag_chain, 'embedding_model', 'unknown'),
        llm_model=model_choice,
        param_variations=param_configs
    )

    return (
        formatted_results,
        csv_path,
        f"Job ID: {job_id}",
        f"Input tokens: {count_tokens(query)}",
        f"Output tokens: {sum(count_tokens(r['Response']) for r in results)}"
    )

# ... (rest of the file content would go here, but I'll focus on the specific functions that need updating)

def create_csv_from_batch_results(results: List[Dict], job_id: str,
                                  embedding_model: str = None, llm_model: str = None,
                                  param_variations: Dict = None) -> str:
    """Create a CSV file from batch query results and return the file path"""
    # Save CSV files in the current directory for HuggingFace Spaces compatibility

    # Create a descriptive filename
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Extract short names for filename
    def get_short_name(full_name, prefix_length=2):
        """Extract short name from full model name"""
        if not full_name:
            return "unknown"
        # Remove emojis and get the actual model name
        clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
        # Get first few characters and last few characters
        if len(clean_name) > 8:
            return clean_name[:4] + clean_name[-4:]
        return clean_name

    def get_param_variation_name(param_configs):
        """Get the parameter that was varied"""
        if not param_configs:
            return "const"

        varied_params = []
        for param, config in param_configs.items():
            if config != "Constant":
                # Extract the number from "Whole range X values"
                if "values" in config:
                    num_values = config.split()[2] if len(config.split()) > 2 else "X"
                    varied_params.append(f"{param}_{num_values}")

        if not varied_params:
            return "const"
        return "_".join(varied_params)

    # Build filename components
    embedding_short = get_short_name(embedding_model) if embedding_model else "emb"
    llm_short = get_short_name(llm_model) if llm_model else "llm"
    param_short = get_param_variation_name(param_variations) if param_variations else "const"

    # Create filename: batch_embedding_llm_params_timestamp.csv
    csv_filename = f"batch_{embedding_short}_{llm_short}_{param_short}_{timestamp}.csv"
    csv_path = os.path.abspath(csv_filename)

    # Extract parameters and responses
    data = []
    start_time = time.time()
    for result in results:
        params = result["Parameters"]
        response = result["Response"]
        progress = result["Progress"]

        # Calculate elapsed time for this query
        current_time = time.time()
        elapsed_time = current_time - start_time

        # Extract individual parameter values
        temp = float(re.search(r"Temp: ([\d.]+)", params).group(1))
        top_p = float(re.search(r"Top-p: ([\d.]+)", params).group(1))
        top_k = int(re.search(r"Top-k: (\d+)", params).group(1))
        bm25 = float(re.search(r"BM25: ([\d.]+)", params).group(1))

        # Extract response components
        model_info = re.search(r"Model: (.*?)\n", response)
        model = model_info.group(1) if model_info else "Unknown"

        # Extract main answer (everything between the parameters and the token counts)
        answer_match = re.search(r"Model Parameters:.*?\n\n(.*?)\n\n---", response, re.DOTALL)
        main_answer = answer_match.group(1).strip() if answer_match else response

        # Extract token counts
        input_tokens = re.search(r"Input tokens: (\d+)", response)
        output_tokens = re.search(r"Output tokens: (\d+)", response)

        # Extract conversation history count
        conv_history = re.search(r"Conversation History: (\d+) conversation", response)

        data.append({
            "Temperature": temp,
            "Top-p": top_p,
            "Top-k": top_k,
            "BM25 Weight": bm25,
            "Model": model,
            "Main Answer": main_answer,
            "Input Tokens": input_tokens.group(1) if input_tokens else "N/A",
            "Output Tokens": output_tokens.group(1) if output_tokens else "N/A",
            "Conversation History": conv_history.group(1) if conv_history else "0",
            "Progress": progress,
            "Elapsed Time (s)": f"{elapsed_time:.2f}"
        })

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)

    return csv_path

def format_batch_result_files(results: List[Dict], job_id: str,
                              embedding_model: str = None, llm_model: str = None,
                              param_variations: Dict = None) -> Tuple[str, str]:
    """Format batch results with links to CSV files"""
    # Create CSV file with improved filename
    csv_path = create_csv_from_batch_results(results, job_id, embedding_model, llm_model, param_variations)

    # Format the results
    formatted_results = "### Batch Query Results\n\n"

    # Add the actual results
    for result in results:
        formatted_results += f"#### {result['Parameters']}\n"
        formatted_results += f"**Progress:** {result['Progress']}\n\n"
        formatted_results += f"{result['Response']}\n\n"
        formatted_results += "---\n\n"

    return formatted_results, csv_path
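The grid that process_batch_query sweeps is driven entirely by the "Constant" / "Whole range N values" strings in param_configs. Below is a small standalone check of how generate_parameter_values expands those selections; the ranges are copied from the function above, while the value counts are chosen arbitrarily for illustration.

def generate_parameter_values(min_val, max_val, num_values):
    # Same helper as above: num_values evenly spaced points, inclusive of both ends.
    if num_values == 1:
        return [min_val]
    step = (max_val - min_val) / (num_values - 1)
    return [min_val + (step * i) for i in range(num_values)]

# "Whole range 3 values" for temperature over the 0.1-1.0 range used above
temps = generate_parameter_values(0.1, 1.0, 3)     # ~[0.1, 0.55, 1.0]
# "Whole range 5 values" for top_p over the 0.1-0.99 range used above
top_ps = generate_parameter_values(0.1, 0.99, 5)   # ~[0.1, 0.32, 0.55, 0.77, 0.99]

# With top_k and bm25 left constant, the nested loops run every combination,
# so the batch produces len(temps) * len(top_ps) = 15 responses (and CSV rows).
print(len(temps) * len(top_ps))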
temp_import.py ADDED (new file, 2 lines)

import gc
test_filename_generation.py ADDED (new file, 92 lines)

#!/usr/bin/env python3
"""
Test script to verify the new CSV filename generation functionality
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from advanced_rag import get_short_embedding_name, get_short_llm_name, get_varied_parameter

def test_embedding_names():
    """Test embedding model name generation"""
    test_cases = [
        ("🤗 sentence-transformers/all-MiniLM-L6-v2 (384 dim, fast)", "MiniLM"),
        ("🤗 BAAI/bge-base-en-v1.5 (768 dim, excellent)", "BGE-Base"),
        ("🦙 Qwen/Qwen3-Embedding-8B (1024 dim, advanced)", "Qwen3-8B"),
        ("sentence-transformers/all-mpnet-base-v2", "MPNet"),
        ("unknown-model", "unknown")
    ]

    print("Testing embedding name generation:")
    for input_name, expected in test_cases:
        result = get_short_embedding_name(input_name)
        status = "✅" if result == expected else "❌"
        print(f" {status} {input_name} -> {result} (expected: {expected})")

def test_llm_names():
    """Test LLM model name generation"""
    test_cases = [
        ("🇪🇺 Mistral-API", "Mistral"),
        ("🇺🇸 Remote Meta-Llama-3", "Llama3"),
        ("🇺🇸 GPT-4o", "GPT4o"),
        ("mistral-small-latest", "Mistral"),
        ("meta-llama/Meta-Llama-3-8B-Instruct", "Llama3"),
        ("unknown-model", "unknown")
    ]

    print("\nTesting LLM name generation:")
    for input_name, expected in test_cases:
        result = get_short_llm_name(input_name)
        status = "✅" if result == expected else "❌"
        print(f" {status} {input_name} -> {result} (expected: {expected})")

def test_varied_parameter():
    """Test varied parameter detection"""
    test_cases = [
        ({"temperature": "Constant", "top_p": "Constant", "top_k": "Constant", "bm25": "Constant"}, "None"),
        ({"temperature": "Whole range 3 values", "top_p": "Constant", "top_k": "Constant", "bm25": "Constant"}, "temperature"),
        ({"temperature": "Constant", "top_p": "Whole range 5 values", "top_k": "Constant", "bm25": "Constant"}, "top_p"),
        ({"temperature": "Whole range 3 values", "top_p": "Whole range 5 values", "top_k": "Constant", "bm25": "Constant"}, "Multi"),
        ({"temperature": "Constant", "top_p": "Constant", "top_k": "Constant", "bm25": "Whole range 7 values"}, "bm25")
    ]

    print("\nTesting varied parameter detection:")
    for param_configs, expected in test_cases:
        result = get_varied_parameter(param_configs)
        status = "✅" if result == expected else "❌"
        print(f" {status} {param_configs} -> {result} (expected: {expected})")

def test_filename_generation():
    """Test complete filename generation"""
    from datetime import datetime

    # Mock timestamp for consistent testing
    timestamp = "20241201_120000"

    test_cases = [
        ("🤗 sentence-transformers/all-MiniLM-L6-v2 (384 dim, fast)", "🇪🇺 Mistral-API", "temperature", "batch_MiniLM_Mistral_temperature_20241201_120000.csv"),
        ("🤗 BAAI/bge-base-en-v1.5 (768 dim, excellent)", "🇺🇸 Remote Meta-Llama-3", "top_p", "batch_BGE-Base_Llama3_top_p_20241201_120000.csv"),
        ("🦙 Qwen/Qwen3-Embedding-8B (1024 dim, advanced)", "🇺🇸 GPT-4o", "Multi", "batch_Qwen3-8B_GPT4o_Multi_20241201_120000.csv"),
        ("", "", "None", "batch_Unknown_Unknown_None_20241201_120000.csv")
    ]

    print("\nTesting complete filename generation:")
    for embedding, llm, param, expected in test_cases:
        short_embedding = get_short_embedding_name(embedding) if embedding else "Unknown"
        short_llm = get_short_llm_name(llm) if llm else "Unknown"
        short_param = param if param else "None"

        filename = f"batch_{short_embedding}_{short_llm}_{short_param}_{timestamp}.csv"
        status = "✅" if filename == expected else "❌"
        print(f" {status} Generated: {filename}")
        print(f" Expected: {expected}")

if __name__ == "__main__":
    print("Testing CSV filename generation functionality\n")
    test_embedding_names()
    test_llm_names()
    test_varied_parameter()
    test_filename_generation()
    print("\nTest completed!")
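Note that this test imports get_short_embedding_name, get_short_llm_name and get_varied_parameter from advanced_rag, while the advanced_rag.py hunks in this commit only add the nested get_short_name and get_param_variation_name helpers, so the imported names do not appear in the diff above. Purely as a hypothetical sketch, the following is one implementation that would satisfy the expected values in these test cases; it is an assumption for illustration, not the Space's actual code.

def get_short_embedding_name(full_name: str) -> str:
    # Hypothetical mapping; keys chosen only to satisfy the test cases above.
    mapping = {
        "all-MiniLM": "MiniLM",
        "bge-base": "BGE-Base",
        "Qwen3-Embedding-8B": "Qwen3-8B",
        "mpnet": "MPNet",
    }
    for key, short in mapping.items():
        if key in full_name:
            return short
    return "unknown"

def get_short_llm_name(full_name: str) -> str:
    # Hypothetical mapping; keys chosen only to satisfy the test cases above.
    mapping = {
        "Mistral": "Mistral",
        "mistral": "Mistral",
        "Llama-3": "Llama3",
        "GPT-4o": "GPT4o",
    }
    for key, short in mapping.items():
        if key in full_name:
            return short
    return "unknown"

def get_varied_parameter(param_configs: dict) -> str:
    # A parameter counts as varied when its config is anything other than "Constant".
    varied = [p for p, cfg in param_configs.items() if cfg != "Constant"]
    if not varied:
        return "None"
    if len(varied) > 1:
        return "Multi"
    return varied[0]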