Upload folder using huggingface_hub
- .gitattributes +1 -0
- __pycache__/advanced_rag.cpython-311.pyc +0 -0
- __pycache__/test_filename_generation.cpython-311.pyc +0 -0
- advanced_rag.py +110 -10
- advanced_rag_updated.py +245 -0
- temp_import.py +2 -0
- test_filename_generation.py +92 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+__pycache__/advanced_rag.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
__pycache__/advanced_rag.cpython-311.pyc CHANGED
Binary files a/__pycache__/advanced_rag.cpython-311.pyc and b/__pycache__/advanced_rag.cpython-311.pyc differ
__pycache__/test_filename_generation.cpython-311.pyc ADDED
Binary file (5.14 kB)
advanced_rag.py CHANGED
@@ -105,8 +105,13 @@ def process_batch_query(query, model_choice, max_tokens, param_configs, slider_v
                             "Progress": f"Query {current}/{total_combinations}"
                         })
 
-    # Format results with CSV file links
-    formatted_results, csv_path = format_batch_result_files(
+    # Format results with CSV file links - UPDATED TO PASS ADDITIONAL PARAMETERS
+    formatted_results, csv_path = format_batch_result_files(
+        results, job_id,
+        embedding_model=getattr(rag_chain, 'embedding_model', 'unknown'),
+        llm_model=model_choice,
+        param_variations=param_configs
+    )
 
     return (
         formatted_results,
@@ -1388,10 +1393,64 @@ def format_response(response: str) -> str:
 
 def reset_app_updated():
     global rag_chain
+
+    # Properly clean up the existing vector database components
+    if hasattr(rag_chain, 'vector_store'):
+        try:
+            del rag_chain.vector_store
+        except:
+            pass
+
+    if hasattr(rag_chain, 'faiss_retriever'):
+        try:
+            del rag_chain.faiss_retriever
+        except:
+            pass
+
+    if hasattr(rag_chain, 'bm25_retriever'):
+        try:
+            del rag_chain.bm25_retriever
+        except:
+            pass
+
+    if hasattr(rag_chain, 'ensemble_retriever'):
+        try:
+            del rag_chain.ensemble_retriever
+        except:
+            pass
+
+    # Clear data references
+    if hasattr(rag_chain, 'raw_data'):
+        rag_chain.raw_data = None
+    if hasattr(rag_chain, 'split_data'):
+        rag_chain.split_data = None
+    if hasattr(rag_chain, 'context'):
+        rag_chain.context = ""
+    if hasattr(rag_chain, 'conversation_history'):
+        rag_chain.conversation_history = []
+
+    # Clear other components
+    if hasattr(rag_chain, 'text_splitter'):
+        try:
+            del rag_chain.text_splitter
+        except:
+            pass
+
+    if hasattr(rag_chain, 'elevated_rag_chain'):
+        try:
+            del rag_chain.elevated_rag_chain
+        except:
+            pass
+
+    # Create a new instance
     rag_chain = ElevatedRagChain()
-
+
+    # Force garbage collection to free memory
+    gc.collect()
+
+    debug_print("App reset successfully. Vector database and all components cleaned up.")
     return (
-        "App reset successfully. You can now load new files",
+        "App reset successfully. Vector database and all components cleaned up. You can now load new files",
         "",
         "Model used: Not selected"
     )
@@ -2172,12 +2231,51 @@ https://www.gutenberg.org/ebooks/8438.txt.utf-8
         outputs=[csv_download_html_batch, csv_download_file_batch, csv_file_info_df_batch]
     )
 
-def create_csv_from_batch_results(results: List[Dict], job_id: str) -> str:
+def create_csv_from_batch_results(results: List[Dict], job_id: str,
+                                  embedding_model: str = None, llm_model: str = None,
+                                  param_variations: Dict = None) -> str:
     """Create a CSV file from batch query results and return the file path"""
     # Save CSV files in the current directory for HuggingFace Spaces compatibility
-
+
+    # Create a descriptive filename
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
+
+    # Extract short names for filename
+    def get_short_name(full_name, prefix_length=2):
+        """Extract short name from full model name"""
+        if not full_name:
+            return "unknown"
+        # Remove emojis and get the actual model name
+        clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
+        # Get first few characters and last few characters
+        if len(clean_name) > 8:
+            return clean_name[:4] + clean_name[-4:]
+        return clean_name
+
+    def get_param_variation_name(param_configs):
+        """Get the parameter that was varied"""
+        if not param_configs:
+            return "const"
+
+        varied_params = []
+        for param, config in param_configs.items():
+            if config != "Constant":
+                # Extract the number from "Whole range X values"
+                if "values" in config:
+                    num_values = config.split()[2] if len(config.split()) > 2 else "X"
+                    varied_params.append(f"{param}_{num_values}")
+
+        if not varied_params:
+            return "const"
+        return "_".join(varied_params)
+
+    # Build filename components
+    embedding_short = get_short_name(embedding_model) if embedding_model else "emb"
+    llm_short = get_short_name(llm_model) if llm_model else "llm"
+    param_short = get_param_variation_name(param_variations) if param_variations else "const"
+
+    # Create filename: batch_embedding_llm_params_timestamp.csv
+    csv_filename = f"batch_{embedding_short}_{llm_short}_{param_short}_{timestamp}.csv"
     csv_path = os.path.abspath(csv_filename)
 
     # Extract parameters and responses
@@ -2233,10 +2331,12 @@ def create_csv_from_batch_results(results: List[Dict], job_id: str) -> str:
 
     return csv_path
 
-def format_batch_result_files(results: List[Dict], job_id: str
+def format_batch_result_files(results: List[Dict], job_id: str,
+                              embedding_model: str = None, llm_model: str = None,
+                              param_variations: Dict = None) -> Tuple[str, str]:
     """Format batch results with links to CSV files"""
-    # Create CSV file
-    csv_path = create_csv_from_batch_results(results, job_id)
+    # Create CSV file with improved filename
+    csv_path = create_csv_from_batch_results(results, job_id, embedding_model, llm_model, param_variations)
 
     # Format the results
     formatted_results = "### Batch Query Results\n\n"
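For reference, the filename scheme introduced in create_csv_from_batch_results can be exercised on its own. The sketch below is a condensed standalone copy of the two nested helpers from the hunk above, applied to made-up inputs; the model names and parameter configs here are illustrative only, not taken from the Space's option lists.

import datetime

def get_short_name(full_name):
    # Condensed copy of the helper nested in create_csv_from_batch_results above.
    if not full_name:
        return "unknown"
    clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
    return (clean_name[:4] + clean_name[-4:]) if len(clean_name) > 8 else clean_name

def get_param_variation_name(param_configs):
    # Condensed copy: names every parameter whose config is not "Constant".
    if not param_configs:
        return "const"
    varied = []
    for param, config in param_configs.items():
        if config != "Constant" and "values" in config:
            parts = config.split()
            varied.append(f"{param}_{parts[2] if len(parts) > 2 else 'X'}")
    return "_".join(varied) or "const"

# Illustrative inputs only
embedding_model = "BAAI/bge-base-en-v1.5"
llm_model = "mistral-small-latest"
param_configs = {"temperature": "Whole range 3 values", "top_p": "Constant"}

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"batch_{get_short_name(embedding_model)}_{get_short_name(llm_model)}_{get_param_variation_name(param_configs)}_{timestamp}.csv"
print(csv_filename)  # e.g. batch_BAAIv1.5_misttest_temperature_3_20250101_120000.csv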
advanced_rag_updated.py ADDED (new file, 245 lines)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import datetime
import functools
import traceback
from typing import List, Optional, Any, Dict, Tuple
import csv
import pandas as pd
import tempfile
import shutil
import glob

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

# Other LangChain and community imports
from langchain_community.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.embeddings.base import Embeddings
from langchain.retrievers import EnsembleRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser, Document
from langchain_core.runnables import RunnableParallel, RunnableLambda
from transformers.quantizers.auto import AutoQuantizationConfig
import gradio as gr
from pydantic import PrivateAttr
import pydantic

from langchain.llms.base import LLM
from typing import Any, Optional, List
import typing
import time
import re
import requests
from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader  # Updated loader
import tempfile
import mimetypes
import gc

# Add batch processing helper functions
def generate_parameter_values(min_val, max_val, num_values):
    """Generate evenly spaced values between min and max"""
    if num_values == 1:
        return [min_val]
    step = (max_val - min_val) / (num_values - 1)
    return [min_val + (step * i) for i in range(num_values)]

def process_batch_query(query, model_choice, max_tokens, param_configs, slider_values, job_id, use_history=True):
    """Process a batch of queries with different parameter combinations"""
    results = []

    # Generate all parameter combinations
    temp_values = [slider_values['temperature']] if param_configs['temperature'] == "Constant" else generate_parameter_values(0.1, 1.0, int(param_configs['temperature'].split()[2]))
    top_p_values = [slider_values['top_p']] if param_configs['top_p'] == "Constant" else generate_parameter_values(0.1, 0.99, int(param_configs['top_p'].split()[2]))
    top_k_values = [slider_values['top_k']] if param_configs['top_k'] == "Constant" else generate_parameter_values(1, 100, int(param_configs['top_k'].split()[2]))
    bm25_values = [slider_values['bm25']] if param_configs['bm25'] == "Constant" else generate_parameter_values(0.0, 1.0, int(param_configs['bm25'].split()[2]))

    total_combinations = len(temp_values) * len(top_p_values) * len(top_k_values) * len(bm25_values)
    current = 0

    for temp in temp_values:
        for top_p in top_p_values:
            for top_k in top_k_values:
                for bm25 in bm25_values:
                    current += 1
                    try:
                        # Update parameters
                        rag_chain.temperature = temp
                        rag_chain.top_p = top_p
                        rag_chain.top_k = top_k
                        rag_chain.bm25_weight = bm25
                        rag_chain.faiss_weight = 1.0 - bm25

                        # Update ensemble retriever
                        rag_chain.ensemble_retriever = EnsembleRetriever(
                            retrievers=[rag_chain.bm25_retriever, rag_chain.faiss_retriever],
                            weights=[rag_chain.bm25_weight, rag_chain.faiss_weight]
                        )

                        # Process query
                        response = rag_chain.elevated_rag_chain.invoke({"question": query})

                        # Store response in history if enabled
                        if use_history:
                            trimmed_response = response[:1000] + ("..." if len(response) > 1000 else "")
                            rag_chain.conversation_history.append({"query": query, "response": trimmed_response})

                        # Format result
                        result = {
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": response,
                            "Progress": f"Query {current}/{total_combinations}"
                        }
                        results.append(result)

                    except Exception as e:
                        results.append({
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": f"Error: {str(e)}",
                            "Progress": f"Query {current}/{total_combinations}"
                        })

    # Format results with CSV file links - UPDATED TO PASS ADDITIONAL PARAMETERS
    formatted_results, csv_path = format_batch_result_files(
        results, job_id,
        embedding_model=getattr(rag_chain, 'embedding_model', 'unknown'),
        llm_model=model_choice,
        param_variations=param_configs
    )

    return (
        formatted_results,
        csv_path,
        f"Job ID: {job_id}",
        f"Input tokens: {count_tokens(query)}",
        f"Output tokens: {sum(count_tokens(r['Response']) for r in results)}"
    )

# ... (rest of the file content would go here, but I'll focus on the specific functions that need updating)

def create_csv_from_batch_results(results: List[Dict], job_id: str,
                                  embedding_model: str = None, llm_model: str = None,
                                  param_variations: Dict = None) -> str:
    """Create a CSV file from batch query results and return the file path"""
    # Save CSV files in the current directory for HuggingFace Spaces compatibility

    # Create a descriptive filename
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Extract short names for filename
    def get_short_name(full_name, prefix_length=2):
        """Extract short name from full model name"""
        if not full_name:
            return "unknown"
        # Remove emojis and get the actual model name
        clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
        # Get first few characters and last few characters
        if len(clean_name) > 8:
            return clean_name[:4] + clean_name[-4:]
        return clean_name

    def get_param_variation_name(param_configs):
        """Get the parameter that was varied"""
        if not param_configs:
            return "const"

        varied_params = []
        for param, config in param_configs.items():
            if config != "Constant":
                # Extract the number from "Whole range X values"
                if "values" in config:
                    num_values = config.split()[2] if len(config.split()) > 2 else "X"
                    varied_params.append(f"{param}_{num_values}")

        if not varied_params:
            return "const"
        return "_".join(varied_params)

    # Build filename components
    embedding_short = get_short_name(embedding_model) if embedding_model else "emb"
    llm_short = get_short_name(llm_model) if llm_model else "llm"
    param_short = get_param_variation_name(param_variations) if param_variations else "const"

    # Create filename: batch_embedding_llm_params_timestamp.csv
    csv_filename = f"batch_{embedding_short}_{llm_short}_{param_short}_{timestamp}.csv"
    csv_path = os.path.abspath(csv_filename)

    # Extract parameters and responses
    data = []
    start_time = time.time()
    for result in results:
        params = result["Parameters"]
        response = result["Response"]
        progress = result["Progress"]

        # Calculate elapsed time for this query
        current_time = time.time()
        elapsed_time = current_time - start_time

        # Extract individual parameter values
        temp = float(re.search(r"Temp: ([\d.]+)", params).group(1))
        top_p = float(re.search(r"Top-p: ([\d.]+)", params).group(1))
        top_k = int(re.search(r"Top-k: (\d+)", params).group(1))
        bm25 = float(re.search(r"BM25: ([\d.]+)", params).group(1))

        # Extract response components
        model_info = re.search(r"Model: (.*?)\n", response)
        model = model_info.group(1) if model_info else "Unknown"

        # Extract main answer (everything between the parameters and the token counts)
        answer_match = re.search(r"Model Parameters:.*?\n\n(.*?)\n\n---", response, re.DOTALL)
        main_answer = answer_match.group(1).strip() if answer_match else response

        # Extract token counts
        input_tokens = re.search(r"Input tokens: (\d+)", response)
        output_tokens = re.search(r"Output tokens: (\d+)", response)

        # Extract conversation history count
        conv_history = re.search(r"Conversation History: (\d+) conversation", response)

        data.append({
            "Temperature": temp,
            "Top-p": top_p,
            "Top-k": top_k,
            "BM25 Weight": bm25,
            "Model": model,
            "Main Answer": main_answer,
            "Input Tokens": input_tokens.group(1) if input_tokens else "N/A",
            "Output Tokens": output_tokens.group(1) if output_tokens else "N/A",
            "Conversation History": conv_history.group(1) if conv_history else "0",
            "Progress": progress,
            "Elapsed Time (s)": f"{elapsed_time:.2f}"
        })

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)

    return csv_path

def format_batch_result_files(results: List[Dict], job_id: str,
                              embedding_model: str = None, llm_model: str = None,
                              param_variations: Dict = None) -> Tuple[str, str]:
    """Format batch results with links to CSV files"""
    # Create CSV file with improved filename
    csv_path = create_csv_from_batch_results(results, job_id, embedding_model, llm_model, param_variations)

    # Format the results
    formatted_results = "### Batch Query Results\n\n"

    # Add the actual results
    for result in results:
        formatted_results += f"#### {result['Parameters']}\n"
        formatted_results += f"**Progress:** {result['Progress']}\n\n"
        formatted_results += f"{result['Response']}\n\n"
        formatted_results += "---\n\n"

    return formatted_results, csv_path
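The grid that process_batch_query sweeps is driven entirely by the "Constant" / "Whole range N values" strings in param_configs. Below is a small standalone check of how generate_parameter_values expands those selections; the ranges are copied from the function above, while the value counts are chosen arbitrarily for illustration.

def generate_parameter_values(min_val, max_val, num_values):
    # Same helper as above: num_values evenly spaced points, inclusive of both ends.
    if num_values == 1:
        return [min_val]
    step = (max_val - min_val) / (num_values - 1)
    return [min_val + (step * i) for i in range(num_values)]

# "Whole range 3 values" for temperature over the 0.1-1.0 range used above
temps = generate_parameter_values(0.1, 1.0, 3)     # ~[0.1, 0.55, 1.0]
# "Whole range 5 values" for top_p over the 0.1-0.99 range used above
top_ps = generate_parameter_values(0.1, 0.99, 5)   # ~[0.1, 0.32, 0.55, 0.77, 0.99]

# With top_k and bm25 left constant, the nested loops run every combination,
# so the batch produces len(temps) * len(top_ps) = 15 responses (and CSV rows).
print(len(temps) * len(top_ps))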
temp_import.py ADDED (new file, 2 lines)

import gc
test_filename_generation.py ADDED (new file, 92 lines)

#!/usr/bin/env python3
"""
Test script to verify the new CSV filename generation functionality
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from advanced_rag import get_short_embedding_name, get_short_llm_name, get_varied_parameter

def test_embedding_names():
    """Test embedding model name generation"""
    test_cases = [
        ("🤗 sentence-transformers/all-MiniLM-L6-v2 (384 dim, fast)", "MiniLM"),
        ("🤗 BAAI/bge-base-en-v1.5 (768 dim, excellent)", "BGE-Base"),
        ("🦙 Qwen/Qwen3-Embedding-8B (1024 dim, advanced)", "Qwen3-8B"),
        ("sentence-transformers/all-mpnet-base-v2", "MPNet"),
        ("unknown-model", "unknown")
    ]

    print("Testing embedding name generation:")
    for input_name, expected in test_cases:
        result = get_short_embedding_name(input_name)
        status = "✅" if result == expected else "❌"
        print(f" {status} {input_name} -> {result} (expected: {expected})")

def test_llm_names():
    """Test LLM model name generation"""
    test_cases = [
        ("🇪🇺 Mistral-API", "Mistral"),
        ("🇺🇸 Remote Meta-Llama-3", "Llama3"),
        ("🇺🇸 GPT-4o", "GPT4o"),
        ("mistral-small-latest", "Mistral"),
        ("meta-llama/Meta-Llama-3-8B-Instruct", "Llama3"),
        ("unknown-model", "unknown")
    ]

    print("\nTesting LLM name generation:")
    for input_name, expected in test_cases:
        result = get_short_llm_name(input_name)
        status = "✅" if result == expected else "❌"
        print(f" {status} {input_name} -> {result} (expected: {expected})")

def test_varied_parameter():
    """Test varied parameter detection"""
    test_cases = [
        ({"temperature": "Constant", "top_p": "Constant", "top_k": "Constant", "bm25": "Constant"}, "None"),
        ({"temperature": "Whole range 3 values", "top_p": "Constant", "top_k": "Constant", "bm25": "Constant"}, "temperature"),
        ({"temperature": "Constant", "top_p": "Whole range 5 values", "top_k": "Constant", "bm25": "Constant"}, "top_p"),
        ({"temperature": "Whole range 3 values", "top_p": "Whole range 5 values", "top_k": "Constant", "bm25": "Constant"}, "Multi"),
        ({"temperature": "Constant", "top_p": "Constant", "top_k": "Constant", "bm25": "Whole range 7 values"}, "bm25")
    ]

    print("\nTesting varied parameter detection:")
    for param_configs, expected in test_cases:
        result = get_varied_parameter(param_configs)
        status = "✅" if result == expected else "❌"
        print(f" {status} {param_configs} -> {result} (expected: {expected})")

def test_filename_generation():
    """Test complete filename generation"""
    from datetime import datetime

    # Mock timestamp for consistent testing
    timestamp = "20241201_120000"

    test_cases = [
        ("🤗 sentence-transformers/all-MiniLM-L6-v2 (384 dim, fast)", "🇪🇺 Mistral-API", "temperature", "batch_MiniLM_Mistral_temperature_20241201_120000.csv"),
        ("🤗 BAAI/bge-base-en-v1.5 (768 dim, excellent)", "🇺🇸 Remote Meta-Llama-3", "top_p", "batch_BGE-Base_Llama3_top_p_20241201_120000.csv"),
        ("🦙 Qwen/Qwen3-Embedding-8B (1024 dim, advanced)", "🇺🇸 GPT-4o", "Multi", "batch_Qwen3-8B_GPT4o_Multi_20241201_120000.csv"),
        ("", "", "None", "batch_Unknown_Unknown_None_20241201_120000.csv")
    ]

    print("\nTesting complete filename generation:")
    for embedding, llm, param, expected in test_cases:
        short_embedding = get_short_embedding_name(embedding) if embedding else "Unknown"
        short_llm = get_short_llm_name(llm) if llm else "Unknown"
        short_param = param if param else "None"

        filename = f"batch_{short_embedding}_{short_llm}_{short_param}_{timestamp}.csv"
        status = "✅" if filename == expected else "❌"
        print(f" {status} Generated: {filename}")
        print(f" Expected: {expected}")

if __name__ == "__main__":
    print("Testing CSV filename generation functionality\n")
    test_embedding_names()
    test_llm_names()
    test_varied_parameter()
    test_filename_generation()
    print("\nTest completed!")
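Note that this test imports get_short_embedding_name, get_short_llm_name and get_varied_parameter from advanced_rag, while the advanced_rag.py hunks in this commit only add the nested get_short_name and get_param_variation_name helpers, so the imported names do not appear in the diff above. Purely as a hypothetical sketch, the following is one implementation that would satisfy the expected values in these test cases; it is an assumption for illustration, not the Space's actual code.

def get_short_embedding_name(full_name: str) -> str:
    # Hypothetical mapping; keys chosen only to satisfy the test cases above.
    mapping = {
        "all-MiniLM": "MiniLM",
        "bge-base": "BGE-Base",
        "Qwen3-Embedding-8B": "Qwen3-8B",
        "mpnet": "MPNet",
    }
    for key, short in mapping.items():
        if key in full_name:
            return short
    return "unknown"

def get_short_llm_name(full_name: str) -> str:
    # Hypothetical mapping; keys chosen only to satisfy the test cases above.
    mapping = {
        "Mistral": "Mistral",
        "mistral": "Mistral",
        "Llama-3": "Llama3",
        "GPT-4o": "GPT4o",
    }
    for key, short in mapping.items():
        if key in full_name:
            return short
    return "unknown"

def get_varied_parameter(param_configs: dict) -> str:
    # A parameter counts as varied when its config is anything other than "Constant".
    varied = [p for p, cfg in param_configs.items() if cfg != "Constant"]
    if not varied:
        return "None"
    if len(varied) > 1:
        return "Multi"
    return varied[0]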