Spaces: Running on Zero
Commit · ddaff53
1 Parent(s): 6f5e355

v1 inference code

Files changed:
- requirements.txt +1 -0
- utils/models.py +74 -11
- utils/prompts.py +39 -0
requirements.txt CHANGED
@@ -6,3 +6,4 @@ numpy==1.26.4
 openai>=1.60.2
 torch>=2.5.1
 tqdm==4.67.1
+flash-attn>=2.7.4
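Note: flash-attn is added to the requirements here, while run_inference in utils/models.py below loads models with attn_implementation="eager". A minimal sketch of what actually opting into FlashAttention 2 would look like in transformers (illustration only, not part of the commit; it assumes a CUDA device and bf16 weights):

# Sketch only: the committed loader uses attn_implementation="eager".
# FlashAttention 2 needs the flash-attn package, a CUDA device, and fp16/bf16 weights.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "qwen/qwen2.5-1.5b-instruct",             # model id from the models dict below
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # picks up the flash-attn dependency added above
).to("cuda")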
utils/models.py CHANGED
@@ -1,41 +1,104 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from .prompts import format_rag_prompt
+
 # --- Dummy Model Summaries ---
 # Define functions that simulate model summary generation
-models = {
-    "Model Alpha": lambda context, question, answerable: f"Alpha Summary: Based on the context for '{question[:20]}...', it appears the question is {'answerable' if answerable else 'unanswerable'}.",
-    "Model Beta": lambda context, question, answerable: f"Beta Summary: Regarding '{question[:20]}...', the provided documents {'allow' if answerable else 'do not allow'} for a conclusive answer based on the text.",
-    "Model Gamma": lambda context, question, answerable: f"Gamma Summary: For the question '{question[:20]}...', I {'can' if answerable else 'cannot'} provide a specific answer from the given text snippets.",
-    "Model Delta (Refusal Specialist)": lambda context, question, answerable: f"Delta Summary: The context for '{question[:20]}...' is {'sufficient' if answerable else 'insufficient'} to formulate a direct response. Therefore, I must refuse."
+# models = {
+#     "Model Alpha": lambda context, question, answerable: f"Alpha Summary: Based on the context for '{question[:20]}...', it appears the question is {'answerable' if answerable else 'unanswerable'}.",
+#     "Model Beta": lambda context, question, answerable: f"Beta Summary: Regarding '{question[:20]}...', the provided documents {'allow' if answerable else 'do not allow'} for a conclusive answer based on the text.",
+#     "Model Gamma": lambda context, question, answerable: f"Gamma Summary: For the question '{question[:20]}...', I {'can' if answerable else 'cannot'} provide a specific answer from the given text snippets.",
+#     "Model Delta (Refusal Specialist)": lambda context, question, answerable: f"Delta Summary: The context for '{question[:20]}...' is {'sufficient' if answerable else 'insufficient'} to formulate a direct response. Therefore, I must refuse."
+# }
+
+models = {
+    "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
+    "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
+    #TODO add more models
 }
 
 # List of model names for easy access
-model_names = list(
+model_names = list(models.keys())
+
 
 def generate_summaries(example, model_a_name, model_b_name):
     """
     Generates summaries for the given example using the assigned models.
     """
+
     # Create a plain text version of the contexts for the models
     context_text = ""
+    context_parts = []
     if "contexts" in example and example["contexts"]:
-        context_parts = []
         for ctx in example["contexts"]:
             if isinstance(ctx, dict) and "content" in ctx:
                 context_parts.append(ctx["content"])
         context_text = "\n---\n".join(context_parts)
     else:
         # Fallback to full contexts if highlighted contexts are not available
-        context_parts = []
         if "full_contexts" in example:
             for ctx in example["full_contexts"]:
                 if isinstance(ctx, dict) and "content" in ctx:
                     context_parts.append(ctx["content"])
             context_text = "\n---\n".join(context_parts)
-
+
     # Pass 'Answerable' status to models (they might use it)
     answerable = example.get("Answerable", True)
     question = example.get("question", "")
 
     # Call the dummy model functions
-    summary_a =
-    summary_b =
+    summary_a = run_inference(models[model_a_name], context_text, question)
+    summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
+
+
+def run_inference(model_name, context, question):
+    """
+    Run inference using the specified model.
+    """
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Load the model and tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+    accepts_sys = (
+        "System role not supported" not in tokenizer.chat_template
+    )  # Workaround for Gemma
+
+    # Set padding token if not set
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name, torch_dtype=torch.bfloat16, attn_implementation="eager"
+    ).to(device)
+
+    text_input = format_rag_prompt(question, context, accepts_sys)
+
+    # Tokenize the input
+    actual_input = tokenizer.apply_chat_template(
+        text_input,
+        return_tensors="pt",
+        tokenize=True,
+        max_length=2048,
+        add_generation_prompt=True,
+    ).to(device)
+
+    input_length = actual_input.shape[1]
+
+    # Create attention mask (1 for all tokens since we're not padding)
+    attention_mask = torch.ones_like(actual_input).to(device)
+
+    # Generate output
+    with torch.inference_mode():
+        # Disable gradient calculation for inference
+        outputs = model.generate(
+            actual_input,
+            attention_mask=attention_mask,
+            max_new_tokens=512,  # Use max_new_tokens instead of max_length
+            pad_token_id=tokenizer.pad_token_id,
+        )
+
+    # Decode the output
+    result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+
+    return result
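For reference, a minimal sketch of driving generate_summaries; the example dict shape (question, contexts with "content" fields, Answerable) mirrors what the function reads, and the values here are hypothetical:

# Hypothetical example record; field names follow what generate_summaries expects.
example = {
    "question": "When was the Eiffel Tower completed?",
    "contexts": [{"content": "The Eiffel Tower was completed in 1889."}],
    "Answerable": True,
}

summary_a, summary_b = generate_summaries(
    example, "Qwen2.5-1.5b-Instruct", "Llama-3.2-1b-Instruct"
)
print(summary_a)
print(summary_b)

Note that run_inference reloads the tokenizer and model on every call, so a loop over many examples pays the full load cost twice per example; caching the loaded models would be a natural follow-up.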
utils/prompts.py CHANGED
@@ -0,0 +1,39 @@
+def format_rag_prompt(query: str, context: str, accepts_sys: bool) -> list:
+    system_prompt = """
+    You are a helpful assistant that provides answers to queries based on the provided context.
+
+    You MUST clearly refuse to answer the query and ask for additional information from the user if the answer cannot be found in the context.
+    The output should not contain your judgment on answerability, only your answer OR your refusal + clarifications.
+
+    Stay within the bounds of the provided context and avoid making assumptions.
+
+
+    """
+    user_prompt = f"""
+
+    # Role and Task Description
+    Judge if the following query is answerable from ONLY the provided context.
+    If so, provide a complete, grounded answer to the query, and do not mention your judgement.
+    Try to address all aspects of the query, but if certain parts are not answerable, clearly state that you do not have enough information.
+
+    OTHERWISE, refuse clearly to answer and ask for the additional information you require from the user.
+    You should give a concise explanation of why you cannot answer the query based on the context, and ask for more relevant information from the user.
+
+    # Task
+    Given the following query and context, please provide your response:
+    Query: {query}
+
+    Context: {context}
+
+    WITHOUT mentioning your judgement, provide either your grounded answer OR your refusal and clarifications:
+    """
+
+    messages = (
+        [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        if accepts_sys
+        else [{"role": "user", "content": system_prompt + user_prompt}]
+    )
+    return messages
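For illustration, format_rag_prompt returns a chat-style list of messages rather than a plain string, which is what tokenizer.apply_chat_template in run_inference consumes. A small sketch (the model id and inputs are placeholders):

# Sketch: inspecting the message list before tokenization.
from transformers import AutoTokenizer
from utils.prompts import format_rag_prompt

messages = format_rag_prompt(
    query="When was the Eiffel Tower completed?",       # hypothetical inputs
    context="The Eiffel Tower was completed in 1889.",
    accepts_sys=True,
)
# -> [{"role": "system", "content": ...}, {"role": "user", "content": ...}]

tokenizer = AutoTokenizer.from_pretrained("qwen/qwen2.5-1.5b-instruct")
rendered = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(rendered)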