SLM-RAG-Arena

Running on Zero

App Files Files Community

oliver-aizip

kai-aizip committed on 5 days ago

Commit

1db9e92

verified ·

1 Parent(s): 69f6a43

Handled interruption (#10)

Browse files

- Handled interruption (9a1fcf079875ce647f4228f03d39b0a16a575134)

Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>

Files changed (1) hide show

utils/models.py +81 -55

utils/models.py CHANGED Viewed

@@ -1,36 +1,32 @@
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from .prompts import format_rag_prompt
-# --- Dummy Model Summaries ---
-# Define functions that simulate model summary generation
-# models = {
-#     "Model Alpha": lambda context, question, answerable: f"Alpha Summary: Based on the context for '{question[:20]}...', it appears the question is {'answerable' if answerable else 'unanswerable'}.",
-#     "Model Beta": lambda context, question, answerable: f"Beta Summary: Regarding '{question[:20]}...', the provided documents {'allow' if answerable else 'do not allow'} for a conclusive answer based on the text.",
-#     "Model Gamma": lambda context, question, answerable: f"Gamma Summary: For the question '{question[:20]}...', I {'can' if answerable else 'cannot'} provide a specific answer from the given text snippets.",
-#     "Model Delta (Refusal Specialist)": lambda context, question, answerable: f"Delta Summary: The context for '{question[:20]}...' is {'sufficient' if answerable else 'insufficient'} to formulate a direct response. Therefore, I must refuse."
-# }
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
-    #"Qwen2.5-3b-Instruct": "qwen/qwen2.5-3b-instruct", # remove gated for now
-    #"Llama-3.2-3b-Instruct": "meta-llama/llama-3.2-3b-instruct",
     "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
-    "Gemma-3-1b-it" : "google/gemma-3-1b-it",
-    #"Bitnet-b1.58-2B-4T": "microsoft/bitnet-b1.58-2B-4T",
-    #TODO add more models
 }
 # List of model names for easy access
 model_names = list(models.keys())
 def generate_summaries(example, model_a_name, model_b_name):
     """
     Generates summaries for the given example using the assigned models.
     """
-    # Create a plain text version of the contexts for the models
     context_text = ""
     context_parts = []
     if "full_contexts" in example:
@@ -41,12 +37,16 @@ def generate_summaries(example, model_a_name, model_b_name):
     else:
         raise ValueError("No context found in the example.")
-    # Pass 'Answerable' status to models (they might use it)
-    answerable = example.get("Answerable", True)
     question = example.get("question", "")
-    # Call the dummy model functions
     summary_a = run_inference(models[model_a_name], context_text, question)
     summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
@@ -54,46 +54,72 @@ def run_inference(model_name, context, question):
     """
     Run inference using the specified model.
     """
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # Load the model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", token=True)
-    accepts_sys = (
-        "System role not supported" not in tokenizer.chat_template
-    )  # Workaround for Gemma
-    # Set padding token if not set
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
-    ).to(device)
-    text_input = format_rag_prompt(question, context, accepts_sys)
-    # Tokenize the input
-    actual_input = tokenizer.apply_chat_template(
-        text_input,
-        return_tensors="pt",
-        tokenize=True,
-        max_length=2048,
-        add_generation_prompt=True,
-    ).to(device)
-    input_length = actual_input.shape[1]
-    attention_mask = torch.ones_like(actual_input).to(device)
-    # Generate output
-    with torch.inference_mode():
-        outputs = model.generate(
-            actual_input,
-            attention_mask=attention_mask,
-            max_new_tokens=512,
-            pad_token_id=tokenizer.pad_token_id,
-        )
-    # Decode the output
-    result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-    return result

 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
+from .shared import generation_interrupt
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
     "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
+    "Gemma-3-1b-it": "google/gemma-3-1b-it",
 }
 # List of model names for easy access
 model_names = list(models.keys())
+# Custom stopping criteria that checks the interrupt flag
+class InterruptCriteria(StoppingCriteria):
+    def __init__(self, interrupt_event):
+        self.interrupt_event = interrupt_event
+    def __call__(self, input_ids, scores, **kwargs):
+        return self.interrupt_event.is_set()
 def generate_summaries(example, model_a_name, model_b_name):
     """
     Generates summaries for the given example using the assigned models.
     """
+    if generation_interrupt.is_set():
+        return "", ""
     context_text = ""
     context_parts = []
     if "full_contexts" in example:
     else:
         raise ValueError("No context found in the example.")
     question = example.get("question", "")
+    if generation_interrupt.is_set():
+        return "", ""
     summary_a = run_inference(models[model_a_name], context_text, question)
+    if generation_interrupt.is_set():
+        return summary_a, ""
     summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
     """
     Run inference using the specified model.
     """
+    if generation_interrupt.is_set():
+        return ""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", token=True)
+        accepts_sys = (
+            "System role not supported" not in tokenizer.chat_template
+        )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        if generation_interrupt.is_set():
+            return ""
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
+        ).to(device)
+        text_input = format_rag_prompt(question, context, accepts_sys)
+        if generation_interrupt.is_set():
+            return ""
+        actual_input = tokenizer.apply_chat_template(
+            text_input,
+            return_tensors="pt",
+            tokenize=True,
+            max_length=2048,
+            add_generation_prompt=True,
+        ).to(device)
+        input_length = actual_input.shape[1]
+        attention_mask = torch.ones_like(actual_input).to(device)
+        if generation_interrupt.is_set():
+            return ""
+        stopping_criteria = StoppingCriteriaList([InterruptCriteria(generation_interrupt)])
+        with torch.inference_mode():
+            outputs = model.generate(
+                actual_input,
+                attention_mask=attention_mask,
+                max_new_tokens=512,
+                pad_token_id=tokenizer.pad_token_id,
+                stopping_criteria=stopping_criteria
+            )
+        if generation_interrupt.is_set():
+            return ""
+        result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+        return result
+    except Exception as e:
+        print(f"Error in inference: {e}")
+        return f"Error generating response: {str(e)[:100]}..."
+    finally:
+        if 'model' in locals():
+            del model
+        if 'tokenizer' in locals():
+            del tokenizer
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()