Spaces:

ActiveYixiao
/

automatic_coding

Sleeping

App Files Files Community

ActiveYixiao committed on Aug 29, 2025

Commit

77ff4b8

verified ·

1 Parent(s): 14d2dfa

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -96

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import outlines
 import pandas as pd
 import spaces
 import torch
-from outlines import Generator
 from peft import PeftConfig, PeftModel
 from pydantic import BaseModel, ConfigDict
 from transformers import (
@@ -20,7 +20,6 @@ from transformers import (
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-MODEL_ID = "rshwndsz/ft-longformer-base-4096"
 DEVICE_MAP = "auto"
 QUANTIZATION_BITS = None
 TEMPERATURE = 0.0
@@ -39,32 +38,12 @@ AVAILABLE_MODELS = [
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
-SYSTEM_PROMPT = textwrap.dedent("""
-You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
-1. A story that was presented to participants as context
-2. The question that participants were asked to answer
-3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
-4. Grading examples
-5. A participant answer
-Your task is to grade each answer according to the grading scheme. For each answer, you should:
-1. Carefully read and understand the answer and compare it to the grading criteria
-2. Assigning an score 1 or 0 for each answer.
-""").strip()
 PROMPT_TEMPLATE = textwrap.dedent("""
-<Story>
-{story}
-</Story>
-<Question>
-{question}
-</Question>
-<GradingScheme>
-{grading_scheme}
-</GradingScheme>
-<Answer>
-{answer}
-</Answer>
 Score:""").strip()
@@ -73,9 +52,14 @@ class ResponseModel(BaseModel):
     score: Literal["0", "1"]
-def get_outlines_model(
-    model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
-):
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -89,82 +73,95 @@ def get_outlines_model(
         quantization_config = None
     if "longformer" in model_id:
-        hf_model = AutoModelForSequenceClassification.from_pretrained(model_id)
-        hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
-        return hf_model, hf_tokenizer
-    peft_config = PeftConfig.from_pretrained(model_id)
-    base_model_id = peft_config.base_model_name_or_path
-    base_model = AutoModelForCausalLM.from_pretrained(
-        base_model_id,
-        device_map=device_map,
-        quantization_config=quantization_config,
-    )
-    hf_model = PeftModel.from_pretrained(base_model, model_id)
-    hf_tokenizer = AutoTokenizer.from_pretrained(
-        base_model_id, use_fast=True, clean_up_tokenization_spaces=True
-    )
-    model = outlines.from_transformers(hf_model, hf_tokenizer)
-    return model
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
-    prompt = PROMPT_TEMPLATE.format(
         story=story.strip(),
         question=question.strip(),
         grading_scheme=grading_scheme.strip(),
         answer=answer.strip(),
     )
-    full_prompt = SYSTEM_PROMPT + "\n\n" + prompt
-    return full_prompt
 @spaces.GPU
 def label_single_response_with_model(model_id, story, question, criteria, response):
-    prompt = format_prompt(story, question, criteria, response)
-    if "longformer" in model_id:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            logits = model(**inputs).logits
-        predicted_class = torch.argmax(logits, dim=1).item()
-        return str(predicted_class)
-    else:
-        # Use structured JSON generation like in the original script
-        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        sampler = outlines.samplers.greedy()  # Match original temperature=0 behavior
-        generator = outlines.generate.json(model, ResponseModel, sampler=sampler)
-        result = generator(prompt)
-        return result.score
 @spaces.GPU
 def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
-    df = pd.read_csv(response_file.name)
-    assert "response" in df.columns, "CSV must contain a 'response' column."
-    prompts = [
-        format_prompt(story, question, criteria, resp) for resp in df["response"]
-    ]
-    if "longformer" in model_id:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            logits = model(**inputs).logits
-        predicted_classes = torch.argmax(logits, dim=1).tolist()
-        scores = [str(cls) for cls in predicted_classes]
-    else:
-        # Use structured JSON generation for batch processing
-        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        sampler = outlines.samplers.greedy()
-        generator = outlines.generate.json(model, ResponseModel, sampler=sampler)
-        results = generator(prompts)
-        scores = [r.score for r in results]
-    df["score"] = scores
-    return df
 def single_response_ui(model_id):
@@ -185,10 +182,7 @@ def single_response_ui(model_id):
 def multi_response_ui(model_id):
     return gr.Interface(
-        fn=lambda story,
-        question,
-        criteria,
-        response_file: label_multi_responses_with_model(
             model_id.value, story, question, criteria, response_file
         ),
         inputs=[
@@ -208,7 +202,7 @@ with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
     model_selector = gr.Dropdown(
         label="Select Model",
         choices=AVAILABLE_MODELS,
-        value=AVAILABLE_MODELS[0],
     )
     selected_model_id = gr.State(value=DEFAULT_MODEL_ID)
@@ -227,4 +221,4 @@ with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
 if __name__ == "__main__":
-    iface.launch(share=True)

 import pandas as pd
 import spaces
 import torch
+from outlines import generate, models, samplers
 from peft import PeftConfig, PeftModel
 from pydantic import BaseModel, ConfigDict
 from transformers import (
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 DEVICE_MAP = "auto"
 QUANTIZATION_BITS = None
 TEMPERATURE = 0.0
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
+# Use a simpler prompt format that might be closer to the format of the models' training data
 PROMPT_TEMPLATE = textwrap.dedent("""
+Story: {story}
+Question: {question}
+Grading Scheme: {grading_scheme}
+Answer: {answer}
 Score:""").strip()
     score: Literal["0", "1"]
+# Cache models to avoid reloading on every request
+_model_cache = {}
+def get_model_and_tokenizer(model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = None):
+    if model_id in _model_cache:
+        return _model_cache[model_id]
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
         quantization_config = None
     if "longformer" in model_id:
+        model = AutoModelForSequenceClassification.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        result = (model, tokenizer, "classification")
+    else:
+        # For other models, use the same approach as your original script
+        peft_config = PeftConfig.from_pretrained(model_id)
+        base_model_id = peft_config.base_model_name_or_path
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model_id,
+            device_map=device_map,
+            quantization_config=quantization_config,
+        )
+        model = PeftModel.from_pretrained(model, model_id)
+        tokenizer = AutoTokenizer.from_pretrained(
+            base_model_id, use_fast=True, clean_up_tokenization_spaces=True
+        )
+        # Convert to outlines model
+        outlines_model = models.transformers(
+            model,
+            tokenizer=tokenizer,
+            device_map=device_map,
+        )
+        result = (outlines_model, tokenizer, "generation")
+    _model_cache[model_id] = result
+    return result
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
+    return PROMPT_TEMPLATE.format(
         story=story.strip(),
         question=question.strip(),
         grading_scheme=grading_scheme.strip(),
         answer=answer.strip(),
     )
 @spaces.GPU
 def label_single_response_with_model(model_id, story, question, criteria, response):
+    try:
+        prompt = format_prompt(story, question, criteria, response)
+        model, tokenizer, model_type = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        if model_type == "classification":
+            # For Longformer models
+            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
+            with torch.no_grad():
+                logits = model(**inputs).logits
+            predicted_class = torch.argmax(logits, dim=1).item()
+            return str(predicted_class)
+        else:
+            # For generative models
+            sampler = samplers.greedy()
+            generator = generate.json(model, ResponseModel, sampler=sampler)
+            result = generator(prompt)
+            return result.score
+    except Exception as e:
+        logger.error(f"Error in label_single_response_with_model: {str(e)}")
+        return "Error: " + str(e)
 @spaces.GPU
 def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
+    try:
+        df = pd.read_csv(response_file.name)
+        assert "response" in df.columns, "CSV must contain a 'response' column."
+        model, tokenizer, model_type = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        prompts = [format_prompt(story, question, criteria, resp) for resp in df["response"]]
+        if model_type == "classification":
+            inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
+            with torch.no_grad():
+                logits = model(**inputs).logits
+            predicted_classes = torch.argmax(logits, dim=1).tolist()
+            scores = [str(cls) for cls in predicted_classes]
+        else:
+            sampler = samplers.greedy()
+            generator = generate.json(model, ResponseModel, sampler=sampler)
+            results = generator(prompts)
+            scores = [r.score for r in results]
+        df["score"] = scores
+        return df
+    except Exception as e:
+        logger.error(f"Error in label_multi_responses_with_model: {str(e)}")
+        return f"Error: {str(e)}"
 def single_response_ui(model_id):
 def multi_response_ui(model_id):
     return gr.Interface(
+        fn=lambda story, question, criteria, response_file: label_multi_responses_with_model(
             model_id.value, story, question, criteria, response_file
         ),
         inputs=[
     model_selector = gr.Dropdown(
         label="Select Model",
         choices=AVAILABLE_MODELS,
+        value=DEFAULT_MODEL_ID,
     )
     selected_model_id = gr.State(value=DEFAULT_MODEL_ID)
 if __name__ == "__main__":
+    iface.launch(share=True)