Sgridda committed on
Commit
8e65098
·
1 Parent(s): fe2db02

Fix quantization for CPU by using BitsAndBytesConfig

Browse files
Files changed (1) hide show
  1. main.py +22 -57
main.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
4
  import torch
5
  import re
6
  import json
@@ -9,9 +10,9 @@ import json
9
  # 1. Configuration
10
  # ----------------------------
11
 
12
- # Define the model we want to use.
13
- # We use a 4-bit quantized version ("4bit") for efficiency.
14
  MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"
 
 
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
  # ----------------------------
@@ -28,8 +29,6 @@ app = FastAPI(
28
  # 3. AI Model Loading
29
  # ----------------------------
30
 
31
- # Use a global variable to hold the model and tokenizer
32
- # This is lazy-loaded on the first request to speed up server startup.
33
  model = None
34
  tokenizer = None
35
 
@@ -39,13 +38,23 @@ def load_model():
39
  if model is None:
40
  print(f"Loading model: {MODEL_NAME} on device: {DEVICE}...")
41
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
42
-
43
- # Load the model with 4-bit quantization to save memory
 
 
 
 
 
 
 
 
 
 
44
  model = AutoModelForCausalLM.from_pretrained(
45
  MODEL_NAME,
46
  trust_remote_code=True,
47
- torch_dtype=torch.bfloat16,
48
- load_in_4bit=True,
49
  )
50
  print("Model loaded successfully.")
51
 
@@ -53,7 +62,6 @@ def load_model():
53
  async def startup_event():
54
  """
55
  On server startup, we trigger the model loading.
56
- This makes the first API call after startup faster.
57
  """
58
  print("Server starting up...")
59
  load_model()
@@ -63,17 +71,14 @@ async def startup_event():
63
  # ----------------------------
64
 
65
  class ReviewRequest(BaseModel):
66
- """The request body for the /review endpoint."""
67
  diff: str
68
 
69
  class ReviewComment(BaseModel):
70
- """A single review comment."""
71
  file_path: str
72
  line_number: int
73
  comment_text: str
74
 
75
  class ReviewResponse(BaseModel):
76
- """The response body for the /review endpoint."""
77
  comments: list[ReviewComment]
78
 
79
  # ----------------------------
@@ -87,37 +92,10 @@ def run_ai_inference(diff: str) -> str:
87
  if not model or not tokenizer:
88
  raise RuntimeError("Model is not loaded.")
89
 
90
- # This is the prompt engineering part. We create a clear instruction
91
- # for the model, telling it exactly what to do and what format to output.
92
  messages = [
93
  {
94
  "role": "system",
95
- "content": """
96
- You are an expert code reviewer. Your task is to analyze a pull request diff and provide constructive feedback.
97
- Analyze the provided diff and identify potential issues, suggest improvements, or point out good practices.
98
- Your feedback should be in the form of review comments.
99
-
100
- IMPORTANT: Respond with a JSON array of comment objects. Each object must have three fields: 'file_path', 'line_number', and 'comment_text'.
101
- The 'file_path' should be the full path of the file being changed.
102
- The 'line_number' must be an integer corresponding to the line number in the *new* version of the file where the comment applies.
103
- The 'comment_text' should be your concise and clear review comment.
104
-
105
- Example response format:
106
- [
107
- {
108
- "file_path": "src/utils/helpers.py",
109
- "line_number": 42,
110
- "comment_text": "This function could be simplified by using a list comprehension."
111
- },
112
- {
113
- "file_path": "README.md",
114
- "line_number": 12,
115
- "comment_text": "There is a typo in this sentence."
116
- }
117
- ]
118
-
119
- Do not add any introductory text or explanations outside of the JSON array.
120
- """
121
  },
122
  {
123
  "role": "user",
@@ -125,23 +103,20 @@ Do not add any introductory text or explanations outside of the JSON array.
125
  }
126
  ]
127
 
128
- inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
 
129
 
130
- # Generate the response from the model
131
  outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
132
 
133
- # Decode the output and clean it up
134
  response_text = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
135
  return response_text.strip()
136
 
137
  def parse_ai_response(response_text: str) -> list[ReviewComment]:
138
  """
139
  Parses the raw text from the AI to extract the JSON array.
140
- This function is robust against the AI adding extra text before or after the JSON.
141
  """
142
  print(f"Raw AI Response:\n---\n{response_text}\n---")
143
 
144
- # Find the start and end of the JSON array
145
  json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
146
  if not json_match:
147
  print("Warning: Could not find a JSON array in the AI response.")
@@ -151,7 +126,6 @@ def parse_ai_response(response_text: str) -> list[ReviewComment]:
151
 
152
  try:
153
  comments_data = json.loads(json_string)
154
- # Validate the structure of the parsed data
155
  validated_comments = [ReviewComment(**item) for item in comments_data]
156
  return validated_comments
157
  except (json.JSONDecodeError, TypeError, KeyError) as e:
@@ -165,20 +139,12 @@ def parse_ai_response(response_text: str) -> list[ReviewComment]:
165
 
166
  @app.post("/review", response_model=ReviewResponse)
167
  async def get_code_review(request: ReviewRequest):
168
- """
169
- Receives a code diff, gets a review from the AI model,
170
- and returns structured review comments.
171
- """
172
  if not request.diff:
173
  raise HTTPException(status_code=400, detail="Diff content cannot be empty.")
174
 
175
  try:
176
- # 1. Run the AI model
177
  ai_response_text = run_ai_inference(request.diff)
178
-
179
- # 2. Parse the AI's response into structured objects
180
  parsed_comments = parse_ai_response(ai_response_text)
181
-
182
  return ReviewResponse(comments=parsed_comments)
183
 
184
  except Exception as e:
@@ -191,5 +157,4 @@ async def get_code_review(request: ReviewRequest):
191
 
192
  @app.get("/health")
193
  async def health_check():
194
- """A simple endpoint to confirm the server is running."""
195
- return {"status": "ok", "model_loaded": model is not None}
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
+ # We now import BitsAndBytesConfig to specify our quantization settings
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
5
  import torch
6
  import re
7
  import json
 
10
  # 1. Configuration
11
  # ----------------------------
12
 
 
 
13
# Model checkpoint used for code review generation.
MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"
# Device placement is handled automatically by device_map="auto" at load time;
# DEVICE is kept only for logging.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
  # ----------------------------
 
29
  # 3. AI Model Loading
30
  # ----------------------------
31
 
 
 
32
# Lazily-populated globals for the model and tokenizer; they stay None until
# the loader runs (triggered on server startup), keeping import time cheap.
model = None
tokenizer = None
34
 
 
38
  if model is None:
39
  print(f"Loading model: {MODEL_NAME} on device: {DEVICE}...")
40
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
41
+
42
+ # FIX: Define the quantization configuration for 4-bit loading.
43
+ # We explicitly set bnb_4bit_quant_type to "nf4", which is required for CPU execution.
44
+ quantization_config = BitsAndBytesConfig(
45
+ load_in_4bit=True,
46
+ bnb_4bit_quant_type="nf4",
47
+ bnb_4bit_compute_dtype=torch.bfloat16,
48
+ bnb_4bit_use_double_quant=False,
49
+ )
50
+
51
+ # Load the model with the specified quantization config.
52
+ # We also use device_map="auto" to let transformers handle device placement.
53
  model = AutoModelForCausalLM.from_pretrained(
54
  MODEL_NAME,
55
  trust_remote_code=True,
56
+ quantization_config=quantization_config,
57
+ device_map="auto", # This is crucial for bitsandbytes to work correctly
58
  )
59
  print("Model loaded successfully.")
60
 
 
62
async def startup_event():
    """On server startup, trigger the model loading.

    Loading eagerly here (rather than on the first request) means the first
    API call does not pay the multi-second model-load cost.
    """
    print("Server starting up...")
    load_model()
 
71
  # ----------------------------
72
 
73
class ReviewRequest(BaseModel):
    """Request body for POST /review: the raw unified diff to analyze."""

    # Unified diff text of the pull request under review.
    diff: str
75
 
76
class ReviewComment(BaseModel):
    """A single structured review comment produced by the model."""

    # Full path of the file the comment applies to.
    file_path: str
    # 1-based line number in the *new* version of the file.
    line_number: int
    # The reviewer-style comment text.
    comment_text: str
80
 
81
class ReviewResponse(BaseModel):
    """Response body for POST /review: all generated review comments."""

    comments: list[ReviewComment]
83
 
84
  # ----------------------------
 
92
  if not model or not tokenizer:
93
  raise RuntimeError("Model is not loaded.")
94
 
 
 
95
  messages = [
96
  {
97
  "role": "system",
98
+ "content": """You are an expert code reviewer. Your task is to analyze a pull request diff and provide constructive feedback.\nAnalyze the provided diff and identify potential issues, suggest improvements, or point out good practices.\n\nIMPORTANT: Respond with a JSON array of comment objects. Each object must have three fields: 'file_path', 'line_number', and 'comment_text'.\nThe 'file_path' should be the full path of the file being changed.\nThe 'line_number' must be an integer corresponding to the line number in the *new* version of the file where the comment applies.\nThe 'comment_text' should be your concise and clear review comment.\n\nExample response format:\n[\n {\n "file_path": "src/utils/helpers.py",\n "line_number": 42,\n "comment_text": "This function could be simplified by using a list comprehension."\n }\n]\n\nDo not add any introductory text or explanations outside of the JSON array.\n"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  },
100
  {
101
  "role": "user",
 
103
  }
104
  ]
105
 
106
+ # Note: We don't need to manually move inputs to a device when using device_map="auto"
107
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
108
 
 
109
  outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
110
 
 
111
  response_text = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
112
  return response_text.strip()
113
 
114
  def parse_ai_response(response_text: str) -> list[ReviewComment]:
115
  """
116
  Parses the raw text from the AI to extract the JSON array.
 
117
  """
118
  print(f"Raw AI Response:\n---\n{response_text}\n---")
119
 
 
120
  json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
121
  if not json_match:
122
  print("Warning: Could not find a JSON array in the AI response.")
 
126
 
127
  try:
128
  comments_data = json.loads(json_string)
 
129
  validated_comments = [ReviewComment(**item) for item in comments_data]
130
  return validated_comments
131
  except (json.JSONDecodeError, TypeError, KeyError) as e:
 
139
 
140
  @app.post("/review", response_model=ReviewResponse)
141
  async def get_code_review(request: ReviewRequest):
 
 
 
 
142
  if not request.diff:
143
  raise HTTPException(status_code=400, detail="Diff content cannot be empty.")
144
 
145
  try:
 
146
  ai_response_text = run_ai_inference(request.diff)
 
 
147
  parsed_comments = parse_ai_response(ai_response_text)
 
148
  return ReviewResponse(comments=parsed_comments)
149
 
150
  except Exception as e:
 
157
 
158
@app.get("/health")
async def health_check():
    """Liveness probe: reports server status and whether the model is loaded."""
    return {"status": "ok", "model_loaded": model is not None}