nryadav18 commited on
Commit
b3b1b91
·
verified ·
1 Parent(s): 00651e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -17
app.py CHANGED
@@ -1,44 +1,48 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
 
4
 
5
  app = FastAPI()
6
 
7
- # Download and initialize the model when the server starts
8
  llm = Llama.from_pretrained(
9
- repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF", # Make sure this says 0.5B!
10
- filename="*q4_k_m.gguf",
11
- n_ctx=2048,
12
- n_threads=2,
13
- n_batch=512
 
 
 
 
14
  )
15
 
16
  class EvalRequest(BaseModel):
17
  task_description: str
18
  python_code: str
19
 
20
- # --- ADDED HEALTH CHECK ROUTE HERE ---
21
  @app.get("/")
22
  async def health_check():
23
- return {"status": "Online", "message": "AI Code Evaluator is running! Send POST requests to /evaluate"}
24
- # -------------------------------------
25
 
26
  @app.post("/evaluate")
27
  async def evaluate_code(request: EvalRequest):
28
- prompt = f"Task Context:\n{request.task_description}\n\nStudent Code:\n{request.python_code}"
 
29
 
 
 
30
  response = llm.create_chat_completion(
31
  messages=[
32
- {
33
- "role": "system",
34
- "content": "You are a friendly Python grader. Output ONLY valid JSON."
35
- },
36
  {"role": "user", "content": prompt}
37
  ],
38
- max_tokens=512,
39
  temperature=0.1,
40
- # THIS IS THE MAGIC LINE:
 
41
  response_format={"type": "json_object"}
42
  )
43
 
44
- return {"evaluation": response['choices'][0]['message']['content']}
 
1
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import llama_cpp  # needed for the GGML_TYPE_* constants used below

app = FastAPI()

# --- HIGHEST OPTIMIZATION FOR 1.5B ---
# Load the GGUF model once at server startup so every request reuses the
# same in-memory instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="*q4_k_m.gguf",          # 4-bit (Q4_K_M) quantized weights
    n_ctx=1024,                       # context window: prompt + completion tokens
    n_threads=2,                      # match your physical cores
    n_batch=512,
    flash_attn=True,
    # BUG FIX: the parameter is `use_mlock`, not `n_mlock`. Llama.__init__
    # silently swallows unknown keywords via **kwargs, so the old spelling
    # did nothing. mlock keeps the model resident in RAM for consistent speed.
    use_mlock=True,
    type_k=llama_cpp.GGML_TYPE_Q4_0,  # 4-bit KV cache for faster processing
    verbose=False,
)
20
 
21
class EvalRequest(BaseModel):
    """Request payload for /evaluate: the assignment text plus the student's code."""

    task_description: str  # what the student was asked to implement
    python_code: str       # the submission to be graded
24
 
 
25
  @app.get("/")
26
  async def health_check():
27
+ return {"status": "Online", "message": "Optimized 1.5B Evaluator Ready"}
 
28
 
29
  @app.post("/evaluate")
30
  async def evaluate_code(request: EvalRequest):
31
+ # Minimalist prompt for faster processing
32
+ prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
33
 
34
+ system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'
35
+
36
  response = llm.create_chat_completion(
37
  messages=[
38
+ {"role": "system", "content": system_prompt},
 
 
 
39
  {"role": "user", "content": prompt}
40
  ],
41
+ max_tokens=400,
42
  temperature=0.1,
43
+ repeat_penalty=1.1,
44
+ stop=["}"], # STOP IMMEDIATELY when JSON closes
45
  response_format={"type": "json_object"}
46
  )
47
 
48
+ return {"evaluation": response['choices'][0]['message']['content']}