import os

import torch
from fastapi import FastAPI, HTTPException
from peft import PeftModel
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use all available CPU cores for inference (os.cpu_count() can return None).
torch.set_num_threads(os.cpu_count() or 1)

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Set the 'HF_TOKEN' environment variable.")

# Model name
# MODEL_NAME = "noideatt/Qwen2.5-0.5B-Writing"

# Load the base model and apply the fine-tuned adapter on top of it.
base_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it", token=HF_TOKEN)
model = PeftModel.from_pretrained(base_model, "noideatt/Gemma-3-1b-it-writing", token=HF_TOKEN)
model.to("cpu")
model = torch.compile(model)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it", token=HF_TOKEN)

# FastAPI app
app = FastAPI()


@app.get("/")
def home():
    return {"message": "Welcome to the Writing Scoring Service"}


class GenerateRequest(BaseModel):
    prompt: str


@app.post("/generate")
def generate(request: GenerateRequest):
    instruction = """You are an experienced IELTS Writing examiner specializing in Task 2 assessments. Evaluate candidates' essays based on the four official IELTS Writing Task 2 scoring criteria:
1. Coherence and Cohesion (CC) - Are ideas logically organized and connected with appropriate linking words?
    - Logical structure (LS)
    - Introduction & conclusion present (ICP)
    - Supported main points (SMP)
    - Accurate linking words (ALW)
    - Variety in linking words (VILW)
2. Lexical Resource (LR) - Is the vocabulary diverse, precise, and used appropriately?
    - Varied vocabulary (VV)
    - Accurate spelling & word formation (ASWF)
3. Grammatical Range and Accuracy (GRA) - Is the grammar varied and used correctly?
    - Mix of complex & simple sentences (MCSS)
    - Clear and correct grammar (CCG)
4. Task Response (TA) - Does the essay fully address all parts of the question with relevant, well-developed ideas?
    - Complete response (CR)
    - Clear & comprehensive ideas (CCI)
    - Relevant & specific examples (RSE)
Provide a band score from 4.0 to 9.0 for each criterion."""

    messages = [
        {"role": "system", "content": [{"type": "text", "text": instruction}]},
        {"role": "user", "content": [{"type": "text", "text": request.prompt}]},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cpu")
    outputs = model.generate(**inputs, max_new_tokens=250, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The fine-tuned adapter is expected to introduce its assessment with
    # "###Analysis:"; guard against outputs that lack the marker so the
    # endpoint returns a clean error instead of an unhandled IndexError.
    if "###Analysis:" not in text:
        raise HTTPException(status_code=500, detail="Model output missing '###Analysis:' marker.")
    response = text.split("###Analysis:")[1]
    return {"generated_text": response}

# Run using: uvicorn app:app --host 0.0.0.0 --port 8000
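# Example call against a running instance (a minimal sketch; assumes the
# service was started with the uvicorn command above and is reachable on
# localhost:8000 -- the essay text below is a hypothetical placeholder):
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "<full IELTS Task 2 essay text>"}'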