Update app.py
app.py CHANGED
@@ -22,21 +22,23 @@ print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")
 # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
 # ---------------------------------------------------------
 class OnnxBgeEmbeddings(Embeddings):
-
-
-
+    def __init__(self):
+        # FIX 1: Use the "Xenova/..." version, which has pre-exported ONNX weights.
+        # The official "BAAI/..." repo is PyTorch-only and fails with export=False.
+        model_name = "Xenova/bge-small-en-v1.5"
+        print(f"🚀 Loading Embeddings: {model_name}...")
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
         self.model = ORTModelForFeatureExtraction.from_pretrained(
             model_name,
-            export=False,
-            provider=PROVIDERS[0]
+            export=False,  # Now safe because the Xenova repo ships model.onnx
+            provider=PROVIDERS[0]
         )
 
     def _process_batch(self, texts):
         inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
 
-        # Move inputs to same device as model if needed (mostly handled by Optimum)
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
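FIX 1 is easy to sanity-check in isolation. A minimal sketch that mirrors the patch's call, assuming optimum[onnxruntime] and transformers are installed and substituting "CPUExecutionProvider" for the app's PROVIDERS[0]:

    from transformers import AutoTokenizer
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    import torch

    model_name = "Xenova/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = ORTModelForFeatureExtraction.from_pretrained(
        model_name,
        export=False,                     # repo ships pre-exported ONNX weights
        provider="CPUExecutionProvider",  # stand-in for PROVIDERS[0]
    )

    inputs = tokenizer(["hello world"], padding=True, truncation=True,
                       max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    cls = outputs.last_hidden_state[:, 0]                 # BGE pools the [CLS] token
    emb = torch.nn.functional.normalize(cls, p=2, dim=1)  # unit length for cosine search
    print(emb.shape)  # (1, 384) for bge-small

CLS-token pooling plus L2 normalization is the intended usage for the BGE family, so these vectors drop straight into cosine-similarity search.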
@@ -45,7 +47,6 @@ class OnnxBgeEmbeddings(Embeddings):
 
         embeddings = outputs.last_hidden_state[:, 0]
         embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-        # Detach from graph before converting to numpy
         return embeddings.cpu().numpy().tolist()
 
     def embed_documents(self, texts):
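The comment deleted above had gone stale rather than wrong: ONNX Runtime model outputs are plain tensors with no autograd graph attached, so .cpu().numpy() needs no preceding .detach().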
@@ -59,32 +60,37 @@ class OnnxBgeEmbeddings(Embeddings):
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
-        #
-        self.repo_id = "
+        # FIX 2: Correct repo ID for Qwen 2.5 ONNX
+        self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
 
         print(f"🚀 Preparing Ultra-Fast LLM: {self.repo_id}...")
 
         if not os.path.exists(self.local_dir):
-            print(f"📥 Downloading
-            #
-            snapshot_download(
+            print(f"📥 Downloading FP16 model + data to {self.local_dir}...")
+            # We download the 'onnx' subfolder specifically
+            snapshot_download(
+                repo_id=self.repo_id,
+                local_dir=self.local_dir,
+                allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
+            )
             print("✅ Download complete.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
 
-        #
+        # FIX 3: Point to the 'onnx' subfolder inside the downloaded directory
         self.model = ORTModelForCausalLM.from_pretrained(
             self.local_dir,
+            subfolder="onnx",
+            file_name="model_fp16.onnx",
             use_cache=True,
-            use_io_binding=True,
+            use_io_binding=True,
             provider=PROVIDERS[0]
         )
 
     def evaluate(self, context, question, student_answer, max_marks):
-        # Qwen uses ChatML format implicitly via tokenizer
         messages = [
-            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not
+            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not hallucinate."},
             {"role": "user", "content": f"""
 CONTEXT: {context}
 QUESTION: {question}
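Two details of FIX 2/3 worth noting: the trailing wildcard in "onnx/model_fp16.onnx*" also matches a possible external-data file alongside the graph (large ONNX exports often split weights into a separate file), and subfolder="onnx" / file_name="model_fp16.onnx" must agree with where snapshot_download placed things. A quick check of what actually landed on disk, assuming the constructor above has run:

    import os

    for root, _, files in os.walk("onnx_qwen_local"):
        for name in files:
            path = os.path.join(root, name)
            print(f"{path}  ({os.path.getsize(path) / 1e6:.1f} MB)")
    # Expect config.json, generation_config.json, the tokenizer files,
    # and onnx/model_fp16.onnx (plus external data, if the repo uses it).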
@@ -101,14 +107,13 @@ class LLMEvaluator:
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
 
-        # Move inputs for IO Binding
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_new_tokens=75,
+                max_new_tokens=75,
                 temperature=0.1,
                 do_sample=False
             )
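One behavioral note on this generation call: with do_sample=False decoding is greedy, so temperature=0.1 has no effect (recent transformers releases warn about exactly this combination). The decode step sits outside this hunk; the usual pattern, sketched here on the assumption the file does something similar, is to strip the echoed prompt before decoding:

    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]      # drop the echoed prompt tokens
    response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)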
@@ -117,13 +122,13 @@ class LLMEvaluator:
         return response
 
 # ---------------------------------------------------------
-# 3. Main Application Logic
+# 3. Main Application Logic
 # ---------------------------------------------------------
 class VectorSystem:
     def __init__(self):
         self.vector_store = None
-        self.embeddings = OnnxBgeEmbeddings()
-        self.llm = LLMEvaluator()
+        self.embeddings = OnnxBgeEmbeddings()
+        self.llm = LLMEvaluator()
         self.all_chunks = []
         self.total_chunks = 0
 
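Taken together, the three fixes can be exercised with a hypothetical end-to-end smoke test; the class names match the diff above, but the surrounding app wiring (embed_documents, the evaluate prompt plumbing) is assumed:

    system = VectorSystem()  # eagerly constructs OnnxBgeEmbeddings and LLMEvaluator
    vecs = system.embeddings.embed_documents(["Photosynthesis stores light energy as chemical energy."])
    print(len(vecs[0]))      # 384-dimensional, L2-normalized

    print(system.llm.evaluate(
        context="Photosynthesis converts light energy into chemical energy.",
        question="What does photosynthesis do?",
        student_answer="It turns light into chemical energy.",
        max_marks=5,
    ))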