Update app.py
app.py CHANGED
@@ -22,21 +22,23 @@ print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")
 # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
 # ---------------------------------------------------------
 class OnnxBgeEmbeddings(Embeddings):
-
-
-
+    def __init__(self):
+        # FIX 1: Use the "Xenova/..." version, which has pre-exported ONNX weights.
+        # The official "BAAI/..." repo is PyTorch-only and fails with export=False.
+        model_name = "Xenova/bge-small-en-v1.5"
+        print(f"🚀 Loading Embeddings: {model_name}...")
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
         self.model = ORTModelForFeatureExtraction.from_pretrained(
             model_name,
-            export=False,
-            provider=PROVIDERS[0]
+            export=False,  # Now safe because the Xenova repo ships model.onnx
+            provider=PROVIDERS[0]
         )
 
     def _process_batch(self, texts):
         inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
 
-        # Move inputs to same device as model if needed (mostly handled by Optimum)
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
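FIX 1 is easy to sanity-check in isolation. A minimal sketch that mirrors the patch's call, assuming optimum[onnxruntime] and transformers are installed and substituting "CPUExecutionProvider" for the app's PROVIDERS[0]:

    from transformers import AutoTokenizer
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    import torch

    model_name = "Xenova/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = ORTModelForFeatureExtraction.from_pretrained(
        model_name,
        export=False,                     # repo ships pre-exported ONNX weights
        provider="CPUExecutionProvider",  # stand-in for PROVIDERS[0]
    )

    inputs = tokenizer(["hello world"], padding=True, truncation=True,
                       max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    cls = outputs.last_hidden_state[:, 0]                 # BGE pools the [CLS] token
    emb = torch.nn.functional.normalize(cls, p=2, dim=1)  # unit length for cosine search
    print(emb.shape)  # (1, 384) for bge-small

CLS-token pooling plus L2 normalization is the intended usage for the BGE family, so these vectors drop straight into cosine-similarity search.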
@@ -45,7 +47,6 @@ class OnnxBgeEmbeddings(Embeddings):
 
         embeddings = outputs.last_hidden_state[:, 0]
         embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-        # Detach from graph before converting to numpy
         return embeddings.cpu().numpy().tolist()
 
     def embed_documents(self, texts):
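The comment deleted above had gone stale rather than wrong: ONNX Runtime model outputs are plain tensors with no autograd graph attached, so .cpu().numpy() needs no preceding .detach().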
@@ -59,32 +60,37 @@ class OnnxBgeEmbeddings(Embeddings):
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
-        #
-        self.repo_id = "
+        # FIX 2: Correct repo ID for Qwen 2.5 ONNX
+        self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
 
         print(f"🚀 Preparing Ultra-Fast LLM: {self.repo_id}...")
 
         if not os.path.exists(self.local_dir):
-            print(f"📥 Downloading
-            #
-            snapshot_download(
+            print(f"📥 Downloading FP16 model + data to {self.local_dir}...")
+            # We download the 'onnx' subfolder specifically
+            snapshot_download(
+                repo_id=self.repo_id,
+                local_dir=self.local_dir,
+                allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
+            )
             print("✅ Download complete.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
 
-        #
+        # FIX 3: Point to the 'onnx' subfolder inside the downloaded directory
         self.model = ORTModelForCausalLM.from_pretrained(
             self.local_dir,
+            subfolder="onnx",
+            file_name="model_fp16.onnx",
             use_cache=True,
-            use_io_binding=True,
+            use_io_binding=True,
             provider=PROVIDERS[0]
         )
 
     def evaluate(self, context, question, student_answer, max_marks):
-        # Qwen uses ChatML format implicitly via tokenizer
         messages = [
-            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not
+            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not hallucinate."},
             {"role": "user", "content": f"""
 CONTEXT: {context}
 QUESTION: {question}
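Two details of FIX 2/3 worth noting: the trailing wildcard in "onnx/model_fp16.onnx*" also matches a possible external-data file alongside the graph (large ONNX exports often split weights into a separate file), and subfolder="onnx" / file_name="model_fp16.onnx" must agree with where snapshot_download placed things. A quick check of what actually landed on disk, assuming the constructor above has run:

    import os

    for root, _, files in os.walk("onnx_qwen_local"):
        for name in files:
            path = os.path.join(root, name)
            print(f"{path}  ({os.path.getsize(path) / 1e6:.1f} MB)")
    # Expect config.json, generation_config.json, the tokenizer files,
    # and onnx/model_fp16.onnx (plus external data, if the repo uses it).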
@@ -101,14 +107,13 @@ class LLMEvaluator:
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
 
-        # Move inputs for IO Binding
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_new_tokens=75,
+                max_new_tokens=75,
                 temperature=0.1,
                 do_sample=False
             )
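One behavioral note on this generation call: with do_sample=False decoding is greedy, so temperature=0.1 has no effect (recent transformers releases warn about exactly this combination). The decode step sits outside this hunk; the usual pattern, sketched here on the assumption the file does something similar, is to strip the echoed prompt before decoding:

    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]      # drop the echoed prompt tokens
    response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)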
@@ -117,13 +122,13 @@ class LLMEvaluator:
         return response
 
 # ---------------------------------------------------------
-# 3. Main Application Logic
+# 3. Main Application Logic
 # ---------------------------------------------------------
 class VectorSystem:
     def __init__(self):
         self.vector_store = None
-        self.embeddings = OnnxBgeEmbeddings()
-        self.llm = LLMEvaluator()
+        self.embeddings = OnnxBgeEmbeddings()
+        self.llm = LLMEvaluator()
         self.all_chunks = []
         self.total_chunks = 0
 
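Taken together, the three fixes can be exercised with a hypothetical end-to-end smoke test; the class names match the diff above, but the surrounding app wiring (embed_documents, the evaluate prompt plumbing) is assumed:

    system = VectorSystem()  # eagerly constructs OnnxBgeEmbeddings and LLMEvaluator
    vecs = system.embeddings.embed_documents(["Photosynthesis stores light energy as chemical energy."])
    print(len(vecs[0]))      # 384-dimensional, L2-normalized

    print(system.llm.evaluate(
        context="Photosynthesis converts light energy into chemical energy.",
        question="What does photosynthesis do?",
        student_answer="It turns light into chemical energy.",
        max_marks=5,
    ))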