heerjtdev committed
Commit ccdc2fe · verified · 1 Parent(s): 2c19d14

Update app.py

Files changed (1):
  app.py +26 -21
app.py CHANGED
@@ -22,21 +22,23 @@ print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")
 # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
 # ---------------------------------------------------------
 class OnnxBgeEmbeddings(Embeddings):
-    # CHANGE 1: Switched to 'bge-small' (3x faster than large, similar accuracy)
-    def __init__(self, model_name="BAAI/bge-small-en-v1.5"):
-        print(f"🔄 Loading Faster Embeddings: {model_name}...")
+    def __init__(self):
+        # FIX 1: Use "Xenova/..." version which has pre-exported ONNX weights.
+        # The official "BAAI/..." repo is PyTorch-only and fails with export=False.
+        model_name = "Xenova/bge-small-en-v1.5"
+        print(f"🔄 Loading Embeddings: {model_name}...")
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
         self.model = ORTModelForFeatureExtraction.from_pretrained(
             model_name,
-            export=False,
-            provider=PROVIDERS[0] # Auto-select best hardware (CUDA/CoreML)
+            export=False, # Now safe because Xenova repo has model.onnx
+            provider=PROVIDERS[0]
         )
 
     def _process_batch(self, texts):
         inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
 
-        # Move inputs to same device as model if needed (mostly handled by Optimum)
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
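Why FIX 1 works: export=False tells Optimum to load an existing ONNX graph rather than convert one at load time, so the repo must already contain exported weights. A minimal sketch of the two loading paths, assuming optimum[onnxruntime] is installed (exact ONNX file discovery varies between Optimum versions):

    from optimum.onnxruntime import ORTModelForFeatureExtraction

    # Path taken by the fix: the Xenova mirror ships ready-made ONNX weights,
    # so nothing is exported at startup.
    model = ORTModelForFeatureExtraction.from_pretrained(
        "Xenova/bge-small-en-v1.5",
        export=False,
    )

    # Alternative: the official PyTorch-only repo also loads, but only with
    # export=True, which converts the model on the fly (slower first start).
    model = ORTModelForFeatureExtraction.from_pretrained(
        "BAAI/bge-small-en-v1.5",
        export=True,
    )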
 
@@ -45,7 +47,6 @@ class OnnxBgeEmbeddings(Embeddings):
 
         embeddings = outputs.last_hidden_state[:, 0]
         embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-        # Detach from graph before converting to numpy
         return embeddings.cpu().numpy().tolist()
 
     def embed_documents(self, texts):
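Since the CLS-token embeddings above are L2-normalized before being returned, cosine similarity between any two returned vectors reduces to a plain dot product. A tiny illustration with stand-in unit vectors (numpy only):

    import numpy as np

    # Stand-ins for two vectors returned by OnnxBgeEmbeddings; both are unit
    # length, exactly as the normalize(..., p=2, dim=1) call above guarantees.
    a = np.array([0.6, 0.8])
    b = np.array([0.8, 0.6])
    cosine = float(np.dot(a, b))  # equals cosine similarity since ||a|| = ||b|| = 1
    print(cosine)  # 0.96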
@@ -59,32 +60,37 @@ class OnnxBgeEmbeddings(Embeddings):
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
-        # CHANGE 2: Switched to Qwen 2.5 0.5B (Half the size of Llama 1B, very smart)
-        self.repo_id = "Xenova/Qwen2.5-0.5B-Instruct"
+        # FIX 2: Correct Repo ID for Qwen 2.5 ONNX
+        self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
 
         print(f"🔄 Preparing Ultra-Fast LLM: {self.repo_id}...")
 
         if not os.path.exists(self.local_dir):
-            print(f"📥 Downloading Model to {self.local_dir}...")
-            # Note: Xenova repos usually have the ONNX ready, no complex wildcard needed
-            snapshot_download(repo_id=self.repo_id, local_dir=self.local_dir)
+            print(f"📥 Downloading FP16 model + data to {self.local_dir}...")
+            # We download the 'onnx' subfolder specifically
+            snapshot_download(
+                repo_id=self.repo_id,
+                local_dir=self.local_dir,
+                allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
+            )
             print("✅ Download complete.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
 
-        # CHANGE 3: Enabled IO Binding + Explicit Provider
+        # FIX 3: Point to the 'onnx' subfolder inside the downloaded directory
         self.model = ORTModelForCausalLM.from_pretrained(
             self.local_dir,
+            subfolder="onnx",
+            file_name="model_fp16.onnx",
             use_cache=True,
-            use_io_binding=True, # CHANGE: Major speedup on GPU
+            use_io_binding=True,
             provider=PROVIDERS[0]
         )
 
     def evaluate(self, context, question, student_answer, max_marks):
-        # Qwen uses ChatML format implicitly via tokenizer
         messages = [
-            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not halluncinate."},
+            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not hallucinate."},
             {"role": "user", "content": f"""
 CONTEXT: {context}
 QUESTION: {question}
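Given the allow_patterns above, onnx_qwen_local should end up containing the config and tokenizer files plus onnx/model_fp16.onnx (the trailing wildcard also picks up an external-data file if the repo stores weights separately). A hypothetical pre-flight check before loading, with paths assumed from those patterns:

    import os

    local_dir = "onnx_qwen_local"  # same folder the diff downloads into
    expected = ["config.json", os.path.join("onnx", "model_fp16.onnx")]
    missing = [p for p in expected if not os.path.exists(os.path.join(local_dir, p))]
    if missing:
        raise FileNotFoundError(f"Incomplete download, missing: {missing}")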
@@ -101,14 +107,13 @@ class LLMEvaluator:
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
 
-        # Move inputs for IO Binding
         device = self.model.device
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_new_tokens=75, # CHANGE 4: Reduced tokens (we only need a short score/feedback)
+                max_new_tokens=75,
                 temperature=0.1,
                 do_sample=False
             )
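One caveat on the generation call above: with do_sample=False, transformers performs greedy decoding and temperature has no effect (recent versions warn about the unused flag). If deterministic short feedback is the intent, the argument could be dropped entirely, e.g.:

    # Equivalent greedy call without the ignored sampling flag:
    outputs = self.model.generate(**inputs, max_new_tokens=75, do_sample=False)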
@@ -117,13 +122,13 @@ class LLMEvaluator:
         return response
 
 # ---------------------------------------------------------
-# 3. Main Application Logic (Unchanged but uses new classes)
+# 3. Main Application Logic
 # ---------------------------------------------------------
 class VectorSystem:
     def __init__(self):
         self.vector_store = None
-        self.embeddings = OnnxBgeEmbeddings() # Uses new BGE-Small
-        self.llm = LLMEvaluator() # Uses new Qwen 0.5B
+        self.embeddings = OnnxBgeEmbeddings()
+        self.llm = LLMEvaluator()
         self.all_chunks = []
         self.total_chunks = 0
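A minimal smoke test one might run after this fix, assuming app.py exposes the classes exactly as in the hunks above (the argument values are illustrative):

    from app import LLMEvaluator  # importing app.py also triggers model setup

    evaluator = LLMEvaluator()
    feedback = evaluator.evaluate(
        context="Photosynthesis converts light energy into chemical energy.",
        question="What does photosynthesis convert?",
        student_answer="Light energy into chemical energy.",
        max_marks=5,
    )
    print(feedback)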