jerinaj committed on
Commit
47d6d47
·
1 Parent(s): 587fdf0
Files changed (1) hide show
  1. app.py +9 -1
app.py CHANGED
@@ -21,7 +21,9 @@ hf_token = os.environ.get("HF_TOKEN")
21
  if hf_token:
22
  huggingface_hub.login(token=hf_token)
23
 
24
- # Export model to OpenVINO format on first run if not already done
 
 
25
  if not os.path.isdir(OV_MODEL_DIR):
26
  print(f"OpenVINO model not found at '{OV_MODEL_DIR}', exporting now...")
27
  subprocess.run(
@@ -29,6 +31,7 @@ if not os.path.isdir(OV_MODEL_DIR):
29
  "optimum-cli", "export", "openvino",
30
  "--model", model_name,
31
  "--task", "text-generation-with-past",
 
32
  OV_MODEL_DIR + "/",
33
  ],
34
  check=True,
@@ -169,6 +172,11 @@ async def generate(request: Request):
169
 
170
  prompt = build_prompt(messages, tools)
171
  inputs = tokenizer(prompt, return_tensors="pt")
 
 
 
 
 
172
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, use_cache=True)
173
 
174
  prompt_tokens = inputs["input_ids"].shape[-1]
 
21
  if hf_token:
22
  huggingface_hub.login(token=hf_token)
23
 
24
+ # Export model to OpenVINO format on first run if not already done.
25
+ # --disable-stateful avoids the static sliding-window shape (512) that gets
26
+ # baked in during tracing, which causes shape mismatches for long prompts.
27
  if not os.path.isdir(OV_MODEL_DIR):
28
  print(f"OpenVINO model not found at '{OV_MODEL_DIR}', exporting now...")
29
  subprocess.run(
 
31
  "optimum-cli", "export", "openvino",
32
  "--model", model_name,
33
  "--task", "text-generation-with-past",
34
+ "--disable-stateful",
35
  OV_MODEL_DIR + "/",
36
  ],
37
  check=True,
 
172
 
173
  prompt = build_prompt(messages, tools)
174
  inputs = tokenizer(prompt, return_tensors="pt")
175
+
176
+ # Truncate from the left if prompt exceeds model's context window (8192 tokens).
177
+ MAX_INPUT_TOKENS = 8192 - max_new_tokens
178
+ if inputs["input_ids"].shape[-1] > MAX_INPUT_TOKENS:
179
+ inputs = {k: v[:, -MAX_INPUT_TOKENS:] for k, v in inputs.items()}
180
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, use_cache=True)
181
 
182
  prompt_tokens = inputs["input_ids"].shape[-1]