decula committed on
Commit
8ecf1c8
·
1 Parent(s): 7a88d5b

Added safetensors

Browse files
Files changed (1) hide show
  1. qianwen_rag.py +17 -9
qianwen_rag.py CHANGED
@@ -13,7 +13,13 @@ HAS_GPU = False
13
  # Model title and context size limit
14
  ctx_limit = 20000
15
  title = "Qwen2-72B-Instruct-2.0bpw-h-novel-exl2 with RAG"
16
- model_repo = "Orion-zhen/Qwen2-72B-Instruct-2.0bpw-h-novel-exl2"
 
 
 
 
 
 
17
 
18
  # Get the GPU count
19
  try:
@@ -31,20 +37,22 @@ try:
31
  except NVMLError as error:
32
  print(error)
33
 
34
- # Load the model using transformers
35
- print(f"正在加载模型: {model_repo}")
36
-
37
  # 设置设备配置
38
  device = "cpu"
39
  if HAS_GPU:
40
  device = "cuda"
41
 
42
- # 加载模型和分词器
43
- tokenizer = AutoTokenizer.from_pretrained(model_repo)
44
- model = AutoModelForCausalLM.from_pretrained(model_repo)
 
 
 
 
 
 
45
 
46
- # 将模型移动到适当的设备
47
- model = model.to(device)
48
 
49
  # 理解问题并提取关键词的函数
50
  async def understanding_question(question: str):
 
13
  # Model title and context size limit
14
  ctx_limit = 20000
15
  title = "Qwen2-72B-Instruct-2.0bpw-h-novel-exl2 with RAG"
16
+
17
+ # 设置模型文件路径
18
+ model_files = [
19
+ "output-00001-of-00003.safetensors",
20
+ "output-00002-of-00003.safetensors",
21
+ "output-00003-of-00003.safetensors"
22
+ ]
23
 
24
  # Get the GPU count
25
  try:
 
37
  except NVMLError as error:
38
  print(error)
39
 
 
 
 
40
  # 设置设备配置
41
  device = "cpu"
42
  if HAS_GPU:
43
  device = "cuda"
44
 
45
+ print("正在加载模型文件...")
46
+
47
+ # 直接从本地文件加载模型
48
+ tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
49
+ model = AutoModelForCausalLM.from_pretrained(
50
+ ".",
51
+ local_files_only=True,
52
+ device_map=device
53
+ )
54
 
55
+ print("模型加载完成")
 
56
 
57
  # 理解问题并提取关键词的函数
58
  async def understanding_question(question: str):