albertoo85 commited on
Commit
588bbfc
verified
1 Parent(s): 1dc38ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -17
app.py CHANGED
@@ -1,37 +1,39 @@
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
- import psutil
5
  import os
6
 
7
- def get_ram():
8
- return f"RAM usada: {psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024):.2f} MB"
9
-
10
- print("Descargando modelo GGUF...")
11
  model_path = hf_hub_download(
12
  repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
13
  filename="qwen2.5-7b-instruct-q4_k_m.gguf"
14
  )
15
 
16
- print("Cargando modelo en memoria...")
17
- llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
 
 
 
 
 
 
18
 
19
  def predict(message, system_prompt="Responde en espa帽ol."):
20
- prompt = f"<|im_start|>system\n{system_prompt} ({get_ram()})<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
21
- output = llm(prompt, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"], echo=False)
22
  return output['choices'][0]['text']
23
 
24
  with gr.Blocks() as demo:
25
- gr.Markdown("# Nodo API Qwen2.5-7B")
26
  with gr.Row():
27
- with gr.Column():
28
- input_text = gr.Textbox(label="Mensaje", placeholder="Escribe aqu铆...")
29
- sys_text = gr.Textbox(label="System Prompt", value="Eres un asistente 煤til.")
30
- btn = gr.Button("Enviar", variant="primary")
31
- with gr.Column():
32
- output_text = gr.Textbox(label="Respuesta")
33
 
34
- btn.click(predict, inputs=[input_text, sys_text], outputs=output_text, api_name="query")
 
35
 
36
  if __name__ == "__main__":
37
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
 
4
  import os
5
 
6
# --- Model configuration ---------------------------------------------------
# Fetch the quantized GGUF weights from the Hub (cached after the first run).
print("Descargando Qwen2.5-7B GGUF...")
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q4_k_m.gguf",
)

# Load with a reduced context window to keep RAM usage low on the free Space.
print("Cargando modelo...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,    # reduced from 2048 for stability on limited memory
    n_threads=4,   # make better use of the Space CPU
    n_batch=512,
)
21
 
22
def predict(message, system_prompt="Responde en espa帽ol."):
    """Run one chat turn through the local llama.cpp model.

    Builds a ChatML-formatted prompt from *system_prompt* and *message*,
    generates up to 512 tokens, and returns the raw completion text.
    """
    chatml = (
        "<|im_start|>system\n"
        f"{system_prompt}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{message}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    result = llm(chatml, max_tokens=512, stop=["<|im_end|>"], echo=False)
    return result['choices'][0]['text']
26
 
27
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 2.5 API Node")
    with gr.Row():
        msg_box = gr.Textbox(label="Input")
        sys_box = gr.Textbox(label="System Prompt", value="Responde en espa帽ol de forma concisa.")
        out_box = gr.Textbox(label="Output")
        send_btn = gr.Button("Enviar")

    # Named endpoint so an external client (the VM) can call this via the API.
    send_btn.click(predict, inputs=[msg_box, sys_box], outputs=out_box, api_name="query")

if __name__ == "__main__":
    # Bind to all interfaces so the Space's reverse proxy can reach the server.
    demo.launch(server_name="0.0.0.0", server_port=7860)