albertoo85 commited on
Commit
588bbfc
verified
1 Parent(s): 1dc38ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -17
app.py CHANGED
@@ -1,37 +1,39 @@
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
- import psutil
5
  import os
6
 
7
- def get_ram():
8
- return f"RAM usada: {psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024):.2f} MB"
9
-
10
- print("Descargando modelo GGUF...")
11
  model_path = hf_hub_download(
12
  repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
13
  filename="qwen2.5-7b-instruct-q4_k_m.gguf"
14
  )
15
 
16
- print("Cargando modelo en memoria...")
17
- llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
 
 
 
 
 
 
18
 
19
  def predict(message, system_prompt="Responde en espa帽ol."):
20
- prompt = f"<|im_start|>system\n{system_prompt} ({get_ram()})<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
21
- output = llm(prompt, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"], echo=False)
22
  return output['choices'][0]['text']
23
 
24
  with gr.Blocks() as demo:
25
- gr.Markdown("# Nodo API Qwen2.5-7B")
26
  with gr.Row():
27
- with gr.Column():
28
- input_text = gr.Textbox(label="Mensaje", placeholder="Escribe aqu铆...")
29
- sys_text = gr.Textbox(label="System Prompt", value="Eres un asistente 煤til.")
30
- btn = gr.Button("Enviar", variant="primary")
31
- with gr.Column():
32
- output_text = gr.Textbox(label="Respuesta")
33
 
34
- btn.click(predict, inputs=[input_text, sys_text], outputs=output_text, api_name="query")
 
35
 
36
  if __name__ == "__main__":
37
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
 
4
  import os
5
 
6
# --- Model configuration ---------------------------------------------------
# Fetch the quantized GGUF weights from the Hub (cached after the first run).
print("Descargando Qwen2.5-7B GGUF...")
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q4_k_m.gguf",
)

# Load with a reduced context window to keep RAM usage low on the free Space.
print("Cargando modelo...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,    # reduced from 2048 for stability on limited memory
    n_threads=4,   # make better use of the Space CPU
    n_batch=512,
)
21
 
22
def predict(message, system_prompt="Responde en espa帽ol."):
    """Run one chat turn through the local llama.cpp model.

    Builds a ChatML-formatted prompt from *system_prompt* and *message*,
    generates up to 512 tokens, and returns the raw completion text.
    """
    chatml = (
        "<|im_start|>system\n"
        f"{system_prompt}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{message}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    result = llm(chatml, max_tokens=512, stop=["<|im_end|>"], echo=False)
    return result['choices'][0]['text']
26
 
27
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 2.5 API Node")
    with gr.Row():
        msg_box = gr.Textbox(label="Input")
        sys_box = gr.Textbox(label="System Prompt", value="Responde en espa帽ol de forma concisa.")
        out_box = gr.Textbox(label="Output")
        send_btn = gr.Button("Enviar")

    # Named endpoint so an external client (the VM) can call this via the API.
    send_btn.click(predict, inputs=[msg_box, sys_box], outputs=out_box, api_name="query")

if __name__ == "__main__":
    # Bind to all interfaces so the Space's reverse proxy can reach the server.
    demo.launch(server_name="0.0.0.0", server_port=7860)