Spaces: jordigonzm
Commit 524455a • jordigonzm committed
1 Parent(s): 3fbcfce

Update app.py

app.py CHANGED
@@ -1,55 +1,132 @@

Old version (removed lines; only fragments were recoverable):

 import os
-from transformers import pipeline
 import torch
-import
-… (old lines 5-15 not recoverable)
 )
-… (old lines 17-31 not recoverable)
 }
-                ],
-                "id": "req-12345",  # Replace with a unique ID
-                "model": model_name,
-                "created": int(time.time())
-            }
-    return response

 # Gradio interface configuration
-… (old lines 41-52 not recoverable)
-    iface.launch()
-except Exception as e:
-    print(f"Error starting the interface: {e}")

New version:

import subprocess
import os
import time  # needed for time.time() in the streamed response metadata
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteriaList, StoppingCriteria
import gradio as gr

# Install required dependencies (skip the CUDA build of flash-attn)
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True
)

# Load the token from the environment variables
HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL_NAME = "google/gemma-2-27b-it"

# Titles and styles for the interface
TITLE = "<h1><center>Gemma Model Chat</center></h1>"
PLACEHOLDER = f'<h3><center>{MODEL_NAME} is an advanced model capable of generating detailed responses based on complex inputs.</center></h3>'
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
"""

# Load the model and the tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=HF_TOKEN
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=False)

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [tokenizer.eos_token_id]
        return any(input_ids[0][-1] == stop_id for stop_id in stop_ids)

def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
    print(f'Message: {message}')
    print(f'History: {history}')

    # Rebuild the conversation from the chat history and append the new message
    conversation = []
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    stop = StopOnTokens()

    # Format the whole conversation with the model's chat template
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(next(model.parameters()).device)
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        temperature=temperature,
        repetition_penalty=1.1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )

    # Run generation on a background thread and stream tokens as they arrive
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
    buffer = ""

    for new_token in streamer:
        if new_token:
            buffer += new_token
            # Emit the result in an OpenAI-compatible format
            yield {
                "choices": [
                    {
                        "text": buffer,
                        "index": 0,
                        "logprobs": None,
                        "finish_reason": "stop"
                    }
                ],
                "id": "req-12345",  # Replace with a unique ID if necessary
                "model": MODEL_NAME,
                "created": int(time.time())
            }

# Gradio interface configuration
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS) as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=1024,
                maximum=32768,
                step=1,
                value=4096,
                label="Max New Tokens",
                render=False,
            ),
        ],
        examples=[
            ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
            ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
            ["Tell me a random fun fact about the Roman Empire."],
            ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()
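
For reference, here is a minimal sketch of how the OpenAI-style chunks yielded by stream_chat could be consumed outside the Gradio UI. The module name app and the sample history are assumptions, not part of the commit; importing the module installs flash-attn and loads the full 27B model, so it only runs on suitably large hardware.

# Hypothetical driver (not part of the commit): consume stream_chat directly.
# Each yielded chunk carries the accumulated text in choices[0]["text"].
import app  # assumes the new file above is saved as app.py

history = [("Hello", "Hi! How can I help you today?")]
final_text = ""
for chunk in app.stream_chat("Summarize our conversation.", history,
                             temperature=0.5, max_new_tokens=128):
    final_text = chunk["choices"][0]["text"]

print(final_text)  # the fully accumulated response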
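
Once the Space is running, the same chat endpoint can also be reached remotely. Below is a sketch using gradio_client, assuming gr.ChatInterface's default /chat API route with the two slider values passed as extra inputs; the Space id is a placeholder, not the real one.

# Hypothetical client sketch: query the running Space over the Gradio API.
from gradio_client import Client

client = Client("jordigonzm/SPACE_NAME")  # replace with the actual Space id
result = client.predict(
    "Tell me a random fun fact about the Roman Empire.",  # message
    0.5,   # Temperature
    4096,  # Max New Tokens
    api_name="/chat",
)
print(result)  # final value yielded by stream_chat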