ranamhamoud committed
Commit b75125a • 1 Parent(s): cf9b3fc

Update app.py

Files changed (1)
  1. app.py +102 -50
app.py CHANGED
@@ -1,73 +1,125 @@
 
 
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  from threading import Thread
- from typing import Iterator, List, Tuple
  import gradio as gr

  # Constants
- MAX_INPUT_TOKEN_LENGTH = 4096
  DEFAULT_MAX_NEW_TOKENS = 930

- # Load Models and Tokenizers
- model_id = "meta-llama/Llama-2-7b-hf"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- tokenizer.pad_token = tokenizer.eos_token
- model_generate = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
- model_edit = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")  # Assuming a different setup or hyperparameters

- # Helper function to process text
- def process_text(text: str) -> str:
-     return text.replace("\n", " ").strip()

- def run_model(input_ids, model, max_new_tokens, top_p, top_k, temperature, repetition_penalty):
-     return model.generate(
-         input_ids=input_ids,
-         max_length=input_ids.shape[1] + max_new_tokens,
-         do_sample=True,
-         top_p=top_p,
-         top_k=top_k,
-         temperature=temperature,
-         num_beams=1,
-         repetition_penalty=repetition_penalty
      )

- def generate_text(mode: str, message: str, chat_history: List[Tuple[str, str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-                   temperature: float = 0.6, top_p: float = 0.7, top_k: int = 20, repetition_penalty: float = 1.0) -> Iterator[str]:
-     if chat_history is None:
-         chat_history = []
-     conversation = [{"role": "user", "content": user} for user, _ in chat_history]
-     conversation.extend([{"role": "assistant", "content": assistant} for _, assistant in chat_history])
      conversation.append({"role": "user", "content": message})

-     context = "\n".join(f"{entry['role']}: {entry['content']}" for entry in conversation)
-     input_ids = tokenizer(context, return_tensors="pt", padding=True, truncation=True).input_ids.to(model_generate.device)
      if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
          input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
          gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

-     model = model_edit if mode == 'edit' else model_generate
-     outputs = []
-     t = Thread(target=lambda: outputs.extend(run_model(input_ids, model, max_new_tokens, top_p, top_k, temperature, repetition_penalty)))
      t.start()
-     t.join()

-     for output in outputs:
-         for text in tokenizer.decode(output, skip_special_tokens=True).split():
-             processed_text = process_text(text)
-             yield processed_text

- # Gradio Interface
- def switch_mode(mode: str, message: str, chat_history: List[Tuple[str, str]]):
-     return list(generate_text(mode, message, chat_history))

- with gr.Blocks() as demo:
-     with gr.Row():
-         mode_selector = gr.Radio(["generate", "edit"], label="Mode", value="generate")
-         input_text = gr.Textbox(label="Input Text")
-         output_text = gr.Textbox(label="Output")
-     chat_history = gr.State()  # Corrected 'default' keyword
-
-     generate_button = gr.Button("Generate/Edit")
-     generate_button.click(switch_mode, inputs=[mode_selector, input_text, chat_history], outputs=output_text)

- demo.launch()
+ import os
+ import re
  import torch
  from threading import Thread
+ from typing import Iterator
+ from mongoengine import connect, Document, StringField, SequenceField
  import gradio as gr
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+ from peft import PeftModel

  # Constants
+ MAX_MAX_NEW_TOKENS = 2048
  DEFAULT_MAX_NEW_TOKENS = 930
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ LICENSE = """
+ ---
+ As a derivative work of [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) by Meta,
+ this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
+ """

+ DESCRIPTION = ""  # placeholder: the committed code appends to DESCRIPTION below without defining it first
+ if not torch.cuda.is_available():
+     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

+ if torch.cuda.is_available():
+     modelA_id = "meta-llama/Llama-2-7b-chat-hf"
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16
      )
+     base_model = AutoModelForCausalLM.from_pretrained(modelA_id, device_map="auto", quantization_config=bnb_config)
+     modelA = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
+     tokenizerA = AutoTokenizer.from_pretrained(modelA_id)
+     tokenizerA.pad_token = tokenizerA.eos_token
+
+     modelB_id = "meta-llama/Llama-2-7b-chat-hf"
+     modelB = AutoModelForCausalLM.from_pretrained(modelB_id, torch_dtype=torch.float16, device_map="auto")
+     tokenizerB = AutoTokenizer.from_pretrained(modelB_id)
+     tokenizerB.use_default_system_prompt = False
+
+ def make_prompt(entry):
+     return f"### Human: Don't repeat the assessments, limit to 500 words {entry} ### Assistant:"
+
+ @spaces.GPU
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     model: str = "A",
+     system_prompt: str = "",
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ) -> Iterator[str]:
+     # Argument order follows gr.ChatInterface's fn(message, history, *additional_inputs) convention.
+     if model == "A":
+         model = modelA
+         tokenizer = tokenizerA
+         enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
+         input_ids = enc.input_ids.to(model.device)
+     else:
+         model = modelB
+         tokenizer = tokenizerB
+         conversation = []
+         if system_prompt:
+             conversation.append({"role": "system", "content": system_prompt})
+         for user, assistant in chat_history:
+             conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
          conversation.append({"role": "user", "content": message})
+         # Tokenize only after the conversation list is fully built.
+         input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")

      if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
          input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
          gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)

+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         {"input_ids": input_ids},
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         num_beams=1,
+         repetition_penalty=repetition_penalty,
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()

+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)

+ # Gradio Interface Setup
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[gr.Dropdown(choices=["A", "B"], label="Model", value="A")],
+     fill_height=True,
+     stop_btn=None,
+     examples=[
+         ["Can you explain briefly to me what is the Python programming language?"],
+         ["Could you please provide an explanation about the concept of recursion?"],
+         ["Could you explain what a URL is?"]
+     ],
+     theme='shivi/calm_seafoam'
+ )

+ # Gradio Web Interface
+ with gr.Blocks(theme='shivi/calm_seafoam', fill_height=True) as demo:
+     # gr.Markdown(DESCRIPTION)
+     chat_interface.render()
+     gr.Markdown(LICENSE)

+ # Main Execution
+ if __name__ == "__main__":
+     demo.queue(max_size=20)
+     demo.launch(share=True)
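
The core change in this commit swaps the old collect-then-`t.join()` flow for token streaming: `model.generate` runs in a background `Thread` and pushes decoded text into a `TextIteratorStreamer` that the caller iterates. A minimal, self-contained sketch of that pattern is below; the model id and prompt are illustrative only, not taken from this commit.

```python
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "meta-llama/Llama-2-7b-chat-hf"  # illustrative; same base model family as the Space
tok = AutoTokenizer.from_pretrained(model_id)
lm = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# The streamer decodes tokens as generate() produces them in the background thread.
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
inputs = tok("Tell me a short story.", return_tensors="pt").to(lm.device)
Thread(target=lm.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64)).start()

# Iterating the streamer yields text chunks as they arrive, which is what
# generate() in app.py forwards to the Gradio chat UI.
for piece in streamer:
    print(piece, end="", flush=True)
```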