joaogante (HF staff) committed
Commit 1be532a (1 parent: 7cb6518)

limit to pythia 6.9b

Files changed (1): app.py (+13 −25)
app.py CHANGED

@@ -1,9 +1,8 @@
 from threading import Thread
-from functools import lru_cache
 
 import torch
 import gradio as gr
-from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 
 
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -11,26 +10,16 @@ print("Running on device:", torch_device)
 print("CPU threads:", torch.get_num_threads())
 
 
-@lru_cache(maxsize=1)  # only cache the latest model
-def get_model_and_tokenizer(model_id):
-    config = AutoConfig.from_pretrained(model_id)
-    if config.is_encoder_decoder:
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-6.9b-deduped", load_in_8bit=True, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-6.9b-deduped")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = model.to(torch_device)
-    return model, tokenizer
 
-
-def run_generation(model_id, user_text, top_p, temperature, top_k, max_new_tokens, history):
+def run_generation(user_text, top_p, temperature, top_k, max_new_tokens, history):
     if history is None:
         history = []
     history.append([user_text, ""])
 
     # Get the model and tokenizer, and tokenize the user text.
-    model, tokenizer = get_model_and_tokenizer(model_id)
     model_inputs = tokenizer([user_text], return_tensors="pt").to(torch_device)
 
     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
@@ -66,17 +55,16 @@ with gr.Blocks(
     with gr.Column(elem_id="col_container"):
         duplicate_link = "https://huggingface.co/spaces/joaogante/chatbot_transformers_streaming?duplicate=true"
         gr.Markdown(
-            f"""
-            # 🤗 Transformers Gradio 🔥Streaming🔥
-            This demo showcases how to use the streaming feature of 🤗 Transformers with Gradio to generate text in real-time.
-            ⚠️ [Duplicate this Space]({duplicate_link}) if ⚠️
-            - You want to use a large model (> 1GB). Otherwise, this public space will become slow for others 💛
-            - You want to build your own app, using this demo as a template 🚀
-            - You want to bypass the queue and/or add hardware resources 👾
-            """
+            "# 🤗 Transformers Gradio 🔥Streaming🔥\n"
+            "This demo showcases the use of the "
+            "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
+            "of 🤗 Transformers with Gradio to generate text in real-time. It uses "
+            "[EleutherAI/pythia-6.9b-deduped](https://huggingface.co/EleutherAI/pythia-6.9b-deduped), "
+            "a 6.9B parameter GPT-NeoX model by EleutherAI, loaded in 8-bit quantized form.\n\n"
+            f"Feel free to [duplicate this Space]({duplicate_link}) to try your own models or to use this space as a "
+            "template! 💛"
         )
 
-        model_id = gr.Textbox(value='EleutherAI/pythia-410m', label="🤗 Hub Model repo")
         chatbot = gr.Chatbot(elem_id='chatbot', label="Message history")
         user_text = gr.Textbox(placeholder="Is pineapple a pizza topping?", label="Type an input and press Enter")
         button = gr.Button(value="Clear message history")
@@ -97,7 +85,7 @@ with gr.Blocks(
 
     user_text.submit(
         run_generation,
-        [model_id, user_text, top_p, temperature, top_k, max_new_tokens, chatbot],
+        [user_text, top_p, temperature, top_k, max_new_tokens, chatbot],
         chatbot
     )
     button.click(reset_textbox, [], [user_text])
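
A note on the new loading line: load_in_8bit=True and device_map="auto" pull in dependencies that the diff itself does not show. A minimal sketch of what that step assumes (the pip requirements and the comment annotations are assumptions, not part of this commit):

# Sketch of the 8-bit loading path used above. Assumptions: a CUDA GPU is
# available, and the Space installs the extra packages, e.g.
#   pip install accelerate bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-6.9b-deduped",
    load_in_8bit=True,   # int8-quantize the weights via bitsandbytes (~1 byte per parameter)
    device_map="auto",   # let accelerate place the weights on the available GPU(s)/CPU
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-6.9b-deduped")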
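
The diff cuts off right after the comment about starting generation on a separate thread. For context, a minimal sketch of the TextIteratorStreamer pattern that comment refers to (the stand-in model and sampling values are illustrative, not taken from this commit):

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Small stand-in model so the sketch runs on CPU; the Space uses pythia-6.9b-deduped.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-410m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m")

model_inputs = tokenizer(["Is pineapple a pizza topping?"], return_tensors="pt")

# skip_prompt=True omits the input prompt from the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    model_inputs, streamer=streamer, max_new_tokens=100,
    do_sample=True, top_p=0.95, temperature=1.0, top_k=50,
)

# generate() blocks until done, so it runs on a worker thread while the
# caller consumes tokens from the streamer as they are produced.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

generated_text = ""
for new_text in streamer:
    generated_text += new_text  # in the app, each partial string updates the Chatbot

Running generate() off the main thread is what lets the Gradio callback yield partial strings and keep the UI responsive while tokens stream in.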