import os

import gradio as gr
import torch
from transformers import AutoTokenizer, pipeline

theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

TOKEN = os.getenv("USER_TOKEN")

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

# device_map="auto" handles GPU placement on its own; passing an explicit
# `device` alongside it makes transformers raise an error.
instruct_pipeline_falcon = pipeline(
    model="tiiuae/falcon-7b-instruct",
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
instruct_pipeline_llama = pipeline(
    model="HuggingFaceH4/llama-7b-ift-ds-save-test4",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    use_auth_token=TOKEN,
)


def generate(query, temperature, top_p, top_k, max_new_tokens):
    # do_sample=True is required for temperature/top_p/top_k to take effect;
    # without it both pipelines decode greedily and ignore the sliders.
    kwargs = dict(
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )
    return [
        instruct_pipeline_falcon(query, **kwargs)[0]["generated_text"],
        instruct_pipeline_llama(query, **kwargs)[0]["generated_text"],
    ]


examples = [
    "How many helicopters can a human eat in one sitting?",
    "What is an alpaca? How is it different from a llama?",
    "Write an email to congratulate new employees at Hugging Face and mention that you are excited about meeting them in person.",
    "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
    "Explain the moon landing to a 6 year old in a few sentences.",
    "Why aren't birds real?",
    "How can I steal from a grocery store without getting caught?",
    "Why is it important to eat socks after meditating?",
]


def process_example(args):
    # Examples supply only the instruction, so run both models with the
    # sliders' default sampling settings.
    return generate(args, temperature=0.5, top_p=0.95, top_k=50, max_new_tokens=256)


css = ".generating {visibility: hidden}"

with gr.Blocks(theme=theme, css=css) as demo:
    gr.Markdown(
        """

# Falcon 7B vs. LLaMA 7B instruction tuned
"""
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question here",
                    label="Question",
                    elem_id="q-input",
                )
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        temperature = gr.Slider(
                            label="Temperature",
                            value=0.5,
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            interactive=True,
                            info="Higher values produce more diverse outputs",
                        )
                with gr.Column():
                    with gr.Row():
                        top_p = gr.Slider(
                            label="Top-p (nucleus sampling)",
                            value=0.95,
                            minimum=0.0,
                            maximum=1.0,
                            step=0.05,
                            interactive=True,
                            info="Higher values sample fewer low-probability tokens",
                        )
                with gr.Column():
                    with gr.Row():
                        top_k = gr.Slider(
                            label="Top-k",
                            value=50,
                            minimum=0,
                            maximum=100,
                            step=1,
                            interactive=True,
                            info="Sample from a shortlist of top-k tokens",
                        )
                with gr.Column():
                    with gr.Row():
                        max_new_tokens = gr.Slider(
                            label="Maximum new tokens",
                            value=256,
                            minimum=0,
                            maximum=2048,
                            step=5,
                            interactive=True,
                            info="The maximum number of new tokens to generate",
                        )
    with gr.Row():
        submit = gr.Button("Generate Answers")
    with gr.Row():
        with gr.Column():
            with gr.Box():
                gr.Markdown("**Falcon 7B instruct**")
                output_falcon = gr.Markdown()
        with gr.Column():
            with gr.Box():
                gr.Markdown("**LLaMA 7B instruct**")
                output_llama = gr.Markdown()
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[instruction],
            cache_examples=False,
            fn=process_example,
            outputs=[output_falcon, output_llama],
        )

    submit.click(
        generate,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens],
        outputs=[output_falcon, output_llama],
    )
    instruction.submit(
        generate,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens],
        outputs=[output_falcon, output_llama],
    )

demo.queue(concurrency_count=16).launch(debug=True)
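# Usage sketch (not part of the original app; the "app.py" filename and the
# token value are assumptions). The script targets Gradio 3.x, where `gr.Box`
# and `queue(concurrency_count=...)` still exist (both were removed in
# Gradio 4), and it expects a Hugging Face access token in the USER_TOKEN
# environment variable for the private LLaMA checkpoint:
#
#   USER_TOKEN=hf_xxx python app.py
#
# The two pipelines can also be exercised without the UI, using the sliders'
# default values:
#
#   falcon_out, llama_out = generate(
#       "Explain the moon landing to a 6 year old in a few sentences.",
#       temperature=0.5, top_p=0.95, top_k=50, max_new_tokens=256,
#   )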