from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4/V100; Bfloat16 for Ampere+.
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre-quantized models we support, for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3, 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 (15 trillion tokens), 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3, 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma, 2.2x faster!
    # "netmouse/Llama-3-Taiwan-8B-Instruct-finetuning-by-promisedchat",     # conversational chat model
    # "netmouse/Llama-3-Taiwan-8B-finetuning-by-promisedchat-Instruction",  # instruction model
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "netmouse/Llama-3-Taiwan-8B-finetuning-by-promisedchat-Instruction",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

import transformers

message = [
    {"role": "user", "content": "你是一個在臉書社團「應許之地」的社團成員,大家會互相稱為「應友」"},
    {"role": "user", "content": "應許的精神就是「混沌」"},
]
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Stop on either the regular EOS token or Llama-3's end-of-turn token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=terminators,
    num_return_sequences=1,
    max_length=200,
)
print(sequences[0]["generated_text"][len(prompt):])

import gradio as gr

messages = []  # running chat history in chat-template format

def add_text(history, text):
    global messages  # `messages` (a list) is defined at module scope
    history = history + [[text, ""]]  # use a list, not a tuple, so generate() can append to the reply
    messages = messages + [{"role": "user", "content": text}]
    return history, ""

def generate(history):
    global messages
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = pipeline(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response_msg = outputs[0]["generated_text"][len(prompt):]
    # Stream the reply into the chat window one character at a time.
    for char in response_msg:
        history[-1][1] += char
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(value=[], elem_id="chatbot")
    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="請輸入聊天內容",  # "Enter your chat message"
        )
    txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        generate, inputs=[chatbot], outputs=chatbot,
    )

demo.queue()
demo.launch(debug=True)
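
# --- Optional: keep the assistant's replies in `messages` -------------------
# A minimal sketch, not part of the original script: `generate()` above only
# accumulates user turns in `messages`, so on later turns the model never
# sees its own earlier replies. One possible variant (the name
# `generate_with_memory` is hypothetical) also appends the generated reply,
# assuming the model follows the standard Llama-3 "user"/"assistant" roles.
# To use it, pass it to `.then(...)` above in place of `generate`.
def generate_with_memory(history):
    global messages
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    outputs = pipeline(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,  # reuse the module-level terminators list
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response_msg = outputs[0]["generated_text"][len(prompt):]
    messages = messages + [{"role": "assistant", "content": response_msg}]  # remember the reply
    for char in response_msg:
        history[-1][1] += char
        yield history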