import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from langchain.llms import HuggingFacePipeline
import gradio as gr

# Pick the GPU if one is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)
if device == 'cuda':
    print(torch.cuda.get_device_name(0))

# The sharded checkpoint is used to load the weights; the original repo provides the tokenizer
origin_model_path = "mistralai/Mistral-7B-Instruct-v0.1"
model_path = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"

# 4-bit NF4 quantization with double quantization, computing in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model and its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(origin_model_path)

# Wrap the model in a Hugging Face text-generation pipeline
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=100,
    temperature=0.5,
    do_sample=True,
)

# Expose the pipeline to LangChain
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Gradio chat callback: the conversation history is ignored, so each message is answered independently
def get_response(message, history):
    return mistral_llm.invoke(message)

demo = gr.ChatInterface(get_response)
demo.launch()