# Install bitsandbytes and accelerate (needed for 8-bit loading) before importing transformers
!pip install bitsandbytes accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Path to the local directory where the model and tokenizer are saved
MODEL_PATH = "/content/drive/My Drive/phi35"

# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model with 8-bit quantization, letting accelerate place it on the GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map='auto',
    load_in_8bit=True
)

# Create the text-generation pipeline.
# No device/device_map argument here: the model was already placed by device_map='auto'.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,  # kept short for faster inference
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.8
)

# Function backing the Gradio interface
def chat_with_phi(message):
    response = pipe(message)
    return response[0]['generated_text']

# Set up the Gradio interface
app = gr.Interface(
    fn=chat_with_phi,
    inputs=gr.Textbox(label="Type your message:"),
    outputs=gr.Textbox(label="Phi 3.5 Responds:"),
    title="Phi 3.5 Text Chat",
    description="Chat with the Phi 3.5 model. Ask anything!",
    theme="default"
)

# Launch the app
app.launch(debug=True)
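
# Note: recent transformers releases deprecate passing `load_in_8bit=True` directly to
# from_pretrained in favor of a BitsAndBytesConfig. The commented sketch below is an
# optional alternative for the model-loading step above (not part of the original script),
# assuming a transformers version that supports `quantization_config` and that
# bitsandbytes is installed:
#
# from transformers import BitsAndBytesConfig
#
# quant_config = BitsAndBytesConfig(load_in_8bit=True)  # 8-bit quantization settings
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_PATH,
#     device_map='auto',
#     quantization_config=quant_config
# )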