import os
import subprocess

# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD avoids a long
# CUDA compile on Spaces hardware. Merging os.environ keeps PATH intact so pip
# can be found by the shell.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import gradio as gr
import torch
import torchvision  # needed by some vision models loaded with trust_remote_code
import einops  # needed by some remote-code model implementations
import spaces
from transformers import AutoModelForCausalLM
from huggingface_hub import login

# flash-attn is picked up internally by transformers / remote model code once
# installed; no direct import is required.

# Authenticate so gated models (e.g., google/gemma-7b) can be downloaded.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)


@spaces.GPU
def get_model_summary(model_name):
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model and move it to the selected device
    model = AutoModelForCausalLM.from_pretrained(
        model_name, trust_remote_code=True
    ).to(device)

    # Return the model's architecture as a string
    return str(model)


# Create the Gradio interface
interface = gr.Interface(
    fn=get_model_summary,
    inputs="text",
    outputs="text",
    examples=[
        ["google/gemma-7b"],
        ["microsoft/Phi-3-mini-4k-instruct"],
        ["google/paligemma-3b-mix-224"],
        ["microsoft/Phi-3-vision-128k-instruct"],
    ],
)

interface.launch()
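
# Usage note (a sketch, assuming this file is a Hugging Face Space's app.py):
#   HF_TOKEN=<your token> python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default; on a ZeroGPU Space
# the @spaces.GPU decorator requests GPU hardware for each call.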