Spaces: Runtime error

# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import login
# import torch
# import os
#
# # Authenticate using environment variable
# login(token=os.getenv('HF_TOKEN'))
#
# # Load model (will use cached version if available)
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# device = "cuda" if torch.cuda.is_available() else "cpu"
#
# def load_model():
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
#     return tokenizer, model
#
# tokenizer, model = load_model()
#
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
#
# demo.launch(server_name="0.0.0.0", server_port=7860)
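
The draft above, the retry-based draft below, and the active script at the bottom all load the 7B checkpoint in full fp32 precision, which needs roughly 28 GB of RAM for the weights alone; on small Space hardware that by itself can produce the runtime error shown in the header. A minimal sketch of a lighter-weight load, assuming the accelerate package is available for device_map/low_cpu_mem_usage support:

# Hedged sketch, not part of the original app: halve weight memory when a GPU is present.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32  # fp16 needs ~14 GB vs ~28 GB in fp32
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,   # stream weights instead of building a second full copy in RAM
    device_map="auto",        # place layers on the available device(s); requires accelerate
)

On CPU-only hardware the sketch keeps fp32, since half precision is not reliably supported on CPU; the practical options there are GPU hardware or a smaller/quantized checkpoint.
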
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login, hf_hub_download
# from tenacity import retry, stop_after_attempt, wait_exponential
# import torch
# import os
# import time  # needed for the manual backoff below
#
# # Authentication
# login(token=os.getenv('HF_TOKEN'))
#
# # Configuration
# CACHE_REPO = "Juna190825/cacheRepo"  # Your dataset repo for cached models
# MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # Original model ID
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#
# # Note: the tenacity decorator retries load_model() as a whole, on top of the manual
# # retry loop inside it, so a persistent failure can be attempted up to 3 x 3 times.
# @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
# def load_model():
#     retries = 3
#     for attempt in range(retries):
#         try:
#             # First try loading from cache repo
#             model = AutoModelForCausalLM.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models",
#                 local_files_only=True
#             ).to(DEVICE)
#             tokenizer = AutoTokenizer.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models"
#             )
#             print("Loaded model from cache repo")
#             return model, tokenizer
#         except Exception as e:
#             if attempt == retries - 1:  # Final attempt
#                 print(f"Cache load failed: {str(e)}. Falling back to original repo")
#                 # Fallback to original repo
#                 model = AutoModelForCausalLM.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 ).to(DEVICE)
#                 tokenizer = AutoTokenizer.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 )
#                 return model, tokenizer
#             print(f"Attempt {attempt + 1} failed, retrying...")
#             time.sleep(2 ** attempt)  # Exponential backoff
#
# # Load model and tokenizer
# model, tokenizer = load_model()
#
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
#
# demo.launch(server_name="0.0.0.0", server_port=7860)
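
The draft above layers a tenacity retry decorator on top of a hand-written retry loop, so a persistent failure is retried twice over, and retrying a local_files_only load is rarely useful since it fails the same way every time. If retries are wanted, one mechanism is enough. A minimal sketch that keeps the draft's cache-first idea but retries only the network download and leaves that retrying to tenacity (reraise=True is a standard tenacity option; whether CACHE_REPO actually holds loadable model files is an assumption carried over from the draft):

from tenacity import retry, stop_after_attempt, wait_exponential
from transformers import AutoModelForCausalLM, AutoTokenizer

CACHE_REPO = "Juna190825/cacheRepo"
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True)
def download_model():
    # Network downloads can fail transiently, so this is the step worth retrying.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
    return model, tokenizer

def load_model():
    try:
        # A local-only load either works immediately or raises; there is no point retrying it.
        model = AutoModelForCausalLM.from_pretrained(CACHE_REPO, cache_dir=CACHE_DIR, local_files_only=True)
        tokenizer = AutoTokenizer.from_pretrained(CACHE_REPO, cache_dir=CACHE_DIR, local_files_only=True)
        return model, tokenizer
    except OSError as exc:
        print(f"Cache load failed: {exc}. Falling back to {MODEL_ID}")
        return download_model()
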
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os

# Authentication
login(token=os.getenv('HF_TOKEN'))

# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def load_model():
    """Load model with automatic cache handling"""
    try:
        # First try with local files only (uses cache if available)
        print("Checking for cached model...")
        return AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # Will fail if not cached
        ).to(DEVICE), AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fallback to download if not in cache
        print("Downloading model...")
        return AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE), AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )

# Load model
model, tokenizer = load_model()

def generate_text(prompt, max_length=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.7,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
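
# Optional: Llama-2 *chat* checkpoints expect prompts wrapped in their [INST] chat format;
# raw prompts still generate text, but replies tend to be less coherent. A hedged sketch,
# not wired into the UI below, assuming the tokenizer ships a chat template (recent
# transformers releases provide one for this model):
#
# def generate_chat(prompt, max_new_tokens=200):
#     messages = [{"role": "user", "content": prompt}]
#     chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(chat_prompt, return_tensors="pt").to(DEVICE)
#     outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=0.7, do_sample=True)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)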

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

demo.launch(server_name="0.0.0.0", server_port=7860)
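
A further failure mode worth ruling out for the "Runtime error" badge: meta-llama/Llama-2-7b-chat-hf is a gated repository, so if the HF_TOKEN secret is missing, login() is called with a None token, the gated download cannot be authorized, and the Space crashes at startup. A small guard, sketched on the assumption that the secret really is named HF_TOKEN:

import os
from huggingface_hub import login

hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Without a token the gated Llama-2 download is rejected by the Hub (401/403),
    # which shows up in the Space logs as a crash during model loading.
    raise RuntimeError("HF_TOKEN secret is not set; add it in the Space settings before starting.")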