#!/usr/bin/env python

import os
from threading import Thread
from time import time
from typing import Iterator

import gradio as gr
import psutil
import requests
import spaces
import torch
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# load like this - use the variable everywhere
model_uri_hf = os.getenv("MODEL_URI_HF")
# TODO: show a warning when MODEL_URI_HF is empty, plus a brief description of how to set it
# TODO: add a link on how to search (TheBloke by default), an example search link, and an example full value (Mistral base?)
# TODO: info about RAM requirements (a hedged sketch follows the DESCRIPTION assignment below)

# DEBUG! Use the direct "/resolve/" download URL, not the "/blob/" page URL.
model_uri_hf = "https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf"
# maybe use git lfs to download instead?

# Initing things
print(f"debug: init model: {model_uri_hf}")

# Check if the model file already exists
if not os.path.isfile('model.bin'):
    # Download the model (note: this buffers the whole file in memory before writing it to disk)
    response = requests.get(model_uri_hf)
    # Save the model to a local file
    with open('model.bin', 'wb') as file:
        file.write(response.content)

llm = Llama(model_path="./model.bin")  # llama.cpp model (see the generate_cpu() sketch below)
print("! INITING DONE !")

# Preparing things to work
title = "# Demo for 7B Models - Quantized"
descr = '''Quantized to run in the free-tier hosting. A quick way to test models or share them with others without hassle.
It runs slowly, as it's on CPU. Usable for basic tests.
It uses quantized models in GGUF format and llama.cpp to run them.

Powered by ...'''

print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory total: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")

DESCRIPTION = f"# Test model: {model_uri_hf}"
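
# Sketch for the "info about RAM requirements" TODO above: compare the size of the downloaded
# GGUF file against the RAM currently available and append that to the description string.
# This is an assumption about how that TODO could be filled in, not part of the original app;
# the helper name `ram_requirement_note` is made up for this sketch.
def ram_requirement_note(model_path: str = "./model.bin") -> str:
    if not os.path.isfile(model_path):
        return ""
    model_gib = os.path.getsize(model_path) / (1024.0 ** 3)
    available_gib = psutil.virtual_memory().available / (1024.0 ** 3)
    return f"\nModel file: {model_gib:.1f} GiB on disk, RAM currently available: {available_gib:.1f} GiB."

DESCRIPTION += ram_requirement_note()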
if not torch.cuda.is_available():
    DESCRIPTION += "\nThis space is using CPU only. Use a different one if you want to go fast and use GPU."
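
# --- Sketch: CPU generation with the llama.cpp model loaded above ---------------------------
# The generate() function further down only covers the GPU/transformers path, so on this
# CPU-only space the downloaded GGUF model (`llm`) is what would actually have to answer.
# This is a minimal, hedged sketch of how that could look; the function name generate_cpu
# is an assumption, and it is not wired into the ChatInterface yet.
def generate_cpu(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    # Rebuild the conversation in the OpenAI-style message format llama.cpp expects.
    messages = []
    for user, assistant in chat_history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    outputs = []
    # With stream=True, create_chat_completion yields OpenAI-style chunks with a "delta" dict.
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        stream=True,
    ):
        outputs.append(chunk["choices"][0]["delta"].get("content", ""))
        yield "".join(outputs)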
" #todo - probably lower. like 200 in and maybe 500 out? Should be ok for quick test MAX_MAX_NEW_TOKENS = 2048 DEFAULT_MAX_NEW_TOKENS = 1024 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096")) if torch.cuda.is_available(): model_id = "mistralai/Mistral-7B-Instruct-v0.1" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) # we need to make sure we only run one thread or we probably run out of ram def generate( message: str, chat_history: list[tuple[str, str]], max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2, ) -> Iterator[str]: conversation = [] for user, assistant in chat_history: conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}]) conversation.append({"role": "user", "content": message}) input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt") if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH: input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:] gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.") input_ids = input_ids.to(model.device) streamer= Llama() streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True) generate_kwargs = dict( {"input_ids": input_ids}, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_p=top_p, top_k=top_k, temperature=temperature, num_beams=1, repetition_penalty=repetition_penalty, ) t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() outputs = [] for text in streamer: outputs.append(text) yield "".join(outputs) chat_interface = gr.ChatInterface( fn=generate, additional_inputs=[ gr.Slider( label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, ), gr.Slider( label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6, ), gr.Slider( label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9, ), gr.Slider( label="Top-k", minimum=1, maximum=1000, step=1, value=50, ), gr.Slider( label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2, ), ], stop_btn=None, # add more eval examples, like a long list taken from teknium and others maybe group by type examples=[ ["Hello there! How are you doing?"], ["Can you explain briefly to me what is the Python programming language?"], ["Explain the plot of Cinderella in a sentence."], ["How many hours does it take a man to eat a Helicopter?"], ["Write a 100-word article on 'Benefits of Open-Source in AI research'"], ], ) with gr.Blocks(css="style.css") as demo: gr.Markdown(title) gr.Markdown(descr) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", # add ) chat_interface.render() if __name__ == "__main__": demo.queue(max_size=20).launch()