# main.py
import spaces
import torch
import torch.nn.functional as F
from torch.nn import DataParallel
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import threading
import queue
import os
import json
import numpy as np
import gradio as gr
from huggingface_hub import InferenceClient
import openai
from openai import OpenAI

from globalvars import API_BASE, API_KEY, intention_prompt

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_CACHE_DISABLE'] = '1'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## TODO: add chroma vector store
## TODO: use instruct embeddings

# Load the embedding tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True)
model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True).to(device)

## Intention mapper: classifies what the user is asking for before answering
intention_client = OpenAI(
    api_key=API_KEY,
    base_url=API_BASE
)

def map_intention(input_text: str) -> str:
    """Ask the yi-large model to map `input_text` to an intention.

    Wrapped in a function so the call happens per request rather than at
    import time (the original module-level call referenced an undefined
    `inputext` variable).
    """
    intention_completion = intention_client.chat.completions.create(
        model="yi-large",
        messages=[
            {"role": "system", "content": intention_prompt},
            {"role": "user", "content": input_text},
        ],
    )
    # print(intention_completion)
    return intention_completion.choices[0].message.content

# Client backing the streaming chat in `respond`. The original file never
# defined `client`; the model below is a placeholder -- swap in whichever
# endpoint this Space actually targets.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    # Replay the conversation history as alternating user/assistant turns
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream tokens back to the UI as they arrive; `delta.content` can be
    # None on some chunks, so guard before appending.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
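
# ---------------------------------------------------------------------------
# Hedged sketch for the "add chroma vector store / use instruct embeddings"
# TODOs near the top of this file. Not wired into the app yet: the collection
# name, instruction string, and max_length are assumptions, the
# `model.encode(...)` call follows the NV-Embed-v1 model card (available via
# trust_remote_code), and the Chroma calls use the stock `chromadb` client
# API. Treat it as a starting point, not the original author's design.

def build_doc_store(documents: list[str]):
    """Embed `documents` with NV-Embed-v1 and index them in an in-memory
    Chroma collection; returns the collection for similarity queries."""
    import chromadb  # lazy import: chromadb is optional for the chat UI

    chroma_client = chromadb.Client()
    collection = chroma_client.create_collection(name="docs")

    # Per the NV-Embed-v1 model card, passages are embedded without an
    # instruction; queries would pass a task instruction string instead.
    embeddings = model.encode(documents, instruction="", max_length=4096)

    collection.add(
        ids=[str(i) for i in range(len(documents))],
        documents=documents,
        embeddings=embeddings.tolist(),
    )
    return collection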