import uvicorn
import torch
from fastapi import FastAPI, HTTPException, Request
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
if torch.cuda.is_available():
    print("CUDA is available. GPU will be used.")
else:
    print("CUDA is not available. CPU will be used.")
# Path to the quantized model files
model_name_or_path = "/kaggle/input/vicuna/"

# Dictionary to store conversation threads and their context
conversations = {}

# Making the code device-agnostic
Device_Type = "cuda" if torch.cuda.is_available() else "cpu"

app = FastAPI()
def load_quantized_model(model_id, model_basename):
    # Supports Hugging Face models whose names end with GPTQ and that ship some
    # variation of .no-act.order or .safetensors weights in their HF repo.
    print("Using AutoGPTQForCausalLM for quantized models")
    if ".safetensors" in model_basename:
        # Remove the ".safetensors" ending if present
        model_basename = model_basename.replace(".safetensors", "")
    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")
    quantized_model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )
    return quantized_model, quantized_tokenizer
# Load the model and tokenizer
model, tokenizer = load_quantized_model(model_name_or_path, "model.safetensors")
# Root endpoint (route paths in this file are illustrative and can be renamed)
@app.get("/")
async def read_root():
    return {"message": "Welcome to Eren Bot!"}
# Endpoint to start a new conversation thread
@app.post("/start_conversation/")
async def start_conversation(request: Request):
    data = await request.json()
    prompt = data.get('prompt')
    # Generate a response for the initial prompt
    response = generate_response(prompt)
    # Create a new conversation thread and store the prompt and response
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}
    return {'thread_id': thread_id, 'response': response}
# Endpoint to get the latest response of a conversation thread
@app.get("/get_response/{thread_id}")
async def get_response(thread_id: int):
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")
    # Retrieve the conversation thread
    thread = conversations[thread_id]
    # Get the latest response in the conversation
    response = thread['responses'][-1]
    return {'response': response}
# Function to generate a response using the model
def generate_response(prompt: str) -> str:
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex - her ultimate goal.
PROMPT_TEMPLATE_END
CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious."
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting."
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there!"
CONTEXT_END"""
    # prompt_template = f'''{PERSONA_DESC} ASSISTANT: '''
    prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'
    # Tokenize the prompt as a string and move the tensors to the active device
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(Device_Type)
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )
    generated_text = tokenizer.decode(output[0])
    return generated_text
# Run the FastAPI app (uvicorn.run is blocking, so no asyncio wrapper is needed)
if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=8000)
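# --- Example client usage (illustrative sketch, not part of the app) ---
# Assuming the server is running locally on port 8000 with the illustrative
# routes above, a client could exercise the two chat endpoints roughly like this:
#
#   import requests
#
#   # Start a new conversation thread with an initial prompt
#   r = requests.post("http://localhost:8000/start_conversation/", json={"prompt": "Hi there!"})
#   thread_id = r.json()["thread_id"]
#
#   # Fetch the latest response stored for that thread
#   r = requests.get(f"http://localhost:8000/get_response/{thread_id}")
#   print(r.json()["response"])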