from fastapi import FastAPI
import os

from llama_cpp import Llama
from transformers import AutoTokenizer

# Hugging Face access token from the Space secrets (needed for gated models such as Gemma).
access_token = os.getenv("access_token")

# Tokenizers are used only to render chat-formatted prompts for the GGUF models loaded below.
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
# Load the quantized GGUF models via llama-cpp-python.
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)
llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)
app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}
def modelResp1(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    # Render the messages into the Qwen2 chat format without tokenizing.
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
def modelResp2(prompt):
    # The Gemma chat template does not accept a "system" role, so the persona is set via the first exchange.
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
# One POST endpoint per model. The route paths and the distinct handler names are assumed here;
# adjust them to match whatever the client expects.
@app.post("/modelapi1")
async def modelApi1(data: dict):
    prompt = data.get("prompt")
    response = modelResp1(prompt)
    return response


@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    response = modelResp2(prompt)
    return response


@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response