import os

from fastapi import FastAPI
from llama_cpp import Llama
from transformers import AutoTokenizer

device = "cpu"
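
# Hugging Face access token, read from the environment; it is passed to the
# google/gemma-2-2b-it tokenizer below, which requires accepting the model license.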
access_token = os.getenv("access_token")
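
# The tokenizers are used only to render chat-template prompts;
# text generation itself is done by the GGUF models loaded below.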
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
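
# Llama.from_pretrained downloads the quantized GGUF weights matching
# `filename` from the given Hugging Face Hub repo and loads them on CPU.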
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)
llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)


app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}


def modelResp1(prompt):
    # Qwen2: build a chat-template prompt, then generate with the GGUF model.
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response


def modelResp2(prompt):
    # Gemma's chat template does not accept a system role,
    # so only user/assistant turns are used here.
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response


def modelResp3(prompt):
    # Phi-3: build a chat-template prompt, then generate with the GGUF model.
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
@app.post("/modelapi1")
async def modelApi(data: dict):
prompt = data.get("prompt")
response = modelResp1(prompt)
return response
@app.post("/modelapi2")
async def modelApi(data: dict):
prompt = data.get("prompt")
response = modelResp2(prompt)
return response
@app.post("/modelapi3")
async def modelApi1(data: dict):
prompt = data.get("prompt")
response = modelResp3(prompt)
return response |
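

# Usage sketch: assuming this module is saved as app.py and served with
# `uvicorn app:app --host 0.0.0.0 --port 7860` (the file name and port are
# assumptions), each POST endpoint expects a JSON body with a "prompt" field:
#
#   curl -X POST http://localhost:7860/modelapi1 \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What can you do?"}'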