import fastapi
from fastapi.responses import JSONResponse
from time import time
# MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel

class GenModel(BaseModel):
    question: str
    system: str = "You are a story writing assistant."
    temperature: float = 0.7
    seed: int = 42
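# Illustrative JSON body for the /generate/ endpoint defined below (only "question"
# is required; the other fields fall back to the defaults above):
#   {"question": "Tell me a story", "temperature": 0.7, "seed": 42}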

# Load the model; from_pretrained fetches the GGUF weights from the Hugging Face Hub
# on first use and caches them locally.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False,
    n_ctx=4096,
    n_gpu_layers=0,
    # chat_format="llama-2"
)
# Logger setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Alternative Llama initializations, kept for reference:
"""
try:
llm = Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
verbose=False,
n_ctx=4096,
n_threads=4,
n_gpu_layers=0,
)
llm = Llama(
model_path=MODEL_PATH,
chat_format="llama-2",
n_ctx=4096,
n_threads=8,
n_gpu_layers=0,
)
except Exception as e:
logger.error(f"Failed to load model: {e}")
raise
"""
app = fastapi.FastAPI(
    title="OpenGenAI",
    description="Your Excellent Physician",
)
@app.get("/")
def index():
return fastapi.responses.RedirectResponse(url="/docs")
@app.get("/health")
def health():
return {"status": "ok"}

# Chat Completion API
@app.post("/generate/")
async def complete(gen: GenModel):
    try:
        # Build the conversation: system prompt first, then the user question,
        # so the question is actually included in the completion request.
        messages = [
            {"role": "system", "content": gen.system},
            {"role": "user", "content": gen.question},
        ]
        st = time()
        output = llama.create_chat_completion(
            messages=messages,
            temperature=gen.temperature,
            seed=gen.seed,
            # stream=True
        )
        logger.info(output)
        """
        for chunk in output:
            delta = chunk['choices'][0]['delta']
            if 'role' in delta:
                print(delta['role'], end=': ')
            elif 'content' in delta:
                print(delta['content'], end='')
            print(chunk)
        """
        et = time()
        output["time"] = et - st
        # Keep the assistant reply in the running conversation.
        messages.append(output["choices"][0]["message"])
        return output
    except Exception as e:
        logger.error(f"Error in /generate endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
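# Illustrative call to the endpoint above (assumes the server is running locally on
# port 7860 and that the `requests` package is installed):
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/generate/",
#       json={"question": "Write a short story about a lighthouse keeper."},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])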

# Chat Completion API (GET variant; streaming is currently disabled)
@app.get("/generate_stream")
async def complete_stream(
    question: str,
    system: str = "You are a professional medical assistant.",
    temperature: float = 0.7,
    seed: int = 42,
):
    try:
        st = time()
        output = llama.create_chat_completion(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": question},
            ],
            temperature=temperature,
            seed=seed,
            # stream=True
        )
        """
        for chunk in output:
            delta = chunk['choices'][0]['delta']
            if 'role' in delta:
                print(delta['role'], end=': ')
            elif 'content' in delta:
                print(delta['content'], end='')
            print(chunk)
        """
        et = time()
        output["time"] = et - st
        return output
    except Exception as e:
        logger.error(f"Error in /generate_stream endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
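# Illustrative call to the GET endpoint above (query parameters must be URL-encoded):
#   curl "http://localhost:7860/generate_stream?question=What%20helps%20a%20sore%20throat%3F"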
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860) |
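# To start the server, run this file directly with Python; alternatively, assuming the
# module is saved as app.py: `uvicorn app:app --host 0.0.0.0 --port 7860`.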