# Snapshot from commit bab8078 by ncoop57: "Get minimum working openai server"
# (web viewer residue: raw / history / blame links, file size 3.83 kB)
import json
import logging
import logging.config
import os
import time

import torch
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException
# Apply the project's uvicorn logging configuration at import time.
logging.config.dictConfig(uvicorn_logger)

# Disabled alternative backend: a local Hugging Face `transformers` pipeline.
# token = os.environ.get("HUB_TOKEN", None)
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
# model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)


def _env_flag(name, default=False):
    """Read environment variable *name* as a boolean switch.

    Environment values are always strings, so ``"False"`` or ``"0"`` would be
    truthy if passed through unchanged. An unset variable yields *default*;
    an empty value or one of ``0/false/no/off`` (case-insensitive) yields
    ``False``; any other value yields ``True``.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in {"", "0", "false", "no", "off"}


# Completion backend, configured from the TRITON_* environment variables.
codegen = CodeGenProxy(
    host=os.environ.get("TRITON_HOST", "triton"),
    # Cast so the port has the same type whether it comes from the
    # environment (a string) or from the integer default.
    port=int(os.environ.get("TRITON_PORT", 8001)),
    verbose=_env_flag("TRITON_VERBOSITY"),
)
# ASGI application object; the interactive API docs are served at the root path.
app = FastAPI(
    title="FauxPilot",
    # The first fragment ends with a space: adjacent string literals are
    # concatenated verbatim, which previously rendered as "CodeGenmodels".
    description="This is an attempt to build a locally hosted version of GitHub Copilot. It uses the SalesForce CodeGen "
                "models inside of NVIDIA's Triton Inference Server with the FasterTransformer backend.",
    docs_url="/",
    # -1 hides the schema ("Models") section in the Swagger UI.
    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
)
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
    """Turn a ``FauxPilotException`` into an HTTP 400 JSON response.

    The response body is whatever ``exc.json()`` returns; ``request`` is
    accepted because the handler signature requires it but is not used.
    """
    error_body = exc.json()
    return JSONResponse(content=error_body, status_code=400)
@app.post("/v1/engines/codegen/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
    """Serve a completion request on both registered completion routes.

    The validated request model is converted to a plain dict and handed to
    the module-level ``codegen`` callable.

    Returns:
        An ``EventSourceResponse`` (``text/event-stream``) when the request
        sets a truthy ``stream`` field, otherwise a plain ``Response`` with
        media type ``application/json``. Both carry status 200 and use the
        value returned by ``codegen`` as their content.

    Raises:
        FauxPilotException: if ``codegen`` raises anything; the original
            exception is attached as the cause.
    """
    data = data.dict()

    # Disabled alternative: build the completion with the local pipeline.
    # prompt = data.get("prompt")
    # choices = [pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']]
    # completion = {
    # 'id': None, # fill in
    # 'model': 'codegen',
    # 'object': 'text_completion',
    # 'created': int(time.time()),
    # 'choices': None, # fill in
    # 'usage': {
    # 'completion_tokens': int(sum([len(c.split()) for c in choices])),
    # 'prompt_tokens': int(len(prompt.split())),
    # 'total_tokens': int(sum([len(c.split()) for c in choices]) + len(prompt.split())),
    # }
    # }
    # completion['id'] = 10
    # completion['choices'] = choices
    # content = json.dumps(completion)

    try:
        content = codegen(data=data)
    except Exception as E:
        # Boundary handler: report any backend failure as an API error while
        # keeping the original exception chained for debugging.
        raise FauxPilotException(
            message=str(E),
            type="invalid_request_error",
            param=None,
            code=None,
        ) from E

    # Test truthiness rather than `is not None`, so that an explicit
    # `"stream": false` gets a regular JSON response instead of an SSE stream.
    if data.get("stream"):
        return EventSourceResponse(
            content=content,
            status_code=200,
            media_type="text/event-stream",
        )
    return Response(
        status_code=200,
        content=content,
        media_type="application/json",
    )
if __name__ == "__main__":
    # Direct execution: serve the "app:app" import string with uvicorn,
    # bound to 0.0.0.0 on port 5000.
    uvicorn.run(
        "app:app",
        port=5000,
        host="0.0.0.0",
    )
# curl request to test the API
# curl -X POST "http://localhost:5000/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://huggingface.co/spaces/ncoop57/santacoder-openai/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://ncoop57-santacoder-openai.hf.space/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"