# NOTE: Hugging Face Spaces page-extraction metadata (status, file size, commit
# hashes, line-number gutter) previously sat here as raw text and made this
# file invalid Python; it is preserved below as a comment.
#   Spaces: Runtime error | File size: 3,834 Bytes
#   Commits seen: b97f6e6 bab8078 fb2b996 caf7eff
import json
import logging
# dictConfig lives in the logging.config submodule; importing `logging` alone
# does not guarantee it is loaded, so import it explicitly.
import logging.config
import os
import time

import torch
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException

# Route uvicorn's log records through the project's logging configuration.
logging.config.dictConfig(uvicorn_logger)
# Triton-backed CodeGen proxy. Environment variables (all optional):
#   TRITON_HOST      hostname of the Triton Inference Server (default "triton")
#   TRITON_PORT      gRPC port of the server (default 8001)
#   TRITON_VERBOSITY truthy string ("1", "true", "yes") for verbose client logs
codegen = CodeGenProxy(
    host=os.environ.get("TRITON_HOST", "triton"),
    # os.environ values are strings; cast so the client always receives an int
    # instead of sometimes getting the int default and sometimes a str.
    port=int(os.environ.get("TRITON_PORT", 8001)),
    # bool("False") is True — any non-empty env string was treated as verbose.
    # Parse the string explicitly; unset/empty still means quiet.
    verbose=os.environ.get("TRITON_VERBOSITY", "").lower() in ("1", "true", "yes"),
)
# OpenAI-compatible HTTP API; docs served at the root path.
app = FastAPI(
    title="FauxPilot",
    # BUG FIX: the implicit string concatenation was missing a space and
    # rendered as "...SalesForce CodeGenmodels inside..." in the docs.
    description="This is an attempt to build a locally hosted version of GitHub Copilot. It uses the SalesForce CodeGen "
                "models inside of NVIDIA's Triton Inference Server with the FasterTransformer backend.",
    docs_url="/",
    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
)
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
    """Render a FauxPilotException as an HTTP 400 JSON error response."""
    error_body = exc.json()
    return JSONResponse(status_code=400, content=error_body)
@app.post("/v1/engines/codegen/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
    """OpenAI-compatible completion endpoint backed by the Triton CodeGen proxy.

    Returns a server-sent-event stream when the request sets ``stream`` to a
    truthy value, otherwise a single JSON completion payload. Backend errors
    are re-raised as FauxPilotException, which the registered exception
    handler converts into an OpenAI-style 400 JSON error.
    """
    data = data.dict()
    try:
        content = codegen(data=data)
    except Exception as exc:
        # Surface any backend failure in the OpenAI error format, chaining
        # the original exception for debuggability.
        raise FauxPilotException(
            message=str(exc),
            type="invalid_request_error",
            param=None,
            code=None,
        ) from exc

    # BUG FIX: the previous check (`is not None`) would stream even when the
    # client explicitly sent `"stream": false`; test truthiness instead.
    # NOTE(review): assumes OpenAIinput leaves `stream` unset/None by default —
    # confirm against the model definition.
    if data.get("stream"):
        return EventSourceResponse(
            content=content,
            status_code=200,
            media_type="text/event-stream",
        )
    return Response(
        status_code=200,
        content=content,
        media_type="application/json",
    )
# Allow running the API directly (e.g. `python app.py`) without an external
# ASGI launcher; binds on all interfaces at port 5000.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=5000)
# curl request to test the API
# curl -X POST "http://localhost:5000/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://huggingface.co/spaces/ncoop57/santacoder-openai/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://ncoop57-santacoder-openai.hf.space/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"