"""FastAPI application exposing OpenAI-style completion endpoints.

Incoming requests are validated against ``OpenAIinput`` and handed to a
``CodeGenProxy`` instance configured from the ``TRITON_HOST``,
``TRITON_PORT`` and ``TRITON_VERBOSITY`` environment variables. Any failure
raised while producing a completion is reported to the client as a
``FauxPilotException`` rendered with HTTP status 400.
"""

import json
import logging
import logging.config  # dictConfig lives in the submodule; `import logging` alone does not load it
import os
import time

import torch
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException

logging.config.dictConfig(uvicorn_logger)

# token = os.environ.get("HUB_TOKEN", None)
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
# model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

# NOTE(review): when these variables are set in the environment they arrive
# as strings (e.g. TRITON_VERBOSITY="False" is a non-empty, truthy string),
# whereas the fallbacks are an int and a bool. How CodeGenProxy interprets
# them is not visible from this file -- confirm before relying on either form.
codegen = CodeGenProxy(
    host=os.environ.get("TRITON_HOST", "triton"),
    port=os.environ.get("TRITON_PORT", 8001),
    verbose=os.environ.get("TRITON_VERBOSITY", False),
)

app = FastAPI(
    title="FauxPilot",
    # The original adjacent literals were joined without a separating space,
    # which rendered as "CodeGenmodels" in the generated API docs.
    description=(
        "This is an attempt to build a locally hosted version of GitHub "
        "Copilot. It uses the SalesForce CodeGen models inside of NVIDIA's "
        "Triton Inference Server with the FasterTransformer backend."
    ),
    docs_url="/",
    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
)


@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
    """Render a ``FauxPilotException`` as an HTTP 400 JSON response.

    Args:
        request: The request being processed when the exception was raised
            (unused, required by the exception-handler signature).
        exc: The exception to report; its ``json()`` result is the body.

    Returns:
        A ``JSONResponse`` with status code 400.
    """
    return JSONResponse(
        status_code=400,
        content=exc.json(),
    )


@app.post("/v1/engines/codegen/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
    """Produce a completion for the posted request body.

    Args:
        data: The validated request payload.

    Returns:
        An ``EventSourceResponse`` when the payload asks for streaming,
        otherwise a plain ``Response`` with media type ``application/json``.

    Raises:
        FauxPilotException: If the ``codegen`` call raises any exception;
            the original exception is chained as the cause.
    """
    data = data.dict()
    try:
        # NOTE(review): this call is synchronous inside an async handler;
        # whether it blocks for long depends on CodeGenProxy -- confirm.
        content = codegen(data=data)
        # prompt = data.get("prompt")
        # choices = [pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']]
        # completion = {
        #     'id': None,  # fill in
        #     'model': 'codegen',
        #     'object': 'text_completion',
        #     'created': int(time.time()),
        #     'choices': None,  # fill in
        #     'usage': {
        #         'completion_tokens': int(sum([len(c.split()) for c in choices])),
        #         'prompt_tokens': int(len(prompt.split())),
        #         'total_tokens': int(sum([len(c.split()) for c in choices]) + len(prompt.split())),
        #     }
        # }
        # completion['id'] = 10
        # completion['choices'] = choices
        # content = json.dumps(completion)
    except Exception as err:
        # Top-level boundary: surface every failure to the client in the
        # error format produced by fauxpilot_handler, keeping the cause.
        raise FauxPilotException(
            message=str(err),
            type="invalid_request_error",
            param=None,
            code=None,
        ) from err

    # Truthiness rather than `is not None`: an explicit `"stream": false`
    # must yield a regular JSON response, not a server-sent-event stream.
    # NOTE(review): assumes CodeGenProxy only returns a streamable result
    # when "stream" is truthy -- confirm against utils.codegen.
    if data.get("stream"):
        return EventSourceResponse(
            content=content,
            status_code=200,
            media_type="text/event-stream",
        )
    return Response(
        status_code=200,
        content=content,
        media_type="application/json",
    )


if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=5000)

# curl request to test the API
# curl -X POST "http://localhost:5000/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://huggingface.co/spaces/ncoop57/santacoder-openai/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"
# curl -X POST "https://ncoop57-santacoder-openai.hf.space/v1/engines/codegen/completions" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\": \"import numpy as np\"}"