from fastapi import FastAPI, WebSocket
from fastapi.responses import HTMLResponse
from fastapi import Form, Depends, HTTPException, status
from transformers import pipeline, set_seed, AutoConfig, AutoTokenizer, GPT2LMHeadModel
import torch
import os
import time
import re
import json

app = FastAPI()

# Minimal landing page served at the root endpoint.
html = """
<!DOCTYPE html>
<html>
    <head>
        <title>Chat</title>
    </head>
    <body>
        <h1>WebSocket Chat</h1>
    </body>
</html>
"""


@app.get("/")
async def get():
    return HTMLResponse(html)


@app.get("/env")
async def env():
    # Render the process environment as a simple HTML listing.
    environment_variables = "<h3>Environment Variables</h3>"
    for name, value in os.environ.items():
        environment_variables += f"{name}: {value}<br>"
    return HTMLResponse(environment_variables)


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    # Simple echo endpoint: send back every received message.
    await websocket.accept()
    while True:
        data = await websocket.receive_text()
        await websocket.send_text(f"Message text was: {data}")


@app.post("/api/indochat/v1")
async def indochat(
        text: str = Form(default="", description="The prompt"),
        max_length: int = Form(default=250, description="Maximal length of the generated text"),
        do_sample: bool = Form(default=True, description="Whether to use sampling; use greedy decoding otherwise"),
        top_k: int = Form(default=50, description="The number of highest probability vocabulary tokens to keep "
                                                  "for top-k-filtering"),
        top_p: float = Form(default=0.95, description="If set to float < 1, only the most probable tokens with "
                                                      "probabilities that add up to top_p or higher are kept "
                                                      "for generation"),
        temperature: float = Form(default=1.0, description="The temperature of the softmax distribution"),
        penalty_alpha: float = Form(default=0.6, description="Penalty alpha for contrastive search"),
        repetition_penalty: float = Form(default=1.0, description="Repetition penalty"),
        seed: int = Form(default=42, description="Random seed"),
        max_time: float = Form(default=60.0, description="Maximal time in seconds to generate the text")
):
    set_seed(seed)
    # When no repetition penalty is supplied, derive one from the temperature:
    # lower temperatures get a stronger penalty, interpolated between 1.05 and 1.5.
    if repetition_penalty == 0.0:
        min_penalty = 1.05
        max_penalty = 1.5
        repetition_penalty = max(min_penalty + (1.0 - temperature) * (max_penalty - min_penalty), 0.8)
    prompt = f"User: {text}\nAssistant: "
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    model.eval()
    print("Generating text...")
    print(f"max_length: {max_length}, do_sample: {do_sample}, top_k: {top_k}, top_p: {top_p}, "
          f"temperature: {temperature}, repetition_penalty: {repetition_penalty}, penalty_alpha: {penalty_alpha}")
    time_start = time.time()
    sample_outputs = model.generate(input_ids,
                                    penalty_alpha=penalty_alpha,
                                    do_sample=do_sample,
                                    min_length=200,
                                    max_length=max_length,
                                    top_k=top_k,
                                    top_p=top_p,
                                    temperature=temperature,
                                    repetition_penalty=repetition_penalty,
                                    num_return_sequences=1,
                                    max_time=max_time
                                    )
    result = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
    # result = result[len(prompt) + 1:]
    time_end = time.time()
    time_diff = time_end - time_start
    print(f"result:\n{result}")
    generated_text = result
    return {"generated_text": generated_text, "processing_time": time_diff}


def get_text_generator(model_name: str, device: str = "cpu"):
    # Load the tokenizer and GPT-2 model, optionally authenticating against the
    # Hugging Face Hub via the HF_AUTH_TOKEN environment variable.
    hf_auth_token = os.getenv("HF_AUTH_TOKEN", False)
    print(f"hf_auth_token: {hf_auth_token}")
    print(f"Loading model with device: {device}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_auth_token)
    model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id,
                                            use_auth_token=hf_auth_token)
    model.to(device)
    print("Model loaded")
    return model, tokenizer


def get_config():
    with open("config.json", "r") as f:
        return json.load(f)


config = get_config()
device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = get_text_generator(model_name=config["model_name"], device=device)