Oleg Lavrovsky committed: Completion API

app.py CHANGED

@@ -2,6 +2,7 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, ValidationError
+from typing import List, Optional
 
 from torch import cuda
 from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -40,10 +41,16 @@ class ModelResponse(BaseModel):
     confidence: float
     processing_time: float
 
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
 class Completion(BaseModel):
-    model: str
-    max_tokens: int =
+    model: str = "apertus"
+    messages: List[ChatMessage]
+    max_tokens: Optional[int] = 512
+    temperature: Optional[float] = 0.1
+    top_p: Optional[float] = 0.9
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
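
The new Completion model replaces the flat request schema with an OpenAI-style chat payload. A minimal sketch of how the two models validate a request body, restated outside the app (pydantic v2 assumed for model_dump; on v1 use .dict()):

from typing import List, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    role: str
    content: str


class Completion(BaseModel):
    model: str = "apertus"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    top_p: Optional[float] = 0.9


# Only "messages" is required; every other field falls back to the defaults above.
req = Completion(messages=[{"role": "user", "content": "Hello"}])
print(req.model_dump())  # pydantic v2; on v1 use req.dict()
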

@@ -104,13 +111,10 @@ def fit_to_length(text, min_length=3, max_length=100):
     return text
 
 
-def get_model_reponse(
+def get_model_reponse(messages_think):
     """Process the text content."""
 
     # Prepare the model input
-    messages_think = [
-        {"role": "user", "content": query}
-    ]
     text = tokenizer.apply_chat_template(
         messages_think,
         tokenize=False,
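
get_model_reponse now takes the message list directly instead of building it from a raw query string, so both endpoints can share one code path. A sketch of what apply_chat_template produces from such a list; the checkpoint name and the add_generation_prompt flag are assumptions, not from this diff:

from transformers import AutoTokenizer

# Illustrative chat-tuned checkpoint; the app loads its own model/tokenizer.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")

messages_think = [{"role": "user", "content": "Hello"}]

# tokenize=False returns the formatted prompt string instead of token ids;
# add_generation_prompt=True appends the assistant header so generation
# continues as the assistant turn.
text = tokenizer.apply_chat_template(
    messages_think,
    tokenize=False,
    add_generation_prompt=True,
)
print(text)
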

@@ -144,9 +148,7 @@ async def completion(data: Completion):
         raise HTTPException(status_code=503, detail="Model not loaded")
 
     try:
-        result = get_model_reponse(text, model)
+        result = get_model_reponse(data)
 
         return {
             "choices": [
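
With Completion as the request body, the handler reads like an OpenAI-compatible chat completion route. A hedged sketch of a client call; the route path, port, and the response shape beyond "choices" are assumptions inferred from the handler, not confirmed by this diff:

import requests

resp = requests.post(
    "http://localhost:7860/completion",  # path and port assumed from the handler name
    json={
        "model": "apertus",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"])
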

@@ -181,7 +183,10 @@ async def predict(q: str):
 
     text = fit_to_length(input_data.text, input_data.min_length, input_data.max_length)
 
+    messages_think = [
+        {"role": "user", "content": text}
+    ]
+    result = get_model_reponse(messages_think)
 
     # Checkpoint
     processing_time = time.time() - start_time
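
The predict path wraps the length-checked query in a single-turn message list and times the round trip. A condensed, runnable sketch of that flow with the model call stubbed out; the stub bodies are placeholders, while the names (including the diff's spelling get_model_reponse) follow the commit:

import time

def fit_to_length(text, min_length=3, max_length=100):
    # Stub: the real helper validates/clamps the query length.
    return text[:max_length]

def get_model_reponse(messages_think):
    # Stub standing in for the tokenizer + model generation call.
    return "echo: " + messages_think[-1]["content"]

start_time = time.time()
text = fit_to_length("What is the capital of Switzerland?")
messages_think = [
    {"role": "user", "content": text}
]
result = get_model_reponse(messages_think)

# Checkpoint: elapsed wall-clock time for the whole call
processing_time = time.time() - start_time
print(result, "(%.3fs)" % processing_time)
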