Rox-Turbo committed on
Commit
0a51130
·
verified ·
1 Parent(s): 0a49c2d

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +210 -116
server.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import sys
4
  import time
5
  import uuid
 
6
  from typing import List, Optional, AsyncGenerator, Iterable
7
  from contextlib import asynccontextmanager
8
 
@@ -43,6 +44,18 @@ def _parse_cors_origins(value: str) -> List[str]:
43
 
44
  CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
45
  GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # Model configurations
48
  ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
@@ -86,6 +99,7 @@ async def lifespan(app: FastAPI):
86
  max_retries=max_retries,
87
  http_client=http_client,
88
  )
 
89
 
90
  try:
91
  yield
@@ -120,6 +134,15 @@ async def add_request_context(request: Request, call_next):
120
  request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
121
  start = time.perf_counter()
122
  try:
 
 
 
 
 
 
 
 
 
123
  response: Response = await call_next(request)
124
  finally:
125
  elapsed_ms = (time.perf_counter() - start) * 1000.0
@@ -148,6 +171,32 @@ def _client(app_: FastAPI) -> AsyncOpenAI:
148
  raise RuntimeError("Client not initialized")
149
  return c
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # Helper function for streaming responses
153
  async def stream_response(
@@ -161,25 +210,26 @@ async def stream_response(
161
  ) -> AsyncGenerator[str, None]:
162
  """Stream responses from OpenAI API"""
163
  try:
164
- stream = await _client(app_).chat.completions.create(
165
- model=model,
166
- messages=messages,
167
- temperature=temperature,
168
- top_p=top_p,
169
- max_tokens=max_tokens,
170
- stream=True,
171
- extra_body=extra_body
172
- )
 
173
 
174
- async for chunk in stream:
175
- delta = chunk.choices[0].delta
176
- content = getattr(delta, "content", None)
177
- if content:
178
- yield f"data: {json.dumps({'content': content})}\n\n"
179
 
180
  yield "data: [DONE]\n\n"
181
  except Exception as e:
182
- yield f"data: {json.dumps({'error': str(e)})}\n\n"
183
 
184
 
185
  @app.get("/health")
@@ -271,9 +321,9 @@ class ChatMessage(BaseModel):
271
 
272
  class ChatRequest(BaseModel):
273
  messages: List[ChatMessage]
274
- temperature: Optional[float] = 0.7
275
- top_p: Optional[float] = 0.95
276
- max_tokens: Optional[int] = 8192
277
  stream: Optional[bool] = False
278
 
279
 
@@ -301,22 +351,27 @@ async def chat(req: ChatRequest):
301
  """Rox Core - Main conversational model with streaming support"""
302
  messages = [{"role": "system", "content": ROX_CORE_IDENTITY}]
303
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
304
 
305
  if req.stream:
306
  return StreamingResponse(
307
- stream_response(app, ROX_CORE_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
308
- media_type="text/event-stream"
 
309
  )
310
 
311
  try:
312
- completion = await _client(app).chat.completions.create(
313
- model=ROX_CORE_MODEL,
314
- messages=messages,
315
- temperature=req.temperature,
316
- top_p=req.top_p,
317
- max_tokens=min(req.max_tokens, 8192),
318
- stream=False
319
- )
 
320
  return {"content": completion.choices[0].message.content or ""}
321
  except Exception as e:
322
  raise HTTPException(status_code=500, detail=str(e))
@@ -327,22 +382,27 @@ async def turbo(req: ChatRequest):
327
  """Rox 2.1 Turbo - Fast and efficient with streaming"""
328
  messages = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
329
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
330
 
331
  if req.stream:
332
  return StreamingResponse(
333
- stream_response(app, ROX_TURBO_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
334
- media_type="text/event-stream"
 
335
  )
336
 
337
  try:
338
- completion = await _client(app).chat.completions.create(
339
- model=ROX_TURBO_MODEL,
340
- messages=messages,
341
- temperature=req.temperature,
342
- top_p=req.top_p,
343
- max_tokens=min(req.max_tokens, 8192),
344
- stream=False
345
- )
 
346
  return {"content": completion.choices[0].message.content or ""}
347
  except Exception as e:
348
  raise HTTPException(status_code=500, detail=str(e))
@@ -353,30 +413,35 @@ async def coder(req: ChatRequest):
353
  """Rox 3.5 Coder - Specialized coding with streaming"""
354
  messages = [{"role": "system", "content": ROX_CODER_IDENTITY}]
355
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
356
 
357
  extra_body = {
358
  "top_k": 20,
359
  "presence_penalty": 0,
360
  "repetition_penalty": 1,
361
- "chat_template_kwargs": {"enable_thinking": True}
362
  }
363
 
364
  if req.stream:
365
  return StreamingResponse(
366
- stream_response(app, ROX_CODER_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
367
- media_type="text/event-stream"
 
368
  )
369
 
370
  try:
371
- completion = await _client(app).chat.completions.create(
372
- model=ROX_CODER_MODEL,
373
- messages=messages,
374
- temperature=req.temperature,
375
- top_p=req.top_p,
376
- max_tokens=min(req.max_tokens, 16384),
377
- stream=False,
378
- extra_body=extra_body
379
- )
 
380
  return {"content": completion.choices[0].message.content or ""}
381
  except Exception as e:
382
  raise HTTPException(status_code=500, detail=str(e))
@@ -387,25 +452,30 @@ async def turbo45(req: ChatRequest):
387
  """Rox 4.5 Turbo - Advanced reasoning with streaming"""
388
  messages = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}]
389
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
390
 
391
- extra_body = {"chat_template_kwargs": {"thinking": True}}
392
 
393
  if req.stream:
394
  return StreamingResponse(
395
- stream_response(app, ROX_TURBO_45_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
396
- media_type="text/event-stream"
 
397
  )
398
 
399
  try:
400
- completion = await _client(app).chat.completions.create(
401
- model=ROX_TURBO_45_MODEL,
402
- messages=messages,
403
- temperature=req.temperature,
404
- top_p=req.top_p,
405
- max_tokens=min(req.max_tokens, 8192),
406
- stream=False,
407
- extra_body=extra_body
408
- )
 
409
  return {"content": completion.choices[0].message.content or ""}
410
  except Exception as e:
411
  raise HTTPException(status_code=500, detail=str(e))
@@ -416,25 +486,30 @@ async def ultra(req: ChatRequest):
416
  """Rox 5 Ultra - Most advanced with streaming"""
417
  messages = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
418
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
419
 
420
- extra_body = {"chat_template_kwargs": {"thinking": True}}
421
 
422
  if req.stream:
423
  return StreamingResponse(
424
- stream_response(app, ROX_ULTRA_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
425
- media_type="text/event-stream"
 
426
  )
427
 
428
  try:
429
- completion = await _client(app).chat.completions.create(
430
- model=ROX_ULTRA_MODEL,
431
- messages=messages,
432
- temperature=req.temperature,
433
- top_p=req.top_p,
434
- max_tokens=min(req.max_tokens, 8192),
435
- stream=False,
436
- extra_body=extra_body
437
- )
 
438
  return {"content": completion.choices[0].message.content or ""}
439
  except Exception as e:
440
  raise HTTPException(status_code=500, detail=str(e))
@@ -445,25 +520,30 @@ async def dyno(req: ChatRequest):
445
  """Rox 6 Dyno - Extended context with streaming"""
446
  messages = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
447
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
448
 
449
- extra_body = {"chat_template_kwargs": {"thinking": True}}
450
 
451
  if req.stream:
452
  return StreamingResponse(
453
- stream_response(app, ROX_DYNO_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
454
- media_type="text/event-stream"
 
455
  )
456
 
457
  try:
458
- completion = await _client(app).chat.completions.create(
459
- model=ROX_DYNO_MODEL,
460
- messages=messages,
461
- temperature=req.temperature,
462
- top_p=req.top_p,
463
- max_tokens=min(req.max_tokens, 16384),
464
- stream=False,
465
- extra_body=extra_body
466
- )
 
467
  return {"content": completion.choices[0].message.content or ""}
468
  except Exception as e:
469
  raise HTTPException(status_code=500, detail=str(e))
@@ -474,30 +554,35 @@ async def coder7(req: ChatRequest):
474
  """Rox 7 Coder - Most advanced coding with streaming"""
475
  messages = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
476
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
477
 
478
  extra_body = {
479
  "chat_template_kwargs": {
480
- "enable_thinking": True,
481
  "clear_thinking": False
482
  }
483
  }
484
 
485
  if req.stream:
486
  return StreamingResponse(
487
- stream_response(app, ROX_CODER_7_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
488
- media_type="text/event-stream"
 
489
  )
490
 
491
  try:
492
- completion = await _client(app).chat.completions.create(
493
- model=ROX_CODER_7_MODEL,
494
- messages=messages,
495
- temperature=req.temperature,
496
- top_p=req.top_p,
497
- max_tokens=min(req.max_tokens, 16384),
498
- stream=False,
499
- extra_body=extra_body
500
- )
 
501
  return {"content": completion.choices[0].message.content or ""}
502
  except Exception as e:
503
  raise HTTPException(status_code=500, detail=str(e))
@@ -508,22 +593,27 @@ async def vision(req: ChatRequest):
508
  """Rox Vision Max - Visual understanding with streaming"""
509
  messages = [{"role": "system", "content": ROX_VISION_IDENTITY}]
510
  messages.extend([m.model_dump() for m in req.messages])
 
 
 
511
 
512
  if req.stream:
513
  return StreamingResponse(
514
- stream_response(app, ROX_VISION_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
515
- media_type="text/event-stream"
 
516
  )
517
 
518
  try:
519
- completion = await _client(app).chat.completions.create(
520
- model=ROX_VISION_MODEL,
521
- messages=messages,
522
- temperature=req.temperature,
523
- top_p=req.top_p,
524
- max_tokens=min(req.max_tokens, 8192),
525
- stream=False
526
- )
 
527
  return {"content": completion.choices[0].message.content or ""}
528
  except Exception as e:
529
  raise HTTPException(status_code=500, detail=str(e))
@@ -539,14 +629,18 @@ async def hf_generate(req: HFRequest):
539
  ]
540
 
541
  try:
542
- completion = await _client(app).chat.completions.create(
543
- model=ROX_CORE_MODEL,
544
- messages=messages,
545
- temperature=params.temperature or 0.7,
546
- top_p=params.top_p or 0.95,
547
- max_tokens=params.max_new_tokens or 8192,
548
- stream=False
549
- )
 
 
 
 
550
  return [{"generated_text": completion.choices[0].message.content or ""}]
551
  except Exception as e:
552
  raise HTTPException(status_code=500, detail=str(e))
 
3
  import sys
4
  import time
5
  import uuid
6
+ import asyncio
7
  from typing import List, Optional, AsyncGenerator, Iterable
8
  from contextlib import asynccontextmanager
9
 
 
44
 
45
  CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
46
  GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
47
+ MAX_REQUEST_BYTES = int(os.getenv("MAX_REQUEST_BYTES", str(1_000_000))) # 1MB default
48
+
49
+ # Fast-by-default generation settings (still fully overridable per request)
50
+ DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
51
+ DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.95"))
52
+ DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
53
+
54
+ # Concurrency guard to keep tail latency low under spikes
55
+ MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "200"))
56
+
57
+ # "Thinking" increases latency; keep opt-in via env
58
+ ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
59
 
60
  # Model configurations
61
  ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
 
99
  max_retries=max_retries,
100
  http_client=http_client,
101
  )
102
+ app.state.inflight_semaphore = asyncio.Semaphore(MAX_INFLIGHT_REQUESTS)
103
 
104
  try:
105
  yield
 
134
  request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
135
  start = time.perf_counter()
136
  try:
137
+ # Protect server from huge bodies (DoS / latency blowups)
138
+ cl = request.headers.get("content-length")
139
+ if cl is not None:
140
+ try:
141
+ if int(cl) > MAX_REQUEST_BYTES:
142
+ return JSONResponse(status_code=413, content={"error": "Request too large"})
143
+ except ValueError:
144
+ return JSONResponse(status_code=400, content={"error": "Invalid Content-Length"})
145
+
146
  response: Response = await call_next(request)
147
  finally:
148
  elapsed_ms = (time.perf_counter() - start) * 1000.0
 
171
  raise RuntimeError("Client not initialized")
172
  return c
173
 
174
+ def _semaphore(app_: FastAPI) -> asyncio.Semaphore:
175
+ s = getattr(app_.state, "inflight_semaphore", None)
176
+ if s is None:
177
+ raise RuntimeError("Semaphore not initialized")
178
+ return s
179
+
180
+ def _effective_temperature(value: Optional[float]) -> float:
181
+ return DEFAULT_TEMPERATURE if value is None else value
182
+
183
+ def _effective_top_p(value: Optional[float]) -> float:
184
+ return DEFAULT_TOP_P if value is None else value
185
+
186
+ def _effective_max_tokens(value: Optional[int], cap: int) -> int:
187
+ v = DEFAULT_MAX_TOKENS if value is None else value
188
+ if v < 1:
189
+ v = DEFAULT_MAX_TOKENS
190
+ return min(v, cap)
191
+
192
+ def _sse_headers() -> dict:
193
+ # Helps proxies (nginx) avoid buffering and keeps SSE responsive
194
+ return {
195
+ "Cache-Control": "no-cache",
196
+ "Connection": "keep-alive",
197
+ "X-Accel-Buffering": "no",
198
+ }
199
+
200
 
201
  # Helper function for streaming responses
202
  async def stream_response(
 
210
  ) -> AsyncGenerator[str, None]:
211
  """Stream responses from OpenAI API"""
212
  try:
213
+ async with _semaphore(app_):
214
+ stream = await _client(app_).chat.completions.create(
215
+ model=model,
216
+ messages=messages,
217
+ temperature=temperature,
218
+ top_p=top_p,
219
+ max_tokens=max_tokens,
220
+ stream=True,
221
+ extra_body=extra_body
222
+ )
223
 
224
+ async for chunk in stream:
225
+ delta = chunk.choices[0].delta
226
+ content = getattr(delta, "content", None)
227
+ if content:
228
+ yield f"data: {json.dumps({'content': content}, separators=(',', ':'))}\n\n"
229
 
230
  yield "data: [DONE]\n\n"
231
  except Exception as e:
232
+ yield f"data: {json.dumps({'error': str(e)}, separators=(',', ':'))}\n\n"
233
 
234
 
235
  @app.get("/health")
 
321
 
322
  class ChatRequest(BaseModel):
323
  messages: List[ChatMessage]
324
+ temperature: Optional[float] = None
325
+ top_p: Optional[float] = None
326
+ max_tokens: Optional[int] = None
327
  stream: Optional[bool] = False
328
 
329
 
 
351
  """Rox Core - Main conversational model with streaming support"""
352
  messages = [{"role": "system", "content": ROX_CORE_IDENTITY}]
353
  messages.extend([m.model_dump() for m in req.messages])
354
+ temperature = _effective_temperature(req.temperature)
355
+ top_p = _effective_top_p(req.top_p)
356
+ max_tokens = _effective_max_tokens(req.max_tokens, 8192)
357
 
358
  if req.stream:
359
  return StreamingResponse(
360
+ stream_response(app, ROX_CORE_MODEL, messages, temperature, top_p, max_tokens),
361
+ media_type="text/event-stream",
362
+ headers=_sse_headers(),
363
  )
364
 
365
  try:
366
+ async with _semaphore(app):
367
+ completion = await _client(app).chat.completions.create(
368
+ model=ROX_CORE_MODEL,
369
+ messages=messages,
370
+ temperature=temperature,
371
+ top_p=top_p,
372
+ max_tokens=max_tokens,
373
+ stream=False
374
+ )
375
  return {"content": completion.choices[0].message.content or ""}
376
  except Exception as e:
377
  raise HTTPException(status_code=500, detail=str(e))
 
382
  """Rox 2.1 Turbo - Fast and efficient with streaming"""
383
  messages = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
384
  messages.extend([m.model_dump() for m in req.messages])
385
+ temperature = _effective_temperature(req.temperature)
386
+ top_p = _effective_top_p(req.top_p)
387
+ max_tokens = _effective_max_tokens(req.max_tokens, 8192)
388
 
389
  if req.stream:
390
  return StreamingResponse(
391
+ stream_response(app, ROX_TURBO_MODEL, messages, temperature, top_p, max_tokens),
392
+ media_type="text/event-stream",
393
+ headers=_sse_headers(),
394
  )
395
 
396
  try:
397
+ async with _semaphore(app):
398
+ completion = await _client(app).chat.completions.create(
399
+ model=ROX_TURBO_MODEL,
400
+ messages=messages,
401
+ temperature=temperature,
402
+ top_p=top_p,
403
+ max_tokens=max_tokens,
404
+ stream=False
405
+ )
406
  return {"content": completion.choices[0].message.content or ""}
407
  except Exception as e:
408
  raise HTTPException(status_code=500, detail=str(e))
 
413
  """Rox 3.5 Coder - Specialized coding with streaming"""
414
  messages = [{"role": "system", "content": ROX_CODER_IDENTITY}]
415
  messages.extend([m.model_dump() for m in req.messages])
416
+ temperature = _effective_temperature(req.temperature)
417
+ top_p = _effective_top_p(req.top_p)
418
+ max_tokens = _effective_max_tokens(req.max_tokens, 16384)
419
 
420
  extra_body = {
421
  "top_k": 20,
422
  "presence_penalty": 0,
423
  "repetition_penalty": 1,
424
+ "chat_template_kwargs": {"enable_thinking": ENABLE_THINKING}
425
  }
426
 
427
  if req.stream:
428
  return StreamingResponse(
429
+ stream_response(app, ROX_CODER_MODEL, messages, temperature, top_p, max_tokens, extra_body),
430
+ media_type="text/event-stream",
431
+ headers=_sse_headers(),
432
  )
433
 
434
  try:
435
+ async with _semaphore(app):
436
+ completion = await _client(app).chat.completions.create(
437
+ model=ROX_CODER_MODEL,
438
+ messages=messages,
439
+ temperature=temperature,
440
+ top_p=top_p,
441
+ max_tokens=max_tokens,
442
+ stream=False,
443
+ extra_body=extra_body
444
+ )
445
  return {"content": completion.choices[0].message.content or ""}
446
  except Exception as e:
447
  raise HTTPException(status_code=500, detail=str(e))
 
452
  """Rox 4.5 Turbo - Advanced reasoning with streaming"""
453
  messages = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}]
454
  messages.extend([m.model_dump() for m in req.messages])
455
+ temperature = _effective_temperature(req.temperature)
456
+ top_p = _effective_top_p(req.top_p)
457
+ max_tokens = _effective_max_tokens(req.max_tokens, 8192)
458
 
459
+ extra_body = {"chat_template_kwargs": {"thinking": ENABLE_THINKING}} if ENABLE_THINKING else None
460
 
461
  if req.stream:
462
  return StreamingResponse(
463
+ stream_response(app, ROX_TURBO_45_MODEL, messages, temperature, top_p, max_tokens, extra_body),
464
+ media_type="text/event-stream",
465
+ headers=_sse_headers(),
466
  )
467
 
468
  try:
469
+ async with _semaphore(app):
470
+ completion = await _client(app).chat.completions.create(
471
+ model=ROX_TURBO_45_MODEL,
472
+ messages=messages,
473
+ temperature=temperature,
474
+ top_p=top_p,
475
+ max_tokens=max_tokens,
476
+ stream=False,
477
+ extra_body=extra_body
478
+ )
479
  return {"content": completion.choices[0].message.content or ""}
480
  except Exception as e:
481
  raise HTTPException(status_code=500, detail=str(e))
 
486
  """Rox 5 Ultra - Most advanced with streaming"""
487
  messages = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
488
  messages.extend([m.model_dump() for m in req.messages])
489
+ temperature = _effective_temperature(req.temperature)
490
+ top_p = _effective_top_p(req.top_p)
491
+ max_tokens = _effective_max_tokens(req.max_tokens, 8192)
492
 
493
+ extra_body = {"chat_template_kwargs": {"thinking": ENABLE_THINKING}} if ENABLE_THINKING else None
494
 
495
  if req.stream:
496
  return StreamingResponse(
497
+ stream_response(app, ROX_ULTRA_MODEL, messages, temperature, top_p, max_tokens, extra_body),
498
+ media_type="text/event-stream",
499
+ headers=_sse_headers(),
500
  )
501
 
502
  try:
503
+ async with _semaphore(app):
504
+ completion = await _client(app).chat.completions.create(
505
+ model=ROX_ULTRA_MODEL,
506
+ messages=messages,
507
+ temperature=temperature,
508
+ top_p=top_p,
509
+ max_tokens=max_tokens,
510
+ stream=False,
511
+ extra_body=extra_body
512
+ )
513
  return {"content": completion.choices[0].message.content or ""}
514
  except Exception as e:
515
  raise HTTPException(status_code=500, detail=str(e))
 
520
  """Rox 6 Dyno - Extended context with streaming"""
521
  messages = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
522
  messages.extend([m.model_dump() for m in req.messages])
523
+ temperature = _effective_temperature(req.temperature)
524
+ top_p = _effective_top_p(req.top_p)
525
+ max_tokens = _effective_max_tokens(req.max_tokens, 16384)
526
 
527
+ extra_body = {"chat_template_kwargs": {"thinking": ENABLE_THINKING}} if ENABLE_THINKING else None
528
 
529
  if req.stream:
530
  return StreamingResponse(
531
+ stream_response(app, ROX_DYNO_MODEL, messages, temperature, top_p, max_tokens, extra_body),
532
+ media_type="text/event-stream",
533
+ headers=_sse_headers(),
534
  )
535
 
536
  try:
537
+ async with _semaphore(app):
538
+ completion = await _client(app).chat.completions.create(
539
+ model=ROX_DYNO_MODEL,
540
+ messages=messages,
541
+ temperature=temperature,
542
+ top_p=top_p,
543
+ max_tokens=max_tokens,
544
+ stream=False,
545
+ extra_body=extra_body
546
+ )
547
  return {"content": completion.choices[0].message.content or ""}
548
  except Exception as e:
549
  raise HTTPException(status_code=500, detail=str(e))
 
554
  """Rox 7 Coder - Most advanced coding with streaming"""
555
  messages = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
556
  messages.extend([m.model_dump() for m in req.messages])
557
+ temperature = _effective_temperature(req.temperature)
558
+ top_p = _effective_top_p(req.top_p)
559
+ max_tokens = _effective_max_tokens(req.max_tokens, 16384)
560
 
561
  extra_body = {
562
  "chat_template_kwargs": {
563
+ "enable_thinking": ENABLE_THINKING,
564
  "clear_thinking": False
565
  }
566
  }
567
 
568
  if req.stream:
569
  return StreamingResponse(
570
+ stream_response(app, ROX_CODER_7_MODEL, messages, temperature, top_p, max_tokens, extra_body),
571
+ media_type="text/event-stream",
572
+ headers=_sse_headers(),
573
  )
574
 
575
  try:
576
+ async with _semaphore(app):
577
+ completion = await _client(app).chat.completions.create(
578
+ model=ROX_CODER_7_MODEL,
579
+ messages=messages,
580
+ temperature=temperature,
581
+ top_p=top_p,
582
+ max_tokens=max_tokens,
583
+ stream=False,
584
+ extra_body=extra_body
585
+ )
586
  return {"content": completion.choices[0].message.content or ""}
587
  except Exception as e:
588
  raise HTTPException(status_code=500, detail=str(e))
 
593
  """Rox Vision Max - Visual understanding with streaming"""
594
  messages = [{"role": "system", "content": ROX_VISION_IDENTITY}]
595
  messages.extend([m.model_dump() for m in req.messages])
596
+ temperature = _effective_temperature(req.temperature)
597
+ top_p = _effective_top_p(req.top_p)
598
+ max_tokens = _effective_max_tokens(req.max_tokens, 8192)
599
 
600
  if req.stream:
601
  return StreamingResponse(
602
+ stream_response(app, ROX_VISION_MODEL, messages, temperature, top_p, max_tokens),
603
+ media_type="text/event-stream",
604
+ headers=_sse_headers(),
605
  )
606
 
607
  try:
608
+ async with _semaphore(app):
609
+ completion = await _client(app).chat.completions.create(
610
+ model=ROX_VISION_MODEL,
611
+ messages=messages,
612
+ temperature=temperature,
613
+ top_p=top_p,
614
+ max_tokens=max_tokens,
615
+ stream=False
616
+ )
617
  return {"content": completion.choices[0].message.content or ""}
618
  except Exception as e:
619
  raise HTTPException(status_code=500, detail=str(e))
 
629
  ]
630
 
631
  try:
632
+ temperature = _effective_temperature(params.temperature)
633
+ top_p = _effective_top_p(params.top_p)
634
+ max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
635
+ async with _semaphore(app):
636
+ completion = await _client(app).chat.completions.create(
637
+ model=ROX_CORE_MODEL,
638
+ messages=messages,
639
+ temperature=temperature,
640
+ top_p=top_p,
641
+ max_tokens=max_tokens,
642
+ stream=False
643
+ )
644
  return [{"generated_text": completion.choices[0].message.content or ""}]
645
  except Exception as e:
646
  raise HTTPException(status_code=500, detail=str(e))