OkeyMeta committed · verified
Commit 5348cd5 · 1 Parent(s): 52da7b7

Add-openai-compatible-runtime-docs

README.md CHANGED
@@ -100,6 +100,14 @@ Then send one JSON object per line:
  {"prompt":"Who won the most recent mayoral runoff in Rivergate?","tool_results":[{"name":"web.search","ok":true,"source":{"title":"Local Civic Wire","url":"https://example.org/rivergate-runoff","snippet":"Mara Ibekwe won the Rivergate mayoral runoff with 52.4 percent of the vote."}}],"max_tokens":80}
  ```
 
+ For OpenAI-style chat completion JSON:
+
+ ```bash
+ python -m reframr chat-completion --model model.safetensors < request.json
+ ```
+
+ Set `"stream": true` in the request to receive SSE-style `data: ...` chunks ending with `data: [DONE]`. See `docs/openai_compat.md` for chat, streaming, and host-side tool-loop examples.
+
  ## OpenAI-Style Tool Format
 
  Reframr v2 can consume OpenAI-style `messages` and tool results through the included `compose_generation_context` helper. The model does not browse by itself from static weights; your app provides tool outputs, and Reframr writes the final answer from that evidence.
docs/openai_compat.md ADDED
@@ -0,0 +1,102 @@
# Reframr OpenAI-Compatible Runtime

Reframr v3 runtime work includes an OpenAI-style adapter so apps can plug Reframr into existing chat, support, and tool-orchestration systems without writing custom prompt glue.

## Chat Completion

```python
from pathlib import Path

from reframr import ReframrModel, build_chat_completion_response

model = ReframrModel.load(Path("model.safetensors"))

response = build_chat_completion_response(
    model,
    {
        "model": "reframr-v3",
        "messages": [
            {"role": "system", "content": "Be concise and cite sources when tool results are provided."},
            {"role": "user", "content": "Summarize this customer support issue."},
        ],
        "max_tokens": 160,
        "temperature": 0.58,
    },
)

print(response["choices"][0]["message"]["content"])
```
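`build_chat_completion_response` returns an OpenAI-style `chat.completion` payload. A trimmed sketch of the fields this adapter fills (id, timestamp, and token counts illustrative; usage is approximated by whitespace word count):

```json
{
  "id": "chatcmpl-3fa8...",
  "object": "chat.completion",
  "created": 1700000000,
  "model": "reframr-v3",
  "choices": [
    {
      "index": 0,
      "message": {"role": "assistant", "content": "..."},
      "finish_reason": "stop"
    }
  ],
  "usage": {"prompt_tokens": 42, "completion_tokens": 30, "total_tokens": 72}
}
```

When the model requests a tool, `content` is empty, `message.tool_calls` is populated, and `finish_reason` becomes `"tool_calls"`.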
## Streaming

```python
from reframr.openai_compat import iter_sse_chat_completion

for event in iter_sse_chat_completion(model, request):
    send_to_browser(event)
```

The stream emits OpenAI-style `chat.completion.chunk` SSE events and ends with:

```text
data: [DONE]
```
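Earlier events each carry an incremental delta: the first chunk sets `{"role": "assistant"}`, the following chunks carry content slices (12 characters each by default), and the final chunk carries the finish reason. A representative content chunk (id and timestamp illustrative):

```text
data: {"id":"chatcmpl-3fa8...","object":"chat.completion.chunk","created":1700000000,"model":"reframr-v3","choices":[{"index":0,"delta":{"content":"Thanks for "},"finish_reason":null}]}
```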
## Tool Loop

Register real tools in the host application. Reframr can request a tool with `<tool_call>`, the host executes the function, and the result is fed back as `<tool_result>` / `<source>` evidence.
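Concretely, `parse_tool_call` recognizes a completion that starts with the `<tool_call>` marker, followed by the tool name and JSON arguments (the query below is illustrative):

```text
<tool_call>web.search {"query": "latest official release notes"}
```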
```python
from reframr.openai_compat import run_tool_loop

def web_search(arguments: dict[str, object]) -> dict[str, object]:
    query = str(arguments["query"])
    result = your_search_client.search(query)
    return {
        "ok": True,
        "source": {
            "title": result.title,
            "url": result.url,
            "snippet": result.snippet,
        },
    }

response = run_tool_loop(
    model,
    {
        "model": "reframr-v3",
        "messages": [
            {"role": "user", "content": "What changed in the latest official release notes?"}
        ],
    },
    tools={"web.search": web_search},
    max_rounds=3,
)
```

If a tool is missing or fails, the adapter sends the failure back as a tool result instead of crashing. That lets Reframr answer honestly, retry with a different tool if the model requests one, or ask the user for source evidence.
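For example, a call to a tool that was never registered reaches the model as this tool result (tool name illustrative):

```json
{"ok": false, "error": "tool not registered: news.fetch"}
```

A tool that raises an exception is reported the same way, with the exception message in `error`.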
## CLI

```bash
python -m reframr chat-completion --model model.safetensors < request.json
```
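The request can also be read from a file instead of stdin via the `--request` flag:

```bash
python -m reframr chat-completion --model model.safetensors --request request.json
```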
For SSE output:

```json
{
  "model": "reframr-v3",
  "stream": true,
  "messages": [
    {"role": "user", "content": "Write a short support reply."}
  ]
}
```
## Deployment Notes

- Keep real tools outside the model runtime and pass their outputs back as data.
- Treat source quality as part of the product: validate URLs, timestamps, permissions, and user access.
- Do not let the model fabricate tool results. If no tool result exists for a fresh fact, the app should ask for retrieval or return an uncertainty-aware answer.
- Use `session_id` with `python -m reframr serve` when you want conversation memory in the JSONL server (see the sketch below).
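A minimal sketch of a JSONL request that opts into conversation memory, assuming the same request fields as the README's JSONL examples plus a top-level `session_id` key (the value is illustrative):

```json
{"prompt":"And what was the turnout?","session_id":"support-1423","max_tokens":80}
```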
reframr/__init__.py CHANGED
@@ -13,6 +13,7 @@ from .config import ReframrConfig
  from .embeddings import EmbeddingModel, fit_ppmi_embedding
  from .hippo import AnalyticalMemoryUnit, hippo_legs_matrix
  from .model import ReframrModel
+ from .openai_compat import build_chat_completion_response, run_tool_loop
  from .reasoning import REASONING_CONTROL_TOKENS, REASONING_PROFILES, TOKENIZER_NAME
  from .tokenizer import NativeTokenizer
 
@@ -25,8 +26,10 @@ __all__ = [
      "ReframrConfig",
      "ReframrModel",
      "TOKENIZER_NAME",
+     "build_chat_completion_response",
      "fit_ppmi_embedding",
      "hippo_legs_matrix",
      "inspect_checkpoint",
      "read_safetensor_file",
+     "run_tool_loop",
  ]
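With this export change, both helpers are importable from the package root alongside the model class:

```python
from reframr import ReframrModel, build_chat_completion_response, run_tool_loop
```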
reframr/cli.py CHANGED
@@ -261,6 +261,17 @@ def build_parser() -> argparse.ArgumentParser:
          help="Override the checkpoint's default reasoning-control profile.",
      )
 
+     chat_completion = subparsers.add_parser(
+         "chat-completion",
+         help="Run one OpenAI-compatible chat completion request from stdin or a JSON file.",
+     )
+     chat_completion.add_argument("--model", required=True, help="Path to a serialized REFRAMR model.")
+     chat_completion.add_argument(
+         "--request",
+         default="",
+         help="Optional path to a JSON request. Defaults to stdin.",
+     )
+
      trace = subparsers.add_parser("trace", help="Trace REFRAMR reasoning components through generation steps.")
      trace.add_argument("--model", required=True, help="Path to a serialized REFRAMR model.")
      trace.add_argument("--context", required=True, help="Prompt or starting context text.")
@@ -1204,6 +1215,29 @@ def command_serve(args: argparse.Namespace) -> int:
      return 0
 
 
+ def command_chat_completion(args: argparse.Namespace) -> int:
+     from .openai_compat import build_chat_completion_response, iter_sse_chat_completion
+
+     request_path = str(getattr(args, "request", "")).strip()
+     if request_path:
+         request_text = Path(request_path).read_text(encoding="utf-8")
+     else:
+         request_text = sys.stdin.read()
+     request = json.loads(request_text)
+     if not isinstance(request, dict):
+         raise ValueError("chat-completion request must be a JSON object")
+     model = ReframrModel.load(args.model)
+     if bool(request.get("stream", False)):
+         for event in iter_sse_chat_completion(model, request):
+             sys.stdout.write(event)
+             sys.stdout.flush()
+         return 0
+     response = build_chat_completion_response(model, request)
+     sys.stdout.write(json.dumps(response, ensure_ascii=False, separators=(",", ":")) + "\n")
+     sys.stdout.flush()
+     return 0
+
+
  def command_trace(args: argparse.Namespace) -> int:
      model = ReframrModel.load(args.model)
      payload = model.trace_generation(
@@ -1452,6 +1486,8 @@ def main(argv: list[str] | None = None) -> int:
          return command_generate_batch(args)
      if args.command == "serve":
          return command_serve(args)
+     if args.command == "chat-completion":
+         return command_chat_completion(args)
      if args.command == "trace":
          return command_trace(args)
      if args.command == "inspect":
reframr/openai_compat.py ADDED
@@ -0,0 +1,253 @@
from __future__ import annotations

import json
import time
import uuid
from typing import Any, Callable

from .cli import compose_generation_context


def build_chat_completion_response(model: Any, request: dict[str, Any]) -> dict[str, Any]:
    """Run a Reframr model behind an OpenAI-style chat-completions shape."""

    model_name = str(request.get("model", "reframr"))
    context = compose_generation_context(
        str(request.get("prompt", "")),
        system=str(request.get("system", "")),
        messages=request.get("messages"),
        tool_results=request.get("tool_results", request.get("toolResults")),
    )
    generated_text = str(
        model.generate_text(
            context,
            max_tokens=int(request.get("max_tokens", request.get("max_completion_tokens", 120))),
            reasoning_mode=request.get("reasoning_mode", request.get("reasoningMode")),
            temperature=float(request.get("temperature", 0.58)),
            top_k=int(request.get("top_k", request.get("decode_top_k", 64))),
            top_p=float(request.get("top_p", request.get("decode_top_p", 0.92))),
            repetition_penalty=float(request.get("repetition_penalty", 1.25)),
        )
    ).strip()
    tool_call = parse_tool_call(generated_text)
    if tool_call is None:
        message = {"role": "assistant", "content": generated_text}
        finish_reason = "stop"
    else:
        message = {"role": "assistant", "content": "", "tool_calls": [tool_call]}
        finish_reason = "tool_calls"
    prompt_tokens = _approx_token_count(context)
    completion_tokens = _approx_token_count(generated_text)
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }


def iter_chat_completion_chunks(
    model: Any,
    request: dict[str, Any],
    *,
    chunk_size: int = 12,
) -> Any:
    """Yield OpenAI-style streaming chunk dictionaries for a Reframr response."""

    full_response = build_chat_completion_response(model, request)
    model_name = str(full_response["model"])
    response_id = str(full_response["id"])
    created = int(full_response["created"])
    choice = full_response["choices"][0]
    message = choice["message"]
    yield _stream_chunk(
        response_id,
        model_name,
        created,
        {"role": "assistant"},
        finish_reason=None,
    )
    tool_calls = message.get("tool_calls") if isinstance(message, dict) else None
    if isinstance(tool_calls, list) and tool_calls:
        yield _stream_chunk(
            response_id,
            model_name,
            created,
            {"tool_calls": tool_calls},
            finish_reason=None,
        )
    else:
        content = str(message.get("content", "")) if isinstance(message, dict) else ""
        for part in _split_stream_content(content, chunk_size=max(1, int(chunk_size))):
            yield _stream_chunk(
                response_id,
                model_name,
                created,
                {"content": part},
                finish_reason=None,
            )
    yield _stream_chunk(
        response_id,
        model_name,
        created,
        {},
        finish_reason=str(choice.get("finish_reason", "stop")),
    )


def iter_sse_chat_completion(
    model: Any,
    request: dict[str, Any],
    *,
    chunk_size: int = 12,
) -> Any:
    for chunk in iter_chat_completion_chunks(model, request, chunk_size=chunk_size):
        yield f"data: {json.dumps(chunk, ensure_ascii=False, separators=(',', ':'))}\n\n"
    yield "data: [DONE]\n\n"


def run_tool_loop(
    model: Any,
    request: dict[str, Any],
    *,
    tools: dict[str, Callable[[dict[str, Any]], Any]],
    max_rounds: int = 3,
) -> dict[str, Any]:
    """Run chat completions, executing registered tools when the model asks."""

    messages = [dict(message) for message in request.get("messages", []) if isinstance(message, dict)]
    current_request = dict(request)
    last_response: dict[str, Any] | None = None
    for _ in range(max(1, int(max_rounds))):
        current_request["messages"] = messages
        last_response = build_chat_completion_response(model, current_request)
        choice = last_response["choices"][0]
        message = choice["message"]
        if choice.get("finish_reason") != "tool_calls":
            return last_response
        tool_calls = message.get("tool_calls", [])
        if not isinstance(tool_calls, list) or not tool_calls:
            return last_response
        messages.append({"role": "assistant", "content": "", "tool_calls": tool_calls})
        for tool_call in tool_calls:
            tool_result = _execute_tool_call(tool_call, tools)
            function_payload = tool_call.get("function", {}) if isinstance(tool_call, dict) else {}
            tool_name = str(function_payload.get("name", "tool"))
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": str(tool_call.get("id", "")) if isinstance(tool_call, dict) else "",
                    "name": tool_name,
                    "content": json.dumps(tool_result, ensure_ascii=False, separators=(",", ":")),
                }
            )
    return last_response if last_response is not None else build_chat_completion_response(model, request)


def parse_tool_call(text: str) -> dict[str, Any] | None:
    stripped = text.strip()
    marker = "<tool_call>"
    if not stripped.startswith(marker):
        return None
    payload = stripped[len(marker) :].strip()
    if not payload:
        return _tool_call_payload("tool", {})
    name, _, raw_arguments = payload.partition(" ")
    name = name.strip() or "tool"
    arguments = _normalize_tool_arguments(raw_arguments.strip())
    return _tool_call_payload(name, arguments)


def _execute_tool_call(
    tool_call: Any,
    tools: dict[str, Callable[[dict[str, Any]], Any]],
) -> dict[str, Any]:
    if not isinstance(tool_call, dict):
        return {"ok": False, "error": "tool_call must be an object"}
    function_payload = tool_call.get("function", {})
    function = function_payload if isinstance(function_payload, dict) else {}
    tool_name = str(function.get("name", ""))
    arguments = _normalize_tool_arguments(str(function.get("arguments", "")))
    tool = tools.get(tool_name)
    if tool is None:
        return {"ok": False, "error": f"tool not registered: {tool_name}"}
    try:
        result = tool(arguments)
    except Exception as exc:  # pragma: no cover - defensive surface for app tools.
        return {"ok": False, "error": str(exc)}
    if isinstance(result, dict):
        return result
    return {"ok": True, "content": result}


def _tool_call_payload(name: str, arguments: dict[str, Any]) -> dict[str, Any]:
    return {
        "id": f"call_{uuid.uuid4().hex[:12]}",
        "type": "function",
        "function": {
            "name": name,
            "arguments": json.dumps(arguments, ensure_ascii=False, separators=(",", ":")),
        },
    }


def _stream_chunk(
    response_id: str,
    model_name: str,
    created: int,
    delta: dict[str, Any],
    *,
    finish_reason: str | None,
) -> dict[str, Any]:
    return {
        "id": response_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "delta": delta,
                "finish_reason": finish_reason,
            }
        ],
    }


def _split_stream_content(content: str, *, chunk_size: int) -> list[str]:
    if not content:
        return []
    chunks: list[str] = []
    start = 0
    while start < len(content):
        chunks.append(content[start : start + chunk_size])
        start += chunk_size
    return chunks


def _normalize_tool_arguments(raw_arguments: str) -> dict[str, Any]:
    if not raw_arguments:
        return {}
    try:
        parsed = json.loads(raw_arguments)
    except json.JSONDecodeError:
        return {"input": raw_arguments}
    if isinstance(parsed, dict):
        return parsed
    return {"input": parsed}


def _approx_token_count(text: str) -> int:
    return len([part for part in text.split() if part])