import asyncio
import json
from datetime import datetime
from typing import Any, Dict, Optional

import httpx

from litellm.constants import STREAM_SSE_DONE_STRING
from litellm.litellm_core_utils.asyncify import run_async_function
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
from litellm.responses.utils import ResponsesAPIRequestUtils
from litellm.types.llms.openai import (
    OutputTextDeltaEvent,
    ResponseCompletedEvent,
    ResponsesAPIResponse,
    ResponsesAPIStreamEvents,
    ResponsesAPIStreamingResponse,
)
from litellm.utils import CustomStreamWrapper


class BaseResponsesAPIStreamingIterator:
    """
    Base class for streaming iterators that process responses from the Responses API.

    This class contains shared logic for both synchronous and asynchronous iterators.
    """

    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
        litellm_metadata: Optional[Dict[str, Any]] = None,
        custom_llm_provider: Optional[str] = None,
    ):
        self.response = response
        self.model = model
        self.logging_obj = logging_obj
        self.finished = False
        self.responses_api_provider_config = responses_api_provider_config
        self.completed_response: Optional[ResponsesAPIStreamingResponse] = None
        self.start_time = datetime.now()

        # set request kwargs
        self.litellm_metadata = litellm_metadata
        self.custom_llm_provider = custom_llm_provider

    def _process_chunk(self, chunk) -> Optional[ResponsesAPIStreamingResponse]:
        """Process a single chunk of data from the stream."""
        if not chunk:
            return None

        # Handle SSE format (data: {...})
        chunk = CustomStreamWrapper._strip_sse_data_from_chunk(chunk)
        if chunk is None:
            return None

        # Handle "[DONE]" marker
        if chunk == STREAM_SSE_DONE_STRING:
            self.finished = True
            return None

        try:
            # Parse the JSON chunk
            parsed_chunk = json.loads(chunk)

            # Format as ResponsesAPIStreamingResponse
            if isinstance(parsed_chunk, dict):
                openai_responses_api_chunk = (
                    self.responses_api_provider_config.transform_streaming_response(
                        model=self.model,
                        parsed_chunk=parsed_chunk,
                        logging_obj=self.logging_obj,
                    )
                )

                # If the chunk carries a "response" object, encode LiteLLM-specific
                # information (e.g. custom_llm_provider) into its response id
                response_object = getattr(openai_responses_api_chunk, "response", None)
                if response_object:
                    response = ResponsesAPIRequestUtils._update_responses_api_response_id_with_model_id(
                        responses_api_response=response_object,
                        litellm_metadata=self.litellm_metadata,
                        custom_llm_provider=self.custom_llm_provider,
                    )
                    setattr(openai_responses_api_chunk, "response", response)

                # Store the completed response
                if (
                    openai_responses_api_chunk
                    and openai_responses_api_chunk.type
                    == ResponsesAPIStreamEvents.RESPONSE_COMPLETED
                ):
                    self.completed_response = openai_responses_api_chunk
                    self._handle_logging_completed_response()

                return openai_responses_api_chunk

            return None
        except json.JSONDecodeError:
            # If the chunk can't be parsed as JSON, skip it and keep streaming
            return None
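
    # Illustrative mapping (a sketch, not executed anywhere): given an
    # OpenAI-style SSE stream, _process_chunk behaves roughly as follows:
    #
    #   'data: {"type": "response.output_text.delta", ...}' -> typed delta event
    #   'data: {"type": "response.completed", ...}'         -> completed event,
    #                                                          logging triggered
    #   'data: [DONE]'                                       -> None, finished=True
    #   '' or non-JSON payload                               -> None, skipped
    #
    # The event names above are assumptions based on the OpenAI Responses API
    # stream format; the exact shape is provider-specific and is produced by
    # transform_streaming_response.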

    def _handle_logging_completed_response(self):
        """Base implementation - should be overridden by subclasses."""
        pass


class ResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
    """
    Async iterator for processing streaming responses from the Responses API.
    """

    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
        litellm_metadata: Optional[Dict[str, Any]] = None,
        custom_llm_provider: Optional[str] = None,
    ):
        super().__init__(
            response,
            model,
            responses_api_provider_config,
            logging_obj,
            litellm_metadata,
            custom_llm_provider,
        )
        self.stream_iterator = response.aiter_lines()

    def __aiter__(self):
        return self

    async def __anext__(self) -> ResponsesAPIStreamingResponse:
        try:
            while True:
                # Get the next chunk from the stream
                try:
                    chunk = await self.stream_iterator.__anext__()
                except StopAsyncIteration:
                    self.finished = True
                    raise StopAsyncIteration

                result = self._process_chunk(chunk)

                if self.finished:
                    raise StopAsyncIteration
                elif result is not None:
                    return result
                # If result is None, continue the loop to get the next chunk
        except httpx.HTTPError as e:
            # Handle HTTP errors
            self.finished = True
            raise e

    def _handle_logging_completed_response(self):
        """Handle logging for completed responses in async context."""
        # Schedule async success callbacks on the running event loop without
        # blocking the stream consumer.
        asyncio.create_task(
            self.logging_obj.async_success_handler(
                result=self.completed_response,
                start_time=self.start_time,
                end_time=datetime.now(),
                cache_hit=None,
            )
        )

        # Run sync success callbacks in a background thread.
        executor.submit(
            self.logging_obj.success_handler,
            result=self.completed_response,
            cache_hit=None,
            start_time=self.start_time,
            end_time=datetime.now(),
        )
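

# Usage sketch (hypothetical call site; within LiteLLM this iterator is normally
# constructed by the Responses API HTTP handler rather than by user code):
#
#   async with client.stream("POST", url, json=payload) as raw_response:
#       stream = ResponsesAPIStreamingIterator(
#           response=raw_response,
#           model="gpt-4o",  # assumed model name, for illustration only
#           responses_api_provider_config=provider_config,
#           logging_obj=logging_obj,
#       )
#       async for event in stream:
#           ...  # each event is a typed ResponsesAPIStreamingResponse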


class SyncResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
    """
    Synchronous iterator for processing streaming responses from the Responses API.
    """

    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
        litellm_metadata: Optional[Dict[str, Any]] = None,
        custom_llm_provider: Optional[str] = None,
    ):
        super().__init__(
            response,
            model,
            responses_api_provider_config,
            logging_obj,
            litellm_metadata,
            custom_llm_provider,
        )
        self.stream_iterator = response.iter_lines()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            while True:
                # Get the next chunk from the stream
                try:
                    chunk = next(self.stream_iterator)
                except StopIteration:
                    self.finished = True
                    raise StopIteration

                result = self._process_chunk(chunk)

                if self.finished:
                    raise StopIteration
                elif result is not None:
                    return result
                # If result is None, continue the loop to get the next chunk
        except httpx.HTTPError as e:
            # Handle HTTP errors
            self.finished = True
            raise e

    def _handle_logging_completed_response(self):
        """Handle logging for completed responses in sync context."""
        # Bridge the async success callbacks into this synchronous context.
        run_async_function(
            async_function=self.logging_obj.async_success_handler,
            result=self.completed_response,
            start_time=self.start_time,
            end_time=datetime.now(),
            cache_hit=None,
        )

        # Run sync success callbacks in a background thread.
        executor.submit(
            self.logging_obj.success_handler,
            result=self.completed_response,
            cache_hit=None,
            start_time=self.start_time,
            end_time=datetime.now(),
        )
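

# Usage sketch (hypothetical call site, mirroring the async example above):
#
#   with client.stream("POST", url, json=payload) as raw_response:
#       stream = SyncResponsesAPIStreamingIterator(
#           response=raw_response,
#           model="gpt-4o",  # assumed model name, for illustration only
#           responses_api_provider_config=provider_config,
#           logging_obj=logging_obj,
#       )
#       for event in stream:
#           ...  # blocks until the next parsed event is available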


class MockResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
    """
    Mock iterator that fakes a stream by slicing the full response text into
    5-character deltas, then emitting a completed event.

    Models like o1-pro don't support streaming, so we fake it.
    """

    CHUNK_SIZE = 5

    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
        litellm_metadata: Optional[Dict[str, Any]] = None,
        custom_llm_provider: Optional[str] = None,
    ):
        super().__init__(
            response=response,
            model=model,
            responses_api_provider_config=responses_api_provider_config,
            logging_obj=logging_obj,
            litellm_metadata=litellm_metadata,
            custom_llm_provider=custom_llm_provider,
        )

        # one-time transform of the full (non-streaming) response
        transformed = (
            self.responses_api_provider_config.transform_response_api_response(
                model=self.model,
                raw_response=response,
                logging_obj=logging_obj,
            )
        )
        full_text = self._collect_text(transformed)

        # build a list of 5-char delta events
        deltas = [
            OutputTextDeltaEvent(
                type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
                delta=full_text[i : i + self.CHUNK_SIZE],
                item_id=transformed.id,
                output_index=0,
                content_index=0,
            )
            for i in range(0, len(full_text), self.CHUNK_SIZE)
        ]

        # append the completed event
        self._events = deltas + [
            ResponseCompletedEvent(
                type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
                response=transformed,
            )
        ]
        self._idx = 0

    def __aiter__(self):
        return self

    async def __anext__(self) -> ResponsesAPIStreamingResponse:
        if self._idx >= len(self._events):
            raise StopAsyncIteration
        evt = self._events[self._idx]
        self._idx += 1
        return evt

    def __iter__(self):
        return self

    def __next__(self) -> ResponsesAPIStreamingResponse:
        if self._idx >= len(self._events):
            raise StopIteration
        evt = self._events[self._idx]
        self._idx += 1
        return evt

    def _collect_text(self, resp: ResponsesAPIResponse) -> str:
        """Concatenate the text of all message output items in the response."""
        out = ""
        for out_item in resp.output:
            if out_item.type == "message":
                for c in getattr(out_item, "content", []):
                    out += c.text
        return out
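
    # Worked example of the fake chunking (a sketch; values are illustrative):
    # with CHUNK_SIZE = 5 and full_text = "Hello world", the iterator yields
    # deltas "Hello", " worl", "d", followed by a single ResponseCompletedEvent
    # carrying the full transformed response, matching a real stream's shape.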