#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Event models and data structures for OpenAI Realtime API communication."""
import json
import uuid
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
from pipecat.adapters.schemas.tools_schema import ToolsSchema
#
# session properties
#
[docs]
class TurnDetection(BaseModel):
    """Settings for server-side voice activity detection (VAD).

    Parameters:
        type: Kind of detection; "server_vad" is the only accepted string value.
        threshold: VAD threshold in the range 0.0-1.0. Defaults to 0.5.
        prefix_padding_ms: Padding, in milliseconds, kept before speech starts.
            Defaults to 300.
        silence_duration_ms: Length of silence, in milliseconds, used to detect the
            end of speech. Defaults to 500.
    """

    type: Literal["server_vad"] | None = "server_vad"
    threshold: float | None = 0.5
    prefix_padding_ms: int | None = 300
    silence_duration_ms: int | None = 500
[docs]
class SemanticTurnDetection(BaseModel):
    """Settings for semantic turn detection.

    Parameters:
        type: Kind of detection; "semantic_vad" is the only accepted string value.
        eagerness: How eager turn detection is: "low", "medium", "high", or "auto".
        create_response: Whether a response is created automatically when a turn
            is detected.
        interrupt_response: Whether an in-flight response is interrupted when a
            turn is detected.
    """

    type: Literal["semantic_vad"] | None = "semantic_vad"
    eagerness: Literal["low", "medium", "high", "auto"] | None = None
    create_response: bool | None = None
    interrupt_response: bool | None = None
[docs]
class AudioOutput(BaseModel):
    """Settings for the audio the model produces.

    Parameters:
        format: Format of the output audio.
        voice: Voice the model responds with.
        speed: Speed of the model's spoken response.
    """

    format: PCMAudioFormat | PCMUAudioFormat | PCMAAudioFormat | None = None
    voice: str | None = None
    speed: float | None = None
[docs]
class AudioConfiguration(BaseModel):
    """Combined input and output audio settings.

    Parameters:
        input: Settings for input audio.
        output: Settings for output audio.
    """

    input: AudioInput | None = None
    output: AudioOutput | None = None
[docs]
class SessionProperties(BaseModel):
    """Properties that configure an OpenAI Realtime session.

    Parameters:
        type: Session type, always "realtime".
        object: Object type identifier, always "realtime.session".
        id: Unique identifier of the session.
        model: Realtime model used for the session.
            Note: The model is set at connection time via model arg in __init__
            and cannot be changed during the session.
        output_modalities: Modalities the model may respond with.
        instructions: System instructions given to the assistant.
        audio: Input and output audio configuration.
        tools: Function tools available to the assistant.
        tool_choice: Tool usage strategy ("auto", "none", or "required").
        max_output_tokens: Maximum tokens in a response, or "inf" for unlimited.
        tracing: Tracing configuration options.
        prompt: Reference to a prompt template and its variables.
        expires_at: Timestamp at which the session expires.
        include: Extra fields to include in server outputs.
    """

    # Arbitrary types must be allowed so that ToolsSchema can be used in `tools`.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    type: Literal["realtime"] | None = "realtime"
    object: Literal["realtime.session"] | None = None
    id: str | None = None
    model: str | None = None
    output_modalities: list[Literal["text", "audio"]] | None = None
    instructions: str | None = None
    audio: AudioConfiguration | None = None
    # `tools` is a ToolsSchema only when supplied by the user, either through the
    # OpenAIRealtimeLLMService constructor or through LLMUpdateSettingsFrame.
    # A ToolsSchema is never serialized/deserialized when talking to the server.
    tools: ToolsSchema | list[dict] | None = None
    tool_choice: Literal["auto", "none", "required"] | None = None
    max_output_tokens: int | Literal["inf"] | None = None
    tracing: Literal["auto"] | dict | None = None
    prompt: dict | None = None
    expires_at: int | None = None
    include: list[str] | None = None
#
# context
#
[docs]
class ItemContent(BaseModel):
    """One piece of content inside a conversation item.

    Parameters:
        type: Content type (text, audio, input_text, input_audio, input_image,
            output_text, or output_audio).
        text: Text for text-based content.
        audio: Base64-encoded audio data for audio content.
        transcript: Transcribed text for audio content.
        image_url: Base64-encoded image, as a data URI, for input_image content.
        detail: Image processing detail level ("auto", "low", or "high").
    """

    type: Literal[
        "text", "audio", "input_text", "input_audio", "input_image", "output_text", "output_audio"
    ]
    text: str | None = None
    audio: str | None = None  # base64-encoded audio
    transcript: str | None = None
    image_url: str | None = None  # base64-encoded image as data URI
    detail: Literal["auto", "low", "high"] | None = None
[docs]
class ConversationItem(BaseModel):
    """An item of a realtime session conversation.

    Parameters:
        id: Unique item identifier; generated automatically when omitted.
        object: Object type identifier for the realtime API.
        type: Item type (message, function_call, or function_call_output).
        status: Current status of the item.
        role: Speaker role for message items (user, assistant, or system).
        content: List of content parts for message items.
        call_id: Function call identifier for function_call items.
        name: Function name for function_call items.
        arguments: Function arguments, as a JSON string, for function_call items.
        output: Function output, as a JSON string, for function_call_output items.
    """

    # Default id is the 32-character hex form of a random UUID.
    id: str = Field(default_factory=lambda: uuid.uuid4().hex)
    object: Literal["realtime.item"] | None = None
    type: Literal["message", "function_call", "function_call_output"]
    status: Literal["completed", "in_progress", "incomplete"] | None = None

    # Message items carry a role and content.
    role: Literal["user", "assistant", "system"] | None = None
    content: list[ItemContent] | None = None

    # Function-call items carry the following four fields.
    call_id: str | None = None
    name: str | None = None
    arguments: str | None = None
    output: str | None = None
[docs]
class RealtimeConversation(BaseModel):
    """Conversation object of a realtime session.

    Parameters:
        id: Unique identifier of the conversation.
        object: Object type identifier, always "realtime.conversation".
    """

    id: str
    object: Literal["realtime.conversation"]
[docs]
class ResponseProperties(BaseModel):
    """Per-response configuration for the assistant.

    Parameters:
        output_modalities: Output modalities for the response. Must be either
            ["text"] or ["audio"]. Defaults to ["audio"].
        instructions: Instructions specific to this response.
        audio: Audio configuration for this response.
        tools: Tools available to this response.
        tool_choice: Tool usage strategy for this response.
        temperature: Sampling temperature for this response.
        max_output_tokens: Maximum tokens for this response.
    """

    output_modalities: list[Literal["text", "audio"]] | None = ["audio"]
    instructions: str | None = None
    audio: AudioConfiguration | None = None
    tools: list[dict] | None = None
    tool_choice: Literal["auto", "none", "required"] | None = None
    temperature: float | None = None
    max_output_tokens: int | Literal["inf"] | None = None
#
# error class
#
[docs]
class RealtimeError(BaseModel):
    """Error details reported by the realtime API.

    Parameters:
        type: Identifier of the error type.
        code: Specific error code. Defaults to an empty string.
        message: Human-readable description of the error.
        param: Name of the parameter that caused the error, if applicable.
        event_id: ID of the event associated with the error, if applicable.
    """

    type: str
    code: str | None = ""
    message: str
    param: str | None = None
    event_id: str | None = None
#
# client events
#
[docs]
class ClientEvent(BaseModel):
    """Common base for events the client sends to the realtime API.

    Parameters:
        event_id: Unique event identifier; a random UUID string is generated
            when none is provided.
    """

    event_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
[docs]
class SessionUpdateEvent(ClientEvent):
    """Client event that updates the session properties.

    Parameters:
        type: Event type, always "session.update".
        session: Session properties to apply.
    """

    type: Literal["session.update"] = "session.update"
    session: SessionProperties

    def model_dump(self, *args, **kwargs) -> dict[str, Any]:
        """Dump the event to a dictionary.

        A ``turn_detection`` value of ``False`` found under
        ``session.audio.input`` is replaced with ``None`` (null) in the result.

        Args:
            *args: Positional arguments forwarded to the parent ``model_dump``.
            **kwargs: Keyword arguments forwarded to the parent ``model_dump``.

        Returns:
            Dictionary representation of the event.
        """
        dumped = super().model_dump(*args, **kwargs)

        # Walk session -> audio -> input, stopping at the first missing/empty level.
        audio_cfg = dumped["session"].get("audio")
        input_cfg = audio_cfg.get("input") if audio_cfg else None

        # Identity check on purpose: only a literal False is rewritten to None.
        if input_cfg and input_cfg.get("turn_detection") is False:
            input_cfg["turn_detection"] = None

        return dumped
[docs]
class ConversationItemCreateEvent(ClientEvent):
    """Client event that adds a new item to the conversation.

    Parameters:
        type: Event type, always "conversation.item.create".
        previous_item_id: ID of the item the new one is inserted after, if any.
        item: Conversation item to create.
    """

    type: Literal["conversation.item.create"] = "conversation.item.create"
    previous_item_id: str | None = None
    item: ConversationItem
[docs]
class ConversationItemTruncateEvent(ClientEvent):
    """Client event that truncates the audio content of a conversation item.

    Parameters:
        type: Event type, always "conversation.item.truncate".
        item_id: ID of the item to truncate.
        content_index: Index, within the item, of the content to truncate.
        audio_end_ms: End time of the truncated audio, in milliseconds.
    """

    type: Literal["conversation.item.truncate"] = "conversation.item.truncate"
    item_id: str
    content_index: int
    audio_end_ms: int
[docs]
class ConversationItemDeleteEvent(ClientEvent):
    """Client event that deletes a conversation item.

    Parameters:
        type: Event type, always "conversation.item.delete".
        item_id: ID of the item to delete.
    """

    type: Literal["conversation.item.delete"] = "conversation.item.delete"
    item_id: str
[docs]
class ConversationItemRetrieveEvent(ClientEvent):
    """Client event that requests a conversation item by its ID.

    Parameters:
        type: Event type, always "conversation.item.retrieve".
        item_id: ID of the item to retrieve.
    """

    type: Literal["conversation.item.retrieve"] = "conversation.item.retrieve"
    item_id: str
[docs]
class ResponseCreateEvent(ClientEvent):
    """Client event that asks for a new assistant response.

    Parameters:
        type: Event type, always "response.create".
        response: Optional configuration properties for the response.
    """

    type: Literal["response.create"] = "response.create"
    response: ResponseProperties | None = None
[docs]
class ResponseCancelEvent(ClientEvent):
    """Client event that cancels the current assistant response.

    Parameters:
        type: Event type, always "response.cancel".
    """

    type: Literal["response.cancel"] = "response.cancel"
#
# server events
#
[docs]
class ServerEvent(BaseModel):
    """Common base for events received from the realtime API.

    Parameters:
        event_id: Unique identifier of the event.
        type: Type of the server event.
    """

    # Arbitrary (non-pydantic) types are permitted in fields of server events.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    event_id: str
    type: str
[docs]
class SessionCreatedEvent(ServerEvent):
    """Server event reporting that a session was created.

    Parameters:
        type: Event type, always "session.created".
        session: Properties of the created session.
    """

    type: Literal["session.created"]
    session: SessionProperties
[docs]
class SessionUpdatedEvent(ServerEvent):
    """Server event reporting that a session was updated.

    Parameters:
        type: Event type, always "session.updated".
        session: Properties of the updated session.
    """

    type: Literal["session.updated"]
    session: SessionProperties
[docs]
class ConversationCreated(ServerEvent):
    """Server event reporting that a conversation was created.

    Parameters:
        type: Event type, always "conversation.created".
        conversation: The conversation that was created.
    """

    type: Literal["conversation.created"]
    conversation: RealtimeConversation
[docs]
class ConversationItemAdded(ServerEvent):
    """Server event reporting that a conversation item was added.

    Parameters:
        type: Event type, always "conversation.item.added".
        previous_item_id: ID of the preceding item, if any.
        item: The conversation item that was added.
    """

    type: Literal["conversation.item.added"]
    previous_item_id: str | None = None
    item: ConversationItem
[docs]
class ConversationItemDone(ServerEvent):
    """Server event reporting that a conversation item finished processing.

    Parameters:
        type: Event type, always "conversation.item.done".
        previous_item_id: ID of the preceding item, if any.
        item: The completed conversation item.
    """

    type: Literal["conversation.item.done"]
    previous_item_id: str | None = None
    item: ConversationItem
[docs]
class ConversationItemTruncated(ServerEvent):
    """Server event reporting that a conversation item was truncated.

    Parameters:
        type: Event type, always "conversation.item.truncated".
        item_id: ID of the truncated conversation item.
        content_index: Index of the content within the item.
        audio_end_ms: End time of the truncated audio, in milliseconds.
    """

    type: Literal["conversation.item.truncated"]
    item_id: str
    content_index: int
    audio_end_ms: int
[docs]
class ConversationItemDeleted(ServerEvent):
    """Server event reporting that a conversation item was deleted.

    Parameters:
        type: Event type, always "conversation.item.deleted".
        item_id: ID of the deleted conversation item.
    """

    type: Literal["conversation.item.deleted"]
    item_id: str
[docs]
class ConversationItemRetrieved(ServerEvent):
    """Server event carrying a conversation item that was retrieved.

    Parameters:
        type: Event type, always "conversation.item.retrieved".
        item: The retrieved conversation item.
    """

    type: Literal["conversation.item.retrieved"]
    item: ConversationItem
[docs]
class ResponseCreated(ServerEvent):
    """Server event reporting that an assistant response was created.

    Parameters:
        type: Event type, always "response.created".
        response: The response object that was created.
    """

    type: Literal["response.created"]
    # Forward reference: the Response model is declared later in the module.
    response: "Response"
[docs]
class ResponseDone(ServerEvent):
    """Server event reporting that an assistant response is complete.

    Parameters:
        type: Event type, always "response.done".
        response: The completed response object.
    """

    type: Literal["response.done"]
    # Forward reference: the Response model is declared later in the module.
    response: "Response"
[docs]
class ResponseOutputItemAdded(ServerEvent):
    """Server event reporting that an output item was added to a response.

    Parameters:
        type: Event type, always "response.output_item.added".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The conversation item that was added.
    """

    type: Literal["response.output_item.added"]
    response_id: str
    output_index: int
    item: ConversationItem
[docs]
class ResponseOutputItemDone(ServerEvent):
    """Server event reporting that an output item is complete.

    Parameters:
        type: Event type, always "response.output_item.done".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The completed conversation item.
    """

    type: Literal["response.output_item.done"]
    response_id: str
    output_index: int
    item: ConversationItem
[docs]
class ResponseContentPartAdded(ServerEvent):
    """Server event reporting that a content part was added to a response.

    Parameters:
        type: Event type, always "response.content_part.added".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The content part that was added.
    """

    type: Literal["response.content_part.added"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent
[docs]
class ResponseContentPartDone(ServerEvent):
    """Server event reporting that a content part is complete.

    Parameters:
        type: Event type, always "response.content_part.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The completed content part.
    """

    type: Literal["response.content_part.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent
[docs]
class ResponseTextDelta(ServerEvent):
    """Server event carrying an incremental piece of response text.

    Parameters:
        type: Event type, always "response.output_text.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental text content.
    """

    type: Literal["response.output_text.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str
[docs]
class ResponseTextDone(ServerEvent):
    """Server event reporting that text content is complete.

    Parameters:
        type: Event type, always "response.output_text.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        text: The complete text content.
    """

    type: Literal["response.output_text.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    text: str
[docs]
class ResponseAudioTranscriptDelta(ServerEvent):
    """Server event carrying an incremental piece of a response's audio transcript.

    Parameters:
        type: Event type, always "response.output_audio_transcript.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental transcript text.
    """

    type: Literal["response.output_audio_transcript.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str
[docs]
class ResponseAudioTranscriptDone(ServerEvent):
    """Server event reporting that an audio transcript is complete.

    Parameters:
        type: Event type, always "response.output_audio_transcript.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        transcript: The complete transcript text.
    """

    type: Literal["response.output_audio_transcript.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    transcript: str
[docs]
class ResponseAudioDelta(ServerEvent):
    """Server event carrying an incremental chunk of response audio.

    Parameters:
        type: Event type, always "response.output_audio.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental audio data, base64-encoded.
    """

    type: Literal["response.output_audio.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # base64-encoded audio
[docs]
class ResponseAudioDone(ServerEvent):
    """Server event reporting that audio content is complete.

    Parameters:
        type: Event type, always "response.output_audio.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
    """

    type: Literal["response.output_audio.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
[docs]
class ResponseFunctionCallArgumentsDelta(ServerEvent):
    """Server event carrying an incremental piece of function call arguments.

    Parameters:
        type: Event type, always "response.function_call_arguments.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        delta: Incremental function arguments as JSON.
    """

    type: Literal["response.function_call_arguments.delta"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    delta: str
[docs]
class ResponseFunctionCallArgumentsDone(ServerEvent):
    """Server event reporting that function call arguments are complete.

    Parameters:
        type: Event type, always "response.function_call_arguments.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        arguments: The complete function arguments as a JSON string.
    """

    type: Literal["response.function_call_arguments.done"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    arguments: str
[docs]
class ErrorEvent(ServerEvent):
    """Server event reporting that an error occurred.

    Parameters:
        type: Event type, always "error".
        error: Details of the error.
    """

    type: Literal["error"]
    error: RealtimeError
[docs]
class RateLimitsUpdated(ServerEvent):
    """Server event reporting that rate limits were updated.

    Parameters:
        type: Event type, always "rate_limits.updated".
        rate_limits: List of rate limit information entries.
    """

    type: Literal["rate_limits.updated"]
    rate_limits: list[dict[str, Any]]
[docs]
class CachedTokensDetails(BaseModel):
    """Breakdown of cached tokens.

    Parameters:
        text_tokens: Count of cached text tokens. Defaults to 0.
        audio_tokens: Count of cached audio tokens. Defaults to 0.
    """

    text_tokens: int | None = 0
    audio_tokens: int | None = 0
[docs]
class TokenDetails(BaseModel):
    """Detailed token usage figures.

    Parameters:
        cached_tokens: Count of cached tokens used. Defaults to 0.
        text_tokens: Count of text tokens used. Defaults to 0.
        audio_tokens: Count of audio tokens used. Defaults to 0.
        cached_tokens_details: Detailed breakdown of cached tokens.
        image_tokens: Count of image tokens used (for input only). Defaults to 0.
    """

    # Fields not declared below are accepted and kept rather than rejected.
    model_config = ConfigDict(extra="allow")

    cached_tokens: int | None = 0
    text_tokens: int | None = 0
    audio_tokens: int | None = 0
    cached_tokens_details: CachedTokensDetails | None = None
    image_tokens: int | None = 0
[docs]
class Usage(BaseModel):
    """Token usage figures for a response.

    Parameters:
        total_tokens: Total count of tokens used.
        input_tokens: Count of input tokens used.
        output_tokens: Count of output tokens used.
        input_token_details: Detailed breakdown of input token usage.
        output_token_details: Detailed breakdown of output token usage.
    """

    total_tokens: int
    input_tokens: int
    output_tokens: int
    input_token_details: TokenDetails
    output_token_details: TokenDetails
[docs]
class Response(BaseModel):
    """A complete assistant response.

    Parameters:
        id: Unique identifier for the response.
        object: Object type, always "realtime.response".
        status: Current status of the response.
        status_details: Additional status information.
        output: List of conversation items in the response.
        conversation_id: Which conversation the response is added to.
        output_modalities: The set of modalities the model used to respond.
        max_output_tokens: Maximum number of output tokens used.
        audio: Audio configuration for the response.
        usage: Token usage statistics for the response.
        voice: The voice the model used to respond.
        temperature: Sampling temperature used for the response.
        output_audio_format: The format of output audio.
    """

    id: str
    object: Literal["realtime.response"]
    status: Literal["completed", "in_progress", "incomplete", "cancelled", "failed"]
    status_details: Any
    output: list[ConversationItem]
    # Documented above but previously undeclared, so the value sent by the server
    # was dropped during validation. Optional, so existing payloads still validate.
    conversation_id: str | None = None
    output_modalities: list[Literal["text", "audio"]] | None = None
    max_output_tokens: int | Literal["inf"] | None = None
    audio: AudioConfiguration | None = None
    usage: Usage | None = None
    voice: str | None = None
    temperature: float | None = None
    output_audio_format: str | None = None
# Dispatch table: the "type" string of a server event -> the model class used to
# validate it. Consulted by parse_server_event.
_server_event_types = {
    # errors
    "error": ErrorEvent,
    # session lifecycle
    "session.created": SessionCreatedEvent,
    "session.updated": SessionUpdatedEvent,
    "conversation.created": ConversationCreated,
    # input audio buffer
    "input_audio_buffer.committed": InputAudioBufferCommitted,
    "input_audio_buffer.cleared": InputAudioBufferCleared,
    "input_audio_buffer.speech_started": InputAudioBufferSpeechStarted,
    "input_audio_buffer.speech_stopped": InputAudioBufferSpeechStopped,
    # conversation items
    "conversation.item.added": ConversationItemAdded,
    "conversation.item.done": ConversationItemDone,
    "conversation.item.input_audio_transcription.delta": ConversationItemInputAudioTranscriptionDelta,
    "conversation.item.input_audio_transcription.completed": ConversationItemInputAudioTranscriptionCompleted,
    "conversation.item.input_audio_transcription.failed": ConversationItemInputAudioTranscriptionFailed,
    "conversation.item.truncated": ConversationItemTruncated,
    "conversation.item.deleted": ConversationItemDeleted,
    "conversation.item.retrieved": ConversationItemRetrieved,
    # responses
    "response.created": ResponseCreated,
    "response.done": ResponseDone,
    "response.output_item.added": ResponseOutputItemAdded,
    "response.output_item.done": ResponseOutputItemDone,
    "response.content_part.added": ResponseContentPartAdded,
    "response.content_part.done": ResponseContentPartDone,
    "response.output_text.delta": ResponseTextDelta,
    "response.output_text.done": ResponseTextDone,
    "response.output_audio_transcript.delta": ResponseAudioTranscriptDelta,
    "response.output_audio_transcript.done": ResponseAudioTranscriptDone,
    "response.output_audio.delta": ResponseAudioDelta,
    "response.output_audio.done": ResponseAudioDone,
    "response.function_call_arguments.delta": ResponseFunctionCallArgumentsDelta,
    "response.function_call_arguments.done": ResponseFunctionCallArgumentsDone,
    # rate limits
    "rate_limits.updated": RateLimitsUpdated,
}
[docs]
def parse_server_event(str) -> "ServerEvent":
    """Parse a server event from a JSON string.

    The string is decoded as JSON, its "type" entry is looked up in
    ``_server_event_types``, and the matching model class validates the payload.

    Args:
        str: JSON string containing the server event. The name shadows the
            builtin; it is kept unchanged so keyword callers keep working.

    Returns:
        Parsed server event object of the appropriate type.

    Raises:
        Exception: If the event type is unimplemented or parsing fails. The
            message ends with the raw input, and the underlying error is
            attached as ``__cause__``.
    """
    # Work with a non-shadowing local name inside the body.
    payload = str
    try:
        event = json.loads(payload)
        event_type = event["type"]
        if event_type not in _server_event_types:
            raise Exception(f"Unimplemented server event type: {event_type}")
        return _server_event_types[event_type].model_validate(event)
    except Exception as e:
        # Chain explicitly so the original exception type and traceback survive.
        raise Exception(f"{e} \n\n{payload}") from e