Source code for pipecat.services.openai.realtime.events

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Event models and data structures for OpenAI Realtime API communication."""

import json
import uuid
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field

from pipecat.adapters.schemas.tools_schema import ToolsSchema

#
# session properties
#



[docs]
class AudioFormat(BaseModel):
    """Base class for audio format configuration.

    The concrete formats in this module (PCM, PCMU, PCMA) subclass this model
    and narrow ``type`` to a single literal value.

    Parameters:
        type: Audio format identifier string.
    """

    type: str




[docs]
class PCMAudioFormat(AudioFormat):
    """PCM audio format configuration.

    Both fields are single-value literals with matching defaults, so the model
    can be built with no arguments and any other value fails validation.

    Parameters:
        type: Audio format type, always "audio/pcm".
        rate: Sample rate, always 24000 for PCM.
    """

    type: Literal["audio/pcm"] = "audio/pcm"
    rate: Literal[24000] = 24000




[docs]
class PCMUAudioFormat(AudioFormat):
    """PCMU (G.711 μ-law) audio format configuration.

    Declares only ``type``; unlike ``PCMAudioFormat`` there is no ``rate`` field.

    Parameters:
        type: Audio format type, always "audio/pcmu".
    """

    type: Literal["audio/pcmu"] = "audio/pcmu"




[docs]
class PCMAAudioFormat(AudioFormat):
    """PCMA (G.711 A-law) audio format configuration.

    Declares only ``type``; unlike ``PCMAudioFormat`` there is no ``rate`` field.

    Parameters:
        type: Audio format type, always "audio/pcma".
    """

    type: Literal["audio/pcma"] = "audio/pcma"




[docs]
class InputAudioTranscription(BaseModel):
    """Configuration for audio transcription settings.

    Parameters:
        model: Transcription model to use. Defaults to "gpt-4o-transcribe".
        language: Optional language code for transcription. Defaults to None.
        prompt: Optional transcription hint text. Defaults to None.
    """

    model: str = "gpt-4o-transcribe"
    # Field defaults mirror the constructor defaults so the schema and the
    # constructor agree on what is optional.
    language: str | None = None
    prompt: str | None = None

    def __init__(
        self,
        model: str | None = "gpt-4o-transcribe",
        language: str | None = None,
        prompt: str | None = None,
    ):
        """Initialize InputAudioTranscription.

        Args:
            model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
                Passing None selects the field's default model.
            language: Optional language code for transcription.
            prompt: Optional transcription hint text.
        """
        values: dict = {"language": language, "prompt": prompt}
        # The signature accepts None for ``model`` but the field is a plain
        # ``str``; omit it in that case so the field default applies instead of
        # failing validation.
        if model is not None:
            values["model"] = model
        super().__init__(**values)





[docs]
class TurnDetection(BaseModel):
    """Server-side voice activity detection configuration.

    Every field has a default and also accepts None. The documented threshold
    range is not enforced by this model (the field is a plain float).

    Parameters:
        type: Detection type, "server_vad" (the default) or None.
        threshold: Voice activity detection threshold (0.0-1.0). Defaults to 0.5.
        prefix_padding_ms: Padding before speech starts in milliseconds. Defaults to 300.
        silence_duration_ms: Silence duration to detect speech end in milliseconds. Defaults to 500.
    """

    type: Literal["server_vad"] | None = "server_vad"
    threshold: float | None = 0.5
    prefix_padding_ms: int | None = 300
    silence_duration_ms: int | None = 500




[docs]
class SemanticTurnDetection(BaseModel):
    """Semantic-based turn detection configuration.

    Only ``type`` has a non-None default; the remaining fields default to None.

    Parameters:
        type: Detection type, "semantic_vad" (the default) or None.
        eagerness: Turn detection eagerness level. Can be "low", "medium", "high", or "auto".
        create_response: Whether to automatically create responses on turn detection.
        interrupt_response: Whether to interrupt ongoing responses on turn detection.
    """

    type: Literal["semantic_vad"] | None = "semantic_vad"
    eagerness: Literal["low", "medium", "high", "auto"] | None = None
    create_response: bool | None = None
    interrupt_response: bool | None = None




[docs]
class InputAudioNoiseReduction(BaseModel):
    """Input audio noise reduction configuration.

    Parameters:
        type: Noise reduction type for different microphone scenarios:
            "near_field", "far_field", or None. The field has no default, so a
            value must always be supplied.
    """

    type: Literal["near_field", "far_field"] | None




[docs]
class AudioInput(BaseModel):
    """Audio input configuration.

    All fields default to None.

    Parameters:
        format: The format of the input audio.
        transcription: Configuration for input audio transcription.
        noise_reduction: Configuration for input audio noise reduction.
        turn_detection: Configuration for turn detection, or False to disable.
            ``SessionUpdateEvent.model_dump`` rewrites a False value to None
            in its output.
    """

    format: PCMAudioFormat | PCMUAudioFormat | PCMAAudioFormat | None = None
    transcription: InputAudioTranscription | None = None
    noise_reduction: InputAudioNoiseReduction | None = None
    turn_detection: TurnDetection | SemanticTurnDetection | bool | None = None




[docs]
class AudioOutput(BaseModel):
    """Audio output configuration.

    All fields default to None.

    Parameters:
        format: The format of the output audio (PCM, PCMU, or PCMA).
        voice: The voice the model uses to respond.
        speed: The speed of the model's spoken response.
    """

    format: PCMAudioFormat | PCMUAudioFormat | PCMAAudioFormat | None = None
    voice: str | None = None
    speed: float | None = None




[docs]
class AudioConfiguration(BaseModel):
    """Audio configuration for input and output.

    Groups the two directions under one object; either side may be left as None.

    Parameters:
        input: Configuration for input audio.
        output: Configuration for output audio.
    """

    input: AudioInput | None = None
    output: AudioOutput | None = None




[docs]
class SessionProperties(BaseModel):
    """Configuration properties for an OpenAI Realtime session.

    Used both as the payload of ``SessionUpdateEvent`` and as the ``session``
    field of ``SessionCreatedEvent`` / ``SessionUpdatedEvent``. Every field is
    optional; only ``type`` has a non-None default.

    Parameters:
        type: The type of session, always "realtime".
        object: Object type identifier, always "realtime.session".
        id: Unique identifier for the session.
        model: The Realtime model used for this session.
            Note: The model is set at connection time via model arg in __init__
            and cannot be changed during the session.
        output_modalities: The set of modalities the model can respond with.
        instructions: System instructions for the assistant.
        audio: Configuration for input and output audio.
        tools: Available function tools for the assistant.
        tool_choice: Tool usage strategy ("auto", "none", or "required").
        max_output_tokens: Maximum tokens in response or "inf" for unlimited.
        tracing: Configuration options for tracing.
        prompt: Reference to a prompt template and its variables.
        expires_at: Session expiration timestamp.
        include: Additional fields to include in server outputs.
    """

    # Needed to support ToolsSchema (a non-Pydantic type) in the tools field.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    type: Literal["realtime"] | None = "realtime"
    object: Literal["realtime.session"] | None = None
    id: str | None = None
    model: str | None = None
    output_modalities: list[Literal["text", "audio"]] | None = None
    instructions: str | None = None
    audio: AudioConfiguration | None = None
    # Tools can only be ToolsSchema when provided by the user, in either the
    # OpenAIRealtimeLLMService constructor or through LLMUpdateSettingsFrame.
    # We'll never serialize/deserialize ToolsSchema when talking to the server.
    tools: ToolsSchema | list[dict] | None = None
    tool_choice: Literal["auto", "none", "required"] | None = None
    max_output_tokens: int | Literal["inf"] | None = None
    tracing: Literal["auto"] | dict | None = None
    prompt: dict | None = None
    expires_at: int | None = None
    include: list[str] | None = None



#
# context
#



[docs]
class ItemContent(BaseModel):
    """Content within a conversation item.

    ``type`` is the only required field; the payload fields all default to
    None and which of them is populated depends on the content type.

    Parameters:
        type: Content type (text, audio, input_text, input_audio, input_image, output_text, or output_audio).
        text: Text content for text-based items.
        audio: Base64-encoded audio data for audio items.
        transcript: Transcribed text for audio items.
        image_url: Base64-encoded image data as a data URI for input_image items.
        detail: Detail level for image processing ("auto", "low", or "high").
    """

    type: Literal[
        "text", "audio", "input_text", "input_audio", "input_image", "output_text", "output_audio"
    ]
    text: str | None = None
    audio: str | None = None  # base64-encoded audio
    transcript: str | None = None
    image_url: str | None = None  # base64-encoded image as data URI
    detail: Literal["auto", "low", "high"] | None = None




[docs]
class ConversationItem(BaseModel):
    """One entry (message or function call) in a realtime conversation.

    Only ``type`` must be supplied. ``id`` is generated when omitted and every
    other field defaults to None.

    Parameters:
        id: Unique identifier for the item; defaults to a fresh UUID4 rendered
            as 32 hex characters (no dashes).
        object: Object type identifier for the realtime API.
        type: Item type (message, function_call, or function_call_output).
        status: Current status of the item.
        role: Speaker role for message items (user, assistant, or system).
        content: Content list for message items.
        call_id: Function call identifier for function_call items.
        name: Function name for function_call items.
        arguments: Function arguments as JSON string for function_call items.
        output: Function output as JSON string for function_call_output items.
    """

    # ``UUID.hex`` is already a str, so no extra conversion is needed.
    id: str = Field(default_factory=lambda: uuid.uuid4().hex)
    object: Literal["realtime.item"] | None = None
    type: Literal["message", "function_call", "function_call_output"]
    status: Literal["completed", "in_progress", "incomplete"] | None = None
    # Message items carry a role and a list of content parts.
    role: Literal["user", "assistant", "system"] | None = None
    content: list[ItemContent] | None = None
    # Function-call related items use the remaining four fields.
    call_id: str | None = None
    name: str | None = None
    arguments: str | None = None
    output: str | None = None




[docs]
class RealtimeConversation(BaseModel):
    """A realtime conversation session.

    Carried as the ``conversation`` field of ``ConversationCreated``. Both
    fields are required.

    Parameters:
        id: Unique identifier for the conversation.
        object: Object type identifier, always "realtime.conversation".
    """

    id: str
    object: Literal["realtime.conversation"]




[docs]
class ResponseProperties(BaseModel):
    """Properties for configuring assistant responses.

    Carried as the optional ``response`` field of ``ResponseCreateEvent``.
    Every field except ``output_modalities`` defaults to None.

    Parameters:
        output_modalities: Output modalities for the response. Must be either ["text"] or ["audio"]. Defaults to ["audio"].
        instructions: Specific instructions for this response.
        audio: Audio configuration for this response.
        tools: Available tools for this response.
        tool_choice: Tool usage strategy for this response.
        temperature: Sampling temperature for this response.
        max_output_tokens: Maximum tokens for this response.
    """

    output_modalities: list[Literal["text", "audio"]] | None = ["audio"]
    instructions: str | None = None
    audio: AudioConfiguration | None = None
    tools: list[dict] | None = None
    tool_choice: Literal["auto", "none", "required"] | None = None
    temperature: float | None = None
    max_output_tokens: int | Literal["inf"] | None = None



#
# error class
#

[docs]
class RealtimeError(BaseModel):
    """Error information from the realtime API.

    Embedded in ``ErrorEvent`` and in
    ``ConversationItemInputAudioTranscriptionFailed``. ``type`` and ``message``
    are required.

    Parameters:
        type: Error type identifier.
        code: Specific error code. Defaults to an empty string; None is also accepted.
        message: Human-readable error message.
        param: Parameter name that caused the error, if applicable.
        event_id: Event ID associated with the error, if applicable.
    """

    type: str
    code: str | None = ""
    message: str
    param: str | None = None
    event_id: str | None = None



#
# client events
#



[docs]
class ClientEvent(BaseModel):
    """Base class for client events sent to the realtime API.

    Each concrete client event subclasses this model and adds a literal
    ``type`` plus its own payload fields.

    Parameters:
        event_id: Unique identifier for the event, auto-generated if not provided.
            The generated value is a UUID4 in its standard dashed string form.
    """

    event_id: str = Field(default_factory=lambda: str(uuid.uuid4()))




[docs]
class SessionUpdateEvent(ClientEvent):
    """Event to update session properties.

    Parameters:
        type: Event type, always "session.update".
        session: Updated session properties.
    """

    type: Literal["session.update"] = "session.update"
    session: SessionProperties

    def model_dump(self, *args, **kwargs) -> dict[str, Any]:
        """Serialize the event to a dictionary.

        Handles special serialization for turn_detection where False becomes null.
        The rewrite is applied only when ``session.audio.input.turn_detection``
        is present in the dump and is exactly ``False``; any level of that path
        may be absent (for example when ``include``/``exclude`` drops it).

        Args:
            *args: Positional arguments passed to parent model_dump.
            **kwargs: Keyword arguments passed to parent model_dump.

        Returns:
            Dictionary representation of the event.
        """
        dump = super().model_dump(*args, **kwargs)

        # Walk session -> audio -> input defensively. ``x or {}`` yields the
        # real nested dict when it exists (so the assignment below mutates the
        # dump in place) and a throwaway empty dict when the level is missing
        # or None.
        session = dump.get("session") or {}
        audio = session.get("audio") or {}
        audio_input = audio.get("input") or {}

        # Identity check on purpose: only a literal False is rewritten; a
        # missing key or None is left as is.
        if audio_input.get("turn_detection") is False:
            audio_input["turn_detection"] = None

        return dump





[docs]
class InputAudioBufferAppendEvent(ClientEvent):
    """Event to append audio data to the input buffer.

    ``event_id`` is inherited from ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "input_audio_buffer.append".
        audio: Base64-encoded audio data to append. Required.
    """

    type: Literal["input_audio_buffer.append"] = "input_audio_buffer.append"
    audio: str  # base64-encoded audio




[docs]
class InputAudioBufferCommitEvent(ClientEvent):
    """Event to commit the current input audio buffer.

    Carries no payload beyond ``type`` and the inherited ``event_id``.

    Parameters:
        type: Event type, always "input_audio_buffer.commit".
    """

    type: Literal["input_audio_buffer.commit"] = "input_audio_buffer.commit"




[docs]
class InputAudioBufferClearEvent(ClientEvent):
    """Event to clear the input audio buffer.

    Carries no payload beyond ``type`` and the inherited ``event_id``.

    Parameters:
        type: Event type, always "input_audio_buffer.clear".
    """

    type: Literal["input_audio_buffer.clear"] = "input_audio_buffer.clear"




[docs]
class ConversationItemCreateEvent(ClientEvent):
    """Event to create a new conversation item.

    ``event_id`` is inherited from ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "conversation.item.create".
        previous_item_id: ID of the item to insert after, if any. Defaults to None.
        item: The conversation item to create. Required.
    """

    type: Literal["conversation.item.create"] = "conversation.item.create"
    previous_item_id: str | None = None
    item: ConversationItem




[docs]
class ConversationItemTruncateEvent(ClientEvent):
    """Event to truncate a conversation item's audio content.

    All three payload fields are required; ``event_id`` is inherited from
    ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "conversation.item.truncate".
        item_id: ID of the item to truncate.
        content_index: Index of the content to truncate within the item.
        audio_end_ms: End time in milliseconds for the truncated audio.
    """

    type: Literal["conversation.item.truncate"] = "conversation.item.truncate"
    item_id: str
    content_index: int
    audio_end_ms: int




[docs]
class ConversationItemDeleteEvent(ClientEvent):
    """Event to delete a conversation item.

    ``event_id`` is inherited from ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "conversation.item.delete".
        item_id: ID of the item to delete. Required.
    """

    type: Literal["conversation.item.delete"] = "conversation.item.delete"
    item_id: str




[docs]
class ConversationItemRetrieveEvent(ClientEvent):
    """Event to retrieve a conversation item by ID.

    ``event_id`` is inherited from ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "conversation.item.retrieve".
        item_id: ID of the item to retrieve. Required.
    """

    type: Literal["conversation.item.retrieve"] = "conversation.item.retrieve"
    item_id: str




[docs]
class ResponseCreateEvent(ClientEvent):
    """Event to create a new assistant response.

    ``event_id`` is inherited from ``ClientEvent`` and auto-generated.

    Parameters:
        type: Event type, always "response.create".
        response: Optional response configuration properties. Defaults to None.
    """

    type: Literal["response.create"] = "response.create"
    response: ResponseProperties | None = None




[docs]
class ResponseCancelEvent(ClientEvent):
    """Event to cancel the current assistant response.

    Carries no payload beyond ``type`` and the inherited ``event_id``.

    Parameters:
        type: Event type, always "response.cancel".
    """

    type: Literal["response.cancel"] = "response.cancel"



#
# server events
#



[docs]
class ServerEvent(BaseModel):
    """Base class for server events received from the realtime API.

    Concrete server events subclass this model and narrow ``type`` to a single
    literal; ``parse_server_event`` picks the subclass from the payload's
    ``type`` value. Both fields are required.

    Parameters:
        event_id: Unique identifier for the event.
        type: Type of the server event.
    """

    # Inherited by all server event subclasses.
    # NOTE(review): presumably needed because SessionProperties.tools may hold
    # a ToolsSchema instance - confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    event_id: str
    type: str




[docs]
class SessionCreatedEvent(ServerEvent):
    """Event indicating a session has been created.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "session.created".

    Parameters:
        type: Event type, always "session.created".
        session: The created session properties.
    """

    type: Literal["session.created"]
    session: SessionProperties




[docs]
class SessionUpdatedEvent(ServerEvent):
    """Event indicating a session has been updated.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "session.updated".

    Parameters:
        type: Event type, always "session.updated".
        session: The updated session properties.
    """

    type: Literal["session.updated"]
    session: SessionProperties




[docs]
class ConversationCreated(ServerEvent):
    """Event indicating a conversation has been created.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.created".

    Parameters:
        type: Event type, always "conversation.created".
        conversation: The created conversation.
    """

    type: Literal["conversation.created"]
    conversation: RealtimeConversation




[docs]
class ConversationItemAdded(ServerEvent):
    """Event indicating a conversation item has been added.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.added".

    Parameters:
        type: Event type, always "conversation.item.added".
        previous_item_id: ID of the previous item, if any. Defaults to None.
        item: The added conversation item.
    """

    type: Literal["conversation.item.added"]
    previous_item_id: str | None = None
    item: ConversationItem




[docs]
class ConversationItemDone(ServerEvent):
    """Event indicating a conversation item is done processing.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.done".

    Parameters:
        type: Event type, always "conversation.item.done".
        previous_item_id: ID of the previous item, if any. Defaults to None.
        item: The completed conversation item.
    """

    type: Literal["conversation.item.done"]
    previous_item_id: str | None = None
    item: ConversationItem




[docs]
class ConversationItemInputAudioTranscriptionDelta(ServerEvent):
    """Event containing incremental input audio transcription.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.input_audio_transcription.delta". All fields are required.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.delta".
        item_id: ID of the conversation item being transcribed.
        content_index: Index of the content within the item.
        delta: Incremental transcription text.
    """

    type: Literal["conversation.item.input_audio_transcription.delta"]
    item_id: str
    content_index: int
    delta: str




[docs]
class ConversationItemInputAudioTranscriptionCompleted(ServerEvent):
    """Event indicating input audio transcription is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.input_audio_transcription.completed". All fields are
    required.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.completed".
        item_id: ID of the conversation item that was transcribed.
        content_index: Index of the content within the item.
        transcript: Complete transcription text.
    """

    type: Literal["conversation.item.input_audio_transcription.completed"]
    item_id: str
    content_index: int
    transcript: str




[docs]
class ConversationItemInputAudioTranscriptionFailed(ServerEvent):
    """Event indicating input audio transcription failed.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.input_audio_transcription.failed". All fields are required.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.failed".
        item_id: ID of the conversation item that failed transcription.
        content_index: Index of the content within the item.
        error: Error details for the transcription failure.
    """

    type: Literal["conversation.item.input_audio_transcription.failed"]
    item_id: str
    content_index: int
    error: RealtimeError




[docs]
class ConversationItemTruncated(ServerEvent):
    """Event indicating a conversation item has been truncated.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.truncated". Its payload fields mirror those of
    ``ConversationItemTruncateEvent``.

    Parameters:
        type: Event type, always "conversation.item.truncated".
        item_id: ID of the truncated conversation item.
        content_index: Index of the content within the item.
        audio_end_ms: End time in milliseconds for the truncated audio.
    """

    type: Literal["conversation.item.truncated"]
    item_id: str
    content_index: int
    audio_end_ms: int




[docs]
class ConversationItemDeleted(ServerEvent):
    """Event indicating a conversation item has been deleted.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.deleted".

    Parameters:
        type: Event type, always "conversation.item.deleted".
        item_id: ID of the deleted conversation item.
    """

    type: Literal["conversation.item.deleted"]
    item_id: str




[docs]
class ConversationItemRetrieved(ServerEvent):
    """Event containing a retrieved conversation item.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "conversation.item.retrieved".

    Parameters:
        type: Event type, always "conversation.item.retrieved".
        item: The retrieved conversation item.
    """

    type: Literal["conversation.item.retrieved"]
    item: ConversationItem




[docs]
class ResponseCreated(ServerEvent):
    """Event indicating an assistant response has been created.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.created".

    Parameters:
        type: Event type, always "response.created".
        response: The created response object.
    """

    type: Literal["response.created"]
    # String forward reference: ``Response`` is defined further down this module.
    response: "Response"




[docs]
class ResponseDone(ServerEvent):
    """Event indicating an assistant response is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.done".

    Parameters:
        type: Event type, always "response.done".
        response: The completed response object.
    """

    type: Literal["response.done"]
    # String forward reference: ``Response`` is defined further down this module.
    response: "Response"




[docs]
class ResponseOutputItemAdded(ServerEvent):
    """Event indicating an output item has been added to a response.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_item.added". All fields are required.

    Parameters:
        type: Event type, always "response.output_item.added".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The added conversation item.
    """

    type: Literal["response.output_item.added"]
    response_id: str
    output_index: int
    item: ConversationItem




[docs]
class ResponseOutputItemDone(ServerEvent):
    """Event indicating an output item is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_item.done". All fields are required.

    Parameters:
        type: Event type, always "response.output_item.done".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The completed conversation item.
    """

    type: Literal["response.output_item.done"]
    response_id: str
    output_index: int
    item: ConversationItem




[docs]
class ResponseContentPartAdded(ServerEvent):
    """Event indicating a content part has been added to a response.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.content_part.added". All fields are required.

    Parameters:
        type: Event type, always "response.content_part.added".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The added content part.
    """

    type: Literal["response.content_part.added"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent




[docs]
class ResponseContentPartDone(ServerEvent):
    """Event indicating a content part is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.content_part.done". All fields are required.

    Parameters:
        type: Event type, always "response.content_part.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The completed content part.
    """

    type: Literal["response.content_part.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent




[docs]
class ResponseTextDelta(ServerEvent):
    """Event containing incremental text from a response.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_text.delta". All fields are required.

    Parameters:
        type: Event type, always "response.output_text.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental text content.
    """

    type: Literal["response.output_text.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str




[docs]
class ResponseTextDone(ServerEvent):
    """Event indicating text content is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_text.done". All fields are required.

    Parameters:
        type: Event type, always "response.output_text.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        text: Complete text content.
    """

    type: Literal["response.output_text.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    text: str




[docs]
class ResponseAudioTranscriptDelta(ServerEvent):
    """Event containing incremental audio transcript from a response.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_audio_transcript.delta". All fields are required.

    Parameters:
        type: Event type, always "response.output_audio_transcript.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental transcript text.
    """

    type: Literal["response.output_audio_transcript.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str




[docs]
class ResponseAudioTranscriptDone(ServerEvent):
    """Event indicating audio transcript is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_audio_transcript.done". All fields are required.

    Parameters:
        type: Event type, always "response.output_audio_transcript.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        transcript: Complete transcript text.
    """

    type: Literal["response.output_audio_transcript.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    transcript: str




[docs]
class ResponseAudioDelta(ServerEvent):
    """Event containing incremental audio data from a response.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_audio.delta". All fields are required. The audio chunk is
    kept as the base64 string it arrived as; this model does not decode it.

    Parameters:
        type: Event type, always "response.output_audio.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Base64-encoded incremental audio data.
    """

    type: Literal["response.output_audio.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # base64-encoded audio




[docs]
class ResponseAudioDone(ServerEvent):
    """Event indicating audio content is complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.output_audio.done". Carries only identifying indices, no audio data.

    Parameters:
        type: Event type, always "response.output_audio.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
    """

    type: Literal["response.output_audio.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int




[docs]
class ResponseFunctionCallArgumentsDelta(ServerEvent):
    """Event containing incremental function call arguments.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.function_call_arguments.delta". All fields are required. The
    fragment is stored as a plain string; it is not parsed here.

    Parameters:
        type: Event type, always "response.function_call_arguments.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        delta: Incremental function arguments as JSON.
    """

    type: Literal["response.function_call_arguments.delta"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    delta: str




[docs]
class ResponseFunctionCallArgumentsDone(ServerEvent):
    """Event indicating function call arguments are complete.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "response.function_call_arguments.done". All fields are required. The
    arguments are stored as a plain string; they are not parsed here.

    Parameters:
        type: Event type, always "response.function_call_arguments.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        arguments: Complete function arguments as JSON string.
    """

    type: Literal["response.function_call_arguments.done"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    arguments: str




[docs]
class InputAudioBufferSpeechStarted(ServerEvent):
    """Event indicating speech has started in the input audio buffer.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "input_audio_buffer.speech_started". All fields are required.

    Parameters:
        type: Event type, always "input_audio_buffer.speech_started".
        audio_start_ms: Start time of speech in milliseconds.
        item_id: ID of the associated conversation item.
    """

    type: Literal["input_audio_buffer.speech_started"]
    audio_start_ms: int
    item_id: str




[docs]
class InputAudioBufferSpeechStopped(ServerEvent):
    """Event indicating speech has stopped in the input audio buffer.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "input_audio_buffer.speech_stopped". All fields are required.

    Parameters:
        type: Event type, always "input_audio_buffer.speech_stopped".
        audio_end_ms: End time of speech in milliseconds.
        item_id: ID of the associated conversation item.
    """

    type: Literal["input_audio_buffer.speech_stopped"]
    audio_end_ms: int
    item_id: str




[docs]
class InputAudioBufferCommitted(ServerEvent):
    """Event indicating the input audio buffer has been committed.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "input_audio_buffer.committed".

    Parameters:
        type: Event type, always "input_audio_buffer.committed".
        previous_item_id: ID of the previous item, if any. Defaults to None.
        item_id: ID of the committed conversation item.
    """

    type: Literal["input_audio_buffer.committed"]
    previous_item_id: str | None = None
    item_id: str




[docs]
class InputAudioBufferCleared(ServerEvent):
    """Event indicating the input audio buffer has been cleared.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "input_audio_buffer.cleared". Carries no payload beyond the base fields.

    Parameters:
        type: Event type, always "input_audio_buffer.cleared".
    """

    type: Literal["input_audio_buffer.cleared"]




[docs]
class ErrorEvent(ServerEvent):
    """Event indicating an error occurred.

    Selected by ``parse_server_event`` for payloads whose ``type`` is "error".

    Parameters:
        type: Event type, always "error".
        error: Error details.
    """

    type: Literal["error"]
    error: RealtimeError




[docs]
class RateLimitsUpdated(ServerEvent):
    """Event indicating rate limits have been updated.

    Selected by ``parse_server_event`` for payloads whose ``type`` is
    "rate_limits.updated". The individual rate limit entries are kept as
    untyped dictionaries; their keys are not validated by this model.

    Parameters:
        type: Event type, always "rate_limits.updated".
        rate_limits: List of rate limit information.
    """

    type: Literal["rate_limits.updated"]
    rate_limits: list[dict[str, Any]]




[docs]
class CachedTokensDetails(BaseModel):
    """Details about cached tokens.

    Used as the ``cached_tokens_details`` field of ``TokenDetails``.

    Parameters:
        text_tokens: Number of cached text tokens. Defaults to 0.
        audio_tokens: Number of cached audio tokens. Defaults to 0.
    """

    text_tokens: int | None = 0
    audio_tokens: int | None = 0




[docs]
class TokenDetails(BaseModel):
    """Detailed token usage information.

    Used for both the input and output breakdowns in ``Usage``.

    Parameters:
        cached_tokens: Number of cached tokens used. Defaults to 0.
        text_tokens: Number of text tokens used. Defaults to 0.
        audio_tokens: Number of audio tokens used. Defaults to 0.
        cached_tokens_details: Detailed breakdown of cached tokens.
        image_tokens: Number of image tokens used (for input only).
    """

    # extra="allow": keys not declared below are accepted and kept on the
    # instance instead of being ignored or rejected.
    model_config = ConfigDict(extra="allow")

    cached_tokens: int | None = 0
    text_tokens: int | None = 0
    audio_tokens: int | None = 0
    cached_tokens_details: CachedTokensDetails | None = None
    image_tokens: int | None = 0




[docs]
class Usage(BaseModel):
    """Token usage statistics for a response.

    Used as the optional ``usage`` field of ``Response``. All five fields are
    required (none declares a default).

    Parameters:
        total_tokens: Total number of tokens used.
        input_tokens: Number of input tokens used.
        output_tokens: Number of output tokens used.
        input_token_details: Detailed breakdown of input token usage.
        output_token_details: Detailed breakdown of output token usage.
    """

    total_tokens: int
    input_tokens: int
    output_tokens: int
    input_token_details: TokenDetails
    output_token_details: TokenDetails




[docs]
class Response(BaseModel):
    """A complete assistant response.

    Carried as the ``response`` field of ``ResponseCreated`` and
    ``ResponseDone``. ``id``, ``object``, ``status``, ``status_details`` and
    ``output`` declare no default; every other field defaults to None.

    Parameters:
        id: Unique identifier for the response.
        object: Object type, always "realtime.response".
        status: Current status of the response ("completed", "in_progress",
            "incomplete", "cancelled", or "failed").
        status_details: Additional status information.
        output: List of conversation items in the response.
        conversation_id: Which conversation the response is added to.
        output_modalities: The set of modalities the model used to respond.
        max_output_tokens: Maximum number of output tokens used.
        audio: Audio configuration for the response.
        usage: Token usage statistics for the response.
        voice: The voice the model used to respond.
        temperature: Sampling temperature used for the response.
        output_audio_format: The format of output audio.
    """

    id: str
    object: Literal["realtime.response"]
    status: Literal["completed", "in_progress", "incomplete", "cancelled", "failed"]
    status_details: Any
    output: list[ConversationItem]
    # Documented above but previously undeclared, so the key was dropped from
    # payloads. Optional so payloads without it still validate.
    conversation_id: str | None = None
    output_modalities: list[Literal["text", "audio"]] | None = None
    max_output_tokens: int | Literal["inf"] | None = None
    audio: AudioConfiguration | None = None
    usage: Usage | None = None
    voice: str | None = None
    temperature: float | None = None
    output_audio_format: str | None = None



# Dispatch table used by ``parse_server_event``: maps the ``type`` string of an
# incoming server event to the model class that validates it. An event type
# missing from this table is reported as unimplemented by the parser.
_server_event_types = {
    "error": ErrorEvent,
    "session.created": SessionCreatedEvent,
    "session.updated": SessionUpdatedEvent,
    "conversation.created": ConversationCreated,
    "input_audio_buffer.committed": InputAudioBufferCommitted,
    "input_audio_buffer.cleared": InputAudioBufferCleared,
    "input_audio_buffer.speech_started": InputAudioBufferSpeechStarted,
    "input_audio_buffer.speech_stopped": InputAudioBufferSpeechStopped,
    "conversation.item.added": ConversationItemAdded,
    "conversation.item.done": ConversationItemDone,
    "conversation.item.input_audio_transcription.delta": ConversationItemInputAudioTranscriptionDelta,
    "conversation.item.input_audio_transcription.completed": ConversationItemInputAudioTranscriptionCompleted,
    "conversation.item.input_audio_transcription.failed": ConversationItemInputAudioTranscriptionFailed,
    "conversation.item.truncated": ConversationItemTruncated,
    "conversation.item.deleted": ConversationItemDeleted,
    "conversation.item.retrieved": ConversationItemRetrieved,
    "response.created": ResponseCreated,
    "response.done": ResponseDone,
    "response.output_item.added": ResponseOutputItemAdded,
    "response.output_item.done": ResponseOutputItemDone,
    "response.content_part.added": ResponseContentPartAdded,
    "response.content_part.done": ResponseContentPartDone,
    "response.output_text.delta": ResponseTextDelta,
    "response.output_text.done": ResponseTextDone,
    "response.output_audio_transcript.delta": ResponseAudioTranscriptDelta,
    "response.output_audio_transcript.done": ResponseAudioTranscriptDone,
    "response.output_audio.delta": ResponseAudioDelta,
    "response.output_audio.done": ResponseAudioDone,
    "response.function_call_arguments.delta": ResponseFunctionCallArgumentsDelta,
    "response.function_call_arguments.done": ResponseFunctionCallArgumentsDone,
    "rate_limits.updated": RateLimitsUpdated,
}



[docs]
def parse_server_event(str):
    """Parse a server event from JSON string.

    The decoded payload's ``type`` key selects a model class from
    ``_server_event_types``, and that class validates the payload.

    Args:
        str: JSON string containing the server event. The name shadows the
            builtin inside this function; it is kept so existing keyword
            callers keep working.

    Returns:
        Parsed server event object of the appropriate type.

    Raises:
        Exception: If the JSON is malformed, the ``type`` key is missing, the
            event type is unimplemented, or validation fails. The message is
            the underlying error text followed by the raw payload, and the
            underlying error is attached as ``__cause__``.
    """
    try:
        event = json.loads(str)
        event_type = event["type"]
        if event_type not in _server_event_types:
            raise Exception(f"Unimplemented server event type: {event_type}")
        return _server_event_types[event_type].model_validate(event)
    except Exception as e:
        # Re-raise with the raw payload appended for debugging, chaining
        # explicitly so the traceback marks the original error as the cause.
        raise Exception(f"{e} \n\n{str}") from e