Source code for pipecat.services.assemblyai.models

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""AssemblyAI WebSocket API message models and connection parameters.

This module defines Pydantic models for handling AssemblyAI's real-time
transcription WebSocket messages and connection configuration.
"""

from typing import Literal

from loguru import logger
from pydantic import BaseModel, ConfigDict, Field, model_validator



[docs]
class Word(BaseModel):
    """A single transcribed word together with its timing and confidence.

    Parameters:
        start: Start time of the word in milliseconds.
        end: End time of the word in milliseconds.
        text: The transcribed word text.
        confidence: Confidence score for the word (0.0 to 1.0).
        word_is_final: Whether this word is finalized and won't change.
    """

    start: int
    end: int
    text: str
    confidence: float
    # Required field. No alias is declared because the wire key is the same
    # as the attribute name, so an explicit alias would be a no-op.
    word_is_final: bool




[docs]
class BaseMessage(BaseModel):
    """Base class for all AssemblyAI WebSocket messages.

    Subclasses such as ``BeginMessage`` and ``TurnMessage`` narrow ``type`` to
    a single ``Literal`` value identifying the message.

    Parameters:
        type: The message type identifier.
    """

    type: str




[docs]
class BeginMessage(BaseMessage):
    """Message sent when a new session begins.

    Parameters:
        type: Always "Begin" for this message type.
        id: Unique session identifier.
        expires_at: Unix timestamp when the session expires.
    """

    # Only the literal "Begin" validates; it is also the default, so the
    # field may be omitted when constructing the model directly.
    type: Literal["Begin"] = "Begin"
    id: str
    expires_at: int




[docs]
class TurnMessage(BaseMessage):
    """Message containing transcription data for a turn of speech.

    Parameters:
        type: Always "Turn" for this message type.
        turn_order: Sequential number of this turn in the session.
        turn_is_formatted: Whether the transcript has been formatted.
        end_of_turn: Whether this marks the end of a speaking turn.
        transcript: The transcribed text for this turn.
        end_of_turn_confidence: Confidence score for end-of-turn detection.
        words: List of individual words with timing and confidence data.
        language_code: Detected language code (e.g., "es", "fr"). Only present with
            complete utterances or when end_of_turn is True.
        language_confidence: Confidence score (0-1) for language detection. Only present
            with complete utterances or when end_of_turn is True.
        speaker: Speaker label (e.g., "A", "B"). Only present when speaker_labels is
            enabled and end_of_turn is True. Maps to 'speaker_label' in JSON response.
    """

    # Allow ``speaker`` to be populated by its attribute name as well as by
    # its ``speaker_label`` alias.
    model_config = ConfigDict(populate_by_name=True)

    type: Literal["Turn"] = "Turn"
    turn_order: int
    turn_is_formatted: bool
    end_of_turn: bool
    transcript: str
    end_of_turn_confidence: float
    words: list[Word]
    # The three fields below are optional and default to None when absent.
    language_code: str | None = None
    language_confidence: float | None = None
    speaker: str | None = Field(default=None, alias="speaker_label")




[docs]
class SpeechStartedMessage(BaseMessage):
    """Message sent when speech is first detected in the audio stream.

    Parameters:
        type: Always "SpeechStarted" for this message type.
        timestamp: Audio timestamp in milliseconds when speech was detected.
    """

    # Only the literal "SpeechStarted" validates; it is also the default.
    type: Literal["SpeechStarted"] = "SpeechStarted"
    timestamp: int




[docs]
class TerminationMessage(BaseMessage):
    """Message sent when the session is terminated.

    Parameters:
        type: Always "Termination" for this message type.
        audio_duration_seconds: Total duration of audio processed.
        session_duration_seconds: Total duration of the session.
    """

    # Only the literal "Termination" validates; it is also the default.
    type: Literal["Termination"] = "Termination"
    audio_duration_seconds: float
    session_duration_seconds: float



# Union of the concrete message models defined above: Begin, Turn,
# SpeechStarted and Termination.
AnyMessage = BeginMessage | TurnMessage | SpeechStartedMessage | TerminationMessage



[docs]
class AssemblyAIConnectionParams(BaseModel):
    """Configuration parameters for AssemblyAI WebSocket connection.

    .. deprecated:: 0.0.105
        Use ``settings=AssemblyAISTTService.Settings(foo=...)`` instead.

    Parameters:
        sample_rate: Audio sample rate in Hz. Defaults to 16000.
        encoding: Audio encoding format. Defaults to "pcm_s16le".
        end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
        min_turn_silence: Minimum silence duration when confident about end-of-turn.
        min_end_of_turn_silence_when_confident: DEPRECATED. Use min_turn_silence instead.
            When set, a warning is logged and the value is copied to min_turn_silence
            unless min_turn_silence was given explicitly.
        max_turn_silence: Maximum silence duration before forcing end-of-turn.
        keyterms_prompt: List of key terms to guide transcription. Will be JSON serialized before sending.
        prompt: Optional text prompt to guide the transcription. Only used when speech_model is "u3-rt-pro".
        speech_model: Select between English, multilingual, and u3-rt-pro models. Defaults to "u3-rt-pro".
        language_detection: Enable automatic language detection. Only applicable to
            universal-streaming-multilingual. When enabled, Turn messages include
            language_code and language_confidence fields. Defaults to None (not sent).
        format_turns: Whether to format transcript turns. Only applicable to
            universal-streaming-english and universal-streaming-multilingual models.
            For u3-rt-pro, formatting is automatic and built-in. Defaults to True.
        speaker_labels: Enable speaker diarization. When enabled, final transcripts
            (end_of_turn=True) include a speaker field identifying the speaker
            (e.g., "Speaker A", "Speaker B"). Defaults to None (not sent).
        vad_threshold: Voice activity detection confidence threshold (0.0 to 1.0) for
            classifying audio frames as silence. Only applicable to u3-rt-pro. Frames
            with VAD confidence below this value are considered silent. Increase for
            noisy environments to reduce false speech detection. For best performance
            when using with external VAD (e.g., Silero), align this value with your
            VAD's activation threshold to avoid the "dead zone" where AssemblyAI
            transcribes speech that your VAD hasn't detected yet. Defaults to None
            (not sent), in which case the API default of 0.3 applies.
    """

    sample_rate: int = 16000
    encoding: Literal["pcm_s16le", "pcm_mulaw"] = "pcm_s16le"
    end_of_turn_confidence_threshold: float | None = None
    min_turn_silence: int | None = None
    min_end_of_turn_silence_when_confident: int | None = None  # Deprecated
    max_turn_silence: int | None = None
    keyterms_prompt: list[str] | None = None
    prompt: str | None = None
    speech_model: Literal[
        "universal-streaming-english", "universal-streaming-multilingual", "u3-rt-pro"
    ] = "u3-rt-pro"
    language_detection: bool | None = None
    format_turns: bool = True
    speaker_labels: bool | None = None
    vad_threshold: float | None = None

    @model_validator(mode="after")
    def handle_deprecated_param(self) -> "AssemblyAIConnectionParams":
        """Handle deprecated min_end_of_turn_silence_when_confident parameter.

        Runs after field validation. If the deprecated parameter was supplied,
        a deprecation warning is logged and its value is used for
        ``min_turn_silence`` when that field was left unset. An explicitly
        provided ``min_turn_silence`` always takes precedence.

        Returns:
            The validated model instance.
        """
        legacy_silence = self.min_end_of_turn_silence_when_confident
        if legacy_silence is None:
            # Deprecated parameter not used; nothing to migrate.
            return self

        logger.warning(
            "The 'min_end_of_turn_silence_when_confident' parameter is deprecated and will be "
            "removed in a future version. Please use 'min_turn_silence' instead."
        )
        # Only fall back to the deprecated value when the new one is absent.
        if self.min_turn_silence is None:
            self.min_turn_silence = legacy_silence
        return self