#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""AssemblyAI WebSocket API message models and connection parameters.
This module defines Pydantic models for handling AssemblyAI's real-time
transcription WebSocket messages and connection configuration.
"""
from typing import Literal
from loguru import logger
from pydantic import BaseModel, ConfigDict, Field, model_validator
[docs]
class Word(BaseModel):
    """A single transcribed word together with its timing and confidence.

    Parameters:
        start: Start time of the word in milliseconds.
        end: End time of the word in milliseconds.
        text: The transcribed word text.
        confidence: Confidence score for the word (0.0 to 1.0).
        word_is_final: Whether this word is finalized and won't change.
    """

    start: int
    end: int
    text: str
    confidence: float
    # Required field. No alias is declared: the payload key is already
    # "word_is_final", so aliasing the field to its own name was a no-op.
    word_is_final: bool
[docs]
class BaseMessage(BaseModel):
    """Common parent of every AssemblyAI WebSocket message model.

    Parameters:
        type: Identifier naming the kind of message.
    """

    type: str
[docs]
class BeginMessage(BaseMessage):
    """Message announcing the start of a new session.

    Parameters:
        type: Fixed to "Begin" for this message type.
        id: Unique identifier of the session.
        expires_at: Unix timestamp at which the session expires.
    """

    # Narrow the inherited ``type`` field to the single literal this message uses.
    type: Literal["Begin"] = "Begin"
    id: str
    expires_at: int
[docs]
class TurnMessage(BaseMessage):
    """Message carrying the transcription data for one turn of speech.

    Parameters:
        type: Fixed to "Turn" for this message type.
        turn_order: Sequential number of this turn within the session.
        turn_is_formatted: Whether the transcript has been formatted.
        end_of_turn: Whether this message marks the end of a speaking turn.
        transcript: The transcribed text for this turn.
        end_of_turn_confidence: Confidence score for end-of-turn detection.
        words: Individual words with their timing and confidence data.
        language_code: Detected language code (e.g., "es", "fr"). Only present
            with complete utterances or when end_of_turn is True.
        language_confidence: Confidence score (0-1) for language detection. Only
            present with complete utterances or when end_of_turn is True.
        speaker: Speaker label (e.g., "A", "B"). Only present when speaker_labels
            is enabled and end_of_turn is True. Maps to 'speaker_label' in the
            JSON response.
    """

    # Allow ``speaker`` to be populated by its attribute name as well as by
    # its ``speaker_label`` alias.
    model_config = ConfigDict(populate_by_name=True)

    type: Literal["Turn"] = "Turn"
    turn_order: int
    turn_is_formatted: bool
    end_of_turn: bool
    transcript: str
    end_of_turn_confidence: float
    words: list[Word]

    # Optional fields: absent from the payload unless the conditions described
    # in the docstring hold, so each falls back to None.
    language_code: str | None = None
    language_confidence: float | None = None
    speaker: str | None = Field(default=None, alias="speaker_label")
[docs]
class SpeechStartedMessage(BaseMessage):
    """Message emitted when speech is first detected in the audio stream.

    Parameters:
        type: Fixed to "SpeechStarted" for this message type.
        timestamp: Audio timestamp, in milliseconds, at which speech was detected.
    """

    type: Literal["SpeechStarted"] = "SpeechStarted"
    timestamp: int
[docs]
class TerminationMessage(BaseMessage):
    """Message emitted when the session is terminated.

    Parameters:
        type: Fixed to "Termination" for this message type.
        audio_duration_seconds: Total duration of the audio that was processed.
        session_duration_seconds: Total duration of the session.
    """

    type: Literal["Termination"] = "Termination"
    audio_duration_seconds: float
    session_duration_seconds: float
# Union covering each concrete message model: Begin, Turn, SpeechStarted, Termination.
AnyMessage = (
    BeginMessage
    | TurnMessage
    | SpeechStartedMessage
    | TerminationMessage
)
[docs]
class AssemblyAIConnectionParams(BaseModel):
    """Connection settings for the AssemblyAI WebSocket.

    .. deprecated:: 0.0.105
        Use ``settings=AssemblyAISTTService.Settings(foo=...)`` instead.

    Parameters:
        sample_rate: Audio sample rate in Hz. Defaults to 16000.
        encoding: Audio encoding format. Defaults to "pcm_s16le".
        end_of_turn_confidence_threshold: Confidence threshold for end-of-turn
            detection.
        min_turn_silence: Minimum silence duration when confident about
            end-of-turn.
        min_end_of_turn_silence_when_confident: DEPRECATED. Use min_turn_silence
            instead. If given while min_turn_silence is unset, its value is
            copied into min_turn_silence.
        max_turn_silence: Maximum silence duration before forcing end-of-turn.
        keyterms_prompt: List of key terms to guide transcription. Will be JSON
            serialized before sending.
        prompt: Optional text prompt to guide the transcription. Only used when
            speech_model is "u3-rt-pro".
        speech_model: Select between English, multilingual, and u3-rt-pro models.
            Defaults to "u3-rt-pro".
        language_detection: Enable automatic language detection. Only applicable
            to universal-streaming-multilingual. When enabled, Turn messages
            include language_code and language_confidence fields. Defaults to
            None (not sent).
        format_turns: Whether to format transcript turns. Only applicable to the
            universal-streaming-english and universal-streaming-multilingual
            models. For u3-rt-pro, formatting is automatic and built-in.
            Defaults to True.
        speaker_labels: Enable speaker diarization. When enabled, final
            transcripts (end_of_turn=True) include a speaker field identifying
            the speaker (e.g., "Speaker A", "Speaker B"). Defaults to None
            (not sent).
        vad_threshold: Voice activity detection confidence threshold (0.0 to 1.0)
            for classifying audio frames as silence. Only applicable to
            u3-rt-pro. Frames with VAD confidence below this value are
            considered silent. Increase for noisy environments to reduce false
            speech detection. The API's own default is 0.3. For best performance
            with an external VAD (e.g., Silero), align this value with your
            VAD's activation threshold to avoid the "dead zone" where AssemblyAI
            transcribes speech that your VAD hasn't detected yet. Defaults to
            None (not sent).
    """

    sample_rate: int = 16000
    encoding: Literal["pcm_s16le", "pcm_mulaw"] = "pcm_s16le"
    end_of_turn_confidence_threshold: float | None = None
    min_turn_silence: int | None = None
    min_end_of_turn_silence_when_confident: int | None = None  # Deprecated
    max_turn_silence: int | None = None
    keyterms_prompt: list[str] | None = None
    prompt: str | None = None
    speech_model: Literal[
        "universal-streaming-english",
        "universal-streaming-multilingual",
        "u3-rt-pro",
    ] = "u3-rt-pro"
    language_detection: bool | None = None
    format_turns: bool = True
    speaker_labels: bool | None = None
    vad_threshold: float | None = None

    @model_validator(mode="after")
    def handle_deprecated_param(self):
        """Warn about, and migrate, min_end_of_turn_silence_when_confident.

        Returns:
            This instance. If the deprecated parameter was supplied and
            min_turn_silence was not, min_turn_silence now holds the
            deprecated value.
        """
        legacy_silence = self.min_end_of_turn_silence_when_confident
        if legacy_silence is None:
            # Deprecated parameter not supplied: nothing to warn about or migrate.
            return self

        logger.warning(
            "The 'min_end_of_turn_silence_when_confident' parameter is deprecated and will be "
            "removed in a future version. Please use 'min_turn_silence' instead."
        )
        # An explicitly provided min_turn_silence takes precedence over the
        # deprecated value.
        if self.min_turn_silence is None:
            self.min_turn_silence = legacy_silence
        return self