Source code for pipecat.services.gladia.config

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Configuration for the Gladia STT service."""

from typing import Any

from pydantic import BaseModel



[docs]
class LanguageConfig(BaseModel):
    """Configuration for language detection and handling.

    Every field is optional and defaults to ``None``.

    Parameters:
        languages: List of language codes to use for transcription.
        code_switching: Whether to auto-detect language changes during
            transcription.
    """

    languages: list[str] | None = None
    code_switching: bool | None = None




[docs]
class PreProcessingConfig(BaseModel):
    """Configuration for audio pre-processing options.

    Every field is optional and defaults to ``None``.

    Parameters:
        audio_enhancer: Apply pre-processing to the audio stream to enhance
            quality.
        speech_threshold: Sensitivity for speech detection (0-1).
    """

    audio_enhancer: bool | None = None
    # NOTE(review): the 0-1 range is documented only; the annotation is a plain
    # float, so this declaration itself does not reject out-of-range values.
    speech_threshold: float | None = None




[docs]
class CustomVocabularyItem(BaseModel):
    """Represents a custom vocabulary item with an intensity value.

    ``value`` and ``intensity`` have no default and must be supplied;
    ``pronunciations`` and ``language`` are optional and default to ``None``.

    Parameters:
        value: The vocabulary word or phrase.
        intensity: The bias intensity for this vocabulary item (0-1).
        pronunciations: The pronunciations used in the transcription.
        language: The language in which the item will be pronounced when
            sound comparison occurs. Defaults to the transcription language.
    """

    # Required fields (no default value).
    value: str
    intensity: float
    # Optional fields.
    pronunciations: list[str] | None = None
    language: str | None = None




[docs]
class CustomVocabularyConfig(BaseModel):
    """Configuration for custom vocabulary.

    Every field is optional and defaults to ``None``.

    Parameters:
        vocabulary: List of entries, each either a plain word/phrase string
            or a ``CustomVocabularyItem`` object. Both kinds may be mixed in
            the same list.
        default_intensity: Default intensity for simple string vocabulary
            items.
    """

    vocabulary: list[str | CustomVocabularyItem] | None = None
    default_intensity: float | None = None




[docs]
class CustomSpellingConfig(BaseModel):
    """Configuration for custom spelling rules.

    Parameters:
        spelling_dictionary: Mapping of correct spellings to phonetic
            variations. Each key is the correct spelling and each value is
            the list of variations for it. Optional; defaults to ``None``.
    """

    spelling_dictionary: dict[str, list[str]] | None = None




[docs]
class TranslationConfig(BaseModel):
    """Configuration for real-time translation.

    Every field is optional and defaults to ``None``.

    Parameters:
        target_languages: List of target language codes for translation.
        model: Translation model to use ("base" or "enhanced").
        match_original_utterances: Whether to align translations with
            original utterances.
        lipsync: Whether to enable lip-sync optimization for translations.
        context_adaptation: Whether to enable context-aware translation
            adaptation.
        context: Additional context to help with translation accuracy.
        informal: Force informal language forms when available.
    """

    target_languages: list[str] | None = None
    # NOTE(review): typed as a plain str, so the "base"/"enhanced" choice is
    # documented here but not restricted by the annotation.
    model: str | None = None
    match_original_utterances: bool | None = None
    lipsync: bool | None = None
    context_adaptation: bool | None = None
    context: str | None = None
    informal: bool | None = None




[docs]
class RealtimeProcessingConfig(BaseModel):
    """Configuration for real-time processing features.

    Every field is optional and defaults to ``None``. Custom vocabulary,
    custom spelling and translation each have a boolean enable flag plus a
    separate ``*_config`` model carrying the feature's detailed settings.

    Parameters:
        words_accurate_timestamps: Whether to provide per-word timestamps.
        custom_vocabulary: Whether to enable custom vocabulary.
        custom_vocabulary_config: Custom vocabulary configuration.
        custom_spelling: Whether to enable custom spelling.
        custom_spelling_config: Custom spelling configuration.
        translation: Whether to enable translation.
        translation_config: Translation configuration.
        named_entity_recognition: Whether to enable named entity recognition.
        sentiment_analysis: Whether to enable sentiment analysis.
    """

    words_accurate_timestamps: bool | None = None
    # Feature flag / detailed configuration pairs.
    custom_vocabulary: bool | None = None
    custom_vocabulary_config: CustomVocabularyConfig | None = None
    custom_spelling: bool | None = None
    custom_spelling_config: CustomSpellingConfig | None = None
    translation: bool | None = None
    translation_config: TranslationConfig | None = None
    # Flags with no accompanying configuration model.
    named_entity_recognition: bool | None = None
    sentiment_analysis: bool | None = None




[docs]
class MessagesConfig(BaseModel):
    """Configuration for controlling which message types are sent via WebSocket.

    Each field is an optional boolean toggle for one message category and
    defaults to ``None``.

    Parameters:
        receive_partial_transcripts: Whether to receive intermediate
            transcription results.
        receive_final_transcripts: Whether to receive final transcription
            results.
        receive_speech_events: Whether to receive speech begin/end events.
        receive_pre_processing_events: Whether to receive pre-processing
            events.
        receive_realtime_processing_events: Whether to receive real-time
            processing events.
        receive_post_processing_events: Whether to receive post-processing
            events.
        receive_acknowledgments: Whether to receive acknowledgment messages.
        receive_errors: Whether to receive error messages.
        receive_lifecycle_events: Whether to receive lifecycle events.
    """

    receive_partial_transcripts: bool | None = None
    receive_final_transcripts: bool | None = None
    receive_speech_events: bool | None = None
    receive_pre_processing_events: bool | None = None
    receive_realtime_processing_events: bool | None = None
    receive_post_processing_events: bool | None = None
    receive_acknowledgments: bool | None = None
    receive_errors: bool | None = None
    receive_lifecycle_events: bool | None = None




[docs]
class GladiaInputParams(BaseModel):
    """Configuration parameters for the Gladia STT service.

    .. deprecated:: 0.0.105
        Use ``settings=GladiaSTTService.Settings(...)`` for runtime-updatable
        fields and direct init parameters for encoding/bit_depth/channels.

    Parameters:
        encoding: Audio encoding format. Defaults to ``"wav/pcm"``.
        bit_depth: Audio bit depth. Defaults to 16.
        channels: Number of audio channels. Defaults to 1.
        custom_metadata: Additional metadata to include with requests.
        endpointing: Silence duration in seconds to mark end of speech.
        maximum_duration_without_endpointing: Maximum utterance duration
            without silence. Defaults to 5.
        language_config: Detailed language configuration.
        pre_processing: Audio pre-processing options.
        realtime_processing: Real-time processing features.
        messages_config: WebSocket message filtering options.
        enable_vad: Enable VAD to trigger end of utterance detection. This should be used
            without any other VAD enabled in the agent and will emit the speaker started
            and stopped frames. Defaults to False.
    """

    # Audio format of the input stream.
    encoding: str | None = "wav/pcm"
    bit_depth: int | None = 16
    channels: int | None = 1
    custom_metadata: dict[str, Any] | None = None
    # End-of-utterance detection settings.
    endpointing: float | None = None
    # NOTE(review): unit is not stated here; presumably seconds like
    # ``endpointing`` - confirm against the Gladia API reference.
    maximum_duration_without_endpointing: int | None = 5
    # Nested configuration models; each is optional and defaults to None.
    language_config: LanguageConfig | None = None
    pre_processing: PreProcessingConfig | None = None
    realtime_processing: RealtimeProcessingConfig | None = None
    messages_config: MessagesConfig | None = None
    # The only non-optional flag in this model: plain bool, off by default.
    enable_vad: bool = False