Source code for pipecat.services.mistral.tts

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Mistral text-to-speech service implementation.

This module provides integration with Mistral's Voxtral TTS API for
generating speech from text input using HTTP streaming with Server-Sent Events.
"""

import base64
import struct
from collections.abc import AsyncGenerator
from dataclasses import dataclass

from loguru import logger

from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
    TTSAudioRawFrame,
)
from pipecat.services.settings import TTSSettings, assert_given
from pipecat.services.tts_service import TTSService
from pipecat.utils.tracing.service_decorators import traced_tts

try:
    from mistralai.client import Mistral
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Mistral TTS, you need to `pip install pipecat-ai[mistral]`.")
    raise Exception(f"Missing module: {e}")



[docs]
@dataclass
class MistralTTSSettings(TTSSettings):
    """Settings for MistralTTSService.

    Parameters:
        model: TTS model identifier.
        voice: Voice identifier.
        language: Language for speech synthesis.
    """

    pass




[docs]
class MistralTTSService(TTSService):
    """Mistral Text-to-Speech service using the Voxtral TTS API.

    This service uses Mistral's streaming TTS API to generate PCM-encoded audio
    at 24kHz. The API returns base64-encoded float32 PCM chunks via Server-Sent
    Events, which are converted to int16 for the Pipecat pipeline.
    """

    Settings = MistralTTSSettings
    _settings: Settings

    MISTRAL_SAMPLE_RATE = 24000


[docs]
    def __init__(
        self,
        *,
        api_key: str | None = None,
        sample_rate: int | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initialize Mistral TTS service.

        Args:
            api_key: Mistral API key for authentication.
            sample_rate: Output audio sample rate in Hz. Audio is resampled from
                Mistral's native 24kHz when a different rate is requested.
            settings: Runtime-updatable settings.
            **kwargs: Additional keyword arguments passed to TTSService.
        """
        # Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model="voxtral-mini-tts-2603",
            voice=None,
            language=None,
        )

        # Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        self._client = Mistral(api_key=api_key)



[docs]
    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Mistral TTS service supports metrics generation.
        """
        return True


    @staticmethod
    def _float32_to_int16(data: bytes) -> bytes:
        """Convert float32 PCM audio data to int16 PCM.

        Args:
            data: Raw bytes containing float32 LE PCM samples.

        Returns:
            Raw bytes containing int16 LE PCM samples.
        """
        n = len(data) // 4
        floats = struct.unpack(f"<{n}f", data)
        return struct.pack(f"<{n}h", *(min(32767, max(-32768, int(f * 32767))) for f in floats))


[docs]
    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Mistral's TTS API.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech data.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")
        try:
            await self.start_tts_usage_metrics(text)

            async with await self._client.audio.speech.complete_async(
                input=text,
                model=assert_given(self._settings.model),
                voice_id=assert_given(self._settings.voice),
                response_format="pcm",
                stream=True,
            ) as event_stream:
                async for event in event_stream:
                    if event.event == "speech.audio.delta":
                        audio_bytes = base64.b64decode(event.data.audio_data)
                        audio_int16 = self._float32_to_int16(audio_bytes)
                        audio_data = await self._resampler.resample(
                            audio_int16, self.MISTRAL_SAMPLE_RATE, self.sample_rate
                        )
                        await self.stop_ttfb_metrics()
                        yield TTSAudioRawFrame(
                            audio_data, self.sample_rate, 1, context_id=context_id
                        )
                    elif event.event == "speech.audio.done":
                        if hasattr(event.data, "usage") and event.data.usage:
                            logger.debug(f"{self}: Usage info: {event.data.usage}")
        except Exception as e:
            logger.error(f"{self} error generating TTS: {e}")
            yield ErrorFrame(error=f"Error generating TTS: {e}")