#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Google Cloud Text-to-Speech service implementations.
This module provides integration with Google Cloud Text-to-Speech API,
offering both HTTP-based synthesis with SSML support and streaming synthesis
for real-time applications.
It also includes GeminiTTSService which uses Gemini's TTS-specific models
for natural voice control and multi-speaker conversations.
"""
import json
import os
from pipecat.utils.tracing.service_decorators import traced_tts
# Suppress gRPC fork warnings
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from typing import Any, Literal
from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
ErrorFrame,
Frame,
StartFrame,
TTSAudioRawFrame,
)
from pipecat.services.settings import (
NOT_GIVEN,
TTSSettings,
_NotGiven,
assert_given,
is_given,
)
from pipecat.services.tts_service import TTSService
from pipecat.transcriptions.language import Language, resolve_language
try:
from google.api_core.client_options import ClientOptions
from google.auth import default
from google.auth.exceptions import GoogleAuthError
from google.cloud import texttospeech_v1
from google.oauth2 import service_account
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_APPLICATION_CREDENTIALS` environment variable."
)
raise Exception(f"Missing module: {e}")
[docs]
def language_to_google_tts_language(language: Language) -> str | None:
    """Map a ``Language`` value to the locale code used by Google Cloud TTS.

    The table mirrors the Chirp 3 HD language list:
    https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd

    Args:
        language: The Language enum value to convert.

    Returns:
        The corresponding Google TTS language code, or None if not supported.
    """
    # Entry order is kept stable on purpose; the lookup itself is delegated to
    # resolve_language with use_base_code=False.
    google_codes = {
        # Arabic
        Language.AR: "ar-XA",
        # Bengali
        Language.BN: "bn-IN",
        Language.BN_IN: "bn-IN",
        # Bulgarian
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Croatian
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Czech
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_IN: "en-IN",
        # Estonian
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Finnish
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.FR_FR: "fr-FR",
        # German
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
        # Greek
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Gujarati
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Hebrew
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Hungarian
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Indonesian
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Kannada
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Korean
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Latvian
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Lithuanian
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Malayalam
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Chinese (Mandarin)
        Language.ZH: "cmn-CN",
        Language.ZH_CN: "cmn-CN",
        # Marathi
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Norwegian
        Language.NO: "nb-NO",
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        # Polish
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-BR",
        Language.PT_BR: "pt-BR",
        # Romanian
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Slovak
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Slovenian
        Language.SL: "sl-SI",
        Language.SL_SI: "sl-SI",
        # Spanish
        Language.ES: "es-ES",
        Language.ES_ES: "es-ES",
        Language.ES_US: "es-US",
        # Swahili
        Language.SW: "sw-KE",
        Language.SW_KE: "sw-KE",
        # Swedish
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tamil
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        # Telugu
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Urdu
        Language.UR: "ur-IN",
        Language.UR_IN: "ur-IN",
        # Vietnamese
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
    }
    resolved = resolve_language(language, google_codes, use_base_code=False)
    return resolved
[docs]
def language_to_gemini_tts_language(language: Language) -> str | None:
    """Map a ``Language`` value to the locale code used by Gemini TTS.

    The table mirrors the Gemini TTS language list; the "GA" / "Preview"
    markers in the comments are carried over from that list:
    https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages

    Args:
        language: The Language enum value to convert.

    Returns:
        The corresponding Gemini TTS language code, or None if not supported.
    """
    # Entry order is kept stable on purpose; the lookup itself is delegated to
    # resolve_language with use_base_code=False.
    gemini_codes = {
        # Afrikaans (Preview)
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Albanian (Preview)
        Language.SQ: "sq-AL",
        Language.SQ_AL: "sq-AL",
        # Amharic (Preview)
        Language.AM: "am-ET",
        Language.AM_ET: "am-ET",
        # Arabic
        Language.AR: "ar-EG",  # GA: Egypt
        Language.AR_EG: "ar-EG",
        Language.AR_001: "ar-001",  # Preview: World
        # Armenian (Preview)
        Language.HY: "hy-AM",
        Language.HY_AM: "hy-AM",
        # Azerbaijani (Preview)
        Language.AZ: "az-AZ",
        Language.AZ_AZ: "az-AZ",
        # Basque (Preview)
        Language.EU: "eu-ES",
        Language.EU_ES: "eu-ES",
        # Belarusian (Preview)
        Language.BE: "be-BY",
        Language.BE_BY: "be-BY",
        # Bengali (GA)
        Language.BN: "bn-BD",
        Language.BN_BD: "bn-BD",
        # Bulgarian (Preview)
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Burmese (Preview)
        Language.MY: "my-MM",
        Language.MY_MM: "my-MM",
        # Catalan (Preview)
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Cebuano (Preview)
        Language.CEB: "ceb-PH",
        Language.CEB_PH: "ceb-PH",
        # Chinese (Mandarin)
        Language.ZH: "cmn-CN",  # Preview
        Language.ZH_CN: "cmn-CN",
        Language.ZH_TW: "cmn-TW",  # Preview
        # Croatian (Preview)
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Czech (Preview)
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish (Preview)
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch (GA)
        Language.NL: "nl-NL",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",  # GA
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",  # Preview
        Language.EN_GB: "en-GB",  # Preview
        Language.EN_IN: "en-IN",  # GA
        # Estonian (Preview)
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Filipino (Preview)
        Language.FIL: "fil-PH",
        Language.FIL_PH: "fil-PH",
        # Finnish (Preview)
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",  # GA
        Language.FR_FR: "fr-FR",
        Language.FR_CA: "fr-CA",  # Preview
        # Galician (Preview)
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # Georgian (Preview)
        Language.KA: "ka-GE",
        Language.KA_GE: "ka-GE",
        # German (GA)
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
        # Greek (Preview)
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Gujarati (Preview)
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Haitian Creole (Preview)
        Language.HT: "ht-HT",
        Language.HT_HT: "ht-HT",
        # Hebrew (Preview)
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi (GA)
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Hungarian (Preview)
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Icelandic (Preview)
        Language.IS: "is-IS",
        Language.IS_IS: "is-IS",
        # Indonesian (GA)
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian (GA)
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese (GA)
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Javanese (Preview)
        Language.JV: "jv-JV",
        Language.JV_JV: "jv-JV",
        # Kannada (Preview)
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Konkani (Preview)
        Language.KOK: "kok-IN",
        Language.KOK_IN: "kok-IN",
        # Korean (GA)
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Lao (Preview)
        Language.LO: "lo-LA",
        Language.LO_LA: "lo-LA",
        # Latin (Preview)
        Language.LA: "la-VA",
        Language.LA_VA: "la-VA",
        # Latvian (Preview)
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Lithuanian (Preview)
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Luxembourgish (Preview)
        Language.LB: "lb-LU",
        Language.LB_LU: "lb-LU",
        # Macedonian (Preview)
        Language.MK: "mk-MK",
        Language.MK_MK: "mk-MK",
        # Maithili (Preview)
        Language.MAI: "mai-IN",
        Language.MAI_IN: "mai-IN",
        # Malagasy (Preview)
        Language.MG: "mg-MG",
        Language.MG_MG: "mg-MG",
        # Malay (Preview)
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Malayalam (Preview)
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Marathi (GA)
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Mongolian (Preview)
        Language.MN: "mn-MN",
        Language.MN_MN: "mn-MN",
        # Nepali (Preview)
        Language.NE: "ne-NP",
        Language.NE_NP: "ne-NP",
        # Norwegian
        Language.NO: "nb-NO",  # Preview: Bokmål
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        Language.NN: "nn-NO",  # Preview: Nynorsk
        Language.NN_NO: "nn-NO",
        # Odia (Preview)
        Language.OR: "or-IN",
        Language.OR_IN: "or-IN",
        # Pashto (Preview)
        Language.PS: "ps-AF",
        Language.PS_AF: "ps-AF",
        # Persian (Preview)
        Language.FA: "fa-IR",
        Language.FA_IR: "fa-IR",
        # Polish (GA)
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-BR",  # GA: Brazil
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",  # Preview: Portugal
        # Punjabi (Preview)
        Language.PA: "pa-IN",
        Language.PA_IN: "pa-IN",
        # Romanian (GA)
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian (GA)
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian (Preview)
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Sindhi (Preview)
        Language.SD: "sd-IN",
        Language.SD_IN: "sd-IN",
        # Sinhala (Preview)
        Language.SI: "si-LK",
        Language.SI_LK: "si-LK",
        # Slovak (Preview)
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Slovenian (Preview)
        Language.SL: "sl-SI",
        Language.SL_SI: "sl-SI",
        # Spanish
        Language.ES: "es-ES",  # GA
        Language.ES_ES: "es-ES",
        Language.ES_419: "es-419",  # Preview: Latin America
        Language.ES_MX: "es-MX",  # Preview: Mexico
        # Swahili (Preview)
        Language.SW: "sw-KE",
        Language.SW_KE: "sw-KE",
        # Swedish (Preview)
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tamil (GA)
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        # Telugu (GA)
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai (GA)
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish (GA)
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian (GA)
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Urdu (Preview)
        Language.UR: "ur-PK",
        Language.UR_PK: "ur-PK",
        # Vietnamese (GA)
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
    }
    resolved = resolve_language(language, gemini_codes, use_base_code=False)
    return resolved
[docs]
@dataclass
class GoogleHttpTTSSettings(TTSSettings):
    """Runtime-updatable settings used by ``GoogleHttpTTSService``.

    Every field defaults to ``NOT_GIVEN``.

    Parameters:
        pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
        rate: Speaking rate adjustment (e.g., "slow", "fast", "125%"). Used for
            SSML prosody tags (non-Chirp voices).
        speaking_rate: Speaking rate for AudioConfig (Chirp/Journey voices).
            Range [0.25, 2.0].
        volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
        emphasis: Emphasis level for the text.
        gender: Voice gender preference.
        google_style: Google-specific voice style.
    """

    # SSML prosody attributes.
    pitch: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    rate: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)

    # Numeric rate passed through the audio config rather than SSML.
    speaking_rate: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)

    volume: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)

    # Closed sets of accepted values.
    emphasis: Literal["strong", "moderate", "reduced", "none"] | None | _NotGiven = field(
        default_factory=lambda: NOT_GIVEN
    )
    gender: Literal["male", "female", "neutral"] | None | _NotGiven = field(
        default_factory=lambda: NOT_GIVEN
    )
    google_style: (
        Literal["apologetic", "calm", "empathetic", "firm", "lively"] | None | _NotGiven
    ) = field(default_factory=lambda: NOT_GIVEN)
[docs]
@dataclass
class GoogleTTSSettings(TTSSettings):
    """Runtime-updatable settings used by ``GoogleTTSService``.

    Parameters:
        speaking_rate: The speaking rate, in the range [0.25, 2.0]. Defaults to
            ``NOT_GIVEN``.
    """

    speaking_rate: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)


#: *Deprecated since 0.0.105:* Use ``GoogleTTSService.Settings`` instead.
GoogleStreamTTSSettings = GoogleTTSSettings
[docs]
@dataclass
class GeminiTTSSettings(TTSSettings):
    """Runtime-updatable settings used by ``GeminiTTSService``.

    Every field defaults to ``NOT_GIVEN``.

    Parameters:
        prompt: Optional style instructions for how to synthesize the content.
        multi_speaker: Whether to enable multi-speaker support.
        speaker_configs: List of speaker configurations for multi-speaker mode.
    """

    prompt: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    multi_speaker: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    speaker_configs: list[dict[str, Any]] | None | _NotGiven = field(
        default_factory=lambda: NOT_GIVEN
    )
[docs]
class GoogleHttpTTSService(TTSService):
    """Google Cloud Text-to-Speech HTTP service with SSML support.

    Each utterance is synthesized with a single ``synthesize_speech`` request
    and the returned audio is emitted as ``TTSAudioRawFrame`` chunks. Voices
    whose name contains "chirp" or "journey" are sent plain text (optionally
    with ``speaking_rate`` in the audio config); every other voice is sent SSML
    built from the pitch/rate/volume/emphasis/gender/style settings.

    Note:
        Requires Google Cloud credentials via service account JSON, credentials file,
        or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
        Chirp and Journey voices don't support SSML and will use plain text input.
    """

    Settings = GoogleHttpTTSSettings
    _settings: Settings

    # Number of leading bytes dropped from every response before chunking
    # (the WAV header that precedes the PCM samples).
    _WAV_HEADER_SIZE = 44

    # NOTE(review): ``InputParams`` is not defined in the visible part of this
    # file — confirm it is in scope where this class is defined.
    def __init__(
        self,
        *,
        credentials: str | None = None,
        credentials_path: str | None = None,
        location: str | None = None,
        voice_id: str | None = None,
        sample_rate: int | None = None,
        params: InputParams | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initializes the Google HTTP TTS service.

        Args:
            credentials: JSON string containing Google Cloud service account credentials.
            credentials_path: Path to Google Cloud service account JSON file.
            location: Google Cloud location for regional endpoint (e.g., "us-central1").
            voice_id: Google TTS voice identifier (e.g., "en-US-Standard-A").

                .. deprecated:: 0.0.105
                    Use ``settings=GoogleHttpTTSService.Settings(voice=...)`` instead.

            sample_rate: Audio sample rate in Hz. If None, uses default.
            params: Voice customization parameters including pitch, rate, volume, etc.

                .. deprecated:: 0.0.105
                    Use ``settings=GoogleHttpTTSService.Settings(...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService.

        Raises:
            ValueError: If no valid credentials can be resolved.
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model=None,
            voice="en-US-Chirp3-HD-Charon",
            language="en-US",
            pitch=None,
            rate=None,
            speaking_rate=None,
            volume=None,
            emphasis=None,
            gender=None,
            google_style=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if voice_id is not None:
            self._warn_init_param_moved_to_settings("voice_id", "voice")
            default_settings.voice = voice_id

        # 3. Apply params overrides — only if settings not provided
        if params is not None:
            self._warn_init_param_moved_to_settings("params")
            if settings is None:
                for name in (
                    "pitch",
                    "rate",
                    "speaking_rate",
                    "volume",
                    "emphasis",
                    "language",
                    "gender",
                    "google_style",
                ):
                    value = getattr(params, name)
                    if value is not None:
                        setattr(default_settings, name, value)

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        # _create_client reads self._location, so it must be set first.
        self._location = location
        self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
            credentials, credentials_path
        )

    def _create_client(
        self, credentials: str | None, credentials_path: str | None
    ) -> texttospeech_v1.TextToSpeechAsyncClient:
        """Create authenticated Google Text-to-Speech client.

        Credentials are resolved in this order: the ``credentials`` JSON string,
        then the ``credentials_path`` file, then application default credentials.

        Args:
            credentials: JSON string with service account credentials.
            credentials_path: Path to service account JSON file.

        Returns:
            Authenticated TextToSpeechAsyncClient instance.

        Raises:
            ValueError: If no valid credentials are provided. When the default
                credential lookup failed, that error is attached as the cause.
        """
        creds: service_account.Credentials | None = None
        auth_error: GoogleAuthError | None = None

        if credentials:
            # Use provided credentials JSON string
            json_account_info = json.loads(credentials)
            creds = service_account.Credentials.from_service_account_info(json_account_info)
        elif credentials_path:
            # Use service account JSON file if provided
            creds = service_account.Credentials.from_service_account_file(credentials_path)
        else:
            try:
                creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
            except GoogleAuthError as e:
                # Remember the failure so the ValueError below carries the root cause.
                auth_error = e

        if not creds:
            raise ValueError("No valid credentials provided.") from auth_error

        client_options = None
        if self._location:
            client_options = ClientOptions(
                api_endpoint=f"{self._location}-texttospeech.googleapis.com"
            )

        return texttospeech_v1.TextToSpeechAsyncClient(
            credentials=creds, client_options=client_options
        )

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Google HTTP TTS service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> str | None:
        """Convert a Language enum to Google TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Google TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    async def _update_settings(self, delta: TTSSettings) -> dict[str, Any]:
        """Override to handle speaking_rate validation.

        A ``speaking_rate`` outside [0.25, 2.0] is logged and dropped from the
        delta (reset to ``NOT_GIVEN``) before the update is applied. An explicit
        ``None`` is passed through unvalidated.

        Args:
            delta: Settings delta. Can include 'speaking_rate' (float).

        Returns:
            Whatever the parent ``_update_settings`` returns for the delta.
        """
        if (
            isinstance(delta, self.Settings)
            and is_given(delta.speaking_rate)
            # None is an allowed value for the field; float(None) would raise.
            and delta.speaking_rate is not None
        ):
            rate_value = float(delta.speaking_rate)
            if not (0.25 <= rate_value <= 2.0):
                logger.warning(
                    f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
                )
                delta.speaking_rate = NOT_GIVEN

        return await super()._update_settings(delta)

    def _construct_ssml(self, text: str) -> str:
        """Wrap ``text`` in SSML tags derived from the current settings.

        Tags are nested as speak > voice > prosody > emphasis > google:style;
        the last three are only emitted when the matching settings are truthy.

        Args:
            text: The text to speak. It is inserted verbatim.

        Returns:
            The SSML document as a string.
        """
        # NOTE(review): neither ``text`` nor the attribute values are
        # XML-escaped here — confirm whether callers rely on passing markup.
        settings = self._settings

        voice_attrs = [f"name='{settings.voice}'", f"language='{settings.language}'"]
        if settings.gender:
            voice_attrs.append(f"gender='{settings.gender}'")

        prosody_attrs = [
            f"{attr}='{value}'"
            for attr, value in (
                ("pitch", settings.pitch),
                ("rate", settings.rate),
                ("volume", settings.volume),
            )
            if value
        ]

        # (opening tag, closing tag) pairs, outermost first.
        wrappers = [
            ("<speak>", "</speak>"),
            (f"<voice {' '.join(voice_attrs)}>", "</voice>"),
        ]
        if prosody_attrs:
            wrappers.append((f"<prosody {' '.join(prosody_attrs)}>", "</prosody>"))
        if settings.emphasis:
            wrappers.append((f"<emphasis level='{settings.emphasis}'>", "</emphasis>"))
        if settings.google_style:
            wrappers.append(
                (f"<google:style name='{settings.google_style}'>", "</google:style>")
            )

        opening = "".join(open_tag for open_tag, _ in wrappers)
        closing = "".join(close_tag for _, close_tag in reversed(wrappers))
        return f"{opening}{text}{closing}"

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Google's HTTP TTS API.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech, or a single
            ErrorFrame if synthesis raises.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            # Chirp (including Chirp 3) and Journey voices don't support SSML.
            voice_name = assert_given(self._settings.voice)
            lowered_name = (voice_name or "").lower()
            plain_text_voice = "chirp" in lowered_name or "journey" in lowered_name

            if plain_text_voice:
                synthesis_input = texttospeech_v1.SynthesisInput(text=text)
            else:
                synthesis_input = texttospeech_v1.SynthesisInput(
                    ssml=self._construct_ssml(text)
                )

            voice = texttospeech_v1.VoiceSelectionParams(
                language_code=self._settings.language, name=self._settings.voice
            )

            audio_config_params = {
                "audio_encoding": texttospeech_v1.AudioEncoding.LINEAR16,
                "sample_rate_hertz": self.sample_rate,
            }
            # speaking_rate is only forwarded for the plain-text voices; other
            # voices control rate through the SSML prosody tag instead.
            if plain_text_voice and self._settings.speaking_rate is not None:
                audio_config_params["speaking_rate"] = self._settings.speaking_rate
            audio_config = texttospeech_v1.AudioConfig(**audio_config_params)

            request = texttospeech_v1.SynthesizeSpeechRequest(
                input=synthesis_input, voice=voice, audio_config=audio_config
            )
            response = await self._client.synthesize_speech(request=request)

            await self.start_tts_usage_metrics(text)

            # Skip the WAV header so only raw samples are emitted.
            audio_content = response.audio_content[self._WAV_HEADER_SIZE :]

            # Time-to-first-byte ends once, right before the first frame goes out.
            if audio_content:
                await self.stop_ttfb_metrics()

            chunk_size = self.chunk_size
            for offset in range(0, len(audio_content), chunk_size):
                yield TTSAudioRawFrame(
                    audio_content[offset : offset + chunk_size],
                    self.sample_rate,
                    1,
                    context_id=context_id,
                )
        except Exception as e:
            yield ErrorFrame(error=f"TTS generation error: {e}")
[docs]
class GoogleBaseTTSService(TTSService):
    """Base class for Google Cloud Text-to-Speech streaming services.

    Provides shared streaming synthesis logic for Google TTS services.
    This is an abstract base class. Use GoogleTTSService or GeminiTTSService instead.

    The methods here read ``self._location`` and ``self._client``, which this
    class does not assign itself.
    """

    def _create_client(
        self, credentials: str | None, credentials_path: str | None
    ) -> texttospeech_v1.TextToSpeechAsyncClient:
        """Create authenticated Google Text-to-Speech client.

        Credentials are resolved in this order: the ``credentials`` JSON string,
        then the ``credentials_path`` file, then application default credentials.

        Args:
            credentials: JSON string with service account credentials.
            credentials_path: Path to service account JSON file.

        Returns:
            Authenticated TextToSpeechAsyncClient instance.

        Raises:
            ValueError: If no valid credentials are provided. When the default
                credential lookup failed, that error is attached as the cause.
        """
        creds: service_account.Credentials | None = None
        auth_error: GoogleAuthError | None = None

        if credentials:
            # Use provided credentials JSON string
            json_account_info = json.loads(credentials)
            creds = service_account.Credentials.from_service_account_info(json_account_info)
        elif credentials_path:
            # Use service account JSON file if provided
            creds = service_account.Credentials.from_service_account_file(credentials_path)
        else:
            try:
                creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
            except GoogleAuthError as e:
                # Remember the failure so the ValueError below carries the root cause.
                auth_error = e

        if not creds:
            raise ValueError("No valid credentials provided.") from auth_error

        client_options = None
        if self._location:
            client_options = ClientOptions(
                api_endpoint=f"{self._location}-texttospeech.googleapis.com"
            )

        return texttospeech_v1.TextToSpeechAsyncClient(
            credentials=creds, client_options=client_options
        )

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Google streaming TTS services support metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> str | None:
        """Convert a Language enum to Google TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Google TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    async def _stream_tts(
        self,
        streaming_config: texttospeech_v1.StreamingSynthesizeConfig,
        text: str,
        context_id: str,
        prompt: str | None = None,
    ) -> AsyncGenerator[Frame, None]:
        """Shared streaming synthesis logic.

        Sends the config request followed by one input request, then re-chunks
        the streamed audio into frames of ``self.chunk_size`` bytes. Whatever
        is left over after the stream ends is emitted as a final, shorter frame.

        Args:
            streaming_config: The streaming configuration.
            text: The text to synthesize.
            context_id: Unique identifier for this TTS context.
            prompt: Optional prompt for style instructions (Gemini only).

        Yields:
            Frame: Audio frames containing the synthesized speech.
        """
        config_request = texttospeech_v1.StreamingSynthesizeRequest(
            streaming_config=streaming_config
        )

        async def request_generator():
            # The config request goes first, followed by the text input.
            yield config_request
            synthesis_input_params = {"text": text}
            if prompt is not None:
                synthesis_input_params["prompt"] = prompt
            yield texttospeech_v1.StreamingSynthesizeRequest(
                input=texttospeech_v1.StreamingSynthesisInput(**synthesis_input_params)
            )

        streaming_responses = await self._client.streaming_synthesize(request_generator())
        await self.start_tts_usage_metrics(text)

        chunk_size = self.chunk_size
        pending = b""
        ttfb_stopped = False

        async for response in streaming_responses:
            received = response.audio_content
            if not received:
                continue

            # Time-to-first-byte ends on the first non-empty audio payload.
            if not ttfb_stopped:
                await self.stop_ttfb_metrics()
                ttfb_stopped = True

            pending += received
            while len(pending) >= chunk_size:
                piece, pending = pending[:chunk_size], pending[chunk_size:]
                yield TTSAudioRawFrame(piece, self.sample_rate, 1, context_id=context_id)

        if pending:
            yield TTSAudioRawFrame(pending, self.sample_rate, 1, context_id=context_id)
[docs]
class GoogleTTSService(GoogleBaseTTSService):
    """Google Cloud Text-to-Speech streaming service.

    Provides real-time text-to-speech synthesis using Google Cloud's streaming API
    for low-latency applications. Optimized for Chirp 3 HD and Journey voices
    with continuous audio streaming capabilities.

    Note:
        Requires Google Cloud credentials via service account JSON, file path, or
        default application credentials (GOOGLE_APPLICATION_CREDENTIALS env var).
        Only Chirp 3 HD and Journey voices are supported. Use GoogleHttpTTSService for other voices.

    Example::

        tts = GoogleTTSService(
            credentials_path="/path/to/service-account.json",
            settings=GoogleTTSService.Settings(
                voice="en-US-Chirp3-HD-Charon",
                language=Language.EN_US,
            )
        )
    """

    Settings = GoogleTTSSettings
    _settings: Settings

    # NOTE(review): ``InputParams`` is not defined in the visible part of this
    # file — confirm it is in scope where this class is defined.
    def __init__(
        self,
        *,
        credentials: str | None = None,
        credentials_path: str | None = None,
        location: str | None = None,
        voice_id: str | None = None,
        voice_cloning_key: str | None = None,
        sample_rate: int | None = None,
        params: InputParams | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initializes the Google streaming TTS service.

        Args:
            credentials: JSON string containing Google Cloud service account credentials.
            credentials_path: Path to Google Cloud service account JSON file.
            location: Google Cloud location for regional endpoint (e.g., "us-central1").
            voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").

                .. deprecated:: 0.0.105
                    Use ``settings=GoogleTTSService.Settings(voice=...)`` instead.

            voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
            sample_rate: Audio sample rate in Hz. If None, uses default.
            params: Language configuration parameters.

                .. deprecated:: 0.0.105
                    Use ``settings=GoogleTTSService.Settings(...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService.
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model=None,
            voice="en-US-Chirp3-HD-Charon",
            language="en-US",
            speaking_rate=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if voice_id is not None:
            self._warn_init_param_moved_to_settings("voice_id", "voice")
            default_settings.voice = voice_id

        # 3. Apply params overrides — only if settings not provided
        if params is not None:
            self._warn_init_param_moved_to_settings("params")
            if settings is None:
                if params.language is not None:
                    default_settings.language = params.language
                if params.speaking_rate is not None:
                    default_settings.speaking_rate = params.speaking_rate

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        # _create_client reads self._location, so it must be set first.
        self._location = location
        self._voice_cloning_key = voice_cloning_key
        self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
            credentials, credentials_path
        )

    async def _update_settings(self, delta: TTSSettings) -> dict[str, Any]:
        """Override to handle speaking_rate validation.

        A ``speaking_rate`` outside [0.25, 2.0] is logged and dropped from the
        delta (reset to ``NOT_GIVEN``) before the update is applied. An explicit
        ``None`` is passed through unvalidated.

        Args:
            delta: Settings delta. Can include 'speaking_rate' (float).

        Returns:
            Whatever the parent ``_update_settings`` returns for the delta.
        """
        if (
            isinstance(delta, self.Settings)
            and is_given(delta.speaking_rate)
            # None is an allowed value for the field; float(None) would raise.
            and delta.speaking_rate is not None
        ):
            rate_value = float(delta.speaking_rate)
            if not (0.25 <= rate_value <= 2.0):
                logger.warning(
                    f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
                )
                delta.speaking_rate = NOT_GIVEN

        return await super()._update_settings(delta)

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate streaming speech from text using Google's streaming API.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech as it's generated.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            # A voice cloning key takes the place of the named voice.
            if self._voice_cloning_key:
                voice_clone_params = texttospeech_v1.VoiceCloneParams(
                    voice_cloning_key=self._voice_cloning_key
                )
                voice = texttospeech_v1.VoiceSelectionParams(
                    language_code=self._settings.language, voice_clone=voice_clone_params
                )
            else:
                voice = texttospeech_v1.VoiceSelectionParams(
                    language_code=self._settings.language, name=self._settings.voice
                )

            streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
                voice=voice,
                streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
                    audio_encoding=texttospeech_v1.AudioEncoding.PCM,
                    sample_rate_hertz=self.sample_rate,
                    speaking_rate=self._settings.speaking_rate,
                ),
            )

            # Use base class streaming logic
            async for frame in self._stream_tts(streaming_config, text, context_id):
                yield frame
        except Exception as e:
            # Errors are reported via push_error rather than raised to the caller.
            await self.push_error(error_msg=f"TTS generation error: {e}", exception=e)
[docs]
class GeminiTTSService(GoogleBaseTTSService):
    """Gemini Text-to-Speech streaming service using Gemini TTS models.

    Provides real-time text-to-speech synthesis using Gemini's TTS-specific models
    (gemini-2.5-flash-tts and gemini-2.5-pro-tts) with support for natural
    voice control, prompts for style instructions, expressive markup tags,
    and multi-speaker conversations.

    Note:
        Requires Google Cloud credentials via service account JSON, credentials file,
        or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
        Uses the Google Cloud Text-to-Speech streaming API for low-latency synthesis.

    Example::

        tts = GeminiTTSService(
            credentials_path="/path/to/service-account.json",
            settings=GeminiTTSService.Settings(
                model="gemini-2.5-flash-tts",
                voice="Kore",
                language=Language.EN_US,
                prompt="Say this in a friendly and helpful tone"
            )
        )
    """

    Settings = GeminiTTSSettings
    _settings: Settings

    GOOGLE_SAMPLE_RATE = 24000  # Google TTS always outputs at 24kHz

    # Known Gemini TTS voice names. The list is advisory only: a voice that is
    # not listed triggers a warning but is still sent to the API unchanged.
    AVAILABLE_VOICES = [
        "Achernar",
        "Achird",
        "Algenib",
        "Algieba",
        "Alnilam",
        "Aoede",
        "Autonoe",
        "Callirrhoe",
        "Charon",
        "Despina",
        "Enceladus",
        "Erinome",
        "Fenrir",
        "Gacrux",
        "Iapetus",
        "Kore",
        "Laomedeia",
        "Leda",
        "Orus",
        "Puck",
        "Pulcherrima",
        "Rasalgethi",
        "Sadachbia",
        "Sadaltager",
        "Schedar",
        "Sulafat",
        "Umbriel",
        "Vindemiatrix",
        "Zephyr",
        "Zubenelgenubi",
    ]

    def __init__(
        self,
        *,
        model: str | None = None,
        credentials: str | None = None,
        credentials_path: str | None = None,
        location: str | None = None,
        voice_id: str | None = None,
        sample_rate: int | None = None,
        params: InputParams | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initializes the Gemini TTS service.

        Args:
            model: Gemini TTS model to use. Must be a TTS model like
                "gemini-2.5-flash-tts" or "gemini-2.5-pro-tts".

                .. deprecated:: 0.0.105
                    Use ``settings=GeminiTTSService.Settings(model=...)`` instead.

            credentials: JSON string containing Google Cloud service account credentials.
            credentials_path: Path to Google Cloud service account JSON file.
            location: Google Cloud location for regional endpoint (e.g., "us-central1").
            voice_id: Voice name from the available Gemini voices.

                .. deprecated:: 0.0.105
                    Use ``settings=GeminiTTSService.Settings(voice=...)`` instead.

            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
            params: TTS configuration parameters.

                .. deprecated:: 0.0.105
                    Use ``settings=GeminiTTSService.Settings(...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService.
        """
        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )

        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model="gemini-2.5-flash-tts",
            voice="Kore",
            language="en-US",
            prompt=None,
            multi_speaker=False,
            speaker_configs=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if model is not None:
            self._warn_init_param_moved_to_settings("model", "model")
            default_settings.model = model
        if voice_id is not None:
            self._warn_init_param_moved_to_settings("voice_id", "voice")
            default_settings.voice = voice_id

        # 3. Apply params overrides — only if settings not provided
        if params is not None:
            self._warn_init_param_moved_to_settings("params")
            if settings is None:
                if params.language is not None:
                    default_settings.language = params.language
                if params.prompt is not None:
                    default_settings.prompt = params.prompt
                if params.multi_speaker is not None:
                    default_settings.multi_speaker = params.multi_speaker
                if params.speaker_configs is not None:
                    default_settings.speaker_configs = params.speaker_configs

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        # Validate the voice only once every override has been applied, so the
        # check covers a voice supplied through ``settings`` and does not warn
        # about a deprecated ``voice_id`` that ``settings`` has replaced.
        self._warn_if_unknown_voice(default_settings.voice)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        self._location = location
        self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
            credentials, credentials_path
        )

    def _warn_if_unknown_voice(self, voice: Any) -> None:
        """Log a warning when ``voice`` is not listed in ``AVAILABLE_VOICES``.

        The voice is never rejected; the warning is purely informational.

        Args:
            voice: The voice name about to be used.
        """
        if voice not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice}' not in known voices list. Using anyway.")

    def language_to_service_language(self, language: Language) -> str | None:
        """Convert a Language enum to Gemini TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Gemini TTS-specific language code, or None if not supported.
        """
        return language_to_gemini_tts_language(language)

    async def start(self, frame: StartFrame):
        """Start the Gemini TTS service.

        Logs a warning if the sample rate in effect after the parent has
        started differs from ``GOOGLE_SAMPLE_RATE``.

        Args:
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
        if self.sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS requires {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

    async def _update_settings(self, delta: TTSSettings) -> dict[str, Any]:
        """Apply a settings delta with voice validation.

        Args:
            delta: Settings delta. Can include 'voice', 'prompt', etc.

        Returns:
            Dict mapping changed field names to their previous values.
        """
        # Only validate when the delta actually carries a voice.
        if is_given(delta.voice):
            self._warn_if_unknown_voice(delta.voice)
        return await super()._update_settings(delta)

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate streaming speech from text using Gemini TTS models.

        Args:
            text: The text to synthesize into speech. Can include markup tags
                like [sigh], [laughing], [whispering] for expressive control.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech as it's generated.
            If synthesis raises, a single ErrorFrame describing the failure is
            yielded instead of propagating the exception.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")
        try:
            # Build voice selection params
            if self._settings.multi_speaker and self._settings.speaker_configs:
                # Multi-speaker mode: one prebuilt voice per configured alias.
                # A config without "speaker_id" falls back to the service voice.
                speaker_voice_configs = [
                    texttospeech_v1.MultispeakerPrebuiltVoice(
                        speaker_alias=speaker_config["speaker_alias"],
                        speaker_id=speaker_config.get("speaker_id", self._settings.voice),
                    )
                    for speaker_config in self._settings.speaker_configs
                ]
                multi_speaker_voice_config = texttospeech_v1.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=speaker_voice_configs
                )
                voice = texttospeech_v1.VoiceSelectionParams(
                    language_code=self._settings.language,
                    model_name=self._settings.model,
                    multi_speaker_voice_config=multi_speaker_voice_config,
                )
            else:
                # Single speaker mode
                voice = texttospeech_v1.VoiceSelectionParams(
                    language_code=self._settings.language,
                    name=self._settings.voice,
                    model_name=self._settings.model,
                )

            # Create streaming config
            streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
                voice=voice,
                streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
                    audio_encoding=texttospeech_v1.AudioEncoding.PCM,
                    sample_rate_hertz=self.sample_rate,
                ),
            )

            # Use base class streaming logic with prompt support
            async for frame in self._stream_tts(
                streaming_config, text, context_id, assert_given(self._settings.prompt)
            ):
                yield frame
        except Exception as e:
            error_message = f"Gemini TTS generation error: {str(e)}"
            yield ErrorFrame(error=error_message)