#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""AWS Polly text-to-speech service implementation.
This module provides integration with Amazon Polly for text-to-speech synthesis,
supporting multiple languages, voices, and SSML features.
"""
import os
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
ErrorFrame,
Frame,
TTSAudioRawFrame,
)
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven
from pipecat.services.tts_service import TTSService
from pipecat.transcriptions.language import Language, resolve_language
from pipecat.utils.tracing.service_decorators import traced_tts
try:
import aioboto3
from botocore.exceptions import BotoCoreError, ClientError
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
raise Exception(f"Missing module: {e}")
[docs]
def language_to_aws_language(language: Language) -> str | None:
    """Map a pipecat ``Language`` value onto an Amazon Polly language code.

    The lookup table below is handed to ``resolve_language`` together with
    ``use_base_code=False``; whatever that helper returns is returned as-is.

    Args:
        language: The Language enum value to convert.

    Returns:
        The corresponding AWS Polly language code, or None if not supported.
    """
    polly_codes = {
        # Arabic
        Language.AR: "arb",
        Language.AR_AE: "ar-AE",
        # Catalan
        Language.CA: "ca-ES",
        # Chinese
        Language.ZH: "cmn-CN",  # Mandarin
        Language.YUE: "yue-CN",  # Cantonese
        Language.YUE_CN: "yue-CN",
        # Czech
        Language.CS: "cs-CZ",
        # Danish
        Language.DA: "da-DK",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        # English (the bare "en" code falls back to US English)
        Language.EN: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_IN: "en-IN",
        Language.EN_NZ: "en-NZ",
        Language.EN_US: "en-US",
        Language.EN_ZA: "en-ZA",
        # Finnish
        Language.FI: "fi-FI",
        # French
        Language.FR: "fr-FR",
        Language.FR_BE: "fr-BE",
        Language.FR_CA: "fr-CA",
        # German
        Language.DE: "de-DE",
        Language.DE_AT: "de-AT",
        Language.DE_CH: "de-CH",
        # Hindi
        Language.HI: "hi-IN",
        # Icelandic
        Language.IS: "is-IS",
        # Italian
        Language.IT: "it-IT",
        # Japanese
        Language.JA: "ja-JP",
        # Korean
        Language.KO: "ko-KR",
        # Norwegian (all variants map to Bokmål)
        Language.NO: "nb-NO",
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        # Polish
        Language.PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",
        # Romanian
        Language.RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        # Spanish
        Language.ES: "es-ES",
        Language.ES_MX: "es-MX",
        Language.ES_US: "es-US",
        # Swedish
        Language.SV: "sv-SE",
        # Turkish
        Language.TR: "tr-TR",
        # Welsh
        Language.CY: "cy-GB",
        Language.CY_GB: "cy-GB",
    }

    return resolve_language(language, polly_codes, use_base_code=False)
[docs]
@dataclass
class AWSPollyTTSSettings(TTSSettings):
    """Polly-specific settings layered on top of ``TTSSettings``.

    Each field below defaults to the ``NOT_GIVEN`` sentinel (produced through
    a ``default_factory``), and may otherwise hold a value or ``None``.

    Parameters:
        engine: TTS engine to use ('standard', 'neural', etc.).
        pitch: Voice pitch adjustment (for standard engine only).
        rate: Speech rate adjustment.
        volume: Voice volume adjustment.
        lexicon_names: List of pronunciation lexicons to apply.
    """

    engine: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    pitch: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    rate: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    volume: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    lexicon_names: list[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
[docs]
class AWSPollyTTSService(TTSService):
    """AWS Polly text-to-speech service.

    Provides text-to-speech synthesis using Amazon Polly with support for
    multiple languages, voices, SSML features, and voice customization
    options including prosody controls.
    """

    Settings = AWSPollyTTSSettings
    _settings: Settings

    class InputParams(BaseModel):
        """Input parameters for AWS Polly TTS configuration.

        .. deprecated:: 0.0.105
            Use ``settings=AWSPollyTTSService.Settings(...)`` instead.

        Parameters:
            engine: TTS engine to use ('standard', 'neural', etc.).
            language: Language for synthesis. Defaults to English.
            pitch: Voice pitch adjustment (for standard engine only).
            rate: Speech rate adjustment.
            volume: Voice volume adjustment.
            lexicon_names: List of pronunciation lexicons to apply.
        """

        engine: str | None = None
        language: Language | None = Language.EN
        pitch: str | None = None
        rate: str | None = None
        volume: str | None = None
        lexicon_names: list[str] | None = None

    def __init__(
        self,
        *,
        api_key: str | None = None,
        aws_access_key_id: str | None = None,
        aws_session_token: str | None = None,
        region: str | None = None,
        voice_id: str | None = None,
        sample_rate: int | None = None,
        params: InputParams | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initializes the AWS Polly TTS service.

        Args:
            api_key: AWS secret access key. If None, uses AWS_SECRET_ACCESS_KEY environment variable.
            aws_access_key_id: AWS access key ID. If None, uses AWS_ACCESS_KEY_ID environment variable.
            aws_session_token: AWS session token for temporary credentials. If None,
                uses AWS_SESSION_TOKEN environment variable.
            region: AWS region for Polly service. If None, uses AWS_REGION
                environment variable, falling back to 'us-east-1'.
            voice_id: Voice ID to use for synthesis. Defaults to 'Joanna'.

                .. deprecated:: 0.0.105
                    Use ``settings=AWSPollyTTSService.Settings(voice=...)`` instead.

            sample_rate: Audio sample rate. If None, uses service default.
            params: Additional input parameters for voice customization.

                .. deprecated:: 0.0.105
                    Use ``settings=AWSPollyTTSService.Settings(...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService class.
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model=None,
            voice="Joanna",
            language="en-US",
            engine=None,
            pitch=None,
            rate=None,
            volume=None,
            lexicon_names=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if voice_id is not None:
            self._warn_init_param_moved_to_settings("voice_id", "voice")
            default_settings.voice = voice_id

        # 3. Apply params overrides — only if settings not provided
        if params is not None:
            self._warn_init_param_moved_to_settings("params")
            if settings is None:
                default_settings.engine = params.engine
                default_settings.language = params.language if params.language else "en-US"
                default_settings.pitch = params.pitch
                default_settings.rate = params.rate
                default_settings.volume = params.volume
                default_settings.lexicon_names = params.lexicon_names

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        # Explicit arguments win; otherwise fall back to the environment.
        self._aws_params = {
            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
            "region_name": region or os.getenv("AWS_REGION", "us-east-1"),
        }

        self._aws_session = aioboto3.Session()
        self._resampler = create_stream_resampler()

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as AWS Polly service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> str | None:
        """Convert a Language enum to AWS Polly language format.

        Args:
            language: The language to convert.

        Returns:
            The AWS Polly-specific language code, or None if not supported.
        """
        return language_to_aws_language(language)

    def _construct_ssml(self, text: str) -> str:
        """Wrap ``text`` in the SSML document sent to Polly.

        The result is ``<speak><lang ...>`` around the text, with an inner
        ``<prosody>`` element when any of pitch/rate/volume applies.

        Args:
            text: The text to speak. It is inserted verbatim.

        Returns:
            The SSML string.
        """
        prosody_attrs = []

        # Pitch is only emitted for the standard engine; rate and volume are
        # emitted for every engine.
        if self._settings.engine == "standard" and self._settings.pitch:
            prosody_attrs.append(f"pitch='{self._settings.pitch}'")
        if self._settings.rate:
            prosody_attrs.append(f"rate='{self._settings.rate}'")
        if self._settings.volume:
            prosody_attrs.append(f"volume='{self._settings.volume}'")

        # NOTE(review): text is not XML-escaped, so characters such as '&' or
        # '<' reach Polly unchanged. Presumably callers may pass inline SSML;
        # confirm before adding escaping here.
        body = text
        if prosody_attrs:
            body = f"<prosody {' '.join(prosody_attrs)}>{text}</prosody>"

        language = self._settings.language
        ssml = f"<speak><lang xml:lang='{language}'>{body}</lang></speak>"

        logger.trace(f"{self} SSML: {ssml}")

        return ssml

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using AWS Polly.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech, or an
            ``ErrorFrame`` if Polly raises a ``BotoCoreError``/``ClientError``
            or returns a response without an audio stream.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            # Construct the parameters dictionary
            ssml = self._construct_ssml(text)

            params = {
                "Text": ssml,
                "TextType": "ssml",
                "OutputFormat": "pcm",
                "VoiceId": self._settings.voice,
                "Engine": self._settings.engine,
                # AWS only supports 8000 and 16000 for PCM. We select 16000.
                "SampleRate": "16000",
                "LexiconNames": self._settings.lexicon_names,
            }

            # Unset optional settings must be omitted from the request.
            filtered_params = {k: v for k, v in params.items() if v is not None}

            async with self._aws_session.client("polly", **self._aws_params) as polly:
                response = await polly.synthesize_speech(**filtered_params)
                # Read the body while the client is still open.
                stream = response.get("AudioStream")
                audio_data = await stream.read() if stream is not None else None

            if audio_data is None:
                # Without audio there is nothing to resample or chunk; report
                # the failure instead of passing None downstream.
                logger.error(f"{self} No audio stream in response")
                yield ErrorFrame(error="AWS Polly TTS error: no audio stream in response")
                return

            audio_data = await self._resampler.resample(audio_data, 16000, self.sample_rate)

            await self.start_tts_usage_metrics(text)

            chunk_size = self.chunk_size

            for i in range(0, len(audio_data), chunk_size):
                chunk = audio_data[i : i + chunk_size]
                if len(chunk) > 0:
                    await self.stop_ttfb_metrics()
                    frame = TTSAudioRawFrame(chunk, self.sample_rate, 1, context_id=context_id)
                    yield frame

        except (BotoCoreError, ClientError) as error:
            error_message = f"AWS Polly TTS error: {str(error)}"
            yield ErrorFrame(error=error_message)