# Source code for pipecat.transcriptions.language

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Language code enumerations for Pipecat.

This module provides comprehensive language code constants following ISO 639
and BCP 47 standards, supporting both language-only and language-region
combinations for various speech and text processing services.
"""

from enum import StrEnum

from loguru import logger


[docs] class Language(StrEnum): """Language codes for speech and text processing services. Provides comprehensive language code constants following ISO 639 and BCP 47 standards. Includes both language-only codes (e.g., 'en') and language-region combinations (e.g., 'en-US') to support various speech synthesis, recognition, and translation services. """ # Afrikaans AF = "af" AF_ZA = "af-ZA" # Amharic AM = "am" AM_ET = "am-ET" # Arabic AR = "ar" AR_AE = "ar-AE" AR_BH = "ar-BH" AR_DZ = "ar-DZ" AR_EG = "ar-EG" AR_IQ = "ar-IQ" AR_JO = "ar-JO" AR_KW = "ar-KW" AR_LB = "ar-LB" AR_LY = "ar-LY" AR_MA = "ar-MA" AR_OM = "ar-OM" AR_QA = "ar-QA" AR_SA = "ar-SA" AR_SY = "ar-SY" AR_TN = "ar-TN" AR_XA = "ar-XA" AR_YE = "ar-YE" AR_001 = "ar-001" # Assamese AS = "as" AS_IN = "as-IN" # Asturian AST = "ast" # Azerbaijani AZ = "az" AZ_AZ = "az-AZ" # Bashkir BA = "ba" # Belarusian BE = "be" BE_BY = "be-BY" # Bulgarian BG = "bg" BG_BG = "bg-BG" # Bengali BN = "bn" BN_BD = "bn-BD" BN_IN = "bn-IN" # Tibetan BO = "bo" # Breton BR = "br" # Bosnian BS = "bs" BS_BA = "bs-BA" # Catalan CA = "ca" CA_ES = "ca-ES" # Cebuano CEB = "ceb" CEB_PH = "ceb-PH" # Mandarin Chinese CMN = "cmn" CMN_CN = "cmn-CN" # Czech CS = "cs" CS_CZ = "cs-CZ" # Welsh CY = "cy" CY_GB = "cy-GB" # Danish DA = "da" DA_DK = "da-DK" # German DE = "de" DE_AT = "de-AT" DE_CH = "de-CH" DE_DE = "de-DE" # Greek EL = "el" EL_GR = "el-GR" # English EN = "en" EN_AU = "en-AU" EN_CA = "en-CA" EN_GB = "en-GB" EN_GH = "en-GH" EN_HK = "en-HK" EN_IE = "en-IE" EN_IN = "en-IN" EN_KE = "en-KE" EN_NG = "en-NG" EN_NZ = "en-NZ" EN_PH = "en-PH" EN_SG = "en-SG" EN_TZ = "en-TZ" EN_US = "en-US" EN_ZA = "en-ZA" # Esperanto EO = "eo" # Spanish ES = "es" ES_AR = "es-AR" ES_BO = "es-BO" ES_CL = "es-CL" ES_CO = "es-CO" ES_CR = "es-CR" ES_CU = "es-CU" ES_DO = "es-DO" ES_EC = "es-EC" ES_ES = "es-ES" ES_GQ = "es-GQ" ES_GT = "es-GT" ES_HN = "es-HN" ES_MX = "es-MX" ES_NI = "es-NI" ES_PA = "es-PA" ES_PE = "es-PE" ES_PR = "es-PR" ES_PY = "es-PY" ES_SV = "es-SV" 
ES_US = "es-US" ES_UY = "es-UY" ES_VE = "es-VE" ES_419 = "es-419" # Estonian ET = "et" ET_EE = "et-EE" # Basque EU = "eu" EU_ES = "eu-ES" # Persian FA = "fa" FA_IR = "fa-IR" # Fulah FF = "ff" # Finnish FI = "fi" FI_FI = "fi-FI" # Filipino FIL = "fil" FIL_PH = "fil-PH" # Faroese FO = "fo" # French FR = "fr" FR_BE = "fr-BE" FR_CA = "fr-CA" FR_CH = "fr-CH" FR_FR = "fr-FR" # Irish GA = "ga" GA_IE = "ga-IE" # Gaelic GD = "gd" # Galician GL = "gl" GL_ES = "gl-ES" # Gujarati GU = "gu" GU_IN = "gu-IN" # Hausa HA = "ha" # Hawaiian HAW = "haw" # Hebrew HE = "he" HE_IL = "he-IL" # Hindi HI = "hi" HI_IN = "hi-IN" # Croatian HR = "hr" HR_HR = "hr-HR" # Haitian Creole HT = "ht" HT_HT = "ht-HT" # Hungarian HU = "hu" HU_HU = "hu-HU" # Armenian HY = "hy" HY_AM = "hy-AM" # Indonesian ID = "id" ID_ID = "id-ID" # Igbo IG = "ig" # Icelandic IS = "is" IS_IS = "is-IS" # Italian IT = "it" IT_IT = "it-IT" IT_CH = "it-CH" # Inuktitut IU_CANS = "iu-Cans" IU_CANS_CA = "iu-Cans-CA" IU_LATN = "iu-Latn" IU_LATN_CA = "iu-Latn-CA" # Japanese JA = "ja" JA_JP = "ja-JP" # Javanese JV = "jv" JV_ID = "jv-ID" JV_JV = "jv-JV" JW = "jw" # Fal requires for Javanese # Georgian KA = "ka" KA_GE = "ka-GE" # Kabuverdianu KEA = "kea" # Kazakh KK = "kk" KK_KZ = "kk-KZ" # Khmer KM = "km" KM_KH = "km-KH" # Kannada KN = "kn" KN_IN = "kn-IN" # Konkani KOK = "kok" KOK_IN = "kok-IN" # Korean KO = "ko" KO_KR = "ko-KR" # Kurdish KU = "ku" # Kyrgyz KY = "ky" KY_KG = "ky-KG" # Latin LA = "la" LA_VA = "la-VA" # Luxembourgish LB = "lb" LB_LU = "lb-LU" # Lingala LN = "ln" # Lao LO = "lo" LO_LA = "lo-LA" # Lithuanian LT = "lt" LT_LT = "lt-LT" # Ganda LG = "lg" # Luo LUO = "luo" # Latvian LV = "lv" LV_LV = "lv-LV" # Malagasy MG = "mg" MG_MG = "mg-MG" # Maori MI = "mi" # Macedonian MK = "mk" MK_MK = "mk-MK" # Maithili MAI = "mai" MAI_IN = "mai-IN" # Malayalam ML = "ml" ML_IN = "ml-IN" # Mongolian MN = "mn" MN_MN = "mn-MN" # Marathi MR = "mr" MR_IN = "mr-IN" # Malay MS = "ms" MS_MY = "ms-MY" # Maltese MT = "mt" MT_MT = "mt-MT" # 
Burmese MY = "my" MY_MM = "my-MM" MY_MR = "mymr" # Norwegian NB = "nb" # Norwegian Bokmål NB_NO = "nb-NO" NO = "no" NN = "nn" # Norwegian Nynorsk NN_NO = "nn-NO" # Nepali NE = "ne" NE_NP = "ne-NP" # Dutch NL = "nl" NL_BE = "nl-BE" NL_NL = "nl-NL" # Northern Sotho NSO = "nso" # Chichewa NY = "ny" # Occitan OC = "oc" # Odia OR = "or" OR_IN = "or-IN" # Punjabi PA = "pa" PA_IN = "pa-IN" # Polish PL = "pl" PL_PL = "pl-PL" # Pashto PS = "ps" PS_AF = "ps-AF" # Portuguese PT = "pt" PT_BR = "pt-BR" PT_PT = "pt-PT" # Romanian RO = "ro" RO_RO = "ro-RO" # Russian RU = "ru" RU_RU = "ru-RU" # Sanskrit SA = "sa" # Sindhi SD = "sd" SD_IN = "sd-IN" # Sinhala SI = "si" SI_LK = "si-LK" # Slovak SK = "sk" SK_SK = "sk-SK" # Slovenian SL = "sl" SL_SI = "sl-SI" # Shona SN = "sn" # Somali SO = "so" SO_SO = "so-SO" # Albanian SQ = "sq" SQ_AL = "sq-AL" # Serbian SR = "sr" SR_RS = "sr-RS" SR_LATN = "sr-Latn" SR_LATN_RS = "sr-Latn-RS" # Sundanese SU = "su" SU_ID = "su-ID" # Swedish SV = "sv" SV_SE = "sv-SE" # Swahili SW = "sw" SW_KE = "sw-KE" SW_TZ = "sw-TZ" # Tamil TA = "ta" TA_IN = "ta-IN" TA_LK = "ta-LK" TA_MY = "ta-MY" TA_SG = "ta-SG" # Telugu TE = "te" TE_IN = "te-IN" # Tajik TG = "tg" # Thai TH = "th" TH_TH = "th-TH" # Turkmen TK = "tk" # Tagalog TL = "tl" # Turkish TR = "tr" TR_TR = "tr-TR" # Tatar TT = "tt" # Uyghur UG = "ug" # Ukrainian UK = "uk" UK_UA = "uk-UA" # Umbundu UMB = "umb" # Urdu UR = "ur" UR_IN = "ur-IN" UR_PK = "ur-PK" # Uzbek UZ = "uz" UZ_UZ = "uz-UZ" # Vietnamese VI = "vi" VI_VN = "vi-VN" # Wolof WO = "wo" # Wu Chinese WUU = "wuu" WUU_CN = "wuu-CN" # Yiddish YI = "yi" # Yoruba YO = "yo" # Yue Chinese (Cantonese) YUE = "yue" YUE_CN = "yue-CN" # Chinese ZH = "zh" ZH_CN = "zh-CN" ZH_CN_GUANGXI = "zh-CN-guangxi" ZH_CN_HENAN = "zh-CN-henan" ZH_CN_LIAONING = "zh-CN-liaoning" ZH_CN_SHAANXI = "zh-CN-shaanxi" ZH_CN_SHANDONG = "zh-CN-shandong" ZH_CN_SICHUAN = "zh-CN-sichuan" ZH_HK = "zh-HK" ZH_TW = "zh-TW" # Xhosa XH = "xh-ZA" # Zulu ZU = "zu" ZU_ZA = "zu-ZA"
def resolve_language(
    language: Language, language_map: dict[Language, str], use_base_code: bool = True
) -> str:
    """Translate a ``Language`` member into the code a specific service expects.

    The service's verified ``language_map`` is consulted first. When the
    language is missing from it (or mapped to ``None``), a warning is logged
    and a fallback code is derived from the enum value itself.

    Args:
        language: The Language enum value to convert.
        language_map: Dictionary mapping Language enums to the service's own
            language codes.
        use_base_code: Selects the fallback format. If True, only the part
            before the first ``-`` is used, lower-cased (e.g., 'en' from
            'en-US'). If False, the full enum value is returned unchanged.

    Returns:
        The resolved language code for the service.

    Examples::

        # Service expecting base codes (e.g., Cartesia)
        >>> LANGUAGE_MAP = {Language.EN: "en", Language.ES: "es"}
        >>> resolve_language(Language.EN_US, LANGUAGE_MAP, use_base_code=True)
        # Logs: "Language en-US not verified. Using base code 'en'."
        "en"

        # Service expecting full codes (e.g., AWS)
        >>> LANGUAGE_MAP = {Language.EN_US: "en-US", Language.ES_ES: "es-ES"}
        >>> resolve_language(Language.EN_GB, LANGUAGE_MAP, use_base_code=False)
        # Logs: "Language en-GB not verified. Using 'en-GB'."
        "en-GB"
    """
    # Verified mapping wins whenever it yields a value.
    verified_code = language_map.get(language)
    if verified_code is not None:
        return verified_code

    # Unverified language: derive a best-effort code and tell the operator.
    full_code = str(language)

    if not use_base_code:
        logger.warning(f"Language {language} not verified. Using '{full_code}'.")
        return full_code

    # Everything before the first "-" is the primary subtag ("en" in "en-US").
    primary_subtag, _, _ = full_code.partition("-")
    base_code = primary_subtag.lower()
    logger.warning(f"Language {language} not verified. Using base code '{base_code}'.")
    return base_code