Source code for pipecat.services.whisper.utils

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Utility functions for extracting probability metrics from STT services."""

import math

from pipecat.frames.frames import TranscriptionFrame



[docs]
def extract_whisper_probability(frame: TranscriptionFrame) -> float | None:
    """Extract probability from Whisper-based TranscriptionFrame result.

    Works with Groq, OpenAI Whisper, or other Whisper-based services that use
    verbose_json format with segments containing avg_logprob.

    Converts the first segment's avg_logprob to a probability with
    ``math.exp``. The result and its segments may be attribute-style objects
    or plain dicts; both shapes are supported.

    Args:
        frame: TranscriptionFrame with result from GroqSTTService or OpenAISTTService
            (when include_prob_metrics=True and using Whisper models).

    Returns:
        Probability (0-1) if available, None otherwise.

    Example::

        from pipecat.services.groq.stt import GroqSTTService
        from pipecat.services.whisper.utils import extract_whisper_probability

        stt = GroqSTTService(include_prob_metrics=True)
        # ... use stt in pipeline ...
        # In your frame processor:
        if isinstance(frame, TranscriptionFrame):
            prob = extract_whisper_probability(frame)
            if prob is not None:
                print(f"Transcription confidence: {prob:.2%}")
    """

    def _field(container, name):
        # A dict keeps its fields under keys, so getattr() would never find
        # them; everything else is read as an attribute.
        if isinstance(container, dict):
            return container.get(name)
        return getattr(container, name, None)

    result = frame.result
    if not result:
        return None

    # Whisper verbose_json format: response.segments[0].avg_logprob
    segments = _field(result, "segments")
    if not segments:
        return None

    avg_logprob = _field(segments[0], "avg_logprob")
    if avg_logprob is None:
        return None

    return math.exp(avg_logprob)




[docs]
def extract_openai_gpt4o_probability(frame: TranscriptionFrame) -> float | None:
    """Extract probability from OpenAI GPT-4o-transcribe TranscriptionFrame result.

    Averages the per-token log probabilities found in ``result.logprobs`` and
    converts the mean to a probability with ``math.exp``. Each entry may be a
    bare number, an object exposing a ``logprob`` attribute, or a dict with a
    ``"logprob"`` key. Entries without a value are ignored.

    Args:
        frame: TranscriptionFrame with result from OpenAISTTService
            using GPT-4o-transcribe model (when include_prob_metrics=True).

    Returns:
        Probability (0-1) if available, None otherwise.

    Example::

        from pipecat.services.openai.stt import OpenAISTTService
        from pipecat.services.whisper.utils import extract_openai_gpt4o_probability

        stt = OpenAISTTService(model="gpt-4o-transcribe", include_prob_metrics=True)
        # ... use stt in pipeline ...
        # In your frame processor:
        if isinstance(frame, TranscriptionFrame):
            prob = extract_openai_gpt4o_probability(frame)
            if prob is not None:
                print(f"Transcription confidence: {prob:.2%}")
    """
    result = frame.result
    if not result:
        return None

    # OpenAI GPT-4o-transcribe format: response.logprobs
    if isinstance(result, dict):
        entries = result.get("logprobs")
    else:
        entries = getattr(result, "logprobs", None)
    if not entries:
        return None

    values = []
    for entry in entries:
        if isinstance(entry, dict):
            value = entry.get("logprob")
        else:
            # Per-token objects carry the number in `.logprob`; anything
            # without that attribute is taken to be the number itself.
            value = getattr(entry, "logprob", entry)
        if value is not None:
            values.append(value)

    if not values:
        return None

    # Calculate average logprob and convert to probability
    return math.exp(sum(values) / len(values))




[docs]
def extract_deepgram_probability(frame: TranscriptionFrame) -> float | None:
    """Extract probability from Deepgram TranscriptionFrame result.

    Looks at the first alternative of ``result.channel``. Its own
    ``confidence`` is preferred; when that is missing, the mean of the
    available word-level ``confidence`` values is used instead.

    Args:
        frame: TranscriptionFrame with result from DeepgramSTTService.

    Returns:
        Probability (0-1) if available, None otherwise.

    Example::

        from pipecat.services.deepgram.stt import DeepgramSTTService
        from pipecat.services.whisper.utils import extract_deepgram_probability

        stt = DeepgramSTTService()
        # ... use stt in pipeline ...
        # In your frame processor:
        if isinstance(frame, TranscriptionFrame):
            prob = extract_deepgram_probability(frame)
            if prob is not None:
                print(f"Transcription confidence: {prob:.2%}")
    """
    payload = frame.result
    if not payload:
        return None

    channel = getattr(payload, "channel", None)
    if not channel:
        return None

    alternatives = getattr(channel, "alternatives", None)
    if not alternatives:
        return None

    best = alternatives[0]

    # Alternative-level confidence wins whenever it is present (0 included).
    overall = getattr(best, "confidence", None)
    if overall is not None:
        return float(overall)

    # Fall back to averaging the words that actually report a confidence.
    words = getattr(best, "words", None) or ()
    scores = [
        score
        for score in (getattr(word, "confidence", None) for word in words)
        if score is not None
    ]
    if not scores:
        return None

    return float(sum(scores) / len(scores))