## **Installing Dependencies**

<div style="text-align: center;">
    <img src="https://learnopencv.com/wp-content/uploads/2024/05/ASR-Feature-Automatic-Speech-Recognition.gif" alt="ASR Diarization">
</div>


**Change runtime type to `T4 GPU` in Colab**

**OpenAI's Whisper** does not natively support batching, and its timestamps can be inaccurate by several seconds. So we will leverage [**whisperX**](https://github.com/m-bain/whisperX), which supports batched inference, offers faster-than-realtime transcription, and refines the timestamps with forced alignment.


In [1]:
# Install whisperX straight from GitHub, pinned to a specific commit hash so that
# the installed revision does not change between runs.
!pip install -qq git+https://github.com/m-bain/whisperX.git@78dcfaab51005aa703ee21375f81ed31bc248560

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m53.4 MB/s[0m eta [3

**Restart Session (recommended)**

Next let’s install the Nvidia [Nemo Toolkit for ASR](https://github.com/NVIDIA/NeMo/tree/main/tutorials/speaker_tasks)

In [2]:
# Install NeMo 1.22.0 with its `asr` extras. --no-build-isolation makes pip build
# source packages against the packages already installed in this environment
# instead of a fresh isolated build environment.
!pip install -qq --no-build-isolation nemo_toolkit[asr]==1.22.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ...

**Restart Session**

And other dependencies like
* `Demucs` to separate music and vocals from audio source.

* `Dora search` for grid search and optimization.
* `Deepmultilingualpunctuation` for cleaning and structuring transcription.
* `Pydub` for manipulating audio.


In [3]:
# Demucs is installed with --no-deps, so pip does not install its declared
# dependencies; the second line installs a hand-picked set explicitly
# (presumably the ones Demucs needs at runtime — TODO confirm).
# NOTE(review): the Demucs install is not pinned to a commit or tag, and the
# trailing `#egg=demucs` is separated by a space, so the shell presumably treats
# it as a comment rather than as part of the URL.
!pip install -q --no-deps git+https://github.com/facebookresearch/demucs #egg=demucs
!pip install -q dora-search "lameenc>=1.2" openunmix
!pip install -q deepmultilingualpunctuation
!pip install -q wget pydub

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for demucs (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.1/87.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.8/239.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.7/74.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for dora-search (pyproject.toml) ... [?25l[?25hdone
  Building wheel for treetable (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resol

## **Import Dependencies**

In [4]:
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import zipfile

import wget
import requests
from omegaconf import OmegaConf
from faster_whisper import WhisperModel
import whisperx
import torch
from pydub import AudioSegment
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from deepmultilingualpunctuation import PunctuationModel
import nltk
from whisperx.alignment import DEFAULT_ALIGN_MODELS_HF, DEFAULT_ALIGN_MODELS_TORCH
from whisperx.utils import LANGUAGES, TO_LANGUAGE_CODE

  torchaudio.set_audio_backend("soundfile")
[NeMo W 2024-05-14 06:13:47 transformer_bpe_models:59] Could not import NeMo NLP collection which is required for speech translation model.


### **Download Files**

In [9]:
# Ensure the examples directory exists. exist_ok=True makes the call a no-op when
# the directory is already there, so no separate existence check is needed.
os.makedirs('whisper_examples', exist_ok=True)

def download_file(url, save_name, timeout=60):
    """Download ``url`` to ``save_name`` unless that file already exists.

    Args:
        url: Address to fetch; HTTP redirects are followed.
        save_name: Destination path. Nothing is downloaded when a file is
            already present at this path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive host.

    Returns:
        None. A response with a status code other than 200 is reported with a
        printed message and ``save_name`` is not written.
    """
    if os.path.exists(save_name):
        return

    # Handling potential redirection in requests
    with requests.get(url, allow_redirects=True, timeout=timeout) as response:
        if response.status_code != 200:
            print("Failed to download the file, status code:", response.status_code)
            return
        with open(save_name, 'wb') as f:
            f.write(response.content)

def unzip(zip_file=None, target_dir='./whisper_examples'):
    """Extract every member of ``zip_file`` into ``target_dir``.

    Errors are not raised to the caller: a corrupt archive and any other
    failure are each reported with a printed message.
    """
    try:
        with zipfile.ZipFile(zip_file, mode='r') as archive:
            archive.extractall(path=target_dir)
            print("Extracted all to:", target_dir)
    except Exception as err:
        # A malformed archive gets its own fixed message; everything else
        # is reported together with the exception text.
        if isinstance(err, zipfile.BadZipFile):
            print("Invalid file or error during extraction: Bad Zip File")
        else:
            print("An error occurred:", err)

# Dropbox share link for the example archive.
# NOTE(review): confirm this is still a direct-download link (or redirects to one);
# download_file only saves the body of a 200 response.
download_url = 'https://www.dropbox.com/scl/fi/gaxpaq6d8aqnbz9mpzlr6/whisper_examples.zip?rlkey=x69vv03tu657bbxbmbe7z322m&st=iabgc5et&dl=1'
save_path = 'whisper_examples/whisper_examples.zip'

# Fetch the archive (skipped when save_path already exists), then extract it
# into unzip()'s default target directory, ./whisper_examples.
download_file(download_url, save_path)
unzip(zip_file=save_path)


Extracted all to: ./whisper_examples




As our goal is to get a highly accurate and reliable transcription, we will be using the Whisper `large-v3` multilingual model, which has 1550M parameters and a WER of 4.1 on the Google FLEURS dataset.
The `enable_stemming` flag determines whether the audio is preprocessed with Demucs to remove music. Setting `batch_size=8` means that 8 audio chunks are transcribed at a time; the batch size affects speed and GPU memory usage rather than transcription quality, and `batch_size=0` switches to non-batched Faster Whisper inference.
Additionally, the `suppress_numerals=True` flag makes Whisper write numbers and symbols out as words, which helps the later alignment and diarization steps. In this ASR pipeline, Whisper automatically identifies the language from the first 30 seconds of the input audio, because `language=None` is set.


In [None]:
# Path of the audio file to transcribe and diarize
audio_path = "whisper_examples/Old_Farmer.mp3"

# Whisper checkpoint to load
# (choose from 'tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large')
whisper_model_name = "large-v3"

# Language of the audio; None lets Whisper auto-detect it
language = None

# Number of audio chunks per batch; 0 selects the non-batched transcription path
batch_size = 8

# Remove music from the speech before transcription; helps diarization quality
# but uses a lot of RAM
enable_stemming = True

# Replace numerical digits with their pronunciation; increases diarization accuracy
suppress_numerals = True

# Prefer the GPU whenever torch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## **Processing Audio with Demucs**


This preprocessing step conditionally isolates the vocals from the audio file using Meta's Demucs model, which increases the diarization quality. If stemming is disabled or the separation fails, the original audio file is used instead.

In [None]:
if enable_stemming:
    # Isolate vocals from the rest of the audio.
    # The command is passed as an argument list (no shell involved), so an
    # audio_path containing spaces, quotes or other shell metacharacters is
    # handed to Demucs verbatim. sys.executable runs Demucs with the same
    # interpreter as this kernel.
    return_code = subprocess.run(
        [
            sys.executable,
            "-m",
            "demucs.separate",
            "-n",
            "htdemucs",
            "--two-stems=vocals",
            audio_path,
            "-o",
            "temp_outputs",
        ],
        check=False,  # a failure is handled below by falling back to the original audio
    ).returncode

    if return_code != 0:
        logging.warning("Source splitting failed, using original audio file.")
        vocal_target = audio_path
    else:
        # Path of the vocals stem:
        # temp_outputs/htdemucs/<audio file name without extension>/vocals.wav
        vocal_target = os.path.join(
            "temp_outputs",
            "htdemucs",
            os.path.splitext(os.path.basename(audio_path))[0],
            "vocals.wav",
        )
else:
    vocal_target = audio_path

## **WHISPER: ASR PIPELINE**

Next, the `find_numeral_symbol_tokens` function identifies and returns a list of token IDs, taken from the vocabulary of Whisper’s pretrained tokenizer, whose tokens contain numeral symbols or characters such as digits and currency symbols. The vocab size of whisper is 51865.
E.g. for `$100`, suppressing numeral tokens will result in `one hundred dollars`.


In [None]:
def find_numeral_symbol_tokens(tokenizer):
    """Collect the ids of vocabulary tokens containing a digit or one of ``%$£``.

    Args:
        tokenizer: Object whose ``get_vocab()`` returns a mapping of token
            string to token id.

    Returns:
        list: ``-1`` followed by the matching token ids, in the iteration
        order of the vocabulary mapping.
    """
    numeral_symbols = "0123456789%$£"
    matching_ids = [
        token_id
        for token, token_id in tokenizer.get_vocab().items()
        if any(char in numeral_symbols for char in token)
    ]
    # The list always starts with -1.
    # NOTE(review): presumably the sentinel Faster Whisper expands to its
    # default suppressed-token set — confirm against the library.
    return [-1, *matching_ids]

**Transcribe with Faster Whisper**

In [None]:
def transcribe(
    audio_file: str,
    language: str,
    model_name: str,
    compute_dtype: str,
    suppress_numerals: bool,
    device: str,
):
    """Transcribe ``audio_file`` with non-batched Faster Whisper.

    Args:
        audio_file: Path of the audio to transcribe.
        language: Language code to force, or ``None`` to let Whisper detect it.
        model_name: Whisper checkpoint name handed to ``WhisperModel``.
        compute_dtype: ``compute_type`` handed to ``WhisperModel``, e.g.
            ``"float16"`` or ``"int8_float16"`` on GPU, ``"int8"`` on CPU.
        suppress_numerals: When true, the tokens returned by
            ``find_numeral_symbol_tokens`` are suppressed during decoding.
        device: Device string handed to ``WhisperModel``.

    Returns:
        tuple: ``(segments, language)``. ``segments`` holds one dict per
        transcribed segment; ``language`` is the language reported by the
        model, i.e. the detected one when ``language`` was ``None``.
    """
    # Faster Whisper non-batched
    whisper_model = WhisperModel(model_name, device=device, compute_type=compute_dtype)

    if suppress_numerals:
        numeral_symbol_tokens = find_numeral_symbol_tokens(whisper_model.hf_tokenizer)
    else:
        numeral_symbol_tokens = None

    # Languages with a default Wav2Vec2 alignment model. Built here from the
    # imported tables so the function does not depend on a notebook global
    # that is only defined in a later cell.
    alignable_langs = set(DEFAULT_ALIGN_MODELS_TORCH) | set(DEFAULT_ALIGN_MODELS_HF)

    # Whisper's own word timestamps are only requested when the language is
    # unknown up front or has no alignment model.
    word_timestamps = not (language is not None and language in alignable_langs)

    segments, info = whisper_model.transcribe(
        audio_file,
        language=language,
        beam_size=5,
        word_timestamps=word_timestamps,
        suppress_tokens=numeral_symbol_tokens,
        vad_filter=True,
    )
    whisper_results = [segment._asdict() for segment in segments]

    # clear gpu vram
    del whisper_model
    torch.cuda.empty_cache()

    # Return the language reported by the model, not the input argument, so an
    # auto-detected language reaches the caller instead of None.
    return whisper_results, info.language

**WhisperX**

This section is responsible for performing ASR on our input audio file. The `transcribe_batched` function handles batched audio transcription using the Whisper model, configured for a  specific language, numeral and symbol suppression, and computation settings. This streamlined approach optimizes resource utilization during batch processing.


* In a typical **whisperX pipeline**, input audio gets chopped , only where a sound activity is detected with a **VAD** model, into 30-second chunks and sent on a two-track adventure. Track one is the OpenAI Whisper model, a transcription maestro adept at capturing spoken words but occasionally stumbling over precise timings. It's like a talented lyricist who can't quite keep up with the rhythm.

* Running parallel is track two, the timestamp tsar **Wav2Vec2.0**, which may not have Whisper's lyrical prowess but boasts an uncanny ability to pinpoint each word's timing in the audio. So, why not combine the strengths of these two models and get the best of both worlds? It's like having a dynamic duo – one handles the lyrics, and the other keeps the beat. Together, they can create a harmonious symphony of transcriptions with accurate timestamps.


<img src="https://learnopencv.com/wp-content/uploads/2024/05/whisperx-Automatic-Speech-Recognition.png">

In [None]:
def transcribe_batched(
    audio_file: str,
    language: str,
    batch_size: int,
    model_name: str,
    compute_dtype: str,
    suppress_numerals: bool,
    device: str,
):
    """Transcribe ``audio_file`` with the batched whisperX pipeline.

    Args:
        audio_file: Path of the audio to transcribe.
        language: Language code to force, or ``None``.
        batch_size: Batch size handed to the whisperX ``transcribe`` call.
        model_name: Whisper checkpoint name handed to ``whisperx.load_model``.
        compute_dtype: ``compute_type`` handed to ``whisperx.load_model``.
        suppress_numerals: Forwarded to whisperX through ``asr_options``.
        device: Device string handed to ``whisperx.load_model``.

    Returns:
        tuple: the ``"segments"`` and ``"language"`` entries of the whisperX
        transcription result.
    """
    # Faster Whisper batched
    asr_options = {"suppress_numerals": suppress_numerals}
    batched_model = whisperx.load_model(
        model_name,
        device,
        compute_type=compute_dtype,
        asr_options=asr_options,
    )
    waveform = whisperx.load_audio(audio_file)
    transcription = batched_model.transcribe(
        waveform, language=language, batch_size=batch_size
    )

    # Drop the model and release cached GPU memory before returning
    del batched_model
    torch.cuda.empty_cache()

    segments = transcription["segments"]
    detected_language = transcription["language"]
    return segments, detected_language

### **Transcribing audio using WhisperX**
---

This snippet results in the transcription using batch processing on input audio with the WhisperX inference pipeline based on the specified `batch_size`, applying settings for computation type, numeral suppression, and device with a fp16 compute precision.



In [None]:
# float16 needs a GPU; when the configuration cell fell back to the CPU,
# use int8 so the model can still be loaded.
compute_type = "float16" if device == "cuda" else "int8"
# or run on GPU with INT8
# compute_type = "int8_float16"

# batch_size == 0 selects the non-batched Faster Whisper path; any other value
# uses the batched whisperX pipeline.
if batch_size != 0:
    whisper_results, language = transcribe_batched(
        vocal_target,
        language,
        batch_size,
        whisper_model_name,
        compute_type,
        suppress_numerals,
        device,
    )
else:
    whisper_results, language = transcribe(
        vocal_target,
        language,
        whisper_model_name,
        compute_type,
        suppress_numerals,
        device,
    )

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 11.6MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.7. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.66) in first 30s of audio...
Suppressing numeral and symbol tokens


### Forced Alignment with Wav2Vec2.0: WhisperX
---
Forced alignment refers to the process by which orthographic transcriptions are aligned to audio recordings to automatically generate phone level segmentation.

From default alignment models, `wav2vec2_langs` list combines all languages, offered by [PyTorch](https://pytorch.org/audio/stable/tutorials/forced_alignment_tutorial.html) and HuggingFace. Meanwhile, `whisper_langs` brings together languages supported by the Whisper model, including a wide array of global languages and additional language codes, ensuring extensive multilingual capabilities for Automatic Speech Recognition (ASR).


<img src="https://learnopencv.com/wp-content/uploads/2024/05/forced-alignment-wav2vec-Automatic-Speech-Recognition.png">

In [None]:
# Language codes that have a default alignment model: the torchaudio table
# first, followed by the Hugging Face table.
wav2vec2_langs = [*DEFAULT_ALIGN_MODELS_TORCH, *DEFAULT_ALIGN_MODELS_HF]

# Keys of LANGUAGES followed by the title-cased keys of TO_LANGUAGE_CODE;
# each group is sorted on its own.
whisper_langs = sorted(LANGUAGES) + sorted(
    name.title() for name in TO_LANGUAGE_CODE
)

Here, the `_get_next_start_timestamp` function is responsible for figuring out when the next word in a list of word timestamps should start. If we're looking at the last word in the list, it simply returns the start time of that word. However, if the next word doesn't have a timestamp defined, things get a bit trickier. In that case, the function merges the current word with the word lacking a timestamp, essentially extending the current word's duration until it either encounters a word with a defined start time or reaches the end of the list. If it's the latter scenario, the function returns a predefined final timestamp value.

In [None]:
def _get_next_start_timestamp(word_timestamps, current_word_index, final_timestamp):
    # if current word is the last word
    if current_word_index == len(word_timestamps) - 1:
        return word_timestamps[current_word_index]["start"]

    next_word_index = current_word_index + 1
    while current_word_index < len(word_timestamps) - 1:
        if word_timestamps[next_word_index].get("start") is None:
            # if next word doesn't have a start timestamp
            # merge it with the current word and delete it
            word_timestamps[current_word_index]["word"] += (
                " " + word_timestamps[next_word_index]["word"]
            )

            word_timestamps[next_word_index]["word"] = None
            next_word_index += 1
            if next_word_index == len(word_timestamps):
                return final_timestamp

        else:
            return word_timestamps[next_word_index]["start"]


This `filter_missing_timestamps` utility processes a list of word timestamps, ensuring each word has a start and end time by filling missing values based on adjacent timestamps or specified default boundaries, and compiles the cleaned list into `result`.


In [None]:
def filter_missing_timestamps(
    word_timestamps, initial_timestamp=0, final_timestamp=None
):
    """Fill in missing word timestamps and drop words that were merged away.

    Args:
        word_timestamps: List of word dicts with a ``"word"`` key and,
            when known, ``"start"`` and ``"end"``. The dicts are modified
            in place.
        initial_timestamp: Start time given to the first word when it has
            none; ``None`` is treated as 0.
        final_timestamp: Forwarded to ``_get_next_start_timestamp`` as the
            end time to use when no later word has a start time.

    Returns:
        list: The words whose ``"word"`` is not ``None``, in their original
        order. An empty input yields an empty list.
    """
    # Nothing to repair; also avoids indexing into an empty list below.
    if not word_timestamps:
        return []

    # handle the first word
    first_word = word_timestamps[0]
    if first_word.get("start") is None:
        first_word["start"] = initial_timestamp if initial_timestamp is not None else 0
        first_word["end"] = _get_next_start_timestamp(
            word_timestamps, 0, final_timestamp
        )

    result = [first_word]

    for i, ws in enumerate(word_timestamps[1:], start=1):
        # if ws doesn't have a start and end
        # use the previous end as start and next start as end
        if ws.get("start") is None and ws.get("word") is not None:
            ws["start"] = word_timestamps[i - 1]["end"]
            ws["end"] = _get_next_start_timestamp(word_timestamps, i, final_timestamp)

        # Words merged into a predecessor have their text set to None; skip them.
        if ws["word"] is not None:
            result.append(ws)
    return result

After Whisper generates the transcription, the next step in the WhisperX pipeline utilizes Wav2Vec 2.0 for forced alignment if the language is supported. If the language is unsupported and batch processing is not being used, WhisperX extracts the timestamps directly from Whisper's output instead. As we know  it is not as accurate as Wav2Vec's forced alignment, but this method ensures that each word in the transcription has an associated start and end time. Once the timestamping process is complete, the GPU memory is freed up to save resources.



In [None]:
if language in wav2vec2_langs:
    # Forced alignment runs on the device selected in the configuration cell;
    # it is no longer overridden with a hard-coded "cuda".
    alignment_model, metadata = whisperx.load_align_model(
        language_code=language, device=device
    )
    result_aligned = whisperx.align(
        whisper_results, alignment_model, metadata, vocal_target, device
    )
    # Repair words the aligner left without timestamps, bounded by the start of
    # the first segment and the end of the last one.
    word_timestamps = filter_missing_timestamps(
        result_aligned["word_segments"],
        initial_timestamp=whisper_results[0].get("start"),
        final_timestamp=whisper_results[-1].get("end"),
    )

    # clear gpu vram
    del alignment_model
    torch.cuda.empty_cache()
else:
    # TODO: add a better check for word timestamps existence
    # Raised explicitly instead of asserted so the check cannot be stripped.
    if batch_size != 0:
        raise ValueError(
            f"Unsupported language: {language}, set batch_size to 0"
            " to generate word timestamps using whisper directly and fix this error."
        )
    word_timestamps = []
    for segment in whisper_results:
        # Each word is indexed positionally: 0 = start, 1 = end, 2 = text.
        for word in segment["words"]:
            word_timestamps.append({"word": word[2], "start": word[0], "end": word[1]})



Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:01<00:00, 199MB/s]


In [None]:
# Show the word-level timestamps built in the previous cell.
# NOTE(review): the heading is printed even when the previous cell took the
# branch that uses Whisper's own word timestamps instead of forced alignment.
print("Transcription after Forced Alignment with Wav2Vec2.0:")
print(word_timestamps)

Transcription after Forced Alignment with Wav2Vec2.0:
[{'word': "I'm", 'start': 1.155, 'end': 1.295, 'score': 0.46}, {'word': 'going', 'start': 1.315, 'end': 1.475, 'score': 0.89}, {'word': 'to', 'start': 1.515, 'end': 1.575, 'score': 0.764}, {'word': 'introduce', 'start': 1.635, 'end': 1.995, 'score': 0.881}, {'word': 'you', 'start': 2.036, 'end': 2.136, 'score': 0.858}, {'word': 'to', 'start': 2.176, 'end': 2.316, 'score': 0.97}, {'word': 'a', 'start': 2.336, 'end': 2.356, 'score': 0.0}, {'word': 'rather', 'start': 2.656, 'end': 2.896, 'score': 0.864}, {'word': 'remarkable', 'start': 2.956, 'end': 3.496, 'score': 0.94}, {'word': 'man.', 'start': 3.556, 'end': 3.776, 'score': 0.844}, {'word': "He's", 'start': 4.577, 'end': 4.717, 'score': 0.719}, {'word': 'Mr.', 'start': 4.757, 'end': 4.997, 'score': 0.64}, {'word': 'Michael', 'start': 5.037, 'end': 5.277, 'score': 0.463}, {'word': 'Fitzpatrick', 'start': 5.297, 'end': 5.898, 'score': 0.853}, {'word': 'from', 'start': 5.958, 'end': 6.

<img src="https://learnopencv.com/wp-content/uploads/2024/05/ASRDiarization-Automatic-Speech-Recognition.jpg">

## **NEMO: DIARIZATION PIPELINE**

Now let’s understand the NeMo inference pipeline and configuration. The input audio is passed to a MarbleNet VAD model, which finds the timestamps where speech occurs. It is then passed to a TitaNet model, which extracts speaker embeddings. Finally, an MSDD model performs the speaker diarization with millisecond-precision timestamps. Now let’s describe these one by one in the code implementation.


**Nemo Models Configuration**

Ok, it's time to define configuration for Nemo, so the create_config  utility function is used to set up the environment for speaker diarization. As we are processing a support call in our experiment , this fetches a YAML configuration file and it downloads a pre-configured model from NVIDIA's NeMo project that is optimized for telephonic or phone call audio. Finally, it generates a JSON manifest file. This manifest file contains metadata about the audio file that needs to be processed, like the file path and name.

If you want to try other audio samples, like online meetings or general conversations, feel free to change the domain type accordingly in the configuration.


#### **VAD Configuration**


Voice Activity Detection (VAD) is the detection of the presence or absence of human speech for a particular timestamp which is helpful in diarization.
As discussed initially, we will use a lightweight vad_multilingual_marblenet having trained on Google Speech Command v2 dataset offering robust and real time VAD.

For our tasks which require speaker verification and to capture the essence of the speaker's voice, TitaNet-Large model is used. It uses 1D depth-wise separable convolutions enhanced with Squeeze-and-Excitation (SE) layers and a channel attention-based statistics pooling layer. This architecture efficiently converts variable-length speech utterances into fixed-length speaker embeddings.


Additionally, we will configure our system not to assume a fixed number of speakers (`config.diarizer.clustering.parameters.oracle_num_speakers = False`), allowing it to dynamically adapt to the actual number of speakers in each audio session.

Then we will specify the `config.diarizer.vad.model_path`, which is a pretrained model that optimizes voice activity detection with onset sensitivity set at `0.8` and offset at `0.6`. These settings enhance the VAD’s responsiveness, while a `pad_offset` of  `-0.05` fine-tunes segment endpoints for cleaner and more precise speech boundaries.



In [None]:
def create_config(output_dir, domain_type="telephonic"):
    """Build the NeMo diarization inference config and write the input manifest.

    The inference YAML for ``domain_type`` is downloaded into ``output_dir``
    unless it is already there, a one-line JSON manifest pointing at
    ``<output_dir>/mono_file.wav`` is written to
    ``<output_dir>/data/input_manifest.json``, and the loaded config is
    adjusted for this pipeline.

    Args:
        output_dir: Directory for the downloaded config, the manifest and the
            diarizer's outputs. It is created when missing.
        domain_type: Which inference config to fetch: ``"meeting"``,
            ``"telephonic"`` or ``"general"``, based on the domain of the
            audio file.

    Returns:
        The OmegaConf config object, ready to be passed to ``NeuralDiarizer``.

    Raises:
        ValueError: If ``domain_type`` is not one of the supported values.
    """
    supported_domains = ("meeting", "telephonic", "general")
    if domain_type not in supported_domains:
        raise ValueError(
            f"Unknown domain_type {domain_type!r}; expected one of {supported_domains}"
        )

    # Create output_dir and its data/ sub-directory up front so the config
    # download below has an existing directory to write into.
    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # NOTE(review): the config is fetched from NeMo's `main` branch while the
    # notebook installs a pinned NeMo release — consider pinning this URL too.
    config_file_name = f"diar_infer_{domain_type}.yaml"
    config_url = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{config_file_name}"
    model_config = os.path.join(output_dir, config_file_name)
    if not os.path.exists(model_config):
        model_config = wget.download(config_url, output_dir)

    config = OmegaConf.load(model_config)

    # Manifest with a single entry describing the audio file to diarize.
    manifest_path = os.path.join(data_dir, "input_manifest.json")
    meta = {
        "audio_filepath": os.path.join(output_dir, "mono_file.wav"),
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }
    with open(manifest_path, "w") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    pretrained_vad = "vad_multilingual_marblenet"
    pretrained_speaker_model = "titanet_large"

    config.num_workers = 0  # Workaround for multiprocessing hanging with ipython issue
    config.diarizer.manifest_filepath = manifest_path
    # Directory to store intermediate files and prediction outputs
    config.diarizer.out_dir = output_dir

    config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
    # compute VAD provided with model_path to vad config
    config.diarizer.oracle_vad = False
    config.diarizer.clustering.parameters.oracle_num_speakers = False

    # Use the pretrained NeMo VAD model with tuned onset/offset thresholds
    config.diarizer.vad.model_path = pretrained_vad
    config.diarizer.vad.parameters.onset = 0.8
    config.diarizer.vad.parameters.offset = 0.6
    config.diarizer.vad.parameters.pad_offset = -0.05

    # The telephonic MSDD model is used for every domain_type.
    config.diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return config

* This output from VAD is passed to the Titanet model for speaker embedding extraction on multiple scales, which is then clustered to average the multi speaker clusters to pass through next stages in the Nemo pipeline.

* **MSDD Configuration**
The MSDD (Multiscale Diarization Decoder) model is a sequence model optimized for diarization, using a diligent technique that selectively weighs speaker embeddings at multiple scales. This enhances performance, particularly in transcribing a telephonic speech handling the overlapping speech. It operates on five scales with varying hop lengths to provide flexible temporal resolution, with the default being 0.25 seconds, adjustable for finer detail.


* Using Binary Cross entropy loss function, time corresponding speaker labels are identified by the neural Diarizer(MSDD).

### **Convert audio to mono for NeMo compatibility**

Further, we need to convert our audio file to a single (mono) channel, the format NeMo’s audio processing expects. We do this with Pydub and save the result as a `.wav` file in a designated temporary directory.

In [None]:
# Down-mix the (possibly stem-separated) audio to a single channel
sound = AudioSegment.from_file(vocal_target).set_channels(1)
ROOT = os.getcwd()
temp_path = os.path.join(ROOT, "temp_outputs")
os.makedirs(temp_path, exist_ok=True)
# export() hands back the output file object still open; close it right away so
# the handle is not leaked and the cell does not echo its repr.
sound.export(os.path.join(temp_path, "mono_file.wav"), format="wav").close()

<_io.BufferedRandom name='/content/temp_outputs/mono_file.wav'>

### **Speaker Diarization with MSDD -  Nvidia Nemo Toolkit for ASR**
---
Next we will initialize the MSDD model with the NeuralDiarizer pipeline.


In [None]:
# Initialize NeMo MSDD diarization model
msdd_model = NeuralDiarizer(cfg=create_config(temp_path)).to("cuda")
msdd_model.diarize()

del msdd_model
torch.cuda.empty_cache()

[NeMo I 2024-05-13 13:37:35 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-05-13 13:37:35 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/diar_msdd_telephonic/versions/1.0.1/files/diar_msdd_telephonic.nemo to /root/.cache/torch/NeMo/NeMo_1.22.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-05-13 13:37:38 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-05-13 13:37:39 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-05-13 13:37:39 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-05-13 13:37:39 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-05-13 13:37:39 features:289] PADDING: 16
[NeMo I 2024-05-13 13:37:39 features:289] PADDING: 16
[NeMo I 2024-05-13 13:37:40 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.22.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-05-13 13:37:40 features:289] PADDING: 16
[NeMo I 2024-05-13 13:37:41 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-05-13 13:37:41 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /root/.cache/torch/NeMo/NeMo_1.22.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-05-13 13:37:41 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-05-13 13:37:41 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-05-13 13:37:41 features:289] PADDING: 16
[NeMo I 2024-05-13 13:37:41 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.22.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-05-13 13:37:41 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-05-13 13:37:41 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-05-13 13:37:41 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-05-13 13:37:41 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 1/1 [00:12<00:00, 12.51s/it]


[NeMo I 2024-05-13 13:37:54 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-05-13 13:37:54 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:37:54 collections:446] Dataset loaded with 4 items, total duration of  0.05 hours.
[NeMo I 2024-05-13 13:37:54 collections:448] # 4 files loaded accounting to # 1 labels


vad: 100%|██████████| 4/4 [00:02<00:00,  1.48it/s]


[NeMo I 2024-05-13 13:37:57 clustering_diarizer:250] Generating predictions with overlapping input segments




[NeMo I 2024-05-13 13:37:59 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  5.39it/s]


[NeMo I 2024-05-13 13:37:59 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, /content/temp_outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-05-13 13:37:59 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-13 13:37:59 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:37:59 collections:446] Dataset loaded with 140 items, total duration of  0.04 hours.
[NeMo I 2024-05-13 13:37:59 collections:448] # 140 files loaded accounting to # 1 labels


[1/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00,  4.86it/s]


[NeMo I 2024-05-13 13:37:59 clustering_diarizer:389] Saved embedding files to /content/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-05-13 13:37:59 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, /content/temp_outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-05-13 13:37:59 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-13 13:38:00 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:38:00 collections:446] Dataset loaded with 159 items, total duration of  0.04 hours.
[NeMo I 2024-05-13 13:38:00 collections:448] # 159 files loaded accounting to # 1 labels


[2/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00,  6.57it/s]


[NeMo I 2024-05-13 13:38:00 clustering_diarizer:389] Saved embedding files to /content/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-05-13 13:38:00 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, /content/temp_outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-05-13 13:38:00 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-13 13:38:00 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:38:00 collections:446] Dataset loaded with 197 items, total duration of  0.04 hours.
[NeMo I 2024-05-13 13:38:00 collections:448] # 197 files loaded accounting to # 1 labels


[3/5] extract embeddings: 100%|██████████| 4/4 [00:00<00:00,  7.48it/s]


[NeMo I 2024-05-13 13:38:01 clustering_diarizer:389] Saved embedding files to /content/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-05-13 13:38:01 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, /content/temp_outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-05-13 13:38:01 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-13 13:38:01 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:38:01 collections:446] Dataset loaded with 262 items, total duration of  0.05 hours.
[NeMo I 2024-05-13 13:38:01 collections:448] # 262 files loaded accounting to # 1 labels


[4/5] extract embeddings: 100%|██████████| 5/5 [00:00<00:00,  8.26it/s]


[NeMo I 2024-05-13 13:38:01 clustering_diarizer:389] Saved embedding files to /content/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-05-13 13:38:01 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, /content/temp_outputs/speaker_outputs/subsegments_scale4.json
[NeMo I 2024-05-13 13:38:01 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-13 13:38:01 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-13 13:38:01 collections:446] Dataset loaded with 397 items, total duration of  0.05 hours.
[NeMo I 2024-05-13 13:38:01 collections:448] # 397 files loaded accounting to # 1 labels


[5/5] extract embeddings: 100%|██████████| 7/7 [00:00<00:00,  9.03it/s]


[NeMo I 2024-05-13 13:38:02 clustering_diarizer:389] Saved embedding files to /content/temp_outputs/speaker_outputs/embeddings


clustering: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


[NeMo I 2024-05-13 13:38:03 clustering_diarizer:464] Outputs are saved in /content/temp_outputs directory


[NeMo W 2024-05-13 13:38:03 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-05-13 13:38:03 msdd_models:960] Loading embedding pickle file of scale:0 at /content/temp_outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-05-13 13:38:03 msdd_models:960] Loading embedding pickle file of scale:1 at /content/temp_outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-05-13 13:38:03 msdd_models:960] Loading embedding pickle file of scale:2 at /content/temp_outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-05-13 13:38:03 msdd_models:960] Loading embedding pickle file of scale:3 at /content/temp_outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-05-13 13:38:03 msdd_models:960] Loading embedding pickle file of scale:4 at /content/temp_outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-05-13 13:38:03 msdd_models:938] Loading cluster label file from /content/temp_outputs/speaker_outputs/subsegments_scale4_cluste

100%|██████████| 1/1 [00:00<00:00, 12.07it/s]


[NeMo I 2024-05-13 13:38:03 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-05-13 13:38:03 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-05-13 13:38:03 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-05-13 13:38:03 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-05-13 13:38:03 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-05-13 13:38:03 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-05-13 13:38:03 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-05-13 13:38:03 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-05-13 13:38:03 msdd_models:1431]   
    


The VAD output is saved as `vad_out.json` and it contains:

```
{"audio_filepath": "/content/temp_outputs/mono_file.wav", "offset": 1.18, "duration": 2.46, "label": "UNK", "uniq_id": "mono_file"}
{"audio_filepath": "/content/temp_outputs/mono_file.wav", "offset": 4.54, "duration": 2.22, "label": "UNK", "uniq_id": "mono_file"}

```

Then the embeddings from `TitaNet-L` are stored in `temp_outputs/speaker_outputs/embeddings`

At the end of this operation, a `temp_outputs/pred_rttms/mono_file.rttm` file is saved, which contains the diarized timestamps as follows:

```python
SPEAKER mono_file 1   39.660   0.140 <NA> <NA> speaker_0 <NA> <NA>
SPEAKER mono_file 1   40.460   1.420 <NA> <NA> speaker_0 <NA> <NA>
SPEAKER mono_file 1   42.140   0.140 <NA> <NA> speaker_1 <NA> <NA>
SPEAKER mono_file 1   43.180   0.540 <NA> <NA> speaker_1 <NA> <NA>
SPEAKER mono_file 1   43.980   0.940 <NA> <NA> speaker_1 <NA> <NA>
```

#### **Mapping Speakers to Sentences According to Timestamps**

As transcribing a customer support conversation involves multiple speakers, we will define a function `get_sentences_speaker_mapping` that constructs a list of sentences from the word-level mappings, each tagged with speaker information and timestamps. A new sentence starts either when the speaker changes or when the NLTK Punkt sentence tokenizer detects a sentence break in the current sentence (`snt`) extended by the next word. As it processes each word, the function extends the current sentence or starts a new one, so that each sentence in the output list (`snts`) captures a coherent spoken segment, labeled with the correct speaker and timing details.


In [None]:
def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    """Group speaker-tagged words into speaker-tagged sentences.

    A new sentence is started whenever the speaker changes or the NLTK Punkt
    tokenizer reports a sentence break in the current text extended by the
    next word.

    Args:
        word_speaker_mapping: iterable of dicts with the keys "word",
            "speaker", "start_time" and "end_time".
        spk_ts: non-empty sequence of (start, end, speaker) turns. Only the
            first turn is read; it seeds the initial sentence.

    Returns:
        A list of dicts with the keys "speaker" (formatted as
        "Speaker <id>"), "start_time", "end_time" and "text" (every word is
        followed by a single space).

    Raises:
        IndexError: if ``spk_ts`` is empty.
    """
    sentence_checker = nltk.tokenize.PunktSentenceTokenizer().text_contains_sentbreak
    s, e, spk = spk_ts[0]
    prev_spk = spk

    snts = []
    snt = {"speaker": f"Speaker {spk}", "start_time": s, "end_time": e, "text": ""}

    for wrd_dict in word_speaker_mapping:
        wrd, spk = wrd_dict["word"], wrd_dict["speaker"]
        s, e = wrd_dict["start_time"], wrd_dict["end_time"]
        if spk != prev_spk or sentence_checker(snt["text"] + " " + wrd):
            # The seed sentence is still empty when the very first word belongs
            # to a different speaker than the first turn; flushing it would
            # emit a cue/paragraph with no text, so only keep non-empty ones.
            if snt["text"]:
                snts.append(snt)
            snt = {
                "speaker": f"Speaker {spk}",
                "start_time": s,
                "end_time": e,
                "text": "",
            }
        else:
            # Same sentence: stretch it to cover the current word.
            snt["end_time"] = e
        snt["text"] += wrd + " "
        prev_spk = spk

    # Flush the sentence that was still being built when the words ran out.
    snts.append(snt)
    return snts

Next, the `get_word_ts_anchor` function is defined, which returns a word's anchor timestamp: the end (`e`) if `option` is `"end"`, the midpoint if it is `"mid"`, or the start (`s`) by default.

Then, the `get_words_speaker_mapping` function maps words to their corresponding speakers based on timing information. At first, we will iterate through word timestamps, adjust their anchor points depending on the chosen `word_anchor_option`, and match them to the closest speaker's time span. Following that, we handle speaker turns by updating the speaker indices and ensuring words at the list's end are correctly assigned to the last speaker. Thus, our result is a list of dictionaries, each containing a word, its start and end times, and the assigned speaker.

In [None]:
def get_word_ts_anchor(s, e, option="start"):
    """Return the instant used to place a word on the timeline.

    Args:
        s: word start time.
        e: word end time.
        option: "end" selects ``e``, "mid" selects the midpoint of ``s`` and
            ``e``; any other value selects ``s``.

    Returns:
        The selected timestamp.
    """
    if option == "mid":
        return (s + e) / 2
    return e if option == "end" else s


def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    """Assign a speaker to every word by walking the speaker turns in order.

    Args:
        wrd_ts: iterable of dicts with the keys "start", "end" and "word";
            "start" and "end" are multiplied by 1000 and truncated to int.
        spk_ts: non-empty sequence of (start, end, speaker) turns, compared
            against the scaled word times.
        word_anchor_option: forwarded to ``get_word_ts_anchor`` to choose
            which instant of the word is compared with the turn end.

    Returns:
        A list of dicts with the keys "word", "start_time", "end_time" and
        "speaker", one per input word and in input order.
    """
    final_turn = len(spk_ts) - 1
    turn_idx = 0
    _, turn_end, speaker = spk_ts[0]
    mapping = []

    for entry in wrd_ts:
        start_ms = int(entry["start"] * 1000)
        end_ms = int(entry["end"] * 1000)
        anchor = get_word_ts_anchor(start_ms, end_ms, word_anchor_option)

        # Advance through the turns until one ends at or after the anchor.
        while anchor > float(turn_end):
            turn_idx = min(turn_idx + 1, final_turn)
            _, turn_end, speaker = spk_ts[turn_idx]
            if turn_idx == final_turn:
                # No turns left: stretch the last one to the end of this word
                # so the loop terminates and the word goes to the last speaker.
                turn_end = get_word_ts_anchor(start_ms, end_ms, option="end")

        mapping.append(
            {
                "word": entry["word"],
                "start_time": start_ms,
                "end_time": end_ms,
                "speaker": speaker,
            }
        )

    return mapping

Then, we will read the RTTM file, which is the output from the MSDD Neural Diarizer, to map speaker labels to timestamps. We use these mappings to assign a speaker to every word (`wsm`) based on its start time, and then group the words into speaker-labelled sentences (`ssm`).

In [None]:
# Reading timestamps <> Speaker Labels mapping

speaker_ts = []
with open(os.path.join(temp_path, "pred_rttms", "mono_file.rttm"), "r") as f:
    lines = f.readlines()
    for line in lines:
        # RTTM fields are whitespace-delimited; splitting on any run of
        # whitespace keeps the parsing independent of the column padding.
        line_list = line.split()
        if not line_list:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Field 3 is the turn onset and field 4 its duration, both in seconds;
        # convert them to integer milliseconds.
        s = int(float(line_list[3]) * 1000)
        e = s + int(float(line_list[4]) * 1000)
        # Field 7 is the speaker name, e.g. "speaker_0" -> 0.
        speaker_ts.append([s, e, int(line_list[7].split("_")[-1])])

# Word-level speaker assignment, then sentence-level grouping.
wsm = get_words_speaker_mapping(word_timestamps, speaker_ts, "start")
ssm = get_sentences_speaker_mapping(wsm, speaker_ts)

## **Utility Functions:**

The `format_timestamp` function converts a timestamp in milliseconds (the unit used by the word and sentence mappings built above) into a formatted string (`hh:mm:ss.sss`, with the hours part optional); it asserts that the input is non-negative.

In [None]:
def format_timestamp(
    milliseconds: float, always_include_hours: bool = False, decimal_marker: str = "."
) -> str:
    """Format a millisecond count as ``[hh:]mm:ss<marker>mmm``.

    Args:
        milliseconds: non-negative duration in milliseconds. Fractional
            values are rounded to the nearest whole millisecond.
        always_include_hours: emit the ``hh:`` prefix even when it is zero.
        decimal_marker: separator placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: if ``milliseconds`` is negative.
    """
    assert milliseconds >= 0, "non-negative timestamp expected"

    # The "d" format codes used below reject floats, so normalise to an int
    # first; this makes the advertised ``float`` input actually work.
    remaining = int(round(milliseconds))

    hours, remaining = divmod(remaining, 3_600_000)
    minutes, remaining = divmod(remaining, 60_000)
    seconds, millis = divmod(remaining, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{millis:03d}"

To save a transcript to a file in SRT format, the `write_srt` utility formats the timestamps and text content of each segment and replaces any `-->` in the dialogue with `->`, so that it cannot be mistaken for the SRT timing arrow.

In [None]:
def write_srt(transcript, file):
    """
    Write a transcript to a file in SRT format.

    Args:
        transcript: iterable of dicts with the keys "start_time", "end_time",
            "speaker" and "text"; the times are passed to ``format_timestamp``.
        file: writable text stream that receives the cues.

    Each cue is numbered from 1, uses "," as the decimal marker, and is
    flushed to ``file`` as soon as it is written.
    """
    for cue_number, segment in enumerate(transcript, start=1):
        begin = format_timestamp(
            segment["start_time"], always_include_hours=True, decimal_marker=","
        )
        finish = format_timestamp(
            segment["end_time"], always_include_hours=True, decimal_marker=","
        )
        # "-->" is the SRT timing separator, so it must not appear in the text.
        caption = segment["text"].strip().replace("-->", "->")

        # print() appends one more newline, producing the blank line that
        # separates consecutive cues.
        print(
            f"{cue_number}\n{begin} --> {finish}\n{segment['speaker']}: {caption}\n",
            file=file,
            flush=True,
        )

After processing the speaker information, finally this generates an SRT-formatted transcript with speaker labels.

In [None]:
# Write the sentence-level mapping to "<audio path without extension>.srt".
# The "utf-8-sig" codec prefixes the file with a UTF-8 byte-order mark.
with open(f"{os.path.splitext(audio_path)[0]}.srt", "w", encoding="utf-8-sig") as srt:
    write_srt(ssm, srt)

**RESULTS**

The srt file contains output transcription results:
```

00:00:29,810 --> 00:00:33,453
Speaker 0: You have seen a lot of changes, Mr Fitzpatrick, in farming.

00:00:34,173 --> 00:00:35,655
Speaker 0: What would you say was the biggest change?

00:00:36,195 --> 00:00:36,996
Speaker 1: Well, machinery.

00:00:37,956 --> 00:00:41,879
Speaker 0: And what sort of a machine would you think made the biggest impression?

00:00:42,139 --> 00:00:50,226
Speaker 1: Well, the reaper and binder is a great one, but by God, the one for cutting up the ground and throwing the crop is a powerful one too.

```

## **Helper Functions:- Alignment with Punctuations**

In [None]:
# Language codes for which the punctuation-restoration step is attempted.
punct_model_langs = [
    "en", "fr", "de", "es", "it", "nl",
    "pt", "bg", "pl", "cs", "sk", "sl",
]

# Characters that mark the end of a sentence when they are the last
# character of a word.
sentence_ending_punctuations = ".?!"

In [None]:
def get_first_word_idx_of_sentence(word_idx, word_list, speaker_list, max_words):
    """Scan left from ``word_idx`` for the first word of its sentence.

    The scan stops at index 0, after ``max_words`` steps, at a speaker
    change, or when the preceding word ends with a sentence-ending
    punctuation mark.

    Args:
        word_idx: index of the word to start from.
        word_list: the words, aligned with ``speaker_list``.
        speaker_list: speaker label of every word.
        max_words: maximum number of steps to take to the left.

    Returns:
        The index where the scan stopped if it is 0 or directly follows a
        sentence-ending word; otherwise -1.
    """

    def ends_sentence(idx):
        return idx >= 0 and word_list[idx][-1] in sentence_ending_punctuations

    left_idx = word_idx
    while left_idx > 0 and word_idx - left_idx < max_words:
        previous = left_idx - 1
        if speaker_list[previous] != speaker_list[left_idx] or ends_sentence(previous):
            break
        left_idx = previous

    if left_idx == 0 or ends_sentence(left_idx - 1):
        return left_idx
    return -1


def get_last_word_idx_of_sentence(word_idx, word_list, max_words):
    """Scan right from ``word_idx`` for the last word of its sentence.

    The scan stops at the last word of ``word_list``, after ``max_words``
    steps, or at a word that ends with a sentence-ending punctuation mark.

    Args:
        word_idx: index of the word to start from.
        word_list: the words to scan.
        max_words: maximum number of steps to take to the right.

    Returns:
        The index where the scan stopped if it is the last word of the list
        or a sentence-ending word; otherwise -1.
    """

    def ends_sentence(idx):
        return idx >= 0 and word_list[idx][-1] in sentence_ending_punctuations

    last_idx = len(word_list) - 1
    right_idx = word_idx
    # Stop AT the last word instead of stepping past it: the return expression
    # below indexes word_list[right_idx], which raised IndexError whenever the
    # tail of the transcript had no sentence-ending punctuation.
    while (
        right_idx < last_idx
        and right_idx - word_idx < max_words
        and not ends_sentence(right_idx)
    ):
        right_idx += 1

    if right_idx == last_idx or ends_sentence(right_idx):
        return right_idx
    return -1

In [None]:
def get_realigned_ws_mapping_with_punctuation(
    word_speaker_mapping, max_words_in_sentence=50
):
    """Give every word of a sentence the sentence's most frequent speaker.

    For each word that is followed by a different speaker without ending a
    sentence, the enclosing sentence is located with
    ``get_first_word_idx_of_sentence`` / ``get_last_word_idx_of_sentence``.
    All words of that sentence are relabelled with the speaker that occurs
    most often in it, unless that speaker's count is below half of the
    sentence length (integer division).

    Args:
        word_speaker_mapping: sequence of dicts with at least the keys "word"
            and "speaker".
        max_words_in_sentence: upper bound on the sentence span considered.

    Returns:
        A new list holding a shallow copy of every input dict with "speaker"
        replaced by the realigned label; the input dicts are not modified.
    """
    total = len(word_speaker_mapping)
    words_list = [entry["word"] for entry in word_speaker_mapping]
    speaker_list = [entry["speaker"] for entry in word_speaker_mapping]

    def ends_sentence(idx):
        return idx >= 0 and words_list[idx][-1] in sentence_ending_punctuations

    k = 0
    while k < total:
        splits_sentence = (
            k < total - 1
            and speaker_list[k] != speaker_list[k + 1]
            and not ends_sentence(k)
        )
        if splits_sentence:
            left_idx = get_first_word_idx_of_sentence(
                k, words_list, speaker_list, max_words_in_sentence
            )
            if left_idx > -1:
                # Spend only what is left of the word budget on the right side.
                right_idx = get_last_word_idx_of_sentence(
                    k, words_list, max_words_in_sentence - k + left_idx - 1
                )
            else:
                right_idx = -1

            if min(left_idx, right_idx) != -1:
                spk_labels = speaker_list[left_idx : right_idx + 1]
                mod_speaker = max(set(spk_labels), key=spk_labels.count)
                if spk_labels.count(mod_speaker) >= len(spk_labels) // 2:
                    span = right_idx - left_idx + 1
                    speaker_list[left_idx : right_idx + 1] = [mod_speaker] * span
                    # Resume right after the sentence that was just relabelled.
                    k = right_idx

        k += 1

    return [
        {**entry, "speaker": speaker}
        for entry, speaker in zip(word_speaker_mapping, speaker_list)
    ]

In [None]:
def get_speaker_aware_transcript(sentences_speaker_mapping, f):
    """Write the sentences to ``f`` as one paragraph per speaker turn.

    Each paragraph starts with "<speaker>: " and every sentence is followed
    by a single space; consecutive paragraphs are separated by a blank line.

    Args:
        sentences_speaker_mapping: sequence of dicts with the keys "speaker"
            and "text". An empty sequence writes nothing.
        f: writable text stream.
    """
    # Nothing to write (previously this raised IndexError on the [0] lookup).
    if not sentences_speaker_mapping:
        return

    previous_speaker = sentences_speaker_mapping[0]["speaker"]
    f.write(f"{previous_speaker}: ")

    for sentence_dict in sentences_speaker_mapping:
        speaker = sentence_dict["speaker"]

        # If this speaker doesn't match the previous one, start a new paragraph
        if speaker != previous_speaker:
            f.write(f"\n\n{speaker}: ")
            previous_speaker = speaker

        # No matter what, write the current sentence
        f.write(sentence_dict["text"] + " ")

**Optional**

**Realigning Speech segments using Punctuation**

---

This code provides a method for disambiguating speaker labels in cases where a sentence is split between two different speakers. It uses punctuation markings to determine the dominant speaker for each sentence in the transcription.

```
Speaker A: It's got to come from somewhere else. Yeah, that one's also fun because you know the lows are
Speaker B: going to suck, right? So it's actually it hits you on both sides.
```

For example, if a sentence is split between two speakers, the code takes the mode of speaker labels for each word in the sentence, and uses that speaker label for the whole sentence. This can help to improve the accuracy of speaker diarization, especially in cases where the Whisper model may not take fine utterances like "hmm" and "yeah" into account, but the Diarization Model (Nemo) may include them, leading to inconsistent results.

The code also handles cases where one speaker is giving a monologue while other speakers are making occasional comments in the background. It ignores the comments and assigns the entire monologue to the speaker who is speaking the majority of the time. This provides a robust and reliable method for realigning speech segments to their respective speakers based on punctuation in the transcription.

In [None]:
def cleanup(path: str):
    """Delete the file, symlink or directory tree at ``path``.

    ``path`` may be relative or absolute.

    Raises:
        ValueError: if ``path`` is neither a file, a symlink nor a directory.
    """
    removable_entry = os.path.isfile(path) or os.path.islink(path)
    if not removable_entry and not os.path.isdir(path):
        raise ValueError("Path {} is not a file or dir.".format(path))

    if removable_entry:
        # Files and symlinks are unlinked directly.
        os.remove(path)
        return

    # Otherwise it is a directory: remove it together with its contents.
    shutil.rmtree(path)


In [None]:
if language in punct_model_langs:
    # restoring punctuation in the transcript to help realign the sentences
    punct_model = PunctuationModel(model="kredor/punctuate-all")

    words_list = [word_entry["word"] for word_entry in wsm]

    labled_words = punct_model.predict(words_list)

    ending_puncts = ".?!"
    model_puncts = ".,;:!?"

    # We don't want to punctuate U.S.A. with a period. Right?
    is_acronym = lambda x: re.fullmatch(r"\b(?:[a-zA-Z]\.){2,}", x)

    for word_dict, labeled_tuple in zip(wsm, labled_words):
        word = word_dict["word"]
        # labeled_tuple[1] is compared against the sentence-ending marks, so
        # it is presumably the predicted punctuation label -- TODO confirm
        # against the punctuation model's output format.
        if (
            word
            and labeled_tuple[1] in ending_puncts
            and (word[-1] not in model_puncts or is_acronym(word))
        ):
            word += labeled_tuple[1]
            # An acronym already ends with "."; avoid producing "U.S.A..".
            if word.endswith(".."):
                word = word.rstrip(".")
            word_dict["word"] = word

else:
    logging.warning(
        f"Punctuation restoration is not available for {language} language. Using the original punctuation."
    )

wsm = get_realigned_ws_mapping_with_punctuation(wsm)
# The sentence-level mapping was built from the words BEFORE realignment;
# rebuild it so the realigned speakers (and restored punctuation) actually
# reach the transcript exported from `ssm`.
ssm = get_sentences_speaker_mapping(wsm, speaker_ts)


config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

**Cleanup and Exporting the results**

In [None]:
# Write the speaker-labelled transcript to "<audio path without extension>.txt".
# The "utf-8-sig" codec prefixes the file with a UTF-8 byte-order mark.
with open(f"{os.path.splitext(audio_path)[0]}.txt", "w", encoding="utf-8-sig") as f:
    get_speaker_aware_transcript(ssm, f)

# Remove the temporary working directory once the transcript is exported.
cleanup(temp_path)

At the end of this a .txt file is saved which will have properly formatted transcription and Diarization:

```
Speaker 1: They had money, but they weren't minding it.  They could afford it, but more of them got out of it.  

Speaker 0: And some of them, I think you told me, wouldn't have it on the land at all.  

Speaker 1: You see, ten men cut it at three and six months a day.  

Speaker 0: Do you remember cutting the harvest with the reaping hook?  

```

## **WORD ERROR RATE CALCULATION**

In [None]:
!pip install -q jiwer

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/3.4 MB[0m [31m14.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/3.4 MB[0m [31m16.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.9/3.4 MB[0m [31m18.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m3.0/3.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.4/3.4 MB[0m [31m22.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Define the ground-truth (reference) transcript and each ASR system's hypothesis transcript
ground_truth = """
I'm going to introduce you to a rather remarkable man. He's Mister Michael Fitzpatrick from Killenie Maynooth.
Now he started to draw the old edge pension in 1927 and seven years ago he got the president's bounty on his hundredth birthday.
Now he's from County Clare. He came up from Clare in 1940 to a land commission farm in Maynooth where he lives now you have seen a lot of changes, Mister Fitzpatrick, in farming.
What would you say was the biggest change?
Well, machinery.
And what sort of a machine would you think that made the biggest impression?
Well, the reaper and binder is a great one, but by God, the one for cutting up the ground and throwing a crop is a powerful one too.
Well, you were. You were saying at the time you saw the mowing machine first it made a tremendous impression on you.
It did, because it was a  wonder. How could it be done at all?
What was the reaction of the people at that time to the mowing machine, Mr.Fitzpatrick?
A great many of them wasn't minding it or could afford it but a more of them got at it.
And some of them, I think you told me, wouldn't have it on the land at all.
Well,a man that has a good farm with us...outside of us our townsland. He wouldn't allow bring in it. He used to be paying men for to cut it at three and six pence a day.
Do you remember ah..ah cutting the harvest with the reaping hook?
Ah, it was it was all of it... cut... for years and years and years. Nothing else ever cut it.
And how do you think that the reaping hook compares with the combine at the present time?
Aw well, there's an awful difference. An awful difference I see anyway because the combine did as good in one start of a day as the poor reaper and binder wouldn't bring in a week.
Now you also remember, I think, a rather historic thing in the land history of this country, the bodyke evictions.
I do I do.
Could you describe for us what happened at those evictions? You were at them?
I was at one of them for about 5 hours in the one day.
And what happened?
Aw they threw out...They were very cruel. They threw out children and woman on the roadside. Well, there was one of them thrown out the one day I was in it and the baby was only about three days old.
And they were sitting, they were thrown on the banks of the road for the lake. Aw It was cruel.
And how about the...ah... type of food you had to eat at that time, Mr. Fitzpatrick..what sort of, what did you live on?
Well we lived, We had to live on it there a long time on an Indian meal and flour.

"""

assembly_ai_output = """
I'm going to introduce you to a rather remarkable man. He's Mister Michael Fitzpatrick from Killenie Maynooth.
Now he started to draw the old edge pension in 1927 and seven years ago he got the president's bounty on his hundredth birthday.
Now he's from County Clare. He came up from Clare in 1940 to a land commission farm in Maynooth where he lives now you have seen a lot of changes, Mister Fitzpatrick, in farming.
What would you say was the biggest change?
Well, machinery.
And what sort of a machine would you think that made the biggest impression?
Well, the reaper and binder is a great one, but by God, the one for cutting up the ground and throwing a crop is a powerful one too.
Well, you were. You were saying you're at the time you saw the mowing machine for it made a tremendous impression on you.
It did, because it is one. How could it be done at all?
What was the reaction of the people at that time to the Moor machine, mister?
Great money was maintained, I could afford it, but more habit.
And some of them, I think you told me, wouldn't have it on the land at all.
Well, I had a good father, motors land. He wouldn't allow it in it. He used to pay a minimum profit at three and sixpence a day.
Do you remember cutting the harvest with the reaping hook?
Oh, all of it cut for years and years and years. Nothing else about cutting.
And how do you think that the reaping hook compares with the combine at the present time?
Well, there's an awful difference. And now for different safety anywhere because if it started here is a poor rape around there wouldn't be enough.
Now you also remember, I think, a rather historic thing in the land history of this country, the bodike evictions.
Could you describe for us what happened at those evictions? You were at them?
I was at one of them about 5 hours.
And what happened?
That was very cruel. That's from children and the whole shit.
 Well, there was one of them thrown out one day I was in it and the baby was only about three days old and they were sitting, they were thrown on the banks. It was cruel.
And how about the type of food you had to eat at that time, Mr. Fisvatik, what did you live on?
We had to live there a long time on an indian maid on plover.
"""

whisper_output = """
I'm going to introduce you to a rather remarkable man.  He's Mr.  Michael Fitzpatrick from Killeney, Maynooth.  Now, he started to draw the old age pension.  in nineteen twenty seven and seven years ago he got the president's bounty on his hundredth birthday.  Now, he's from County Clare.  He came up from Clare in nineteen forty to a land commission farm in Maynooth, where he lives now.  You have seen a lot of changes, Mr Fitzpatrick, in farming.  What would you say was the biggest change?
Well, machinery.
And what sort of a machine would you think made the biggest impression?
Well, the reaper and binder is a great one, but by God, the one for cutting up the ground and throwing the crop is a powerful one too.
Well, you were saying at the time you saw the mowing machine first, it made a tremendous impression on you.
It did, because you didn't wonder how could it be done at all.
What was the reaction of the people at that time to the mowing machine, Mr.  Scott?
They had money, but they weren't minding it.  They could afford it, but more of them got out of it.
And some of them, I think you told me, wouldn't have it on the land at all.
You see, ten men cut it at three and six months a day.
Do you remember cutting the harvest with the reaping hook?
Oh, all of it.  Of course, for years and years and years, nothing else about cutting.
And how do you think that the reaping hook compares with the combine at the present time?
Oh, well, there's an awful difference.  An awful difference I see anywhere.  Because if you couldn't find it, if you wouldn't want it, start to put it there, if you put it on the vine, there wouldn't be no mice.
Now, you also remember, I think, a rather historic thing in the land history of this country, the Bowdike evictions.  I do.  Could you describe for us what happened at those evictions?  You were at them.
was at one of them about five hours.One day there.
And what happened?
Ah, that was very cruel.  They throw children and women on the roadside.  Well, there was one of them thrown out, but one day I was in it, and the baby was only about three days old.  And they were sitting, they were thrown on the bank of the river, and we all fell down like this.  Oh, it was cruel.
And how about the type of food you had to eat at that time, Mr.  Fitzpatrick?  What sort of, what did you live on?
Well, we had to live there a long time on an Indian maid and flowers.

"""

deepgram_output = """
I'm going to introduce you to a rather remarkable man. He's mister Michael Fitzpatrick from Kilenny, Maynooth.
Now he started to draw the old age pension in 1927, and 7 years ago, he got the president's bounty on his 100th birthday.
Now he's from County Clare. He came up from Clare in 1940 to a land commission farm in Maynooth where he lives now.
You have seen a lot of changes, mister Fitzpatrick, in farming. What would you say was the biggest change? With machinery.
And what sort of a machine do you think made the biggest impression?
Well, the reaper and binders are graffing, but, by god, the one pulled caution up the ground and thrown a flap is a powerful one too.
Well, you were you were saying you're at at the time you saw the mowing machine first, it it made a tremendous impression on you.
Indeed. Because it is not how could it be done at all? What was the the reaction of the people at that time to the mower machine, mister Scott? Money.
If I wasn't mowing in it, I could afford it, but the mower would have it.
And some of them, I think, you told me, wouldn't have it on the land at all.
Well, and Do do you remember, cutting the harvest with the reap and hope? Oh, probably all of it. Quite for years years years.
That's it. It's about quite. And how do you think that the ripen hook compares with the combine no You can't come by and just go in the morning, start to bid here, and just put a report on by and there wouldn't be no noise.
Now you also remember, I think, a rather historic thing in the land history of this country, the Bovah, Burdike evictions.
I remember. Could you could you describe for us what happened at those evictions? You were at them. I was at one of them.
About about by. And what happened? That's, you know, that was very cruel. That's all children and women and the bullshit.
Well, there was one of them thrown out, put in one day, everything in it, and the baby was only about 3 days old.
And they were sitting they were throwing on the banks of the then go for jelly. Oh, it was cruel.
And how about the type of food you had to eat at that time, mister Fisbati? What sort of what did you live on?
What was it? We had to live there a long time on an Indian land and trouble. """


gladia_whisper_zero_output = """
I'm going to introduce you to a rather remarkable man. He's Mr Michael Fitzpatrick from Killenie, Maynooth.
Now... He started to draw the old age pension in 1927 and seven years ago he got the president's bounty on his hundredth birthday.
Now he's from County Clare. He came up from Clare in 1940 to a land commission farm in Maynooth where he lives now.
You have seen a lot of changes Mr Fitzpatrick in farming. What would you say was the biggest change? Well machinery.
And what sort of a machine machine? What do you think made the biggest impression?
Well, the Rupert and Binder, the great one, but by God, the one for cutting up the ground and throwing a crop is a powerful one too.
Well, you were saying at the time you saw the mowing machine first, it made a tremendous impression on you.
It did, because I didn't know how good it would be doing at all. What was the reaction of the people at that time to the mowing machine?
They had great money but they weren't minding it and couldn't afford it. But the mowing got at it.
And some of them I think you told me wouldn't have it on the land at all.
Well, I had a good father who said about our town's land, he wouldn't have it on the land. You see, ten men cut it, at three and six months a day. Do you remember cutting the harvest with the reaping hook?
Oh, all of it. Of course, for years and years and years, not a hint about cutting.
How do you think that the reaping hook compares with the combine at the present time?
Oh, well, there's an awful difference. An awful difference, I see, anyway.
Because if you come round and you see a woman slapping her hair, you just put a wiper on and there wouldn't be no mice.
Now you also remember, I think, a rather historic thing in the land history of this country, the Boddike Evictions. I do.
Could you describe for us what happened at those evictions? You were at them. I was at one of them about five hours.
One day. And what happened? Ah, they were very cruel. They threw children and women on the roadside.
Well, there was one of them thrown out, but one day I was in it, and the baby was only about three days old.
And they were sitting, they were thrown on the bank, and they were off on their legs. Ah, it was cruel.
And how about the type of food you had to eat at that time, Mr Fispartic?
What sort of, what did you live on? What did we live on? We had to live there a long time on an Indian land on the prowl. """

In [None]:
import jiwer
import re

def preprocess_text(text):
    """Normalise a transcript so WER is not skewed by case, punctuation or spacing.

    Args:
        text: Raw transcript string.

    Returns:
        The transcript lower-cased, with every character that is neither a
        word character nor whitespace removed, each run of whitespace
        collapsed to a single space, and no leading or trailing whitespace.
    """
    # Normalize the case
    text = text.lower()
    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Replace multiple spaces with a single space, then trim the ends.
    # Trimming has to happen last: deleting a leading/trailing punctuation mark
    # (e.g. "... hello") exposes whitespace that an earlier strip() would miss.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Push the reference transcript and every provider transcript through the same
# normalisation, so casing and punctuation differences are not counted as errors.
ground_truth_clean = preprocess_text(ground_truth)
(
    assembly_ai_output_clean,
    whisper_output_clean,
    deepgram_output_clean,
    gladia_whisper_zero_output_clean,
) = (
    preprocess_text(raw_transcript)
    for raw_transcript in (
        assembly_ai_output,
        whisper_output,
        deepgram_output,
        gladia_whisper_zero_output,
    )
)

# Show each normalised transcript, every one followed by a separator line.
# The labels are printed exactly as before (including the leading space on the
# Gladia label).
for transcript_label, transcript_text in (
    ("Ground Truth Normalised:", ground_truth_clean),
    ("Assembly AI Output Normalised:", assembly_ai_output_clean),
    (" Gladia Whisper Zero Output Normalised:", gladia_whisper_zero_output_clean),
    ("Deepgram Output Normalised:", deepgram_output_clean),
    ("Whisper Output Normalised:", whisper_output_clean),
):
    print(transcript_label, transcript_text)
    print("********")

# Word error rate of each provider; the normalised reference is passed first
# and the provider's normalised transcript second.
aai_wer = jiwer.wer(ground_truth_clean, assembly_ai_output_clean)
gladia_wer = jiwer.wer(ground_truth_clean, gladia_whisper_zero_output_clean)
dgram_wer = jiwer.wer(ground_truth_clean, deepgram_output_clean)
whisper_wer = jiwer.wer(ground_truth_clean, whisper_output_clean)

# Report the scores to two decimal places, one separator line after each.
for provider_name, provider_wer in (
    ("Assembly AI", aai_wer),
    ("Gladia Whisper Zero", gladia_wer),
    ("Deepgram", dgram_wer),
    ("Whisper", whisper_wer),
):
    print(f"{provider_name} Word Error Rate: {provider_wer:.2f}")
    print("********")

Ground Truth Normalised: im going to introduce you to a rather remarkable man hes mister michael fitzpatrick from killenie maynooth now he started to draw the old edge pension in 1927 and seven years ago he got the presidents bounty on his hundredth birthday now hes from county clare he came up from clare in 1940 to a land commission farm in maynooth where he lives now you have seen a lot of changes mister fitzpatrick in farming what would you say was the biggest change well machinery and what sort of a machine would you think that made the biggest impression well the reaper and binder is a great one but by god the one for cutting up the ground and throwing a crop is a powerful one too well you were you were saying at the time you saw the mowing machine first it made a tremendous impression on you it did because it was a wonder how could it be done at all what was the reaction of the people at that time to the mowing machine mrfitzpatrick a great many of them wasnt minding it or could 

## **DIARIZATION ERROR RATE**

In [None]:
# Installs pyannote.audio for the DER cell below, which imports pyannote.core and
# pyannote.metrics -- presumably pulled in as dependencies of pyannote.audio (TODO confirm).
# NOTE(review): no version is pinned here, unlike the whisperX and NeMo installs at the
# top of the notebook.
!pip install -qq pyannote.audio

In [None]:
from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate

def build_annotation(turns):
    """Return an Annotation with one labelled Segment per (start, end, speaker) tuple."""
    annotation = Annotation()
    for turn_start, turn_end, speaker in turns:
        annotation[Segment(turn_start, turn_end)] = speaker
    return annotation


# Speaker turns are written as (start, end, speaker), with times in seconds.
# Reference turns (hand-entered; replace with the actual timings for other audio).
# NOTE(review): (136, 136) has zero length -- confirm the intended end time.
ground_truth_turns = [
    (0, 35, 'A'), (36, 37, 'B'), (37, 41, 'A'), (42, 50, 'B'),
    (51, 56, 'A'), (57, 61, 'B'), (62, 67, 'A'), (67, 72, 'B'),
    (73, 77, 'A'), (77, 87, 'B'), (88, 92, 'A'), (93, 99, 'B'),
    (99, 103, 'A'), (104, 116, 'B'), (116, 124, 'A'), (124, 126, 'B'),
    (126, 129, 'A'), (130, 135, 'B'), (136, 136, 'A'), (137, 155, 'B'),
    (156, 163, 'A'), (164, 169, 'B'),
]

# Assembly AI turns. They differ from the reference in two places:
#   * (116, 129, 'A') replaces the reference turns 116-124 A / 124-126 B / 126-129 A
#   * (130, 133, 'B') ends earlier than the reference turn (130, 135, 'B')
assembly_ai_turns = [
    (0, 35, 'A'), (36, 37, 'B'), (37, 41, 'A'), (42, 50, 'B'),
    (51, 56, 'A'), (57, 61, 'B'), (62, 67, 'A'), (67, 72, 'B'),
    (73, 77, 'A'), (77, 87, 'B'), (88, 92, 'A'), (93, 99, 'B'),
    (99, 103, 'A'), (104, 116, 'B'), (116, 129, 'A'), (130, 133, 'B'),
    (136, 136, 'A'), (137, 155, 'B'), (156, 163, 'A'), (164, 169, 'B'),
]

# Whisper turns. They differ from the reference in two places:
#   * (84, 87, 'B') starts later than the reference turn (77, 87, 'B')
#   * (130, 133, 'B') ends earlier than the reference turn (130, 135, 'B')
whisper_turns = [
    (0, 35, 'A'), (36, 37, 'B'), (37, 41, 'A'), (42, 50, 'B'),
    (51, 56, 'A'), (57, 61, 'B'), (62, 67, 'A'), (67, 72, 'B'),
    (73, 77, 'A'), (84, 87, 'B'), (88, 92, 'A'), (93, 99, 'B'),
    (99, 103, 'A'), (104, 116, 'B'), (116, 124, 'A'), (124, 126, 'B'),
    (126, 129, 'A'), (130, 133, 'B'), (136, 136, 'A'), (137, 155, 'B'),
    (156, 163, 'A'), (164, 169, 'B'),
]

# NOTE(review): these three names hold the transcript strings in the WER cell
# above; from here on they are Annotation objects, so the WER cell cannot be
# re-run after this one until the transcript strings are redefined.
ground_truth = build_annotation(ground_truth_turns)
assembly_ai_output = build_annotation(assembly_ai_turns)
whisper_output = build_annotation(whisper_turns)

# One metric object scores both hypotheses against the reference.
metric = DiarizationErrorRate()

aai_der = metric(ground_truth, assembly_ai_output)
print(f"Assembly AI Diarization Error Rate: {aai_der:.2%}")

print("********")

whisper_der = metric(ground_truth, whisper_output)
print(f"Whisper Diarization Error Rate: {whisper_der:.2%}")

Assembly AI Diarization Error Rate: 2.58%
********
Whisper Diarization Error Rate: 5.81%
