Updating the README.md file
Signed-off-by: taejinp <tango4j@gmail.com>
README.md CHANGED

This architecture enables the model to handle severe speech overlap by having each instance focus exclusively on one speaker, eliminating the permutation problem that affects other multitalker ASR approaches.
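
As a rough, framework-free illustration of why fixed per-speaker instances avoid the permutation problem, the sketch below keeps one transcript stream per speaker slot, so decoded tokens are appended directly to the owning slot and never need to be re-matched to speakers afterwards. This is not the NeMo implementation; `PerSpeakerStream` and `route_chunk` are hypothetical names used only for this sketch.

```python
from dataclasses import dataclass, field


@dataclass
class PerSpeakerStream:
    """One ASR output stream bound to a fixed speaker slot."""
    speaker_id: int
    transcript: list = field(default_factory=list)


def route_chunk(streams, tokens_per_slot):
    """Append each slot's newly decoded tokens to that slot's own transcript.

    Because the slot assignment is fixed, no permutation matching between
    outputs and speakers is needed when chunks are merged over time.
    """
    for slot, tokens in tokens_per_slot.items():
        streams[slot].transcript.extend(tokens)


# Four fixed speaker slots, mirroring the 4-speaker setup of the streaming diarizer.
streams = {slot: PerSpeakerStream(slot) for slot in range(4)}
route_chunk(streams, {0: ["hello"], 1: ["hi", "there"]})
route_chunk(streams, {0: ["everyone"], 2: ["good", "morning"]})
print({s.speaker_id: " ".join(s.transcript) for s in streams.values()})
```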
## NVIDIA NeMo
To train, fine-tune, or perform multitalker ASR with this model, you will need to install [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)[7]. We recommend installing it after you have installed Cython and the latest version of PyTorch.

The model is available for use in the NeMo Framework[7], and can be used as a pre-trained checkpoint for inference or fine-tuning.
### Method 1. Code snippet

Load a speaker diarization model, [Streaming Sortformer Diarizer v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1), for generating speaker timestamps. A speaker diarization model is needed for tracking the speech activity of each speaker.

```python
import torch

from nemo.collections.asr.models import ASRModel, SortformerEncLabelModel

# Step 1: Load the streaming diarization model (provides speaker activity predictions).
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2.1")
diar_model.eval().to(torch.device("cuda"))

# Step 2: Load the streaming multitalker ASR model (transcribes each speaker separately).
asr_model = ASRModel.from_pretrained("nvidia/multitalker-parakeet-streaming-0.6b-v1.nemo")
asr_model.eval().to(torch.device("cuda"))

# Step 3: Configure the models with streaming parameters (latency, chunk sizes, etc.)
# using the pre-defined dataclass template `MultitalkerTranscriptionConfig`
# from `multitalker_transcript_config.py`.
from multitalker_transcript_config import MultitalkerTranscriptionConfig
from omegaconf import OmegaConf

cfg = OmegaConf.structured(MultitalkerTranscriptionConfig())
cfg.audio_file = "/path/to/your/audio.wav"
cfg.output_path = "/path/to/output_transcription.json"

# Initialize the diarization model with the streaming config (sets chunk_len, context, etc.).
diar_model = MultitalkerTranscriptionConfig.init_diar_model(cfg, diar_model)

# Step 4: Set up a streaming audio buffer to simulate a real-time audio session.
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer

samples = [{'audio_filepath': cfg.audio_file}]
streaming_buffer = CacheAwareStreamingAudioBuffer(
    model=asr_model,
    # ... (remaining buffer arguments are omitted in this excerpt)
)
streaming_buffer.append_audio_file(audio_filepath=cfg.audio_file, stream_id=-1)
streaming_buffer_iter = iter(streaming_buffer)

# Step 5: Initialize the multi-instance ASR streamer. `SpeakerTaggedASR` manages the
# per-speaker ASR instances and all ASR and diarization cache data for streaming.
from nemo.collections.asr.parts.utils.multispk_transcribe_utils import SpeakerTaggedASR

multispk_asr_streamer = SpeakerTaggedASR(cfg, asr_model, diar_model)

# Step 6: Process audio chunks iteratively (streaming inference loop). The middle of
# this loop body is omitted in this excerpt; only its first and last lines appear.
for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer_iter):
    drop_extra_pre_encoded = (
        0
        # ... (rest of the expression and the per-chunk streaming call are omitted;
        # the call ends with the arguments below)
        is_buffer_empty=streaming_buffer.is_buffer_empty(),
        drop_extra_pre_encoded=drop_extra_pre_encoded,
    )

# Step 7: Generate the final speaker-tagged transcriptions in SegLST format
# (with timestamps) and print them.
seglst_dict_list = multispk_asr_streamer.generate_seglst_dicts_from_parallel_streaming(samples=samples)
print(seglst_dict_list)
```
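
Continuing from the snippet above (reusing `cfg` and `seglst_dict_list`), the sketch below shows what a SegLST-style segment might look like and how the result could be written to `cfg.output_path`. The field names in `example_segment` follow the common SegLST convention and are an assumption here; they are not guaranteed to match the exact keys returned by `generate_seglst_dicts_from_parallel_streaming`.

```python
import json

# Hypothetical example of a single SegLST-style segment; the actual fields may differ.
example_segment = {
    "session_id": "audio",
    "speaker": "speaker_0",
    "start_time": 1.23,
    "end_time": 2.84,
    "words": "hello how are you",
}

# Persist the generated speaker-tagged transcript to the configured output path.
with open(cfg.output_path, "w") as f:
    json.dump(seglst_dict_list, f, indent=2)
```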