feat: Implement advanced MIDI experimental correction tools
Adds a new suite of post-processing tools to allow for advanced refinement of transcribed MIDI files.
New features include:
- Spurious (noise) note filtering based on duration and velocity
- Rhythm Stabilization with silence-based segmentation
- Rhythmic Quantization for both straight and swing/triplet feels
- Velocity processing with Smoothing and Compression modes
app.py
CHANGED
|
@@ -152,6 +152,20 @@ class AppParameters:
|
|
| 152 |
render_output_as_solo_piano: bool = False
|
| 153 |
render_remove_drums: bool = False
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
# 8-bit Synthesizer Settings
|
| 156 |
s8bit_waveform_type: str = 'Square'
|
| 157 |
s8bit_pulse_width: float = 0.5
|
|
@@ -209,7 +223,7 @@ class AppParameters:
|
|
| 209 |
s8bit_delay_division: str = "Dotted 8th Note"
|
| 210 |
s8bit_delay_feedback: float = 0.5 # Velocity scale for each subsequent echo (50%)
|
| 211 |
s8bit_delay_repeats: int = 3 # Number of echoes to generate
|
| 212 |
-
# ---
|
| 213 |
s8bit_delay_highpass_cutoff_hz: int = 100 # High-pass filter frequency for delay echoes (removes low-end rumble from echoes)
|
| 214 |
s8bit_delay_bass_pitch_shift: int = 0 # Pitch shift (in semitones) applied to low notes in delay echoes
|
| 215 |
# --- High-End Management for Delay ---
|
|
@@ -220,6 +234,274 @@ class AppParameters:
|
|
| 220 |
# === Helper Functions ===
|
| 221 |
# =================================================================================================
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
|
| 224 |
"""
|
| 225 |
Analyzes raw audio data to dynamically determine optimal parameters for basic-pitch.
|
|
@@ -1858,6 +2140,50 @@ def Render_MIDI(*, input_midi_path: str, params: AppParameters, progress: gr.Pro
|
|
| 1858 |
o[1] *= 200
|
| 1859 |
o[2] *= 200
|
| 1860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1861 |
print('Final adjustments complete.')
|
| 1862 |
print('=' * 70)
|
| 1863 |
|
|
@@ -3484,6 +3810,15 @@ if __name__ == "__main__":
|
|
| 3484 |
updates[enable_advanced_separation] = gr.update(visible=is_demucs, value=False)
|
| 3485 |
return updates
|
| 3486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3487 |
# --- Use the dataclass to define the master list of parameter keys ---
|
| 3488 |
# This is now the single source of truth for parameter order.
|
| 3489 |
ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]]
|
|
@@ -3726,6 +4061,67 @@ if __name__ == "__main__":
|
|
| 3726 |
info="Quantizes the score to a fixed bar length. 'Start Times' aligns onsets. "
|
| 3727 |
"'Durations' trims notes at the bar line. 'Split Durations' splits notes that cross the bar line."
|
| 3728 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3729 |
|
| 3730 |
with gr.Column(scale=1):
|
| 3731 |
# --- 8-bit Synthesizer Settings ---
|
|
@@ -4134,7 +4530,7 @@ if __name__ == "__main__":
|
|
| 4134 |
s8bit_ui_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_')]
|
| 4135 |
s8bit_ui_components = [ui_component_map[key] for key in s8bit_ui_keys]
|
| 4136 |
|
| 4137 |
-
#
|
| 4138 |
s8bit_control_components = [comp for comp in s8bit_ui_components if comp != s8bit_preset_selector]
|
| 4139 |
|
| 4140 |
# The list of basic_pitch UI components that can be updated by its preset selector.
|
|
@@ -4276,6 +4672,23 @@ if __name__ == "__main__":
|
|
| 4276 |
inputs=s8bit_enable_delay,
|
| 4277 |
outputs=delay_settings_box
|
| 4278 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4279 |
|
| 4280 |
# Launch the Gradio app
|
| 4281 |
app.queue().launch(inbrowser=True, debug=True)
|
|
|
|
| 152 |
render_output_as_solo_piano: bool = False
|
| 153 |
render_remove_drums: bool = False
|
| 154 |
|
| 155 |
+
# EXPERIMENTAL: MIDI Post-Processing & Correction Tools
|
| 156 |
+
enable_midi_corrections: bool = False # Master switch for enabling MIDI correction tools
|
| 157 |
+
correction_filter_spurious_notes: bool = True # Enable filtering of spurious (noise) notes
|
| 158 |
+
correction_spurious_duration_ms: int = 50 # Maximum duration (ms) for a note to be considered spurious
|
| 159 |
+
correction_spurious_velocity: int = 20 # Maximum velocity for a note to be considered spurious
|
| 160 |
+
correction_remove_abnormal_rhythm: bool = False # Enable rhythm stabilization for abnormal rhythm
|
| 161 |
+
correction_rhythm_stab_by_segment: bool = False # Enable segmentation by silence before rhythm stabilization
|
| 162 |
+
correction_rhythm_stab_segment_silence_s: float = 1.0 # Silence threshold (seconds) for segmenting MIDI
|
| 163 |
+
correction_quantize_level: str = "None" # Quantization level for note timing (e.g., "1/16", "None")
|
| 164 |
+
correction_velocity_mode: str = "None" # Velocity processing mode ("None", "Smooth", "Compress")
|
| 165 |
+
correction_velocity_smooth_factor: float = 0.5 # Smoothing factor for velocity processing
|
| 166 |
+
correction_velocity_compress_min: int = 30 # Minimum velocity after compression
|
| 167 |
+
correction_velocity_compress_max: int = 100 # Maximum velocity after compression
|
| 168 |
+
|
| 169 |
# 8-bit Synthesizer Settings
|
| 170 |
s8bit_waveform_type: str = 'Square'
|
| 171 |
s8bit_pulse_width: float = 0.5
|
|
|
|
| 223 |
s8bit_delay_division: str = "Dotted 8th Note"
|
| 224 |
s8bit_delay_feedback: float = 0.5 # Velocity scale for each subsequent echo (50%)
|
| 225 |
s8bit_delay_repeats: int = 3 # Number of echoes to generate
|
| 226 |
+
# --- Low-End Management for Delay ---
|
| 227 |
s8bit_delay_highpass_cutoff_hz: int = 100 # High-pass filter frequency for delay echoes (removes low-end rumble from echoes)
|
| 228 |
s8bit_delay_bass_pitch_shift: int = 0 # Pitch shift (in semitones) applied to low notes in delay echoes
|
| 229 |
# --- High-End Management for Delay ---
|
|
|
|
| 234 |
# === Helper Functions ===
|
| 235 |
# =================================================================================================
|
| 236 |
|
| 237 |
+
|
| 238 |
+
def quantize_escore(escore, bpm, quantize_level_str="1/16"):
    """
    Snap the onset of every note event in an escore onto a rhythmic grid.

    Metadata events (whose first element is a string) pass through untouched;
    note events have their start time (index 0) rounded to the nearest grid
    point derived from the tempo and the requested subdivision.

    Args:
        escore (list): Mixed list of note events and metadata events.
        bpm (float): Tempo of the track in beats per minute.
        quantize_level_str (str): Grid subdivision, e.g. "1/8", "1/16", "1/32".

    Returns:
        list: The escore with note onsets quantized (notes are modified in place).
    """
    print(f" - Quantizing notes to {quantize_level_str} at {bpm:.2f} BPM...")

    # Number of grid steps per quarter-note beat for each supported level.
    divisions_per_beat = {
        "1/4": 1.0,
        "1/8": 2.0,
        "1/12": 3.0,  # triplet feel: 3 notes per beat
        "1/16": 4.0,
        "1/24": 6.0,  # triplet feel: 6 notes per beat
        "1/32": 8.0,
        "1/64": 16.0,
    }
    steps_per_beat = divisions_per_beat.get(quantize_level_str)
    if not steps_per_beat:
        print(" - Invalid quantization level. Skipping.")
        return escore

    # One beat lasts 60000/bpm milliseconds; divide it into grid steps.
    grid_step_ms = (60000.0 / bpm) / steps_per_beat

    snapped = []
    snapped_count = 0
    for ev in escore:
        if isinstance(ev[0], (int, float)):
            # Round the onset to the closest multiple of the grid step.
            ev[0] = int(round(ev[0] / grid_step_ms) * grid_step_ms)
            snapped_count += 1
        snapped.append(ev)

    print(f" - Quantized {snapped_count} notes.")
    return snapped
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def filter_spurious_notes_escore(escore, max_dur_ms=50, max_vel=20):
    """
    Drop notes that are both very short AND very quiet (likely noise).

    A note is removed only when its duration (index 1) is below *max_dur_ms*
    and its velocity (index 3) is below *max_vel*; metadata events are kept.

    Args:
        escore (list): Mixed list of note events and metadata events.
        max_dur_ms (int): Duration ceiling (ms) for the spurious test.
        max_vel (int): Velocity ceiling for the spurious test.

    Returns:
        list: New escore, time-sorted, with spurious notes removed.
    """
    print(f" - Filtering spurious notes (duration < {max_dur_ms}ms AND velocity < {max_vel})...")

    # Partition the score in a single pass instead of two comprehensions.
    notes, metadata = [], []
    for ev in escore:
        (notes if isinstance(ev[0], (int, float)) else metadata).append(ev)

    # De Morgan form of "keep unless (too short AND too quiet)".
    survivors = [n for n in notes if n[1] >= max_dur_ms or n[3] >= max_vel]

    print(f" - Removed {len(notes) - len(survivors)} spurious notes.")

    # Merge back with metadata and restore chronological order; metadata
    # events carry their time at index 1, note events at index 0.
    merged = metadata + survivors
    merged.sort(key=lambda ev: ev[1] if isinstance(ev[0], str) else ev[0])
    return merged
|
| 315 |
+
|
| 316 |
+
def process_velocity_escore(escore, mode="None", smooth_factor=0.5, compress_min=30, compress_max=100):
    """
    Apply velocity smoothing or range compression to an escore's notes.

    "Smooth" blends each interior note's velocity (index 3) towards the
    average of its two neighbours; "Compress" linearly remaps the observed
    velocity range onto [compress_min, compress_max]. Results are always
    clamped to the valid MIDI velocity range 1..127.

    Args:
        escore (list): Mixed list of note events and metadata events.
        mode (str): "Smooth", "Compress", or "None" (no-op).
        smooth_factor (float): Blend weight, 0 = unchanged, 1 = pure neighbour average.
        compress_min (int): Lower bound of the compressed range.
        compress_max (int): Upper bound of the compressed range.

    Returns:
        list: Time-sorted escore with processed velocities (notes are
        modified in place).
    """
    if mode == "None":
        return escore

    print(f" - Processing velocities with mode: {mode}...")

    notes, metadata = [], []
    for ev in escore:
        (notes if isinstance(ev[0], (int, float)) else metadata).append(ev)

    if not notes:
        return escore

    vels = [n[3] for n in notes]

    if mode == "Smooth":
        updated = list(vels)
        # Endpoints keep their original velocity; only interior notes blend.
        # Neighbour values are read from the ORIGINAL list so the pass is
        # order-independent.
        for idx in range(1, len(vels) - 1):
            neighbour_mean = (vels[idx - 1] + vels[idx + 1]) / 2.0
            blended = (vels[idx] * (1 - smooth_factor)) + (neighbour_mean * smooth_factor)
            updated[idx] = int(max(1, min(127, blended)))
        for note, vel in zip(notes, updated):
            note[3] = vel
        print(f" - Smoothed {len(notes)} velocities.")

    elif mode == "Compress":
        lo, hi = min(vels), max(vels)
        # All velocities identical: nothing to remap, avoid divide-by-zero.
        if hi == lo:
            return escore
        for note in notes:
            # Linear mapping from the observed range onto the target range.
            remapped = compress_min + (note[3] - lo) * \
                       (compress_max - compress_min) / (hi - lo)
            note[3] = int(max(1, min(127, remapped)))
        print(f" - Compressed {len(notes)} velocities to range [{compress_min}, {compress_max}].")

    merged = metadata + notes
    merged.sort(key=lambda ev: ev[1] if isinstance(ev[0], str) else ev[0])
    return merged
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def stabilize_midi_rhythm(escore,
                          ioi_threshold_ratio=0.30,
                          min_ioi_ms=30,
                          enable_segmentation=True,
                          silence_split_threshold_s=2.0):
    """
    Removes or merges rhythmically unstable notes from an escore list.

    This is designed to clean up MIDI generated by basic-pitch with multiple pitch bends,
    which can create clusters of very short, dense notes to approximate a slide.
    This version can segment the MIDI based on silence before processing, making it robust
    for files containing multiple songs with different tempos (like an album).

    Args:
        escore (list): The list of events, which can include notes and metadata strings.
        ioi_threshold_ratio (float): The ratio of the median IOI below which a note is considered unstable.
        min_ioi_ms (int): An absolute minimum IOI in milliseconds.
        enable_segmentation (bool): If True, splits the notes into segments based on silence.
        silence_split_threshold_s (float): The duration of silence in seconds to define a new segment.

    Returns:
        list: The cleaned escore with unstable notes removed or merged, and metadata preserved.
    """
    # 1. Separate note events from metadata events based on the type of the first element.
    note_events = [note for note in escore if isinstance(note[0], (int, float))]
    metadata_events = [meta for meta in escore if not isinstance(meta[0], (int, float))]

    # Only proceed if there are enough notes to estimate a stable rhythm from.
    if len(note_events) < 20:
        print(" - Rhythm stabilization skipped: not enough notes to analyze.")
        return escore

    print(" - Running rhythm stabilization...")

    # Sorting by onset is critical: inter-onset intervals are only meaningful in time order.
    note_events.sort(key=lambda x: x[0])

    # 2. Segment the notes based on silence if enabled.
    segments = []
    if enable_segmentation and len(note_events) > 1:
        print(f" - Segmentation enabled (silence > {silence_split_threshold_s}s).")
        current_segment = [note_events[0]]
        silence_threshold_ms = silence_split_threshold_s * 1000

        for i in range(1, len(note_events)):
            # Gap is measured from the END of the previous note to the START of this one.
            prev_note_end_ms = note_events[i-1][0] + note_events[i-1][1]
            gap_ms = note_events[i][0] - prev_note_end_ms

            if gap_ms > silence_threshold_ms:
                if current_segment:
                    segments.append(current_segment)
                current_segment = []  # Start a new segment

            current_segment.append(note_events[i])

        if current_segment:
            segments.append(current_segment)  # Add the last segment
        print(f" - Split MIDI into {len(segments)} segment(s) for individual processing.")
    else:
        # If segmentation is disabled, treat the entire file as a single segment.
        segments = [note_events]

    # 3. Process each segment individually.
    all_cleaned_notes = []
    total_merged_count = 0

    for i, segment in enumerate(segments):
        if len(segment) < 20:  # Skip stabilization for very short segments
            all_cleaned_notes.extend(segment)
            continue

        # Inter-onset intervals within this segment only.
        iois = [segment[j][0] - segment[j-1][0] for j in range(1, len(segment))]
        # Chord notes share an onset, producing zero IOIs; exclude them from the median.
        positive_iois = [ioi for ioi in iois if ioi > 0]

        if not positive_iois:
            all_cleaned_notes.extend(segment)
            continue

        median_ioi = np.median(positive_iois)
        # Merge threshold: the greater of the ratio-based value or the absolute minimum.
        threshold_ms = max(median_ioi * ioi_threshold_ratio, min_ioi_ms)

        # Deep copies so merging never mutates the caller's note lists.
        cleaned_segment = [copy.deepcopy(segment[0])]
        notes_merged_in_segment = 0

        for j in range(1, len(segment)):
            current_note = segment[j]
            last_kept_note = cleaned_segment[-1]

            # IOI relative to the last *accepted* note, and pitch proximity to
            # avoid absorbing unrelated grace notes.
            actual_ioi = current_note[0] - last_kept_note[0]
            pitch_difference = abs(current_note[2] - last_kept_note[2])

            if actual_ioi < threshold_ms and pitch_difference < 5:
                notes_merged_in_segment += 1
                # BUGFIX: extend the kept note to the LATER of the two end times.
                # The old code always used the merged-in note's end, which could
                # SHORTEN the kept note when the short note ended earlier.
                last_end = last_kept_note[0] + last_kept_note[1]
                merged_end = max(last_end, current_note[0] + current_note[1])
                last_kept_note[1] = merged_end - last_kept_note[0]
            else:
                # Note is rhythmically stable, so we keep it.
                cleaned_segment.append(copy.deepcopy(current_note))

        if len(segments) > 1:
            print(f" - Segment {i+1}: Median IOI {median_ioi:.2f}ms, merged {notes_merged_in_segment} notes.")

        all_cleaned_notes.extend(cleaned_segment)
        total_merged_count += notes_merged_in_segment

    if total_merged_count > 0:
        print(f" - Rhythm stabilization complete. Total merged notes: {total_merged_count}.")

    # 4. Recombine metadata with the globally cleaned notes and re-sort.
    # The sort key must handle both event types: metadata time is at index 1,
    # note time is at index 0.
    final_escore = metadata_events + all_cleaned_notes
    final_escore.sort(key=lambda event: event[1] if isinstance(event[0], str) else event[0])

    return final_escore
|
| 503 |
+
|
| 504 |
+
|
| 505 |
def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
|
| 506 |
"""
|
| 507 |
Analyzes raw audio data to dynamically determine optimal parameters for basic-pitch.
|
|
|
|
| 2140 |
o[1] *= 200
|
| 2141 |
o[2] *= 200
|
| 2142 |
|
| 2143 |
+
# --- MIDI Post-Processing & Correction Block ---
|
| 2144 |
+
if getattr(params, 'enable_midi_corrections', False):
|
| 2145 |
+
print("Applying MIDI Post-Processing & Corrections...")
|
| 2146 |
+
|
| 2147 |
+
# Filter spurious notes first to clean the data for other processes
|
| 2148 |
+
if getattr(params, 'correction_filter_spurious_notes', False):
|
| 2149 |
+
output_score = filter_spurious_notes_escore(
|
| 2150 |
+
output_score,
|
| 2151 |
+
max_dur_ms=getattr(params, 'correction_spurious_duration_ms', 50),
|
| 2152 |
+
max_vel=getattr(params, 'correction_spurious_velocity', 20)
|
| 2153 |
+
)
|
| 2154 |
+
|
| 2155 |
+
# Then, stabilize rhythm on the cleaned notes
|
| 2156 |
+
if getattr(params, 'correction_remove_abnormal_rhythm', False):
|
| 2157 |
+
output_score = stabilize_midi_rhythm(
|
| 2158 |
+
output_score,
|
| 2159 |
+
enable_segmentation=getattr(params, 'correction_rhythm_stab_by_segment', False),
|
| 2160 |
+
silence_split_threshold_s=getattr(params, 'correction_rhythm_stab_segment_silence_s', 1.0)
|
| 2161 |
+
)
|
| 2162 |
+
|
| 2163 |
+
# Then, quantize the stabilized rhythm
|
| 2164 |
+
quantize_level = getattr(params, 'correction_quantize_level', "None")
|
| 2165 |
+
if quantize_level != "None":
|
| 2166 |
+
try:
|
| 2167 |
+
# We need to get the BPM for quantization. We do this once here.
|
| 2168 |
+
midi_obj_for_bpm = pretty_midi.PrettyMIDI(input_midi_path)
|
| 2169 |
+
estimated_bpm = midi_obj_for_bpm.estimate_tempo()
|
| 2170 |
+
output_score = quantize_escore(output_score, estimated_bpm, quantize_level)
|
| 2171 |
+
except Exception as e:
|
| 2172 |
+
print(f" - Could not estimate BPM for quantization. Skipping. Error: {e}")
|
| 2173 |
+
|
| 2174 |
+
# Finally, process velocity as it doesn't affect timing or notes
|
| 2175 |
+
velocity_mode = getattr(params, 'correction_velocity_mode', "None")
|
| 2176 |
+
if velocity_mode != "None":
|
| 2177 |
+
output_score = process_velocity_escore(
|
| 2178 |
+
output_score,
|
| 2179 |
+
mode=velocity_mode,
|
| 2180 |
+
smooth_factor=getattr(params, 'correction_velocity_smooth_factor', 0.5),
|
| 2181 |
+
compress_min=getattr(params, 'correction_velocity_compress_min', 30),
|
| 2182 |
+
compress_max=getattr(params, 'correction_velocity_compress_max', 100)
|
| 2183 |
+
)
|
| 2184 |
+
print("Corrections finished.")
|
| 2185 |
+
print('=' * 70)
|
| 2186 |
+
|
| 2187 |
print('Final adjustments complete.')
|
| 2188 |
print('=' * 70)
|
| 2189 |
|
|
|
|
| 3810 |
updates[enable_advanced_separation] = gr.update(visible=is_demucs, value=False)
|
| 3811 |
return updates
|
| 3812 |
|
| 3813 |
+
# Event listener for the velocity processing mode dropdown
def update_velocity_options(mode):
    """Show only the velocity control(s) relevant to the selected mode."""
    return {
        correction_velocity_smooth_factor: gr.update(visible=(mode == "Smooth")),
        velocity_compress_sliders: gr.update(visible=(mode == "Compress")),
    }
|
| 3821 |
+
|
| 3822 |
# --- Use the dataclass to define the master list of parameter keys ---
|
| 3823 |
# This is now the single source of truth for parameter order.
|
| 3824 |
ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]]
|
|
|
|
| 4061 |
info="Quantizes the score to a fixed bar length. 'Start Times' aligns onsets. "
|
| 4062 |
"'Durations' trims notes at the bar line. 'Split Durations' splits notes that cross the bar line."
|
| 4063 |
)
|
| 4064 |
+
with gr.Accordion("EXPERIMENTAL: MIDI Post-Processing & Correction Tools", open=False):
|
| 4065 |
+
enable_midi_corrections = gr.Checkbox(
|
| 4066 |
+
label="Enable MIDI Correction Suite",
|
| 4067 |
+
value=False,
|
| 4068 |
+
info="Master switch for all post-processing tools below. Use these to clean up and refine the transcribed MIDI before rendering."
|
| 4069 |
+
)
|
| 4070 |
+
with gr.Group(visible=False) as midi_correction_settings:
|
| 4071 |
+
# --- Spurious Note Filtering Group ---
|
| 4072 |
+
with gr.Group():
|
| 4073 |
+
correction_filter_spurious_notes = gr.Checkbox(
|
| 4074 |
+
label="Filter Spurious (Noise) Notes",
|
| 4075 |
+
value=True,
|
| 4076 |
+
info="Removes very short, quiet notes that are likely transcription errors from background noise."
|
| 4077 |
+
)
|
| 4078 |
+
with gr.Row():
|
| 4079 |
+
correction_spurious_duration_ms = gr.Slider(
|
| 4080 |
+
10, 200, value=50, step=5,
|
| 4081 |
+
label="Max Duration (ms)",
|
| 4082 |
+
info="Notes shorter than this duration..."
|
| 4083 |
+
)
|
| 4084 |
+
correction_spurious_velocity = gr.Slider(
|
| 4085 |
+
1, 50, value=20, step=1,
|
| 4086 |
+
label="Max Velocity",
|
| 4087 |
+
info="...and quieter than this velocity will be removed."
|
| 4088 |
+
)
|
| 4089 |
+
# --- stabilize rhythm on the cleaned notes ---
|
| 4090 |
+
with gr.Group():
|
| 4091 |
+
correction_remove_abnormal_rhythm = gr.Checkbox(label="Stabilize Rhythm (for Pitch Bend)", value=False,
|
| 4092 |
+
info="Attempts to merge overly dense, rhythmically unstable notes often created when 'Allow Multiple Pitch Bends' is used. This can clean up the rhythm but may lose some pitch slide nuance.")
|
| 4093 |
+
with gr.Group(visible=False) as rhythm_stab_options: # This group is initially hidden
|
| 4094 |
+
correction_rhythm_stab_by_segment = gr.Checkbox(label="Enable Segmentation by Silence", value=False,
|
| 4095 |
+
info="Highly recommended for albums or long files. Splits the MIDI by silent parts before stabilizing rhythm, ensuring accuracy for songs with different tempos.")
|
| 4096 |
+
correction_rhythm_stab_segment_silence_s = gr.Slider(minimum=0.5, maximum=10.0, value=1.0, step=0.5,
|
| 4097 |
+
label="Silence Threshold for Segmentation (seconds)",
|
| 4098 |
+
info="The amount of silence required to start a new segment. 1-3 seconds is usually enough to separate songs on an album.")
|
| 4099 |
+
# --- Quantization Group ---
|
| 4100 |
+
with gr.Group():
|
| 4101 |
+
correction_quantize_level = gr.Dropdown(
|
| 4102 |
+
["None", "1/64", "1/32", "1/16", "1/8", "1/4", "1/24", "1/12"],
|
| 4103 |
+
value="None",
|
| 4104 |
+
label="Quantize Rhythm",
|
| 4105 |
+
info="Quantizes notes to the nearest rhythmic grid line. '1/16' is recommended for most pop and rock music. For expressive genres like classical or jazz, use with caution as it may reduce natural timing nuances. Straight divisions (1/8, 1/16, etc.) suit most modern music, while swing divisions (1/12, 1/24) are ideal for jazz, blues, or shuffle styles."
|
| 4106 |
+
)
|
| 4107 |
+
# --- Velocity Processing Group ---
|
| 4108 |
+
with gr.Group():
|
| 4109 |
+
correction_velocity_mode = gr.Dropdown(
|
| 4110 |
+
["None", "Smooth", "Compress"],
|
| 4111 |
+
value="None",
|
| 4112 |
+
label="Process Velocity",
|
| 4113 |
+
info="'Smooth' reduces sudden jumps in volume. 'Compress' scales all velocities into a specific range."
|
| 4114 |
+
)
|
| 4115 |
+
with gr.Group() as velocity_options_group: # This group will have its visibility toggled
|
| 4116 |
+
correction_velocity_smooth_factor = gr.Slider(
|
| 4117 |
+
0.0, 1.0, value=0.5, step=0.05,
|
| 4118 |
+
label="Smoothing Factor",
|
| 4119 |
+
info="Controls the amount of smoothing. 0 = no change, 1 = full averaging with neighbors.",
|
| 4120 |
+
visible=False # Initially hidden
|
| 4121 |
+
)
|
| 4122 |
+
with gr.Row(visible=False) as velocity_compress_sliders: # Initially hidden
|
| 4123 |
+
correction_velocity_compress_min = gr.Slider(1, 127, value=30, step=1, label="Target Min Velocity")
|
| 4124 |
+
correction_velocity_compress_max = gr.Slider(1, 127, value=100, step=1, label="Target Max Velocity")
|
| 4125 |
|
| 4126 |
with gr.Column(scale=1):
|
| 4127 |
# --- 8-bit Synthesizer Settings ---
|
|
|
|
| 4530 |
s8bit_ui_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_')]
|
| 4531 |
s8bit_ui_components = [ui_component_map[key] for key in s8bit_ui_keys]
|
| 4532 |
|
| 4533 |
+
# Create a separate list containing only the 13 controls to be updated
|
| 4534 |
s8bit_control_components = [comp for comp in s8bit_ui_components if comp != s8bit_preset_selector]
|
| 4535 |
|
| 4536 |
# The list of basic_pitch UI components that can be updated by its preset selector.
|
|
|
|
| 4672 |
inputs=s8bit_enable_delay,
|
| 4673 |
outputs=delay_settings_box
|
| 4674 |
)
|
| 4675 |
+
# Event listener to show/hide the main correction settings group
|
| 4676 |
+
enable_midi_corrections.change(
|
| 4677 |
+
fn=lambda x: gr.update(visible=x),
|
| 4678 |
+
inputs=enable_midi_corrections,
|
| 4679 |
+
outputs=midi_correction_settings
|
| 4680 |
+
)
|
| 4681 |
+
# Event listener to show/hide the rhythm stabilization sub-options
|
| 4682 |
+
correction_remove_abnormal_rhythm.change(
|
| 4683 |
+
fn=lambda x: gr.update(visible=x),
|
| 4684 |
+
inputs=correction_remove_abnormal_rhythm,
|
| 4685 |
+
outputs=rhythm_stab_options
|
| 4686 |
+
)
|
| 4687 |
+
correction_velocity_mode.change(
|
| 4688 |
+
fn=update_velocity_options,
|
| 4689 |
+
inputs=correction_velocity_mode,
|
| 4690 |
+
outputs=[correction_velocity_smooth_factor, velocity_compress_sliders]
|
| 4691 |
+
)
|
| 4692 |
|
| 4693 |
# Launch the Gradio app
|
| 4694 |
app.queue().launch(inbrowser=True, debug=True)
|