"""Benchmark phoneme recognition models on the mirfan899/phoneme_asr dataset.

Runs each configured model on a filtered subset of the dataset, computes the
phoneme error rate (PER) and inference duration per example, and writes
per-model average results as JSON files for the leaderboard.
"""

import json
import os
import time

import pandas as pd
from datasets import load_dataset, Audio

from src.utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
from src.utils.audio_process import calculate_error_rate, load_audio
from src.utils.cmu_process import clean_cmu, cmu_to_ipa


def set_output(model, pre_pho, ref_pho, duration, per, score):
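    """Assemble a single result row for one model's run on one example."""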
    return {
        "model": model,
        "phonemes": pre_pho,
        "ref_phonemes": ref_pho,
        "duration": duration,
        "PER": per,
        "score": score
    }

# Map model names to their runner functions
MODEL_RUNNERS = {
    "HuBERT-Base": run_hubert_base,
    # "Whisper": run_whisper,
    "HuBERT fine-tuned": run_model,
    "Timit": run_timit
}

def get_output(model, wav, reference_phoneme):
    """
    Run the given model, compute error rate, and return formatted output.
    """
    if model not in MODEL_RUNNERS:
        raise ValueError(f"Unknown model: {model}")

    run_func = MODEL_RUNNERS[model]
    phonemes, dur = run_func(wav)
    per, score = calculate_error_rate(reference_phoneme, phonemes)

    return set_output(model, phonemes, reference_phoneme, dur, per, score)


def benchmark_all(example):
    """
    Run all models on a single dataset example.
    """
    # Load waveform manually to avoid datasets' torchcodec dependency
    wav = load_audio(example["audio"]) 
    reference_phoneme = example["phonetic"]
    reference_phoneme = cmu_to_ipa(clean_cmu(reference_phoneme))

    # Run all models
    results = [
        get_output("HuBERT-Base", wav, reference_phoneme),
        # get_output("Whisper", wav, reference_phoneme),
        get_output("HuBERT fine-tuned", wav, reference_phoneme),
        get_output("Timit", wav, reference_phoneme),
    ]

    return pd.DataFrame(results)

def benchmark_dataset(dataset):
    """
    Run benchmark_all on each sample and compute average PER and duration per model.
    """
    all_results = []
    for example in dataset:
        df = benchmark_all(example)
        all_results.append(df)

    full_df = pd.concat(all_results, ignore_index=True)

    # Compute average PER and duration per model
    avg_stats = (
        full_df.groupby("model")[["PER", "duration"]]
        .mean()
        .reset_index()
        .rename(columns={"PER": "Average PER", "duration": "Average Duration (s)"})
    )

    return full_df, avg_stats


def main():
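    """Build a filtered evaluation subset, benchmark all models, and write per-model leaderboard JSON files."""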
    dataset = load_dataset("mirfan899/phoneme_asr", split="train")
    # Disable automatic audio decoding to avoid torchcodec requirement
    dataset = dataset.cast_column("audio", Audio(decode=False))
    field = "phonetic"

    unique_texts = dataset.unique(field)
    print("Unique phonetic strings:", len(unique_texts))

    # Deduplicate: keep only the first example for each unique phonetic string.
    # (A membership test against unique_texts would keep every row, since each value is in the set.)
    seen = set()

    def keep_first(example):
        if example[field] in seen:
            return False
        seen.add(example[field])
        return True

    dataset_unique = dataset.filter(keep_first)

    # Keep only examples with at least 10 phoneme tokens
    def is_valid(example):
        return len(example[field].split()) >= 10

    dataset_filtered = dataset_unique.filter(is_valid)

    dataset_final = dataset_filtered.shuffle(seed=42).select(range(min(100, len(dataset_filtered))))

    print(dataset_final)
    print("Final size:", len(dataset_final))
    full_results, avg_stats = benchmark_dataset(dataset_final.select(range(10)))

    print("Average Statistic per model:")
    print(avg_stats)

    # Optional: inspect detailed results
    print(full_results.head())

    # Save results for leaderboard consumption (one JSON per model)
    results_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "eval-results")
    os.makedirs(results_dir, exist_ok=True)

    timestamp = int(time.time())
    for _, row in avg_stats.iterrows():
        model_name = str(row["model"]).replace(" ", "-")
        org_model = f"local/{model_name}"
        # Missing aggregates come back as NaN from pandas, not None
        per = float(row["Average PER"]) if pd.notna(row["Average PER"]) else None
        avg_dur = float(row["Average Duration (s)"]) if pd.notna(row["Average Duration (s)"]) else None

        payload = {
            "config": {
                "model_name": org_model,
                "model_dtype": "float32",
                "model_sha": ""
            },
            "results": {
                # Populate both keys expected by Tasks to avoid NaNs in the leaderboard
                "phoneme_dev": {"per": per, "avg_duration": avg_dur},
                "phoneme_test": {"per": per, "avg_duration": avg_dur}
            }
        }

        out_path = os.path.join(results_dir, f"results_{timestamp}_{model_name}.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        print(f"Saved leaderboard result: {out_path}")


if __name__ == "__main__":
    main()