# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import torchaudio
import torch


from utils.mfa_prepare import (
    process_wav_files,
    get_wav_files,
    filter_wav_files_by_length,
)
from utils.cut_by_vad import cut_segments
from utils.whisper_transcription import asr_main
from utils.util import has_existed

import subprocess
import random
from collections import defaultdict
from glob import glob
import shutil


def librilight_statistics(data_dir):
    """Get statistics for librilight dataset"""
    distribution2speakers2utts = defaultdict(lambda: defaultdict(list))
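    # Expected layout: data_dir/<distribution>/<speaker>/<uid>.wav, where the
    # distributions are the train/dev/eval folders created by
    # split_dataset_by_speaker; collect the utterance ids into a nested
    # {distribution: {speaker: [uid, ...]}} mapping.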
    distribution_infos = glob(data_dir + "/*")
    for distribution_info in distribution_infos:
        distribution = distribution_info.split("/")[-1]
        print(distribution)
        speaker_infos = glob(distribution_info + "/*")
        if len(speaker_infos) == 0:
            continue
        for speaker_info in speaker_infos:
            speaker = speaker_info.split("/")[-1]
            utts = glob(speaker_info + "/*.wav")
            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                distribution2speakers2utts[distribution][speaker].append(uid)
    return distribution2speakers2utts


def get_speakers_from_directory(directory):
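    """Return the names of the immediate subdirectories (speakers) of `directory`."""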
    return [
        d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))
    ]


def split_dataset_by_speaker(base_dir, train_ratio=0.8, dev_ratio=0.1):
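    """Split the speaker folders under base_dir into train/dev/eval subsets.

    Speaker directories are moved into base_dir/{train,dev,eval} according to
    train_ratio and dev_ratio (the remainder becomes eval). If the split already
    exists, it is reused. Returns the sorted list of all speaker names.
    """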
    train_dir = os.path.join(base_dir, "train")
    dev_dir = os.path.join(base_dir, "dev")
    eval_dir = os.path.join(base_dir, "eval")

    # Check if dataset is already split
    if has_existed(train_dir) or has_existed(dev_dir) or has_existed(eval_dir):
        print("Dataset already split. Calculating speakers...")
        train_speakers = get_speakers_from_directory(train_dir)
        dev_speakers = get_speakers_from_directory(dev_dir)
        eval_speakers = get_speakers_from_directory(eval_dir)
        all_speakers = train_speakers + dev_speakers + eval_speakers
        unique_speakers = list(set(all_speakers))
        unique_speakers.sort()
        return unique_speakers

    # List all directories in the base directory
    all_speakers = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]
    random.shuffle(all_speakers)

    # Calculate split sizes
    total_speakers = len(all_speakers)
    train_size = int(total_speakers * train_ratio)
    dev_size = int(total_speakers * dev_ratio)
    eval_size = total_speakers - train_size - dev_size
    print("Total speakers:", total_speakers)
    print("Train speakers:", train_size)
    print("Dev speakers:", dev_size)
    print("Eval speakers:", eval_size)

    # Split directories
    train_speakers = all_speakers[:train_size]
    dev_speakers = all_speakers[train_size : train_size + dev_size]
    eval_speakers = all_speakers[train_size + dev_size :]

    # Function to move directories
    def move_speakers(speakers, target_dir):
        os.makedirs(target_dir, exist_ok=True)
        for speaker in speakers:
            shutil.move(
                os.path.join(base_dir, speaker), os.path.join(target_dir, speaker)
            )

    # Move directories
    print("Moving directories...")
    print("Moving Train speakers...")
    move_speakers(train_speakers, train_dir)
    print("Moving Dev speakers...")
    move_speakers(dev_speakers, dev_dir)
    print("Moving Eval speakers...")
    move_speakers(eval_speakers, eval_dir)

    unique_speakers = list(set(all_speakers))
    unique_speakers.sort()
    return unique_speakers


def save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers):
    """Save metadata for librilight dataset"""
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    valid_output_file = os.path.join(save_dir, "dev.json")
    test_output_file = os.path.join(save_dir, "eval.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        print("Metadata already exists. Skipping...")
        return
    utt2singer = open(utt2singer_file, "w")

    train = []
    test = []
    valid = []

    train_index_count = 0
    test_index_count = 0
    valid_index_count = 0

    train_total_duration = 0
    test_total_duration = 0
    valid_total_duration = 0

    # Save metadata
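    # Walk every (distribution, speaker, uid) triple and build one metadata
    # entry per utterance, with its transcript and duration.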
    for distribution, speakers2utts in tqdm(distribution2speakers2utts.items()):
        for speaker, utts in tqdm(speakers2utts.items()):
            for chosen_uid in utts:
                res = {
                    "Dataset": "librilight",
                    "Singer": speaker,
                    "Uid": "{}#{}#{}".format(distribution, speaker, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(distribution, speaker, chosen_uid)
                res["Path"] = os.path.join(processed_dir, res["Path"])
                assert os.path.exists(res["Path"])

                text_file_path = os.path.join(
                    processed_dir,
                    distribution,
                    speaker,
                    chosen_uid + ".txt",
                )
                with open(text_file_path, "r") as f:
                    lines = f.readlines()
                    assert len(lines) == 1
                    text = lines[0].strip()
                    res["Text"] = text

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if "train" in distribution:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1
                elif "dev" in distribution:
                    res["index"] = valid_index_count
                    valid_total_duration += duration
                    valid.append(res)
                    valid_index_count += 1
                elif "eval" in distribution:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
    print("Done!")
    print(
        "Utterance count: train = {}, dev = {}, eval = {}".format(
            len(train), len(valid), len(test)
        )
    )
    print(
        "Duration (hours): train = {:.2f}, dev = {:.2f}, eval = {:.2f}".format(
            train_total_duration / 3600,
            valid_total_duration / 3600,
            test_total_duration / 3600,
        )
    )
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(valid_output_file, "w") as f:
        json.dump(valid, f, indent=4, ensure_ascii=False)
    utt2singer.close()
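    # Build a speaker-name -> integer-id lookup table and save it as singers.json.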
    singer_lut = {name: i for i, name in enumerate(speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
    print("Metadata saved to", save_dir)


def main(output_path, dataset_path, cfg):
    """Preprocess librilight dataset"""
    n_cpus = cfg.n_cpus  # number of cpus to use for preprocessing
    n_gpus = cfg.n_gpus  # number of gpus to use for transcription
    cut_length = cfg.cut_length  # target length of utterance in seconds
    max_length = cfg.max_length  # max length of utterance in seconds

    # MFA files
    mfa_config_path = cfg.mfa_config_path  # path to mfa config file
    mfa_dict_path = cfg.mfa_dict_path  # path to mfa dict file
    mfa_model_path = cfg.mfa_model_path  # path to mfa model file

    # check if mfa files exist
    if (
        not os.path.exists(mfa_dict_path)
        or not os.path.exists(mfa_model_path)
        or not os.path.exists(mfa_config_path)
    ):
        raise Exception("MFA files not found.")

    # Whisper model id
    model_id = cfg.whisper_model_id  # id of whisper model to use for transcription

    subsets = [
        d
        for d in os.listdir(dataset_path)
        if (
            os.path.isdir(os.path.join(dataset_path, d))
            and d in ["tiny", "small", "medium", "large"]
        )
    ]
    print("Found subsets:", subsets)

    if len(subsets) == 0:
        print("No subsets found. Exiting...")
        return
    # Preprocess each subset
    for subset in subsets:
        # Construct paths based on the base path
        print("Pre-processing Libri-light subset:", subset)
        raw_dir = f"{dataset_path}/{subset}"
        save_dir = f"{output_path}/{subset}"
        processed_dir = f"{dataset_path}/processed/{subset}"
        os.makedirs(processed_dir, exist_ok=True)
        os.makedirs(save_dir, exist_ok=True)

        # Step 1: Segmentation
        print("-" * 10)
        print("Step 1: Segmentation")
        print("Cutting audio files...")

        cut_segments(raw_dir, processed_dir, cut_length, n_cpus)

        # Steps 2 & 3: Filter and Preprocess
        print("-" * 10)
        print("Step 2 & 3: Filter and Preprocess")
        print("Filtering and preprocessing audio files...")

        wav_files = get_wav_files(processed_dir)
        filtered_wav_files = filter_wav_files_by_length(wav_files, max_length)
        process_wav_files(filtered_wav_files, processed_dir, n_cpus)

        # Step 4 & 5: Transcription & Text-preprocess
        print("-" * 10)
        print("Step 4 & 5: Transcription & Text-preprocess")
        print("Transcribing audio files...")

        n_gpus = min(n_gpus, torch.cuda.device_count())
        asr_main(processed_dir, n_gpus, model_id)

        # Step 6: MFA Align
        print("-" * 10)
        print("Step 6: MFA Align")
        print("Aligning audio files...")
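        # Run Montreal Forced Aligner on the processed corpus: processed_dir is
        # both the corpus and the output directory, aligned against the supplied
        # pronunciation dictionary and acoustic model; -j sets the number of
        # parallel jobs, and alignments are written as long-format TextGrids.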

        command = [
            "mfa",
            "align",
            "-v",
            "-j",
            str(n_cpus),
            "-c",
            mfa_config_path,
            processed_dir,
            mfa_dict_path,
            mfa_model_path,
            processed_dir,
            "--output_format",
            "long_textgrid",
            "--clean",
            "--overwrite",
        ]
        subprocess.run(command, text=True)

        # Step 7: train/dev/eval split
        print("-" * 10)
        print("Step 7: train/dev/eval split")
        print("Splitting dataset by speaker...")

        speakers = split_dataset_by_speaker(processed_dir)

        # Step 8: Statistics
        print("-" * 10)
        print("Step 8: Statistics")
        print("Calculating statistics...")

        distribution2speakers2utts = librilight_statistics(processed_dir)

        # Step 9: Save metadata
        print("-" * 10)
        print("Step 9: Save metadata")
        print("Preparing Metadata for Librilight...")

        save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers)
        print("Preprocessing subset", subset, "done!")
        print("-" * 10)


if __name__ == "__main__":
    dataset_path = "/path/to/dataset/librilight"
    output_path = "/path/to/output"
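    # NOTE: the cfg below is an illustrative placeholder (a minimal assumed
    # config), not the project's config loader; it only exposes the fields read
    # in main(): n_cpus, n_gpus, cut_length, max_length, mfa_config_path,
    # mfa_dict_path, mfa_model_path, and whisper_model_id. Adapt the values to
    # the actual setup.
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        n_cpus=8,  # placeholder: CPU workers for segmentation/preprocessing
        n_gpus=1,  # placeholder: GPUs used for Whisper transcription
        cut_length=10,  # placeholder: target segment length in seconds
        max_length=15,  # placeholder: maximum segment length in seconds
        mfa_config_path="/path/to/mfa/config.yaml",
        mfa_dict_path="/path/to/mfa/dict",
        mfa_model_path="/path/to/mfa/acoustic_model",
        whisper_model_id="/path/or/hub/id/of/whisper/model",
    )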
    main(output_path, dataset_path, cfg)