# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
import random
import shutil
import subprocess
from collections import defaultdict
from glob import glob

import torch
import torchaudio
from tqdm import tqdm

from utils.cut_by_vad import cut_segments
from utils.mfa_prepare import (
    process_wav_files,
    get_wav_files,
    filter_wav_files_by_length,
)
from utils.util import has_existed
from utils.whisper_transcription import asr_main


def librilight_statistics(data_dir):
    """Get statistics for librilight dataset"""
    distribution2speakers2utts = defaultdict(lambda: defaultdict(list))
    distribution_infos = glob(data_dir + "/*")

    for distribution_info in distribution_infos:
        distribution = distribution_info.split("/")[-1]
        print(distribution)

        speaker_infos = glob(distribution_info + "/*")
        if len(speaker_infos) == 0:
            continue

        for speaker_info in speaker_infos:
            speaker = speaker_info.split("/")[-1]
            utts = glob(speaker_info + "/*.wav")
            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                distribution2speakers2utts[distribution][speaker].append(uid)

    return distribution2speakers2utts


def get_speakers_from_directory(directory):
    """Return the sub-directory names (speaker IDs) under `directory`."""
    return [
        d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))
    ]


def split_dataset_by_speaker(base_dir, train_ratio=0.8, dev_ratio=0.1):
    """Shuffle speaker directories and move them into train/dev/eval splits."""
    train_dir = os.path.join(base_dir, "train")
    dev_dir = os.path.join(base_dir, "dev")
    eval_dir = os.path.join(base_dir, "eval")

    # Check if dataset is already split
    if has_existed(train_dir) or has_existed(dev_dir) or has_existed(eval_dir):
        print("Dataset already split. Calculating speakers...")
        train_speakers = get_speakers_from_directory(train_dir)
        dev_speakers = get_speakers_from_directory(dev_dir)
        eval_speakers = get_speakers_from_directory(eval_dir)
        all_speakers = train_speakers + dev_speakers + eval_speakers
        unique_speakers = list(set(all_speakers))
        unique_speakers.sort()
        return unique_speakers

    # List all directories in the base directory
    all_speakers = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]
    random.shuffle(all_speakers)

    # Calculate split sizes
    total_speakers = len(all_speakers)
    train_size = int(total_speakers * train_ratio)
    dev_size = int(total_speakers * dev_ratio)
    eval_size = total_speakers - train_size - dev_size
    print("Total speakers:", total_speakers)
    print("Train speakers:", train_size)
    print("Dev speakers:", dev_size)
    print("Eval speakers:", eval_size)

    # Split directories
    train_speakers = all_speakers[:train_size]
    dev_speakers = all_speakers[train_size : train_size + dev_size]
    eval_speakers = all_speakers[train_size + dev_size :]

    # Function to move directories
    def move_speakers(speakers, target_dir):
        for speaker in speakers:
            shutil.move(
                os.path.join(base_dir, speaker), os.path.join(target_dir, speaker)
            )

    # Create the target split directories first so shutil.move performs a cheap
    # rename instead of falling back to copying each speaker directory.
    for split_dir in (train_dir, dev_dir, eval_dir):
        os.makedirs(split_dir, exist_ok=True)

    # Move directories
    print("Moving directories...")
    print("Moving Train speakers...")
    move_speakers(train_speakers, train_dir)
    print("Moving Dev speakers...")
    move_speakers(dev_speakers, dev_dir)
    print("Moving Eval speakers...")
    move_speakers(eval_speakers, eval_dir)

    unique_speakers = list(set(all_speakers))
    unique_speakers.sort()
    return unique_speakers


def save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers):
    """Save metadata for librilight dataset"""
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    valid_output_file = os.path.join(save_dir, "dev.json")
    test_output_file = os.path.join(save_dir, "eval.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")

    if has_existed(train_output_file):
        print("Metadata already exists. Skipping...")
        return

    utt2singer = open(utt2singer_file, "w")

    train = []
    test = []
    valid = []

    train_index_count = 0
    test_index_count = 0
    valid_index_count = 0

    train_total_duration = 0
    test_total_duration = 0
    valid_total_duration = 0

    # Save metadata
    for distribution, speakers2utts in tqdm(distribution2speakers2utts.items()):
        for speaker, utts in tqdm(speakers2utts.items()):
            for chosen_uid in utts:
                res = {
                    "Dataset": "librilight",
                    "Singer": speaker,
                    "Uid": "{}#{}#{}".format(distribution, speaker, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(distribution, speaker, chosen_uid)
                res["Path"] = os.path.join(processed_dir, res["Path"])
                assert os.path.exists(res["Path"])

                text_file_path = os.path.join(
                    processed_dir,
                    distribution,
                    speaker,
                    chosen_uid + ".txt",
                )
                with open(text_file_path, "r") as f:
                    lines = f.readlines()
                    assert len(lines) == 1
                    text = lines[0].strip()
                    res["Text"] = text

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if "train" in distribution:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1
                elif "dev" in distribution:
                    res["index"] = valid_index_count
                    valid_total_duration += duration
                    valid.append(res)
                    valid_index_count += 1
                elif "eval" in distribution:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("Done!")
    print(
        "Utterance count: train = {}, dev = {}, eval = {}".format(
            len(train), len(valid), len(test)
        )
    )
    print(
        "Duration (hours): train = {:.2f}, dev = {:.2f}, eval = {:.2f}".format(
            train_total_duration / 3600,
            valid_total_duration / 3600,
            test_total_duration / 3600,
        )
    )

    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(valid_output_file, "w") as f:
        json.dump(valid, f, indent=4, ensure_ascii=False)

    utt2singer.close()

    singer_lut = {name: i for i, name in enumerate(speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    print("Metadata saved to", save_dir)


def main(output_path, dataset_path, cfg):
    """Preprocess librilight dataset"""
    n_cpus = cfg.n_cpus  # number of cpus to use for preprocessing
    n_gpus = cfg.n_gpus  # number of gpus to use for transcription
    cut_length = cfg.cut_length  # target length of utterance in seconds
    max_length = cfg.max_length  # max length of utterance in seconds

    # MFA files
    mfa_config_path = cfg.mfa_config_path  # path to mfa config file
    mfa_dict_path = cfg.mfa_dict_path  # path to mfa dict file
    mfa_model_path = cfg.mfa_model_path  # path to mfa model file

    # check if mfa files exist
    if (
        not os.path.exists(mfa_dict_path)
        or not os.path.exists(mfa_model_path)
        or not os.path.exists(mfa_config_path)
    ):
        raise Exception("MFA files not found.")

    # Whisper model id
    model_id = cfg.whisper_model_id  # id of whisper model to use for transcription

    subsets = [
        d
        for d in os.listdir(dataset_path)
        if (
            os.path.isdir(os.path.join(dataset_path, d))
            and d in ["tiny", "small", "medium", "large"]
        )
    ]
    print("Found subsets:", subsets)

    if len(subsets) == 0:
        print("No subsets found. Exiting...")
Exiting...") return # Preprocess each subset for subset in subsets: # Construct paths based on the base path print("Pre-proccessing Libri-light subset:", subset) raw_dir = f"{dataset_path}/{subset}" save_dir = f"{output_path}/{subset}" processed_dir = f"{dataset_path}/processed/{subset}" os.makedirs(processed_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True) # Step 1: Segmentation print("-" * 10) print("Step 1: Segmentation") print("Cutting audio files...") cut_segments(raw_dir, processed_dir, cut_length, n_cpus) # Steps 2 & 3: Filter and Preprocess print("-" * 10) print("Step 2 & 3: Filter and Preprocess") print("Filtering and preprocessing audio files...") wav_files = get_wav_files(processed_dir) filtered_wav_files = filter_wav_files_by_length(wav_files, max_length) process_wav_files(filtered_wav_files, processed_dir, n_cpus) # Step 4 & 5: Transcription & Text-preprocess print("-" * 10) print("Step 4 & 5: Transcription & Text-preprocess") print("Transcribing audio files...") n_gpus = min(n_gpus, torch.cuda.device_count()) asr_main(processed_dir, n_gpus, model_id) # Step 6: MFA Align print("-" * 10) print("Step 6: MFA Align") print("Aligning audio files...") command = [ "mfa", "align", "-v", "-j", str(n_cpus), "-c", mfa_config_path, processed_dir, mfa_dict_path, mfa_model_path, processed_dir, "--output_format", "long_textgrid", "--clean", "--overwrite", ] subprocess.run(command, text=True) # Step 7: train/dev/eval split print("-" * 10) print("Step 7: train/dev/eval split") print("Splitting dataset by speaker...") speakers = split_dataset_by_speaker(processed_dir) # Step 8: Statistics print("-" * 10) print("Step 8: Statistics") print("Calculating statistics...") distribution2speakers2utts = librilight_statistics(processed_dir) # Step 9: Save metadata print("-" * 10) print("Step 9: Save metadata") print("Preparing Metadata for Librilight...") save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers) print("Preprocessing subset", subset, "done!") print("-" * 10) if __name__ == "__main__": dataset_path = "/path/to/dataset/librilight" output_path = "/path/to/output" main(output_path, dataset_path)