|
import argparse |
|
import os |
|
import soundfile |
|
from typing import NoReturn |
|
|
|
import musdb |
|
import numpy as np |
|
|
|
from bytesep.utils import load_audio |
|
|
|
|
|
def create_evaluation(args) -> None:
    r"""Randomly mix VCTK speech with MUSDB18 music and write out the audios
    for evaluation.

    For every evaluation segment one speech clip and one music track are
    drawn at random (the random seed is fixed, so repeated runs select the
    same material). A music excerpt with the same number of samples as the
    speech clip is cut from a random position of the track. The speech, the
    music excerpt and their sum are written to
    ``<evaluation_audios_dir>/test/{speech,music,mixture}/NNNN.wav``.

    Args:
        args: namespace (e.g., from argparse) with the following attributes:
            vctk_dataset_dir: str, the directory of the VCTK dataset; clips
                are read from ``<vctk_dataset_dir>/wav48/test/<speaker>/``
            musdb18_dataset_dir: str, the directory of the MUSDB18 dataset
            evaluation_audios_dir: str, the directory to write out randomly
                selected and mixed audio segments
            sample_rate: int, sample rate the speech is loaded with and all
                output files are written with
            channels: int, e.g., 1 | 2
            evaluation_segments_num: int, number of segments to create

    Returns:
        None
    """

    vctk_dataset_dir = args.vctk_dataset_dir
    musdb18_dataset_dir = args.musdb18_dataset_dir
    evaluation_audios_dir = args.evaluation_audios_dir
    sample_rate = args.sample_rate
    channels = args.channels
    evaluation_segments_num = args.evaluation_segments_num
    mono = channels == 1

    split = 'test'

    # Fixed seed so that the evaluation set is reproducible.
    random_state = np.random.RandomState(1234)

    audios_dir = os.path.join(vctk_dataset_dir, "wav48", split)

    for source_type in ['speech', 'music', 'mixture']:
        output_dir = os.path.join(evaluation_audios_dir, split, source_type)
        os.makedirs(output_dir, exist_ok=True)

    # Collect the paths of all speech clips. Sorting keeps the candidate
    # order (and therefore the seeded random selection) independent of the
    # file system's listing order.
    speech_audio_paths = []

    for speaker_id in sorted(os.listdir(audios_dir)):
        speaker_audios_dir = os.path.join(audios_dir, speaker_id)

        for audio_name in sorted(os.listdir(speaker_audios_dir)):
            speech_audio_paths.append(os.path.join(speaker_audios_dir, audio_name))

    mus = musdb.DB(root=musdb18_dataset_dir, subsets=[split])
    track_indexes = np.arange(len(mus.tracks))

    for n in range(evaluation_segments_num):

        print('{} / {}'.format(n, evaluation_segments_num))

        # ---- Speech ----
        speech_audio_path = random_state.choice(speech_audio_paths)

        speech_audio = load_audio(
            audio_path=speech_audio_path, mono=mono, sample_rate=sample_rate
        )
        # NOTE(review): the code below indexes speech_audio as
        # (channels_num, audio_samples) - confirm against load_audio.

        if channels == 2:
            # Presumably the speech clip has a single channel; duplicate it
            # so that it matches the two-channel music.
            speech_audio = np.tile(speech_audio, (2, 1))

        output_speech_path = os.path.join(
            evaluation_audios_dir, split, 'speech', '{:04d}.wav'.format(n)
        )
        soundfile.write(
            file=output_speech_path, data=speech_audio.T, samplerate=sample_rate
        )
        print("Write out to {}".format(output_speech_path))

        # ---- Music ----
        track_index = random_state.choice(track_indexes)
        track = mus[track_index]

        # Read the track once; it is sliced along axis 0, i.e. it is treated
        # as (audio_samples, channels_num).
        # NOTE(review): the music is not resampled, so `sample_rate` is
        # presumably expected to equal the native MUSDB18 rate - confirm.
        track_audio = track.audio

        segment_samples = speech_audio.shape[1]

        # Pick the excerpt start anywhere the whole segment still fits into
        # the track. (The previous bound was `segment_samples -
        # speech_audio.shape[1]`, which is always 0, so every excerpt was cut
        # from the very beginning of the track.)
        max_start_sample = max(track_audio.shape[0] - segment_samples, 0)
        start_sample = random_state.randint(0, max_start_sample + 1)

        music_audio = track_audio[start_sample : start_sample + segment_samples, :].T
        # (channels_num, audio_samples)

        if music_audio.shape[1] < segment_samples:
            # The track is shorter than the speech clip: pad with silence so
            # that speech and music can be summed sample by sample.
            pad_samples = segment_samples - music_audio.shape[1]
            music_audio = np.pad(
                music_audio, ((0, 0), (0, pad_samples)), mode='constant'
            )

        if mono and music_audio.shape[0] > 1:
            # Down-mix the music so that the music and mixture files have the
            # requested single channel, like the speech file.
            music_audio = np.mean(music_audio, axis=0, keepdims=True)

        output_music_path = os.path.join(
            evaluation_audios_dir, split, 'music', '{:04d}.wav'.format(n)
        )
        soundfile.write(
            file=output_music_path, data=music_audio.T, samplerate=sample_rate
        )
        print("Write out to {}".format(output_music_path))

        # ---- Mixture ----
        mixture_audio = speech_audio + music_audio

        output_mixture_path = os.path.join(
            evaluation_audios_dir, split, 'mixture', '{:04d}.wav'.format(n)
        )
        soundfile.write(
            file=output_mixture_path, data=mixture_audio.T, samplerate=sample_rate
        )
        print("Write out to {}".format(output_mixture_path))
|
|
|
|
|
if __name__ == "__main__":
    # Command-line interface: every option is required. Each entry is
    # (flag, value type, help text); the order below is the order in which
    # the options are registered with the parser.
    argument_specs = [
        ("--vctk_dataset_dir", str, "The directory of the VCTK dataset."),
        ("--musdb18_dataset_dir", str, "The directory of the MUSDB18 dataset."),
        (
            "--evaluation_audios_dir",
            str,
            "The directory to write out randomly selected and mixed audio segments.",
        ),
        ("--sample_rate", int, "Sample rate"),
        ("--channels", int, "Audio channels, e.g, 1 or 2."),
        (
            "--evaluation_segments_num",
            int,
            "The number of segments to create for evaluation.",
        ),
    ]

    parser = argparse.ArgumentParser()

    for flag, value_type, help_text in argument_specs:
        parser.add_argument(flag, type=value_type, required=True, help=help_text)

    # Parse the command line and build the evaluation set.
    create_evaluation(parser.parse_args())
|
|