|
import gradio as gr |
|
|
|
import argparse |
|
import soundfile as sf |
|
import numpy as np |
|
import tempfile |
|
from pathlib import Path |
|
import os |
|
import subprocess |
|
import sys |
|
import re |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Matches the trailing parenthesised, whitespace-free token that ends each
# line of ``hypo.word``; it is stripped before the text is reported.
_HYPO_SUFFIX = re.compile(r"\(\S+\)$")


def _write_manifest(manifest_dir, audio_paths):
    """Write the ``dev.*`` manifest files for *audio_paths* into *manifest_dir*."""
    with open(manifest_dir / "dev.tsv", "w") as fw:
        # NOTE(review): the "/" header is presumably the root directory that
        # the listed paths are resolved against -- confirm in infer.py.
        fw.write("/\n")
        for path in audio_paths:
            # Use the context manager so the sound file handle is closed
            # as soon as its frame count has been read.
            with sf.SoundFile(path) as sound:
                fw.write(f"{path}\t{sound.frames}\n")

    count = len(audio_paths)
    with open(manifest_dir / "dev.uid", "w") as fw:
        # One id line per input file (previously the last path was repeated).
        fw.writelines(f"{path}\n" for path in audio_paths)
    # Placeholder transcripts, one line per input file.
    with open(manifest_dir / "dev.ltr", "w") as fw:
        fw.write("d u m m y | d u m m y\n" * count)
    with open(manifest_dir / "dev.wrd", "w") as fw:
        fw.write("dummy dummy\n" * count)


def process(audio, model, lang, format):
    """Transcribe audio files by running ``infer.py`` in a subprocess.

    A temporary directory is filled with ``dev.tsv``/``dev.uid``/``dev.ltr``/
    ``dev.wrd`` manifest files, ``python infer.py`` is run with that directory
    as both ``task.data`` and ``decoding.results_path``, and the resulting
    ``hypo.word`` file is read back. Each hypothesis is printed to stdout
    together with its input path and also returned.

    Args:
        audio: Iterable of audio file paths. A single ``str``/``os.PathLike``
            is treated as a one-element list.
        model: Value passed as ``common_eval.path``.
        lang: Language code; passed as ``dataset.gen_subset="<lang>:dev"``.
        format: Value passed as ``common_eval.post_process``.

    Returns:
        List of hypothesis strings in the order they appear in ``hypo.word``
        (assumed to match the input order -- TODO confirm against infer.py).
        Empty when no audio paths are given.

    Raises:
        subprocess.CalledProcessError: If ``infer.py`` exits with a non-zero
            status.
    """
    if isinstance(audio, (str, os.PathLike)):
        audio = [audio]
    # Keep the inputs under their own name; the original loop rebound
    # ``audio`` and corrupted every later use of it.
    audio_paths = [str(path) for path in audio]
    if not audio_paths:
        return []

    with tempfile.TemporaryDirectory() as tmp:
        print(">>> preparing tmp manifest dir ...", file=sys.stderr)
        tmpdir = Path(tmp)
        _write_manifest(tmpdir, audio_paths)

        # Argument list instead of a shell string: values are passed verbatim
        # and cannot be reinterpreted by a shell.
        command = [
            "python",
            "infer.py",
            "-m",
            "decoding.type=viterbi",
            "dataset.max_tokens=4000000",
            "distributed_training.distributed_world_size=1",
            f"common_eval.path='{model}'",
            f"task.data={tmpdir}",
            f"dataset.gen_subset={lang}:dev",
            f"common_eval.post_process={format}",
            f"decoding.results_path={tmpdir}",
        ]
        # Same variables the shell command used to set inline.
        env = {
            **os.environ,
            "PYTHONPATH": ".",
            "PREFIX": "INFER",
            "HYDRA_FULL_ERROR": "1",
        }

        print(">>> loading model & running inference ...", file=sys.stderr)
        # check=True surfaces a failed run directly instead of a later,
        # misleading "hypo.word not found" error.
        subprocess.run(command, env=env, stdout=subprocess.DEVNULL, check=True)

        hypotheses = []
        with open(tmpdir / "hypo.word") as fr:
            for path, line in zip(audio_paths, fr):
                hypo = _HYPO_SUFFIX.sub("", line).strip()
                print(f"===============\nInput: {path}\nOutput: {hypo}")
                hypotheses.append(hypo)
    return hypotheses


def transcribe(audio):
    """Transcribe *audio* with the fixed model/language settings below.

    Args:
        audio: Audio file path (or array/sequence of paths), or ``None``
            when there is nothing to transcribe.

    Returns:
        The hypotheses joined with newlines; ``""`` when *audio* is ``None``.
    """
    if audio is None:
        return ""

    model = "base_300m.pt"
    lang = "shi"
    post_process = "letter"
    # np.ravel turns a single path or a nested sequence into a flat array.
    hypotheses = process(np.ravel(audio), model, lang, post_process)
    return "\n".join(hypotheses)
|
|
|
# Build the demo UI and start serving it: microphone recordings are handed to
# ``transcribe`` as file paths and the result is shown in a textbox; with
# ``live=True`` no submit button is needed.
gr.Interface(
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath")],
    outputs=["textbox"],
    title="MetaAI (Facebook Research) MMS (Massively Multilingual Speech) ASR",
    live=True,
).launch()