cati committed on
Commit
69d94dc
1 Parent(s): 8d8a9b2
Files changed (2)
  1. app.py +3 -1
  2. ctcalign.py +187 -0
app.py CHANGED
@@ -29,7 +29,9 @@ with bl:
  # Long and short Icelandic vowels
  Choose a word, speaker group, and aligner type. Available speaker groups are native speakers, second-language speakers, or all. Aligner options are Montreal Forced Aligner (MFA) and CTC decoding with Wav2vec-2.0.
 
- The general expectation is that syllables with long stressed vowels followed by short consonants have a higher vowel:consonant duration ratio, while syllables with short stressed vowels followed by long consonants have a lower vowel:consonant ratio. However, a great many other factors affect the relative duration in any one recorded token. See Pind 1999, 'Speech segment durations and quantity in Icelandic' (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration. All phoneme durations are measured automatically with no human correction. The purpose of this demo is to evaluate the role of such tools in large-scale phonetic research. Therefore, no measurements shown in this demo should be taken as conclusive without some independent verification.
+ The general expectation is that syllables with long stressed vowels followed by short consonants have a higher vowel:consonant duration ratio, while syllables with short stressed vowels followed by long consonants have a lower vowel:consonant ratio. However, a great many other factors affect the relative duration in any one recorded token. See Pind 1999, 'Speech segment durations and quantity in Icelandic' (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration.
+
+ All phoneme durations are measured automatically with no human correction. The purpose of this demo is to evaluate the role of such tools in large-scale phonetic research. Therefore, no measurements shown in this demo should be taken as conclusive without some independent verification.
  """
  )
 
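As a gloss on the ratio described in this docstring: a minimal illustrative sketch, not code from this commit, of how a vowel:consonant duration ratio could be computed from the (label, start, end) segment tuples that the aligners produce (the segment values below are made up):

# illustrative only: ratio of stressed-vowel duration to following-consonant duration
def vc_ratio(vowel_seg, consonant_seg):
    v_dur = vowel_seg[2] - vowel_seg[1]          # vowel duration in seconds
    c_dur = consonant_seg[2] - consonant_seg[1]  # consonant duration in seconds
    return v_dur / c_dur

vc_ratio(("a", 0.10, 0.26), ("t", 0.26, 0.31))   # 0.16/0.05 = 3.2 (long V + short C -> ratio > 1)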
ctcalign.py ADDED
@@ -0,0 +1,187 @@
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import torch
+ import soundfile as sf
+ import numpy as np
+ from scipy import signal  # needed by readwav() for resampling
+
+ #------------------------------------------
+ # setup wav2vec2
+ #------------------------------------------
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch.random.manual_seed(0)
+
+ # info: https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h/blob/main/vocab.json
+ MODEL_PATH="/work/caitlinr/w2vrec/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
+ model_blank_token = '[PAD]' # important to know for CTC decoding
+ model_word_separator = '|'
+ labels_dict = {"f": 0, "a": 1, "é": 2, "t": 3, "o": 4, "n": 5, "e": 6, "y": 8, "k": 9, "j": 10, "u": 11, "d": 12, "w": 13, "l": 14, "ú": 15, "q": 16, "g": 17, "í": 18, "s": 19, "r": 20, "ý": 21, "i": 22, "z": 23, "m": 24, "h": 25, "ó": 26, "þ": 27, "æ": 28, "c": 29, "á": 30, "v": 31, "b": 32, "ð": 33, "x": 34, "ö": 35, "p": 36, "|": 7, "[UNK]": 37, "[PAD]": 38}
+
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH).to(device)
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
+ inverse_dict = {v:k for k,v in labels_dict.items()}
+ all_labels = tuple(labels_dict.keys())
+ blank_id = labels_dict[model_blank_token]
+
+
+ #------------------------------------------
+ # forced alignment with ctc decoder
+ # originally based on implementation of
+ # https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html
+ #------------------------------------------
+
+ # return the label class log-probability of each audio frame
+ def get_frame_probs(wav_path):
+     wav = readwav(wav_path)
+     with torch.inference_mode():  # similar to torch.no_grad()
+         input_values = processor(wav,sampling_rate=16000).input_values[0]
+         input_values = torch.tensor(input_values, device=device).unsqueeze(0)
+         emits = model(input_values).logits
+         emits = torch.log_softmax(emits, dim=-1)
+     emit = emits[0].cpu().detach()
+     return emit
+
+
+ def get_trellis(emission, tokens, blank_id):
+     num_frame = emission.size(0)
+     num_tokens = len(tokens)
+     # Trellis has extra dimensions for both time axis and tokens.
+     # The extra dim for tokens represents <SoS> (start-of-sentence)
+     # The extra dim for time axis is for simplification of the code.
+     trellis = torch.empty((num_frame + 1, num_tokens + 1))
+     trellis[0, 0] = 0
+     # use blank_id here: in this vocab the blank token [PAD] is index 38, not 0
+     trellis[1:, 0] = torch.cumsum(emission[:, blank_id], 0)  # len of this slice of trellis is len of audio frames
+     trellis[0, -num_tokens:] = -float("inf")                 # len of this slice of trellis is len of transcript tokens
+     trellis[-num_tokens:, 0] = float("inf")
+     for t in range(num_frame):
+         trellis[t + 1, 1:] = torch.max(
+             # Score for staying at the same token
+             trellis[t, 1:] + emission[t, blank_id],
+             # Score for changing to the next token
+             trellis[t, :-1] + emission[t, tokens],
+         )
+     return trellis
+
+
+ def backtrack(trellis, emission, tokens, blank_id):
+     # Note:
+     # j and t are indices for trellis, which has extra dimensions
+     # for time and tokens at the beginning.
+     # When referring to time frame index `T` in trellis,
+     # the corresponding index in emission is `T-1`.
+     # Similarly, when referring to token index `J` in trellis,
+     # the corresponding index in transcript is `J-1`.
+     j = trellis.size(1) - 1
+     t_start = torch.argmax(trellis[:, j]).item()
+
+     path = []
+     for t in range(t_start, 0, -1):
+         # 1. Figure out if the current position was stay or change.
+         # Note (again):
+         # `emission[t-1]` is the emission at time frame `t` of the trellis.
+         # Score for the token staying the same from time frame t-1 to t.
+         stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
+         # Score for changing from token j-1 at t-1 to token j at t.
+         changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
+
+         # 2. Store the path with frame-wise probability.
+         # (use blank_id rather than 0: the blank token [PAD] is not at index 0 in this vocab)
+         prob = emission[t - 1, tokens[j - 1] if changed > stayed else blank_id].exp().item()
+         # Store token index and time index in non-trellis coordinates.
+         path.append((j - 1, t - 1, prob))
+
+         # 3. Update the token.
+         if changed > stayed:
+             j -= 1
+             if j == 0:
+                 break
+     else:
+         raise ValueError("Failed to align")
+     return path[::-1]
+
+
+ def merge_repeats(path,transcript):
+     i1, i2 = 0, 0
+     segments = []
+     while i1 < len(path):
+         while i2 < len(path) and path[i1][0] == path[i2][0]: # while both path steps point to the same token index
+             i2 += 1
+         # when i2 finally switches to a different token, append to the list of segments:
+         segments.append(
+             (transcript[path[i1][0]],  # the token from i1,
+              path[i1][1],              # the time of the first path-point of that token,
+              path[i2 - 1][1] + 1,      # and the time of the final path-point for that token.
+             )
+         )
+         i1 = i2
+     return segments
+
+
+ def merge_words(segments, separator):
+     words = []
+     i1, i2 = 0, 0
+     while i1 < len(segments):
+         if i2 >= len(segments) or segments[i2][0] == separator:
+             if i1 != i2:
+                 segs = segments[i1:i2]
+                 word = "".join([seg[0] for seg in segs])
+                 words.append((word, segments[i1][1], segments[i2 - 1][2]))
+             i1 = i2 + 1
+             i2 = i1
+         else:
+             i2 += 1
+     return words
+
+
+ #------------------------------------------
+ # handle in/out/etc.
+ #------------------------------------------
+
+ def readwav(wav_path):
+     wav, sr = sf.read(wav_path, dtype=np.float32)
+     if len(wav.shape) == 2:
+         wav = wav.mean(1)                 # downmix stereo to mono
+     if sr != 16000:
+         wlen = int(wav.shape[0] / sr * 16000)
+         wav = signal.resample(wav, wlen)  # resample to 16 kHz for wav2vec2
+     return wav
+
+
+ # convert frame numbers to timestamps in seconds
+ # w2v2 step size is about 20ms, or 50 frames per second
+ def f2s(fr):
+     return fr/50
+
+ def fmt(frame_aligns):
+     return [(label,f2s(start),f2s(end)) for label,start,end in frame_aligns]
+
+
+ # prepare the input transcript text string
+ # TODO:
+ # handle input strings that still have punctuation,
+ # or that have characters not present in labels_dict
+ def prep_transcript(xcp):
+     xcp = xcp.lower()
+     while '  ' in xcp:                # collapse runs of spaces
+         xcp = xcp.replace('  ', ' ')
+     xcp = xcp.replace(' ',model_word_separator)
+     label_ids = [labels_dict[c] for c in xcp]
+     return xcp, label_ids
+
+
+ def ctcalign(wav_path,transcript_string):
+     norm_txt, rec_label_ids = prep_transcript(transcript_string)
+     emit = get_frame_probs(wav_path)
+     trellis = get_trellis(emit, rec_label_ids, blank_id)
+     path = backtrack(trellis, emit, rec_label_ids, blank_id)
+     segments = merge_repeats(path,norm_txt)
+     words = merge_words(segments, model_word_separator)
+
+     #segments = [s for s in segments if s[0] != model_word_separator]
+     return fmt(segments), fmt(words)
+
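For reference, a minimal usage sketch of the module above; the wav path and transcript are placeholders rather than files from this repo, and the transcript must contain only characters present in labels_dict (see the TODO in prep_transcript). ctcalign() returns two lists of (label, start_sec, end_sec) tuples, one per character segment and one per word:

# hypothetical usage; "example.wav" and the transcript are placeholders
from ctcalign import ctcalign

segments, words = ctcalign("example.wav", "hver var að hlæja")
for label, start, end in words:
    print(f"{label}\t{start:.2f}\t{end:.2f}")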