Update ctcalign.py
Browse files- ctcalign.py +38 -14
ctcalign.py
CHANGED
@@ -3,6 +3,7 @@ import torch, torchaudio
|
|
3 |
import soundfile as sf
|
4 |
import numpy as np
|
5 |
from scipy import signal
|
|
|
6 |
|
7 |
#------------------------------------------
|
8 |
# setup wav2vec2
|
@@ -96,6 +97,29 @@ def get_trellis(emission, tokens, blank_id):
|
|
96 |
return trellis
|
97 |
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
def backtrack(trellis, emission, tokens, blank_id):
|
100 |
# Note:
|
101 |
# j and t are indices for trellis, which has extra dimensions
|
@@ -106,11 +130,10 @@ def backtrack(trellis, emission, tokens, blank_id):
|
|
106 |
# the corresponding index in transcript is `J-1`.
|
107 |
j = trellis.size(1) - 1
|
108 |
t_start = torch.argmax(trellis[:, j]).item()
|
109 |
-
|
110 |
path = []
|
111 |
for t in range(t_start, 0, -1):
|
112 |
# 1. Figure out if the current position was stay or change
|
113 |
-
# Note (again):
|
114 |
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
|
115 |
# Score for token staying the same from time frame J-1 to T.
|
116 |
stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
|
@@ -120,7 +143,7 @@ def backtrack(trellis, emission, tokens, blank_id):
|
|
120 |
# 2. Store the path with frame-wise probability.
|
121 |
prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item()
|
122 |
# Return token index and time index in non-trellis coordinate.
|
123 |
-
path.append((j - 1, t - 1, prob))
|
124 |
|
125 |
# 3. Update the token
|
126 |
if changed > stayed:
|
@@ -132,32 +155,35 @@ def backtrack(trellis, emission, tokens, blank_id):
|
|
132 |
return path[::-1]
|
133 |
|
134 |
|
135 |
-
|
136 |
def merge_repeats(path,transcript):
|
137 |
i1, i2 = 0, 0
|
138 |
segments = []
|
139 |
while i1 < len(path):
|
140 |
-
while i2 < len(path) and path[i1]
|
141 |
i2 += 1
|
|
|
142 |
segments.append( # when i2 finally switches to a different token,
|
143 |
-
|
144 |
-
|
145 |
-
path[i1]
|
146 |
-
path[i2 - 1]
|
|
|
147 |
)
|
148 |
)
|
149 |
i1 = i2
|
150 |
return segments
|
151 |
|
|
|
|
|
152 |
def merge_words(segments, separator):
|
153 |
words = []
|
154 |
i1, i2 = 0, 0
|
155 |
while i1 < len(segments):
|
156 |
-
if i2 >= len(segments) or segments[i2]
|
157 |
if i1 != i2:
|
158 |
segs = segments[i1:i2]
|
159 |
-
word = "".join([seg
|
160 |
-
words.append((word, segments[i1]
|
161 |
i1 = i2 + 1
|
162 |
i2 = i1
|
163 |
else:
|
@@ -165,8 +191,6 @@ def merge_words(segments, separator):
|
|
165 |
return words
|
166 |
|
167 |
|
168 |
-
|
169 |
-
|
170 |
#------------------------------------------
|
171 |
# handle in/out/etc.
|
172 |
#------------------------------------------
|
|
|
3 |
import soundfile as sf
|
4 |
import numpy as np
|
5 |
from scipy import signal
|
6 |
+
from dataclasses import dataclass
|
7 |
|
8 |
#------------------------------------------
|
9 |
# setup wav2vec2
|
|
|
97 |
return trellis
|
98 |
|
99 |
|
100 |
+
|
101 |
+
@dataclass
class Point:
    """One frame-level point on the backtracked CTC alignment path."""
    token_index: int  # index of the token within the transcript
    time_index: int   # emission frame index (non-trellis coordinate)
    score: float      # frame-wise posterior probability for this token
|
106 |
+
|
107 |
+
@dataclass
class Segment:
    """A run of identical tokens on the alignment path.

    Covers the half-open frame interval [start, end) with an averaged
    frame-wise score.
    """
    label: str    # the transcript character this segment represents
    start: int    # first emission frame of the run (inclusive)
    end: int      # one past the last emission frame (exclusive)
    score: float  # mean frame-wise probability across the run

    def __repr__(self):
        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        # Frame count spanned by this segment.
        return self.end - self.start
|
120 |
+
|
121 |
+
|
122 |
+
|
123 |
def backtrack(trellis, emission, tokens, blank_id):
|
124 |
# Note:
|
125 |
# j and t are indices for trellis, which has extra dimensions
|
|
|
130 |
# the corresponding index in transcript is `J-1`.
|
131 |
j = trellis.size(1) - 1
|
132 |
t_start = torch.argmax(trellis[:, j]).item()
|
133 |
+
|
134 |
path = []
|
135 |
for t in range(t_start, 0, -1):
|
136 |
# 1. Figure out if the current position was stay or change
|
|
|
137 |
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
|
138 |
# Score for token staying the same from time frame J-1 to T.
|
139 |
stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
|
|
|
143 |
# 2. Store the path with frame-wise probability.
|
144 |
prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item()
|
145 |
# Return token index and time index in non-trellis coordinate.
|
146 |
+
path.append(Point(j - 1, t - 1, prob))
|
147 |
|
148 |
# 3. Update the token
|
149 |
if changed > stayed:
|
|
|
155 |
return path[::-1]
|
156 |
|
157 |
|
|
|
158 |
def merge_repeats(path, transcript):
    """Collapse consecutive path points with the same token into Segments.

    path: list of Point objects in ascending time order, as produced by
        backtrack().
    transcript: the token sequence the path was aligned against; indexed
        by Point.token_index to recover each segment's label.
    Returns a list of Segment, one per maximal run of equal token indices,
    each scored with the mean frame-wise probability over its run.
    """
    segments = []
    start = 0
    while start < len(path):
        end = start
        # Advance `end` past every point that still refers to the same token.
        while end < len(path) and path[end].token_index == path[start].token_index:
            end += 1
        run = path[start:end]
        avg_score = sum(pt.score for pt in run) / len(run)
        segments.append(
            Segment(
                transcript[path[start].token_index],  # label for this run
                path[start].time_index,               # first frame of the run
                path[end - 1].time_index + 1,         # one past the last frame
                avg_score,
            )
        )
        start = end
    return segments
|
175 |
|
176 |
+
|
177 |
+
|
178 |
def merge_words(segments, separator):
|
179 |
words = []
|
180 |
i1, i2 = 0, 0
|
181 |
while i1 < len(segments):
|
182 |
+
if i2 >= len(segments) or segments[i2].label == separator:
|
183 |
if i1 != i2:
|
184 |
segs = segments[i1:i2]
|
185 |
+
word = "".join([seg.label for seg in segs])
|
186 |
+
words.append((word, segments[i1].start, segments[i2 - 1].end))
|
187 |
i1 = i2 + 1
|
188 |
i2 = i1
|
189 |
else:
|
|
|
191 |
return words
|
192 |
|
193 |
|
|
|
|
|
194 |
#------------------------------------------
|
195 |
# handle in/out/etc.
|
196 |
#------------------------------------------
|