Update ctcalign.py
Browse files- ctcalign.py +13 -15
ctcalign.py
CHANGED
@@ -57,6 +57,10 @@ d = {"Icelandic": {'model': is_model, 'processor': is_processor, 'inverse_dict':
|
|
57 |
|
58 |
|
59 |
|
|
|
|
|
|
|
|
|
60 |
|
61 |
#------------------------------------------
|
62 |
# forced alignment with ctc decoder
|
@@ -97,7 +101,6 @@ def get_trellis(emission, tokens, blank_id):
|
|
97 |
return trellis
|
98 |
|
99 |
|
100 |
-
|
101 |
@dataclass
|
102 |
class Point:
|
103 |
token_index: int
|
@@ -111,8 +114,9 @@ class Segment:
|
|
111 |
end: int
|
112 |
score: float
|
113 |
|
114 |
-
|
115 |
-
|
|
|
116 |
|
117 |
@property
|
118 |
def length(self):
|
@@ -183,7 +187,8 @@ def merge_words(segments, separator):
|
|
183 |
if i1 != i2:
|
184 |
segs = segments[i1:i2]
|
185 |
word = "".join([seg.label for seg in segs])
|
186 |
-
|
|
|
187 |
i1 = i2 + 1
|
188 |
i2 = i1
|
189 |
else:
|
@@ -205,19 +210,11 @@ def readwav(wav_path):
|
|
205 |
return wav
|
206 |
|
207 |
|
208 |
-
#convert frame-numbers to timestamps in seconds
|
209 |
-
# w2v2 step size is about 20ms, or 50 frames per second
|
210 |
-
def f2s(fr):
|
211 |
-
return fr/50
|
212 |
-
|
213 |
-
def fmt(frame_aligns):
|
214 |
-
return [(label,f2s(start),f2s(end)) for label,start,end in frame_aligns]
|
215 |
-
|
216 |
# generate mfa format for character (phone) and word alignments
|
217 |
def mfalike(chars,wds):
|
218 |
hed = ['Begin,End,Label,Type,Speaker\n']
|
219 |
-
wlines = [f'{
|
220 |
-
slines = [f'{
|
221 |
return (''.join(hed+wlines+slines))
|
222 |
|
223 |
|
@@ -243,5 +240,6 @@ def langsalign(wav_path,transcript_string,lang):
|
|
243 |
words = merge_words(segments, d[lang]['model_word_separator'])
|
244 |
|
245 |
#segments = [s for s in segments if s[0] != model_word_separator]
|
246 |
-
|
|
|
247 |
|
|
|
57 |
|
58 |
|
59 |
|
60 |
+
# Convert frame numbers to timestamps in seconds.
# w2v2 step size is about 20 ms, i.e. about 50 frames per second.
def f2s(fr, frames_per_second=50):
    """Convert a frame number to a timestamp in seconds.

    Args:
        fr: Frame number (or frame count) to convert.
        frames_per_second: Number of frames per second of audio.
            Defaults to 50, matching the ~20 ms step size noted above.

    Returns:
        ``fr / frames_per_second``, i.e. the position of the frame in seconds.
    """
    return fr / frames_per_second
|
64 |
|
65 |
#------------------------------------------
|
66 |
# forced alignment with ctc decoder
|
|
|
101 |
return trellis
|
102 |
|
103 |
|
|
|
104 |
@dataclass
|
105 |
class Point:
|
106 |
token_index: int
|
|
|
114 |
end: int
|
115 |
score: float
|
116 |
|
117 |
+
@property
def mfaform(self):
    """Return this segment as a ``begin,end,label`` string.

    ``start`` and ``end`` are passed through ``f2s`` before formatting.
    ``mfalike`` appends the remaining ``Type`` and ``Speaker`` columns to
    this string to build a full output row.
    """
    # Each value needs its own, properly closed replacement field; the
    # previous version left the first and last braces unbalanced, which
    # is a SyntaxError.
    return f"{f2s(self.start)},{f2s(self.end)},{self.label}"
|
120 |
|
121 |
@property
|
122 |
def length(self):
|
|
|
187 |
if i1 != i2:
|
188 |
segs = segments[i1:i2]
|
189 |
word = "".join([seg.label for seg in segs])
|
190 |
+
score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
|
191 |
+
words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
|
192 |
i1 = i2 + 1
|
193 |
i2 = i1
|
194 |
else:
|
|
|
210 |
return wav
|
211 |
|
212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
# generate mfa format for character (phone) and word alignments
|
214 |
def mfalike(chars,wds):
    """Build an MFA-like CSV string from word and character alignments.

    The output starts with the ``Begin,End,Label,Type,Speaker`` header,
    followed by one ``words`` row per item in ``wds`` and then one
    ``phones`` row per item in ``chars``. Each row is the item's
    ``mfaform`` value plus the type and the fixed speaker id ``000``.
    """
    rows = ['Begin,End,Label,Type,Speaker\n']
    # Word rows come first, then the per-character ("phones") rows.
    for word in wds:
        rows.append(f'{word.mfaform},words,000\n')
    for char in chars:
        rows.append(f'{char.mfaform},phones,000\n')
    return ''.join(rows)
|
219 |
|
220 |
|
|
|
240 |
words = merge_words(segments, d[lang]['model_word_separator'])
|
241 |
|
242 |
#segments = [s for s in segments if s[0] != model_word_separator]
|
243 |
+
print(segments)
|
244 |
+
return mfalike(segments,words)
|
245 |
|