Spaces:

clr
/

phonalign

App Files Files Community

clr committed on Mar 21, 2023

Commit

ff0ee4d

•

1 Parent(s): 60edeed

Update ctcalign.py

Browse files

Files changed (1) hide show

ctcalign.py +13 -15

ctcalign.py CHANGED Viewed

@@ -57,6 +57,10 @@ d = {"Icelandic": {'model': is_model, 'processor': is_processor, 'inverse_dict':
 #------------------------------------------
 # forced alignment with ctc decoder
@@ -97,7 +101,6 @@ def get_trellis(emission, tokens, blank_id):
     return trellis
 @dataclass
 class Point:
     token_index: int
@@ -111,8 +114,9 @@ class Segment:
     end: int
     score: float
-    def __repr__(self):
-        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"
     @property
     def length(self):
@@ -183,7 +187,8 @@ def merge_words(segments, separator):
             if i1 != i2:
                 segs = segments[i1:i2]
                 word = "".join([seg.label for seg in segs])
-                words.append((word, segments[i1].start, segments[i2 - 1].end))
             i1 = i2 + 1
             i2 = i1
         else:
@@ -205,19 +210,11 @@ def readwav(wav_path):
     return wav
-#convert frame-numbers to timestamps in seconds
-# w2v2 step size is about 20ms, or 50 frames per second
-def f2s(fr):
-	return fr/50
-def fmt(frame_aligns):
-	return [(label,f2s(start),f2s(end)) for label,start,end in frame_aligns]
 # generate mfa format for character (phone) and word alignments
 def mfalike(chars,wds):
 	hed = ['Begin,End,Label,Type,Speaker\n']
-	wlines = [f'{s},{e},{w},words,000\n' for w,s,e in wds]
-	slines = [f'{s},{e},{sg},phones,000\n' for sg,s,e in chars]
 	return (''.join(hed+wlines+slines))
@@ -243,5 +240,6 @@ def langsalign(wav_path,transcript_string,lang):
 	words = merge_words(segments, d[lang]['model_word_separator'])
 	#segments = [s for s in segments if s[0] != model_word_separator]
-	return mfalike(fmt(segments), fmt(words))

+#convert frame-numbers to timestamps in seconds
+# w2v2 step size is about 20ms, or 50 frames per second
+def f2s(fr):
+	return fr/50
 #------------------------------------------
 # forced alignment with ctc decoder
     return trellis
 @dataclass
 class Point:
     token_index: int
     end: int
     score: float
+    @property
+    def mfaform(self):
+        return f"{f2s(self.start),{f2s(self.end)},{self.label}"
     @property
     def length(self):
             if i1 != i2:
                 segs = segments[i1:i2]
                 word = "".join([seg.label for seg in segs])
+                score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
+                words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
             i1 = i2 + 1
             i2 = i1
         else:
     return wav
 # generate mfa format for character (phone) and word alignments
 def mfalike(chars,wds):
 	hed = ['Begin,End,Label,Type,Speaker\n']
+	wlines = [f'{w.mfaform},words,000\n' for w in wds]
+	slines = [f'{ch.mfaform},phones,000\n' for ch in chars]
 	return (''.join(hed+wlines+slines))
 	words = merge_words(segments, d[lang]['model_word_separator'])
 	#segments = [s for s in segments if s[0] != model_word_separator]
+    print(segments)
+	return mfalike(segments,words)