clr committed on
Commit
ff0ee4d
1 Parent(s): 60edeed

Update ctcalign.py

Browse files
Files changed (1) hide show
  1. ctcalign.py +13 -15
ctcalign.py CHANGED
@@ -57,6 +57,10 @@ d = {"Icelandic": {'model': is_model, 'processor': is_processor, 'inverse_dict':
57
 
58
 
59
 
 
 
 
 
60
 
61
  #------------------------------------------
62
  # forced alignment with ctc decoder
@@ -97,7 +101,6 @@ def get_trellis(emission, tokens, blank_id):
97
  return trellis
98
 
99
 
100
-
101
  @dataclass
102
  class Point:
103
  token_index: int
@@ -111,8 +114,9 @@ class Segment:
111
  end: int
112
  score: float
113
 
114
- def __repr__(self):
115
- return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"
 
116
 
117
  @property
118
  def length(self):
@@ -183,7 +187,8 @@ def merge_words(segments, separator):
183
  if i1 != i2:
184
  segs = segments[i1:i2]
185
  word = "".join([seg.label for seg in segs])
186
- words.append((word, segments[i1].start, segments[i2 - 1].end))
 
187
  i1 = i2 + 1
188
  i2 = i1
189
  else:
@@ -205,19 +210,11 @@ def readwav(wav_path):
205
  return wav
206
 
207
 
208
- #convert frame-numbers to timestamps in seconds
209
- # w2v2 step size is about 20ms, or 50 frames per second
210
- def f2s(fr):
211
- return fr/50
212
-
213
- def fmt(frame_aligns):
214
- return [(label,f2s(start),f2s(end)) for label,start,end in frame_aligns]
215
-
216
  # generate mfa format for character (phone) and word alignments
217
  def mfalike(chars,wds):
218
  hed = ['Begin,End,Label,Type,Speaker\n']
219
- wlines = [f'{s},{e},{w},words,000\n' for w,s,e in wds]
220
- slines = [f'{s},{e},{sg},phones,000\n' for sg,s,e in chars]
221
  return (''.join(hed+wlines+slines))
222
 
223
 
@@ -243,5 +240,6 @@ def langsalign(wav_path,transcript_string,lang):
243
  words = merge_words(segments, d[lang]['model_word_separator'])
244
 
245
  #segments = [s for s in segments if s[0] != model_word_separator]
246
- return mfalike(fmt(segments), fmt(words))
 
247
 
 
57
 
58
 
59
 
60
+ #convert frame-numbers to timestamps in seconds
61
+ # w2v2 step size is about 20ms, or 50 frames per second
62
+ def f2s(fr):
63
+ return fr/50
64
 
65
  #------------------------------------------
66
  # forced alignment with ctc decoder
 
101
  return trellis
102
 
103
 
 
104
  @dataclass
105
  class Point:
106
  token_index: int
 
114
  end: int
115
  score: float
116
 
117
+ @property
118
+ def mfaform(self):
119
+ return f"{f2s(self.start),{f2s(self.end)},{self.label}"
120
 
121
  @property
122
  def length(self):
 
187
  if i1 != i2:
188
  segs = segments[i1:i2]
189
  word = "".join([seg.label for seg in segs])
190
+ score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
191
+ words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
192
  i1 = i2 + 1
193
  i2 = i1
194
  else:
 
210
  return wav
211
 
212
 
 
 
 
 
 
 
 
 
213
  # generate mfa format for character (phone) and word alignments
214
  def mfalike(chars,wds):
215
  hed = ['Begin,End,Label,Type,Speaker\n']
216
+ wlines = [f'{w.mfaform},words,000\n' for w in wds]
217
+ slines = [f'{ch.mfaform},phones,000\n' for ch in chars]
218
  return (''.join(hed+wlines+slines))
219
 
220
 
 
240
  words = merge_words(segments, d[lang]['model_word_separator'])
241
 
242
  #segments = [s for s in segments if s[0] != model_word_separator]
243
+ print(segments)
244
+ return mfalike(segments,words)
245