|
import glob
import html
import json
import re
import unicodedata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_segs(tx_path):
    """Load one speaker's transcript and convert each segment to a tuple.

    Args:
        tx_path: Path to a transcript JSON file whose top-level object has
            a 'segments' list.

    Returns:
        A list holding one get_sent() result per segment, in file order.
    """
    # JSON is UTF-8 by specification; relying on the platform's locale
    # encoding would mangle (or fail on) non-ASCII transcript text.
    with open(tx_path, 'r', encoding='utf-8') as handle:
        segments = json.load(handle)['segments']

    return [get_sent(segment) for segment in segments]
|
|
|
|
|
def get_sent(sent):
    """Flatten one transcript segment into (start_ms, end_ms, text).

    Args:
        sent: Segment dict with a 'words' list of word dicts (each with a
            'word' string) and segment-level 'startTime' / 'endTime'
            values written as seconds with an 's' suffix, e.g. '1.250s'.
            When a segment-level time is None or absent, the first word's
            'startTime' / last word's 'endTime' is used instead.

    Returns:
        Tuple (start, end, text): start and end in integer milliseconds,
        text being the word strings concatenated without any separator.

    Raises:
        IndexError: If a segment-level time is unavailable and 'words' is
            empty, so there is no word timing to fall back on.
    """
    def ms(t):
        # round() rather than int(): float('1.001') * 1000 evaluates to
        # 1000.9999999999999, which truncation would turn into 1000.
        return round(float(t.replace('s', '')) * 1000)

    words = sent['words']
    text = ''.join(wd['word'] for wd in words)

    # .get() treats a missing key exactly like an explicit None.
    st = sent.get('startTime')
    if st is None:
        st = words[0]['startTime']

    et = sent.get('endTime')
    if et is None:
        et = words[-1]['endTime']

    return (ms(st), ms(et), text)
|
|
|
|
|
def html_line(match_line, url):
    """Render one search hit as an HTML paragraph linking to its line.

    Args:
        match_line: Tuple (text, speaker, index) describing the hit.
        url: Page URL; the index is appended to it as the '#' fragment.

    Returns:
        A string of the form
        '<p>(speaker) [index] <a href="url#index">text</a></p>'
        in which every interpolated value has been HTML-escaped.
    """
    text, speaker, index = match_line

    # Transcript text and URLs can contain '&', '<', '>' or quotes;
    # escaping keeps them from corrupting the generated markup.
    href = html.escape(f'{url}#{index}', quote=True)
    label = html.escape(str(text), quote=False)
    who = html.escape(str(speaker), quote=False)
    idx = html.escape(str(index), quote=False)

    return f'<p>({who}) [{idx}] <a href="{href}">{label}</a></p>'
|
|
|
|
|
def snorm(s):
    """Normalise text for matching.

    Drops every character whose Unicode category starts with 'P'
    (punctuation), lower-cases the remaining characters one by one, and
    collapses each run of spaces into a single space. Leading and
    trailing spaces are kept (reduced to one space each).

    Args:
        s: Text to normalise.

    Returns:
        The normalised string.
    """
    kept = ''.join(
        c.lower() for c in s if not unicodedata.category(c).startswith('P')
    )
    # A single regex pass collapses space runs (removing punctuation can
    # leave several spaces in a row). A loop that tests for and replaces
    # the same one-space string would never terminate.
    return re.sub(r' {2,}', ' ', kept)
|
|
|
|
|
|
|
|
|
def search_convos(corpus_dir, base_url, output_path, search_func, search_string=None):
    """Search every conversation in a corpus and write the hits as HTML.

    Each immediate subdirectory of corpus_dir is one conversation, named
    after the directory. Its two transcripts
    ('speaker_a_convo_<name>_transcript.json' and the 'speaker_b'
    equivalent) are loaded with get_segs(), merged, ordered by start time
    and numbered from 0; search_func is then run over those lines.

    Args:
        corpus_dir: Corpus root. Must end with '/', because paths are
            built by plain string concatenation.
        base_url: URL prefix of the conversation pages; '<name>.html' is
            appended to form each link target.
        output_path: File that receives the HTML report (overwritten).
        search_func: Callable (lines, search_string) returning the
            matching (text, speaker, index) tuples, where lines is the
            list of such tuples for one conversation.
        search_string: Passed through unchanged to search_func.

    Returns:
        None. The report is written to output_path; conversations without
        matches contribute nothing to it.
    """
    names = sorted(
        path.split(corpus_dir)[1].split('/')[0]
        for path in glob.glob(corpus_dir + '*/')
    )

    chunks = []
    for convo in names:
        convo_url = f'{base_url}{convo}.html'
        txa = f'{corpus_dir}{convo}/speaker_a_convo_{convo}_transcript.json'
        txb = f'{corpus_dir}{convo}/speaker_b_convo_{convo}_transcript.json'

        timed = [(s, e, w, 'a') for s, e, w in get_segs(txa)]
        timed += [(s, e, w, 'b') for s, e, w in get_segs(txb)]
        # list.sort is stable: on equal start times speaker a stays first.
        timed.sort(key=lambda seg: seg[0])

        # Keep only what the search needs: text, speaker, running index.
        lines = [(w, sk, ix) for ix, (_, _, w, sk) in enumerate(timed)]

        matches = search_func(lines, search_string)
        if matches:
            chunks.append(f'<h4>{convo}</h4>')
            chunks.append('\n'.join(html_line(m, convo_url) for m in matches))
            chunks.append('<hr />')

    # Explicit UTF-8 so non-ASCII transcript text is written the same way
    # regardless of the platform's locale encoding.
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(''.join(chunks))
|
|
|
|
|
def simple_search1(convo, search_string):
    """Find lines whose normalised text contains the normalised query.

    Args:
        convo: Iterable of (text, speaker, index) tuples.
        search_string: Substring to look for; normalised with snorm()
            before comparison.

    Returns:
        List of (normalised_text, speaker, index) for every matching
        line. Note that the text in each hit is the snorm() form, not the
        original.
    """
    needle = snorm(search_string)

    hits = []
    for text, speaker, line_no in convo:
        cleaned = snorm(text)
        if needle in cleaned:
            hits.append((cleaned, speaker, line_no))
    return hits
|
|
|
|
|
def regex_search1(convo, search_rx):
    """Find lines whose normalised text matches a regular expression.

    Args:
        convo: Iterable of (text, speaker, index) tuples.
        search_rx: Regular expression (string or compiled pattern),
            searched for anywhere in the snorm() form of each line.

    Returns:
        List of the matching (text, speaker, index) tuples, with the text
        left in its original, un-normalised form.

    Raises:
        re.error: If search_rx is not a valid pattern (raised before any
            line is examined).
    """
    # Compile once, and use search() as the existence test so scanning
    # stops at the first match instead of collecting every match.
    pattern = re.compile(search_rx)
    return [(w, sk, ln) for w, sk, ln in convo if pattern.search(snorm(w))]
|
|
|
|
|
if __name__ == "__main__":
    # Where the conversations live, where their pages are published, and
    # where the report goes.
    corpus_dir = './full_conversations/'
    base_url = 'https://clr-spjall.static.hf.space/pages/'
    output_path = './tmp.html'

    # Regex search: lines whose normalised text begins with one word
    # followed by ' sem'.
    search_func = regex_search1
    search_string = r'^\w+ sem'

    search_convos(
        corpus_dir=corpus_dir,
        base_url=base_url,
        output_path=output_path,
        search_func=search_func,
        search_string=search_string,
    )
|
|
|
|