Spaces:

clr
/

spjall

App Files Files Community

spjall / searcher.py

clr

offline text search

b663aa2 verified 5 months ago

raw

history blame contribute delete

3.52 kB

	import glob
	import html
	import json
	import re
	import unicodedata

	# basic search helper:
	# download transcript data, run searcher.py on it,
	# open output html file in a browser,
	# it contains quick links to listen to the results.

	# notes:
	# linked segment times are very approximate.
	# audio loading can be delayed, click segments again until they play.
	# transcripts aren't tagged or parsed, DIY morphosyntax by regex.



	def get_segs(tx_path):
	    """Load one transcript JSON file and convert each of its segments.

	    Args:
	        tx_path: path to a transcript JSON file whose top-level object
	            has a 'segments' list.

	    Returns:
	        A list holding the get_sent() result for every segment, in the
	        order the segments appear in the file.
	    """
	    # JSON text is UTF-8; read it as such instead of relying on the
	    # platform's default encoding, since transcript text may contain
	    # non-ASCII characters.
	    with open(tx_path, 'r', encoding='utf-8') as handle:
	        segments = json.load(handle)['segments']

	    return [get_sent(sent) for sent in segments]


	def get_sent(sent):
	    """Turn one transcript segment into a (start_ms, end_ms, text) tuple.

	    Args:
	        sent: dict with keys 'startTime', 'endTime', 'words' (and
	            'speakerId', which is not used here). Times are strings such
	            as '1.5s'; each entry of 'words' has a 'word' key and, for
	            the fallback below, 'startTime' / 'endTime' keys.

	    Returns:
	        (start in ms, end in ms, the words concatenated with no separator).
	    """
	    def to_millis(stamp):
	        # strip the 's' unit marker, then convert seconds -> whole ms
	        seconds = float(stamp.replace('s', ''))
	        return int(seconds * 1000)

	    begin = sent['startTime']
	    finish = sent['endTime']
	    text = ''.join(token['word'] for token in sent['words'])

	    # a segment without its own times borrows them from its outermost words
	    if begin is None:
	        begin = sent['words'][0]['startTime']
	    if finish is None:
	        finish = sent['words'][-1]['endTime']

	    return (to_millis(begin), to_millis(finish), text)


	def html_line(match_line,url):
	    """Render one search hit as an HTML paragraph linking to its turn.

	    Args:
	        match_line: (text, speaker, turn_index) tuple.
	        url: page URL; '#<turn_index>' is appended as the fragment.

	    Returns:
	        A string of the form
	        '<p>(speaker) [turn_index] <a href="url#turn_index">text</a></p>'.
	    """
	    text, speaker, turn = match_line
	    # Escape the interpolated values so that '<', '>', '&' or quotes in
	    # the transcript text or the URL cannot break the generated markup.
	    safe_text = html.escape(str(text), quote=False)
	    safe_url = html.escape(str(url), quote=True)
	    return f'<p>({speaker}) [{turn}] <a href="{safe_url}#{turn}">{safe_text}</a></p>'


	def snorm(s):
	    """Normalise a string for matching.

	    Lowercases every character, drops characters whose Unicode category
	    starts with 'P' (punctuation), and collapses each run of two or more
	    spaces into a single space. A leading or trailing space is kept.

	    Args:
	        s: the text to normalise.

	    Returns:
	        The normalised string.
	    """
	    kept = ''.join([c.lower() for c in s if not unicodedata.category(c).startswith("P")])
	    # Removing punctuation can leave doubled spaces ("a , b" -> "a  b").
	    # The previous loop replaced a single space with a single space and so
	    # never terminated on any input that contained a space.
	    return re.sub(r' {2,}', ' ', kept)


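	# search_func is called once per conversation as search_func(segs, search_string),
	# where segs is a list of (text, speaker, turn_number) tuples in time order;
	# it must return the matching entries as a list of (text, speaker, turn_number) tuples.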
	def search_convos(corpus_dir, base_url, output_path, search_func, search_string=None):
	    """Search every conversation under corpus_dir and write the hits as HTML.

	    Args:
	        corpus_dir: directory path ending in '/', holding one
	            sub-directory per conversation. Each sub-directory <convo>
	            is expected to contain
	            speaker_a_convo_<convo>_transcript.json and
	            speaker_b_convo_<convo>_transcript.json.
	        base_url: URL prefix; hits link to f'{base_url}{convo}.html'.
	        output_path: HTML file to write (overwritten if it exists).
	        search_func: callable(segs, search_string) returning a list of
	            (text, speaker, turn) tuples. segs is the conversation as a
	            list of (text, speaker, turn) tuples, where speaker is 'a'
	            or 'b' and turn is the segment's position after both
	            speakers' segments are merged and sorted by start time.
	        search_string: passed through unchanged to search_func.

	    Returns:
	        None. The result is written to output_path as UTF-8.
	    """
	    convos = glob.glob(corpus_dir+'*/')
	    convos = sorted(c.split(corpus_dir)[1].split('/')[0] for c in convos)

	    # The file is written as UTF-8; declare that so a browser opening the
	    # local file decodes non-ASCII transcript text correctly.
	    parts = ['<meta charset="utf-8">\n']

	    for convo in convos:
	        convo_url = f'{base_url}{convo}.html'
	        txa = f'{corpus_dir}{convo}/speaker_a_convo_{convo}_transcript.json'
	        txb = f'{corpus_dir}{convo}/speaker_b_convo_{convo}_transcript.json'

	        # tag each segment with its speaker, then interleave by start time
	        segs = [(s,e,w,'a') for s,e,w in get_segs(txa)]
	        segs += [(s,e,w,'b') for s,e,w in get_segs(txb)]
	        segs.sort(key=lambda seg: seg[0])

	        # discard timestamps but add turn number
	        segs = [(w, spk, turn) for turn, (_s, _e, w, spk) in enumerate(segs)]

	        matches = search_func(segs,search_string)

	        if matches:
	            parts.append(f'<h4>{convo}</h4>')
	            parts.append('\n'.join(html_line(m,convo_url) for m in matches))
	            parts.append('<hr />')

	    with open(output_path, 'w', encoding='utf-8') as handle:
	        handle.write(''.join(parts))


	def simple_search1(convo,search_string):
	    """Substring search over a conversation, after normalisation.

	    Both the query and each segment's text are passed through snorm()
	    before comparison.

	    Args:
	        convo: list of (text, speaker, turn) tuples.
	        search_string: the text to look for.

	    Returns:
	        A list of (normalised text, speaker, turn) tuples for the
	        segments whose normalised text contains the normalised query.
	        Note that the returned text is the normalised form, not the
	        original.
	    """
	    needle = snorm(search_string)
	    hits = []
	    for text, speaker, turn in convo:
	        cleaned = snorm(text)
	        if needle in cleaned:
	            hits.append((cleaned, speaker, turn))
	    return hits


	def regex_search1(convo,search_rx):
	    """Regular-expression search over a conversation.

	    The pattern is applied to the snorm()-normalised text of each
	    segment, but the segment is returned with its original text.

	    Args:
	        convo: list of (text, speaker, turn) tuples.
	        search_rx: regular expression to search for.

	    Returns:
	        A list of the (text, speaker, turn) tuples whose normalised text
	        contains at least one match of search_rx.
	    """
	    hits = []
	    for text, speaker, turn in convo:
	        if re.search(search_rx, snorm(text)) is not None:
	            hits.append((text, speaker, turn))
	    return hits


	if __name__ == "__main__":
	    # Input transcripts, the hosted pages the hits link to, and the
	    # HTML report to produce.
	    corpus_dir = './full_conversations/'
	    base_url = 'https://clr-spjall.static.hf.space/pages/'
	    output_path = './tmp.html'

	    # Matcher and query. Swap in simple_search1 for plain substring
	    # search; regex_search1 takes a regular expression.
	    search_func = regex_search1
	    search_string = r'^\w+ sem'

	    # Other queries that can be tried instead:
	    #   plain substring (with search_func = simple_search1):
	    #     'kannski'
	    #     'eða'
	    #   regular expressions (with search_func = regex_search1):
	    #     r'\Wá \w+ eða \w+'
	    #     r'\Wí \w+ eða \w+'
	    #     r'nei\S? \w+ \w+ (ekki|aldrei|ekkert)'   # could add |enga|engu|eng\w\w
	    #     r'hvor\S* .* eða'
	    #     r'\Wef .* þá'

	    search_convos(
	        corpus_dir,
	        base_url,
	        output_path,
	        search_func,
	        search_string,
	    )