|
|
|
|
|
import gradio as gr |
|
from transformers import pipeline |
|
import tokenizer |
|
from difflib import Differ, SequenceMatcher |
|
|
|
# Sample inputs for the demo at the bottom of the file: text1 appears to be a
# misspelled Icelandic sentence and text2 its corrected form — TODO confirm.
text1 = "Kver á á þenan bússtað"

text2 = "Hver á þennan bústað?"
|
|
|
def diff_texts(text1, text2):
    """Return a character-level diff of *text1* against *text2*.

    Each element of the result is a ``(char, tag)`` pair where *tag* is
    ``'-'`` (only in text1), ``'+'`` (only in text2), ``'?'`` (hint line),
    or ``None`` for characters common to both.  The shape matches what
    gradio's HighlightedText component expects.
    """
    differ = Differ()
    result = []
    for entry in differ.compare(text1, text2):
        code = entry[0]
        # entry is "<code> <char>"; strip the two-char prefix added by Differ.
        result.append((entry[2:], None if code == " " else code))
    return result
|
|
|
def split_text(text):
    """Split *text* into a list of sentences.

    Delegates to ``tokenizer.split_into_sentences`` with ``original=True``
    (presumably to preserve the original surface text — verify against the
    tokenizer package's docs).
    """
    # The original wrapped the generator in an identity comprehension
    # `[i for i in ...]`; `list(...)` is the idiomatic equivalent.
    return list(tokenizer.split_into_sentences(text, original=True))
|
|
|
def mark_text( text, tag,):
    """Pair a token with its diff tag as a ``(text, tag)`` tuple."""
    pair = text, tag
    return pair
|
|
|
def mark_span(text, tag,):
    """Tag every token in *text*, yielding a list of ``(token, tag)`` pairs."""
    tagged = []
    for token in text:
        tagged.append((token, tag))
    return tagged
|
|
|
def markup_diff(a, b,
                mark=mark_span,
                default_mark = lambda x: x,
                isjunk=None):
    """Align *a* and *b* and pass each matched span through *mark*.

    Parameters
    ----------
    a, b : sequences to compare (e.g. lists of words).
    mark : callable ``(span, tag) -> list`` applied to every opcode span;
        *tag* is one of SequenceMatcher's opcode tags
        ('equal', 'replace', 'delete', 'insert').
    default_mark : kept for backward compatibility; it is NOT used —
        every span, including 'equal' ones, goes through *mark*.
    isjunk : passed through to SequenceMatcher.

    Returns
    -------
    (out_a, out_b) : the marked-up versions of *a* and *b*, each the same
    length as its input (opcodes partition both sequences exactly).
    """
    seqmatcher = SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)

    out_a, out_b = [], []
    for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        # The opcodes cover a[a0:a1] / b[b0:b1] contiguously, so appending
        # each marked span reconstructs the full sequences.
        out_a += mark(a[a0:a1], tag)
        out_b += mark(b[b0:b1], tag)
    # Sanity check: marking must be element-wise (one output per input).
    assert len(out_a) == len(a)
    assert len(out_b) == len(b)
    return out_a, out_b
|
|
|
if __name__ == "__main__":
    # Demo output only when run as a script, so importing this module
    # (e.g. from a gradio app) no longer prints as a side effect.
    print(diff_texts(text1, text2))
    print(markup_diff(text1.split(" "), text2.split(" ")))