|
import itertools |
|
import re |
|
|
|
# Inclusive (start, end) code-point ranges, grouped by language tag.
# Any character outside the union of these ranges is matched (and therefore
# removable) by REMOVE_UNKNOWN_SYMBOL_REGEX below.
LANGUAGE_UNICODE_RANGE_MAP = {
    # CJK Unified Ideographs
    "ZH": [(0x4E00, 0x9FFF)],
    # CJK Unified Ideographs, Hiragana, Katakana, Katakana Phonetic Extensions
    "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
    # ASCII
    "EN": [(0x0000, 0x007F)],
}

# Punctuation normalization table: each key found in the input is replaced by
# its value.
#
# Non-ASCII characters are written as \u escapes so the table survives being
# read or written with the wrong encoding (a mis-decoded copy collapses
# several of these keys into identical strings, silently dropping entries).
# NOTE(review): the exact code points were reconstructed from a mis-decoded
# copy of this table; confirm them against the intended punctuation set.
SYMBOLS_MAPPING = {
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "\u00b7": ",",  # middle dot
    "\u3001": ",",  # ideographic comma
    "...": "\u2026",  # three ASCII dots -> horizontal ellipsis
    "$": ".",
    "\u201c": "'",  # left double quotation mark
    "\u201d": "'",  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "(": "'",
    ")": "'",
    "\u300a": "'",  # left double angle bracket
    "\u300b": "'",  # right double angle bracket
    "\u3010": "'",  # left black lenticular bracket
    "\u3011": "'",  # right black lenticular bracket
    "[": "'",
    "]": "'",
    "\u2014": "-",  # em dash
    "\uff5e": "-",  # fullwidth tilde
    "~": "-",
    "\u30fb": "-",  # katakana middle dot
    "\u300c": "'",  # left corner bracket
    "\u300d": "'",  # right corner bracket
    ";": ",",
    ":": ",",
}

# Alternation of every (escaped) key in SYMBOLS_MAPPING; a match's text can be
# used directly as a lookup key into SYMBOLS_MAPPING.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(re.escape(symbol) for symbol in SYMBOLS_MAPPING)
)

# Flat list of every (start, end) range across all languages. Ranges shared by
# several languages appear more than once, which is harmless inside a
# character class.
ALL_KNOWN_UTF8_RANGE = list(
    itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
)

# Negated character class: matches any single character that falls outside
# all of the known ranges.
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
    "[^"
    + "".join(
        f"{re.escape(chr(start))}-{re.escape(chr(end))}"
        for start, end in ALL_KNOWN_UTF8_RANGE
    )
    + "]"
)
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize punctuation in *text* and drop characters outside known ranges.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Rewrite every ``<p:...>`` tag to a ``<PPP...PPP>`` placeholder.
    3. Replace each symbol listed in ``SYMBOLS_MAPPING`` with its mapped value.
    4. Delete every character matched by ``REMOVE_UNKNOWN_SYMBOL_REGEX``.
    5. Turn the ``<PPP...PPP>`` placeholders back into ``<p:...>`` tags.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # ":" is itself a key in SYMBOLS_MAPPING, so a literal "<p:...>" tag would
    # be rewritten by the symbol replacement below. Swap it for a colon-free
    # placeholder first. The tag's inner content is still subject to the
    # replacement and removal steps.
    text = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text)

    # The matched text is always one of the mapping's keys, so the lookup
    # cannot miss.
    text = REPLACE_SYMBOL_REGEX.sub(lambda match: SYMBOLS_MAPPING[match.group()], text)
    text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)

    # Restore the tags. Any "<PPP...PPP>" sequence that was already present in
    # the input is converted to "<p:...>" as well.
    text = re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", text)

    return text
|
|