# %%
# Load the transcript. Each line is "|"-separated; only the second field
# (the text) is kept.
txt_file = "data/transcript.txt"
full_data = []
# Explicit UTF-8: the default text encoding is platform dependent and would
# mis-decode Japanese text on non-UTF-8 locales.
with open(txt_file, "r", encoding="utf-8") as f:
    # Skip blank lines, which have no "|" field and would raise IndexError.
    data = [line.strip().split("|")[1] for line in f if line.strip()]
    full_data.extend(data)

# %%
data

# %%
# Split the concatenated text right after every "。" or "」".
SENTENCE_ENDINGS = ("。", "」")

sentences = []
chars = []
for ch in "".join(full_data):
    chars.append(ch)
    if ch in SENTENCE_ENDINGS:
        sentences.append("".join(chars))
        chars = []
# NOTE: any trailing text after the last delimiter is intentionally left out,
# matching the original behaviour of this script.
len(sentences)

# %%
TRAIN_SIZE = 10000  # number of leading sentences that go to the train split
SENTENCES_PER_LINE = 5  # sentences joined into each output line

train_sentences = sentences[:TRAIN_SIZE]
test_sentences = sentences[TRAIN_SIZE:]

# Join every SENTENCES_PER_LINE sentences and write them as one line of train.txt.
# Slice train_sentences (not the full list) so the last train line can never
# pull in sentences that belong to the test split.
with open("data/train.txt", "w", encoding="utf-8") as f:
    for i in range(0, len(train_sentences), SENTENCES_PER_LINE):
        f.write("".join(train_sentences[i : i + SENTENCES_PER_LINE]) + "\n")

# Same chunking for the held-out sentences.
with open("data/test.txt", "w", encoding="utf-8") as f:
    for i in range(0, len(test_sentences), SENTENCES_PER_LINE):
        f.write("".join(test_sentences[i : i + SENTENCES_PER_LINE]) + "\n")