bert_japanese_punctuation / prepare_dataset.py
bobfromjapan's picture
Upload 7 files
ae56469 verified
raw
history blame contribute delete
No virus
1 kB
# %%
# Load the transcript and keep the second "|"-separated field of every line.
txt_file = "data/transcript.txt"
full_data = []
# The encoding is given explicitly so decoding does not depend on the platform
# default; later cells match on Japanese punctuation characters.
# NOTE(review): assumes the transcript is stored as UTF-8 -- confirm.
with open(txt_file, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    # Blank lines (e.g. a trailing empty line at end of file) have no second
    # field and would raise IndexError, so they are skipped. A non-blank line
    # without "|" still raises IndexError, as before.
    data = [line.split("|")[1] for line in stripped_lines if line]
full_data.extend(data)
# %%
# Bare expression with no assignment: presumably here so an interactive
# cell runner displays the parsed transcript fields. It has no effect when
# the file is executed as a plain script.
data
# %%
# Split the concatenated transcript text right after every "。" or "」".
sentences = []
pending = []  # characters of the sentence currently being collected
for ch in "".join(full_data):
    pending.append(ch)
    if ch in ("。", "」"):
        # The terminator stays attached to the sentence it closes.
        sentences.append("".join(pending))
        pending.clear()
# Characters after the last terminator are never appended to `sentences`.
len(sentences)
# %%
# The first 10000 sentences form the training split; the remainder is the
# test split.
train_sentences = sentences[:10000]
test_sentences = sentences[10000:]


def _write_joined_chunks(path, items, chunk_size=5):
    """Write *items* to *path*, joining every *chunk_size* items into one line.

    Args:
        path: Destination text file; it is overwritten.
        items: List of strings to write.
        chunk_size: Number of consecutive items concatenated per output line.
            The last line holds fewer items when ``len(items)`` is not a
            multiple of ``chunk_size``.
    """
    # UTF-8 is set explicitly so the Japanese text is written the same way
    # regardless of the platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        for start in range(0, len(items), chunk_size):
            f.write("".join(items[start : start + chunk_size]) + "\n")


# Arbitrarily join every 5 sentences into one line of train.txt / test.txt.
# Each file is built from its own split; the training loop previously sliced
# the full `sentences` list while iterating over `train_sentences`.
_write_joined_chunks("data/train.txt", train_sentences)
_write_joined_chunks("data/test.txt", test_sentences)