Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame contribute delete
No virus
2.99 kB
import datetime
import os
import random

import pandas as pd
# Seed the module-level RNG so the random.random() draws made in split_data()
# follow the same sequence on every run.
random.seed(1996)
# Threshold compared against random.random() in split_data(): a row goes to
# the dev split when its draw is below this value, otherwise to the main split.
DEV_RATIO = 0.10
def choose_best_casing(orig, predicted):
    """Pick the better-cased variant of a title.

    If more than half of the characters of ``orig`` are uppercase letters,
    the original casing is treated as uninformative and the truecaser output
    ``predicted`` is returned; otherwise ``orig`` is kept unchanged.

    Args:
        orig: The title as it appears in the source corpus.
        predicted: The truecased version of the same title.

    Returns:
        ``predicted`` when ``orig`` is mostly uppercase, else ``orig``.
        An empty ``orig`` is returned as-is.
    """
    # Count real uppercase characters. The previous implementation built a
    # list with one boolean per character and took its length, so the
    # "count" was always len(orig) and the heuristic never discriminated.
    num_upper_chars = sum(1 for c in orig if c.isupper())
    if num_upper_chars > 0.5 * len(orig):
        return predicted
    return orig
def _strip_provider_prefix(provider, prefix="*T_"):
    """Return ``provider`` without a leading ``prefix``, if it has one."""
    # str.lstrip("*T_") would strip any run of the characters '*', 'T', '_'
    # and therefore also eat the start of names such as "Tempo".
    # NOTE(review): assumes the marker is always exactly "*T_" -- confirm
    # against the values in the "Testata" column.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best/orig/truecase title variants of one text into ``out_dir``."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the migration corpus into a main and a dev part and write both out.

    Reads the truecased titles (one per line) and the corpus CSV, builds one
    event record and one text record per CSV row, and assigns each row to the
    dev split when ``random.random() < DEV_RATIO`` and to the main split
    otherwise. For every row three text files (``.best``, ``.orig``,
    ``.truecase``) are written into the split's directory; at the end four
    CSV files with the event and text metadata of both splits are written.

    Raises:
        ValueError: If the truecased-titles file has fewer lines than the
            corpus CSV has rows, since titles are matched to rows by position.
    """
    out_root = "output/migration/split_data"
    main_txt_dir = f"{out_root}/split_main_sep_txt_files"
    dev_txt_dir = f"{out_root}/split_dev10_sep_txt_files"

    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")

    # Fail before writing anything rather than with an IndexError mid-run.
    if len(titles_tc) < len(df_all):
        raise ValueError(
            f"Found {len(titles_tc)} truecased titles for {len(df_all)} corpus rows"
        )

    # Create the output directories up front; open(..., "w") does not do so.
    os.makedirs(main_txt_dir, exist_ok=True)
    os.makedirs(dev_txt_dir, exist_ok=True)

    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"],
        }

        # Exactly one RNG draw per row keeps the split reproducible under
        # the module-level seed.
        if random.random() < DEV_RATIO:
            events_dev.append(event_data)
            texts_dev.append(text_data)
            _write_title_files(dev_txt_dir, idx, text_data)
        else:
            events_main.append(event_data)
            texts_main.append(text_data)
            _write_title_files(main_txt_dir, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{out_root}/split_dev10.texts.meta.csv")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()