|
import random |
|
import datetime |
|
|
|
import pandas as pd |
|
|
|
# Seed the global RNG so the random main/dev assignment below is reproducible.
random.seed(1996)

# Probability with which a corpus row is routed to the dev split.
DEV_RATIO = 0.10
|
|
|
|
|
def choose_best_casing(orig, predicted):
    """Choose which casing variant of a title to use as its "best" form.

    The original title is kept unless it is predominantly uppercase (for
    example an all-caps headline), in which case the truecased prediction
    is used instead.

    Args:
        orig: Title string as found in the source corpus.
        predicted: Truecased version of the same title.

    Returns:
        ``predicted`` if more than half of the cased letters in ``orig``
        are uppercase, otherwise ``orig``.
    """
    # Count only cased letters so that digits, spaces and punctuation do not
    # skew the uppercase ratio in either direction.
    num_upper = sum(1 for c in orig if c.isupper())
    num_lower = sum(1 for c in orig if c.islower())

    if num_upper > 0.5 * (num_upper + num_lower):
        return predicted
    return orig
|
|
|
|
|
def _strip_provider_prefix(provider, prefix="*T_"):
    """Return ``provider`` without a leading ``prefix``, if it has one."""
    # str.lstrip() removes a *set of characters*, which would also eat the
    # first letters of names such as "Tempo"; remove the exact prefix instead.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best/orig/truecase variants of one title into ``out_dir``."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the migration title corpus into a main set and a dev set.

    Reads the truecased titles (one per line) and ``corpus_all.csv`` and
    pairs them by row position. Each row is assigned to the dev split when
    ``random.random() < DEV_RATIO`` and to the main split otherwise. For
    every row, three text files (best, original and truecased title) are
    written into the split's directory; afterwards one events CSV and one
    texts-metadata CSV are written per split.

    Raises:
        ValueError: If the truecased file has fewer lines than the CSV has
            rows, so that titles cannot be paired with rows.
    """
    import os  # function-scope import: only needed to create output dirs

    out_root = "output/migration/split_data"
    split_dirs = {
        "main": f"{out_root}/split_main_sep_txt_files",
        "dev": f"{out_root}/split_dev10_sep_txt_files",
    }
    events = {"main": [], "dev": []}
    texts = {"main": [], "dev": []}

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")

    # Fail before writing anything rather than with an IndexError mid-run.
    if len(titles_tc) < len(df_all):
        raise ValueError(
            f"Truecased file has {len(titles_tc)} lines but the corpus has "
            f"{len(df_all)} rows; cannot align titles with rows."
        )

    for directory in split_dirs.values():
        os.makedirs(directory, exist_ok=True)

    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])
        title_orig = row["Titolo"]
        title_truecased = titles_tc[idx]

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(title_orig, title_truecased),
            "title_truecased": title_truecased,
            "title_orig": title_orig,
        }

        # Exactly one RNG draw per row keeps the seeded split reproducible.
        split = "dev" if random.random() < DEV_RATIO else "main"
        events[split].append(event_data)
        texts[split].append(text_data)
        _write_title_files(split_dirs[split], idx, text_data)

    pd.DataFrame(events["main"]).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts["main"]).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events["dev"]).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts["dev"]).to_csv(f"{out_root}/split_dev10.texts.meta.csv")
|
|
|
|
|
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()