Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
No virus
2.99 kB
import random
import datetime
import pandas as pd
# Fix the RNG seed so the main/dev assignment made in split_data() is
# reproducible from run to run.
random.seed(1996)
# Probability with which each corpus row is assigned to the dev split
# (compared against random.random() once per row).
DEV_RATIO = 0.10
def choose_best_casing(orig, predicted):
    """Pick the better-cased variant of a title.

    A title written mostly in capital letters carries no usable casing
    information, so the truecased prediction is preferred for it. Otherwise
    the original casing is kept.

    Args:
        orig: The title as it appears in the corpus.
        predicted: The truecased version of the same title.

    Returns:
        ``predicted`` if more than half of the characters in ``orig`` are
        upper-case letters, otherwise ``orig``.
    """
    # Count genuine upper-case letters in the original string itself;
    # digits, spaces and punctuation are not upper-case and do not count.
    num_upper_chars = sum(1 for c in orig if c.isupper())
    if num_upper_chars > 0.5 * len(orig):
        return predicted
    return orig
def _strip_provider_prefix(provider, prefix="*T_"):
    """Return *provider* without a leading *prefix*, if it has one."""
    # str.lstrip() would treat the prefix as a set of characters and also
    # eat leading 'T', '_' or '*' that belong to the provider name itself.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best/orig/truecase variants of one title to separate files."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the migration corpus into a main part and a dev part.

    Reads the corpus CSV and the line-aligned file of truecased titles, then
    assigns each row to the dev split with probability ``DEV_RATIO`` (one
    ``random.random()`` draw per row, in row order) and to the main split
    otherwise.

    For every row, three text files (best, original and truecased title) are
    written to the per-split text directory. At the end, one events CSV and
    one texts-metadata CSV are written per split. Output directories are
    created if they do not exist yet.

    NOTE(review): the provider column is assumed to carry a literal ``*T_``
    prefix tag; confirm against the corpus if provider names look wrong.
    """
    import os  # local import: only needed here, to create output directories

    out_root = "output/migration/split_data"
    main_txt_dir = f"{out_root}/split_main_sep_txt_files"
    dev_txt_dir = f"{out_root}/split_dev10_sep_txt_files"
    for txt_dir in (main_txt_dir, dev_txt_dir):
        os.makedirs(txt_dir, exist_ok=True)

    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    # Truecased titles; line i is looked up with the positional index of
    # row i of the CSV below.
    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")
    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])
        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"],
        }

        # Exactly one draw per row, so the split depends only on the seed
        # and the row order.
        if random.random() < DEV_RATIO:
            events, texts, txt_dir = events_dev, texts_dev, dev_txt_dir
        else:
            events, texts, txt_dir = events_main, texts_main, main_txt_dir

        events.append(event_data)
        texts.append(text_data)
        _write_title_files(txt_dir, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{out_root}/split_dev10.texts.meta.csv")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()