File size: 2,985 Bytes
b11ac48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import random
import datetime

import pandas as pd

# Seed the module-level RNG so the random.random() draws made in split_data()
# produce the same sequence on every run.
random.seed(1996)


# Probability with which split_data() sends a row to the dev split instead of
# the main split.
DEV_RATIO = 0.10


def choose_best_casing(orig, predicted):
    """Pick which casing of a title to use.

    If more than half of the characters of ``orig`` are uppercase letters, the
    original casing is treated as uninformative (e.g. an all-caps headline)
    and ``predicted`` is returned. Otherwise the original casing is kept.

    Args:
        orig: Title string with its original casing.
        predicted: The same title with the casing predicted by the truecaser.

    Returns:
        ``predicted`` when ``orig`` is mostly uppercase, ``orig`` otherwise
        (an empty ``orig`` is returned unchanged).
    """
    # str.isupper() is False for digits, spaces and punctuation, so only real
    # capital letters count toward the threshold.
    num_upper_chars = sum(1 for c in orig if c.isupper())
    if num_upper_chars > 0.5 * len(orig):
        return predicted
    return orig


def _strip_provider_prefix(provider):
    """Return ``provider`` without a leading "*T_" marker, if it has one."""
    prefix = "*T_"
    # str.lstrip("*T_") strips any run of the characters '*', 'T' and '_',
    # which would also eat the first letter of a name such as "Tempo"; only
    # the exact prefix is removed here.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best/orig/truecase variants of one title into ``out_dir``."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the migration corpus into a main and a dev set and write both out.

    Reads the truecased titles (one per line, UTF-8) and the corpus CSV
    (latin-1, columns "Anno", "Testata" and "Titolo" are used). Each CSV row,
    numbered by its position ``idx``, is sent to the dev split with
    probability ``DEV_RATIO`` and to the main split otherwise, using one
    ``random.random()`` draw per row.

    For every row three text files (``{idx}.best.txt``, ``{idx}.orig.txt``,
    ``{idx}.truecase.txt``) are written to the split's ``*_sep_txt_files``
    directory, and at the end the event and text metadata of each split are
    written as CSV files under ``output/migration/split_data``.

    Raises:
        ValueError: If the truecased titles file has fewer lines than the
            corpus CSV has rows.
    """
    # Local import: ``os`` is only needed here to create the output folders.
    import os

    out_root = "output/migration/split_data"
    main_txt_dir = f"{out_root}/split_main_sep_txt_files"
    dev_txt_dir = f"{out_root}/split_dev10_sep_txt_files"

    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")

    # Titles are matched to rows by position, so fail before writing anything
    # if there are not enough truecased titles to cover every row.
    if len(titles_tc) < len(df_all):
        raise ValueError(
            f"Truecased titles file has {len(titles_tc)} lines "
            f"but the corpus has {len(df_all)} rows"
        )

    # Make sure the per-title output folders exist (this also creates
    # ``out_root``, their common parent, used for the CSV files below).
    for txt_dir in (main_txt_dir, dev_txt_dir):
        os.makedirs(txt_dir, exist_ok=True)

    for idx, (_, row) in enumerate(df_all.iterrows()):

        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"],
        }

        # Exactly one random draw per row, in row order, so the split stays
        # reproducible under the module-level seed.
        if random.random() < DEV_RATIO:
            events_dev.append(event_data)
            texts_dev.append(text_data)
            _write_title_files(dev_txt_dir, idx, text_data)
        else:
            events_main.append(event_data)
            texts_main.append(text_data)
            _write_title_files(main_txt_dir, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{out_root}/split_dev10.texts.meta.csv")


# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()