"""
Split the dataset into a "DEV10" portion (10% of events) for initial
experimentation and a "MAIN" portion (the rest of the dataset) to be used
later.
"""
import os
import json
import random
import argparse

from collections import defaultdict
from typing import List, Tuple, Dict, Any

import pandas as pd
import nltk
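
# NOTE: nltk.sent_tokenize (used throughout) needs NLTK's "punkt" sentence
# tokenizer models; if they are not installed yet, run nltk.download("punkt")
# once beforehand.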

# fix the random seed so the shuffles (and hence the DEV10/MAIN splits) are reproducible
random.seed(1996)


def split_rai_femicides():
    print("Processing excel file...")
    femicide_events = pd.read_excel(
        "data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx",
        sheet_name="dati", header=0)
    event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events)
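
    # Assumption: the output directory may not exist on a fresh checkout;
    # creating it here keeps the json/csv/xlsx writes below from failing
    # with FileNotFoundError.
    os.makedirs("output/femicides/split_data/rai", exist_ok=True)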

    dicts_to_save = (
        (victim_to_event_id, "victim_to_event_id"),
        (event_id_to_victims, "event_id_to_victims"),
        (victim_duplicate_counts, "victim_duplicate_counts"),
    )
    write_dict_to_json(dicts_to_save)

    print("Shuffling and splitting...")
    shuffled_event_ids = list(event_ids)
    random.shuffle(shuffled_event_ids)
    # the first 78 shuffled events go to DEV10 (intended as ~10% of events,
    # cf. the module docstring)
    dev10_idx = shuffled_event_ids[:78]
    main_idx = shuffled_event_ids[78:]
    dev10_df, main_df = create_split_df(dev10_idx, femicide_events)

    for df, df_name in ((dev10_df, "dev10"), (main_df, "main")):
        df.to_csv(f"output/femicides/split_data/rai/split_{df_name}.events.csv")
        df.to_excel(f"output/femicides/split_data/rai/split_{df_name}.events.xlsx")

    dev10_victims = {e: victims for e, victims in event_id_to_victims.items() if e in dev10_idx}
    main_victims = {e: victims for e, victims in event_id_to_victims.items() if e in main_idx}
    filtered_dicts_to_save = (
        (dev10_victims, "event_id_to_victims.dev10"),
        (main_victims, "event_id_to_victims.main"),
    )
    write_dict_to_json(filtered_dicts_to_save)

    print("Filtering & writing texts...")
    texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    filter_texts("dev10", texts_df, dev10_victims, victim_duplicate_counts)
    filter_texts("main", texts_df, main_victims, victim_duplicate_counts)


def split_olv_femicides():
    texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv")
    events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv")
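
    # Assumption: as for the RAI split, make sure the output directory exists
    # before writing into it.
    os.makedirs("output/femicides/split_data/olv", exist_ok=True)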

    event_ids = events_df["event:id"].tolist()
    random.shuffle(event_ids)

    # 10% of the events go to DEV10, the rest to MAIN
    num_dev_events = round(len(event_ids) * 0.10)
    dev10_ids = event_ids[:num_dev_events]
    dev10_df, main_df = create_split_df(dev10_ids, events_df)

    dev10_texts_df = texts_df[texts_df["event_id"].isin(dev10_ids)]
    main_texts_df = texts_df[~texts_df["event_id"].isin(dev10_ids)]

    for split_events_df, split_texts_df, split_name in ((dev10_df, dev10_texts_df, "dev10"),
                                                        (main_df, main_texts_df, "main")):
        split_events_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.events.csv")
        split_texts_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.texts.csv")
        split_events_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.events.xlsx")
        split_texts_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.texts.xlsx")

        # additionally, write each text as a plain-text file, one sentence per
        # line, grouped into one directory per event
        for _, row in split_texts_df.iterrows():
            event_id = row["event_id"]
            text_id = row["text_id"]
            event_dir = f"output/femicides/split_data/olv/split_{split_name}_texts_by_event/{event_id}/"
            os.makedirs(event_dir, exist_ok=True)
            with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                title = row["title"] if not pd.isna(row["title"]) else ""
                for line in nltk.sent_tokenize(title, language="italian"):
                    f_by_event.write(line + os.linesep)
                f_by_event.write(os.linesep)
                fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else ""
                if not fulltext:
                    print(f"WARNING: empty fulltext in text_id={text_id}")
                for line in nltk.sent_tokenize(fulltext, language="italian"):
                    line = line.strip()
                    if not line:
                        continue
                    f_by_event.write(line + os.linesep)
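
# NOTE: the text files above are opened in text mode, where Python already
# translates "\n" into the platform line separator on write; writing
# os.linesep explicitly can therefore produce "\r\r\n" on Windows, so plain
# "\n" may be the safer choice.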


def write_dict_to_json(dicts_to_save):
    for dict_data, dict_name in dicts_to_save:
        with open(f"output/femicides/split_data/rai/{dict_name}.json", "w", encoding="utf-8") as f:
            json.dump(dict_data, f, indent=4, sort_keys=True)
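
# NOTE: JSON object keys are always strings, so integer event ids used as dict
# keys (as in event_id_to_victims) come back as string keys when these files
# are reloaded.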


def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    dev10_rows = []
    main_rows = []
    for _, row in femicide_events.iterrows():
        event_id = row["event:id"]
        # rows without a valid event id are dropped from both splits
        if pd.isna(event_id):
            continue
        event_id = int(event_id)
        if event_id in dev10:
            dev10_rows.append(row)
        else:
            main_rows.append(row)
    dev10_df = pd.DataFrame(dev10_rows)
    main_df = pd.DataFrame(main_rows)
    return dev10_df, main_df


def read_events(events_df):
    event_ids: List[int] = []
    victim_to_event_id: Dict[str, int] = {}
    event_id_to_victims: Dict[int, List[Tuple[str, int]]] = defaultdict(list)
    victim_duplicate_counts: Dict[str, int] = defaultdict(int)

    for _, row in events_df.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)

        # "non rilevato" (Italian for "not recorded") marks unknown victims
        victim = row["victim:name"]
        if victim == "non rilevato" or pd.isna(victim):
            victim = f"UNKNOWN_{event_id}"

        # count occurrences of each victim name so that repeated names can be
        # disambiguated (and later filtered out as ambiguous)
        victim_duplicate_counts[victim] += 1
        duplicate_id = victim_duplicate_counts[victim]

        event_ids.append(event_id)
        victim_to_event_id[f"{victim}/{duplicate_id}"] = event_id
        event_id_to_victims[event_id].append((victim, duplicate_id))
    return event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts
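
# For illustration, with hypothetical data the mappings built by read_events
# look like this:
#   victim_to_event_id:      {"Maria Rossi/1": 12, "UNKNOWN_13/1": 13}
#   event_id_to_victims:     {12: [("Maria Rossi", 1)], 13: [("UNKNOWN_13", 1)]}
#   victim_duplicate_counts: {"Maria Rossi": 1, "UNKNOWN_13": 1}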


def filter_texts(split_name: str,
                 texts_df: pd.DataFrame,
                 event_idx_to_victims: Dict[int, List[Tuple[str, int]]],
                 victim_duplicate_counts: Dict[str, int]):

    print(f"\tfilter_texts: filtering split {split_name}")

    # invert the event->victims mapping; victim names that occur more than once
    # in the dataset are skipped because texts are matched to events by name
    victim_to_event_idx = {}
    for e_id in event_idx_to_victims:
        for victim in event_idx_to_victims[e_id]:
            victim_name, victim_dup_id = victim
            if victim_duplicate_counts[victim_name] > 1:
                print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'")
                continue
            victim_to_event_idx[victim_name] = e_id

    meta_rows: List[Dict[str, Any]] = []
    with open(f"output/femicides/split_data/rai/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \
            open(f"output/femicides/split_data/rai/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id:
        for _, row in texts_df.iterrows():
            # "vittima" is the (Italian) victim name column of the texts sheet
            text_victim = row["vittima"].strip()
            if text_victim in victim_to_event_idx:
                e_id = victim_to_event_idx[text_victim]
                text_id = int(row["ID"])
                url = row["link"]
                pubdate = row["pubdate"]
                provider = row["provider"]
                # guard against missing cells, which pandas reads as NaN floats
                # that nltk.sent_tokenize cannot handle
                title = row["title"] if not pd.isna(row["title"]) else ""
                body_text = row["text"] if not pd.isna(row["text"]) else ""

                meta_rows.append({
                    "event_id": e_id,
                    "text_id": text_id,
                    "url": url,
                    "pubdate": pubdate,
                    "provider": provider,
                    "title": title
                })

                body_text_lines = nltk.sent_tokenize(body_text, language="italian")
                title_lines = nltk.sent_tokenize(title, language="italian")

                # flat per-split text file + parallel id file: one sentence per
                # line in f_txt, one "event/text/segment" record per line in f_id
                for line in title_lines:
                    f_txt.write(line + os.linesep)
                    f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)

                # also write the text into its own file, grouped by event
                event_dir = f"output/femicides/split_data/rai/split_{split_name}_texts_by_event/{e_id}/"
                os.makedirs(event_dir, exist_ok=True)
                with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                    for line in title_lines:
                        f_by_event.write(line + os.linesep)
                    f_by_event.write(os.linesep)
                    for line in body_text_lines:
                        line = line.strip()
                        if not line:
                            continue
                        f_txt.write(line + os.linesep)
                        f_by_event.write(line + os.linesep)
                        f_id.write(f"event {e_id}\ttext {text_id}\tbody" + os.linesep)

    meta_df = pd.DataFrame(meta_rows)
    meta_df.to_csv(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.csv")
    meta_df.to_excel(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.xlsx")
    print()


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("dataset", choices=["rai", "olv"])
    args = ap.parse_args()

    if args.dataset == "rai":
        split_rai_femicides()
    else:
        split_olv_femicides()
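
# Example invocation (hypothetical file name):
#   python split_femicides.py rai
#   python split_femicides.py olv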