""" Split the dataset into a "DEV10" portion (10% of events) for initial experimentation; and "MAIN", the rest of the dataset, to be used later """ import os import json import random import argparse from collections import defaultdict from typing import List, Tuple, Dict, Any import pandas as pd import nltk random.seed(1996) def split_rai_femicides(): # process the excel file print("Processing excel file...") femicide_events = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx", sheet_name="dati", header=0) event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events) # save information about correspondences between victims and events # (we will need this later to retrieve the correct texts for each event, because the XLSX with texts uses victim # names as keys) dicts_to_save = ( (victim_to_event_id, "victim_to_event_id"), (event_id_to_victims, "event_id_to_victims"), (victim_duplicate_counts, "victim_duplicate_counts") ) write_dict_to_json(dicts_to_save) # shuffle and split print("Shuffling and splitting...") shuffled_event_ids = list(event_ids) random.shuffle(shuffled_event_ids) dev10_idx = shuffled_event_ids[:78] main_idx = shuffled_event_ids[78:] dev10_df, main_df = create_split_df(dev10_idx, femicide_events) # write split dataframes for df, df_name in ((dev10_df, "dev10"), (main_df, "main")): df.to_csv(f"output/femicides/split_data/rai/split_{df_name}.events.csv") df.to_excel(f"output/femicides/split_data/rai/split_{df_name}.events.xlsx") # write filtered victim data dev10_victims = {e: victims for e, victims in event_id_to_victims.items() if e in dev10_idx} main_victims = {e: victims for e, victims in event_id_to_victims.items() if e in main_idx} filtered_dicts_to_save = ( (dev10_victims, "event_id_to_victims.dev10"), (main_victims, "event_id_to_victims.main"), ) write_dict_to_json(filtered_dicts_to_save) # retrieve texts for filtered data print("Filtering & writing texts...") texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx") filter_texts("dev10", texts_df, dev10_victims, victim_duplicate_counts) filter_texts("main", texts_df, main_victims, victim_duplicate_counts) def split_olv_femicides(): texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv") events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv") event_ids = events_df["event:id"].tolist() random.shuffle(event_ids) num_dev_events = round(len(event_ids) * 0.10) dev10_ids = event_ids[:num_dev_events] dev10_df, main_df = create_split_df(dev10_ids, events_df) # split texts dev10_texts_df = texts_df[texts_df["event_id"].isin(dev10_ids)] main_texts_df = texts_df[~texts_df["event_id"].isin(dev10_ids)] # write to files for events_df, texts_df, split_name in ((dev10_df, dev10_texts_df, "dev10"), (main_df, main_texts_df, "main")): events_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.events.csv") texts_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.texts.csv") events_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.events.xlsx") texts_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.texts.xlsx") for _, row in texts_df.iterrows(): event_id = row["event_id"] text_id = row["text_id"] event_dir = f"output/femicides/split_data/olv/split_{split_name}_texts_by_event/{event_id}/" os.makedirs(event_dir, exist_ok=True) with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event: title 
= row["title"] if not pd.isna(row["title"]) else "" for line in nltk.sent_tokenize(title, language="italian"): f_by_event.write(line + os.linesep) f_by_event.write(os.linesep) fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else "" if not fulltext: print(f"WARNING: empty fulltext in text_id={text_id}") for line in nltk.sent_tokenize(fulltext, language="italian"): line = line.strip() if not line: continue f_by_event.write(line + os.linesep) def write_dict_to_json(filtered_dicts_to_save): for dict_data, dict_name in filtered_dicts_to_save: with open(f"output/femicides/split_data/rai/{dict_name}.json", "w", encoding="utf-8") as f: json.dump(dict_data, f, indent=4, sort_keys=True) def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: dev10_rows = [] main_rows = [] for idx, row in femicide_events.iterrows(): event_id = row["event:id"] if pd.isna(event_id): continue event_id = int(event_id) if event_id in dev10: dev10_rows.append(row) else: main_rows.append(row) dev10_df = pd.DataFrame(dev10_rows) main_df = pd.DataFrame(main_rows) return dev10_df, main_df def read_events(events_df): event_ids: List[int] = [] victim_to_event_id: Dict[str, int] = {} event_id_to_victims: Dict[int, List[Tuple[str, int]]] = defaultdict(list) victim_duplicate_counts: Dict[str, int] = defaultdict(int) for idx, row in events_df.iterrows(): event_id = row["event:id"] if pd.isna(event_id): continue event_id = int(event_id) # unspecified name --> "UNKNOWN_X" victim = row["victim:name"] if victim == "non rilevato" or pd.isna(victim): victim = f"UNKNOWN_{event_id}" # disambiguate victims with duplicate names victim_duplicate_counts[victim] += 1 duplicate_id = victim_duplicate_counts[victim] event_ids.append(event_id) victim_to_event_id[f"{victim}/{duplicate_id}"] = event_id event_id_to_victims[event_id].append((victim, duplicate_id)) return event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts def filter_texts(split_name: str, texts_df: pd.DataFrame, event_idx_to_victims: Dict[int, List[Tuple[str, int]]], victim_duplicate_counts: Dict[str, int]): print(f"\tfilter_texts: filtering split {split_name}") # first filter victims victim_to_event_idx = {} for e_id in event_idx_to_victims: for victim in event_idx_to_victims[e_id]: victim_name, victim_dup_id = victim if victim_duplicate_counts[victim_name] > 1: print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'") continue victim_to_event_idx[victim_name] = e_id meta_rows: List[Dict[str, Any]] = [] with open(f"output/femicides/split_data/rai/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \ open(f"output/femicides/split_data/rai/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id: for _, row in texts_df.iterrows(): text_victim = row["vittima"].strip() if text_victim in victim_to_event_idx: e_id = victim_to_event_idx[text_victim] text_id = int(row["ID"]) url = row["link"] pubdate = row["pubdate"] provider = row["provider"] title = row["title"] meta_rows.append({ "event_id": e_id, "text_id": text_id, "url": url, "pubdate": pubdate, "provider": provider, "title": title }) # body_text_lines = row["text"].split("\n") body_text_lines = nltk.sent_tokenize(row["text"], language="italian") title_lines = nltk.sent_tokenize(title, language="italian") # f_txt.write(title.strip() + os.linesep) # f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep) for line in title_lines: f_txt.write(line + os.linesep) f_id.write(f"event {e_id}\ttext 
{text_id}\ttitle" + os.linesep) event_dir = f"output/femicides/split_data/rai/split_{split_name}_texts_by_event/{e_id}/" os.makedirs(event_dir, exist_ok=True) with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event: # f_by_event.write(title.strip() + os.linesep) for line in title_lines: f_by_event.write(line + os.linesep) f_by_event.write(os.linesep) for line in body_text_lines: line = line.strip() if not line: continue f_txt.write(line + os.linesep) f_by_event.write(line + os.linesep) f_id.write(f"event {e_id}\ttext {text_id}\tbody" + os.linesep) meta_df = pd.DataFrame(meta_rows) meta_df.to_csv(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.csv") meta_df.to_excel(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.xlsx") print() if __name__ == '__main__': ap = argparse.ArgumentParser() ap.add_argument("dataset", choices=["rai", "olv"]) args = ap.parse_args() if args.dataset == "rai": split_rai_femicides() else: split_olv_femicides()