""" | |
Split the dataset into a "DEV10" portion (10% of events) for initial experimentation; and "MAIN", the rest of the | |
dataset, to be used later | |
""" | |
import os
import json
import random
import argparse
from collections import defaultdict
from typing import List, Tuple, Dict, Any

import pandas as pd
import nltk
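
# NOTE: nltk.sent_tokenize() requires the "punkt" sentence tokenizer models;
# run nltk.download("punkt") once beforehand if they are not installed.

# fix the RNG seed so the shuffle-based splits are reproducible across runs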
random.seed(1996)


def split_rai_femicides():
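    """Split the RAI femicides data (events + texts) into DEV10 and MAIN portions."""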
    # process the excel file
    print("Processing excel file...")
    femicide_events = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx",
                                    sheet_name="dati", header=0)
    event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events)

    # save information about correspondences between victims and events
    # (we will need this later to retrieve the correct texts for each event, because the XLSX with texts uses victim
    # names as keys)
    dicts_to_save = (
        (victim_to_event_id, "victim_to_event_id"),
        (event_id_to_victims, "event_id_to_victims"),
        (victim_duplicate_counts, "victim_duplicate_counts")
    )
    write_dict_to_json(dicts_to_save)

    # shuffle and split
    print("Shuffling and splitting...")
    shuffled_event_ids = list(event_ids)
    random.shuffle(shuffled_event_ids)
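    # take the first 78 shuffled events as DEV10 (roughly 10% of the dataset,
    # matching the "DEV10" definition in the module docstring)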
    dev10_idx = shuffled_event_ids[:78]
    main_idx = shuffled_event_ids[78:]
    dev10_df, main_df = create_split_df(dev10_idx, femicide_events)

    # write split dataframes
    for df, df_name in ((dev10_df, "dev10"), (main_df, "main")):
        df.to_csv(f"output/femicides/split_data/rai/split_{df_name}.events.csv")
        df.to_excel(f"output/femicides/split_data/rai/split_{df_name}.events.xlsx")

    # write filtered victim data
    dev10_victims = {e: victims for e, victims in event_id_to_victims.items() if e in dev10_idx}
    main_victims = {e: victims for e, victims in event_id_to_victims.items() if e in main_idx}
    filtered_dicts_to_save = (
        (dev10_victims, "event_id_to_victims.dev10"),
        (main_victims, "event_id_to_victims.main"),
    )
    write_dict_to_json(filtered_dicts_to_save)

    # retrieve texts for filtered data
    print("Filtering & writing texts...")
    texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    filter_texts("dev10", texts_df, dev10_victims, victim_duplicate_counts)
    filter_texts("main", texts_df, main_victims, victim_duplicate_counts)


def split_olv_femicides():
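    """Split the OLV femicides data (events + texts) into DEV10 and MAIN portions."""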
    texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv")
    events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv")

    event_ids = events_df["event:id"].tolist()
    random.shuffle(event_ids)
    num_dev_events = round(len(event_ids) * 0.10)
    dev10_ids = event_ids[:num_dev_events]
    dev10_df, main_df = create_split_df(dev10_ids, events_df)

    # split texts
    dev10_texts_df = texts_df[texts_df["event_id"].isin(dev10_ids)]
    main_texts_df = texts_df[~texts_df["event_id"].isin(dev10_ids)]

    # write to files (distinct loop-variable names so the full dataframes above are not shadowed)
    for split_events_df, split_texts_df, split_name in ((dev10_df, dev10_texts_df, "dev10"), (main_df, main_texts_df, "main")):
        split_events_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.events.csv")
        split_texts_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.texts.csv")
        split_events_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.events.xlsx")
        split_texts_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.texts.xlsx")

        # additionally write one plain-text file per text, grouped by event,
        # one sentence per line (title first, then the body)
        for _, row in split_texts_df.iterrows():
            event_id = row["event_id"]
            text_id = row["text_id"]
            event_dir = f"output/femicides/split_data/olv/split_{split_name}_texts_by_event/{event_id}/"
            os.makedirs(event_dir, exist_ok=True)
            with open(event_dir + f"{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                title = row["title"] if not pd.isna(row["title"]) else ""
                for line in nltk.sent_tokenize(title, language="italian"):
                    f_by_event.write(line + os.linesep)
                f_by_event.write(os.linesep)
                fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else ""
                if not fulltext:
                    print(f"WARNING: empty fulltext in text_id={text_id}")
                for line in nltk.sent_tokenize(fulltext, language="italian"):
                    line = line.strip()
                    if not line:
                        continue
                    f_by_event.write(line + os.linesep)


def write_dict_to_json(filtered_dicts_to_save):
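    """Serialize (dict, name) pairs as pretty-printed JSON files in the RAI split directory."""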
    for dict_data, dict_name in filtered_dicts_to_save:
        with open(f"output/femicides/split_data/rai/{dict_name}.json", "w", encoding="utf-8") as f:
            json.dump(dict_data, f, indent=4, sort_keys=True)


def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
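    """Partition the events dataframe into (dev10_df, main_df) based on the DEV10 event ids."""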
    dev10_rows = []
    main_rows = []
    for _, row in femicide_events.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)
        if event_id in dev10:
            dev10_rows.append(row)
        else:
            main_rows.append(row)
    dev10_df = pd.DataFrame(dev10_rows)
    main_df = pd.DataFrame(main_rows)
    return dev10_df, main_df


def read_events(events_df):
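    """Read event/victim rows from the RAI events dataframe.

    Returns the list of event ids, a victim-name -> event-id mapping, an
    event-id -> victims mapping, and per-name duplicate counts used for
    disambiguation.
    """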
    event_ids: List[int] = []
    victim_to_event_id: Dict[str, int] = {}
    event_id_to_victims: Dict[int, List[Tuple[str, int]]] = defaultdict(list)
    victim_duplicate_counts: Dict[str, int] = defaultdict(int)
    for _, row in events_df.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)

        # unspecified name ("non rilevato" = "not recorded") --> "UNKNOWN_X"
        victim = row["victim:name"]
        if victim == "non rilevato" or pd.isna(victim):
            victim = f"UNKNOWN_{event_id}"

        # disambiguate victims with duplicate names
        victim_duplicate_counts[victim] += 1
        duplicate_id = victim_duplicate_counts[victim]
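        # NOTE: events with multiple victims contribute one entry per victim row
        # here, so event_ids may contain duplicate ids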
        event_ids.append(event_id)
        victim_to_event_id[f"{victim}/{duplicate_id}"] = event_id
        event_id_to_victims[event_id].append((victim, duplicate_id))
    return event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts


def filter_texts(split_name: str,
                 texts_df: pd.DataFrame,
                 event_idx_to_victims: Dict[int, List[Tuple[str, int]]],
                 victim_duplicate_counts: Dict[str, int]):
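    """Select the texts belonging to one split by matching victim names to events, and
    write them out sentence-per-line, plus one plain-text file per text grouped by event."""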
print(f"\tfilter_texts: filtering split {split_name}") | |
# first filter victims | |
victim_to_event_idx = {} | |
for e_id in event_idx_to_victims: | |
for victim in event_idx_to_victims[e_id]: | |
victim_name, victim_dup_id = victim | |
if victim_duplicate_counts[victim_name] > 1: | |
print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'") | |
continue | |
victim_to_event_idx[victim_name] = e_id | |
meta_rows: List[Dict[str, Any]] = [] | |
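    # write two parallel files: one sentence per line in *.texts.text.txt, and a
    # matching "event <id>\ttext <id>\t<section>" line in *.texts.ids.txt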
    with open(f"output/femicides/split_data/rai/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \
            open(f"output/femicides/split_data/rai/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id:
        for _, row in texts_df.iterrows():
            # guard against rows with a missing victim name (NaN has no .strip())
            victim_cell = row["vittima"]
            if pd.isna(victim_cell):
                continue
            text_victim = victim_cell.strip()
            if text_victim in victim_to_event_idx:
                e_id = victim_to_event_idx[text_victim]
                text_id = int(row["ID"])
                url = row["link"]
                pubdate = row["pubdate"]
                provider = row["provider"]
                title = row["title"]
                meta_rows.append({
                    "event_id": e_id,
                    "text_id": text_id,
                    "url": url,
                    "pubdate": pubdate,
                    "provider": provider,
                    "title": title
                })

                # body_text_lines = row["text"].split("\n")
                body_text_lines = nltk.sent_tokenize(row["text"], language="italian")
                # guard against missing titles, as in split_olv_femicides
                title_lines = nltk.sent_tokenize(title, language="italian") if not pd.isna(title) else []

                # f_txt.write(title.strip() + os.linesep)
                # f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)
                for line in title_lines:
                    f_txt.write(line + os.linesep)
                    f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)

                event_dir = f"output/femicides/split_data/rai/split_{split_name}_texts_by_event/{e_id}/"
                os.makedirs(event_dir, exist_ok=True)
                with open(event_dir + f"{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                    # f_by_event.write(title.strip() + os.linesep)
                    for line in title_lines:
                        f_by_event.write(line + os.linesep)
                    f_by_event.write(os.linesep)
                    for line in body_text_lines:
                        line = line.strip()
                        if not line:
                            continue
                        f_txt.write(line + os.linesep)
                        f_by_event.write(line + os.linesep)
                        f_id.write(f"event {e_id}\ttext {text_id}\tbody" + os.linesep)

    meta_df = pd.DataFrame(meta_rows)
    meta_df.to_csv(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.csv")
    meta_df.to_excel(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.xlsx")
    print()

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("dataset", choices=["rai", "olv"])
    args = ap.parse_args()

    if args.dataset == "rai":
        split_rai_femicides()
    else:
        split_olv_femicides()
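
# Example invocation (the script filename below is assumed; adjust to the actual name):
#   python split_dataset.py rai
#   python split_dataset.py olv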