"""
Split the dataset into a "DEV10" portion (10% of events) for initial experimentation; and "MAIN", the rest of the
dataset, to be used later
"""
import os
import json
import random
import argparse
from collections import defaultdict
from typing import List, Tuple, Dict, Any
import pandas as pd
import nltk
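
# NOTE: nltk.sent_tokenize (used below) requires NLTK's "punkt" tokenizer models;
# if they are missing, run nltk.download("punkt") once beforehand (newer NLTK
# versions may need "punkt_tab" instead).

# fix the RNG seed so that shuffling (and hence the splits) is reproducible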
random.seed(1996)
def split_rai_femicides():
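    """Split the RAI femicides dataset (events plus associated news texts) into DEV10 and MAIN portions."""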
    # process the excel file
    print("Processing excel file...")
    femicide_events = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx",
                                    sheet_name="dati", header=0)
    event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events)

    # make sure the output directory exists (pandas/open will not create it)
    os.makedirs("output/femicides/split_data/rai", exist_ok=True)

    # save information about correspondences between victims and events
    # (we will need this later to retrieve the correct texts for each event, because the XLSX with texts uses victim
    # names as keys)
    dicts_to_save = (
        (victim_to_event_id, "victim_to_event_id"),
        (event_id_to_victims, "event_id_to_victims"),
        (victim_duplicate_counts, "victim_duplicate_counts")
    )
    write_dict_to_json(dicts_to_save)

    # shuffle and split; deduplicate first, since read_events() records one id per victim row,
    # so an event with several victims could otherwise end up in both splits
    print("Shuffling and splitting...")
    shuffled_event_ids = sorted(set(event_ids))
    random.shuffle(shuffled_event_ids)
    dev10_idx = shuffled_event_ids[:78]  # 78 events: the ~10% "DEV10" portion described in the module docstring
    main_idx = shuffled_event_ids[78:]
    dev10_df, main_df = create_split_df(dev10_idx, femicide_events)

    # write split dataframes
    for df, df_name in ((dev10_df, "dev10"), (main_df, "main")):
        df.to_csv(f"output/femicides/split_data/rai/split_{df_name}.events.csv")
        df.to_excel(f"output/femicides/split_data/rai/split_{df_name}.events.xlsx")

    # write filtered victim data
    dev10_victims = {e: victims for e, victims in event_id_to_victims.items() if e in dev10_idx}
    main_victims = {e: victims for e, victims in event_id_to_victims.items() if e in main_idx}
    filtered_dicts_to_save = (
        (dev10_victims, "event_id_to_victims.dev10"),
        (main_victims, "event_id_to_victims.main"),
    )
    write_dict_to_json(filtered_dicts_to_save)

    # retrieve texts for filtered data
    print("Filtering & writing texts...")
    texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    filter_texts("dev10", texts_df, dev10_victims, victim_duplicate_counts)
    filter_texts("main", texts_df, main_victims, victim_duplicate_counts)

def split_olv_femicides():
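    """Split the OLV femicides dataset into DEV10 and MAIN portions and write per-event text files."""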
    texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv")
    events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv")

    # shuffle the event ids and reserve ~10% of them for DEV10
    event_ids = events_df["event:id"].tolist()
    random.shuffle(event_ids)
    num_dev_events = round(len(event_ids) * 0.10)
    dev10_ids = event_ids[:num_dev_events]
    dev10_df, main_df = create_split_df(dev10_ids, events_df)

    # split texts
    dev10_texts_df = texts_df[texts_df["event_id"].isin(dev10_ids)]
    main_texts_df = texts_df[~texts_df["event_id"].isin(dev10_ids)]

    # make sure the output directory exists
    os.makedirs("output/femicides/split_data/olv", exist_ok=True)

    # write to files (distinct loop-variable names avoid shadowing events_df/texts_df above)
    for split_events_df, split_texts_df, split_name in ((dev10_df, dev10_texts_df, "dev10"),
                                                        (main_df, main_texts_df, "main")):
        split_events_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.events.csv")
        split_texts_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.texts.csv")
        split_events_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.events.xlsx")
        split_texts_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.texts.xlsx")

        # additionally, write one plain-text file per text, grouped in one directory per event:
        # the sentence-tokenized title, a blank line, then the sentence-tokenized body
        for _, row in split_texts_df.iterrows():
            event_id = row["event_id"]
            text_id = row["text_id"]
            event_dir = f"output/femicides/split_data/olv/split_{split_name}_texts_by_event/{event_id}/"
            os.makedirs(event_dir, exist_ok=True)
            with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                title = row["title"] if not pd.isna(row["title"]) else ""
                for line in nltk.sent_tokenize(title, language="italian"):
                    f_by_event.write(line + os.linesep)
                f_by_event.write(os.linesep)
                fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else ""
                if not fulltext:
                    print(f"WARNING: empty fulltext in text_id={text_id}")
                for line in nltk.sent_tokenize(fulltext, language="italian"):
                    line = line.strip()
                    if not line:
                        continue
                    f_by_event.write(line + os.linesep)

def write_dict_to_json(filtered_dicts_to_save):
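    """Dump each (dict, filename-stem) pair to output/femicides/split_data/rai/<name>.json."""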
    for dict_data, dict_name in filtered_dicts_to_save:
        with open(f"output/femicides/split_data/rai/{dict_name}.json", "w", encoding="utf-8") as f:
            json.dump(dict_data, f, indent=4, sort_keys=True)

def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
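    """Partition the events dataframe into rows whose event id is in `dev10` and all remaining rows."""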
    dev10_rows = []
    main_rows = []
    for _, row in femicide_events.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)
        if event_id in dev10:
            dev10_rows.append(row)
        else:
            main_rows.append(row)
    dev10_df = pd.DataFrame(dev10_rows)
    main_df = pd.DataFrame(main_rows)
    return dev10_df, main_df

def read_events(events_df):
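    """Collect event ids and victim names from the events dataframe.

    Note that the returned event_ids list contains one entry per row, so the id of an event with several
    victims occurs more than once. Victims with duplicate names are disambiguated with a running counter.
    """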
    event_ids: List[int] = []
    victim_to_event_id: Dict[str, int] = {}
    event_id_to_victims: Dict[int, List[Tuple[str, int]]] = defaultdict(list)
    victim_duplicate_counts: Dict[str, int] = defaultdict(int)

    for _, row in events_df.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)

        # unspecified name --> "UNKNOWN_X"
        victim = row["victim:name"]
        if victim == "non rilevato" or pd.isna(victim):  # "non rilevato" = "not recorded"
            victim = f"UNKNOWN_{event_id}"

        # disambiguate victims with duplicate names
        victim_duplicate_counts[victim] += 1
        duplicate_id = victim_duplicate_counts[victim]

        event_ids.append(event_id)
        victim_to_event_id[f"{victim}/{duplicate_id}"] = event_id
        event_id_to_victims[event_id].append((victim, duplicate_id))
    return event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts

def filter_texts(split_name: str,
                 texts_df: pd.DataFrame,
                 event_idx_to_victims: Dict[int, List[Tuple[str, int]]],
                 victim_duplicate_counts: Dict[str, int]):
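    """Select the texts belonging to one split and write them out.

    Texts are matched to events via victim names, so texts for victims with ambiguous (duplicate) names are
    skipped. Writes a flat sentence file with a parallel id file, one file per text grouped by event, and
    CSV/XLSX metadata.
    """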
print(f"\tfilter_texts: filtering split {split_name}")
# first filter victims
victim_to_event_idx = {}
for e_id in event_idx_to_victims:
for victim in event_idx_to_victims[e_id]:
victim_name, victim_dup_id = victim
if victim_duplicate_counts[victim_name] > 1:
print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'")
continue
victim_to_event_idx[victim_name] = e_id
meta_rows: List[Dict[str, Any]] = []
with open(f"output/femicides/split_data/rai/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \
open(f"output/femicides/split_data/rai/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id:
for _, row in texts_df.iterrows():
text_victim = row["vittima"].strip()
if text_victim in victim_to_event_idx:
e_id = victim_to_event_idx[text_victim]
text_id = int(row["ID"])
url = row["link"]
pubdate = row["pubdate"]
provider = row["provider"]
title = row["title"]
meta_rows.append({
"event_id": e_id,
"text_id": text_id,
"url": url,
"pubdate": pubdate,
"provider": provider,
"title": title
})
# body_text_lines = row["text"].split("\n")
body_text_lines = nltk.sent_tokenize(row["text"], language="italian")
title_lines = nltk.sent_tokenize(title, language="italian")
# f_txt.write(title.strip() + os.linesep)
# f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)
for line in title_lines:
f_txt.write(line + os.linesep)
f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)
event_dir = f"output/femicides/split_data/rai/split_{split_name}_texts_by_event/{e_id}/"
os.makedirs(event_dir, exist_ok=True)
with open(event_dir + f"/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
# f_by_event.write(title.strip() + os.linesep)
for line in title_lines:
f_by_event.write(line + os.linesep)
f_by_event.write(os.linesep)
for line in body_text_lines:
line = line.strip()
if not line:
continue
f_txt.write(line + os.linesep)
f_by_event.write(line + os.linesep)
f_id.write(f"event {e_id}\ttext {text_id}\tbody" + os.linesep)
meta_df = pd.DataFrame(meta_rows)
meta_df.to_csv(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.csv")
meta_df.to_excel(f"output/femicides/split_data/rai/split_{split_name}.texts.meta.xlsx")
print()
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("dataset", choices=["rai", "olv"])
    args = ap.parse_args()

    if args.dataset == "rai":
        split_rai_femicides()
    else:
        split_olv_femicides()