Spaces:

responsibility-framing
/

sociofillmore_public

Build error

sociofillmore_public / sociofillmore /crashes /split_data.py

Gosse Minnema

Add sociofillmore code, load dataset via private dataset repo

b11ac48 8 months ago

8.49 kB

	import json
	import os
	import random
	import shutil
	from datetime import datetime

	import langdetect
	import nltk
	import pandas as pd
	from langdetect import DetectorFactory, LangDetectException

	# JSON dump that process_events() loads; it is iterated as a list of crash
	# events, each carrying its own list of articles.
	DATA_FILE = "data/crashes/thecrashes_data_all_text.json"

	# Probability with which an event is assigned to the dev split
	# (compared against random.random() once per event in process_events()).
	DEV_PORTION = .10

	# Fixed seeds so repeated runs produce the same dev/main split;
	# per the langdetect documentation, setting DetectorFactory.seed makes
	# language detection deterministic as well.
	random.seed(2001)
	DetectorFactory.seed = 0


	def is_a_real_time(timestamp):
	    """Tell whether *timestamp* carries an actual time of day.

	    Midnight (00:00:00) is treated as the "empty" placeholder time and
	    yields False; every other hour/minute/second combination yields True.
	    """
	    clock = (timestamp.hour, timestamp.minute, timestamp.second)
	    return clock != (0, 0, 0)


	def main():
	    """Script entry point: run the complete event-processing pipeline."""
	    process_events()


	def detect_language(article):
	    """Guess the language of *article* and return its language code.

	    The detection sample is the first non-empty field among ``alltext``,
	    ``summary`` and ``title`` (in that order).

	    Returns whatever ``langdetect.detect`` reports for the sample, or the
	    placeholder ``"UNK_LANG"`` when there is no text to inspect or when
	    langdetect raises ``LangDetectException``.
	    """
	    sample = article["alltext"] or article["summary"] or article["title"]

	    # An empty/None sample can never be classified; skip langdetect entirely
	    # (passing None to it would fail with an unrelated error).
	    if sample:
	        try:
	            return langdetect.detect(sample)
	        except LangDetectException:
	            # fall through to the shared "unknown language" report below
	            pass

	    print(f"\tCould not detect language for text_id={article['id']}")
	    print(f"\tSample={sample}")
	    print()
	    return "UNK_LANG"


	def extract_text_info(event):
	    """Collect segmented text, line IDs and metadata for each article of *event*.

	    Returns a 3-tuple ``(ev_text_lines, ev_id_lines, ev_meta_rows)``:

	    * ``ev_text_lines`` -- one list per article with the segmented lines of
	      its title, summary and body, in that order;
	    * ``ev_id_lines`` -- parallel lists with one tab-separated ID string per
	      text line (event id, text id, and the section the line came from);
	    * ``ev_meta_rows`` -- one metadata dict per article.

	    An article whose ``publishedtime`` cannot be parsed gets ``pubdate=None``.
	    """
	    ev_text_lines = []
	    ev_id_lines = []
	    ev_meta_rows = []

	    for article in event["articles"]:
	        text_id = article["id"]
	        try:
	            pubdate = datetime.fromisoformat(article["publishedtime"]).strftime("%Y-%m-%d %H:%M:%S")
	        except (TypeError, ValueError):
	            # ValueError: malformed date string;
	            # TypeError: non-string value such as None
	            print(f"\t\tcould not parse date {article['publishedtime']}")
	            pubdate = None
	        url = article["url"]
	        provider = article["sitename"]
	        title = article["title"]
	        language = detect_language(article)
	        ev_meta_rows.append({
	            "event_id": event["id"],
	            "text_id": text_id,
	            "pubdate": pubdate,
	            "language": language,
	            "url": url,
	            "provider": provider,
	            "title": title
	        })

	        # (section label, section text), in the order the lines are emitted
	        sections = (
	            ("title", title),
	            ("summary", article["summary"]),
	            ("body", article["alltext"]),
	        )

	        text_lines = []
	        id_lines = []
	        for section, text in sections:
	            id_line = f"event {event['id']}\ttext {text_id}\t{section}"
	            for line in segment(text, language):
	                text_lines.append(line)
	                id_lines.append(id_line)

	        ev_text_lines.append(text_lines)
	        ev_id_lines.append(id_lines)

	    return ev_text_lines, ev_id_lines, ev_meta_rows


	def segment(text, language):
	    """Split *text* into a list of sentences for the given language code.

	    Always returns a list of strings:

	    * Hebrew (``"he"``) and Vietnamese (``"vi"``) have no segmenter here, so
	      the whole text is returned as a single element;
	    * languages in the mapping below use the matching NLTK sentence
	      tokenizer; Afrikaans is treated as Dutch;
	    * any other language falls back to the English tokenizer (with a notice).

	    Empty or missing text yields an empty list.
	    """
	    # don't split Hebrew and Vietnamese (because we don't have a segmenter for it);
	    # wrap in a list so callers iterate over lines, not over characters
	    if language in ["he", "vi"]:
	        return [text] if text else []

	    lang_map = {
	        "nl": "dutch",
	        "en": "english",
	        "es": "spanish",
	        "de": "german",
	        "fr": "french",
	        "ru": "russian",
	        "pt": "portuguese"
	    }

	    nltk_lang = lang_map.get(language)

	    # what to do with languages without sent tokenizer in NLTK (apart from Hebrew):
	    if not nltk_lang:
	        if language == "af":
	            # treat Afrikaans as Dutch
	            nltk_lang = "dutch"
	        else:
	            print(f"Found an article with unsupported language={language}, falling back to English NLTK")
	            nltk_lang = "english"

	    # nothing to tokenize (and None would make the tokenizer fail)
	    if not text:
	        return []

	    return nltk.sent_tokenize(text, nltk_lang)


	def write_to_text_by_event(text_lines, text_meta_lines, event_id, split_to_dir, split):
	    """Write one text file per article into the event's directory for *split*.

	    Parameters:
	        text_lines: one list of text lines per article.
	        text_meta_lines: metadata dicts parallel to *text_lines*; each
	            supplies the ``text_id`` used as the file name.
	        event_id: identifier used as the name of the per-event directory.
	        split_to_dir: mapping from split name to its base directory.
	        split: key into *split_to_dir* selecting the target directory.

	    Files are written to ``<split dir>/<event_id>/<text_id>.txt``, one line
	    of text per file line.
	    """
	    event_dir = f"{split_to_dir[split]}/{event_id}"
	    os.makedirs(event_dir, exist_ok=True)
	    for art_lines, row in zip(text_lines, text_meta_lines):
	        text_file = f"{event_dir}/{row['text_id']}.txt"
	        with open(text_file, "w", encoding="utf-8") as f:
	            for line in art_lines:
	                # Text mode already translates "\n" to the platform line
	                # ending; writing os.linesep would yield "\r\r\n" on Windows.
	                f.write(line + "\n")


	def _build_event_row(event):
	    """Build the flat per-event record (one row of the events CSV) for *event*."""
	    try:
	        timestamp = datetime.fromisoformat(event["date"])
	    except (TypeError, ValueError):
	        # missing or unparsable date: the date and time columns stay empty
	        timestamp = None

	    persons = event["persons"]
	    # transportation-mode codes counted under the "vehicle" label
	    vehicle_codes = range(5, 14)

	    event_row = {
	        "event:id": event["id"],
	        "event:date": timestamp.strftime("%Y-%m-%d") if timestamp else None,
	        # a midnight timestamp means "no time recorded", see is_a_real_time()
	        "event:time": timestamp.strftime("%H-%M-%S") if timestamp and is_a_real_time(timestamp) else None,
	        # string form of the (latitude, longitude) tuple
	        "event:coordinates": f"{event['latitude'], event['longitude']}",
	        "vehicle_involved": 1 if any(p["transportationmode"] in vehicle_codes for p in persons) else 0
	    }

	    # per-outcome counts: total, children, and a breakdown by transport mode
	    for health, health_code in (("dead", 3), ("injured", 2)):
	        all_with_health = [p for p in persons if p["health"] == health_code]
	        event_row[f"outcomes:{health}:total"] = len(all_with_health)
	        event_row[f"outcomes:{health}:child"] = sum(1 for p in all_with_health if p["child"] == 1)
	        for mode, mode_codes in (("pedestrian", [1]), ("cyclist", [2]), ("vehicle", vehicle_codes)):
	            event_row[f"outcomes:{health}:{mode}"] = sum(
	                1 for p in all_with_health if p["transportationmode"] in mode_codes
	            )

	    return event_row


	def process_events():
	    """Load all events, split them into dev/main sets and write every output.

	    Reads DATA_FILE, then for each event:

	    * builds an event record and extracts the article texts and metadata;
	    * stores the event in the "all" split and, depending on one random draw
	      against DEV_PORTION, in either the "dev" or the "main" split.

	    For each split, the following is written under
	    ``output/crashes/split_data``: concatenated text and line-ID files,
	    a directory with one text file per article grouped by event, an events
	    CSV and a text-metadata CSV. Existing per-event directories are removed
	    and the concatenated text files are truncated before processing starts.
	    """
	    out_dir = "output/crashes/split_data"
	    # split name -> prefix shared by all output files of that split
	    prefixes = {"all": "all", "dev": "split_dev10", "main": "split_main"}

	    print("Loading data file...")
	    with open(DATA_FILE, encoding="utf-8") as f:
	        data = json.load(f)

	    event_rows = {split: [] for split in prefixes}
	    text_rows = {split: [] for split in prefixes}

	    # the output directory may not exist yet on a fresh checkout
	    os.makedirs(out_dir, exist_ok=True)

	    # make empty text files
	    text_file_basenames = {split: f"{out_dir}/{prefix}.texts" for split, prefix in prefixes.items()}
	    for bn in text_file_basenames.values():
	        for ext in (".text.txt", ".ids.txt"):
	            with open(f"{bn}{ext}", "w", encoding="utf-8"):
	                pass

	    # clear & make text file directories
	    text_files_by_event_dir = {}
	    for split, prefix in prefixes.items():
	        text_dir = f"{out_dir}/{prefix}_texts_by_event"
	        text_files_by_event_dir[split] = text_dir
	        if os.path.exists(text_dir):
	            shutil.rmtree(text_dir)
	        os.mkdir(text_dir)

	    # helper function for writing text files
	    def append_to_txt(txt_file, lines):
	        with open(txt_file, "a", encoding="utf-8") as f_out:
	            for art_lines in lines:
	                for line in art_lines:
	                    # text mode translates "\n" itself; os.linesep would
	                    # produce "\r\r\n" on Windows
	                    f_out.write(line + "\n")

	    # helper function recording one processed event in a given split
	    def store(split, event_id, event_row, text_lines, text_id_lines, text_meta_rows):
	        event_rows[split].append(event_row)
	        text_rows[split].extend(text_meta_rows)
	        append_to_txt(text_file_basenames[split] + ".text.txt", text_lines)
	        append_to_txt(text_file_basenames[split] + ".ids.txt", text_id_lines)
	        write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, split)

	    print("Processing events...")
	    for event in data:
	        event_id = event["id"]
	        print(f"\tevent_id={event_id}")

	        event_row = _build_event_row(event)
	        text_lines, text_id_lines, text_meta_rows = extract_text_info(event)

	        store("all", event_id, event_row, text_lines, text_id_lines, text_meta_rows)

	        # exactly one random draw per event decides between dev and main
	        split = "dev" if random.random() < DEV_PORTION else "main"
	        store(split, event_id, event_row, text_lines, text_id_lines, text_meta_rows)

	    for split in ("all", "main", "dev"):
	        pd.DataFrame(event_rows[split]).to_csv(f"{out_dir}/{prefixes[split]}.events.csv")

	    for split in ("all", "main", "dev"):
	        pd.DataFrame(text_rows[split]).to_csv(f"{out_dir}/{prefixes[split]}.texts.meta.csv")


	# Run the pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()