import dataclasses
import io
import json
import os
import sys
import glob
import random
import re
import tarfile
import datetime
from collections import defaultdict
from typing import Dict, List, Optional
import functools
import requests
import tempfile
import lxml.etree as ET
import pandas as pd
import numpy as np
import gensim
import spacy
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import FramenetError
from flask import Flask, request, render_template, jsonify, redirect, abort, session, url_for
from sociofillmore.common.analyze_text import (
FrameStructure,
get_syntax_info,
is_at_root,
process_prediction_file,
POSSIBLE_CONSTRUCTIONS,
SYNTAX_ANALYSIS_CACHE_FILES,
enrich_texts_df,
read_frames_of_interest,
load_deep_frames_cache,
get_text_meta,
analyze_single_document,
get_tarball_blocks,
analyze_external_file
)
from sociofillmore.crashes.utils import is_a_dutch_text
# # download nltk packages if needed
# if sys.argv[2] != "local":
# nltk.download("framenet_v17", download_dir="/nltk_data")
# nltk.download("punkt", download_dir="/nltk_data")
# print("Done!")
# security (very basic!)
PROTECTED_DATASETS = [] # "femicides/rai"
if os.path.exists("secrets.json"):
with open("secrets.json", encoding="utf-8") as f:
secrets = json.load(f)
AUTH_KEY = secrets["auth_key"]
PASSWORD = secrets["password"]
SECRET_KEY = bytes(secrets["flask_secret_key"], "utf-8")
else:
AUTH_KEY = os.environ.get("AUTH_KEY")
PASSWORD = os.environ.get("PASSWORD")
SECRET_KEY = os.environ.get("FLASK_SECRET_KEY")
# global app object
print("Defining app...")
app = Flask(__name__)
app.secret_key = SECRET_KEY
# gensim & spacy models
def load_gensim_model(limit):
print("Loading GENSIM model... [this can take a few minutes]")
return gensim.models.word2vec.KeyedVectors.load_word2vec_format("data/embeddings/concat_glove_frames.w2v.txt", limit=limit)
gensim_m = None
gensim_m = load_gensim_model(100_000)
print("Loading SpaCy models...")
spacy_model_ud = spacy.load("xx_sent_ud_sm")
spacy_model_langs = {
"it": spacy.load("it_core_news_md"),
"nl": spacy.load("nl_core_news_md"),
"en": spacy.load("en_core_web_md")
}
# frequency information cache
frame_freq_cache = {}
with open("resources/fn_frames_to_roles.json", encoding="utf-8") as f:
fn_frames_to_roles = json.load(f)
# data processing constants
VICTIM_AGE_GROUPS = ["0-12", "12-18", "18-30", "30-50", "50-70", "70-120"]
ALL_FOREIGN_NATIONALITIES = "estero (tutto)"
deep_frames_cache = load_deep_frames_cache()
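# The read_*_provider_attrs() helpers below each build a dict that maps an
# attribute key ("<attribute>:<value>", or "<attribute>:<rater>:<value>" for the
# politics ratings) to the list of providers carrying that attribute. Sketch of
# the shape only, with hypothetical provider names:
#   {"politics:man:left": ["provider_a", "provider_b"], "type:agency": ["agency_x"]}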
def read_rai_provider_attrs():
df = pd.read_excel("resources/RAI_sources_mr.xlsx")
return {
"politics:man:left": df[df["politics_man"] == "L"]["source"].unique().tolist(),
"politics:tc:left": df[df["politics_tc_cat"] == "L"]["source"]
.unique()
.tolist(),
"politics:agg:left": df[df["politics_agg"] == "L"]["source"].unique().tolist(),
"politics:man:right": df[df["politics_man"] == "R"]["source"].unique().tolist(),
"politics:tc:right": df[df["politics_tc_cat"] == "R"]["source"]
.unique()
.tolist(),
"politics:agg:right": df[df["politics_agg"] == "R"]["source"].unique().tolist(),
"politics:man:neutral": df[df["politics_man"] == "N"]["source"]
.unique()
.tolist(),
"politics:tc:neutral": df[df["politics_tc_cat"] == "N"]["source"]
.unique()
.tolist(),
"politics:agg:neutral": df[df["politics_agg"] == "N"]["source"]
.unique()
.tolist(),
"type:agency": df[df["type"] == "A"]["source"].unique().tolist(),
"type:outlet": df[df["type"] == "OUTLET"]["source"].unique().tolist(),
"area:national": df[df["area"] == "nat"]["source"].unique().tolist(),
"area:regional": df[df["area"] == "loc"]["source"].unique().tolist(),
}
def read_migration_provider_attrs():
df = pd.read_csv("data/migration/provider_pol_rel_ratings.csv")
return {
"politics:sc:left": df[df["political_stance"] == -1]["provider"].unique().tolist(),
"politics:sc:right": df[df["political_stance"] == 1]["provider"].unique().tolist(),
"politics:sc:neutral": df[df["political_stance"] == 0]["provider"].unique().tolist(),
"religion:sc:catholic": df[df["religious"] == True]["provider"].unique().tolist(),
"religion:sc:non_catholic": df[df["religious"] == False]["provider"].unique().tolist()
}
def read_crashes_provider_attrs():
df = pd.read_csv("resources/crashes_sources.csv")
# remove empty rows
df = df.dropna(subset=["ProviderNameCorr"])
# map to correct names
name_map = {
row["ProviderName"]: row["ProviderNameCorr"].strip('"')
for _, row in df.iterrows()
}
# merge duplicates
df = df.groupby(list(df.columns[2:11]))["ProviderFreq"].apply(sum).reset_index()
# "explode" multiple province fields
df = df.assign(**{"Province": df["Province"].str.split("|")}).explode("Province")
attr_map = {
f"{col}:{val}": df[df[col] == val]["ProviderNameCorr"].unique().tolist()
for col in df.columns[1:9]
for val in set(df[col].values)
if val != "-"
}
return attr_map, name_map
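# For the crashes dataset two structures are returned: attr_map follows the same
# "<column>:<value>" -> providers scheme, built from the spreadsheet columns
# (e.g. a hypothetical "Province:Utrecht"), and name_map maps each raw
# ProviderName to the cleaned ProviderNameCorr used elsewhere in the app.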
PROVIDER_ATTRS = {
"femicides/rai": read_rai_provider_attrs(),
"femicides/olv": {},
"crashes/thecrashes": read_crashes_provider_attrs()[0],
"migration/pavia": read_migration_provider_attrs()
}
# variables for the currently active dataset
def get_dataset_variables(dataset_name):
if dataset_name == "femicides/rai":
spacy_model = "it_core_news_md"
elif dataset_name == "femicides/olv":
spacy_model = "it_core_news_md"
elif dataset_name == "crashes/thecrashes":
spacy_model = "nl_core_news_md"
elif dataset_name == "migration/pavia":
spacy_model = "it_core_news_md"
else:
raise ValueError("Unsupported dataset!")
return {
"dataset": dataset_name,
"frames": read_frames_of_interest(dataset_name),
"spacy_model": spacy_model,
}
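# Sketch of the dict returned by get_dataset_variables() above (stored in the
# session as "dataset_vars"):
#   {"dataset": "femicides/rai", "frames": [...frames of interest...],
#    "spacy_model": "it_core_news_md"}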
# ==== DATA PROCESSING FUNCTIONS ====
# event data
def load_event_data(dataset):
if dataset == "femicides/rai":
event_data_file = "output/femicides/split_data/rai/split_ALL.events.csv"
texts_data_file = "output/femicides/split_data/rai/split_ALL.texts.meta.csv"
elif dataset == "femicides/olv":
event_data_file = "output/femicides/split_data/olv/split_dev10.events.csv"
texts_data_file = "output/femicides/split_data/olv/split_dev10.texts.csv"
elif dataset == "crashes/thecrashes":
event_data_file = "output/crashes/split_data/split_dev10.events.csv"
texts_data_file = "output/crashes/split_data/split_dev10.texts.meta.csv"
elif dataset == "migration/pavia":
event_data_file = "output/migration/split_data/split_dev10.events.csv"
texts_data_file = "output/migration/split_data/split_dev10.texts.meta.csv"
else:
raise ValueError("Unsupported dataset")
events = pd.read_csv(event_data_file, dtype={"event:id": int}, index_col=0)
texts = enrich_texts_df(pd.read_csv(texts_data_file, index_col=0), events)
return {"events_df": events, "texts_df": texts}
DATASETS = {
"femicides/rai": load_event_data("femicides/rai"),
"femicides/olv": load_event_data("femicides/olv"),
"crashes/thecrashes": load_event_data("crashes/thecrashes"),
"migration/pavia": load_event_data("migration/pavia"),
}
SKIP_FUNCTIONS = {
"femicides/rai": None,
"femicides/olv": None,
"crashes/thecrashes": lambda doc: not is_a_dutch_text(doc),
"migration/pavia": None
}
def read_frames_to_event_roles(dataset):
if dataset == "femicides/rai":
ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv")
if dataset == "femicides/olv":
ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv")
elif dataset == "crashes/thecrashes":
ftr_df = pd.read_csv("resources/crashes_frame_to_roles.csv")
else:
raise ValueError("Unsupported dataset!")
frames_to_event_roles: Dict[str, Dict[str, List[str]]] = {}
role_types = [col for col in ftr_df.columns if col.startswith("role:")]
for _, row in ftr_df.iterrows():
frame_roles = defaultdict(list)
for rt in role_types:
role_key = rt.split(":")[1]
if row[rt] == "-":
frame_roles[role_key] = []
else:
for role in row[rt].split("|"):
frame_roles[role_key].append(role)
frames_to_event_roles[row["frame"]] = frame_roles
return frames_to_event_roles
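# Shape of the mapping built above; frame and role names are illustrative, the
# real ones come from the resources/*_frame_to_roles.csv files:
#   {"Killing": {"victim": ["Victim"], "perpetrator": ["Killer", "Cause"]}}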
def get_role_expressions(
struct: FrameStructure,
roles_dep_map: Dict[int, Dict[str, str]],
frame_to_role_map: Optional[Dict[str, Dict[str, List[str]]]],
depth_filter: int,
output_depth_only: bool = False,
) -> List[str]:
role_exps = []
role_deps = roles_dep_map[struct.target.tokens_idx[0]]
def make_exp(_role, _dep, _depth):
if output_depth_only:
return _role + "::" + str(_depth)
else:
if _depth > depth_filter:
_dep = None
return _role + "::" + (_dep or "_UNK_DEP")
# no event role mapping: just use roles as-is
if frame_to_role_map is None:
for role, _ in struct.roles:
dep, depth = role_deps.get(role, (None, -1))
role_exps.append(make_exp(role, dep, depth))
elif struct.frame in frame_to_role_map:
for role_type, rt_roles in frame_to_role_map[struct.frame].items():
for role in rt_roles:
if role in [r[0] for r in struct.roles]:
dep, depth = role_deps.get(role, (None, -1))
role_exps.append(make_exp(role, dep, depth))
# else:
# exp = role_type + "::_ABSENT"
# role_exps.append(exp)
return role_exps
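# Each role expression produced above is a string "<FrameElement>::<dependency>"
# (or "<FrameElement>::<depth>" when output_depth_only is set), e.g. a
# hypothetical "Killer::nsubj" or "Killer::2"; "_UNK_DEP" stands in when the
# dependency is unknown or lies deeper than depth_filter.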
def get_analyze_frame_samples(
construction,
frame,
dependency,
role,
lome_model,
max_samples_per_doc,
samples_to_find,
selected_documents,
dataset_vars,
texts_df,
):
dataset = dataset_vars["dataset"]
print("# selected documents", len(selected_documents))
samples_found = {}
tar_blocks = get_tarball_blocks(dataset, lome_model)
syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]
prediction_files = []
print("# tar blocks", len(glob.glob(tar_blocks + "/*.tar")))
for block in glob.glob(tar_blocks + "/*.tar"):
with tarfile.open(block, "r") as tar_f:
block_prediction_files = [
m.name for m in tar_f.getmembers() if m.name.endswith(".comm.json")
]
print("\t# prediction files", len(prediction_files))
matching_prediction_files = [
pf
for pf in block_prediction_files
if re.search(r"/(\d+)/lome_(\d+).comm.json", pf).group(2)
in selected_documents
]
print("\t# matching prediction files", len(matching_prediction_files))
print("\t")
prediction_files.extend(matching_prediction_files)
print(len(prediction_files))
while prediction_files and len(samples_found) < samples_to_find:
print(
f"\t\tsamples_found: {len(samples_found)}//prediction_files left: {len(prediction_files)}"
)
# choose a random prediction file
pf = random.choice(prediction_files)
prediction_files.remove(pf)
print(pf)
# filter for selected frame
doc_id = os.path.basename(pf).split(".")[0].split("_")[1]
doc_key = doc_id[:2]
tarball = get_tarball_blocks(dataset, lome_model) + f"/block_{doc_key}.tar"
with tarfile.open(tarball, "r") as tar_f:
pf_obj = io.TextIOWrapper(tar_f.extractfile(pf))
(
sents,
pred_structures,
syntax_analyses,
role_analyses,
) = process_prediction_file(
filename=pf,
file_obj=pf_obj,
dataset_name=dataset_vars["dataset"],
syntax_cache=syntax_cache,
deep_frames_cache=deep_frames_cache,
)
if syntax_analyses is None:
continue
(
frame_sents,
frame_pred_structures,
frame_syntax_analyses,
frame_role_mappings,
) = ([], [], [], [])
for s, pred, syn, rol in zip(
sents, pred_structures, syntax_analyses, role_analyses
):
for fs in pred.values():
fs_syn = get_syntax_info(fs, syn)
fs_rol = rol[fs.target.tokens_idx[0]]
frame_matches = frame == "*" or fs.frame == frame
construction_matches = (
construction == "*"
or fs_syn["syn_construction"] == construction
)
role_matches = role == "*" or role in [r for r, _ in fs.roles]
if role != "*":
dependency_matches = dependency == "*" or (
role,
dependency,
) in [(r, d) for r, (d, _) in fs_rol.items()]
else:
dependency_matches = dependency == "*" or dependency in [
d for d, _ in fs_rol.values()
]
if (
frame_matches
and construction_matches
and role_matches
and dependency_matches
):
frame_sents.append(s)
frame_pred_structures.append(pred)
frame_syntax_analyses.append(syn)
frame_role_mappings.append(rol)
# select random frames
if not frame_sents:
continue
for _ in range(max_samples_per_doc):
selected_idx = random.randrange(len(frame_sents))
if (pf, selected_idx) not in samples_found:
sample = (
frame_sents[selected_idx],
frame_pred_structures[selected_idx],
frame_syntax_analyses[selected_idx],
frame_role_mappings[selected_idx],
)
if sample not in samples_found.values():
samples_found[(pf, selected_idx)] = sample
# format output
output = []
for (pf, idx), (sent, structs, syntax, roles) in samples_found.items():
# extract eventID and docID from file string
re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", pf)
event_id = re_m.group(1)
doc_id = re_m.group(2)
output.append(
{
"sentence": sent,
"fn_structures": [
dataclasses.asdict(fs) for fs in structs.values()
],
"syntax": syntax,
"roles": roles,
"meta": {
"event_id": event_id,
"doc_id": doc_id,
"text_meta": get_text_meta(doc_id, texts_df),
},
}
)
return output
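# Each sample returned by get_analyze_frame_samples() is a dict of the form
# (field contents come from process_prediction_file()):
#   {"sentence": ..., "fn_structures": [...], "syntax": ..., "roles": ...,
#    "meta": {"event_id": ..., "doc_id": ..., "text_meta": {...}}}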
# ==== WEB SERVER UTILITY FUNCTIONS ====
def security_check():
return True
# if request.cookies.get("auth_key", None) == AUTH_KEY:
# return True
# if request.args.get("auth_key", None) == AUTH_KEY:
# return True
# return False
# ==== APP ROUTE FUNCTIONS ====
@app.before_request
def init_session():
if session.get("initialized"):
return
print("Initializing...")
_switch_dataset("femicides/olv", True)
session["initialized"] = True
return
@app.route("/")
def index():
return redirect(url_for("demo"))
@app.route("/explore")
def start():
return render_template("index.html")
@app.route("/demo")
def demo():
return render_template("demo.html")
@app.route("/check_password", methods=["POST"])
def check_password():
entered_password = request.form["password"]
if entered_password == PASSWORD:
resp = jsonify({"success": True})
resp.set_cookie("auth_key", AUTH_KEY)
return resp
else:
return jsonify({"success": False})
@app.route("/switch_dataset")
def switch_dataset():
new_dataset = request.args.get("dataset")
_switch_dataset(new_dataset)
return jsonify({"result": "changed_dataset"})
def _switch_dataset(new_dataset, first_time=False):
print(first_time)
if not first_time:
if new_dataset == session["dataset_vars"]["dataset"]:
return jsonify({"result": "no_change"})
session["dataset_vars"] = get_dataset_variables(new_dataset)
if new_dataset == "femicides/rai":
session["provider_name_map"] = {}
elif new_dataset == "crashes/thecrashes":
_, name_map = read_crashes_provider_attrs()
session["provider_name_map"] = name_map
elif new_dataset == "migration/pavia":
session["provider_name_map"] = {}
else:
session["provider_name_map"] = {}
# session["frames_to_roles"] = read_frames_to_event_roles(new_dataset)
@app.route("/analyze")
def analyze():
event_id = request.args.get("event")
doc_id = request.args.get("document")
lome_model = request.args.get("model")
dataset = session["dataset_vars"]["dataset"]
if dataset in PROTECTED_DATASETS:
if not security_check():
abort(403)
print(dataset)
output = analyze_single_document(
doc_id,
event_id,
lome_model,
dataset,
DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
deep_frames_cache=deep_frames_cache
)
return jsonify(output)
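# Example call (parameter values are hypothetical):
#   GET /analyze?event=1234&document=5678&model=lome_0shot
# returns the frame-semantic analysis of a single document as JSON.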
@app.route("/sample_frame")
def sample_frame():
dataset = session["dataset_vars"]["dataset"]
if dataset in PROTECTED_DATASETS and not security_check():
abort(403)
frame = request.args.get("frame")
construction = request.args.get("construction")
role = request.args.get("role")
dependency = request.args.get("dependency")
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
filtered_docs = filter_documents(
session["dataset_vars"],
events_df,
texts_df,
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
session["provider_name_map"],
)
selected_documents = {doc["doc_id"] for doc in filtered_docs}
lome_model = request.args.get("model")
samples_to_find = int(request.args.get("nsamples", 5))
max_samples_per_doc = int(request.args.get("nperdoc", 10))
output = get_analyze_frame_samples(
construction,
frame,
dependency,
role,
lome_model,
max_samples_per_doc,
samples_to_find,
selected_documents,
session["dataset_vars"],
DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
)
return jsonify(output)
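# Example call (all values hypothetical; "*" acts as a wildcard for frame,
# construction, role and dependency):
#   GET /sample_frame?frame=Killing&construction=*&role=*&dependency=*
#       &model=lome_0shot&nsamples=5&nperdoc=10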
@app.route("/lus_to_frames")
def get_frames_from_lus():
lus = request.args.get("lus", "").split("+")
frames = set()
for lu in lus:
frames.update({lu_info.frame.name for lu_info in fn.lus(r"^" + lu + r"\.")})
print(frames)
return jsonify({"frames": sorted(frames)})
def format_frame_description(frame_def_xml):
frame_def_fmt = [frame_def_xml.text] if frame_def_xml.text else []
for elem in frame_def_xml:
if elem.tag == "ex":
break
elif elem.tag == "fen":
frame_def_fmt.append("<b>" + elem.text + "</b>")
elif elem.text:
frame_def_fmt.append(elem.text)
if elem.tail:
frame_def_fmt.append(elem.tail)
return frame_def_fmt
def get_alt_perspectives(frame_info, frame_name):
alt_perspectives = []
result_frames = [fr.subFrameName for fr in frame_info.frameRelations if fr.type.name == "Causative_of" and fr.superFrameName == frame_name]
if result_frames:
alt_perspectives.append({"frame": result_frames[0], "type": "result"})
cause_frames = [fr.superFrameName for fr in frame_info.frameRelations if fr.type.name == "Causative_of" and fr.subFrameName == frame_name]
if cause_frames:
alt_perspectives.append({"frame": cause_frames[0], "type": "causer"})
# user_frames = [fr.subFrameName for fr in frame_info.frameRelations if fr.type.name == "Using" and fr.superFrameName == frame_name]
# if user_frames:
# alt_perspectives.append({"frame": user_frames[0], "type": "user"})
neutral_frames = [fr.superFrameName for fr in frame_info.frameRelations if fr.type.name == "Perspective_on" and fr.subFrameName == frame_name]
if neutral_frames:
flipped_frames = [fr.subFrameName for fr in fn.frame(neutral_frames[0]).frameRelations if fr.type.name == "Perspective_on" and fr.superFrameName == neutral_frames[0] and fr.subFrameName != frame_name]
if flipped_frames:
alt_perspectives.extend([{"frame": ff, "type": "flipped"} for ff in flipped_frames])
return alt_perspectives
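# Alternative perspectives are read off FrameNet frame relations:
#  - "result" / "causer" via Causative_of (the textbook case is Killing vs. Death),
#  - "flipped" via sibling frames under the same Perspective_on neutral frame
#    (e.g. Commerce_buy vs. Commerce_sell under Commerce_goods-transfer).
# These example frames are standard FrameNet cases, not necessarily ones used here.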
@app.route("/frame_info")
def get_frame_info():
frame_name = request.args.get("frame").strip()
try:
print(repr(frame_name))
print(type(frame_name))
try:
frame_info = fn.frame(frame_name)
except KeyError:
# sometimes NLTK (randomly??) throws a key error, in which case, just try again
frame_info = fn.frame(frame_name)
try:
# try extracting just the first sentence
definition_first_sent = nltk.sent_tokenize(frame_info.definitionMarkup)[0] + "</def-root>"
frame_def_xml = ET.fromstring(definition_first_sent)
except ET.XMLSyntaxError:
# otherwise, use the full definition
frame_def_xml = ET.fromstring(frame_info.definitionMarkup)
frame_def_fmt = format_frame_description(frame_def_xml)
exemplars = [
{
"text": exemplar.text,
"target_lu": lu_name,
"target_idx": list(exemplar["Target"][0]),
"core_fes": {
role: exemplar.text[start_idx:end_idx]
for role, start_idx, end_idx in exemplar.FE[0]
if role in [fe for fe, fe_info in frame_info.FE.items() if fe_info.coreType == "Core"]
}
}
for lu_name, lu_info in frame_info["lexUnit"].items()
for exemplar in lu_info.exemplars
]
# try to find a "typical" exemplar --- typical -> as short as possible, as many FEs as possible
exa_typicality_scores = [(exa, len(exa["text"]) - 25 * len(exa["core_fes"])) for exa in exemplars]
if exa_typicality_scores:
typical_exemplar = min(exa_typicality_scores, key=lambda t: t[1])[0]
else:
typical_exemplar = None
alt_perspectives = get_alt_perspectives(frame_info, frame_name)
return jsonify({
"result": "OK",
"frameDefinition": frame_def_fmt,
"exemplars": exemplars,
"altPerspectives": alt_perspectives,
"typicalExemplar": {
"text": typical_exemplar["text"],
"coreFrameElements": typical_exemplar["core_fes"]
} if typical_exemplar else None
})
except FramenetError:
return jsonify({"result": "FAIL", "info": "could not find frame"})
@app.route("/frames")
def get_frames():
return jsonify(session["dataset_vars"]["frames"])
@app.route("/constructions")
def get_constructions():
return jsonify(POSSIBLE_CONSTRUCTIONS)
@app.route("/event_filters")
def get_event_filters():
dataset = session["dataset_vars"]["dataset"]
if dataset in PROTECTED_DATASETS and not security_check():
abort(403)
if session["dataset_vars"]["dataset"] == "femicides/rai":
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
event_categories = events_df["event:category"].unique().tolist()
regions = sorted(events_df["event:region"].unique().tolist())
sem_location = sorted(events_df["event:semantic_location"].unique().tolist())
victim_age = VICTIM_AGE_GROUPS
victim_nationality = [
"Italia",
ALL_FOREIGN_NATIONALITIES,
"non rilevato",
] + sorted(
[
i
for i in events_df["victim:nationality"].dropna().unique().tolist()
if i not in ["Italia", "non rilevato"]
]
)
attacker_nationality = [
"Italia",
ALL_FOREIGN_NATIONALITIES,
"non rilevato",
] + sorted(
[
i
for i in events_df["attacker:nationality"].dropna().unique().tolist()
if i not in ["Italia", "non rilevato"]
]
)
victim_occupation = sorted(
[
i
for i in events_df["victim:occupation"].dropna().unique().tolist()
]
)
attacker_occupation = sorted(
[
i
for i in events_df["attacker:occupation"].dropna().unique().tolist()
]
)
return jsonify(
{
"event_categories": event_categories,
"regions": regions,
"sem_location": sem_location,
"victim_age": victim_age,
"victim_nationality": victim_nationality,
"attacker_nationality": attacker_nationality,
"victim_occupation": victim_occupation,
"attacker_occupation": attacker_occupation
}
)
elif session["dataset_vars"]["dataset"] == "femicides/olv":
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
regions = sorted([str(r) for r in events_df["event:region"].unique().tolist()])
victim_age = VICTIM_AGE_GROUPS
return jsonify(
{
"regions": regions,
"victim_age": victim_age,
}
)
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
filters = {
"outcomes": ["no one", "one or more people"],
"imbalanced": ["yes", "no"],
}
return jsonify(filters)
else:
return jsonify({})
@app.route("/dep_labels")
def get_dep_labels():
dep_labels = set()
with open("resources/dep_labels.txt", encoding="utf-8") as f:
for line in f:
dep_labels.add(line.strip())
return jsonify(sorted(dep_labels))
@app.route("/role_labels")
def get_role_labels():
frame = request.args.get("frame")
roles = fn_frames_to_roles.get(frame)
if roles is not None:
return jsonify(roles)
else:
return jsonify([])
@app.route("/doc_filters")
def get_doc_filters():
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
provider_attrs = PROVIDER_ATTRS[session["dataset_vars"]["dataset"]]
if session["dataset_vars"]["dataset"] == "crashes/thecrashes":
name_map = session["provider_name_map"]
providers = sorted(
texts_df["provider"]
.apply(lambda prov: name_map.get(prov))
.dropna()
.unique()
.tolist()
)
provider_provinces = sorted(
key.split(":")[1]
for key in provider_attrs
if key.startswith("Province:")
)
provider_content_types = sorted(
key.split(":")[1]
for key in provider_attrs
if key.startswith("ContentType:")
)
provider_medium_types = sorted(
key.split(":")[1]
for key in provider_attrs
if key.startswith("MediumType:")
)
provider_owners = sorted(
key.split(":")[1]
for key in provider_attrs
if key.startswith("MediaOwner:")
)
else:
providers = sorted(texts_df["provider"].dropna().unique().tolist())
provider_provinces = []
provider_content_types = []
provider_medium_types = []
provider_owners = []
return jsonify(
{
"providers": providers,
"provider_provinces": provider_provinces,
"provider_content_types": provider_content_types,
"provider_medium_types": provider_medium_types,
"provider_owners": provider_owners,
}
)
def apply_doc_filters(
doc_filters: List[str],
provider_attrs: dict,
prov_name_map: dict,
texts_df: pd.DataFrame,
):
if not doc_filters:
all_doc_ids = set(int(eid) for eid in texts_df["text_id"].tolist())
return all_doc_ids
filters_attr_values = defaultdict(list)
for doc_filter in doc_filters:
_, attribute, value = doc_filter.split("::")
filters_attr_values[attribute].append(value)
selected_docs = texts_df
for attribute, values in filters_attr_values.items():
attr_conditions = []
for value in values:
if attribute == "days_after":
# if a filter for `days_after` is set, first remove any rows with null values
selected_docs = selected_docs.dropna(subset=["days_after_event"])
if value == "day":
condition = selected_docs["days_after_event"] < 1
elif value == "week":
condition = selected_docs["days_after_event"].isin(range(1, 7))
elif value == "month":
condition = selected_docs["days_after_event"].isin(range(7, 30))
elif value == "year":
condition = selected_docs["days_after_event"].isin(range(30, 365))
else:
condition = selected_docs["days_after_event"] > 365
elif session["dataset_vars"]["dataset"] == "femicides/rai":
if any(attribute.startswith(key) for key in ["politics:", "type", "area"]):
providers = provider_attrs[attribute + ":" + value]
condition = selected_docs["provider"].isin(providers)
else:
condition = selected_docs[attribute] == value
elif session["dataset_vars"]["dataset"] == "femicides/olv":
condition = selected_docs[attribute] == value
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
if attribute == "provider":
condition = selected_docs["provider"].apply(lambda prov: prov_name_map.get(prov)) == value
elif attribute in [
"area",
"country",
"province",
"content_type",
"medium_type",
"owner",
]:
# map to the correct name in the spreadsheet
attribute_altname = {
"area": "RegionalScope",
"country": "Country",
"province": "Province",
"content_type": "ContentType",
"medium_type": "MediumType",
"owner": "MediaOwner",
}[attribute]
providers = provider_attrs[attribute_altname + ":" + value]
condition = selected_docs["provider"].apply(lambda prov: prov_name_map.get(prov)).isin(providers)
else:
condition = selected_docs[attribute] == value
elif session["dataset_vars"]["dataset"] == "migration/pavia":
if attribute.startswith("politics") or attribute.startswith("religion"):
providers = provider_attrs[attribute + ":" + value]
condition = selected_docs["provider"].isin(providers)
else:
condition = selected_docs[attribute] == value
attr_conditions.append(condition)
selected_docs = selected_docs[functools.reduce(np.logical_or, attr_conditions)]
return set(int(eid) for eid in selected_docs["text_id"].tolist())
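# Document filters arrive as "<prefix>::<attribute>::<value>" strings; the prefix
# is discarded by the split above. Hypothetical examples: "doc::days_after::week",
# or "doc::politics:man::left" for the RAI provider attributes.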
def apply_event_filters(ev_filters: List[str], events_df: pd.DataFrame):
if not ev_filters:
all_event_ids = set(int(eid) for eid in events_df["event:id"].tolist())
return all_event_ids
selected_events = events_df
for ev_filter in ev_filters:
print(ev_filter)
_, attribute, value = ev_filter.split("::")
print(attribute)
if session["dataset_vars"]["dataset"] in ["femicides/rai", "femicides/olv"]:
if attribute in ["victim:age"]:
print(value)
if "-" not in value:
# age_from = int(value.split("+")[0])
age_from = int(value)
age_to = 200
else:
age_from = int(value.split("-")[0])
age_to = int(value.split("-")[1])
events_with_age = selected_events[
selected_events[attribute] != "non rilevato"
]
selected_events = events_with_age[
events_with_age[attribute].astype(int).isin(range(age_from, age_to))
]
elif attribute in ["victim:nationality", "attacker:nationality"]:
if value == ALL_FOREIGN_NATIONALITIES:
selected_events = selected_events.dropna(subset=[attribute])
selected_events = selected_events[
~selected_events[attribute].isin(["Italia", "non rilevato", "nessuno", "sconosciuto"])
]
else:
selected_events = selected_events[
selected_events[attribute] == value
]
else:
selected_events = selected_events[selected_events[attribute] == value]
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
if attribute.startswith("imbalanced"):
# at least one pedestrian or cyclist was injured or died
selected_events = selected_events[
(selected_events["outcomes:dead:cyclist"] > 0)
| (selected_events["outcomes:dead:pedestrian"] > 0)
| (selected_events["outcomes:injured:cyclist"] > 0)
| (selected_events["outcomes:injured:pedestrian"] > 0)
]
# no person in a vehicle was injured or died
selected_events = selected_events[
(selected_events["outcomes:injured:vehicle"] == 0)
& (selected_events["outcomes:dead:vehicle"] == 0)
]
# vehicle was involved
selected_events = selected_events[
(selected_events["vehicle_involved"] == 1)
]
if attribute.startswith("outcomes:"):
outcome = attribute.split(":")[1]
person = attribute.split(":")[2]
if outcome == "deadinjured":
if person == "cyclistpedestrian":
if value == "no one":
selected_events = selected_events[
(selected_events["outcomes:dead:cyclist"] == 0)
& (selected_events["outcomes:dead:pedestrian"] == 0)
& (selected_events["outcomes:injured:cyclist"] == 0)
& (selected_events["outcomes:injured:pedestrian"] == 0)
]
else:
selected_events = selected_events[
(selected_events["outcomes:dead:cyclist"] > 0)
| (selected_events["outcomes:dead:pedestrian"] > 0)
| (selected_events["outcomes:injured:cyclist"] > 0)
| (selected_events["outcomes:injured:pedestrian"] > 0)
]
else:
if value == "no one":
selected_events = selected_events[
(selected_events[f"outcomes:dead:{person}"] == 0)
& (selected_events[f"outcomes:injured:{person}"] == 0)
]
else:
selected_events = selected_events[
(selected_events[f"outcomes:dead:{person}"] > 0)
| (selected_events[f"outcomes:injured:{person}"] > 0)
]
else:
if person == "cyclistpedestrian":
if value == "no one":
selected_events = selected_events[
(selected_events[f"outcomes:{outcome}:cyclist"] == 0)
& (
selected_events[f"outcomes:{outcome}:pedestrian"]
== 0
)
]
else:
selected_events = selected_events[
(selected_events[f"outcomes:{outcome}:cyclist"] == 0)
| (
selected_events[f"outcomes:{outcome}:pedestrian"]
> 0
)
]
else:
if value == "no one":
selected_events = selected_events[
selected_events[attribute] == 0
]
else:
selected_events = selected_events[
selected_events[attribute] > 0
]
return set(int(eid) for eid in selected_events["event:id"].tolist())
@app.route("/documents")
def documents():
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
document_list = filter_documents(
session["dataset_vars"],
events_df,
texts_df,
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
session["provider_name_map"],
)
return jsonify(sorted(document_list, key=lambda d: int(d["event_id"])))
def filter_documents(dataset_vars, events_df, texts_df, provider_attrs, name_map):
event_filters = read_filters("event_filters")
doc_filters = read_filters("doc_filters")
selected_events = apply_event_filters(event_filters, events_df)
selected_documents = apply_doc_filters(
doc_filters, provider_attrs, name_map, texts_df
)
document_list = []
blocks = get_tarball_blocks(dataset_vars["dataset"])
for tarball in glob.glob(blocks + "/*.tar"):
with tarfile.open(tarball, "r") as tar_f:
for doc in [f.name for f in tar_f.getmembers() if f.name.endswith(".comm.txt")]:
# extract eventID and docID from file string
re_m = re.search(r"/(\d+)/lome_(\d+).comm.txt", doc)
event_id = re_m.group(1)
doc_id = re_m.group(2)
if (int(event_id) not in selected_events) or (
int(doc_id) not in selected_documents
):
continue
document_list.append({"event_id": event_id, "doc_id": doc_id})
return document_list
def read_filters(arg_name):
filter_str = request.args.get(arg_name)
if filter_str:
filters = filter_str.split("+")
else:
filters = []
return filters
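# Filters are passed as one "+"-separated query argument, e.g. a hypothetical
# ?doc_filters=doc::days_after::week+doc::provider::ANSA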
@app.route("/frame_freq")
def frame_freq():
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
# arg: "model"
lome_model = request.args.get("model")
# arg: "filter"
event_filters = read_filters("event_filters")
doc_filters = read_filters("doc_filters")
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
selected_events = apply_event_filters(event_filters, events_df)
selected_documents = apply_doc_filters(
doc_filters,
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
session["provider_name_map"],
texts_df,
)
# arg: "headlines" (consider only headlines?)
only_headlines = request.args.get("headlines", "n") == "y"
# arg: "frames"
frame_string = request.args.get("frames").strip()
frame_filter: List[str] = frame_string.split("+")
# arg: "constructions"
constr_string = request.args.get("constructions").strip()
constr_filter: List[str] = constr_string.split("+") if constr_string else []
# arg: "group_by_cat"
group_by_cat: bool = request.args.get("group_by_cat") == "y"
# arg: "group_by_tgt"
group_by_tgt: bool = request.args.get("group_by_tgt") == "y"
# arg: "group_by_constr"
group_by_constr: bool = request.args.get("group_by_constr") == "y"
# arg: "group_by_root"
group_by_root: bool = request.args.get("group_by_root") == "y"
# arg: "group_by_role_expression"
group_by_role_expr: int = int(request.args.get("group_by_role_expr"))
# arg: "relative"
relative: bool = request.args.get("relative") == "y"
# arg: "plot_over_days_post"
plot_over_days_post: bool = request.args.get("plot_over_days_post") == "y"
#arg: "plot_by_year"
plot_by_year: bool = request.args.get("plot_by_year") == "y"
assert not (plot_over_days_post and plot_by_year)
# arg: "days_time_window"
days_time_window: int
try:
days_time_window_str = request.args.get("days_time_window")
if days_time_window_str is None:
days_time_window = 10
else:
days_time_window = int(days_time_window_str)
except ValueError:
days_time_window = 10
if plot_over_days_post or plot_by_year:
relevant_frame_counts = defaultdict(lambda: defaultdict(int))
deep_frame_counts = defaultdict(lambda: defaultdict(int))
all_frame_counts = defaultdict(lambda: defaultdict(int))
else:
relevant_frame_counts = defaultdict(int)
deep_frame_counts = defaultdict(int)
all_frame_counts = defaultdict(int)
# total times each frame is found
totals_by_frame = defaultdict(int)
print("Processing documents....")
blocks = get_tarball_blocks(session["dataset_vars"]["dataset"], lome_model)
# allow sharing the syntax cache across different calls of process_prediction_file()
tmp_syntax_cache = {}
for tarball in sorted(glob.glob(blocks + "/*.tar")):
with tarfile.open(tarball, "r") as tar_f:
for mem in sorted(tar_f.getmembers(), key=lambda mem: mem.name):
if mem is None or not mem.name.endswith(".comm.json"):
continue
# extract eventID and docID from file string
re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", mem.name)
event_id = re_m.group(1)
doc_id = re_m.group(2)
# document / event filter
if (int(doc_id) not in selected_documents) or (
int(event_id) not in selected_events
):
continue
if plot_over_days_post:
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
texts_df_dropna = texts_df.dropna(subset=["days_after_event"])
try:
df_filter = texts_df_dropna["text_id"] == int(doc_id)
time_bucket = int(
texts_df_dropna[df_filter].iloc[0]["days_after_event"]
)
except IndexError:
# ignore files with null days_post values
continue
# adjust negative values
time_bucket = max(time_bucket, 0)
# round down to the start of its time window (days_time_window days; default 10)
time_bucket = (time_bucket // days_time_window) * days_time_window
else:
time_bucket = 0
if plot_by_year:
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
df_filter = texts_df["text_id"] == int(doc_id)
if "pubyear" in texts_df.columns:
time_bucket = int(texts_df[df_filter].iloc[0]["pubyear"])
elif "pubdate" in texts_df.columns:
pubdate_str = texts_df[df_filter].iloc[0]["pubdate"]
if pd.isna(pubdate_str):
continue
pub_date = datetime.datetime.strptime(pubdate_str, "%Y-%m-%d %H:%M:%S")
time_bucket = pub_date.year
else:
raise ValueError("Cannot plot by year if no `pubyear` or `pubdate` is specified!")
# continue
mem_obj = io.TextIOWrapper(tar_f.extractfile(mem))
skip_func = SKIP_FUNCTIONS[session["dataset_vars"]["dataset"]]
if skip_func is not None and skip_func(doc_id):
print(f"\tskip_func: skipping file {mem}")
continue
# spacy_model = session["dataset_vars"]["spacy_model"]
(
_,
pred_structures,
syntax_analyses,
role_analyses,
) = process_prediction_file(
filename=mem.name,
dataset_name=session["dataset_vars"]["dataset"],
syntax_cache=SYNTAX_ANALYSIS_CACHE_FILES[session["dataset_vars"]["dataset"]],
tmp_cache=tmp_syntax_cache,
file_obj=mem_obj,
deep_frames_cache=deep_frames_cache
)
for sent_idx, (struct_dict, syntax_dict, roles) in enumerate(zip(
pred_structures, syntax_analyses, role_analyses
)):
if only_headlines and sent_idx > 1:
continue
for struct in struct_dict.values():
frame_key = struct.frame
deep_frame_key = struct.deep_frame
syntax_info = get_syntax_info(struct, syntax_dict)
syntax_constr = syntax_info["syn_construction"]
syntax_cat = syntax_info["syn_category"]
syntax_at_root = is_at_root(syntax_info)
if constr_filter and syntax_constr not in constr_filter:
continue
totals_by_frame[struct.frame] += 1
if group_by_cat:
count_keys = [f"{frame_key}::{syntax_cat}"]
deep_count_keys = [f"{frame_key}::{syntax_cat}"]
elif group_by_tgt:
tgt_str = (
" ".join(struct.target.tokens_str)
.strip("«».,()□�?'\"")
.strip()
.lower()
)
count_keys = [f"{frame_key}::{tgt_str}"]
deep_count_keys = [f"{frame_key}::{tgt_str}"]
elif group_by_constr and group_by_root:
count_keys = [
f"{frame_key}/{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
]
deep_count_keys = [
f"{deep_frame_key}::{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
]
elif group_by_constr:
count_keys = [f"{frame_key}::{syntax_constr}"]
deep_count_keys = [f"{deep_frame_key}::{syntax_constr}"]
elif group_by_root:
count_keys = [
f"{frame_key}::{'root' if syntax_at_root else 'non-root'}"
]
deep_count_keys = [
f"{deep_frame_key}::{'root' if syntax_at_root else 'non-root'}"
]
elif group_by_role_expr:
if group_by_role_expr == 1:
role_exprs = [r for r, _ in struct.roles]
elif group_by_role_expr == 2:
role_exprs = get_role_expressions(
struct, roles, None, 1, False
)
elif group_by_role_expr == 3:
role_exprs = get_role_expressions(
struct, roles, session["frames_to_roles"], 1, False
)
elif group_by_role_expr == 4:
role_exprs = get_role_expressions(
struct, roles, None, None, True
)
else:
raise ValueError(
"Unknown value for param group_by_role_expr"
)
count_keys = []
deep_count_keys = []
for role_expr in role_exprs:
if group_by_role_expr == 4:
role_name, depth = role_expr.split("::")
depth = abs(int(depth))
if depth > 3:
depth = ">3"
role_expr = f"{role_name}::{depth}"
count_keys.append(f"{frame_key}::{role_expr}")
deep_count_keys.append(f"{deep_frame_key}::{role_expr}")
else:
count_keys = [struct.frame]
deep_count_keys = [struct.deep_frame]
for ck, dck in zip(count_keys, deep_count_keys):
if struct.frame in frame_filter:
if plot_over_days_post or plot_by_year:
relevant_frame_counts[time_bucket][ck] += 1
deep_frame_counts[time_bucket][dck] += 1
else:
relevant_frame_counts[ck] += 1
deep_frame_counts[dck] += 1
if plot_over_days_post or plot_by_year:
all_frame_counts[time_bucket][ck] += 1
else:
all_frame_counts[ck] += 1
print("Computing frame counts...")
if plot_over_days_post or plot_by_year:
data_and_names = [
(relevant_frame_counts, "relevant_frame_counts"),
(deep_frame_counts, "deep_frame_counts"),
(all_frame_counts, "all_frame_counts"),
]
data_out = {}
for (data, name) in data_and_names:
traces = defaultdict(lambda: {"x": [], "y": []})
for time_bucket in sorted(data):
total_count = sum(data[time_bucket].values())
for count_keys, count in data[time_bucket].items():
traces[count_keys]["x"].append(time_bucket)
traces[count_keys]["y"].append(
count / total_count if relative else count
)
data_out[name] = traces
return jsonify(data_out)
else:
relevant_frames_sr = pd.Series(data=relevant_frame_counts).sort_values(
ascending=False
)
deep_frames_sr = pd.Series(data=deep_frame_counts).sort_values(ascending=False)
all_frames_sr = pd.Series(data=all_frame_counts).sort_values(ascending=False)
if relative and group_by_role_expr > 0:
print("totals_by_frame=", totals_by_frame)
print("frame_filter=", frame_filter)
denom = totals_by_frame[frame_filter[0]]
print("denom=", denom)
relevant_frames_sr /= denom
deep_frames_sr /= deep_frames_sr.sum() # TODO: what to do with this???
all_frames_sr /= all_frames_sr.sum()
elif relative:
relevant_frames_sr /= relevant_frames_sr.sum()
deep_frames_sr /= deep_frames_sr.sum()
all_frames_sr /= all_frames_sr.sum()
return jsonify(
{
"relevant_frame_counts": {
"x": relevant_frames_sr.index.tolist(),
"y": relevant_frames_sr.values.tolist(),
},
"deep_frame_counts": {
"x": deep_frames_sr.index.tolist(),
"y": deep_frames_sr.values.tolist(),
},
"all_frame_counts": {
"x": all_frames_sr.index.tolist(),
"y": all_frames_sr.values.tolist(),
},
}
)
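# Example call (values hypothetical):
#   GET /frame_freq?model=lome_0shot&frames=Killing+Death&constructions=
#       &group_by_constr=y&relative=y
# Without plot_over_days_post/plot_by_year the response holds three frequency
# series (relevant/deep/all frame counts) as parallel "x"/"y" lists; with a time
# axis it holds one {"x": [...], "y": [...]} trace per count key instead.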
# for demo app
@app.route("/similar_frames")
def similar_frames():
if gensim_m is None:
return jsonify({"result": "FAIL", "reason": "no GENSIM model has been loaded, please call /load_gensim and try again"})
words_in = [w for w in request.args.get("words_in").split("+") if "glove_" + w in gensim_m]
if not words_in:
return jsonify({"result": "FAIL", "reason": "No input words given"})
try:
matches = [res for res in gensim_m.most_similar(positive=["glove_" + w for w in words_in], topn=100) if res[0].startswith("fn_")]
except KeyError:
return jsonify({"result": "FAIL", "reason": "One of the input words does not exist in the GloVe vocabulary"})
frames = [m[0].lstrip("fn_") for m in matches]
probas = [m[1] for m in matches]
return jsonify({
"result": "OK",
"frames": frames,
"probabilities": probas,
})
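# Example call (words hypothetical): GET /similar_frames?words_in=kill+weapon
# looks up the "glove_<word>" vectors in the concatenated GloVe+frame embedding
# space and returns the nearest "fn_<Frame>" neighbours, with their similarity
# scores under "probabilities".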
@app.route("/sociofillmore")
def sociofillmore():
# step 1: LOME analysis
input_text = request.args.get("text", "")
language = request.args.get("language", "en")
sentences = [s.text for s in spacy_model_ud(input_text).sents]
r = requests.get("http://localhost:9090/analyze", {"text": "\n".join(sentences)})
# r = requests.get("http://lome:9090/analyze", {"text": "\n".join(sentences)})
lome_analyses = json.loads(r.text)["analyses"]
# intermediate step: make temporary files for saving input/output analyses
tmp_in = tempfile.NamedTemporaryFile(mode="w+", delete=False)
tmp_in.write(json.dumps(lome_analyses))
tmp_in.close()
tmp_out = tempfile.NamedTemporaryFile(mode="w+", delete=False)
tmp_out.close()
# step 2: SocioFillmore
analyze_external_file(tmp_in.name, tmp_out.name, spacy_model_langs[language])
with open(tmp_out.name, "r") as f_out:
data_out = json.load(f_out)
os.unlink(tmp_in.name)
os.unlink(tmp_out.name)
return jsonify(data_out)
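# Example call (text hypothetical):
#   GET /sociofillmore?text=Something happened yesterday.&language=en
# This route assumes a LOME parser service is reachable at
# http://localhost:9090/analyze; its output is post-processed with
# analyze_external_file() and returned as JSON.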
if __name__ == "__main__":
from waitress import serve
if len(sys.argv) > 1:
host = sys.argv[1]
else:
host = "127.0.0.1"
debug = False
if len(sys.argv) > 2:
if sys.argv[2] == "debug":
debug = True
serve(app, host="0.0.0.0", port="5000")
# app.run(host=host, debug=False, ssl_context="adhoc")