import dataclasses
import io
import json
import os
import sys
import glob
import random
import re
import tarfile
import datetime
from collections import defaultdict
from typing import Dict, List, Optional
import functools

import requests
import tempfile
import lxml.etree as ET
import pandas as pd
import numpy as np
import gensim
import spacy
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import FramenetError
from flask import Flask, request, render_template, jsonify, redirect, abort, session, url_for

from sociofillmore.common.analyze_text import (
    FrameStructure,
    get_syntax_info,
    is_at_root,
    process_prediction_file,
    POSSIBLE_CONSTRUCTIONS,
    SYNTAX_ANALYSIS_CACHE_FILES,
    enrich_texts_df,
    read_frames_of_interest,
    load_deep_frames_cache,
    get_text_meta,
    analyze_single_document,
    get_tarball_blocks,
    analyze_external_file
)
from sociofillmore.crashes.utils import is_a_dutch_text

# # download nltk packages if needed
# if sys.argv[2] != "local":
#     nltk.download("framenet_v17", download_dir="/nltk_data")
#     nltk.download("punkt", download_dir="/nltk_data")
#     print("Done!")

# security (very basic!)
PROTECTED_DATASETS = []  # "femicides/rai"

if os.path.exists("secrets.json"):
    with open("secrets.json", encoding="utf-8") as f:
        secrets = json.load(f)
    AUTH_KEY = secrets["auth_key"]
    PASSWORD = secrets["password"]
    SECRET_KEY = bytes(secrets["flask_secret_key"], "utf-8")
else:
    AUTH_KEY = os.environ.get("AUTH_KEY")
    PASSWORD = os.environ.get("PASSWORD")
    SECRET_KEY = os.environ.get("FLASK_SECRET_KEY")

# global app object
print("Defining app...")
app = Flask(__name__)
app.secret_key = SECRET_KEY


# gensim & spacy models
def load_gensim_model(limit):
    print("Loading GENSIM model... [this can take a few minutes]")
    return gensim.models.word2vec.KeyedVectors.load_word2vec_format(
        "data/embeddings/concat_glove_frames.w2v.txt", limit=limit
    )


gensim_m = None
gensim_m = load_gensim_model(100_000)

print("Loading SpaCy models...")
spacy_model_ud = spacy.load("xx_sent_ud_sm")
spacy_model_langs = {
    "it": spacy.load("it_core_news_md"),
    "nl": spacy.load("nl_core_news_md"),
    "en": spacy.load("en_core_web_md")
}

# frequency information cache
frame_freq_cache = {}

with open("resources/fn_frames_to_roles.json", encoding="utf-8") as f:
    fn_frames_to_roles = json.load(f)

# data processing constants
VICTIM_AGE_GROUPS = ["0-12", "12-18", "18-30", "30-50", "50-70", "70-120"]
ALL_FOREIGN_NATIONALITIES = "estero (tutto)"

deep_frames_cache = load_deep_frames_cache()


def read_rai_provider_attrs():
    df = pd.read_excel("resources/RAI_sources_mr.xlsx")
    return {
        "politics:man:left": df[df["politics_man"] == "L"]["source"].unique().tolist(),
        "politics:tc:left": df[df["politics_tc_cat"] == "L"]["source"].unique().tolist(),
        "politics:agg:left": df[df["politics_agg"] == "L"]["source"].unique().tolist(),
        "politics:man:right": df[df["politics_man"] == "R"]["source"].unique().tolist(),
        "politics:tc:right": df[df["politics_tc_cat"] == "R"]["source"].unique().tolist(),
        "politics:agg:right": df[df["politics_agg"] == "R"]["source"].unique().tolist(),
        "politics:man:neutral": df[df["politics_man"] == "N"]["source"].unique().tolist(),
        "politics:tc:neutral": df[df["politics_tc_cat"] == "N"]["source"].unique().tolist(),
        "politics:agg:neutral": df[df["politics_agg"] == "N"]["source"].unique().tolist(),
        "type:agency": df[df["type"] == "A"]["source"].unique().tolist(),
        "type:outlet": df[df["type"] == "OUTLET"]["source"].unique().tolist(),
        "area:national": df[df["area"] == "nat"]["source"].unique().tolist(),
        "area:regional":
df[df["area"] == "loc"]["source"].unique().tolist(), } def read_migration_provider_attrs(): df = pd.read_csv("data/migration/provider_pol_rel_ratings.csv") return { "politics:sc:left": df[df["political_stance"] == -1]["provider"].unique().tolist(), "politics:sc:right": df[df["political_stance"] == 1]["provider"].unique().tolist(), "politics:sc:neutral": df[df["political_stance"] == 0]["provider"].unique().tolist(), "religion:sc:catholic": df[df["religious"] == True]["provider"].unique().tolist(), "religion:sc:non_catholic": df[df["religious"] == False]["provider"].unique().tolist() } def read_crashes_provider_attrs(): df = pd.read_csv("resources/crashes_sources.csv") # remove empty rows df = df.dropna(subset=["ProviderNameCorr"]) # map to correct names name_map = { row["ProviderName"]: row["ProviderNameCorr"].strip('"') for _, row in df.iterrows() } # merge duplicates df = df.groupby(list(df.columns[2:11]))["ProviderFreq"].apply(sum).reset_index() # "explode" multiple province fields df = df.assign(**{"Province": df["Province"].str.split("|")}).explode("Province") attr_map = { f"{col}:{val}": df[df[col] == val]["ProviderNameCorr"].unique().tolist() for col in df.columns[1:9] for val in set(df[col].values) if val != "-" } return attr_map, name_map PROVIDER_ATTRS = { "femicides/rai": read_rai_provider_attrs(), "femicides/olv": {}, "crashes/thecrashes": read_crashes_provider_attrs()[0], "migration/pavia": read_migration_provider_attrs() } # current active dataset def get_dataset_variables(dataset_name): if dataset_name == "femicides/rai": spacy_model = "it_core_news_md" elif dataset_name == "femicides/olv": spacy_model = "it_core_news_md" elif dataset_name == "crashes/thecrashes": spacy_model = "nl_core_news_md" elif dataset_name == "migration/pavia": spacy_model = "it_core_news_md" else: raise ValueError("Unsupported dataset!") return { "dataset": dataset_name, "frames": read_frames_of_interest(dataset_name), "spacy_model": spacy_model, } # ==== DATA PROCESSING FUNCTIONS ==== # event data def load_event_data(dataset): if dataset == "femicides/rai": event_data_file = "output/femicides/split_data/rai/split_ALL.events.csv" texts_data_file = "output/femicides/split_data/rai/split_ALL.texts.meta.csv" elif dataset == "femicides/olv": event_data_file = "output/femicides/split_data/olv/split_dev10.events.csv" texts_data_file = "output/femicides/split_data/olv/split_dev10.texts.csv" elif dataset == "crashes/thecrashes": event_data_file = "output/crashes/split_data/split_dev10.events.csv" texts_data_file = "output/crashes/split_data/split_dev10.texts.meta.csv" elif dataset == "migration/pavia": event_data_file = "output/migration/split_data/split_dev10.events.csv" texts_data_file = "output/migration/split_data/split_dev10.texts.meta.csv" else: raise ValueError("Unsupported dataset") events = pd.read_csv(event_data_file, dtype={"event:id": int}, index_col=0) texts = enrich_texts_df(pd.read_csv(texts_data_file, index_col=0), events) return {"events_df": events, "texts_df": texts} DATASETS = { "femicides/rai": load_event_data("femicides/rai"), "femicides/olv": load_event_data("femicides/olv"), "crashes/thecrashes": load_event_data("crashes/thecrashes"), "migration/pavia": load_event_data("migration/pavia"), } SKIP_FUNCTIONS = { "femicides/rai": None, "femicides/olv": None, "crashes/thecrashes": lambda doc: not is_a_dutch_text(doc), "migration/pavia": None } def read_frames_to_event_roles(dataset): if dataset == "femicides/rai": ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv") if dataset == 
"femicides/olv": ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv") elif dataset == "crashes/thecrashes": ftr_df = pd.read_csv("resources/crashes_frame_to_roles.csv") else: raise ValueError("Unsupported dataset!") frames_to_event_roles: Dict[str, Dict[str, List[str]]] = {} role_types = [col for col in ftr_df.columns if col.startswith("role:")] for _, row in ftr_df.iterrows(): frame_roles = defaultdict(list) for rt in role_types: role_key = rt.split(":")[1] if row[rt] == "-": frame_roles[role_key] = [] else: for role in row[rt].split("|"): frame_roles[role_key].append(role) frames_to_event_roles[row["frame"]] = frame_roles return frames_to_event_roles def get_role_expressions( struct: FrameStructure, roles_dep_map: Dict[int, Dict[str, str]], frame_to_role_map: Optional[Dict[str, Dict[str, List[str]]]], depth_filter: int, output_depth_only: bool = False, ) -> str: role_exps = [] role_deps = roles_dep_map[struct.target.tokens_idx[0]] def make_exp(_role, _dep, _depth): if output_depth_only: return _role + "::" + str(_depth) else: if _depth > depth_filter: _dep = None return _role + "::" + (_dep or "_UNK_DEP") # no event role mapping: just use roles as-is if frame_to_role_map is None: for role, _ in struct.roles: dep, depth = role_deps.get(role, (None, -1)) role_exps.append(make_exp(role, dep, depth)) elif struct.frame in frame_to_role_map: for role_type, rt_roles in frame_to_role_map[struct.frame].items(): for role in rt_roles: if role in [r[0] for r in struct.roles]: dep, depth = role_deps.get(role, (None, -1)) role_exps.append(make_exp(role, dep, depth)) # else: # exp = role_type + "::_ABSENT" # role_exps.append(exp) return role_exps def get_analyze_frame_samples( construction, frame, dependency, role, lome_model, max_samples_per_doc, samples_to_find, selected_documents, dataset_vars, texts_df, ): dataset = dataset_vars["dataset"] print("# selected documents", len(selected_documents)) samples_found = {} tar_blocks = get_tarball_blocks(dataset, lome_model) syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset] prediction_files = [] print("# tar blocks", len(glob.glob(tar_blocks + "/*.tar"))) for block in glob.glob(tar_blocks + "/*.tar"): with tarfile.open(block, "r") as tar_f: block_prediction_files = [ m.name for m in tar_f.getmembers() if m.name.endswith(".comm.json") ] print("\t# prediction files", len(prediction_files)) matching_prediction_files = [ pf for pf in block_prediction_files if re.search(r"/(\d+)/lome_(\d+).comm.json", pf).group(2) in selected_documents ] print("\t# matching prediction files", len(matching_prediction_files)) print("\t") prediction_files.extend(matching_prediction_files) print(len(prediction_files)) while prediction_files and len(samples_found) < samples_to_find: print( f"\t\tsamples_found: {len(samples_found)}//prediction_files left: {len(prediction_files)}" ) # choose a random prediction file pf = random.choice(prediction_files) prediction_files.remove(pf) print(pf) # filter for selected frame doc_id = os.path.basename(pf).split(".")[0].split("_")[1] doc_key = doc_id[:2] tarball = get_tarball_blocks(dataset, lome_model) + f"/block_{doc_key}.tar" with tarfile.open(tarball, "r") as tar_f: pf_obj = io.TextIOWrapper(tar_f.extractfile(pf)) ( sents, pred_structures, syntax_analyses, role_analyses, ) = process_prediction_file( filename=pf, file_obj=pf_obj, dataset_name=dataset_vars["dataset"], syntax_cache=syntax_cache, deep_frames_cache=deep_frames_cache, ) if syntax_analyses is None: continue ( frame_sents, frame_pred_structures, frame_syntax_analyses, 
                frame_role_mappings,
            ) = ([], [], [], [])
            for s, pred, syn, rol in zip(
                sents, pred_structures, syntax_analyses, role_analyses
            ):
                for fs in pred.values():
                    fs_syn = get_syntax_info(fs, syn)
                    fs_rol = rol[fs.target.tokens_idx[0]]
                    frame_matches = frame == "*" or fs.frame == frame
                    construction_matches = (
                        construction == "*"
                        or fs_syn["syn_construction"] == construction
                    )
                    role_matches = role == "*" or role in [r for r, _ in fs.roles]
                    if role != "*":
                        dependency_matches = dependency == "*" or (
                            role,
                            dependency,
                        ) in [(r, d) for r, (d, _) in fs_rol.items()]
                    else:
                        dependency_matches = dependency == "*" or dependency in [
                            d for d, _ in fs_rol.values()
                        ]
                    if (
                        frame_matches
                        and construction_matches
                        and role_matches
                        and dependency_matches
                    ):
                        frame_sents.append(s)
                        frame_pred_structures.append(pred)
                        frame_syntax_analyses.append(syn)
                        frame_role_mappings.append(rol)

            # select random frames
            if not frame_sents:
                continue

            for _ in range(max_samples_per_doc):
                selected_idx = random.randrange(len(frame_sents))
                if (pf, selected_idx) not in samples_found:
                    sample = (
                        frame_sents[selected_idx],
                        frame_pred_structures[selected_idx],
                        frame_syntax_analyses[selected_idx],
                        frame_role_mappings[selected_idx],
                    )
                    if sample not in samples_found.values():
                        samples_found[(pf, selected_idx)] = sample

    # format output
    output = []
    for (pf, idx), (sent, structs, syntax, roles) in samples_found.items():
        # extract eventID and docID from file string
        re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", pf)
        event_id = re_m.group(1)
        doc_id = re_m.group(2)
        output.append(
            {
                "sentence": sent,
                "fn_structures": [
                    dataclasses.asdict(fs) for fs in structs.values()
                ],
                "syntax": syntax,
                "roles": roles,
                "meta": {
                    "event_id": event_id,
                    "doc_id": doc_id,
                    "text_meta": get_text_meta(doc_id, texts_df),
                },
            }
        )
    return output


# ==== WEB SERVER UTILITY FUNCTIONS ====
def security_check():
    return True
    # if request.cookies.get("auth_key", None) == AUTH_KEY:
    #     return True
    # if request.args.get("auth_key", None) == AUTH_KEY:
    #     return True
    # return False


# ==== APP ROUTE FUNCTIONS ====
@app.before_request
def init_session():
    if session.get("initialized"):
        return
    print("Initializing...")
    _switch_dataset("femicides/olv", True)
    session["initialized"] = True
    return


@app.route("/")
def index():
    return redirect(url_for("demo"))


@app.route("/explore")
def start():
    return render_template("index.html")


@app.route("/demo")
def demo():
    return render_template("demo.html")


@app.route("/check_password", methods=["POST"])
def check_password():
    entered_password = request.form["password"]
    if entered_password == PASSWORD:
        resp = jsonify({"success": True})
        resp.set_cookie("auth_key", AUTH_KEY)
        return resp
    else:
        return jsonify({"success": False})


@app.route("/switch_dataset")
def switch_dataset():
    new_dataset = request.args.get("dataset")
    _switch_dataset(new_dataset)
    return jsonify({"result": "changed_dataset"})


def _switch_dataset(new_dataset, first_time=False):
    print(first_time)
    if not first_time:
        if new_dataset == session["dataset_vars"]["dataset"]:
            return jsonify({"result": "no_change"})
    session["dataset_vars"] = get_dataset_variables(new_dataset)
    if new_dataset == "femicides/rai":
        session["provider_name_map"] = {}
    elif new_dataset == "crashes/thecrashes":
        _, name_map = read_crashes_provider_attrs()
        session["provider_name_map"] = name_map
    elif new_dataset == "migration/pavia":
        session["provider_name_map"] = {}
    else:
        session["provider_name_map"] = {}
    # session["frames_to_roles"] = read_frames_to_event_roles(new_dataset)


@app.route("/analyze")
def analyze():
    event_id = request.args.get("event")
    doc_id = request.args.get("document")
    lome_model = request.args.get("model")
    dataset = session["dataset_vars"]["dataset"]
    if dataset in PROTECTED_DATASETS:
        if not security_check():
            abort(403)
    print(dataset)
    output = analyze_single_document(
        doc_id,
        event_id,
        lome_model,
        dataset,
        DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
        deep_frames_cache=deep_frames_cache
    )
    return jsonify(output)


@app.route("/sample_frame")
def sample_frame():
    dataset = session["dataset_vars"]["dataset"]
    if dataset in PROTECTED_DATASETS and not security_check():
        abort(403)
    frame = request.args.get("frame")
    construction = request.args.get("construction")
    role = request.args.get("role")
    dependency = request.args.get("dependency")
    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
    events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
    filtered_docs = filter_documents(
        session["dataset_vars"],
        events_df,
        texts_df,
        PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
        session["provider_name_map"],
    )
    selected_documents = {doc["doc_id"] for doc in filtered_docs}
    lome_model = request.args.get("model")
    samples_to_find = int(request.args.get("nsamples", 5))
    max_samples_per_doc = int(request.args.get("nperdoc", 10))
    output = get_analyze_frame_samples(
        construction,
        frame,
        dependency,
        role,
        lome_model,
        max_samples_per_doc,
        samples_to_find,
        selected_documents,
        session["dataset_vars"],
        DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
    )
    return jsonify(output)


@app.route("/lus_to_frames")
def get_frames_from_lus():
    lus = request.args.get("lus", "").split("+")
    frames = set()
    for lu in lus:
        frames.update({lu_info.frame.name for lu_info in fn.lus(r"^" + lu + r"\.")})
    print(frames)
    return jsonify({"frames": sorted(frames)})


def format_frame_description(frame_def_xml):
    frame_def_fmt = [frame_def_xml.text] if frame_def_xml.text else []
    for elem in frame_def_xml:
        if elem.tag == "ex":
            break
        elif elem.tag == "fen":
            frame_def_fmt.append("" + elem.text + "")
        elif elem.text:
            frame_def_fmt.append(elem.text)
        if elem.tail:
            frame_def_fmt.append(elem.tail)
    return frame_def_fmt


def get_alt_perspectives(frame_info, frame_name):
    alt_perspectives = []
    result_frames = [
        fr.subFrameName
        for fr in frame_info.frameRelations
        if fr.type.name == "Causative_of" and fr.superFrameName == frame_name
    ]
    if result_frames:
        alt_perspectives.append({"frame": result_frames[0], "type": "result"})
    cause_frames = [
        fr.superFrameName
        for fr in frame_info.frameRelations
        if fr.type.name == "Causative_of" and fr.subFrameName == frame_name
    ]
    if cause_frames:
        alt_perspectives.append({"frame": cause_frames[0], "type": "causer"})
    # user_frames = [fr.subFrameName for fr in frame_info.frameRelations if fr.type.name == "Using" and fr.superFrameName == frame_name]
    # if user_frames:
    #     alt_perspectives.append({"frame": user_frames[0], "type": "user"})
    neutral_frames = [
        fr.superFrameName
        for fr in frame_info.frameRelations
        if fr.type.name == "Perspective_on" and fr.subFrameName == frame_name
    ]
    if neutral_frames:
        flipped_frames = [
            fr.subFrameName
            for fr in fn.frame(neutral_frames[0]).frameRelations
            if fr.type.name == "Perspective_on"
            and fr.superFrameName == neutral_frames[0]
            and fr.subFrameName != frame_name
        ]
        if flipped_frames:
            alt_perspectives.extend(
                [{"frame": ff, "type": "flipped"} for ff in flipped_frames]
            )
    return alt_perspectives


@app.route("/frame_info")
def get_frame_info():
    frame_name = request.args.get("frame").strip()
    try:
        print(repr(frame_name))
        print(type(frame_name))
        try:
            frame_info = fn.frame(frame_name)
        except KeyError:
            # sometimes NLTK (randomly??) throws a key error, in which case, just try again
            frame_info = fn.frame(frame_name)

        try:
            # try extracting just the first sentence
            definition_first_sent = nltk.sent_tokenize(frame_info.definitionMarkup)[0] + ""
            frame_def_xml = ET.fromstring(definition_first_sent)
        except ET.XMLSyntaxError:
            # otherwise, use the full definition
            frame_def_xml = ET.fromstring(frame_info.definitionMarkup)

        frame_def_fmt = format_frame_description(frame_def_xml)

        exemplars = [
            {
                "text": exemplar.text,
                "target_lu": lu_name,
                "target_idx": list(exemplar["Target"][0]),
                "core_fes": {
                    role: exemplar.text[start_idx:end_idx]
                    for role, start_idx, end_idx in exemplar.FE[0]
                    if role in [
                        fe
                        for fe, fe_info in frame_info.FE.items()
                        if fe_info.coreType == "Core"
                    ]
                }
            }
            for lu_name, lu_info in frame_info["lexUnit"].items()
            for exemplar in lu_info.exemplars
        ]

        # try to find a "typical" exemplar --- typical -> as short as possible, as many FEs as possible
        exa_typicality_scores = [
            (exa, len(exa["text"]) - 25 * len(exa["core_fes"])) for exa in exemplars
        ]
        if exa_typicality_scores:
            typical_exemplar = min(exa_typicality_scores, key=lambda t: t[1])[0]
        else:
            typical_exemplar = None

        alt_perspectives = get_alt_perspectives(frame_info, frame_name)

        return jsonify({
            "result": "OK",
            "frameDefinition": frame_def_fmt,
            "exemplars": exemplars,
            "altPerspectives": alt_perspectives,
            "typicalExemplar": {
                "text": typical_exemplar["text"],
                "coreFrameElements": typical_exemplar["core_fes"]
            } if typical_exemplar else None
        })
    except FramenetError:
        return jsonify({"result": "FAIL", "info": "could not find frame"})


@app.route("/frames")
def get_frames():
    return jsonify(session["dataset_vars"]["frames"])


@app.route("/constructions")
def get_constructions():
    return jsonify(POSSIBLE_CONSTRUCTIONS)


@app.route("/event_filters")
def get_event_filters():
    dataset = session["dataset_vars"]["dataset"]
    if dataset in PROTECTED_DATASETS and not security_check():
        abort(403)
    if session["dataset_vars"]["dataset"] == "femicides/rai":
        events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
        event_categories = events_df["event:category"].unique().tolist()
        regions = sorted(events_df["event:region"].unique().tolist())
        sem_location = sorted(events_df["event:semantic_location"].unique().tolist())
        victim_age = VICTIM_AGE_GROUPS
        victim_nationality = [
            "Italia",
            ALL_FOREIGN_NATIONALITIES,
            "non rilevato",
        ] + sorted(
            [
                i
                for i in events_df["victim:nationality"].dropna().unique().tolist()
                if i not in ["Italia", "non rilevato"]
            ]
        )
        attacker_nationality = [
            "Italia",
            ALL_FOREIGN_NATIONALITIES,
            "non rilevato",
        ] + sorted(
            [
                i
                for i in events_df["attacker:nationality"].dropna().unique().tolist()
                if i not in ["Italia", "non rilevato"]
            ]
        )
        victim_occupation = sorted(
            [i for i in events_df["victim:occupation"].dropna().unique().tolist()]
        )
        attacker_occupation = sorted(
            [i for i in events_df["attacker:occupation"].dropna().unique().tolist()]
        )
        return jsonify(
            {
                "event_categories": event_categories,
                "regions": regions,
                "sem_location": sem_location,
                "victim_age": victim_age,
                "victim_nationality": victim_nationality,
                "attacker_nationality": attacker_nationality,
                "victim_occupation": victim_occupation,
                "attacker_occupation": attacker_occupation
            }
        )
    elif session["dataset_vars"]["dataset"] == "femicides/olv":
        events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
        regions = sorted([str(r) for r in events_df["event:region"].unique().tolist()])
        victim_age = VICTIM_AGE_GROUPS
        return jsonify(
            {
                "regions": regions,
                "victim_age":
                    victim_age,
            }
        )
    elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
        events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
        filters = {
            "outcomes": ["no one", "one or more people"],
            "imbalanced": ["yes", "no"],
        }
        return jsonify(filters)
    else:
        return jsonify({})


@app.route("/dep_labels")
def get_dep_labels():
    dep_labels = set()
    with open("resources/dep_labels.txt", encoding="utf-8") as f:
        for line in f:
            dep_labels.add(line.strip())
    return jsonify(sorted(dep_labels))


@app.route("/role_labels")
def get_role_labels():
    frame = request.args.get("frame")
    roles = fn_frames_to_roles.get(frame)
    if roles is not None:
        return jsonify(roles)
    else:
        return jsonify([])


@app.route("/doc_filters")
def get_doc_filters():
    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
    provider_attrs = PROVIDER_ATTRS[session["dataset_vars"]["dataset"]]
    if session["dataset_vars"]["dataset"] == "crashes/thecrashes":
        name_map = session["provider_name_map"]
        providers = sorted(
            texts_df["provider"]
            .apply(lambda prov: name_map.get(prov))
            .dropna()
            .unique()
            .tolist()
        )
        provider_provinces = sorted(
            key.split(":")[1] for key in provider_attrs if key.startswith("Province:")
        )
        provider_content_types = sorted(
            key.split(":")[1] for key in provider_attrs if key.startswith("ContentType:")
        )
        provider_medium_types = sorted(
            key.split(":")[1] for key in provider_attrs if key.startswith("MediumType:")
        )
        provider_owners = sorted(
            key.split(":")[1] for key in provider_attrs if key.startswith("MediaOwner:")
        )
    else:
        providers = sorted(texts_df["provider"].dropna().unique().tolist())
        provider_provinces = []
        provider_content_types = []
        provider_medium_types = []
        provider_owners = []
    return jsonify(
        {
            "providers": providers,
            "provider_provinces": provider_provinces,
            "provider_content_types": provider_content_types,
            "provider_medium_types": provider_medium_types,
            "provider_owners": provider_owners,
        }
    )


def apply_doc_filters(
    doc_filters: List[str],
    provider_attrs: dict,
    prov_name_map: dict,
    texts_df: pd.DataFrame,
):
    if not doc_filters:
        all_doc_ids = set(int(eid) for eid in texts_df["text_id"].tolist())
        return all_doc_ids

    filters_attr_values = defaultdict(list)
    for doc_filter in doc_filters:
        _, attribute, value = doc_filter.split("::")
        filters_attr_values[attribute].append(value)

    selected_docs = texts_df
    for attribute, values in filters_attr_values.items():
        attr_conditions = []
        for value in values:
            if attribute == "days_after":
                # if a filter for `days_after` is set, first remove any rows with null values
                selected_docs = selected_docs.dropna(subset=["days_after_event"])
                if value == "day":
                    condition = selected_docs["days_after_event"] < 1
                elif value == "week":
                    condition = selected_docs["days_after_event"].isin(range(1, 7))
                elif value == "month":
                    condition = selected_docs["days_after_event"].isin(range(7, 30))
                elif value == "year":
                    condition = selected_docs["days_after_event"].isin(range(30, 365))
                else:
                    condition = selected_docs["days_after_event"] > 365
            elif session["dataset_vars"]["dataset"] == "femicides/rai":
                if any(attribute.startswith(key) for key in ["politics:", "type", "area"]):
                    providers = provider_attrs[attribute + ":" + value]
                    condition = selected_docs["provider"].isin(providers)
                else:
                    condition = selected_docs[attribute] == value
            elif session["dataset_vars"]["dataset"] == "femicides/olv":
                condition = selected_docs[attribute] == value
            elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
                if attribute == "provider":
                    condition = selected_docs["provider"].apply(lambda prov:
                        prov_name_map.get(prov)) == value
                elif attribute in [
                    "area",
                    "country",
                    "province",
                    "content_type",
                    "medium_type",
                    "owner",
                ]:
                    # map to the correct name in the spreadsheet
                    attribute_altname = {
                        "area": "RegionalScope",
                        "country": "Country",
                        "province": "Province",
                        "content_type": "ContentType",
                        "medium_type": "MediumType",
                        "owner": "MediaOwner",
                    }[attribute]
                    providers = provider_attrs[attribute_altname + ":" + value]
                    condition = selected_docs["provider"].apply(
                        lambda prov: prov_name_map.get(prov)
                    ).isin(providers)
                else:
                    condition = selected_docs[attribute] == value
            elif session["dataset_vars"]["dataset"] == "migration/pavia":
                if attribute.startswith("politics") or attribute.startswith("religion"):
                    providers = provider_attrs[attribute + ":" + value]
                    condition = selected_docs["provider"].isin(providers)
                else:
                    condition = selected_docs[attribute] == value
            attr_conditions.append(condition)
        selected_docs = selected_docs[functools.reduce(np.logical_or, attr_conditions)]
    return set(int(eid) for eid in selected_docs["text_id"].tolist())


def apply_event_filters(ev_filters: List[str], events_df: pd.DataFrame):
    if not ev_filters:
        all_event_ids = set(int(eid) for eid in events_df["event:id"].tolist())
        return all_event_ids

    selected_events = events_df
    for ev_filter in ev_filters:
        print(ev_filter)
        _, attribute, value = ev_filter.split("::")
        print(attribute)
        if session["dataset_vars"]["dataset"] in ["femicides/rai", "femicides/olv"]:
            if attribute in ["victim:age"]:
                print(value)
                if "-" not in value:
                    # age_from = int(value.split("+")[0])
                    age_from = int(value)
                    age_to = 200
                else:
                    age_from = int(value.split("-")[0])
                    age_to = int(value.split("-")[1])
                events_with_age = selected_events[
                    selected_events[attribute] != "non rilevato"
                ]
                selected_events = events_with_age[
                    events_with_age[attribute].astype(int).isin(range(age_from, age_to))
                ]
            elif attribute in ["victim:nationality", "attacker:nationality"]:
                if value == ALL_FOREIGN_NATIONALITIES:
                    selected_events = selected_events.dropna(subset=[attribute])
                    selected_events = selected_events[
                        ~selected_events[attribute].isin(
                            ["Italia", "non rilevato", "nessuno", "sconosciuto"]
                        )
                    ]
                else:
                    selected_events = selected_events[
                        selected_events[attribute] == value
                    ]
            else:
                selected_events = selected_events[selected_events[attribute] == value]
        elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
            if attribute.startswith("imbalanced"):
                # at least one pedestrian or cyclist was injured or died
                selected_events = selected_events[
                    (selected_events["outcomes:dead:cyclist"] > 0)
                    | (selected_events["outcomes:dead:pedestrian"] > 0)
                    | (selected_events["outcomes:injured:cyclist"] > 0)
                    | (selected_events["outcomes:injured:pedestrian"] > 0)
                ]
                # no person in a vehicle was injured or died
                selected_events = selected_events[
                    (selected_events["outcomes:injured:vehicle"] == 0)
                    & (selected_events["outcomes:dead:vehicle"] == 0)
                ]
                # vehicle was involved
                selected_events = selected_events[
                    (selected_events["vehicle_involved"] == 1)
                ]
            if attribute.startswith("outcomes:"):
                outcome = attribute.split(":")[1]
                person = attribute.split(":")[2]
                if outcome == "deadinjured":
                    if person == "cyclistpedestrian":
                        if value == "no one":
                            selected_events = selected_events[
                                (selected_events["outcomes:dead:cyclist"] == 0)
                                & (selected_events["outcomes:dead:pedestrian"] == 0)
                                & (selected_events["outcomes:injured:cyclist"] == 0)
                                & (selected_events["outcomes:injured:pedestrian"] == 0)
                            ]
                        else:
                            selected_events = selected_events[
                                (selected_events["outcomes:dead:cyclist"] > 0)
                                | (selected_events["outcomes:dead:pedestrian"] >
                                   0)
                                | (selected_events["outcomes:injured:cyclist"] > 0)
                                | (selected_events["outcomes:injured:pedestrian"] > 0)
                            ]
                    else:
                        if value == "no one":
                            selected_events = selected_events[
                                (selected_events[f"outcomes:dead:{person}"] == 0)
                                & (selected_events[f"outcomes:injured:{person}"] == 0)
                            ]
                        else:
                            selected_events = selected_events[
                                (selected_events[f"outcomes:dead:{person}"] > 0)
                                | (selected_events[f"outcomes:injured:{person}"] > 0)
                            ]
                else:
                    if person == "cyclistpedestrian":
                        if value == "no one":
                            selected_events = selected_events[
                                (selected_events[f"outcomes:{outcome}:cyclist"] == 0)
                                & (selected_events[f"outcomes:{outcome}:pedestrian"] == 0)
                            ]
                        else:
                            selected_events = selected_events[
                                (selected_events[f"outcomes:{outcome}:cyclist"] > 0)
                                | (selected_events[f"outcomes:{outcome}:pedestrian"] > 0)
                            ]
                    else:
                        if value == "no one":
                            selected_events = selected_events[
                                selected_events[attribute] == 0
                            ]
                        else:
                            selected_events = selected_events[
                                selected_events[attribute] > 0
                            ]
    return set(int(eid) for eid in selected_events["event:id"].tolist())


@app.route("/documents")
def documents():
    events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
    document_list = filter_documents(
        session["dataset_vars"],
        events_df,
        texts_df,
        PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
        session["provider_name_map"],
    )
    return jsonify(sorted(document_list, key=lambda d: int(d["event_id"])))


def filter_documents(dataset_vars, events_df, texts_df, provider_attrs, name_map):
    event_filters = read_filters("event_filters")
    doc_filters = read_filters("doc_filters")
    selected_events = apply_event_filters(event_filters, events_df)
    selected_documents = apply_doc_filters(
        doc_filters, provider_attrs, name_map, texts_df
    )
    document_list = []
    blocks = get_tarball_blocks(dataset_vars["dataset"])
    for tarball in glob.glob(blocks + "/*.tar"):
        with tarfile.open(tarball, "r") as tar_f:
            for doc in [f.name for f in tar_f.getmembers() if f.name.endswith(".comm.txt")]:
                # extract eventID and docID from file string
                re_m = re.search(r"/(\d+)/lome_(\d+).comm.txt", doc)
                event_id = re_m.group(1)
                doc_id = re_m.group(2)
                if (int(event_id) not in selected_events) or (
                    int(doc_id) not in selected_documents
                ):
                    continue
                document_list.append({"event_id": event_id, "doc_id": doc_id})
    return document_list


def read_filters(arg_name):
    filter_str = request.args.get(arg_name)
    if filter_str:
        filters = filter_str.split("+")
    else:
        filters = []
    return filters


@app.route("/frame_freq")
def frame_freq():
    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]

    # arg: "model"
    lome_model = request.args.get("model")

    # arg: "filter"
    event_filters = read_filters("event_filters")
    doc_filters = read_filters("doc_filters")
    events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
    selected_events = apply_event_filters(event_filters, events_df)
    selected_documents = apply_doc_filters(
        doc_filters,
        PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
        session["provider_name_map"],
        texts_df,
    )
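
    # Example request, as a rough sketch of how the query parameters below fit
    # together (the model and frame names here are hypothetical; actual values
    # depend on the dataset resources and LOME prediction runs on disk):
    #   /frame_freq?model=lome_0shot&frames=Killing+Death&constructions=
    #       &group_by_role_expr=0&relative=y&plot_over_days_post=n&plot_by_year=n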

    # arg: "headlines" (consider only headlines?)
    only_headlines = request.args.get("headlines", "n") == "y"

    # arg: "frames"
    frame_string = request.args.get("frames").strip()
    frame_filter: List[str] = frame_string.split("+")

    # arg: "constructions"
    constr_string = request.args.get("constructions").strip()
    constr_filter: List[str] = constr_string.split("+") if constr_string else []

    # arg: "group_by_cat"
    group_by_cat: bool = request.args.get("group_by_cat") == "y"

    # arg: "group_by_tgt"
    group_by_tgt: bool = request.args.get("group_by_tgt") == "y"

    # arg: "group_by_constr"
    group_by_constr: bool = request.args.get("group_by_constr") == "y"

    # arg: "group_by_root"
    group_by_root: bool = request.args.get("group_by_root") == "y"

    # arg: "group_by_role_expression"
    group_by_role_expr: int = int(request.args.get("group_by_role_expr"))

    # arg: "relative"
    relative: bool = request.args.get("relative") == "y"

    # arg: "plot_over_days_post"
    plot_over_days_post: bool = request.args.get("plot_over_days_post") == "y"

    # arg: "plot_by_year"
    plot_by_year: bool = request.args.get("plot_by_year") == "y"
    assert not (plot_over_days_post and plot_by_year)

    # arg: "days_time_window"
    days_time_window: int
    try:
        days_time_window_str = request.args.get("days_time_window")
        if days_time_window_str is None:
            days_time_window = 10
        else:
            days_time_window = int(days_time_window_str)
    except ValueError:
        days_time_window = 10

    if plot_over_days_post or plot_by_year:
        relevant_frame_counts = defaultdict(lambda: defaultdict(int))
        deep_frame_counts = defaultdict(lambda: defaultdict(int))
        all_frame_counts = defaultdict(lambda: defaultdict(int))
    else:
        relevant_frame_counts = defaultdict(int)
        deep_frame_counts = defaultdict(int)
        all_frame_counts = defaultdict(int)

    # total times each frame is found
    totals_by_frame = defaultdict(int)

    print("Processing documents....")
    blocks = get_tarball_blocks(session["dataset_vars"]["dataset"], lome_model)

    # allow sharing the syntax cache across different calls of process_prediction_file()
    tmp_syntax_cache = {}

    for tarball in sorted(glob.glob(blocks + "/*.tar")):
        with tarfile.open(tarball, "r") as tar_f:
            for mem in sorted(tar_f.getmembers(), key=lambda mem: mem.name):
                if mem is None or not mem.name.endswith(".comm.json"):
                    continue

                # extract eventID and docID from file string
                re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", mem.name)
                event_id = re_m.group(1)
                doc_id = re_m.group(2)

                # document / event filter
                if (int(doc_id) not in selected_documents) or (
                    int(event_id) not in selected_events
                ):
                    continue

                if plot_over_days_post:
                    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
                    texts_df_dropna = texts_df.dropna(subset=["days_after_event"])
                    try:
                        df_filter = texts_df_dropna["text_id"] == int(doc_id)
                        time_bucket = int(
                            texts_df_dropna[df_filter].iloc[0]["days_after_event"]
                        )
                    except IndexError:
                        # ignore files with null days_post values
                        continue

                    # adjust negative values
                    time_bucket = max(time_bucket, 0)

                    # round to 10-day periods
                    time_bucket = (time_bucket // days_time_window) * days_time_window
                else:
                    time_bucket = 0

                if plot_by_year:
                    texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
                    df_filter = texts_df["text_id"] == int(doc_id)
                    if "pubyear" in texts_df.columns:
                        time_bucket = int(texts_df[df_filter].iloc[0]["pubyear"])
                    elif "pubdate" in texts_df.columns:
                        pubdate_str = texts_df[df_filter].iloc[0]["pubdate"]
                        if pd.isna(pubdate_str):
                            continue
                        pub_date = datetime.datetime.strptime(pubdate_str, "%Y-%m-%d %H:%M:%S")
                        time_bucket = pub_date.year
                    else:
                        raise ValueError("Cannot plot by year if no `pubyear` or `pubdate` is specified!")
                        # continue

                mem_obj = io.TextIOWrapper(tar_f.extractfile(mem))
                skip_func = SKIP_FUNCTIONS[session["dataset_vars"]["dataset"]]
                if skip_func is not None and skip_func(doc_id):
                    print(f"\tskip_func: skipping file {mem}")
                    continue

                # spacy_model = session["dataset_vars"]["spacy_model"]
                (
                    _,
                    pred_structures,
                    syntax_analyses,
                    role_analyses,
                ) = process_prediction_file(
                    filename=mem.name,
                    dataset_name=session["dataset_vars"]["dataset"],
                    syntax_cache=SYNTAX_ANALYSIS_CACHE_FILES[session["dataset_vars"]["dataset"]],
                    tmp_cache=tmp_syntax_cache,
                    file_obj=mem_obj,
                    deep_frames_cache=deep_frames_cache
                )

                for sent_idx, (struct_dict, syntax_dict, roles) in enumerate(zip(
                    pred_structures, syntax_analyses, role_analyses
                )):
                    if only_headlines and sent_idx > 1:
                        continue
                    for struct in struct_dict.values():
                        frame_key = struct.frame
                        deep_frame_key = struct.deep_frame
                        syntax_info = get_syntax_info(struct, syntax_dict)
                        syntax_constr = syntax_info["syn_construction"]
                        syntax_cat = syntax_info["syn_category"]
                        syntax_at_root = is_at_root(syntax_info)

                        if constr_filter and syntax_constr not in constr_filter:
                            continue

                        totals_by_frame[struct.frame] += 1

                        if group_by_cat:
                            count_keys = [f"{frame_key}::{syntax_cat}"]
                            deep_count_keys = [f"{frame_key}::{syntax_cat}"]
                        elif group_by_tgt:
                            tgt_str = (
                                " ".join(struct.target.tokens_str)
                                .strip("«».,()□�?'\"")
                                .strip()
                                .lower()
                            )
                            count_keys = [f"{frame_key}::{tgt_str}"]
                            deep_count_keys = [f"{frame_key}::{tgt_str}"]
                        elif group_by_constr and group_by_root:
                            count_keys = [
                                f"{frame_key}/{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
                            ]
                            deep_count_keys = [
                                f"{deep_frame_key}::{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
                            ]
                        elif group_by_constr:
                            count_keys = [f"{frame_key}::{syntax_constr}"]
                            deep_count_keys = [f"{deep_frame_key}::{syntax_constr}"]
                        elif group_by_root:
                            count_keys = [
                                f"{frame_key}::{'root' if syntax_at_root else 'non-root'}"
                            ]
                            deep_count_keys = [
                                f"{deep_frame_key}::{'root' if syntax_at_root else 'non-root'}"
                            ]
                        elif group_by_role_expr:
                            if group_by_role_expr == 1:
                                role_exprs = [r for r, _ in struct.roles]
                            elif group_by_role_expr == 2:
                                role_exprs = get_role_expressions(
                                    struct, roles, None, 1, False
                                )
                            elif group_by_role_expr == 3:
                                role_exprs = get_role_expressions(
                                    struct, roles, session["frames_to_roles"], 1, False
                                )
                            elif group_by_role_expr == 4:
                                role_exprs = get_role_expressions(
                                    struct, roles, None, None, True
                                )
                            else:
                                raise ValueError(
                                    "Unknown value for param group_by_role_expr"
                                )
                            count_keys = []
                            deep_count_keys = []
                            for role_expr in role_exprs:
                                if group_by_role_expr == 4:
                                    role_name, depth = role_expr.split("::")
                                    depth = abs(int(depth))
                                    if depth > 3:
                                        depth = ">3"
                                    role_expr = f"{role_name}::{depth}"
                                count_keys.append(f"{frame_key}::{role_expr}")
                                deep_count_keys.append(f"{deep_frame_key}::{role_expr}")
                        else:
                            count_keys = [struct.frame]
                            deep_count_keys = [struct.deep_frame]

                        for ck, dck in zip(count_keys, deep_count_keys):
                            if struct.frame in frame_filter:
                                if plot_over_days_post or plot_by_year:
                                    relevant_frame_counts[time_bucket][ck] += 1
                                    deep_frame_counts[time_bucket][dck] += 1
                                else:
                                    relevant_frame_counts[ck] += 1
                                    deep_frame_counts[dck] += 1
                            if plot_over_days_post or plot_by_year:
                                all_frame_counts[time_bucket][ck] += 1
                            else:
                                all_frame_counts[ck] += 1

    print("Computing frame counts...")
    if plot_over_days_post or plot_by_year:
        data_and_names = [
            (relevant_frame_counts, "relevant_frame_counts"),
            (deep_frame_counts, "deep_frame_counts"),
            (all_frame_counts, "all_frame_counts"),
        ]
        data_out = {}
        for (data, name) in data_and_names:
            traces = defaultdict(lambda: {"x": [], "y": []})
            for time_bucket in sorted(data):
                total_count = sum(data[time_bucket].values())
                for count_keys, count in data[time_bucket].items():
                    traces[count_keys]["x"].append(time_bucket)
                    traces[count_keys]["y"].append(
                        count / total_count if relative else count
                    )
            data_out[name] = traces
        return jsonify(data_out)
    else:
        relevant_frames_sr = pd.Series(data=relevant_frame_counts).sort_values(
            ascending=False
        )
        deep_frames_sr = pd.Series(data=deep_frame_counts).sort_values(ascending=False)
        all_frames_sr = pd.Series(data=all_frame_counts).sort_values(ascending=False)
        if relative and group_by_role_expr > 0:
            print("totals_by_frame=", totals_by_frame)
            print("frame_filter=", frame_filter)
            denom = totals_by_frame[frame_filter[0]]
            print("denom=", denom)
            relevant_frames_sr /= denom
            deep_frames_sr /= deep_frames_sr.sum()  # TODO: what to do with this???
            all_frames_sr /= all_frames_sr.sum()
        elif relative:
            relevant_frames_sr /= relevant_frames_sr.sum()
            deep_frames_sr /= deep_frames_sr.sum()
            all_frames_sr /= all_frames_sr.sum()
        return jsonify(
            {
                "relevant_frame_counts": {
                    "x": relevant_frames_sr.index.tolist(),
                    "y": relevant_frames_sr.values.tolist(),
                },
                "deep_frame_counts": {
                    "x": deep_frames_sr.index.tolist(),
                    "y": deep_frames_sr.values.tolist(),
                },
                "all_frame_counts": {
                    "x": all_frames_sr.index.tolist(),
                    "y": all_frames_sr.values.tolist(),
                },
            }
        )


# for demo app
@app.route("/similar_frames")
def similar_frames():
    if gensim_m is None:
        return jsonify({
            "result": "FAIL",
            "reason": "no GENSIM model has been loaded, please call /load_gensim and try again"
        })
    words_in = [
        w for w in request.args.get("words_in").split("+") if "glove_" + w in gensim_m
    ]
    if not words_in:
        return jsonify({"result": "FAIL", "reason": "No input words given"})
    try:
        matches = [
            res
            for res in gensim_m.most_similar(
                positive=["glove_" + w for w in words_in], topn=100
            )
            if res[0].startswith("fn_")
        ]
    except KeyError:
        return jsonify({
            "result": "FAIL",
            "reason": "One of the input words does not exist in the GloVe vocabulary"
        })
    frames = [m[0].lstrip("fn_") for m in matches]
    probas = [m[1] for m in matches]
    return jsonify({
        "result": "OK",
        "frames": frames,
        "probabilities": probas,
    })


@app.route("/sociofillmore")
def sociofillmore():
    # step 1: LOME analysis
    input_text = request.args.get("text", "")
    language = request.args.get("language", "en")
    sentences = [s.text for s in spacy_model_ud(input_text).sents]
    r = requests.get("http://localhost:9090/analyze", {"text": "\n".join(sentences)})
    # r = requests.get("http://lome:9090/analyze", {"text": "\n".join(sentences)})
    lome_analyses = json.loads(r.text)["analyses"]

    # intermediate step: make temporary files for saving input/output analyses
    tmp_in = tempfile.NamedTemporaryFile(mode="w+", delete=False)
    tmp_in.write(json.dumps(lome_analyses))
    tmp_in.close()
    tmp_out = tempfile.NamedTemporaryFile(mode="w+", delete=False)
    tmp_out.close()

    # step 2: SocioFillmore
    analyze_external_file(tmp_in.name, tmp_out.name, spacy_model_langs[language])
    with open(tmp_out.name, "r") as f_out:
        data_out = json.load(f_out)
    os.unlink(tmp_in.name)
    os.unlink(tmp_out.name)
    return jsonify(data_out)


if __name__ == "__main__":
    from waitress import serve

    if len(sys.argv) > 1:
        host = sys.argv[1]
    else:
        host = "127.0.0.1"

    debug = False
    if len(sys.argv) > 2:
        if sys.argv[2] == "debug":
            debug = True

    serve(app, host="0.0.0.0", port="5000")
    # app.run(host=host, debug=False, ssl_context="adhoc")
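
# Example invocations, as a rough sketch (the module filename is an assumption;
# adjust to your checkout and deployment):
#
#   python app.py 0.0.0.0 local
#       -> serves the app on port 5000 via waitress; note that the host argument is
#          parsed above but the serve() call is hard-coded to 0.0.0.0
#
#   curl "http://localhost:5000/sociofillmore?text=A+cyclist+was+hit+by+a+car&language=en"
#       -> runs the LOME + SocioFillmore pipeline; requires the LOME service to be
#          reachable on localhost:9090
#
#   curl "http://localhost:5000/similar_frames?words_in=kill+murder"
#       -> returns FrameNet frames whose embeddings are closest to the input words
#          in the concatenated GloVe/frame embedding space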