Spaces: Build error
gossminn committed · Commit 6680682
Parent(s):
First version
- .gitattributes +2 -0
- .gitignore +210 -0
- deploy.py +3 -0
- fillmorle/app.py +524 -0
- model.mod.tar.gz +3 -0
- requirements.txt +19 -0
- setup.py +9 -0
- sftp/__init__.py +10 -0
- sftp/data_reader/__init__.py +6 -0
- sftp/data_reader/batch_sampler/__init__.py +1 -0
- sftp/data_reader/batch_sampler/mix_sampler.py +50 -0
- sftp/data_reader/better_reader.py +286 -0
- sftp/data_reader/concrete_reader.py +44 -0
- sftp/data_reader/concrete_srl.py +169 -0
- sftp/data_reader/span_reader.py +197 -0
- sftp/data_reader/srl_reader.py +107 -0
- sftp/metrics/__init__.py +4 -0
- sftp/metrics/base_f.py +27 -0
- sftp/metrics/exact_match.py +29 -0
- sftp/metrics/fbeta_mix_measure.py +34 -0
- sftp/metrics/srl_metrics.py +138 -0
- sftp/models/__init__.py +1 -0
- sftp/models/span_model.py +362 -0
- sftp/modules/__init__.py +4 -0
- sftp/modules/smooth_crf.py +77 -0
- sftp/modules/span_extractor/__init__.py +1 -0
- sftp/modules/span_extractor/combo.py +36 -0
- sftp/modules/span_finder/__init__.py +2 -0
- sftp/modules/span_finder/bio_span_finder.py +216 -0
- sftp/modules/span_finder/span_finder.py +87 -0
- sftp/modules/span_typing/__init__.py +2 -0
- sftp/modules/span_typing/mlp_span_typing.py +99 -0
- sftp/modules/span_typing/span_typing.py +64 -0
- sftp/predictor/__init__.py +1 -0
- sftp/predictor/span_predictor.orig.py +362 -0
- sftp/predictor/span_predictor.py +401 -0
- sftp/training/__init__.py +0 -0
- sftp/training/transformer_optimizer.py +121 -0
- sftp/utils/__init__.py +7 -0
- sftp/utils/bio_smoothing.py +62 -0
- sftp/utils/common.py +3 -0
- sftp/utils/db_storage.py +87 -0
- sftp/utils/functions.py +75 -0
- sftp/utils/label_smoothing.py +48 -0
- sftp/utils/span.py +420 -0
- sftp/utils/span_utils.py +57 -0
- sociolome/combine_models.py +130 -0
- sociolome/evalita_eval.py +319 -0
- sociolome/lome_wrapper.py +83 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+spanfinder/model.mod.tar.gz filter=lfs diff=lfs merge=lfs -text
+model.mod.tar.gz filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,210 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/
+data
+cache
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### JupyterNotebooks template
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
deploy.py
ADDED
@@ -0,0 +1,3 @@
+import runpy
+
+runpy.run_module("fillmorle.app", run_name="__main__", alter_sys=True)
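A note on deploy.py: `run_name="__main__"` makes the `if __name__ == "__main__":` guard at the bottom of fillmorle/app.py fire, and `alter_sys=True` adjusts `sys.argv` and `sys.modules` the way `python -m` would. A minimal sketch of the same pattern with a placeholder module name ("mypkg.cli" is hypothetical):

import runpy

# Executes mypkg/cli.py as if invoked via `python -m mypkg.cli`;
# the target module's `__main__` guard runs ("mypkg.cli" is a placeholder).
runpy.run_module("mypkg.cli", run_name="__main__", alter_sys=True)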
fillmorle/app.py
ADDED
@@ -0,0 +1,524 @@
+from itertools import product
+import random
+# from turtle import hideturtle  # unused IDE auto-import; pulls in tkinter, which fails on headless servers
+import requests
+import json
+import lxml.etree as ET
+
+import gensim
+import pandas as pd
+
+import nltk
+# from nltk.corpus import framenet as fn
+# --- circumvent threading issues with FrameNet
+fn_root = nltk.data.find("{}/{}".format("corpora", "framenet_v17"))
+print(fn_root)
+fn_files = ["frRelation.xml", "frameIndex.xml", "fulltextIndex.xml", "luIndex.xml", "semTypes.xml"]
+fn = nltk.corpus.reader.framenet.FramenetCorpusReader(fn_root, fn_files)
+# ---
+
+import streamlit as st
+
+from sociolome import lome_wrapper
+
+
+def similarity(gensim_m, frame_1, frame_2):
+    if f"fn_{frame_1}" not in gensim_m or f"fn_{frame_2}" not in gensim_m:
+        return None
+    return 1 - gensim_m.distance(f"fn_{frame_1}", f"fn_{frame_2}")
+
+
+def rank(gensim_m, frame_1, frame_2):
+    frame_1 = f"fn_{frame_1}"
+    frame_2 = f"fn_{frame_2}"
+
+    if frame_1 == frame_2:
+        return 0
+
+    for i, (word, _) in enumerate(gensim_m.most_similar(frame_1, topn=1200)):
+        if word == frame_2:
+            return i + 1
+    return -1
+
+
+def format_frame_description(frame_def_xml):
+    frame_def_fmt = [frame_def_xml.text] if frame_def_xml.text else []
+    for elem in frame_def_xml:
+        if elem.tag == "ex":
+            break
+        elif elem.tag == "fen":
+            frame_def_fmt.append(elem.text.upper())
+        elif elem.text:
+            frame_def_fmt.append(elem.text)
+        if elem.tail:
+            frame_def_fmt.append(elem.tail)
+    return "".join(frame_def_fmt).replace("frames", "stories").replace("frame", "story")
+
+
+def get_frame_definition(frame_info):
+    try:
+        # try extracting just the first sentence
+        definition_first_sent = nltk.sent_tokenize(frame_info.definitionMarkup)[0] + "</def-root>"
+        frame_def_xml = ET.fromstring(definition_first_sent)
+    except ET.XMLSyntaxError:
+        # otherwise, use the full definition
+        frame_def_xml = ET.fromstring(frame_info.definitionMarkup)
+    return format_frame_description(frame_def_xml)
+
+
+def get_random_example(frame_info):
+    exemplars = [
+        {
+            "text": exemplar.text,
+            "target_lu": lu_name,
+            "target_idx": list(exemplar["Target"][0]),
+            "core_fes": {
+                role: exemplar.text[start_idx:end_idx]
+                for role, start_idx, end_idx in exemplar.FE[0]
+                if role in [fe for fe, fe_info in frame_info.FE.items() if fe_info.coreType == "Core"]
+            }
+        }
+        for lu_name, lu_info in frame_info["lexUnit"].items()
+        for exemplar in lu_info.exemplars if len(exemplar.text) > 30
+    ]
+    if exemplars:
+        return random.choice(exemplars)
+    return None
+
+
+def make_hint(gensim_m, target, current_closest):
+    if target == current_closest:
+        return None
+
+    most_similar = gensim_m.most_similar(f"fn_{target}", topn=1200)
+    current_position = [word for word, _ in most_similar].index(f"fn_{current_closest}")
+
+    while current_position > 0:
+        next_closest, _ = most_similar[current_position - 1]
+        info = fn.frame(next_closest.replace("fn_", ""))
+        if len(info.lexUnit) > 10:
+            exemplar = get_random_example(info)
+            if exemplar:
+                return next_closest, exemplar
+        current_position -= 1
+
+    return None
+
+
+def get_typical_exemplar(frame_info):
+    exemplars = [
+        {
+            "text": exemplar.text,
+            "target_lu": lu_name,
+            "target_idx": list(exemplar["Target"][0]),
+            "core_fes": {
+                role: exemplar.text[start_idx:end_idx]
+                for role, start_idx, end_idx in exemplar.FE[0]
+                if role in [fe for fe, fe_info in frame_info.FE.items() if fe_info.coreType == "Core"]
+            }
+        }
+        for lu_name, lu_info in frame_info["lexUnit"].items()
+        for exemplar in lu_info.exemplars
+    ]
+
+    # try to find a "typical" exemplar --- typical -> as short as possible, as many FEs as possible
+    exa_typicality_scores = [(exa, len(exa["text"]) - 25 * len(exa["core_fes"])) for exa in exemplars]
+    if exa_typicality_scores:
+        typical_exemplar = min(exa_typicality_scores, key=lambda t: t[1])[0]
+    else:
+        typical_exemplar = None
+    return typical_exemplar
+
+
+def find_all_inheriting_frames(frame_name):
+    frame_info = fn.frame(frame_name)
+    inheritance_rels = [rel for rel in frame_info.frameRelations if rel.type.name == "Inheritance" and rel.superFrame.name == frame_name]
+    inheritors = [rel.subFrame.name for rel in inheritance_rels]
+    for inh in inheritors:
+        inheritors.extend(find_all_inheriting_frames(inh))
+    return inheritors
+
+
+def has_enough_lus(frame, n=10):
+    return len(fn.frame(frame).lexUnit) > n
+
+
+def choose_secret_frames():
+    event_frames = [frm for frm in find_all_inheriting_frames("Event") if has_enough_lus(frm)]
+    entity_frames = [frm for frm in find_all_inheriting_frames("Entity") if has_enough_lus(frm)]
+    return random.choice(list(product(event_frames, entity_frames)))
+
+
+def get_frame_info(frames):
+    frames_and_info = []
+    for evoked_frame in frames:
+        try:
+            frame_info = fn.frame(evoked_frame)
+            typical_sentence = get_typical_exemplar(frame_info)
+            frames_and_info.append((evoked_frame, frame_info, typical_sentence))
+        except FileNotFoundError:
+            continue
+    return frames_and_info
+
+
+def get_frame_feedback(frames_and_info, gensim_m, secret_event, secret_entity):
+    frame_feedback = []
+    for evoked_frame, frame_info, typical_sentence in frames_and_info:
+        lexunits = list(frame_info.lexUnit.keys())[:5]
+        similarity_score_1 = similarity(gensim_m, secret_event, evoked_frame)
+        similarity_rank_1 = rank(gensim_m, secret_event, evoked_frame)
+        similarity_score_2 = similarity(gensim_m, secret_entity, evoked_frame)
+        similarity_rank_2 = rank(gensim_m, secret_entity, evoked_frame)
+        if typical_sentence:
+            typical_sentence_txt = typical_sentence['text']
+        else:
+            typical_sentence_txt = None
+
+        frame_feedback.append({
+            "frame": evoked_frame,
+            "similarity_1": similarity_score_1 * 100 if similarity_score_1 else None,
+            "rank_1": similarity_rank_1 if similarity_rank_1 != -1 else "far away",
+            "similarity_2": similarity_score_2 * 100 if similarity_score_2 else None,
+            "rank_2": similarity_rank_2 if similarity_rank_2 != -1 else "far away",
+            "typical_words": lexunits,
+            "typical_sentence": typical_sentence_txt
+        })
+    return frame_feedback
+
+
+def run_game_cli(debug=True):
+
+    secret_event, secret_entity = choose_secret_frames()
+
+    if debug:
+        print(f"Shhhhhh you're not supposed to know, but the secret frames are {secret_event} and {secret_entity}")
+        print("--------\n\n\n\n")
+
+    print("Welcome to FillmorLe!")
+    print("Words are not just words: behind every word, a story is hidden that appears in our imagination when we hear the word.")
+    print()
+    print("In this game, your job is to activate TWO SECRET STORIES by writing sentences.")
+    print("There will be new secret stories every day -- the first story is always about an EVENT (something that happens in the world) and the second one about an ENTITY (a thing or concept).")
+    print("Every time you write a sentence, I will tell you which stories are hidden below the surface, and how close these stories are to the secret stories.")
+    print("Once you write a sentence that has both of the secret stories in it, you win. Good luck and be creative!")
+
+    gensim_m = gensim.models.word2vec.KeyedVectors.load_word2vec_format("data/frame_embeddings.w2v.txt")
+
+    num_guesses = 0
+    guesses_event = []
+    guesses_entity = []
+
+    while True:
+        num_guesses += 1
+        closest_to_event = sorted(guesses_event, key=lambda g: g[1], reverse=True)[:5]
+        closest_to_entity = sorted(guesses_entity, key=lambda g: g[1], reverse=True)[:5]
+        closest_to_event_txt = ", ".join([f"{frm.upper()} ({sim:.2f})" for frm, sim in closest_to_event])
+        closest_to_entity_txt = ", ".join([f"{frm.upper()} ({sim:.2f})" for frm, sim in closest_to_entity])
+
+        print()
+        print(f"==== Guess #{num_guesses} ====")
+        # guesses are (frame, similarity) pairs, so compare against the frame names
+        if secret_event in [frm for frm, _ in guesses_event]:
+            print("You already guessed SECRET STORY #1: ", secret_event.upper())
+        elif closest_to_event:
+            print("Best guesses (SECRET STORY #1):", closest_to_event_txt)
+
+        if secret_entity in [frm for frm, _ in guesses_entity]:
+            print("You already guessed SECRET STORY #2: ", secret_entity.upper())
+        elif closest_to_entity:
+            print("Best guesses (SECRET STORY #2):", closest_to_entity_txt)
+
+        sentence = input("Enter a sentence or type 'HINT' if you're stuck >>>> ").strip()
+
+        if sentence == "HINT":
+            hint_target = None
+            while not hint_target:
+                hint_choice = input("For which story do you want a hint? Type '1' or '2' >>>> ").strip()
+                if hint_choice == "1":
+                    hint_target = secret_event
+                    hint_current = closest_to_event[0][0] if closest_to_event else "Event"
+                elif hint_choice == "2":
+                    hint_target = secret_entity
+                    hint_current = closest_to_entity[0][0] if closest_to_entity else "Entity"
+                else:
+                    print("Please type '1' or '2'.")
+
+            if hint_current == hint_target:
+                print("You don't need a hint for this story! Maybe you want a hint for the other one?")
+                continue
+
+            hint = make_hint(gensim_m, hint_target, hint_current)
+            if hint is None:
+                print("Sorry, you're already too close to give you a hint!")
+            else:
+                _, hint_example = hint
+                hint_tgt_idx = hint_example["target_idx"]
+                hint_example_redacted = hint_example["text"][:hint_tgt_idx[0]] + "******" + hint_example["text"][hint_tgt_idx[1]:]
+                print(f"Your hint sentence is: «{hint_example_redacted}»")
+                print(f"PRO TIP 1: the '******' hides a secret word. Guess the word and you will find a story that takes you one step closer to finding SECRET STORY #{hint_choice}")
+                print("PRO TIP 2: if you don't get the hint, just ask for a new one! You can do this as often as you want.")
+            print("\n\n")
+            continue
+
+        r = requests.get("http://127.0.0.1:9090/analyze", params={"text": sentence})
+        lome_data = json.loads(r.text)
+        frames = set()
+        for token_items in lome_data["analyses"][0]["frame_list"]:
+            for item in token_items:
+                if item.startswith("T:"):
+                    evoked_frame = item.split("@")[0].replace("T:", "")
+                    frames.add(evoked_frame)
+
+        frames_and_info = get_frame_info(frames)
+        frame_feedback = get_frame_feedback(frames_and_info, gensim_m, secret_event, secret_entity)
+
+        for i, feedback in enumerate(frame_feedback):
+
+            print(f"STORY {i}: {feedback['frame'].upper()}")
+            if feedback["typical_sentence"]:
+                print(f"\ttypical context: «{feedback['typical_sentence']}»")
+            print("\ttypical words:", ", ".join(feedback["typical_words"]), "...")
+            if feedback["similarity_1"]:
+                guesses_event.append((feedback["frame"], feedback["similarity_1"]))
+                guesses_entity.append((feedback["frame"], feedback["similarity_2"]))
+                print(f"\tsimilarity to SECRET STORY #1: {feedback['similarity_1']:.2f}")
+                print(f"\tsimilarity to SECRET STORY #2: {feedback['similarity_2']:.2f}")
+            else:
+                print("similarity: unknown")
+            print()
+
+        if not frames_and_info:
+            print("I don't know any of the stories in your sentence. Try entering another sentence.")
+
+        elif secret_event in frames and secret_entity in frames:
+            print(f"YOU WIN! You made a sentence with both of the SECRET STORIES: {secret_event.upper()} and {secret_entity.upper()}.\nYou won the game in {num_guesses} guesses, great job!")
+            break
+
+        elif secret_event in frames:
+            print(f"Great, you guessed SECRET STORY #1! It was {secret_event.upper()}!")
+            print("To win, make a sentence with this story and SECRET STORY #2 hidden in it.")
+
+        elif secret_entity in frames:
+            print(f"Great, you guessed SECRET STORY #2! It was {secret_entity.upper()}!")
+            print("To win, make a sentence with this story and SECRET STORY #1 hidden in it.")
+
+
+# dummy version
+# def analyze_sentence(sentence):
+#     return sentence.split()
+
+def analyze_sentence(sentence):
+    lome_data = lome_wrapper.analyze(sentence)
+    frames = set()
+    for token_items in lome_data["analyses"][0]["frame_list"]:
+        for item in token_items:
+            if item.startswith("T:"):
+                evoked_frame = item.split("@")[0].replace("T:", "")
+                frames.add(evoked_frame)
+    return frames
+
+
+def make_frame_feedback_msg(frame_feedback):
+    feedback_msg = []
+    for i, feedback in enumerate(frame_feedback):
+        feedback_msg.append(f"* STORY {i}: *{feedback['frame'].upper()}*")
+        feedback_msg.append("\t* typical words: *" + " ".join(feedback["typical_words"]) + "* ...")
+        if feedback["typical_sentence"]:
+            feedback_msg.append(f"\t* typical context: «{feedback['typical_sentence']}»")
+
+        if feedback["similarity_1"]:
+            feedback_msg.append(f"\t* similarity to SECRET STORY #1: {feedback['similarity_1']:.2f}")
+            feedback_msg.append(f"\t* similarity to SECRET STORY #2: {feedback['similarity_2']:.2f}")
+        else:
+            feedback_msg.append("\t* similarity: unknown")
+    return "\n".join(feedback_msg)
+
+
+def format_hint_sentence(hint_example):
+    hint_tgt_idx = hint_example["target_idx"]
+    hint_example_redacted = hint_example["text"][:hint_tgt_idx[0]] + "******" + hint_example["text"][hint_tgt_idx[1]:]
+    return hint_example_redacted.strip()
+
+
+def play_turn():
+    # remove text from input
+    sentence = st.session_state["cur_sentence"]
+    st.session_state["cur_sentence"] = ""
+
+    # get previous game state
+    game_state = st.session_state["game_state"]
+    secret_event, secret_entity = game_state["secret_event"], game_state["secret_entity"]
+    guesses_event, guesses_entity = game_state["guesses_event"], game_state["guesses_entity"]
+
+    # reset hints
+    st.session_state["hints"] = [None, None]
+
+    # reveal correct frames
+    if sentence.strip().lower() == "show me the frames":
+        st.warning(f"The correct frames are: {secret_event.upper()} and {secret_entity.upper()}")
+
+    # process hints
+    elif sentence.strip() == "HINT":
+        guesses_event = sorted(game_state["guesses_event"], key=lambda t: t[1], reverse=True)
+        guesses_entity = sorted(game_state["guesses_entity"], key=lambda t: t[1], reverse=True)
+        best_guess_event = guesses_event[0][0] if guesses_event else "Event"
+        best_guess_entity = guesses_entity[0][0] if guesses_entity else "Entity"
+
+        event_hint = make_hint(st.session_state["gensim_model"], secret_event, best_guess_event)
+        entity_hint = make_hint(st.session_state["gensim_model"], secret_entity, best_guess_entity)
+
+        if event_hint:
+            st.session_state["hints"][0] = format_hint_sentence(event_hint[1])
+        if entity_hint:
+            st.session_state["hints"][1] = format_hint_sentence(entity_hint[1])
+
+    else:
+        frames = analyze_sentence(sentence)
+        frames_and_info = get_frame_info(frames)
+        frame_feedback = get_frame_feedback(frames_and_info, st.session_state["gensim_model"], secret_event, secret_entity)
+
+        # update game state post analysis
+        game_state["num_guesses"] += 1
+        for fdb in frame_feedback:
+            if fdb["similarity_1"]:
+                guesses_event.add((fdb["frame"], fdb["similarity_1"], fdb["rank_1"]))
+                guesses_entity.add((fdb["frame"], fdb["similarity_2"], fdb["rank_2"]))
+
+        st.session_state["frame_feedback"] = frame_feedback
+        if secret_event in frames and secret_entity in frames:
+            st.session_state["game_over"] = True
+            st.session_state["guesses_to_win"] = game_state["num_guesses"]
+
+
+def display_guess_status():
+    game_state = st.session_state["game_state"]
+    guesses_entity = sorted(game_state["guesses_entity"], key=lambda t: t[1], reverse=True)
+    guesses_event = sorted(game_state["guesses_event"], key=lambda t: t[1], reverse=True)
+
+    if guesses_event or guesses_entity:
+        st.header("Best guesses")
+
+        event_col, entity_col = st.columns(2)
+        if guesses_event:
+            with event_col:
+                st.subheader("Secret Story #1")
+                st.table(pd.DataFrame(guesses_event, columns=["Story", "Similarity", "Steps To Go"]))
+                if game_state["secret_event"] in [g for g, _, _ in guesses_event]:
+                    st.info("Great, you guessed the Event story! In order to win, make a sentence containing both of the secret stories.")
+        if guesses_entity:
+            with entity_col:
+                st.subheader("Secret Story #2")
+                st.table(pd.DataFrame(guesses_entity, columns=["Story", "Similarity", "Steps To Go"]))
+                if game_state["secret_entity"] in [g for g, _, _ in guesses_entity]:
+                    st.info("Great, you guessed the Thing story! In order to win, make a sentence containing both of the secret stories.")
+
+
+def format_feedback(frame_feedback):
+    out = []
+    for fdb in frame_feedback:
+        out.append({
+            "Story": fdb["frame"],
+            # guard against frames missing from the embedding vocabulary (similarity is None)
+            "Similarity (Event)": f"{fdb['similarity_1']:.2f}" if fdb["similarity_1"] is not None else "unknown",
+            "Similarity (Thing)": f"{fdb['similarity_2']:.2f}" if fdb["similarity_2"] is not None else "unknown",
+            "Typical Context": fdb["typical_sentence"],
+            "Typical Words": " ".join(fdb["typical_words"])
+        })
+    return out
+
+
+def display_introduction():
+    st.subheader("Why this game?")
+    st.markdown(
+        """
+        Words are not just words: behind every word, a _mini-story_ (also known as "frame") is hidden
+        that appears in our imagination when we hear the word. For example, when we hear the word
+        "talking" we can imagine a mini-story that involves several people who are interacting
+        with each other. Or, if we hear the word "cookie", we might think of someone eating a cookie.
+        """.strip())
+
+    st.subheader("How does it work?")
+    st.markdown(
+        "* In this game, there are two secret mini-stories, and it's your job to figure out which ones!"
+        "\n"
+        "* The first mini-story is about an _Event_ (something that happens in the world, like a thunderstorm, "
+        "people talking, someone eating pasta), and the other one is a _Thing_ (a concrete thing like a tree "
+        "or something abstract like 'love')."
+        "\n"
+        "* How to guess the stories? Well, just type a sentence, and we'll tell you which mini-stories are "
+        "hidden in the sentence. For each of the stories, we'll tell you how close they are to the secret ones."
+        "\n"
+        "* Once you type a sentence with both of the secret mini-stories, you win!"
+    )
+
+
+def display_hints():
+    event_hint, entity_hint = st.session_state["hints"]
+    if event_hint or entity_hint:
+        st.header("Hints")
+        st.info("So you need some help? Here you get your hint sentences! Guess the hidden word, use it in a sentence, and we'll help you get one step closer.")
+
+        if event_hint:
+            st.markdown(f"**Event Hint**:\n>_{event_hint}_")
+        if entity_hint:
+            st.markdown(f"**Thing Hint**:\n>_{entity_hint}_")
+
+
+def display_frame_feedback():
+    frame_feedback = st.session_state["frame_feedback"]
+    if frame_feedback:
+        st.header("Feedback")
+        st.text("Your sentence contains the following stories: ")
+        feedback_df = format_feedback(frame_feedback)
+        st.table(pd.DataFrame(feedback_df))
+
+
+def run_game_st(debug=True):
+
+    if not st.session_state.get("initialized", False):
+
+        secret_event, secret_entity = choose_secret_frames()
+        gensim_m = gensim.models.word2vec.KeyedVectors.load_word2vec_format("data/frame_embeddings.w2v.txt")
+
+        game_state = {
+            "secret_event": secret_event,
+            "secret_entity": secret_entity,
+            "num_guesses": 0,
+            "guesses_event": set(),
+            "guesses_entity": set(),
+        }
+
+        st.session_state["initialized"] = True
+        st.session_state["show_introduction"] = False
+        st.session_state["game_over"] = False
+        st.session_state["guesses_to_win"] = -1
+        st.session_state["game_state"] = game_state
+        st.session_state["gensim_model"] = gensim_m
+        st.session_state["frame_feedback"] = None
+        st.session_state["hints"] = [None, None]
+
+    else:
+        gensim_m = st.session_state["gensim_model"]
+        game_state = st.session_state["game_state"]
+
+    secret_event, secret_entity = game_state["secret_event"], game_state["secret_entity"]
+
+    header = st.container()
+    with header:
+        st.title("FillmorLe")
+        st.checkbox("Show explanation?", key="show_introduction")
+        if st.session_state["show_introduction"]:
+            display_introduction()
+
+    st.header(f"Guess #{st.session_state['game_state']['num_guesses'] + 1}")
+    st.text_input("Enter a sentence or type 'HINT' if you're stuck", key="cur_sentence", on_change=play_turn)
+
+    if st.session_state["game_over"]:
+        st.success(f"You won in {st.session_state['guesses_to_win']} guesses!")
+
+    display_hints()
+    display_frame_feedback()
+    display_guess_status()
+
+
+if __name__ == "__main__":
+    run_game_st()
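For reference, `analyze_sentence` above assumes the LOME output is a dict whose `analyses[0]["frame_list"]` holds one list of tag strings per token, where frame-evoking targets look like `T:<FrameName>@<offset>`. A toy payload (the frame names and offsets here are made up for illustration) shows what the parsing loop extracts:

# Hypothetical LOME-style response; only "T:" entries are frame targets.
lome_data = {
    "analyses": [{
        "frame_list": [
            ["T:Ingestion@1"],               # token 0: evokes a frame
            [],                              # token 1: evokes nothing
            ["T:Food@2", "R:Ingestibles"],   # token 2: "R:" role tags are ignored
        ]
    }]
}

frames = set()
for token_items in lome_data["analyses"][0]["frame_list"]:
    for item in token_items:
        if item.startswith("T:"):
            frames.add(item.split("@")[0].replace("T:", ""))

print(frames)  # {'Ingestion', 'Food'}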
model.mod.tar.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f5be5aeef50b2f4840317b8196c51186f9f138a853dc1eb2da980b1947ceb23
+size 1795605184
requirements.txt
ADDED
@@ -0,0 +1,19 @@
+allennlp>=2.0.0
+allennlp-models>=2.0.0
+transformers>=4.0.0  # Why is huggingface so unstable?
+numpy
+torch>=1.7.0,<1.8.0
+tqdm
+nltk
+overrides
+concrete
+flask
+scipy
+requests
+lxml
+gensim
+streamlit
+https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.0.0/it_core_news_md-3.0.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/nl_core_news_md-3.0.0/nl_core_news_md-3.0.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.0.0/xx_sent_ud_sm-3.0.0-py3-none-any.whl
setup.py
ADDED
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+
+setup(
+    name='sftp',
+    version='0.0.2',
+    author='Guanghui Qin',
+    packages=find_packages(),
+)
sftp/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from .data_reader import (
+    BetterDatasetReader, SRLDatasetReader
+)
+from .metrics import SRLMetric, BaseF, ExactMatch, FBetaMixMeasure
+from .models import SpanModel
+from .modules import (
+    MLPSpanTyping, SpanTyping, SpanFinder, BIOSpanFinder
+)
+from .predictor import SpanPredictor
+from .utils import Span
sftp/data_reader/__init__.py
ADDED
@@ -0,0 +1,6 @@
+from .batch_sampler import MixSampler
+from .better_reader import BetterDatasetReader
+from .span_reader import SpanReader
+from .srl_reader import SRLDatasetReader
+from .concrete_srl import concrete_doc, concrete_doc_tokenized, collect_concrete_srl
+from .concrete_reader import ConcreteDatasetReader
sftp/data_reader/batch_sampler/__init__.py
ADDED
@@ -0,0 +1 @@
+from .mix_sampler import MixSampler
sftp/data_reader/batch_sampler/mix_sampler.py
ADDED
@@ -0,0 +1,50 @@
+import logging
+import random
+from typing import *
+
+from allennlp.data.samplers.batch_sampler import BatchSampler
+from allennlp.data.samplers.max_tokens_batch_sampler import MaxTokensBatchSampler
+from torch.utils import data
+
+logger = logging.getLogger('mix_sampler')
+
+
+@BatchSampler.register('mix_sampler')
+class MixSampler(MaxTokensBatchSampler):
+    def __init__(
+            self,
+            max_tokens: int,
+            sorting_keys: List[str] = None,
+            padding_noise: float = 0.1,
+            sampling_ratios: Optional[Dict[str, float]] = None,
+    ):
+        super().__init__(max_tokens, sorting_keys, padding_noise)
+
+        self.sampling_ratios = sampling_ratios or dict()
+
+    def __iter__(self):
+        indices, lengths = self._argsort_by_padding(self.data_source)
+
+        original_num = len(indices)
+        instance_types = [
+            ins.fields['meta'].metadata.get('type', 'default') if 'meta' in ins.fields else 'default'
+            for ins in self.data_source
+        ]
+        instance_thresholds = [
+            self.sampling_ratios[ins_type] if ins_type in self.sampling_ratios else 1.0 for ins_type in instance_types
+        ]
+        for idx, threshold in enumerate(instance_thresholds):
+            if random.random() > threshold:
+                # Reject
+                list_idx = indices.index(idx)
+                del indices[list_idx], lengths[list_idx]
+        if original_num != len(indices):
+            logger.info(f'#instances reduced from {original_num} to {len(indices)}.')
+
+        max_lengths = [max(length) for length in lengths]
+        group_iterator = self._lazy_groups_of_max_size(indices, max_lengths)
+
+        batches = [list(group) for group in group_iterator]
+        random.shuffle(batches)
+        for batch in batches:
+            yield batch
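The heart of MixSampler is the per-type rejection step: an instance whose `meta['type']` maps to sampling ratio r survives with probability r, and types absent from `sampling_ratios` default to 1.0 (always kept). A self-contained sketch of just that step, with toy type names:

import random

sampling_ratios = {"framenet": 1.0, "better": 0.3}  # toy ratios, not from the repo
instance_types = ["framenet", "better", "better", "framenet", "better"]

kept = [
    idx for idx, ins_type in enumerate(instance_types)
    # keep when random.random() <= ratio, i.e. reject when it exceeds the threshold
    if random.random() <= sampling_ratios.get(ins_type, 1.0)
]
print(kept)  # e.g. [0, 1, 3] -- 'better' instances survive ~30% of the time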
sftp/data_reader/better_reader.py
ADDED
@@ -0,0 +1,286 @@
+import json
+import logging
+import os
+from collections import defaultdict, namedtuple
+from typing import *
+
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.instance import Instance
+
+from .span_reader import SpanReader
+from ..utils import Span
+
+# logging.basicConfig(level=logging.DEBUG)
+
+# for v in logging.Logger.manager.loggerDict.values():
+#     v.disabled = True
+
+logger = logging.getLogger(__name__)
+
+SpanTuple = namedtuple('Span', ['start', 'end'])
+
+
+@DatasetReader.register('better')
+class BetterDatasetReader(SpanReader):
+    def __init__(
+            self,
+            eval_type,
+            consolidation_strategy='first',
+            span_set_type='single',
+            max_argument_ss_size=1,
+            use_ref_events=False,
+            **extra
+    ):
+        super().__init__(**extra)
+        self.eval_type = eval_type
+        assert self.eval_type in ['abstract', 'basic']
+
+        self.consolidation_strategy = consolidation_strategy
+        self.unitary_spans = span_set_type == 'single'
+        # event anchors are always singleton spans
+        self.max_arg_spans = max_argument_ss_size
+        self.use_ref_events = use_ref_events
+
+        self.n_overlap_arg = 0
+        self.n_overlap_trigger = 0
+        self.n_skip = 0
+        self.n_too_long = 0
+
+    @staticmethod
+    def post_process_basic_span(predicted_span, basic_entry):
+        # Convert token offsets back to characters, also get the text spans as a sanity check
+
+        # !!!!!
+        # SF outputs inclusive idxs
+        # char offsets are inc-exc
+        # token offsets are inc-inc
+        # !!!!!
+
+        start_idx = predicted_span['start_idx']  # inc
+        end_idx = predicted_span['end_idx']  # inc
+
+        char_start_idx = basic_entry['tok2char'][predicted_span['start_idx']][0]  # inc
+        char_end_idx = basic_entry['tok2char'][predicted_span['end_idx']][-1] + 1  # exc
+
+        span_text = basic_entry['segment-text'][char_start_idx:char_end_idx]  # inc exc
+        span_text_tok = basic_entry['segment-text-tok'][start_idx:end_idx + 1]  # inc exc
+
+        span = {'string': span_text,
+                'start': char_start_idx,
+                'end': char_end_idx,
+                'start-token': start_idx,
+                'end-token': end_idx,
+                'string-tok': span_text_tok,
+                'label': predicted_span['label'],
+                'predicted': True}
+        return span
+
+    @staticmethod
+    def _get_shortest_span(spans):
+        # shortest_span_length = float('inf')
+        # shortest_span = None
+        # for span in spans:
+        #     span_tokens = span['string-tok']
+        #     span_length = len(span_tokens)
+        #     if span_length < shortest_span_length:
+        #         shortest_span_length = span_length
+        #         shortest_span = span
+
+        # return shortest_span
+        return [s[-1] for s in sorted([(len(span['string']), ix, span) for ix, span in enumerate(spans)])]
+
+    @staticmethod
+    def _get_first_span(spans):
+        spans = [(span['start'], -len(span['string']), ix, span) for ix, span in enumerate(spans)]
+        try:
+            return [s[-1] for s in sorted(spans)]
+        except:
+            breakpoint()
+
+    @staticmethod
+    def _get_longest_span(spans):
+        return [s[-1] for s in sorted([(len(span['string']), ix, span) for ix, span in enumerate(spans)], reverse=True)]
+
+    @staticmethod
+    def _subfinder(text, pattern):
+        # https://stackoverflow.com/a/12576755
+        matches = []
+        pattern_length = len(pattern)
+        for i, token in enumerate(text):
+            try:
+                if token == pattern[0] and text[i:i + pattern_length] == pattern:
+                    matches.append(SpanTuple(start=i, end=i + pattern_length - 1))  # inclusive boundaries
+            except:
+                continue
+        return matches
+
+    def consolidate_span_set(self, spans):
+        if self.consolidation_strategy == 'first':
+            spans = BetterDatasetReader._get_first_span(spans)
+        elif self.consolidation_strategy == 'shortest':
+            spans = BetterDatasetReader._get_shortest_span(spans)
+        elif self.consolidation_strategy == 'longest':
+            spans = BetterDatasetReader._get_longest_span(spans)
+        else:
+            raise NotImplementedError(f"{self.consolidation_strategy} does not exist")
+
+        if self.unitary_spans:
+            spans = [spans[0]]
+        else:
+            spans = spans[:self.max_arg_spans]
+
+        # TODO add some sanity checks here
+
+        return spans
+
+    def get_mention_spans(self, text: List[str], span_sets: Dict):
+        mention_spans = defaultdict(list)
+        for span_set_id in span_sets.keys():
+            spans = span_sets[span_set_id]['spans']
+            # span = BetterDatasetReader._get_shortest_span(spans)
+            # span = BetterDatasetReader._get_earliest_span(spans)
+            consolidated_spans = self.consolidate_span_set(spans)
+            # if len(spans) > 1:
+            #     logging.info(f"Truncated a spanset from {len(spans)} spans to 1")
+
+            if self.eval_type == 'abstract':
+                span = consolidated_spans[0]
+                span_tokens = span['string-tok']
+
+                span_indices = BetterDatasetReader._subfinder(text=text, pattern=span_tokens)
+
+                if len(span_indices) > 1:
+                    pass
+
+                if len(span_indices) == 0:
+                    continue
+
+                mention_spans[span_set_id] = span_indices[0]
+            else:
+                # in basic, we already have token offsets in the right form
+
+                # if not span['string-tok'] == text[span['start-token']:span['end-token'] + 1]:
+                #     print(span, text[span['start-token']:span['end-token'] + 1])
+
+                # we should use these token offsets only!
+                for span in consolidated_spans:
+                    mention_spans[span_set_id].append(SpanTuple(start=span['start-token'], end=span['end-token']))
+
+        return mention_spans
+
+    def _read_single_file(self, file_path):
+        with open(file_path) as fp:
+            json_content = json.load(fp)
+            if 'entries' in json_content:
+                for doc_name, entry in json_content['entries'].items():
+                    instance = self.text_to_instance(entry, 'train' in file_path)
+                    yield instance
+            else:  # TODO why is this split in 2 cases?
+                for doc_name, entry in json_content.items():
+                    instance = self.text_to_instance(entry, True)
+                    yield instance
+
+        logger.warning(f'{self.n_overlap_arg} overlapped args detected!')
+        logger.warning(f'{self.n_overlap_trigger} overlapped triggers detected!')
+        logger.warning(f'{self.n_skip} skipped detected!')
+        logger.warning(f'{self.n_too_long} were skipped because they are too long!')
+        self.n_overlap_arg = self.n_skip = self.n_too_long = self.n_overlap_trigger = 0
+
+    def _read(self, file_path: str) -> Iterable[Instance]:
+
+        if os.path.isdir(file_path):
+            for fn in os.listdir(file_path):
+                if not fn.endswith('.json'):
+                    logger.info(f'Skipping {fn}')
+                    continue
+                logger.info(f'Loading from {fn}')
+                yield from self._read_single_file(os.path.join(file_path, fn))
+        else:
+            yield from self._read_single_file(file_path)
+
+    def text_to_instance(self, entry, is_training=False):
+        word_tokens = entry['segment-text-tok']
+
+        # span sets have been trimmed to the earliest span mention
+        spans = self.get_mention_spans(
+            word_tokens, entry['annotation-sets'][f'{self.eval_type}-events']['span-sets']
+        )
+
+        # idx of every token that is a part of an event trigger/anchor span
+        all_trigger_idxs = set()
+
+        # actual inputs to the model
+        input_spans = []
+
+        self._local_child_overlap = 0
+        self._local_child_total = 0
+
+        better_events = entry['annotation-sets'][f'{self.eval_type}-events']['events']
+
+        skipped_events = set()
+        # check for events that overlap other event's anchors, skip them later
+        for event_id, event in better_events.items():
+            assert event['anchors'] in spans
+
+            # take the first consolidated span for anchors
+            anchor_start, anchor_end = spans[event['anchors']][0]
+
+            if any(ix in all_trigger_idxs for ix in range(anchor_start, anchor_end + 1)):
+                logger.warning(
+                    f"Skipped {event_id} with anchor span {event['anchors']}, overlaps a previously found event trigger/anchor")
+                self.n_overlap_trigger += 1
+                skipped_events.add(event_id)
+                continue
+
+            all_trigger_idxs.update(range(anchor_start, anchor_end + 1))  # record the trigger
+
+        for event_id, event in better_events.items():
+            if event_id in skipped_events:
+                continue
+
+            # arguments for just this event
+            local_arg_idxs = set()
+            # take the first consolidated span for anchors
+            anchor_start, anchor_end = spans[event['anchors']][0]
+
+            event_span = Span(anchor_start, anchor_end, event['event-type'], True)
+            input_spans.append(event_span)
+
+            def add_a_child(span_id, label):
+                # TODO this is a bad way to do this
+                assert span_id in spans
+                for child_span in spans[span_id]:
+                    self._local_child_total += 1
+                    arg_start, arg_end = child_span
+
+                    if any(ix in local_arg_idxs for ix in range(arg_start, arg_end + 1)):
+                        # logger.warn(f"Skipped argument {span_id}, overlaps a previously found argument")
+                        # print(entry['annotation-sets'][f'{self.eval_type}-events']['span-sets'][span_id])
+                        self.n_overlap_arg += 1
+                        self._local_child_overlap += 1
+                        continue
+
+                    local_arg_idxs.update(range(arg_start, arg_end + 1))
+                    event_span.add_child(Span(arg_start, arg_end, label, False))
+
+            for agent in event['agents']:
+                add_a_child(agent, 'agent')
+            for patient in event['patients']:
+                add_a_child(patient, 'patient')
+
+            if self.use_ref_events:
+                for ref_event in event['ref-events']:
+                    if ref_event in skipped_events:
+                        continue
+                    ref_event_anchor_id = better_events[ref_event]['anchors']
+                    add_a_child(ref_event_anchor_id, 'ref-event')
+
+            # if len(event['ref-events']) > 0:
+            #     breakpoint()
+
+        fields = self.prepare_inputs(word_tokens, spans=input_spans)
+        if self._local_child_overlap > 0:
+            logging.warning(
+                f"Skipped {self._local_child_overlap} / {self._local_child_total} argument spans due to overlaps")
+        return Instance(fields)
+
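The three consolidation strategies in BetterDatasetReader differ only in how a span set is ordered before truncation: 'shortest' and 'longest' sort by surface-string length, while 'first' sorts by start offset and breaks ties by preferring the longer mention. A toy illustration of the 'first' ordering (the dicts mirror the reader's span format; values are made up):

spans = [
    {"start": 10, "string": "the tall man"},
    {"start": 10, "string": "man"},
    {"start": 2, "string": "he"},
]
# earliest start first; at equal starts, longer strings sort ahead (hence -len)
ordered = [s[-1] for s in sorted(
    (span["start"], -len(span["string"]), ix, span) for ix, span in enumerate(spans)
)]
print([s["string"] for s in ordered])  # ['he', 'the tall man', 'man']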
sftp/data_reader/concrete_reader.py
ADDED
@@ -0,0 +1,44 @@
+import logging
+from collections import defaultdict
+from typing import *
+import os
+
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.instance import Instance
+from concrete import SituationMention
+from concrete.util import CommunicationReader
+
+from .span_reader import SpanReader
+from .srl_reader import SRLDatasetReader
+from .concrete_srl import collect_concrete_srl
+from ..utils import Span, BIOSmoothing
+
+logger = logging.getLogger(__name__)
+
+
+@DatasetReader.register('concrete')
+class ConcreteDatasetReader(SRLDatasetReader):
+    def __init__(
+            self,
+            event_only: bool = False,
+            event_smoothing_factor: float = 0.,
+            arg_smoothing_factor: float = 0.,
+            **extra
+    ):
+        super().__init__(**extra)
+        self.event_only = event_only
+        self.event_smooth_factor = event_smoothing_factor
+        self.arg_smooth_factor = arg_smoothing_factor
+
+    def _read(self, file_path: str) -> Iterable[Instance]:
+        if os.path.isdir(file_path):
+            for fn in os.listdir(file_path):
+                yield from self._read(os.path.join(file_path, fn))
+            return  # don't fall through and treat the directory itself as a file
+        all_files = CommunicationReader(file_path)
+        for comm, fn in all_files:
+            sentences = collect_concrete_srl(comm)
+            for tokens, vr in sentences:
+                yield self.text_to_instance(tokens, vr)
+        logger.warning(f'{self.n_span_removed} spans were removed')
+        self.n_span_removed = 0
sftp/data_reader/concrete_srl.py
ADDED
@@ -0,0 +1,169 @@
+ from time import time
+ from typing import *
+ from collections import defaultdict
+
+ from concrete import (
+     Token, TokenList, TextSpan, MentionArgument, SituationMentionSet, SituationMention, TokenRefSequence,
+     Communication, EntityMention, EntityMentionSet, Entity, EntitySet, AnnotationMetadata, Sentence
+ )
+ from concrete.util import create_comm, AnalyticUUIDGeneratorFactory
+ from concrete.validate import validate_communication
+
+ from ..utils import Span
+
+
+ def _process_sentence(sent, comm_sent, aug, char_idx_offset: int):
+     token_list = list()
+     for tok_idx, (start_idx, end_idx) in enumerate(sent['tokenization']):
+         token_list.append(Token(
+             tokenIndex=tok_idx,
+             text=sent['sentence'][start_idx:end_idx + 1],
+             textSpan=TextSpan(
+                 start=start_idx + char_idx_offset,
+                 ending=end_idx + char_idx_offset + 1
+             ),
+         ))
+     comm_sent.tokenization.tokenList = TokenList(tokenList=token_list)
+
+     sm_list, em_dict, entity_list = list(), dict(), list()
+
+     annotation = sent['annotations'] if isinstance(sent['annotations'], Span) else Span.from_json(sent['annotations'])
+     for event in annotation:
+         char_start_idx = sent['tokenization'][event.start_idx][0]
+         char_end_idx = sent['tokenization'][event.end_idx][1]
+         sm = SituationMention(
+             uuid=next(aug),
+             text=sent['sentence'][char_start_idx: char_end_idx + 1],
+             situationType='EVENT',
+             situationKind=event.label,
+             argumentList=list(),
+             tokens=TokenRefSequence(
+                 tokenIndexList=list(range(event.start_idx, event.end_idx + 1)),
+                 tokenizationId=comm_sent.tokenization.uuid
+             ),
+         )
+
+         for arg in event:
+             em = em_dict.get((arg.start_idx, arg.end_idx + 1))
+             if em is None:
+                 char_start_idx = sent['tokenization'][arg.start_idx][0]
+                 char_end_idx = sent['tokenization'][arg.end_idx][1]
+                 em = EntityMention(next(aug), TokenRefSequence(
+                     tokenIndexList=list(range(arg.start_idx, arg.end_idx + 1)),
+                     tokenizationId=comm_sent.tokenization.uuid,
+                 ), text=sent['sentence'][char_start_idx: char_end_idx + 1])
+                 entity_list.append(Entity(next(aug), id=em.text, mentionIdList=[em.uuid]))
+                 em_dict[(arg.start_idx, arg.end_idx + 1)] = em
+             sm.argumentList.append(MentionArgument(
+                 role=arg.label,
+                 entityMentionId=em.uuid,
+             ))
+
+         sm_list.append(sm)
+
+     return sm_list, list(em_dict.values()), entity_list
+
+
+ def concrete_doc(
+         sentences: List[Dict[str, Any]],
+         doc_name: str = 'document',
+ ) -> Communication:
+     """
+     Data format: A list of sentences. Each sentence should be a dict of the following format:
+     {
+         "sentence": String.
+         "tokenization": A list of Tuple[int, int] for start and end indices. Both inclusive.
+         "annotations": A list of event dicts, or a Span object.
+     }
+     If "annotations" is a list of dicts, its format is as follows.
+     Each event should be a dict of the following format:
+     {
+         "span": [start_idx, end_idx]: Integer. Both inclusive.
+         "label": String.
+         "children": A list of arguments.
+     }
+     Each argument should be a dict of the following format:
+     {
+         "span": [start_idx, end_idx]: Integer. Both inclusive.
+         "label": String.
+     }
+
+     Note the "indices" above all refer to the indices of tokens, instead of characters.
+     """
+     comm = create_comm(
+         doc_name,
+         '\n'.join([sent['sentence'] for sent in sentences]),
+     )
+     aug = AnalyticUUIDGeneratorFactory(comm).create()
+     situation_mention_set = SituationMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
+     comm.situationMentionSetList = [situation_mention_set]
+     entity_mention_set = EntityMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
+     comm.entityMentionSetList = [entity_mention_set]
+     entity_set = EntitySet(
+         next(aug), AnnotationMetadata('O(0) Coref Parser.', time()), list(), None, entity_mention_set.uuid
+     )
+     comm.entitySetList = [entity_set]
+     assert len(sentences) == len(comm.sectionList[0].sentenceList)
+
+     char_idx_offset = 0
+     for sent, comm_sent in zip(sentences, comm.sectionList[0].sentenceList):
+         sm_list, em_list, entity_list = _process_sentence(sent, comm_sent, aug, char_idx_offset)
+         entity_set.entityList.extend(entity_list)
+         situation_mention_set.mentionList.extend(sm_list)
+         entity_mention_set.mentionList.extend(em_list)
+         char_idx_offset += len(sent['sentence']) + 1
+
+     validate_communication(comm)
+     return comm
+
+
+ def concrete_doc_tokenized(
+         sentences: List[List[str]],
+         spans: List[Span],
+         doc_name: str = "document",
+ ):
+     """
+     Similar to concrete_doc, but with tokenized words and spans.
+     """
+     inputs = list()
+     for sent, vr in zip(sentences, spans):
+         cur_start = 0
+         tokenization = list()
+         for token in sent:
+             tokenization.append((cur_start, cur_start + len(token) - 1))
+             cur_start += len(token) + 1
+         inputs.append({
+             "sentence": " ".join(sent),
+             "tokenization": tokenization,
+             "annotations": vr
+         })
+     return concrete_doc(inputs, doc_name)
+
+
+ def collect_concrete_srl(comm: Communication) -> List[Tuple[List[str], Span]]:
+     # Mapping from <sentence uuid> to [<Concrete sentence>, <associated situation mentions>]
+     sentences = defaultdict(lambda: [None, list()])
+     for sec in comm.sectionList:
+         for sen in sec.sentenceList:
+             sentences[sen.uuid.uuidString][0] = sen
+     # Assume there's only ONE situation mention set
+     assert len(comm.situationMentionSetList) == 1
+     # Assign each situation mention to the corresponding sentence
+     for men in comm.situationMentionSetList[0].mentionList:
+         if men.tokens is None: continue  # For ACE relations
+         sentences[men.tokens.tokenization.sentence.uuid.uuidString][1].append(men)
+     ret = list()
+     for sen, mention_list in sentences.values():
+         tokens = [t.text for t in sen.tokenization.tokenList.tokenList]
+         spans = list()
+         for mention in mention_list:
+             mention_tokens = sorted(mention.tokens.tokenIndexList)
+             event = Span(mention_tokens[0], mention_tokens[-1], mention.situationKind, True)
+             for men_arg in mention.argumentList:
+                 arg_tokens = sorted(men_arg.entityMention.tokens.tokenIndexList)
+                 event.add_child(Span(arg_tokens[0], arg_tokens[-1], men_arg.role, False))
+             spans.append(event)
+         vr = Span.virtual_root(spans)
+         ret.append((tokens, vr))
+     return ret
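To make the input format documented in concrete_doc tangible, here is a minimal usage sketch; the sentence, character offsets, and labels are invented for illustration:

    sentences = [{
        "sentence": "Alice sold the car",
        "tokenization": [(0, 4), (6, 9), (11, 13), (15, 17)],  # character spans, both inclusive
        "annotations": [{
            "span": [1, 1], "label": "Commerce_sell",  # token indices, both inclusive
            "children": [
                {"span": [0, 0], "label": "Seller"},
                {"span": [2, 3], "label": "Goods"},
            ],
        }],
    }]
    comm = concrete_doc(sentences, doc_name="example")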
sftp/data_reader/span_reader.py
ADDED
@@ -0,0 +1,197 @@
+ import logging
+ from abc import ABC
+ from typing import *
+
+ import numpy as np
+ from allennlp.common.util import END_SYMBOL
+ from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+ from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans
+ from allennlp.data.fields import *
+ from allennlp.data.token_indexers import PretrainedTransformerIndexer
+ from allennlp.data.tokenizers import PretrainedTransformerTokenizer, Token
+
+ from ..utils import Span, BIOSmoothing, apply_bio_smoothing
+
+ logger = logging.getLogger(__name__)
+
+
+ @DatasetReader.register('span')
+ class SpanReader(DatasetReader, ABC):
+     def __init__(
+             self,
+             pretrained_model: str,
+             max_length: int = 512,
+             ignore_label: bool = False,
+             debug: bool = False,
+             **extras
+     ) -> None:
+         """
+         :param pretrained_model: The name of the pretrained model, e.g. xlm-roberta-large.
+         :param max_length: Sequences longer than this limit will be truncated.
+         :param ignore_label: If True, labels on spans will be anonymized.
+         :param debug: True to turn on debugging mode.
+         :param span_proposals: Needed for the "enumeration" scheme, but not for "BIO".
+             If True, the reader will try to enumerate candidate spans in the sentence, which are then fed into
+             a binary classifier (EnumSpanFinder).
+             Note: proposing spans might take time, and it is better to use SpacyTokenizer if you want to call
+             a constituency parser or dependency parser.
+         :param maximum_negative_spans: Necessary for EnumSpanFinder.
+         :param extras: Args to DatasetReader.
+         """
+         super().__init__(**extras)
+         self.word_indexer = {
+             'pieces': PretrainedTransformerIndexer(pretrained_model, namespace='pieces')
+         }
+
+         self._pretrained_model_name = pretrained_model
+         self.debug = debug
+         self.ignore_label = ignore_label
+
+         self._pretrained_tokenizer = PretrainedTransformerTokenizer(pretrained_model)
+         self.max_length = max_length
+         self.n_span_removed = 0
+
+     def retokenize(
+             self, sentence: List[str], truncate: bool = True
+     ) -> Tuple[List[str], List[Optional[Tuple[int, int]]]]:
+         pieces, offsets = self._pretrained_tokenizer.intra_word_tokenize(sentence)
+         pieces = list(map(str, pieces))
+         if truncate:
+             pieces = pieces[:self.max_length]
+             pieces[-1] = END_SYMBOL
+         return pieces, offsets
+
+     def prepare_inputs(
+             self,
+             sentence: List[str],
+             spans: Optional[Union[List[Span], Span]] = None,
+             truncate: bool = True,
+             label_type: str = 'string',
+     ) -> Dict[str, Field]:
+         """
+         Prepare inputs and auxiliary variables for the span model.
+         :param sentence: A list of tokens. Do not pass in any special tokens, like BOS or EOS.
+             Necessary for both training and testing.
+         :param spans: Optional. For training, spans passed in will be considered as positive examples; the spans
+             that are automatically proposed and not in the positive set will be considered as negative examples.
+             Necessary for training.
+         :param truncate: If True, the sequence will be truncated if it's longer than `self.max_length`.
+         :param label_type: One of [string, list].
+
+         :return: Dict of AllenNLP fields. For a detailed explanation of every field, refer to the comments
+             below. For the shape of every field, check the module doc.
+             Fields list:
+                 - words
+                 - span_labels
+                 - span_boundary
+                 - parent_indices
+                 - parent_mask
+                 - bio_seqs
+                 - raw_sentence
+                 - raw_spans
+                 - proposed_spans
+         """
+         fields = dict()
+
+         pieces, offsets = self.retokenize(sentence, truncate)
+         fields['tokens'] = TextField(list(map(Token, pieces)), self.word_indexer)
+         raw_inputs = {'sentence': sentence, "pieces": pieces, 'offsets': offsets}
+         fields['raw_inputs'] = MetadataField(raw_inputs)
+
+         if spans is None:
+             return fields
+
+         vr = spans if isinstance(spans, Span) else Span.virtual_root(spans)
+         self.n_span_removed = vr.remove_overlapping()
+         raw_inputs['spans'] = vr
+
+         vr = vr.re_index(offsets)
+         if truncate:
+             vr.truncate(self.max_length)
+         if self.ignore_label:
+             vr.ignore_labels()
+
+         # (start_idx, end_idx) pairs. Left and right inclusive.
+         # The first span is the virtual root node. Shape [span, 2]
+         span_boundary = list()
+         # Label on each span. Shape [span]
+         span_labels = list()
+         # Parent idx (in the span indexing space). Shape [span]
+         span_parent_indices = list()
+         # True for parents. Shape [span]
+         parent_mask = [False] * vr.n_nodes
+         # Spans flattened in BFS order; positions in this list define the span indexing space.
+         flatten_spans = list(vr.bfs())
+         for span_idx, span in enumerate(vr.bfs()):
+             if span.is_parent:
+                 parent_mask[span_idx] = True
+             # 0 is the virtual root
+             parent_idx = flatten_spans.index(span.parent) if span.parent else 0
+             span_parent_indices.append(parent_idx)
+             span_boundary.append(span.boundary)
+             span_labels.append(span.label)
+
+         bio_tag_list: List[List[str]] = list()
+         bio_configs: List[List[BIOSmoothing]] = list()
+         # Shape: [#parent, #token, 3]
+         bio_seqs: List[np.ndarray] = list()
+         # One BIO seq per parent
+         for parent_idx, parent in filter(lambda node: node[1].is_parent, enumerate(flatten_spans)):
+             bio_tags = ['O'] * len(pieces)
+             bio_tag_list.append(bio_tags)
+             bio_smooth: List[BIOSmoothing] = [parent.child_smooth.clone() for _ in pieces]
+             bio_configs.append(bio_smooth)
+             for child in parent:
+                 assert all(bio_tags[bio_idx] == 'O' for bio_idx in range(child.start_idx, child.end_idx + 1))
+                 if child.smooth_weight is not None:
+                     for i in range(child.start_idx, child.end_idx + 1):
+                         bio_smooth[i].weight = child.smooth_weight
+                 bio_tags[child.start_idx] = 'B'
+                 for word_idx in range(child.start_idx + 1, child.end_idx + 1):
+                     bio_tags[word_idx] = 'I'
+             bio_seqs.append(apply_bio_smoothing(bio_smooth, bio_tags))
+
+         fields['span_boundary'] = ArrayField(
+             np.array(span_boundary), padding_value=0, dtype=np.int
+         )
+         fields['parent_indices'] = ArrayField(np.array(span_parent_indices), 0, np.int)
+         if label_type == 'string':
+             fields['span_labels'] = ListField([LabelField(label, 'span_label') for label in span_labels])
+         elif label_type == 'list':
+             fields['span_labels'] = ArrayField(np.array(span_labels))
+         else:
+             raise NotImplementedError
+         fields['parent_mask'] = ArrayField(np.array(parent_mask), False, np.bool)
+         fields['bio_seqs'] = ArrayField(np.stack(bio_seqs))
+
+         self._sanity_check(
+             flatten_spans, pieces, bio_tag_list, parent_mask, span_boundary, span_labels, span_parent_indices
+         )
+
+         return fields
+
+     @staticmethod
+     def _sanity_check(
+             flatten_spans, words, bio_tag_list, parent_mask, span_boundary, span_labels, parent_indices, verbose=False
+     ):
+         # For debugging use.
+         assert len(parent_mask) == len(span_boundary) == len(span_labels) == len(parent_indices)
+         for (parent_idx, parent_span), bio_tags in zip(
+                 filter(lambda x: x[1].is_parent, enumerate(flatten_spans)), bio_tag_list
+         ):
+             assert parent_mask[parent_idx]
+             parent_s, parent_e = span_boundary[parent_idx]
+             if verbose:
+                 print('Parent:', span_labels[parent_idx], 'Text:', ' '.join(words[parent_s:parent_e + 1]))
+                 print(f'It contains {len(parent_span)} children.')
+             for child in parent_span:
+                 child_idx = flatten_spans.index(child)
+                 assert parent_indices[child_idx] == flatten_spans.index(parent_span)
+                 if verbose:
+                     child_s, child_e = span_boundary[child_idx]
+                     print('  ', span_labels[child_idx], 'Text:', words[child_s:child_e + 1])
+
+             if verbose:
+                 print('Children derived from BIO tags:')
+                 for _, (start, end) in bio_tags_to_spans(bio_tags):
+                     print(words[start:end + 1])
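For intuition, the loop in prepare_inputs encodes each parent's children as one B/I/O sequence over the word pieces; a minimal standalone sketch of that encoding (tokens and child spans invented for illustration):

    pieces = ['Bob', 'bought', 'a', 'red', 'car']
    children = [(0, 0), (2, 4)]  # (start_idx, end_idx), both inclusive

    bio_tags = ['O'] * len(pieces)
    for start, end in children:
        bio_tags[start] = 'B'
        for i in range(start + 1, end + 1):
            bio_tags[i] = 'I'

    print(bio_tags)  # ['B', 'O', 'B', 'I', 'I']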
sftp/data_reader/srl_reader.py
ADDED
@@ -0,0 +1,107 @@
+ import json
+ import logging
+ import random
+ from typing import *
+
+ import numpy as np
+ from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+ from allennlp.data.fields import MetadataField
+ from allennlp.data.instance import Instance
+
+ from .span_reader import SpanReader
+ from ..utils import Span, VIRTUAL_ROOT, BIOSmoothing
+
+ logger = logging.getLogger(__name__)
+
+
+ @DatasetReader.register('semantic_role_labeling')
+ class SRLDatasetReader(SpanReader):
+     def __init__(
+             self,
+             min_negative: int = 5,
+             negative_ratio: float = 1.,
+             event_only: bool = False,
+             event_smoothing_factor: float = 0.,
+             arg_smoothing_factor: float = 0.,
+             # For ontology mapping
+             ontology_mapping_path: Optional[str] = None,
+             min_weight: float = 1e-2,
+             max_weight: float = 1.0,
+             **extra
+     ):
+         super().__init__(**extra)
+         self.min_negative = min_negative
+         self.negative_ratio = negative_ratio
+         self.event_only = event_only
+         self.event_smooth_factor = event_smoothing_factor
+         self.arg_smooth_factor = arg_smoothing_factor
+         self.ontology_mapping = None
+         if ontology_mapping_path is not None:
+             self.ontology_mapping = json.load(open(ontology_mapping_path))
+             for k1 in ['event', 'argument']:
+                 for k2, weights in self.ontology_mapping['mapping'][k1].items():
+                     weights = np.array(weights)
+                     weights[weights < min_weight] = 0.0
+                     weights[weights > max_weight] = max_weight
+                     self.ontology_mapping['mapping'][k1][k2] = weights
+                 self.ontology_mapping['mapping'][k1] = {
+                     k2: weights for k2, weights in self.ontology_mapping['mapping'][k1].items() if weights.sum() > 1e-5
+                 }
+             vr_label = [0.] * len(self.ontology_mapping['target']['label'])
+             vr_label[self.ontology_mapping['target']['label'].index(VIRTUAL_ROOT)] = 1.0
+             self.ontology_mapping['mapping']['event'][VIRTUAL_ROOT] = np.array(vr_label)
+
+     def _read(self, file_path: str) -> Iterable[Instance]:
+         all_lines = list(map(json.loads, open(file_path).readlines()))
+         if self.debug:
+             random.seed(1); random.shuffle(all_lines)
+         for line in all_lines:
+             ins = self.text_to_instance(**line)
+             if ins is not None:
+                 yield ins
+         if self.n_span_removed > 0:
+             logger.warning(f'{self.n_span_removed} spans were removed.')
+         self.n_span_removed = 0
+
+     def apply_ontology_mapping(self, vr):
+         new_events = list()
+         event_map, arg_map = self.ontology_mapping['mapping']['event'], self.ontology_mapping['mapping']['argument']
+         for event in vr:
+             if event.label not in event_map: continue
+             event.child_smooth.weight = event.smooth_weight = event_map[event.label].sum()
+             event = event.map_ontology(event_map, False, False)
+             new_events.append(event)
+             new_children = list()
+             for child in event:
+                 if child.label not in arg_map: continue
+                 child.child_smooth.weight = child.smooth_weight = arg_map[child.label].sum()
+                 child = child.map_ontology(arg_map, False, False)
+                 new_children.append(child)
+             event.remove_child()
+             for child in new_children: event.add_child(child)
+         new_vr = Span.virtual_root(new_events)
+         # For the virtual root itself.
+         new_vr.map_ontology(self.ontology_mapping['mapping']['event'], True, False)
+         return new_vr
+
+     def text_to_instance(self, tokens, annotations=None, meta=None) -> Optional[Instance]:
+         meta = meta or {'fully_annotated': True}
+         meta['fully_annotated'] = meta.get('fully_annotated', True)
+         vr = None
+         if annotations is not None:
+             vr = annotations if isinstance(annotations, Span) else Span.from_json(annotations)
+             vr = self.apply_ontology_mapping(vr) if self.ontology_mapping is not None else vr
+             # if len(vr) == 0: return  # Ignore sentences with empty annotation
+             if self.event_smooth_factor != 0.0:
+                 vr.child_smooth = BIOSmoothing(o_smooth=self.event_smooth_factor if meta['fully_annotated'] else -1)
+             if self.arg_smooth_factor != 0.0:
+                 for event in vr:
+                     event.child_smooth = BIOSmoothing(o_smooth=self.arg_smooth_factor)
+             if self.event_only:
+                 for event in vr:
+                     event.remove_child()
+                     event.is_parent = False
+
+         fields = self.prepare_inputs(tokens, vr, True, 'string' if self.ontology_mapping is None else 'list')
+         fields['meta'] = MetadataField(meta)
+         return Instance(fields)
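_read consumes one JSON object per line and unpacks it into text_to_instance; a hypothetical record is sketched below (the exact "annotations" schema is defined by Span.from_json, so the event/argument dicts here are only an assumption mirroring concrete_doc's documented format):

    {"tokens": ["Bob", "bought", "a", "car"],
     "annotations": [{"span": [1, 1], "label": "Commerce_buy",
                      "children": [{"span": [0, 0], "label": "Buyer"},
                                   {"span": [2, 3], "label": "Goods"}]}],
     "meta": {"fully_annotated": true}}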
sftp/metrics/__init__.py
ADDED
@@ -0,0 +1,4 @@
+ from sftp.metrics.base_f import BaseF
+ from sftp.metrics.exact_match import ExactMatch
+ from sftp.metrics.fbeta_mix_measure import FBetaMixMeasure
+ from sftp.metrics.srl_metrics import SRLMetric
sftp/metrics/base_f.py
ADDED
@@ -0,0 +1,27 @@
+ from abc import ABC
+ from typing import *
+
+ from allennlp.training.metrics import Metric
+
+
+ class BaseF(Metric, ABC):
+     def __init__(self, prefix: str):
+         self.tp = self.fp = self.fn = 0
+         self.prefix = prefix
+
+     def reset(self) -> None:
+         self.tp = self.fp = self.fn = 0
+
+     def get_metric(
+             self, reset: bool
+     ) -> Union[float, Tuple[float, ...], Dict[str, float], Dict[str, List[float]]]:
+         precision = self.tp * 100 / (self.tp + self.fp) if self.tp > 0 else 0.
+         recall = self.tp * 100 / (self.tp + self.fn) if self.tp > 0 else 0.
+         rst = {
+             f'{self.prefix}_p': precision,
+             f'{self.prefix}_r': recall,
+             f'{self.prefix}_f': 2 / (1 / precision + 1 / recall) if self.tp > 0 else 0.
+         }
+         if reset:
+             self.reset()
+         return rst
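A worked example of the arithmetic above, with tp=8, fp=2, fn=4 (scores are on a 0-100 scale, and the F score is the harmonic mean of precision and recall):

    m = BaseF('em')
    m.tp, m.fp, m.fn = 8, 2, 4
    print(m.get_metric(reset=True))
    # {'em_p': 80.0, 'em_r': 66.67, 'em_f': 72.73}  (values rounded)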
sftp/metrics/exact_match.py
ADDED
@@ -0,0 +1,29 @@
+ from allennlp.training.metrics import Metric
+ from overrides import overrides
+
+ from .base_f import BaseF
+ from ..utils import Span
+
+
+ @Metric.register('exact_match')
+ class ExactMatch(BaseF):
+     def __init__(self, check_type: bool):
+         self.check_type = check_type
+         if check_type:
+             super(ExactMatch, self).__init__('em')
+         else:
+             super(ExactMatch, self).__init__('sm')
+
+     @overrides
+     def __call__(
+             self,
+             prediction: Span,
+             gold: Span,
+     ):
+         # match() counts matched tree nodes including the virtual root, so subtract 1;
+         # n_nodes also includes the virtual root, hence the extra -1 below.
+         tp = prediction.match(gold, self.check_type) - 1
+         fp = prediction.n_nodes - tp - 1
+         fn = gold.n_nodes - tp - 1
+         assert tp >= 0 and fp >= 0 and fn >= 0
+         self.tp += tp
+         self.fp += fp
+         self.fn += fn
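A usage sketch on two tiny Span trees, using the Span API as it appears elsewhere in this commit (Span(start, end, label, is_parent) and Span.virtual_root); it assumes match() counts the always-matching virtual root, which is why the metric subtracts 1:

    from sftp.utils import Span

    gold = Span.virtual_root([Span(1, 1, 'Commerce_buy', True)])
    pred = Span.virtual_root([Span(1, 1, 'Commerce_buy', True)])
    em = ExactMatch(check_type=True)
    em(pred, gold)
    print(em.get_metric(reset=True))  # {'em_p': 100.0, 'em_r': 100.0, 'em_f': 100.0}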
sftp/metrics/fbeta_mix_measure.py
ADDED
@@ -0,0 +1,34 @@
+ from allennlp.training.metrics import FBetaMeasure, Metric
+
+
+ @Metric.register('fbeta_mix')
+ class FBetaMixMeasure(FBetaMeasure):
+     def __init__(self, null_idx, **kwargs):
+         super().__init__(**kwargs)
+         self.null_idx = null_idx
+
+     def get_metric(self, reset: bool = False):
+
+         tp = float(self._true_positive_sum.sum() - self._true_positive_sum[self.null_idx])
+         total_pred = float(self._pred_sum.sum() - self._pred_sum[self.null_idx])
+         total_gold = float(self._true_sum.sum() - self._true_sum[self.null_idx])
+
+         beta2 = self._beta ** 2
+         p = 0. if total_pred == 0 else tp / total_pred
+         r = 0. if total_gold == 0 else tp / total_gold
+         f = 0. if p == 0. or r == 0. else ((1 + beta2) * p * r / (p * beta2 + r))
+
+         mix_f = {
+             'p': p * 100,
+             'r': r * 100,
+             'f': f * 100
+         }
+
+         if reset:
+             self.reset()
+
+         return mix_f
+
+     def add_false_negative(self, labels):
+         for lab in labels:
+             self._true_sum[lab] += 1
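A quick numeric check of the F-beta formula above: with beta=1 it reduces to the harmonic mean of precision and recall.

    p, r, beta2 = 0.8, 2 / 3, 1.0
    f = (1 + beta2) * p * r / (p * beta2 + r)
    print(f)  # 0.7272..., identical to 2*p*r/(p + r)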
sftp/metrics/srl_metrics.py
ADDED
@@ -0,0 +1,138 @@
+ from typing import *
+
+ from allennlp.training.metrics import Metric
+ from overrides import overrides
+ import numpy as np
+ import logging
+
+ from .base_f import BaseF
+ from ..utils import Span, max_match
+
+ logger = logging.getLogger('srl_metric')
+
+
+ @Metric.register('srl')
+ class SRLMetric(Metric):
+     def __init__(self, check_type: Optional[bool] = None):
+         self.tri_i = BaseF('tri-i')
+         self.tri_c = BaseF('tri-c')
+         self.arg_i = BaseF('arg-i')
+         self.arg_c = BaseF('arg-c')
+         if check_type is not None:
+             logger.warning('The check_type argument is deprecated.')
+
+     def reset(self) -> None:
+         for metric in [self.tri_i, self.tri_c, self.arg_i, self.arg_c]:
+             metric.reset()
+
+     def get_metric(self, reset: bool) -> Dict[str, Any]:
+         ret = dict()
+         for metric in [self.tri_i, self.tri_c, self.arg_i, self.arg_c]:
+             ret.update(metric.get_metric(reset))
+         return ret
+
+     @overrides
+     def __call__(self, prediction: Span, gold: Span):
+         self.with_label_event(prediction, gold)
+         self.without_label_event(prediction, gold)
+         self.tuple_eval(prediction, gold)
+         # self.with_label_arg(prediction, gold)
+         # self.without_label_arg(prediction, gold)
+
+     def tuple_eval(self, prediction: Span, gold: Span):
+         def extract_tuples(vr: Span, parent_boundary: bool):
+             labeled, unlabeled = list(), list()
+             for event in vr:
+                 for arg in event:
+                     if parent_boundary:
+                         labeled.append((event.boundary, event.label, arg.boundary, arg.label))
+                         unlabeled.append((event.boundary, event.label, arg.boundary))
+                     else:
+                         labeled.append((event.label, arg.boundary, arg.label))
+                         unlabeled.append((event.label, arg.boundary))
+             return labeled, unlabeled
+
+         def equal_matrix(l1, l2): return np.array([[e1 == e2 for e2 in l2] for e1 in l1], dtype=np.int)
+
+         pred_label, pred_unlabel = extract_tuples(prediction, False)
+         gold_label, gold_unlabel = extract_tuples(gold, False)
+
+         if len(pred_label) == 0 or len(gold_label) == 0:
+             arg_c_tp = arg_i_tp = 0
+         else:
+             label_bipartite = equal_matrix(pred_label, gold_label)
+             unlabel_bipartite = equal_matrix(pred_unlabel, gold_unlabel)
+             arg_c_tp, arg_i_tp = max_match(label_bipartite), max_match(unlabel_bipartite)
+
+         arg_c_fp = prediction.n_nodes - len(prediction) - 1 - arg_c_tp
+         arg_c_fn = gold.n_nodes - len(gold) - 1 - arg_c_tp
+         arg_i_fp = prediction.n_nodes - len(prediction) - 1 - arg_i_tp
+         arg_i_fn = gold.n_nodes - len(gold) - 1 - arg_i_tp
+
+         assert arg_i_tp >= 0 and arg_i_fn >= 0 and arg_i_fp >= 0
+         self.arg_i.tp += arg_i_tp
+         self.arg_i.fp += arg_i_fp
+         self.arg_i.fn += arg_i_fn
+
+         assert arg_c_tp >= 0 and arg_c_fn >= 0 and arg_c_fp >= 0
+         self.arg_c.tp += arg_c_tp
+         self.arg_c.fp += arg_c_fp
+         self.arg_c.fn += arg_c_fn
+
+     def with_label_event(self, prediction: Span, gold: Span):
+         trigger_tp = prediction.match(gold, True, 2) - 1
+         trigger_fp = len(prediction) - trigger_tp
+         trigger_fn = len(gold) - trigger_tp
+         assert trigger_fp >= 0 and trigger_fn >= 0 and trigger_tp >= 0
+         self.tri_c.tp += trigger_tp
+         self.tri_c.fp += trigger_fp
+         self.tri_c.fn += trigger_fn
+
+     def with_label_arg(self, prediction: Span, gold: Span):
+         trigger_tp = prediction.match(gold, True, 2) - 1
+         role_tp = prediction.match(gold, True, ignore_parent_boundary=True) - 1 - trigger_tp
+         role_fp = (prediction.n_nodes - 1 - len(prediction)) - role_tp
+         role_fn = (gold.n_nodes - 1 - len(gold)) - role_tp
+         assert role_fp >= 0 and role_fn >= 0 and role_tp >= 0
+         self.arg_c.tp += role_tp
+         self.arg_c.fp += role_fp
+         self.arg_c.fn += role_fn
+
+     def without_label_event(self, prediction: Span, gold: Span):
+         tri_i_tp = prediction.match(gold, False, 2) - 1
+         tri_i_fp = len(prediction) - tri_i_tp
+         tri_i_fn = len(gold) - tri_i_tp
+         assert tri_i_tp >= 0 and tri_i_fp >= 0 and tri_i_fn >= 0
+         self.tri_i.tp += tri_i_tp
+         self.tri_i.fp += tri_i_fp
+         self.tri_i.fn += tri_i_fn
+
+     def without_label_arg(self, prediction: Span, gold: Span):
+         arg_i_tp = 0
+         matched_pairs: List[Tuple[Span, Span]] = list()
+         n_gold_arg, n_pred_arg = gold.n_nodes - len(gold) - 1, prediction.n_nodes - len(prediction) - 1
+         prediction, gold = prediction.clone(), gold.clone()
+         for p in prediction:
+             for g in gold:
+                 if p.match(g, True, 1) == 1:
+                     arg_i_tp += (p.match(g, False) - 1)
+                     matched_pairs.append((p, g))
+                     break
+         for p, g in matched_pairs:
+             prediction.remove_child(p)
+             gold.remove_child(g)
+
+         sub_matches = np.zeros([len(prediction), len(gold)], np.int)
+         for p_idx, p in enumerate(prediction):
+             for g_idx, g in enumerate(gold):
+                 if p.label == g.label:
+                     sub_matches[p_idx, g_idx] = p.match(g, False, -1, True)
+         arg_i_tp += max_match(sub_matches)
+
+         arg_i_fp = n_pred_arg - arg_i_tp
+         arg_i_fn = n_gold_arg - arg_i_tp
+         assert arg_i_tp >= 0 and arg_i_fn >= 0 and arg_i_fp >= 0
+
+         self.arg_i.tp += arg_i_tp
+         self.arg_i.fp += arg_i_fp
+         self.arg_i.fn += arg_i_fn
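To see what tuple_eval compares, here is a toy version of its bipartite setup; the tuples are invented, and max_match (from sftp.utils) is assumed to return the size of a maximum bipartite matching over the 0/1 matrix:

    import numpy as np

    pred = [('Commerce_buy', (0, 0), 'Buyer'), ('Commerce_buy', (2, 3), 'Goods')]
    gold = [('Commerce_buy', (0, 0), 'Buyer'), ('Commerce_buy', (2, 4), 'Goods')]
    match = np.array([[p == g for g in gold] for p in pred], dtype=int)
    print(match)  # [[1 0]
                  #  [0 0]] -> maximum matching size 1, i.e. one labeled true positive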
sftp/models/__init__.py
ADDED
@@ -0,0 +1 @@
+ from sftp.models.span_model import SpanModel
sftp/models/span_model.py
ADDED
@@ -0,0 +1,362 @@
+ import os
+ from typing import *
+
+ import torch
+ from allennlp.common.from_params import Params, T, pop_and_construct_arg
+ from allennlp.data.vocabulary import Vocabulary, DEFAULT_PADDING_TOKEN, DEFAULT_OOV_TOKEN
+ from allennlp.models.model import Model
+ from allennlp.modules import TextFieldEmbedder
+ from allennlp.modules.seq2seq_encoders.pytorch_seq2seq_wrapper import Seq2SeqEncoder
+ from allennlp.modules.span_extractors import SpanExtractor
+ from allennlp.training.metrics import Metric
+
+ from ..metrics import ExactMatch
+ from ..modules import SpanFinder, SpanTyping
+ from ..utils import num2mask, VIRTUAL_ROOT, Span, tensor2span
+
+
+ @Model.register("span")
+ class SpanModel(Model):
+     """
+     Identify/find spans; link them as a tree; label them.
+     """
+     default_predictor = 'span'
+
+     def __init__(
+             self,
+             vocab: Vocabulary,
+
+             # Modules
+             word_embedding: TextFieldEmbedder,
+             span_extractor: SpanExtractor,
+             span_finder: SpanFinder,
+             span_typing: SpanTyping,
+
+             # Config
+             typing_loss_factor: float = 1.,
+             max_recursion_depth: int = -1,
+             max_decoding_spans: int = -1,
+             debug: bool = False,
+
+             # Ontology Constraints
+             ontology_path: Optional[str] = None,
+
+             # Metrics
+             metrics: Optional[List[Metric]] = None,
+     ) -> None:
+         """
+         Note for the jsonnet file: it doesn't strictly follow the init signature of every module, because we
+         override the from_params method.
+         You can check either SpanModel.from_params or the example jsonnet file.
+         :param vocab: No need to specify.
+         ## Modules
+         :param word_embedding: Refer to the module doc.
+         :param span_extractor: Refer to the module doc.
+         :param span_finder: Refer to the module doc.
+         :param span_typing: Refer to the module doc.
+         ## Configs
+         :param typing_loss_factor: loss = span_finder_loss + span_typing_loss * typing_loss_factor
+         :param max_recursion_depth: Maximum tree depth for inference. E.g., 1 for shallow event typing, 2 for SRL,
+             -1 (unlimited) for dependency parsing.
+         :param max_decoding_spans: Maximum number of spans for inference. -1 for unlimited.
+         :param debug: Unused for now.
+         """
+         self._pad_idx = vocab.get_token_index(DEFAULT_PADDING_TOKEN, 'token')
+         self._null_idx = vocab.get_token_index(DEFAULT_OOV_TOKEN, 'span_label')
+         super().__init__(vocab)
+
+         self.word_embedding = word_embedding
+         self._span_finder = span_finder
+         self._span_extractor = span_extractor
+         self._span_typing = span_typing
+
+         self.metrics = [ExactMatch(True), ExactMatch(False)]
+         if metrics is not None:
+             self.metrics.extend(metrics)
+
+         if ontology_path is not None and os.path.exists(ontology_path):
+             self._span_typing.load_ontology(ontology_path, self.vocab)
+
+         self._max_decoding_spans = max_decoding_spans
+         self._typing_loss_factor = typing_loss_factor
+         self._max_recursion_depth = max_recursion_depth
+         self.debug = debug
+
+     def forward(
+             self,
+             tokens: Dict[str, Dict[str, torch.Tensor]],
+
+             span_boundary: Optional[torch.Tensor] = None,
+             span_labels: Optional[torch.Tensor] = None,
+             parent_indices: Optional[torch.Tensor] = None,
+             parent_mask: Optional[torch.Tensor] = None,
+
+             bio_seqs: Optional[torch.Tensor] = None,
+             raw_inputs: Optional[dict] = None,
+             meta: Optional[dict] = None,
+
+             **extra
+     ) -> Dict[str, torch.Tensor]:
+         """
+         For training, provide everything below.
+         For inference, it's enough to provide only the words.
+
+         :param tokens: Indexed input sentence. Shape: [batch, token]
+
+         :param span_boundary: Start and end indices for every span. Note this includes both parent and
+             non-parent spans. Shape: [batch, span, 2]. For the last dim, [0] is the start idx and [1] the end idx.
+         :param span_labels: Indexed labels for spans, including parent and non-parent ones. Shape: [batch, span]
+         :param parent_indices: The parent span idx of every span. Shape: [batch, span]
+         :param parent_mask: True if this span is a parent. Shape: [batch, span]
+
+         :param bio_seqs: Shape [batch, parent, token, 3]
+         :param raw_inputs: Raw input sentences and spans.
+
+         :param meta: Meta information. Will be copied to the outputs.
+
+         :return:
+             - loss: training loss
+             - prediction: predicted spans
+             - meta: meta info copied from the input
+             - inputs: input sentences and spans (if they exist)
+         """
+         ret = {'inputs': raw_inputs, 'meta': meta or dict()}
+
+         is_eval = span_labels is not None and not self.training  # evaluation on dev set
+         is_test = span_labels is None  # test on test set
+         # Shape [batch]
+         num_spans = (span_labels != -1).sum(1) if span_labels is not None else None
+         num_words = tokens['pieces']['mask'].sum(1)
+         # Shape [batch, word, token_dim]
+         token_vec = self.word_embedding(tokens)
+
+         if span_labels is not None:
+             # Revise the padding value from -1 to 0
+             span_labels[span_labels == -1] = 0
+
+         # Calculate loss
+         if self.training or is_eval:
+             # Shape [batch, word, token_dim]
+             span_vec = self._span_extractor(token_vec, span_boundary)
+             finder_rst = self._span_finder(
+                 token_vec, num2mask(num_words), span_vec, num2mask(num_spans), span_labels, parent_indices,
+                 parent_mask, bio_seqs
+             )
+             typing_rst = self._span_typing(span_vec, parent_indices, span_labels)
+             ret['loss'] = finder_rst['loss'] + typing_rst['loss'] * self._typing_loss_factor
+
+         # Decoding
+         if is_eval or is_test:
+             pred_span_boundary, pred_span_labels, pred_parent_indices, pred_cursor, pred_label_confidence \
+                 = self.inference(num_words, token_vec, **extra)
+             prediction = self.post_process_pred(
+                 pred_span_boundary, pred_span_labels, pred_parent_indices, pred_cursor, pred_label_confidence
+             )
+             for pred, raw_in in zip(prediction, raw_inputs):
+                 pred.re_index(raw_in['offsets'], True, True, True)
+                 pred.remove_overlapping()
+             ret['prediction'] = prediction
+             if 'spans' in raw_inputs[0]:
+                 for pred, raw_in in zip(prediction, raw_inputs):
+                     gold = raw_in['spans']
+                     for metric in self.metrics:
+                         metric(pred, gold)
+
+         return ret
+
+     def inference(
+             self,
+             num_words: torch.Tensor,
+             token_vec: torch.Tensor,
+             **auxiliaries
+     ):
+         n_batch = num_words.shape[0]
+         # The decoding results are preserved in the following tensors starting with `pred`.
+         # During inference, we completely ignore the arguments defaulted to None in the forward method.
+         # The span indexing space is shifted to the decoding span space (since we do not have gold spans now).
+         # Boundary indices of every predicted span
+         pred_span_boundary = num_words.new_zeros([n_batch, self._max_decoding_spans, 2])
+         # Labels (and corresponding confidence) for predicted spans
+         pred_span_labels = num_words.new_full(
+             [n_batch, self._max_decoding_spans], self.vocab.get_token_index(VIRTUAL_ROOT, 'span_label')
+         )
+         pred_label_confidence = num_words.new_zeros([n_batch, self._max_decoding_spans])
+         # Spans masked as True will be treated as parents in the next round
+         pred_parent_mask = num_words.new_zeros([n_batch, self._max_decoding_spans], dtype=torch.bool)
+         pred_parent_mask[:, 0] = True
+         # Parent index (in the span indexing space) for every span
+         pred_parent_indices = num_words.new_zeros([n_batch, self._max_decoding_spans])
+         # What index have we reached for every batch?
+         pred_cursor = num_words.new_ones([n_batch])
+
+         # Pass environment variables to the handler. Extra variables will be ignored,
+         # so we pass the union of variables that are needed by different modules.
+         span_find_handler = self._span_finder.inference_forward_handler(
+             token_vec, num2mask(num_words), self._span_extractor, **auxiliaries
+         )
+
+         # Every step here is one layer of the tree. It deals with all the parents from the last layer,
+         # so there might be 0 to multiple parents in a batch for a single step.
+         for _ in range(self._max_recursion_depth):
+             cursor_before_find = pred_cursor.clone()
+             span_find_handler(
+                 pred_span_boundary, pred_span_labels, pred_parent_mask, pred_parent_indices, pred_cursor
+             )
+             # Labels of old spans are re-predicted. It doesn't matter, since their results shouldn't change
+             # in theory.
+             span_typing_ret = self._span_typing(
+                 self._span_extractor(token_vec, pred_span_boundary), pred_parent_indices, pred_span_labels, True
+             )
+             pred_span_labels = span_typing_ret['prediction']
+             pred_label_confidence = span_typing_ret['label_confidence']
+             pred_span_labels[:, 0] = self.vocab.get_token_index(VIRTUAL_ROOT, 'span_label')
+             pred_parent_mask = (
+                 num2mask(cursor_before_find, self._max_decoding_spans)
+                 ^ num2mask(pred_cursor, self._max_decoding_spans)
+             )
+
+             # Break the inference loop if 1) all batches reach the max span limit, OR 2) no parent was predicted
+             # in the last step, OR 3) the max recursion limit is reached (the for-loop condition).
+             if (pred_cursor == self._max_decoding_spans).all() or pred_parent_mask.sum() == 0:
+                 break
+
+         return pred_span_boundary, pred_span_labels, pred_parent_indices, pred_cursor, pred_label_confidence
+
+     def one_step_prediction(
+             self,
+             tokens: Dict[str, Dict[str, torch.Tensor]],
+             parent_boundary: torch.Tensor,
+             parent_labels: torch.Tensor,
+     ):
+         """
+         Single-step prediction. Given parent span boundary indices, return the corresponding children spans
+         and their labels.
+         Restriction: each sentence contains exactly 1 parent.
+         For efficient multi-layer prediction, i.e. given a root, predicting the whole tree,
+         refer to the `forward` method.
+         :param tokens: See forward.
+         :param parent_boundary: Pairs of (start_idx, end_idx) for parents. Shape [batch, 2]
+         :param parent_labels: Labels for parents. Shape [batch]
+             Note: if `no_label` is on in the span_finder module, this will be ignored.
+         :return:
+             children_boundary: (start_idx, end_idx) for every child span. Padded with (0, 0).
+                 Shape [batch, children, 2]
+             children_labels: Label for every child span. Padded with null_idx. Shape [batch, children]
+             num_children: The number of children predicted for each parent/batch. Shape [batch]
+                 Tip: you can use the num2mask method to convert this to a bool tensor mask.
+         """
+         num_words = tokens['pieces']['mask'].sum(1)
+         # Shape [batch, word, token_dim]
+         token_vec = self.word_embedding(tokens)
+         n_batch = token_vec.shape[0]
+
+         # The following variables assume the parent is the 0-th span, and we let the model
+         # extend the span list.
+         pred_span_boundary = num_words.new_zeros([n_batch, self._max_decoding_spans, 2])
+         pred_span_boundary[:, 0] = parent_boundary
+         pred_span_labels = num_words.new_full([n_batch, self._max_decoding_spans], self._null_idx)
+         pred_span_labels[:, 0] = parent_labels
+         pred_parent_mask = num_words.new_zeros(pred_span_labels.shape, dtype=torch.bool)
+         pred_parent_mask[:, 0] = True
+         pred_parent_indices = num_words.new_zeros([n_batch, self._max_decoding_spans])
+         # We start from idx 1 since 0 is the parent.
+         pred_cursor = num_words.new_ones([n_batch])
+
+         span_find_handler = self._span_finder.inference_forward_handler(
+             token_vec, num2mask(num_words), self._span_extractor
+         )
+         span_find_handler(
+             pred_span_boundary, pred_span_labels, pred_parent_mask, pred_parent_indices, pred_cursor
+         )
+         typing_out = self._span_typing(
+             self._span_extractor(token_vec, pred_span_boundary), pred_parent_indices, pred_span_labels, True
+         )
+         pred_span_labels = typing_out['prediction']
+
+         # Now remove the parent
+         num_children = pred_cursor - 1
+         max_children = int(num_children.max())
+         children_boundary = pred_span_boundary[:, 1:max_children + 1]
+         children_labels = pred_span_labels[:, 1:max_children + 1]
+         children_distribution = typing_out['distribution'][:, 1:max_children + 1]
+         return children_boundary, children_labels, num_children, children_distribution
+
+     def post_process_pred(
+             self, span_boundary, span_labels, parent_indices, num_spans, label_confidence
+     ) -> List[Span]:
+         pred_spans = tensor2span(
+             span_boundary, span_labels, parent_indices, num_spans, label_confidence,
+             self.vocab.get_index_to_token_vocabulary('span_label'),
+             label_ignore=[self._null_idx],
+         )
+         return pred_spans
+
+     def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+         ret = dict()
+         if reset:
+             for metric in self.metrics:
+                 ret.update(metric.get_metric(reset))
+         ret.update(self._span_finder.get_metrics(reset))
+         ret.update(self._span_typing.get_metric(reset))
+         return ret
+
+     @classmethod
+     def from_params(
+             cls: Type[T],
+             params: Params,
+             constructor_to_call: Callable[..., T] = None,
+             constructor_to_inspect: Callable[..., T] = None,
+             **extras,
+     ) -> T:
+         """
+         Specify the dependencies between modules. E.g. the input dim of a module might depend on the output dim
+         of another module.
+         """
+         vocab = extras['vocab']
+         word_embedding = pop_and_construct_arg('SpanModel', 'word_embedding', TextFieldEmbedder, None, params, **extras)
+         label_dim, token_emb_dim = params.pop('label_dim'), word_embedding.get_output_dim()
+         span_extractor = pop_and_construct_arg(
+             'SpanModel', 'span_extractor', SpanExtractor, None, params, input_dim=token_emb_dim, **extras
+         )
+         label_embedding = torch.nn.Embedding(vocab.get_vocab_size('span_label'), label_dim)
+         extras['label_emb'] = label_embedding
+
+         if params.get('span_finder').get('type') == 'bio':
+             bio_encoder = Seq2SeqEncoder.from_params(
+                 params['span_finder'].pop('bio_encoder'),
+                 input_size=span_extractor.get_output_dim() + token_emb_dim + label_dim,
+                 input_dim=span_extractor.get_output_dim() + token_emb_dim + label_dim,
+                 **extras
+             )
+             extras['span_finder'] = SpanFinder.from_params(
+                 params.pop('span_finder'), bio_encoder=bio_encoder, **extras
+             )
+         else:
+             extras['span_finder'] = pop_and_construct_arg(
+                 'SpanModel', 'span_finder', SpanFinder, None, params, **extras
+             )
+         extras['span_finder'].label_emb = label_embedding
+
+         if params.get('span_typing').get('type') == 'mlp':
+             extras['span_typing'] = SpanTyping.from_params(
+                 params.pop('span_typing'),
+                 input_dim=span_extractor.get_output_dim() * 2 + label_dim,
+                 n_category=vocab.get_vocab_size('span_label'),
+                 label_to_ignore=[
+                     vocab.get_token_index(lti, 'span_label')
+                     for lti in [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
+                 ],
+                 **extras
+             )
+         else:
+             extras['span_typing'] = pop_and_construct_arg(
+                 'SpanModel', 'span_typing', SpanTyping, None, params, **extras
+             )
+         extras['span_typing'].label_emb = label_embedding
+
+         return super().from_params(
+             params,
+             word_embedding=word_embedding,
+             span_extractor=span_extractor,
+             **extras
+         )
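num2mask, used throughout forward and inference, turns per-batch lengths into boolean masks; a minimal equivalent sketch (the real helper lives in sftp/utils and may differ in details):

    import torch

    def num2mask_sketch(nums: torch.Tensor, max_len: int = None) -> torch.Tensor:
        # nums: [batch] lengths -> [batch, max_len] bool mask.
        max_len = max_len or int(nums.max())
        return torch.arange(max_len).unsqueeze(0) < nums.unsqueeze(1)

    print(num2mask_sketch(torch.tensor([2, 4]), 5))
    # tensor([[ True,  True, False, False, False],
    #         [ True,  True,  True,  True, False]])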
sftp/modules/__init__.py
ADDED
@@ -0,0 +1,4 @@
+ from .span_extractor import ComboSpanExtractor
+ from .span_finder import SpanFinder, BIOSpanFinder
+ from .span_typing import MLPSpanTyping, SpanTyping
+ from .smooth_crf import SmoothCRF
sftp/modules/smooth_crf.py
ADDED
@@ -0,0 +1,77 @@
+ import torch
+ from allennlp.modules.conditional_random_field import ConditionalRandomField
+ from allennlp.nn.util import logsumexp
+ from overrides import overrides
+
+
+ class SmoothCRF(ConditionalRandomField):
+     @overrides
+     def forward(self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.Tensor = None):
+         """
+         :param inputs: Shape [batch, token, tag]
+         :param tags: Shape [batch, token] (hard tags) or [batch, token, tag] (soft tag distributions)
+         :param mask: Shape [batch, token]
+         :return: The log-likelihood of the (possibly soft) tag sequences.
+         """
+         if mask is None:
+             mask = tags.new_ones(tags.shape, dtype=torch.bool)
+         mask = mask.to(dtype=torch.bool)
+         if tags.dim() == 2:
+             return super(SmoothCRF, self).forward(inputs, tags, mask)
+
+         # Smooth mode: `tags` is a per-token distribution over tags.
+         log_denominator = self._input_likelihood(inputs, mask)
+         log_numerator = self._smooth_joint_likelihood(inputs, tags, mask)
+
+         return torch.sum(log_numerator - log_denominator)
+
+     def _smooth_joint_likelihood(
+             self, logits: torch.Tensor, soft_tags: torch.Tensor, mask: torch.Tensor
+     ) -> torch.Tensor:
+         batch_size, sequence_length, num_tags = logits.size()
+
+         epsilon = 1e-30
+         soft_tags = soft_tags.clone()
+         soft_tags[soft_tags < epsilon] = epsilon
+
+         # Transpose batch size and sequence dimensions
+         mask = mask.transpose(0, 1).contiguous()
+         logits = logits.transpose(0, 1).contiguous()
+         soft_tags = soft_tags.transpose(0, 1).contiguous()
+
+         # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the
+         # transitions to the initial states and the logits for the first timestep.
+         if self.include_start_end_transitions:
+             alpha = self.start_transitions.view(1, num_tags) + logits[0] + soft_tags[0].log()
+         else:
+             # Mirror the branch above: weight the first-step logits by the log soft tags.
+             alpha = logits[0] + soft_tags[0].log()
+
+         # For each i we compute logits for the transitions from timestep i-1 to timestep i.
+         # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are
+         # (instance, current_tag, next_tag).
+         for i in range(1, sequence_length):
+             # The emit scores are for time i ("next_tag"), so we broadcast along the current_tag axis.
+             emit_scores = logits[i].view(batch_size, 1, num_tags)
+             # Transition scores are (current_tag, next_tag), so we broadcast along the instance axis.
+             transition_scores = self.transitions.view(1, num_tags, num_tags)
+             # Alpha is for the current_tag, so we broadcast along the next_tag axis.
+             broadcast_alpha = alpha.view(batch_size, num_tags, 1)
+
+             # Add all the scores together and logsumexp over the current_tag axis.
+             inner = broadcast_alpha + emit_scores + transition_scores + soft_tags[i].log().unsqueeze(1)
+
+             # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
+             # of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
+             alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
+                 ~mask[i]
+             ).view(batch_size, 1)
+
+         # Every sequence needs to end with a transition to the stop_tag.
+         if self.include_start_end_transitions:
+             stops = alpha + self.end_transitions.view(1, num_tags)
+         else:
+             stops = alpha
+
+         # Finally we logsumexp along the num_tags dim; the result is (batch_size,).
+         return logsumexp(stops)
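The smooth branch runs the standard CRF forward recursion with each step additionally weighted by the log of the soft tag distribution; a tiny standalone sketch of that recursion (single sequence, no masking or start/end transitions):

    import torch

    def soft_forward_score(logits, soft_tags, transitions):
        # logits, soft_tags: [seq, tag]; transitions: [tag, tag] as (current_tag, next_tag).
        alpha = logits[0] + soft_tags[0].log()
        for i in range(1, logits.shape[0]):
            inner = alpha.unsqueeze(1) + transitions + logits[i].unsqueeze(0) + soft_tags[i].log().unsqueeze(0)
            alpha = torch.logsumexp(inner, dim=0)  # sum out current_tag in log space
        return torch.logsumexp(alpha, dim=0)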
sftp/modules/span_extractor/__init__.py
ADDED
@@ -0,0 +1 @@
+ from .combo import ComboSpanExtractor
sftp/modules/span_extractor/combo.py
ADDED
@@ -0,0 +1,36 @@
+ from typing import *
+
+ import torch
+ from allennlp.modules.span_extractors import SpanExtractor
+
+
+ @SpanExtractor.register('combo')
+ class ComboSpanExtractor(SpanExtractor):
+     def __init__(self, input_dim: int, sub_extractors: List[SpanExtractor]):
+         super().__init__()
+         self.sub_extractors = sub_extractors
+         for i, sub in enumerate(sub_extractors):
+             self.add_module(f'SpanExtractor-{i+1}', sub)
+         self.input_dim = input_dim
+
+     def get_input_dim(self) -> int:
+         return self.input_dim
+
+     def get_output_dim(self) -> int:
+         return sum([sub.get_output_dim() for sub in self.sub_extractors])
+
+     def forward(
+             self,
+             sequence_tensor: torch.FloatTensor,
+             span_indices: torch.LongTensor,
+             sequence_mask: torch.BoolTensor = None,
+             span_indices_mask: torch.BoolTensor = None,
+     ):
+         outputs = [
+             sub(
+                 sequence_tensor=sequence_tensor,
+                 span_indices=span_indices,
+                 span_indices_mask=span_indices_mask
+             ) for sub in self.sub_extractors
+         ]
+         return torch.cat(outputs, dim=2)
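ComboSpanExtractor simply concatenates its sub-extractors' outputs along the feature dimension; a hypothetical combination using two extractors that ship with AllenNLP (the dimensions are illustrative):

    from allennlp.modules.span_extractors import EndpointSpanExtractor, SelfAttentiveSpanExtractor

    combo = ComboSpanExtractor(input_dim=768, sub_extractors=[
        EndpointSpanExtractor(input_dim=768),       # default "x,y" combination -> 1536 dims
        SelfAttentiveSpanExtractor(input_dim=768),  # attention-pooled content -> 768 dims
    ])
    print(combo.get_output_dim())  # 2304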
sftp/modules/span_finder/__init__.py
ADDED
@@ -0,0 +1,2 @@
+ from .bio_span_finder import BIOSpanFinder
+ from .span_finder import SpanFinder
sftp/modules/span_finder/bio_span_finder.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import *

import torch
from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder
from allennlp.modules.span_extractors import SpanExtractor
from allennlp.training.metrics import FBetaMeasure

from ..smooth_crf import SmoothCRF
from .span_finder import SpanFinder
from ...utils import num2mask, mask2idx, BIO


@SpanFinder.register("bio")
class BIOSpanFinder(SpanFinder):
    """
    Train BIO representations for span finding.
    """
    def __init__(
            self,
            bio_encoder: Seq2SeqEncoder,
            label_emb: torch.nn.Embedding,
            no_label: bool = True,
    ):
        super().__init__(no_label)
        self.bio_encoder = bio_encoder
        self.label_emb = label_emb

        self.classifier = torch.nn.Linear(bio_encoder.get_output_dim(), 3)
        self.crf = SmoothCRF(3)

        self.fb_measure = FBetaMeasure(1., 'micro', [BIO.index('B'), BIO.index('I')])

    def forward(
            self,
            token_vec: torch.Tensor,
            token_mask: torch.Tensor,
            span_vec: torch.Tensor,
            span_mask: Optional[torch.Tensor] = None,  # Do not need to provide
            span_labels: Optional[torch.Tensor] = None,  # Do not need to provide
            parent_indices: Optional[torch.Tensor] = None,  # Do not need to provide
            parent_mask: Optional[torch.Tensor] = None,
            bio_seqs: Optional[torch.Tensor] = None,
            prediction: bool = False,
            **extra
    ) -> Dict[str, torch.Tensor]:
        """
        See the doc of SpanFinder.
        Possible extra variables:
            smoothing_factor
        :return:
            - loss
            - prediction
        """
        ret = dict()
        is_soft = span_labels.dtype != torch.int64

        distinct_parent_indices, num_parents = mask2idx(parent_mask)
        n_batch, n_parent = distinct_parent_indices.shape
        n_token = token_vec.shape[1]
        # Shape [batch, parent, token_dim]
        parent_span_features = span_vec.gather(
            1, distinct_parent_indices.unsqueeze(2).expand(-1, -1, span_vec.shape[2])
        )
        label_features = span_labels @ self.label_emb.weight if is_soft else self.label_emb(span_labels)
        if self._no_label:
            label_features = label_features.zero_()
        # Shape [batch, span, label_dim]
        parent_label_features = label_features.gather(
            1, distinct_parent_indices.unsqueeze(2).expand(-1, -1, label_features.shape[2])
        )
        # Shape [batch, parent, token, token_dim*2 + label_dim]
        encoder_inputs = torch.cat([
            parent_span_features.unsqueeze(2).expand(-1, -1, n_token, -1),
            token_vec.unsqueeze(1).expand(-1, n_parent, -1, -1),
            parent_label_features.unsqueeze(2).expand(-1, -1, n_token, -1),
        ], dim=3)
        encoder_inputs = encoder_inputs.reshape(n_batch * n_parent, n_token, -1)

        # Shape [batch, parent]. Considers that batches may have fewer seqs.
        seq_mask = num2mask(num_parents)
        # Shape [batch, parent, token]. Also considers that batches may have fewer tokens.
        token_mask = seq_mask.unsqueeze(2).expand(-1, -1, n_token) & token_mask.unsqueeze(1).expand(-1, n_parent, -1)

        class_in = self.bio_encoder(encoder_inputs, token_mask.flatten(0, 1))
        class_out = self.classifier(class_in).reshape(n_batch, n_parent, n_token, 3)

        if not prediction:
            # For training
            # We use `seq_mask` here because a seq with length 0 is not acceptable.
            ret['loss'] = -self.crf(class_out[seq_mask], bio_seqs[seq_mask], token_mask[seq_mask])
            self.fb_measure(class_out[seq_mask], bio_seqs[seq_mask].max(2).indices, token_mask[seq_mask])
        else:
            # For prediction
            features_for_decode = class_out.clone().detach()
            decoded = self.crf.viterbi_tags(features_for_decode.flatten(0, 1), token_mask.flatten(0, 1))
            pred_tag = torch.tensor(
                [path + [BIO.index('O')] * (n_token - len(path)) for path, _ in decoded]
            )
            pred_tag = pred_tag.reshape(n_batch, n_parent, n_token)
            ret['prediction'] = pred_tag

        return ret

    @staticmethod
    def bio2boundary(seqs) -> Tuple[torch.Tensor, torch.Tensor]:
        def recursive_construct_spans(seqs_):
            """
            Helper function for bio2boundary.
            Recursively convert seqs of integers to boundary indices.
            Return boundary indices and the corresponding lens.
            """
            if isinstance(seqs_, torch.Tensor):
                if seqs_.device.type == 'cuda':
                    seqs_ = seqs_.to(device='cpu')
                seqs_ = seqs_.tolist()
            if isinstance(seqs_[0], int):
                seqs_ = [BIO[i] for i in seqs_]
                span_boundary_list = bio_tags_to_spans(seqs_)
                return torch.tensor([item[1] for item in span_boundary_list]), len(span_boundary_list)
            span_boundary = list()
            lens_ = list()
            for seq in seqs_:
                one_bou, one_len = recursive_construct_spans(seq)
                span_boundary.append(one_bou)
                lens_.append(one_len)
            if isinstance(lens_[0], int):
                lens_ = torch.tensor(lens_)
            else:
                lens_ = torch.stack(lens_)
            return span_boundary, lens_

        boundary_list, lens = recursive_construct_spans(seqs)
        max_span = int(lens.max())
        boundary = torch.zeros((*lens.shape, max_span, 2), dtype=torch.long)

        def recursive_copy(list_var, tensor_var):
            if len(list_var) == 0:
                return
            if isinstance(list_var, torch.Tensor):
                tensor_var[:len(list_var)] = list_var
                return
            assert len(list_var) == len(tensor_var)
            for list_var_, tensor_var_ in zip(list_var, tensor_var):
                recursive_copy(list_var_, tensor_var_)

        recursive_copy(boundary_list, boundary)

        return boundary, lens

    def inference_forward_handler(
            self,
            token_vec: torch.Tensor,
            token_mask: torch.Tensor,
            span_extractor: SpanExtractor,
            **auxiliaries,
    ) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], None]:
        """
        Refer to the doc of SpanFinder for the definition of this function.
        """

        def handler(
                span_boundary: torch.Tensor,
                span_labels: torch.Tensor,
                parent_mask: torch.Tensor,
                parent_indices: torch.Tensor,
                cursor: torch.Tensor,
        ):
            """
            Refer to the doc of SpanFinder for the definition of this function.
            """
            max_decoding_span = span_boundary.shape[1]
            # Shape [batch, span, token_dim]
            span_vec = span_extractor(token_vec, span_boundary)
            # Shape [batch, parent]
            parent_indices_at_span, _ = mask2idx(parent_mask)
            pred_bio = self(
                token_vec, token_mask, span_vec, None, span_labels, None, parent_mask, prediction=True
            )['prediction']
            # Shape [batch, parent, span, 2]; Shape [batch, parent]
            pred_boundary, pred_num = self.bio2boundary(pred_bio)
            if pred_boundary.device != span_boundary.device:
                pred_boundary = pred_boundary.to(device=span_boundary.device)
                pred_num = pred_num.to(device=span_boundary.device)
            # Shape [batch, parent, span]
            pred_mask = num2mask(pred_num)

            # Parent loop
            for pred_boundary_parent, pred_mask_parent, parent_indices_parent \
                    in zip(pred_boundary.unbind(1), pred_mask.unbind(1), parent_indices_at_span.unbind(1)):
                for pred_boundary_step, step_mask in zip(pred_boundary_parent.unbind(1), pred_mask_parent.unbind(1)):
                    step_mask &= cursor < max_decoding_span
                    parent_indices[step_mask] = parent_indices[step_mask].scatter(
                        1,
                        cursor[step_mask].unsqueeze(1),
                        parent_indices_parent[step_mask].unsqueeze(1)
                    )
                    span_boundary[step_mask] = span_boundary[step_mask].scatter(
                        1,
                        cursor[step_mask].reshape(-1, 1, 1).expand(-1, -1, 2),
                        pred_boundary_step[step_mask].unsqueeze(1)
                    )
                    cursor[step_mask] += 1

        return handler

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        score = self.fb_measure.get_metric(reset)
        if reset:
            return {
                'finder_p': score['precision'] * 100,
                'finder_r': score['recall'] * 100,
                'finder_f': score['fscore'] * 100,
            }
        else:
            return {'finder_f': score['fscore'] * 100}
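As a quick illustration of `bio2boundary` above, a sketch under the assumption that the `BIO` constant from `sftp.utils` is the list ['B', 'I', 'O'] (the `BIO.index(...)` calls above suggest this ordering but do not fix it):

# Hypothetical sketch; tag indices below follow the assumed BIO = ['B', 'I', 'O'].
import torch
from sftp.modules.span_finder import BIOSpanFinder

seqs = torch.tensor([[0, 1, 2, 2, 0],   # B I O O B  -> spans (0, 1) and (4, 4)
                     [2, 0, 1, 1, 2]])  # O B I I O  -> span (1, 3)
boundary, lens = BIOSpanFinder.bio2boundary(seqs)
print(lens)      # tensor([2, 1]): number of spans per sequence
print(boundary)  # [2, 2, 2] tensor of inclusive (start, end) pairs, zero-padded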
sftp/modules/span_finder/span_finder.py
ADDED
@@ -0,0 +1,87 @@
from abc import ABC, abstractmethod
from typing import *

import torch
from allennlp.common import Registrable
from allennlp.modules.span_extractors import SpanExtractor


class SpanFinder(Registrable, ABC, torch.nn.Module):
    """
    Models the probability p(child_span | parent_span [, parent_label]).
    Modeling parent_label is optional, since in some cases we may want the parameters to be shared across
    different tasks, where we may have similar span semantics but a different label space.
    """
    def __init__(
            self,
            no_label: bool = True,
    ):
        """
        :param no_label: If True, input labels will not be used as features; an all-zero vector is used instead.
        """
        super().__init__()
        self._no_label = no_label

    @abstractmethod
    def forward(
            self,
            token_vec: torch.Tensor,
            token_mask: torch.Tensor,
            span_vec: torch.Tensor,
            span_mask: Optional[torch.Tensor] = None,  # Do not need to provide
            span_labels: Optional[torch.Tensor] = None,  # Do not need to provide
            parent_indices: Optional[torch.Tensor] = None,  # Do not need to provide
            parent_mask: Optional[torch.Tensor] = None,
            bio_seqs: Optional[torch.Tensor] = None,
            prediction: bool = False,
            **extra
    ) -> Dict[str, torch.Tensor]:
        """
        Return training loss and predictions.
        :param token_vec: Vector representation of tokens. Shape [batch, token, token_dim]
        :param token_mask: True for non-padding tokens.
        :param span_vec: Vector representation of spans. Shape [batch, span, token_dim]
        :param span_mask: True for non-padding spans. Shape [batch, span]
        :param span_labels: The labels of spans. Shape [batch, span]
        :param parent_indices: Parent indices of spans. Shape [batch, span]
        :param parent_mask: True for parent spans. Shape [batch, span]
        :param prediction: If True, no loss will be returned & no metrics will be updated.
        :param bio_seqs: BIO sequences. Shape [batch, parent, token, 3]
        :return:
            loss: Training loss.
            prediction: Shape [batch, span]. True for positive predictions.
        """
        raise NotImplementedError

    @abstractmethod
    def inference_forward_handler(
            self,
            token_vec: torch.Tensor,
            token_mask: torch.Tensor,
            span_extractor: SpanExtractor,
            **auxiliaries,
    ) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], None]:
        """
        Pre-process some information and return a callable module for p(child_span | parent_span [, parent_label]).
        :param token_vec: Vector representation of tokens. Shape [batch, token, token_dim]
        :param token_mask: True for non-padding tokens.
        :param span_extractor: The same module as used in the model.
        :param auxiliaries: Environment variables. You can pass extra environment variables,
            since the extras will be ignored.
        :return:
            A callable function in a closure. Its arguments are:
                - span_boundary: Shape [batch, span, 2]
                - span_labels: Shape [batch, span]
                - parent_mask: Shape [batch, span]
                - parent_indices: Shape [batch, span]
                - cursor: Shape [batch]
            No return values: everything should be done in place.
            Note that the span indexing space has a different meaning than during training. We have no gold
            span list here, so span refers to the predicted spans.
        """
        raise NotImplementedError

    @abstractmethod
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        raise NotImplementedError
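To make the contract concrete, a skeletal and purely hypothetical subclass (the real implementation in this commit is `BIOSpanFinder`); it only shows which pieces a registered finder must supply:

# Hypothetical skeleton, for illustration only; not part of the commit.
from typing import Callable, Dict
import torch
from allennlp.modules.span_extractors import SpanExtractor
from sftp.modules.span_finder import SpanFinder


@SpanFinder.register('noop')  # hypothetical registration name
class NoOpSpanFinder(SpanFinder):
    def forward(self, token_vec, token_mask, span_vec, **extra) -> Dict[str, torch.Tensor]:
        # A real finder returns {'loss': ...} for training and {'prediction': ...} for inference.
        return {'loss': token_vec.new_zeros(())}

    def inference_forward_handler(self, token_vec, token_mask, span_extractor: SpanExtractor, **aux) -> Callable:
        def handler(span_boundary, span_labels, parent_mask, parent_indices, cursor):
            pass  # a real handler writes predicted children in place and advances `cursor`
        return handler

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {}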
sftp/modules/span_typing/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .mlp_span_typing import MLPSpanTyping
from .span_typing import SpanTyping
sftp/modules/span_typing/mlp_span_typing.py
ADDED
@@ -0,0 +1,99 @@
from typing import *

import torch
from torch.nn import CrossEntropyLoss, KLDivLoss, LogSoftmax

from .span_typing import SpanTyping


@SpanTyping.register('mlp')
class MLPSpanTyping(SpanTyping):
    """
    An MLP implementation for span typing.
    """
    def __init__(
            self,
            input_dim: int,
            hidden_dims: List[int],
            label_emb: torch.nn.Embedding,
            n_category: int,
            label_to_ignore: Optional[List[int]] = None
    ):
        """
        :param input_dim: dim(parent_span) + dim(child_span) + dim(label_dim)
        :param hidden_dims: The dims of the hidden layers of the MLP.
        :param n_category: Number of labels.
        :param label_emb: Embeds labels to vectors.
        """
        super().__init__(label_emb.num_embeddings, label_to_ignore)
        self.MLPs: List[torch.nn.Linear] = list()
        for i_mlp, output_dim in enumerate(hidden_dims + [n_category]):
            mlp = torch.nn.Linear(input_dim, output_dim, bias=True)
            self.MLPs.append(mlp)
            self.add_module(f'MLP-{i_mlp}', mlp)
            input_dim = output_dim

        # Embeds labels as features.
        self.label_emb = label_emb

    def forward(
            self,
            span_vec: torch.Tensor,
            parent_at_span: torch.Tensor,
            span_labels: Optional[torch.Tensor],
            prediction_only: bool = False,
    ) -> Dict[str, torch.Tensor]:
        """
        Inputs: All features for typing a child span.
        Process: Update the metric.
        Output: The loss of typing and predictions.
        :return:
            loss: Loss for label prediction.
            prediction: Predicted labels.
        """
        is_soft = span_labels.dtype != torch.int64
        # Shape [batch, span, label_dim]
        label_vec = span_labels @ self.label_emb.weight if is_soft else self.label_emb(span_labels)
        n_batch, n_span, _ = label_vec.shape
        n_label, _ = self.ontology.shape
        # Shape [batch, span, label_dim]
        parent_label_features = label_vec.gather(1, parent_at_span.unsqueeze(2).expand_as(label_vec))
        # Shape [batch, span, token_dim]
        parent_span_features = span_vec.gather(1, parent_at_span.unsqueeze(2).expand_as(span_vec))
        # Shape [batch, span, token_dim]
        child_span_features = span_vec

        features = torch.cat([parent_label_features, parent_span_features, child_span_features], dim=2)
        # Shape [batch, span, label]
        for mlp in self.MLPs[:-1]:
            features = torch.relu(mlp(features))
        logits = self.MLPs[-1](features)

        logits_for_prediction = logits.clone()

        if not is_soft:
            # Shape [batch, span]
            parent_labels = span_labels.gather(1, parent_at_span)
            onto_mask = self.ontology.unsqueeze(0).expand(n_batch, -1, -1).gather(
                1, parent_labels.unsqueeze(2).expand(-1, -1, n_label)
            )
            logits_for_prediction[~onto_mask] = float('-inf')

        label_dist = torch.softmax(logits_for_prediction, 2)
        label_confidence, predictions = label_dist.max(2)
        ret = {'prediction': predictions, 'label_confidence': label_confidence, 'distribution': label_dist}
        if prediction_only:
            return ret

        span_labels = span_labels.clone()

        if is_soft:
            self.acc_metric(
                logits_for_prediction, span_labels.max(2)[1], ~span_labels.sum(2).isclose(torch.tensor(0.))
            )
            ret['loss'] = KLDivLoss(reduction='sum')(LogSoftmax(dim=2)(logits), span_labels)
        else:
            for label_idx in self.label_to_ignore:
                span_labels[span_labels == label_idx] = -100
            self.acc_metric(logits_for_prediction, span_labels, span_labels != -100)
            ret['loss'] = CrossEntropyLoss(reduction='sum')(logits.flatten(0, 1), span_labels.flatten())

        return ret
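A shape sketch for the typing head above; every number is an illustrative assumption. It only demonstrates the dimension contract from the constructor docstring, input_dim = dim(parent_label) + dim(parent_span) + dim(child_span):

# Hypothetical dimensions: token_dim=768, label_dim=64, 50 labels.
import torch
from sftp.modules.span_typing import MLPSpanTyping

label_emb = torch.nn.Embedding(num_embeddings=50, embedding_dim=64)
typer = MLPSpanTyping(input_dim=64 + 768 + 768, hidden_dims=[256], label_emb=label_emb, n_category=50)

span_vec = torch.randn(2, 4, 768)                     # [batch, span, token_dim]
parent_at_span = torch.zeros(2, 4, dtype=torch.long)  # every span's parent is span 0
span_labels = torch.zeros(2, 4, dtype=torch.long)     # hard (integer) labels
out = typer(span_vec, parent_at_span, span_labels, prediction_only=True)
print(out['prediction'].shape)  # torch.Size([2, 4])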
sftp/modules/span_typing/span_typing.py
ADDED
@@ -0,0 +1,64 @@
from abc import ABC
from typing import *

import torch
from allennlp.common import Registrable
from allennlp.data.vocabulary import DEFAULT_OOV_TOKEN, Vocabulary
from allennlp.training.metrics import CategoricalAccuracy


class SpanTyping(Registrable, torch.nn.Module, ABC):
    """
    Models the probability p(child_label | child_span, parent_span, parent_label).
    """
    def __init__(
            self,
            n_label: int,
            label_to_ignore: Optional[List[int]] = None,
    ):
        """
        :param label_to_ignore: Label indices in this list will be ignored.
            Usually this should include NULL, PADDING and UNKNOWN.
        """
        super().__init__()
        self.label_to_ignore = label_to_ignore or list()
        self.acc_metric = CategoricalAccuracy()
        self.onto = torch.ones([n_label, n_label], dtype=torch.bool)
        self.register_buffer('ontology', self.onto)

    def load_ontology(self, path: str, vocab: Vocabulary):
        unk_id = vocab.get_token_index(DEFAULT_OOV_TOKEN, 'span_label')
        for line in open(path).readlines():
            entities = [vocab.get_token_index(ent, 'span_label') for ent in line.replace('\n', '').split('\t')]
            parent, children = entities[0], entities[1:]
            if parent == unk_id:
                continue
            self.onto[parent, :] = False
            children = list(filter(lambda x: x != unk_id, children))
            self.onto[parent, children] = True
        self.register_buffer('ontology', self.onto)

    def forward(
            self,
            span_vec: torch.Tensor,
            parent_at_span: torch.Tensor,
            span_labels: Optional[torch.Tensor],
            prediction_only: bool = False,
    ) -> Dict[str, torch.Tensor]:
        """
        Inputs: All features for typing a child span.
        Output: The loss of typing and predictions.
        :param span_vec: Shape [batch, span, token_dim]
        :param parent_at_span: Shape [batch, span]
        :param span_labels: Shape [batch, span]
        :param prediction_only: If True, no loss is returned & the metric will not be updated.
        :return:
            loss: Loss for label prediction. (absent if prediction_only = True)
            prediction: Predicted labels.
        """
        raise NotImplementedError

    def get_metric(self, reset):
        return {
            "typing_acc": self.acc_metric.get_metric(reset) * 100
        }
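`load_ontology` expects a tab-separated file with one parent label per line followed by its admissible child labels; a made-up two-line example (columns separated by tabs, label names invented for illustration):

Commerce_buy	Buyer	Goods	Seller
Motion	Theme	Goal	Path

Lines whose parent is out of vocabulary are skipped; for the rest, the listed children become the only labels allowed under that parent in the ontology mask.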
sftp/predictor/__init__.py
ADDED
@@ -0,0 +1 @@
from .span_predictor import SpanPredictor
sftp/predictor/span_predictor.orig.py
ADDED
@@ -0,0 +1,362 @@
import os
from time import time
from typing import *
import json

import numpy as np
import torch
from allennlp.common.util import JsonDict, sanitize
from allennlp.data import DatasetReader, Instance
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.samplers import MaxTokensBatchSampler
from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.models import Model
from allennlp.nn import util as nn_util
from allennlp.predictors import Predictor
from concrete import (
    MentionArgument, SituationMentionSet, SituationMention, TokenRefSequence,
    EntityMention, EntityMentionSet, Entity, EntitySet, AnnotationMetadata, Communication
)
from concrete.util import CommunicationReader, AnalyticUUIDGeneratorFactory, CommunicationWriterZip
from concrete.validate import validate_communication

from ..data_reader import concrete_doc, concrete_doc_tokenized
from ..utils import Span, re_index_span, VIRTUAL_ROOT


class PredictionReturn(NamedTuple):
    span: Union[Span, dict, Communication]
    sentence: List[str]
    meta: Dict[str, Any]


class ForceDecodingReturn(NamedTuple):
    span: np.ndarray
    label: List[str]
    distribution: np.ndarray


@Predictor.register('span')
class SpanPredictor(Predictor):
    @staticmethod
    def format_convert(
            sentence: Union[List[str], List[List[str]]],
            prediction: Union[Span, List[Span]],
            output_format: str
    ):
        if output_format == 'span':
            return prediction
        elif output_format == 'json':
            if isinstance(prediction, list):
                return [SpanPredictor.format_convert(sent, pred, 'json') for sent, pred in zip(sentence, prediction)]
            return prediction.to_json()
        elif output_format == 'concrete':
            if isinstance(prediction, Span):
                sentence, prediction = [sentence], [prediction]
            return concrete_doc_tokenized(sentence, prediction)

    def predict_concrete(
            self,
            concrete_path: str,
            output_path: Optional[str] = None,
            max_tokens: int = 2048,
            ontology_mapping: Optional[Dict[str, str]] = None,
    ):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        writer = CommunicationWriterZip(output_path)

        for comm, fn in CommunicationReader(concrete_path):
            assert len(comm.sectionList) == 1
            concrete_sentences = comm.sectionList[0].sentenceList
            json_sentences = list()
            for con_sent in concrete_sentences:
                json_sentences.append(
                    [t.text for t in con_sent.tokenization.tokenList.tokenList]
                )
            predictions = self.predict_batch_sentences(json_sentences, max_tokens, ontology_mapping=ontology_mapping)

            # Merge predictions into concrete
            aug = AnalyticUUIDGeneratorFactory(comm).create()
            situation_mention_set = SituationMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
            comm.situationMentionSetList = [situation_mention_set]
            situation_mention_set.mentionList = sm_list = list()
            entity_mention_set = EntityMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
            comm.entityMentionSetList = [entity_mention_set]
            entity_mention_set.mentionList = em_list = list()
            entity_set = EntitySet(
                next(aug), AnnotationMetadata('Span Finder', time()), list(), None, entity_mention_set.uuid
            )
            comm.entitySetList = [entity_set]

            em_dict = dict()
            for con_sent, pred in zip(concrete_sentences, predictions):
                for event in pred.span:
                    def raw_text_span(start_idx, end_idx, **_):
                        si_char = con_sent.tokenization.tokenList.tokenList[start_idx].textSpan.start
                        ei_char = con_sent.tokenization.tokenList.tokenList[end_idx].textSpan.ending
                        return comm.text[si_char:ei_char]
                    sm = SituationMention(
                        next(aug),
                        text=raw_text_span(event.start_idx, event.end_idx),
                        situationKind=event.label,
                        situationType='EVENT',
                        confidence=event.confidence,
                        argumentList=list(),
                        tokens=TokenRefSequence(
                            tokenIndexList=list(range(event.start_idx, event.end_idx + 1)),
                            tokenizationId=con_sent.tokenization.uuid
                        )
                    )

                    for arg in event:
                        em = em_dict.get((arg.start_idx, arg.end_idx + 1))
                        if em is None:
                            em = EntityMention(
                                next(aug),
                                tokens=TokenRefSequence(
                                    tokenIndexList=list(range(arg.start_idx, arg.end_idx + 1)),
                                    tokenizationId=con_sent.tokenization.uuid,
                                ),
                                text=raw_text_span(arg.start_idx, arg.end_idx)
                            )
                            em_list.append(em)
                            entity_set.entityList.append(Entity(next(aug), id=em.text, mentionIdList=[em.uuid]))
                            em_dict[(arg.start_idx, arg.end_idx + 1)] = em
                        sm.argumentList.append(MentionArgument(
                            role=arg.label,
                            entityMentionId=em.uuid,
                            confidence=arg.confidence
                        ))
                    sm_list.append(sm)
            validate_communication(comm)
            writer.write(comm, fn)
        writer.close()

    def predict_sentence(
            self,
            sentence: Union[str, List[str]],
            ontology_mapping: Optional[Dict[str, str]] = None,
            output_format: str = 'span',
    ) -> PredictionReturn:
        """
        Predict spans on a single sentence (no batching). If not tokenized, it will be tokenized with SpacyTokenizer.
        :param sentence: If tokenized, should be a list of token strings. If not, should be a string.
        :param ontology_mapping: Optional mapping from the model's ontology to another one.
        :param output_format: span, json or concrete.
        """
        prediction = self.predict_json(self._prepare_sentence(sentence))
        prediction['prediction'] = self.format_convert(
            prediction['sentence'],
            Span.from_json(prediction['prediction']).map_ontology(ontology_mapping),
            output_format
        )
        return PredictionReturn(prediction['prediction'], prediction['sentence'], prediction.get('meta', dict()))

    def predict_batch_sentences(
            self,
            sentences: List[Union[List[str], str]],
            max_tokens: int = 512,
            ontology_mapping: Optional[Dict[str, str]] = None,
            output_format: str = 'span',
    ) -> List[PredictionReturn]:
        """
        Predict spans on a batch of sentences. If not tokenized, they will be tokenized with SpacyTokenizer.
        :param sentences: A list of sentences. Refer to `predict_sentence`.
        :param max_tokens: Maximum tokens in a batch.
        :param ontology_mapping: If not None, will try to map the output from one ontology to another.
            If a predicted frame is not in the mapping, the prediction will be ignored.
        :param output_format: span, json or concrete.
        :return: A list of predictions.
        """
        sentences = list(map(self._prepare_sentence, sentences))
        for i_sent, sent in enumerate(sentences):
            sent['meta'] = {"idx": i_sent}
        instances = list(map(self._json_to_instance, sentences))
        outputs = list()
        for ins_indices in MaxTokensBatchSampler(max_tokens, ["tokens"], 0.0).get_batch_indices(instances):
            batch_ins = list(
                SimpleDataLoader([instances[ins_idx] for ins_idx in ins_indices], len(ins_indices), vocab=self.vocab)
            )[0]
            batch_inputs = nn_util.move_to_device(batch_ins, device=self.cuda_device)
            batch_outputs = self._model(**batch_inputs)
            for meta, prediction, inputs in zip(
                    batch_outputs['meta'], batch_outputs['prediction'], batch_outputs['inputs']
            ):
                prediction.map_ontology(ontology_mapping)
                prediction = self.format_convert(inputs['sentence'], prediction, output_format)
                outputs.append(PredictionReturn(prediction, inputs['sentence'], {"input_idx": meta['idx']}))

        outputs.sort(key=lambda x: x.meta['input_idx'])
        return outputs

    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)
        outputs = sanitize(outputs)
        return {
            'prediction': outputs['prediction'],
            'sentence': outputs['inputs']['sentence'],
            'meta': outputs.get('meta', {})
        }

    def __init__(
            self,
            model: Model,
            dataset_reader: DatasetReader,
            frozen: bool = True,
    ):
        super(SpanPredictor, self).__init__(model=model, dataset_reader=dataset_reader, frozen=frozen)
        self.spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm')

    def economize(
            self,
            max_decoding_spans: Optional[int] = None,
            max_recursion_depth: Optional[int] = None,
    ):
        if max_decoding_spans:
            self._model._max_decoding_spans = max_decoding_spans
        if max_recursion_depth:
            self._model._max_recursion_depth = max_recursion_depth

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(**json_dict)

    @staticmethod
    def to_nested(prediction: List[dict]):
        first_layer, idx2children = list(), dict()
        for idx, pred in enumerate(prediction):
            children = list()
            pred['children'] = idx2children[idx + 1] = children
            if pred['parent'] == 0:
                first_layer.append(pred)
            else:
                idx2children[pred['parent']].append(pred)
            del pred['parent']
        return first_layer

    def _prepare_sentence(self, sentence: Union[str, List[str]]) -> Dict[str, List[str]]:
        if isinstance(sentence, str):
            while '  ' in sentence:
                sentence = sentence.replace('  ', ' ')
            sentence = sentence.replace(chr(65533), '')
            if sentence == '':
                sentence = [""]
            sentence = list(map(str, self.spacy_tokenizer.tokenize(sentence)))
        return {"tokens": sentence}

    @staticmethod
    def json_to_concrete(
            predictions: List[dict],
    ):
        sentences = list()
        for pred in predictions:
            tokenization, event = list(), list()
            sent = {'text': ' '.join(pred['inputs']), 'tokenization': tokenization, 'event': event}
            sentences.append(sent)
            start_idx = 0
            for token in pred['inputs']:
                tokenization.append((start_idx, len(token) - 1 + start_idx))
                start_idx += len(token) + 1
            for pred_event in pred['prediction']:
                arg_list = list()
                one_event = {'argument': arg_list}
                event.append(one_event)
                for key in ['start_idx', 'end_idx', 'label']:
                    one_event[key] = pred_event[key]
                for pred_arg in pred_event['children']:
                    arg_list.append({key: pred_arg[key] for key in ['start_idx', 'end_idx', 'label']})

        concrete_comm = concrete_doc(sentences)
        return concrete_comm

    def force_decode(
            self,
            sentence: List[str],
            parent_span: Tuple[int, int] = (-1, -1),
            parent_label: str = VIRTUAL_ROOT,
            child_spans: Optional[List[Tuple[int, int]]] = None,
    ) -> ForceDecodingReturn:
        """
        Force decoding. There are 2 modes:
        1. Given a parent span and its label, find all its children (direct children, not including other
           descendants) and type them.
        2. Given a parent span, parent label, and children spans, type all children.
        :param sentence: Tokens.
        :param parent_span: [start_idx, end_idx], both inclusive.
        :param parent_label: Parent label as a string.
        :param child_spans: Optional. If provided, mode 2 is used; otherwise mode 1.
        :return:
            - span: children spans.
            - label: most probable labels of children.
            - distribution: distribution over children labels.
        """
        instance = self._dataset_reader.text_to_instance(self._prepare_sentence(sentence)['tokens'])
        model_input = nn_util.move_to_device(
            list(SimpleDataLoader([instance], 1, vocab=self.vocab))[0], device=self.cuda_device
        )
        offsets = instance.fields['raw_inputs'].metadata['offsets']

        with torch.no_grad():
            tokens = model_input['tokens']
            parent_span = re_index_span(parent_span, offsets)
            if parent_span[1] >= self._dataset_reader.max_length:
                return ForceDecodingReturn(
                    np.zeros([0, 2], dtype=np.int),
                    [],
                    np.zeros([0, self.vocab.get_vocab_size('span_label')], dtype=np.float64)
                )
            if child_spans is not None:
                token_vec = self._model.word_embedding(tokens)
                child_pieces = [re_index_span(bdr, offsets) for bdr in child_spans]
                child_pieces = list(filter(lambda x: x[1] < self._dataset_reader.max_length - 1, child_pieces))
                span_tensor = torch.tensor(
                    [parent_span] + child_pieces, dtype=torch.int64, device=self.device
                ).unsqueeze(0)
                parent_indices = span_tensor.new_zeros(span_tensor.shape[0:2])
                span_labels = parent_indices.new_full(
                    parent_indices.shape, self._model.vocab.get_token_index(parent_label, 'span_label')
                )
                span_vec = self._model._span_extractor(token_vec, span_tensor)
                typing_out = self._model._span_typing(span_vec, parent_indices, span_labels)
                distribution = typing_out['distribution'][0, 1:].cpu().numpy()
                boundary = np.array(child_spans)
            else:
                parent_label_tensor = torch.tensor(
                    [self._model.vocab.get_token_index(parent_label, 'span_label')], device=self.device
                )
                parent_boundary_tensor = torch.tensor([parent_span], device=self.device)
                boundary, _, num_children, distribution = self._model.one_step_prediction(
                    tokens, parent_boundary_tensor, parent_label_tensor
                )
                boundary, distribution = boundary[0].cpu().tolist(), distribution[0].cpu().numpy()
                boundary = np.array([re_index_span(bdr, offsets, True) for bdr in boundary])

        labels = [
            self.vocab.get_token_from_index(label_idx, 'span_label') for label_idx in distribution.argmax(1)
        ]
        return ForceDecodingReturn(boundary, labels, distribution)

    @property
    def vocab(self):
        return self._model.vocab

    @property
    def device(self):
        return self.cuda_device if self.cuda_device > -1 else 'cpu'

    @staticmethod
    def read_ontology_mapping(file_path: str):
        """
        Read the ontology mapping file. The file format is described in the docs.
        """
        if file_path is None:
            return None
        if file_path.endswith('.json'):
            return json.load(open(file_path))
        mapping = dict()
        for line in open(file_path).readlines():
            parent_label, original_label, new_label = line.replace('\n', '').split('\t')
            if parent_label == '*':
                mapping[original_label] = new_label
            else:
                mapping[(parent_label, original_label)] = new_label
        return mapping
sftp/predictor/span_predictor.py
ADDED
@@ -0,0 +1,401 @@
import os
from time import time
from typing import *
import json

# # ---GFM add debugger
# import pdb
# # end---

import numpy as np
import torch
from allennlp.common.util import JsonDict, sanitize
from allennlp.data import DatasetReader, Instance
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.samplers import MaxTokensBatchSampler
from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.models import Model
from allennlp.nn import util as nn_util
from allennlp.predictors import Predictor
from concrete import (
    MentionArgument, SituationMentionSet, SituationMention, TokenRefSequence,
    EntityMention, EntityMentionSet, Entity, EntitySet, AnnotationMetadata, Communication
)
from concrete.util import CommunicationReader, AnalyticUUIDGeneratorFactory, CommunicationWriterZip
from concrete.validate import validate_communication

from ..data_reader import concrete_doc, concrete_doc_tokenized
from ..utils import Span, re_index_span, VIRTUAL_ROOT


class PredictionReturn(NamedTuple):
    span: Union[Span, dict, Communication]
    sentence: List[str]
    meta: Dict[str, Any]


class ForceDecodingReturn(NamedTuple):
    span: np.ndarray
    label: List[str]
    distribution: np.ndarray


@Predictor.register('span')
class SpanPredictor(Predictor):
    @staticmethod
    def format_convert(
            sentence: Union[List[str], List[List[str]]],
            prediction: Union[Span, List[Span]],
            output_format: str
    ):
        if output_format == 'span':
            return prediction
        elif output_format == 'json':
            if isinstance(prediction, list):
                return [SpanPredictor.format_convert(sent, pred, 'json') for sent, pred in zip(sentence, prediction)]
            return prediction.to_json()
        elif output_format == 'concrete':
            if isinstance(prediction, Span):
                sentence, prediction = [sentence], [prediction]
            return concrete_doc_tokenized(sentence, prediction)

    def predict_concrete(
            self,
            concrete_path: str,
            output_path: Optional[str] = None,
            max_tokens: int = 2048,
            ontology_mapping: Optional[Dict[str, str]] = None,
    ):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        writer = CommunicationWriterZip(output_path)

        print(concrete_path)
        for comm, fn in CommunicationReader(concrete_path):
            print(fn)
            assert len(comm.sectionList) == 1
            concrete_sentences = comm.sectionList[0].sentenceList
            json_sentences = list()
            for con_sent in concrete_sentences:
                json_sentences.append(
                    [t.text for t in con_sent.tokenization.tokenList.tokenList]
                )
            predictions = self.predict_batch_sentences(json_sentences, max_tokens, ontology_mapping=ontology_mapping)

            # Merge predictions into concrete
            aug = AnalyticUUIDGeneratorFactory(comm).create()
            situation_mention_set = SituationMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
            comm.situationMentionSetList = [situation_mention_set]
            situation_mention_set.mentionList = sm_list = list()
            entity_mention_set = EntityMentionSet(next(aug), AnnotationMetadata('Span Finder', time()), list())
            comm.entityMentionSetList = [entity_mention_set]
            entity_mention_set.mentionList = em_list = list()
            entity_set = EntitySet(
                next(aug), AnnotationMetadata('Span Finder', time()), list(), None, entity_mention_set.uuid
            )
            comm.entitySetList = [entity_set]

            em_dict = dict()
            for con_sent, pred in zip(concrete_sentences, predictions):
                for event in pred.span:
                    def raw_text_span(start_idx, end_idx, **_):
                        si_char = con_sent.tokenization.tokenList.tokenList[start_idx].textSpan.start
                        ei_char = con_sent.tokenization.tokenList.tokenList[end_idx].textSpan.ending
                        return comm.text[si_char:ei_char]

                    # ---GFM: added this to get around off-by-one errors (unclear why these arise)
                    event_start_idx = event.start_idx
                    event_end_idx = event.end_idx
                    if event_end_idx > len(con_sent.tokenization.tokenList.tokenList) - 1:
                        print("WARNING: invalid `event_end_idx` passed for sentence, adjusting to final token")
                        print("\tsentence:", con_sent.tokenization.tokenList)
                        print("event_end_idx:", event_end_idx)
                        print("length:", len(con_sent.tokenization.tokenList.tokenList))
                        event_end_idx = len(con_sent.tokenization.tokenList.tokenList) - 1
                        print("new event_end_idx:", event_end_idx)
                        print()
                    # end---

                    sm = SituationMention(
                        next(aug),
                        # ---GFM: added this to get around off-by-one errors (unclear why these arise)
                        text=raw_text_span(event_start_idx, event_end_idx),
                        # end---
                        situationKind=event.label,
                        situationType='EVENT',
                        confidence=event.confidence,
                        argumentList=list(),
                        tokens=TokenRefSequence(
                            # ---GFM: added this to get around off-by-one errors (unclear why these arise)
                            tokenIndexList=list(range(event_start_idx, event_end_idx + 1)),
                            # end---
                            tokenizationId=con_sent.tokenization.uuid
                        )
                    )

                    for arg in event:
                        # ---GFM: added this to get around off-by-one errors (unclear why these arise)
                        arg_start_idx = arg.start_idx
                        arg_end_idx = arg.end_idx
                        if arg_end_idx > len(con_sent.tokenization.tokenList.tokenList) - 1:
                            print("WARNING: invalid `arg_end_idx` passed for sentence, adjusting to final token")
                            print("\tsentence:", con_sent.tokenization.tokenList)
                            print("arg_end_idx:", arg_end_idx)
                            print("length:", len(con_sent.tokenization.tokenList.tokenList))
                            arg_end_idx = len(con_sent.tokenization.tokenList.tokenList) - 1
                            print("new arg_end_idx:", arg_end_idx)
                            print()
                        # end---

                        # ---GFM: replaced all arg.*_idx with arg_*_idx
                        em = em_dict.get((arg_start_idx, arg_end_idx + 1))
                        if em is None:
                            em = EntityMention(
                                next(aug),
                                tokens=TokenRefSequence(
                                    tokenIndexList=list(range(arg_start_idx, arg_end_idx + 1)),
                                    tokenizationId=con_sent.tokenization.uuid,
                                ),
                                text=raw_text_span(arg_start_idx, arg_end_idx)
                            )
                            em_list.append(em)
                            entity_set.entityList.append(Entity(next(aug), id=em.text, mentionIdList=[em.uuid]))
                            em_dict[(arg_start_idx, arg_end_idx + 1)] = em
                        sm.argumentList.append(MentionArgument(
                            role=arg.label,
                            entityMentionId=em.uuid,
                            confidence=arg.confidence
                        ))
                        # end---
                    sm_list.append(sm)
            validate_communication(comm)
            writer.write(comm, fn)
        writer.close()

    def predict_sentence(
            self,
            sentence: Union[str, List[str]],
            ontology_mapping: Optional[Dict[str, str]] = None,
            output_format: str = 'span',
    ) -> PredictionReturn:
        """
        Predict spans on a single sentence (no batching). If not tokenized, it will be tokenized with SpacyTokenizer.
        :param sentence: If tokenized, should be a list of token strings. If not, should be a string.
        :param ontology_mapping: Optional mapping from the model's ontology to another one.
        :param output_format: span, json or concrete.
        """
        prediction = self.predict_json(self._prepare_sentence(sentence))
        prediction['prediction'] = self.format_convert(
            prediction['sentence'],
            Span.from_json(prediction['prediction']).map_ontology(ontology_mapping),
            output_format
        )
        return PredictionReturn(prediction['prediction'], prediction['sentence'], prediction.get('meta', dict()))

    def predict_batch_sentences(
            self,
            sentences: List[Union[List[str], str]],
            max_tokens: int = 512,
            ontology_mapping: Optional[Dict[str, str]] = None,
            output_format: str = 'span',
    ) -> List[PredictionReturn]:
        """
        Predict spans on a batch of sentences. If not tokenized, they will be tokenized with SpacyTokenizer.
        :param sentences: A list of sentences. Refer to `predict_sentence`.
        :param max_tokens: Maximum tokens in a batch.
        :param ontology_mapping: If not None, will try to map the output from one ontology to another.
            If a predicted frame is not in the mapping, the prediction will be ignored.
        :param output_format: span, json or concrete.
        :return: A list of predictions.
        """
        sentences = list(map(self._prepare_sentence, sentences))
        for i_sent, sent in enumerate(sentences):
            sent['meta'] = {"idx": i_sent}
        instances = list(map(self._json_to_instance, sentences))
        outputs = list()
        for ins_indices in MaxTokensBatchSampler(max_tokens, ["tokens"], 0.0).get_batch_indices(instances):
            batch_ins = list(
                SimpleDataLoader([instances[ins_idx] for ins_idx in ins_indices], len(ins_indices), vocab=self.vocab)
            )[0]
            batch_inputs = nn_util.move_to_device(batch_ins, device=self.cuda_device)
            batch_outputs = self._model(**batch_inputs)
            for meta, prediction, inputs in zip(
                    batch_outputs['meta'], batch_outputs['prediction'], batch_outputs['inputs']
            ):
                prediction.map_ontology(ontology_mapping)
                prediction = self.format_convert(inputs['sentence'], prediction, output_format)
                outputs.append(PredictionReturn(prediction, inputs['sentence'], {"input_idx": meta['idx']}))

        outputs.sort(key=lambda x: x.meta['input_idx'])
        return outputs

    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)
        outputs = sanitize(outputs)
        return {
            'prediction': outputs['prediction'],
            'sentence': outputs['inputs']['sentence'],
            'meta': outputs.get('meta', {})
        }

    def __init__(
            self,
            model: Model,
            dataset_reader: DatasetReader,
            frozen: bool = True,
    ):
        super(SpanPredictor, self).__init__(model=model, dataset_reader=dataset_reader, frozen=frozen)
        self.spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm')

    def economize(
            self,
            max_decoding_spans: Optional[int] = None,
            max_recursion_depth: Optional[int] = None,
    ):
        if max_decoding_spans:
            self._model._max_decoding_spans = max_decoding_spans
        if max_recursion_depth:
            self._model._max_recursion_depth = max_recursion_depth

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(**json_dict)

    @staticmethod
    def to_nested(prediction: List[dict]):
        first_layer, idx2children = list(), dict()
        for idx, pred in enumerate(prediction):
            children = list()
            pred['children'] = idx2children[idx + 1] = children
            if pred['parent'] == 0:
                first_layer.append(pred)
            else:
                idx2children[pred['parent']].append(pred)
            del pred['parent']
        return first_layer

    def _prepare_sentence(self, sentence: Union[str, List[str]]) -> Dict[str, List[str]]:
        if isinstance(sentence, str):
            while '  ' in sentence:
                sentence = sentence.replace('  ', ' ')
            sentence = sentence.replace(chr(65533), '')
            if sentence == '':
                sentence = [""]
            sentence = list(map(str, self.spacy_tokenizer.tokenize(sentence)))
        return {"tokens": sentence}

    @staticmethod
    def json_to_concrete(
            predictions: List[dict],
    ):
        sentences = list()
        for pred in predictions:
            tokenization, event = list(), list()
            sent = {'text': ' '.join(pred['inputs']), 'tokenization': tokenization, 'event': event}
            sentences.append(sent)
            start_idx = 0
            for token in pred['inputs']:
                tokenization.append((start_idx, len(token) - 1 + start_idx))
                start_idx += len(token) + 1
            for pred_event in pred['prediction']:
                arg_list = list()
                one_event = {'argument': arg_list}
                event.append(one_event)
                for key in ['start_idx', 'end_idx', 'label']:
                    one_event[key] = pred_event[key]
                for pred_arg in pred_event['children']:
                    arg_list.append({key: pred_arg[key] for key in ['start_idx', 'end_idx', 'label']})

        concrete_comm = concrete_doc(sentences)
        return concrete_comm

    def force_decode(
            self,
            sentence: List[str],
            parent_span: Tuple[int, int] = (-1, -1),
            parent_label: str = VIRTUAL_ROOT,
            child_spans: Optional[List[Tuple[int, int]]] = None,
    ) -> ForceDecodingReturn:
        """
        Force decoding. There are 2 modes:
        1. Given a parent span and its label, find all its children (direct children, not including other
           descendants) and type them.
        2. Given a parent span, parent label, and children spans, type all children.
        :param sentence: Tokens.
        :param parent_span: [start_idx, end_idx], both inclusive.
        :param parent_label: Parent label as a string.
        :param child_spans: Optional. If provided, mode 2 is used; otherwise mode 1.
        :return:
            - span: children spans.
            - label: most probable labels of children.
            - distribution: distribution over children labels.
        """
        instance = self._dataset_reader.text_to_instance(self._prepare_sentence(sentence)['tokens'])
        model_input = nn_util.move_to_device(
            list(SimpleDataLoader([instance], 1, vocab=self.vocab))[0], device=self.cuda_device
        )
        offsets = instance.fields['raw_inputs'].metadata['offsets']

        with torch.no_grad():
            tokens = model_input['tokens']
            parent_span = re_index_span(parent_span, offsets)
            if parent_span[1] >= self._dataset_reader.max_length:
                return ForceDecodingReturn(
                    np.zeros([0, 2], dtype=np.int),
                    [],
                    np.zeros([0, self.vocab.get_vocab_size('span_label')], dtype=np.float64)
                )
            if child_spans is not None:
                token_vec = self._model.word_embedding(tokens)
                child_pieces = [re_index_span(bdr, offsets) for bdr in child_spans]
                child_pieces = list(filter(lambda x: x[1] < self._dataset_reader.max_length - 1, child_pieces))
                span_tensor = torch.tensor(
                    [parent_span] + child_pieces, dtype=torch.int64, device=self.device
                ).unsqueeze(0)
                parent_indices = span_tensor.new_zeros(span_tensor.shape[0:2])
                span_labels = parent_indices.new_full(
                    parent_indices.shape, self._model.vocab.get_token_index(parent_label, 'span_label')
                )
                span_vec = self._model._span_extractor(token_vec, span_tensor)
                typing_out = self._model._span_typing(span_vec, parent_indices, span_labels)
                distribution = typing_out['distribution'][0, 1:].cpu().numpy()
                boundary = np.array(child_spans)
            else:
                parent_label_tensor = torch.tensor(
                    [self._model.vocab.get_token_index(parent_label, 'span_label')], device=self.device
                )
                parent_boundary_tensor = torch.tensor([parent_span], device=self.device)
                boundary, _, num_children, distribution = self._model.one_step_prediction(
                    tokens, parent_boundary_tensor, parent_label_tensor
                )
                boundary, distribution = boundary[0].cpu().tolist(), distribution[0].cpu().numpy()
                boundary = np.array([re_index_span(bdr, offsets, True) for bdr in boundary])

        labels = [
            self.vocab.get_token_from_index(label_idx, 'span_label') for label_idx in distribution.argmax(1)
        ]
        return ForceDecodingReturn(boundary, labels, distribution)

    @property
    def vocab(self):
        return self._model.vocab

    @property
    def device(self):
        return self.cuda_device if self.cuda_device > -1 else 'cpu'

    @staticmethod
    def read_ontology_mapping(file_path: str):
        """
        Read the ontology mapping file. The file format is described in the docs.
        """
        if file_path is None:
            return None
        if file_path.endswith('.json'):
            return json.load(open(file_path))
        mapping = dict()
        for line in open(file_path).readlines():
            parent_label, original_label, new_label = line.replace('\n', '').split('\t')
            if parent_label == '*':
                mapping[original_label] = new_label
            else:
                mapping[(parent_label, original_label)] = new_label
        return mapping
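A hedged end-to-end sketch of driving the predictor, not taken from this commit: the archive name matches the model.mod.tar.gz shipped at the repo root, while mapping.tsv is an invented file in the parent<TAB>original<TAB>new format that read_ontology_mapping parses ('*' acts as a wildcard parent):

# Hypothetical driver script; paths and the sample sentence are illustrative.
import sftp  # registers the model, dataset reader, and predictor with AllenNLP
from allennlp.models.archival import load_archive
from sftp.predictor import SpanPredictor

archive = load_archive('model.mod.tar.gz')
predictor = SpanPredictor.from_archive(archive, 'span')
mapping = SpanPredictor.read_ontology_mapping('mapping.tsv')  # or None to keep the native ontology

result = predictor.predict_sentence('Bob bought a car from Alice.', ontology_mapping=mapping, output_format='json')
print(result.sentence)  # tokenized input
print(result.span)      # nested span/label structure as JSON-style dicts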
sftp/training/__init__.py
ADDED
File without changes
sftp/training/transformer_optimizer.py
ADDED
@@ -0,0 +1,121 @@
import logging
import re
from typing import *

import torch
from allennlp.common.from_params import Params, T
from allennlp.training.optimizers import Optimizer

logger = logging.getLogger('optim')


@Optimizer.register('transformer')
class TransformerOptimizer:
    """
    Wrapper for an AllenNLP optimizer.
    This is used to fine-tune the pretrained transformer with some layers fixed and with different learning rates.
    When some layers are fixed, the wrapper will set the `requires_grad` flag to False, which could save
    training time and optimize memory usage.
    Please contact Guanghui Qin about bugs.
    Params:
        base: base optimizer.
        embeddings_lr: learning rate for the embedding layer. Set to 0.0 to fix it.
        encoder_lr: learning rate for the encoder layers. Set to 0.0 to fix them.
        pooler_lr: learning rate for the pooler layer. Set to 0.0 to fix it.
        layer_fix: the number of encoder layers that should be fixed.

    Example json config:

    1. No-op. Do nothing (why do you use me?)
        optimizer: {
            type: "transformer",
            base: {
                type: "adam",
                lr: 0.001
            }
        }

    2. Fix everything in the transformer.
        optimizer: {
            type: "transformer",
            base: {
                type: "adam",
                lr: 0.001
            },
            embeddings_lr: 0.0,
            encoder_lr: 0.0,
            pooler_lr: 0.0
        }

    Or equivalently (suppose we have 24 layers)

        optimizer: {
            type: "transformer",
            base: {
                type: "adam",
                lr: 0.001
            },
            embeddings_lr: 0.0,
            layer_fix: 24,
            pooler_lr: 0.0
        }

    3. Fix the embeddings and the lower 12 encoder layers, and set a small learning rate
       for the other parts of the transformer

        optimizer: {
            type: "transformer",
            base: {
                type: "adam",
                lr: 0.001
            },
            embeddings_lr: 0.0,
            layer_fix: 12,
            encoder_lr: 1e-5,
            pooler_lr: 1e-5
        }
    """
    @classmethod
    def from_params(
            cls: Type[T],
            params: Params,
            model_parameters: List[Tuple[str, torch.nn.Parameter]],
            **_
    ):
        param_groups = list()

        def remove_param(keyword_):
            nonlocal model_parameters
            logger.info(f'Fix param with name matching {keyword_}.')
            for name, param in model_parameters:
                if keyword_ in name:
                    logger.debug(f'Fix param {name}.')
                    param.requires_grad_(False)
            model_parameters = list(filter(lambda x: keyword_ not in x[0], model_parameters))

        for i_layer in range(params.pop('layer_fix')):
            remove_param('transformer_model.encoder.layer.{}.'.format(i_layer))

        for specific_lr, keyword in (
                (params.pop('embeddings_lr', None), 'transformer_model.embeddings'),
                (params.pop('encoder_lr', None), 'transformer_model.encoder.layer'),
                (params.pop('pooler_lr', None), 'transformer_model.pooler'),
        ):
            if specific_lr is not None:
                if specific_lr > 0.:
                    pattern = '.*' + keyword.replace('.', r'\.') + '.*'
                    if len([name for name, _ in model_parameters if re.match(pattern, name)]) > 0:
                        param_groups.append([[pattern], {'lr': specific_lr}])
                    else:
                        logger.warning(f'{pattern} is set to use lr {specific_lr} but no param matches.')
                else:
                    remove_param(keyword)

        if 'parameter_groups' in params:
            for pg in params.pop('parameter_groups'):
                param_groups.append([pg[0], pg[1].as_dict()])

        return Optimizer.by_name(params.get('base').pop('type'))(
            model_parameters=model_parameters, parameter_groups=param_groups,
            **params.pop('base').as_flat_dict()
        )
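Note: a usage sketch for the wrapper (not part of the commit; `model_parameters` stands in for the (name, Parameter) list that AllenNLP passes to optimizers). It freezes the embeddings and the lower 12 layers, and trains the remaining transformer parts with a reduced learning rate:

from allennlp.common import Params

params = Params({
    'base': {'type': 'adam', 'lr': 1e-3},
    'layer_fix': 12,
    'embeddings_lr': 0.0,
    'encoder_lr': 1e-5,
    'pooler_lr': 1e-5,
})
optimizer = TransformerOptimizer.from_params(params, model_parameters=model_parameters)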
sftp/utils/__init__.py
ADDED
@@ -0,0 +1,7 @@
import sftp.utils.label_smoothing
from sftp.utils.common import VIRTUAL_ROOT, DEFAULT_SPAN, BIO
from sftp.utils.db_storage import Cache
from sftp.utils.functions import num2mask, mask2idx, numpy2torch, one_hot, max_match
from sftp.utils.span import Span, re_index_span
from sftp.utils.span_utils import tensor2span
from sftp.utils.bio_smoothing import BIOSmoothing, apply_bio_smoothing
sftp/utils/bio_smoothing.py
ADDED
@@ -0,0 +1,62 @@
from typing import *

import numpy as np
from .common import BIO


class BIOSmoothing:
    def __init__(
            self,
            b_smooth: float = 0.0,
            i_smooth: float = 0.0,
            o_smooth: float = 0.0,
            weight: float = 1.0
    ):
        self.smooth = [b_smooth, i_smooth, o_smooth]
        self.weight = weight

    def apply_sequence(self, sequence: List[str]):
        bio_tags = np.zeros([len(sequence), 3], np.float32)
        for i, tag in enumerate(sequence):
            bio_tags[i] = self.apply_tag(tag)
        return bio_tags

    def apply_tag(self, tag: str):
        j = BIO.index(tag)
        ret = np.zeros([3], np.float32)
        if self.smooth[j] >= 0.0:
            # Smooth
            ret[j] = 1.0 - self.smooth[j]
            for j_ in set(range(3)) - {j}:
                ret[j_] = self.smooth[j] / 2
        else:
            # Marginalize
            ret[:] = 1.0

        return ret * self.weight

    def __repr__(self):
        ret = f'<W={self.weight:.2f}'
        for j, tag in enumerate(BIO):
            if self.smooth[j] != 0.0:
                if self.smooth[j] < 0:
                    ret += f' [marginalize {tag}]'
                else:
                    ret += f' [smooth {tag} by {self.smooth[j]:.2f}]'
        return ret + '>'

    def clone(self):
        return BIOSmoothing(*self.smooth, self.weight)


def apply_bio_smoothing(
        config: Optional[Union[BIOSmoothing, List[BIOSmoothing]]],
        bio_seq: List[str]
) -> np.ndarray:
    if config is None:
        config = BIOSmoothing()
    if isinstance(config, BIOSmoothing):
        return config.apply_sequence(bio_seq)
    else:
        assert len(bio_seq) == len(config)
        return np.stack([cfg.apply_tag(tag) for cfg, tag in zip(config, bio_seq)])
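Note: to make the smoothing behavior concrete, a small sketch (values chosen for illustration only):

import numpy as np
from sftp.utils.bio_smoothing import BIOSmoothing

smoother = BIOSmoothing(b_smooth=0.1, i_smooth=0.1)  # smooth B and I, keep O hard
soft = smoother.apply_sequence(['B', 'I', 'O'])
# soft[0] == [0.9, 0.05, 0.05]   (0.1 of the probability mass moved off 'B')
# soft[2] == [0.0, 0.0, 1.0]     (o_smooth defaults to 0.0, so 'O' stays one-hot)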
sftp/utils/common.py
ADDED
@@ -0,0 +1,3 @@
DEFAULT_SPAN = '@@SPAN@@'
VIRTUAL_ROOT = '@@VIRTUAL_ROOT@@'
BIO = 'BIO'
sftp/utils/db_storage.py
ADDED
@@ -0,0 +1,87 @@
import pickle
import warnings

import h5py
import numpy as np


class Cache:
    def __init__(self, file: str, mode: str = 'a', overwrite=False):
        self.db_file = h5py.File(file, mode=mode)
        self.overwrite = overwrite

    @staticmethod
    def _key(key):
        if isinstance(key, str):
            return key
        elif isinstance(key, list):
            ret = []
            for k in key:
                ret.append(Cache._key(k))
            return ' '.join(ret)
        else:
            return str(key)

    @staticmethod
    def _value(value: np.ndarray):
        if isinstance(value, h5py.Dataset):
            value: np.ndarray = value[()]
        if value.dtype.name.startswith('bytes'):
            value = pickle.loads(value)
        return value

    def __getitem__(self, key):
        key = self._key(key)
        if key not in self:
            raise KeyError
        return self._value(self.db_file[key])

    def __setitem__(self, key, value) -> None:
        key = self._key(key)
        if key in self:
            del self.db_file[key]
        if not isinstance(value, np.ndarray):
            value = np.array(pickle.dumps(value))
        self.db_file[key] = value

    def __delitem__(self, key) -> None:
        key = self._key(key)
        if key in self:
            del self.db_file[key]

    def __len__(self) -> int:
        return len(self.db_file)

    def close(self) -> None:
        self.db_file.close()

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()

    def __contains__(self, item):
        item = self._key(item)
        return item in self.db_file

    def __enter__(self):
        return self

    def __call__(self, function):
        """
        An object of the class can also be used as a decorator. Provide an additional
        argument `cache_id' when calling the function, and the results will be cached.
        """

        def wrapper(*args, **kwargs):
            if 'cache_id' in kwargs:
                cache_id = kwargs['cache_id']
                del kwargs['cache_id']
                if cache_id in self and not self.overwrite:
                    return self[cache_id]
                rst = function(*args, **kwargs)
                self[cache_id] = rst
                return rst
            else:
                warnings.warn("`cache_id' argument not found. Cache is disabled.")
                return function(*args, **kwargs)

        return wrapper
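Note: a usage sketch for the decorator form (the file name and function below are hypothetical, not from this commit):

import numpy as np

with Cache('features.h5') as cache:

    @cache
    def embed(sentence):
        # stands in for an expensive computation
        return np.zeros(3)

    vec = embed('Hello world', cache_id='sent-0')  # computed, then stored under 'sent-0'
    vec = embed('Hello world', cache_id='sent-0')  # now read back from the HDF5 file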
sftp/utils/functions.py
ADDED
@@ -0,0 +1,75 @@
from typing import *

import numpy as np
import torch
from scipy.optimize import linear_sum_assignment
from torch.nn.utils.rnn import pad_sequence


def num2mask(
        nums: torch.Tensor,
        max_length: Optional[int] = None
) -> torch.Tensor:
    """
    E.g. input a tensor [2, 3, 4], return [[T T F F], [T T T F], [T T T T]]
    :param nums: Shape [batch]
    :param max_length: Maximum length. If not provided, will choose the largest number from nums.
    :return: 2D binary mask.
    """
    shape_backup = nums.shape
    nums = nums.flatten()
    max_length = max_length or int(nums.max())
    batch_size = len(nums)
    range_nums = torch.arange(0, max_length, device=nums.device).unsqueeze(0).expand([batch_size, max_length])
    ret = (range_nums.T < nums).T
    return ret.reshape(*shape_backup, max_length)


def mask2idx(
        mask: torch.Tensor,
        max_length: Optional[int] = None,
        padding_value: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    E.g. input a tensor [[T T F F], [T T T F], [F F F T]] with padding value -1,
    return [[0, 1, -1], [0, 1, 2], [3, -1, -1]]
    :param mask: Mask tensor. Boolean. Not necessarily 2D.
    :param max_length: If provided, will truncate.
    :param padding_value: Padding value. Defaults to 0.
    :return: Index tensor and the number of True entries per row.
    """
    shape_prefix, mask_length = mask.shape[:-1], mask.shape[-1]
    flat_mask = mask.flatten(0, -2)
    index_list = [torch.arange(mask_length, device=mask.device)[one_mask] for one_mask in flat_mask.unbind(0)]
    index_tensor = pad_sequence(index_list, batch_first=True, padding_value=padding_value)
    if max_length is not None:
        index_tensor = index_tensor[:, :max_length]
    index_tensor = index_tensor.reshape(*shape_prefix, -1)
    return index_tensor, mask.sum(-1)


def one_hot(tags: torch.Tensor, num_tags: Optional[int] = None) -> torch.Tensor:
    num_tags = num_tags or int(tags.max())
    ret = tags.new_zeros(size=[*tags.shape, num_tags], dtype=torch.bool)
    ret.scatter_(2, tags.unsqueeze(2), tags.new_ones([*tags.shape, 1], dtype=torch.bool))
    return ret


def numpy2torch(
        dict_obj: dict
) -> dict:
    """
    Convert list/np.ndarray data to torch.Tensor and add a batch dim.
    """
    ret = dict()
    for k, v in dict_obj.items():
        if isinstance(v, list) or isinstance(v, np.ndarray):
            ret[k] = torch.tensor(v).unsqueeze(0)
        else:
            ret[k] = v
    return ret


def max_match(mat: np.ndarray):
    row_idx, col_idx = linear_sum_assignment(mat, True)
    return mat[row_idx, col_idx].sum()
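Note: a round-trip sketch of the two mask helpers above (default padding value 0):

import torch

mask = num2mask(torch.tensor([2, 3, 4]))
# [[T, T, F, F],
#  [T, T, T, F],
#  [T, T, T, T]]
idx, lengths = mask2idx(mask)
# idx == [[0, 1, 0, 0], [0, 1, 2, 0], [0, 1, 2, 3]]   (0 is the padding value)
# lengths == [2, 3, 4]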
sftp/utils/label_smoothing.py
ADDED
@@ -0,0 +1,48 @@
import torch
from torch import nn
from torch.nn import KLDivLoss
from torch.nn import LogSoftmax


class LabelSmoothingLoss(nn.Module):
    def __init__(self, label_smoothing=0.0, unreliable_label=None, ignore_index=-100):
        """
        If label_smoothing == 0.0, it is equivalent to cross-entropy.
        """
        assert 0.0 <= label_smoothing <= 1.0
        super(LabelSmoothingLoss, self).__init__()

        self.ignore_index = ignore_index
        self.label_smoothing = label_smoothing

        self.loss_fn = KLDivLoss(reduction='batchmean')
        self.unreliable_label = unreliable_label
        self.max_gap = 100.
        self.log_softmax = LogSoftmax(1)

    def forward(self, output, target):
        """
        output: logits
        target: labels
        """
        vocab_size = output.shape[1]
        mask = (target != self.ignore_index)
        output, target = output[mask], target[mask]
        output = self.log_softmax(output)

        def get_smooth_prob(ls):
            smoothing_value = ls / (vocab_size - 1)
            prob = output.new_full((target.size(0), vocab_size), smoothing_value)
            prob.scatter_(1, target.unsqueeze(1), 1 - ls)
            return prob

        if self.unreliable_label is not None:
            smoothed_prob = get_smooth_prob(self.label_smoothing)
            hard_prob = get_smooth_prob(0.0)
            unreliable_mask = (target == self.unreliable_label).to(torch.float)
            model_prob = ((smoothed_prob.T * unreliable_mask) + (hard_prob.T * (1 - unreliable_mask))).T
        else:
            model_prob = get_smooth_prob(self.label_smoothing)

        loss = self.loss_fn(output, model_prob)
        return loss
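Note: a minimal sketch of how the loss is called (shapes are illustrative):

import torch

loss_fn = LabelSmoothingLoss(label_smoothing=0.1)
logits = torch.randn(8, 50)            # [batch, vocab]
targets = torch.randint(0, 50, (8,))   # gold indices; entries equal to ignore_index (-100) are masked out
loss = loss_fn(logits, targets)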
sftp/utils/span.py
ADDED
@@ -0,0 +1,420 @@
from typing import *

import numpy as np

from .common import VIRTUAL_ROOT, DEFAULT_SPAN
from .bio_smoothing import BIOSmoothing
from .functions import max_match


class Span:
    """
    Span is a simple data structure for a span (not necessarily associated with text), along with its label,
    children and possibly its parent and a confidence score.

    Basic usages (suppose span is a Span object):
        1. len(span) -- #children.
        2. span[i] -- i-th child.
        3. for s in span: ... -- iterate its children.
        4. for s in span.bfs(): ... -- iterate its descendants.
        5. print(span) -- show its description.
        6. span.tree() -- print the whole tree.

    It provides some utilities:
        1. Re-indexing. BPE will change token indices, and the `re_index` method can convert normal token
            indices to BPE word piece indices, or vice versa.
        2. Span object and span dict (JSON format) are mutually convertible (by `to_json` and `from_json` methods).
        3. Recursively truncate spans up to a given length. (see `truncate` method)
        4. Recursively replace all labels with the default label. (see `ignore_labels` method)
        5. Recursively solve the span overlapping problem by removing children overlapped with others.
            (see `remove_overlapping` method)
    """
    def __init__(
            self,
            start_idx: int,
            end_idx: int,
            label: Union[str, int, list] = DEFAULT_SPAN,
            is_parent: bool = False,
            parent: Optional["Span"] = None,
            confidence: Optional[float] = None,
    ):
        """
        Init function. Children should be added using the `add_child` method.
        :param start_idx: Start index in a seq of tokens, inclusive.
        :param end_idx: End index in a seq of tokens, inclusive.
        :param label: Label. If not provided, will assign a default label.
            Can be of various types: String, integer, or list of something.
        :param is_parent: If True, will be treated as a parent. This is important because in the training process
            of the BIO tagger, when a span has no children, we need to know if it's a parent with no children (so we
            should have a training example with all O tags) or not (then the above example doesn't exist).
            We follow a convention where if a span is not a parent, the key `children` shouldn't appear in its
            JSON dict; if a span is a parent but has no children, the key `children` in its JSON dict should appear
            and be an empty list.
        :param parent: A pointer to its parent.
        :param confidence: Confidence value.
        """
        self.start_idx, self.end_idx = start_idx, end_idx
        self.label: Union[int, str, list] = label
        self.is_parent = is_parent
        self.parent = parent
        self._children: List[Span] = list()
        self.confidence = confidence

        # Following are for label smoothing. Leave the defaults if you don't need smoothing.
        # Logic:
        # The label smoothing factors of a span (i.e. b_smooth, i_smooth, o_smooth) depend on the `child_smooth`
        # of its parent. The re-weighting factor of a span also depends on the `child_smooth` of its parent,
        # but can be overridden by its own `smooth_weight` field if it's not None.
        self.child_smooth: BIOSmoothing = BIOSmoothing()
        self.smooth_weight: Optional[float] = None

    def add_child(self, span: "Span") -> "Span":
        """
        Add a span to the children list. Will link the current span to the child's parent pointer.
        :param span: Child span.
        """
        assert self.is_parent
        span.parent = self
        self._children.append(span)
        return self

    def re_index(
            self,
            offsets: List[Optional[Tuple[int, int]]],
            reverse: bool = False,
            recursive: bool = True,
            inplace: bool = False,
    ) -> "Span":
        """
        BPE will change token indices, and the `re_index` method can convert normal token indices to BPE word
        piece indices, or vice versa.
        We assume the Virtual Root has a boundary [-1, -1] before being mapped to the BPE space, and a boundary
        [0, 0] after the re-indexing. We use [0, 0] because it's always the BOS token in BPE.
        Mapping to the BPE space is straightforward. The reverse mapping has special cases where the span might
        contain BOS or EOS. Usually this is a parsing bug. We will map the BOS index to 0, and the EOS index to -1.
        :param offsets: Offsets. Defined by the BPE tokenizer; resides in the SpanFinder outputs.
        :param reverse: If True, map from the BPE space to the original token space.
        :param recursive: If True, will apply the re-indexing to its children.
        :param inplace: Inplace?
        :return: Re-indexed span.
        """
        span = self if inplace else self.clone()

        span.start_idx, span.end_idx = re_index_span(span.boundary, offsets, reverse)
        if recursive:
            new_children = list()
            for child in span._children:
                new_children.append(child.re_index(offsets, reverse, recursive, True))
            span._children = new_children
        return span

    def truncate(self, max_length: int) -> bool:
        """
        Discard spans whose end_idx exceeds the max_length (inclusive).
        This is done recursively.
        This is useful for some encoders like XLMR that have a limit on input length. (512 for XLMR large)
        :param max_length: Max length.
        :return: You don't need to care about the return value.
        """
        if self.end_idx >= max_length:
            return False
        else:
            self._children = list(filter(lambda x: x.truncate(max_length), self._children))
            return True

    @classmethod
    def virtual_root(cls, spans: Optional[List["Span"]] = None) -> "Span":
        """
        The official way to create a tree: Generate the first layer of spans by yourself, and pass them into this
        method.
        E.g., for an SRL-style task, generate a list of events and assign arguments to them as children. Then pass
        the events to this method to get a virtual root which serves as the parent of the events.
        :param spans: 1st layer spans.
        :return: Virtual root.
        """
        vr = Span(-1, -1, VIRTUAL_ROOT, True)
        if spans is not None:
            vr._children = spans
        for child in vr._children:
            child.parent = vr
        return vr

    def ignore_labels(self) -> None:
        """
        Remove all labels. Make them placeholders. Inplace.
        """
        self.label = DEFAULT_SPAN
        for child in self._children:
            child.ignore_labels()

    def clone(self) -> "Span":
        """
        Clone a tree.
        :return: Cloned tree.
        """
        span = Span(self.start_idx, self.end_idx, self.label, self.is_parent, self.parent, self.confidence)
        span.child_smooth, span.smooth_weight = self.child_smooth, self.smooth_weight
        for child in self._children:
            span.add_child(child.clone())
        return span

    def bfs(self) -> Iterable["Span"]:
        """
        Iterate over all descendants with BFS, including self.
        :return: Spans.
        """
        yield self
        yield from self._bfs()

    def _bfs(self) -> List["Span"]:
        """
        Helper function.
        """
        for child in self._children:
            yield child
        for child in self._children:
            yield from child._bfs()

    def remove_overlapping(self, recursive=True) -> int:
        """
        Remove overlapping spans. If spans overlap, will pick the first one (judged by start_idx) and discard
        the others.
        :param recursive: Apply to all of the descendants?
        :return: The number of spans that are removed.
        """
        indices = set()
        new_children = list()
        removing = 0
        for child in self._children:
            if len(set(range(child.start_idx, child.end_idx + 1)) & indices) > 0:
                removing += 1
                continue
            indices.update(set(range(child.start_idx, child.end_idx + 1)))
            new_children.append(child)
            if recursive:
                removing += child.remove_overlapping(True)
        self._children = new_children
        return removing

    def describe(self, sentence: Optional[List[str]] = None) -> str:
        """
        :param sentence: If provided, will replace the indices with real tokens for presentation.
        :return: The description in a single line.
        """
        if self.start_idx >= 0:
            if sentence is None:
                span = f'({self.start_idx}, {self.end_idx})'
            else:
                span = '(' + ' '.join(sentence[self.start_idx: self.end_idx + 1]) + ')'
            if self.is_parent:
                return f'<Span: {span}, {self.label}, {len(self._children)} children>'
            else:
                return f'[Span: {span}, {self.label}]'
        else:
            return f'<Span Annotation: {self.n_nodes - 1} descendants>'

    def __repr__(self) -> str:
        return self.describe()

    @property
    def n_nodes(self) -> int:
        """
        :return: Number of descendants + self.
        """
        return sum([child.n_nodes for child in self._children], 1)

    @property
    def boundary(self):
        """
        :return: (start_idx, end_idx), both inclusive.
        """
        return self.start_idx, self.end_idx

    def __iter__(self) -> Iterable["Span"]:
        """
        Iterate over children.
        """
        yield from self._children

    def __len__(self):
        """
        :return: #children.
        """
        return len(self._children)

    def __getitem__(self, idx: int):
        """
        :return: The indexed child.
        """
        return self._children[idx]

    def tree(self, sentence: Optional[List[str]] = None, printing: bool = True) -> str:
        """
        A tree description of all descendants. Human readable.
        :param sentence: If provided, will replace the indices with real tokens for presentation.
        :param printing: If True, will print out.
        :return: The description.
        """
        ret = list()
        ret.append(self.describe(sentence))
        for child in self._children:
            child_lines = child.tree(sentence, False).split('\n')
            for line in child_lines:
                ret.append('  ' + line)
        desc = '\n'.join(ret)
        if printing:
            print(desc)
        return desc

    def match(
            self,
            other: "Span",
            match_label: bool = True,
            depth: int = -1,
            ignore_parent_boundary: bool = False,
    ) -> int:
        """
        Used for evaluation. Count how many spans two trees share. Two spans are considered identical
        if their boundary, label, and parent match.
        :param other: The other tree to compare against.
        :param match_label: If False, will ignore labels.
        :param depth: If specified as non-negative, will only search through the given depth.
        :param ignore_parent_boundary: If True, two children can be matched ignoring parent boundaries.
        :return: #spans the two trees share.
        """
        if depth == 0:
            return 0
        if self.label != other.label and match_label:
            return 0
        if self.boundary == other.boundary:
            n_match = 1
        elif ignore_parent_boundary:
            # Parents fail, but children might still match!
            n_match = 0
        else:
            return 0

        sub_matches = np.zeros([len(self), len(other)], dtype=int)
        for self_idx, my_child in enumerate(self):
            for other_idx, other_child in enumerate(other):
                sub_matches[self_idx, other_idx] = my_child.match(
                    other_child, match_label, depth - 1, ignore_parent_boundary
                )
        if not ignore_parent_boundary:
            for m in [sub_matches, sub_matches.T]:
                for line in m:
                    assert (line > 0).sum() <= 1
        n_match += max_match(sub_matches)
        return n_match

    def to_json(self) -> dict:
        """
        To JSON dict format. See init.
        """
        ret = {
            "label": self.label,
            "span": list(self.boundary),
        }
        if self.confidence is not None:
            ret['confidence'] = self.confidence
        if self.is_parent:
            children = list()
            for child in self._children:
                children.append(child.to_json())
            ret['children'] = children
        return ret

    @classmethod
    def from_json(cls, span_json: Union[list, dict]) -> "Span":
        """
        Load from JSON. See init.
        """
        if isinstance(span_json, dict):
            span = Span(
                span_json['span'][0], span_json['span'][1], span_json.get('label', None), 'children' in span_json,
                confidence=span_json.get('confidence', None)
            )
            for child_dict in span_json.get('children', []):
                span.add_child(Span.from_json(child_dict))
        else:
            spans = [Span.from_json(child) for child in span_json]
            span = Span.virtual_root(spans)
        return span

    def map_ontology(
            self,
            ontology_mapping: Optional[dict] = None,
            inplace: bool = True,
            recursive: bool = True,
    ) -> Optional["Span"]:
        """
        Map labels to other things, like another ontology or soft labels.
        :param ontology_mapping: Mapping dict. The keys should be labels, and the values can be anything.
            Spans whose labels are not in the dict will be dropped. So be careful.
        :param inplace: Inplace?
        :param recursive: Apply to all descendants if True.
        :return: The mapped tree.
        """
        span = self if inplace else self.clone()
        if ontology_mapping is None:
            # Do nothing if no mapping is provided.
            return span

        if recursive:
            new_children = list()
            for child in span:
                new_child = child.map_ontology(ontology_mapping, False, True)
                if new_child is not None:
                    new_children.append(new_child)
            span._children = new_children

        if span.label != VIRTUAL_ROOT:
            if span.parent is not None and (span.parent.label, span.label) in ontology_mapping:
                span.label = ontology_mapping[(span.parent.label, span.label)]
            elif span.label in ontology_mapping:
                span.label = ontology_mapping[span.label]
            else:
                return

        return span

    def isolate(self) -> "Span":
        """
        Generate a span that is identical to self but has no children or parent.
        """
        return Span(self.start_idx, self.end_idx, self.label, self.is_parent, None, self.confidence)

    def remove_child(self, span: Optional["Span"] = None):
        """
        Remove a child. If None is passed, will reset the children list.
        """
        if span is None:
            self._children = list()
        else:
            del self._children[self._children.index(span)]


def re_index_span(
        boundary: Tuple[int, int], offsets: List[Tuple[int, int]], reverse: bool = False
) -> Tuple[int, int]:
    """
    Helper function.
    """
    if not reverse:
        if boundary[0] == boundary[1] == -1:
            # Virtual Root
            start_idx = end_idx = 0
        else:
            start_idx = offsets[boundary[0]][0]
            end_idx = offsets[boundary[1]][1]
    else:
        if boundary[0] == boundary[1] == 0:
            # Virtual Root
            start_idx = end_idx = -1
        else:
            start_within = [bo[0] <= boundary[0] <= bo[1] if bo is not None else False for bo in offsets]
            end_within = [bo[0] <= boundary[1] <= bo[1] if bo is not None else False for bo in offsets]
            assert sum(start_within) <= 1 and sum(end_within) <= 1
            start_idx = start_within.index(True) if sum(start_within) == 1 else 0
            end_idx = end_within.index(True) if sum(end_within) == 1 else len(offsets)
    if start_idx > end_idx:
        raise IndexError
    return start_idx, end_idx
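Note: to make the data structure concrete, a small sketch (indices and labels are made up for illustration):

event = Span(3, 3, 'Arriving', is_parent=True)
event.add_child(Span(0, 1, 'Theme')).add_child(Span(5, 7, 'Goal'))
root = Span.virtual_root([event])
root.tree()                      # prints the annotation tree
data = root.to_json()            # round-trips through the JSON dict format
assert Span.from_json(data['children']).n_nodes == root.n_nodes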
sftp/utils/span_utils.py
ADDED
@@ -0,0 +1,57 @@
from typing import *

import torch

from .span import Span


def _tensor2span_batch(
        span_boundary: torch.Tensor,
        span_labels: torch.Tensor,
        parent_indices: torch.Tensor,
        num_spans: torch.Tensor,
        label_confidence: torch.Tensor,
        idx2label: Dict[int, str],
        label_ignore: List[int],
) -> Span:
    spans = list()
    for (start_idx, end_idx), parent_idx, label, label_conf in \
            list(zip(span_boundary, parent_indices, span_labels, label_confidence))[:int(num_spans)]:
        if label not in label_ignore:
            span = Span(int(start_idx), int(end_idx), idx2label[int(label)], True, confidence=float(label_conf))
            if int(parent_idx) < len(spans):
                spans[int(parent_idx)].add_child(span)
            spans.append(span)
    return spans[0]


def tensor2span(
        span_boundary: torch.Tensor,
        span_labels: torch.Tensor,
        parent_indices: torch.Tensor,
        num_spans: torch.Tensor,
        label_confidence: torch.Tensor,
        idx2label: Dict[int, str],
        label_ignore: Optional[List[int]] = None,
) -> List[Span]:
    """
    Generate Span trees from vectors. Refer to the model part for the meaning of these variables.
    If `label_ignore` is provided, spans with those label indices will be ignored.
    :return: One Span tree per batch element.
    """
    label_ignore = label_ignore or []
    if span_boundary.device.type != 'cpu':
        span_boundary = span_boundary.to(device='cpu')
        parent_indices = parent_indices.to(device='cpu')
        span_labels = span_labels.to(device='cpu')
        num_spans = num_spans.to(device='cpu')
        label_confidence = label_confidence.to(device='cpu')

    ret = list()
    for args in zip(
            span_boundary.unbind(0), span_labels.unbind(0), parent_indices.unbind(0), num_spans.unbind(0),
            label_confidence.unbind(0),
    ):
        ret.append(_tensor2span_batch(*args, label_ignore=label_ignore, idx2label=idx2label))

    return ret
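Note: a shape sketch for the function above (hypothetical values; batch of one, where the second span attaches to the first, the virtual root):

import torch

spans = tensor2span(
    span_boundary=torch.tensor([[[0, 0], [2, 4]]]),
    span_labels=torch.tensor([[0, 1]]),
    parent_indices=torch.tensor([[0, 0]]),
    num_spans=torch.tensor([2]),
    label_confidence=torch.tensor([[1.0, 0.9]]),
    idx2label={0: '@@VIRTUAL_ROOT@@', 1: 'Arriving'},
)
# spans[0] is the virtual-root Span; spans[0][0] is the 'Arriving' span over (2, 4).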
sociolome/combine_models.py
ADDED
@@ -0,0 +1,130 @@
from typing import Any, Dict, List, Optional
import dataclasses
import glob
import os
import sys
import json

import spacy
from spacy.language import Language

from sftp import SpanPredictor


@dataclasses.dataclass
class FrameAnnotation:
    tokens: List[str] = dataclasses.field(default_factory=list)
    pos: List[str] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class MultiLabelAnnotation(FrameAnnotation):
    frame_list: List[List[str]] = dataclasses.field(default_factory=list)
    lu_list: List[Optional[str]] = dataclasses.field(default_factory=list)

    def to_txt(self):
        for i, tok in enumerate(self.tokens):
            yield f"{tok} {self.pos[i]} {'|'.join(self.frame_list[i]) or '_'} {self.lu_list[i] or '_'}"


def convert_to_seq_labels(sentence: List[str], structures: Dict[int, Dict[str, Any]]) -> List[List[str]]:
    labels = [[] for _ in sentence]

    for struct_id, struct in structures.items():
        tgt_span = struct["target"]
        frame = struct["frame"]

        for i in range(tgt_span[0], tgt_span[1] + 1):
            labels[i].append(f"T:{frame}@{struct_id:02}")
        for role in struct["roles"]:
            role_span = role["boundary"]
            role_label = role["label"]
            for i in range(role_span[0], role_span[1] + 1):
                prefix = "B" if i == role_span[0] else "I"
                labels[i].append(f"{prefix}:{frame}:{role_label}@{struct_id:02}")
    return labels


def predict_combined(
    spacy_model: Language,
    sentences: List[str],
    tgt_predictor: SpanPredictor,
    frm_predictor: SpanPredictor,
    bnd_predictor: SpanPredictor,
    arg_predictor: SpanPredictor,
) -> List[MultiLabelAnnotation]:

    annotations_out = []

    for sent_idx, sent in enumerate(sentences):

        sent = sent.strip()

        print(f"Processing sent with idx={sent_idx}: {sent}")

        doc = spacy_model(sent)
        sent_tokens = [t.text for t in doc]

        tgt_spans, _, _ = tgt_predictor.force_decode(sent_tokens)

        frame_structures = {}

        for i, span in enumerate(tgt_spans):
            span = tuple(span)
            _, fr_labels, _ = frm_predictor.force_decode(sent_tokens, child_spans=[span])
            frame = fr_labels[0]
            if frame == "@@VIRTUAL_ROOT@@":
                continue

            boundaries, _, _ = bnd_predictor.force_decode(sent_tokens, parent_span=span, parent_label=frame)
            _, arg_labels, _ = arg_predictor.force_decode(sent_tokens, parent_span=span, parent_label=frame, child_spans=boundaries)

            frame_structures[i] = {
                "target": span,
                "frame": frame,
                "roles": [
                    {"boundary": bnd, "label": label}
                    for bnd, label in zip(boundaries, arg_labels)
                    if label != "Target"
                ]
            }
        annotations_out.append(MultiLabelAnnotation(
            tokens=sent_tokens,
            pos=[t.pos_ for t in doc],
            frame_list=convert_to_seq_labels(sent_tokens, frame_structures),
            lu_list=[None for _ in sent_tokens]
        ))
    return annotations_out


def main(input_folder):

    print("Loading spaCy model ...")
    nlp = spacy.load("it_core_news_md")

    print("Loading predictors ...")
    zs_predictor = SpanPredictor.from_path("/data/p289731/cloned/lome-models/models/spanfinder/model.mod.tar.gz", cuda_device=0)
    ev_predictor = SpanPredictor.from_path("/scratch/p289731/lome-training-files/train-evalita-plus-fn-vanilla/model.tar.gz", cuda_device=0)

    print("Reading input files ...")
    for file in glob.glob(os.path.join(input_folder, "*.txt")):
        print(file)
        with open(file, encoding="utf-8") as f:
            sentences = list(f)

        annotations = predict_combined(nlp, sentences, zs_predictor, ev_predictor, ev_predictor, ev_predictor)

        out_name = os.path.splitext(os.path.basename(file))[0]
        with open(f"../../data-out/{out_name}.combined_zs_ev.tc_bilstm.txt", "w", encoding="utf-8") as f_out:
            for ann in annotations:
                for line in ann.to_txt():
                    f_out.write(line + os.linesep)
                f_out.write(os.linesep)

        with open(f"../../data-out/{out_name}.combined_zs_ev.tc_bilstm.json", "w", encoding="utf-8") as f_out:
            json.dump([dataclasses.asdict(ann) for ann in annotations], f_out)


if __name__ == "__main__":
    main(sys.argv[1])
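Note: an illustration (hypothetical sentence and frame) of the label scheme emitted by `convert_to_seq_labels`: target tokens get `T:<frame>@<id>`, role tokens get BIO-style `B/I:<frame>:<role>@<id>` tags, and a token may carry several tags at once:

structures = {0: {
    "target": (1, 1),
    "frame": "Motion",
    "roles": [{"boundary": (0, 0), "label": "Theme"}],
}}
labels = convert_to_seq_labels(["She", "moved", "."], structures)
# labels == [["B:Motion:Theme@00"], ["T:Motion@00"], []]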
sociolome/evalita_eval.py
ADDED
@@ -0,0 +1,319 @@
import json
from typing import List, Tuple

import pandas as pd

from sftp import SpanPredictor


def main():
    # data_file = "/home/p289731/cloned/lome/preproc/evalita_jsonl/evalita_dev.jsonl"
    # data_file = "/home/p289731/cloned/lome/preproc/svm_challenge.jsonl"
    data_file = "/home/p289731/cloned/lome/preproc/evalita_jsonl/evalita_test.jsonl"
    models = [
        (
            "lome-en",
            "/data/p289731/cloned/lome-models/models/spanfinder/model.mod.tar.gz",
        ),
        (
            "lome-it-best",
            "/scratch/p289731/lome-training-files/train-evalita-plus-fn-vanilla/model.tar.gz",
        ),
        # (
        #     "lome-it-freeze",
        #     "/data/p289731/cloned/lome/train-evalita-plus-fn-freeze/model.tar.gz",
        # ),
        # (
        #     "lome-it-mono",
        #     "/data/p289731/cloned/lome/train-evalita-it_mono/model.tar.gz",
        # ),
    ]

    for (model_name, model_path) in models:
        print("testing model: ", model_name)
        predictor = SpanPredictor.from_path(model_path)

        print("=== FD (run 1) ===")
        eval_frame_detection(data_file, predictor, model_name=model_name)

        for run in [1, 2]:
            print(f"=== BD (run {run}) ===")
            eval_boundary_detection(data_file, predictor, run=run)

        for run in [1, 2, 3]:
            print(f"=== AC (run {run}) ===")
            eval_argument_classification(data_file, predictor, run=run)


def predict_frame(
    predictor: SpanPredictor, tokens: List[str], predicate_span: Tuple[int, int]
):
    _, labels, _ = predictor.force_decode(tokens, child_spans=[predicate_span])
    return labels[0]


def eval_frame_detection(data_file, predictor, verbose=False, model_name="_"):

    true_pos = 0
    false_pos = 0

    out = []

    with open(data_file, encoding="utf-8") as f:
        for sent_id, sent in enumerate(f):
            sent_data = json.loads(sent)

            tokens = sent_data["tokens"]
            annotation = sent_data["annotations"][0]

            predicate_span = tuple(annotation["span"])
            predicate = tokens[predicate_span[0] : predicate_span[1] + 1]

            frame_gold = annotation["label"]
            frame_pred = predict_frame(predictor, tokens, predicate_span)

            if frame_pred == frame_gold:
                true_pos += 1
            else:
                false_pos += 1

            out.append({
                "sentence": " ".join(tokens),
                "predicate": predicate,
                "frame_gold": frame_gold,
                "frame_pred": frame_pred
            })

            if verbose:
                print(f"Sentence #{sent_id:03}: {' '.join(tokens)}")
                print(f"\tpredicate: {predicate}")
                print(f"\t     gold: {frame_gold}")
                print(f"\tpredicted: {frame_pred}")
                print()

    acc_score = true_pos / (true_pos + false_pos)
    print("ACC =", acc_score)

    data_sect = "rai" if "svm_challenge" in data_file else "dev" if "dev" in data_file else "test"

    df_out = pd.DataFrame(out)
    df_out.to_csv(f"frame_prediction_output_{model_name}_{data_sect}.csv")


def predict_boundaries(predictor: SpanPredictor, tokens, predicate_span, frame):
    boundaries, labels, _ = predictor.force_decode(
        tokens, parent_span=predicate_span, parent_label=frame
    )
    out = []
    for bnd, lab in zip(boundaries, labels):
        bnd = tuple(bnd)
        if bnd == predicate_span and lab == "Target":
            continue
        out.append(bnd)
    return out


def get_gold_boundaries(annotation, predicate_span):
    return {
        tuple(c["span"])
        for c in annotation["children"]
        if not (tuple(c["span"]) == predicate_span and c["label"] == "Target")
    }


def eval_boundary_detection(data_file, predictor, run=1, verbose=False):

    assert run in [1, 2]

    true_pos = 0
    false_pos = 0
    false_neg = 0

    true_pos_tok = 0
    false_pos_tok = 0
    false_neg_tok = 0

    with open(data_file, encoding="utf-8") as f:
        for sent_id, sent in enumerate(f):
            sent_data = json.loads(sent)

            tokens = sent_data["tokens"]
            annotation = sent_data["annotations"][0]

            predicate_span = tuple(annotation["span"])
            predicate = tokens[predicate_span[0] : predicate_span[1] + 1]

            if run == 1:
                frame = predict_frame(predictor, tokens, predicate_span)
            else:
                frame = annotation["label"]

            boundaries_gold = get_gold_boundaries(annotation, predicate_span)
            boundaries_pred = set(
                predict_boundaries(predictor, tokens, predicate_span, frame)
            )

            sent_true_pos = len(boundaries_gold & boundaries_pred)
            sent_false_pos = len(boundaries_pred - boundaries_gold)
            sent_false_neg = len(boundaries_gold - boundaries_pred)
            true_pos += sent_true_pos
            false_pos += sent_false_pos
            false_neg += sent_false_neg

            boundary_toks_gold = {
                tok_idx
                for (start, stop) in boundaries_gold
                for tok_idx in range(start, stop + 1)
            }
            boundary_toks_pred = {
                tok_idx
                for (start, stop) in boundaries_pred
                for tok_idx in range(start, stop + 1)
            }
            sent_tok_true_pos = len(boundary_toks_gold & boundary_toks_pred)
            sent_tok_false_pos = len(boundary_toks_pred - boundary_toks_gold)
            sent_tok_false_neg = len(boundary_toks_gold - boundary_toks_pred)
            true_pos_tok += sent_tok_true_pos
            false_pos_tok += sent_tok_false_pos
            false_neg_tok += sent_tok_false_neg

            if verbose:
                print(f"Sentence #{sent_id:03}: {' '.join(tokens)}")
                print(f"\tpredicate: {predicate}")
                print(f"\t    frame: {frame}")
                print(f"\t     gold: {boundaries_gold}")
                print(f"\tpredicted: {boundaries_pred}")
                print(f"\ttp={sent_true_pos}\tfp={sent_false_pos}\tfn={sent_false_neg}")
                print(
                    f"\ttp_t={sent_tok_true_pos}\tfp_t={sent_tok_false_pos}\tfn_t={sent_tok_false_neg}"
                )
                print()

    prec = true_pos / (true_pos + false_pos)
    rec = true_pos / (true_pos + false_neg)
    f1_score = 2 * ((prec * rec) / (prec + rec))

    print(f"P/R/F=\n{prec}\t{rec}\t{f1_score}")

    tok_prec = true_pos_tok / (true_pos_tok + false_pos_tok)
    tok_rec = true_pos_tok / (true_pos_tok + false_neg_tok)
    tok_f1 = 2 * ((tok_prec * tok_rec) / (tok_prec + tok_rec))

    print(f"Pt/Rt/Ft=\n{tok_prec}\t{tok_rec}\t{tok_f1}")


def predict_arguments(
    predictor: SpanPredictor, tokens, predicate_span, frame, boundaries
):
    boundaries = list(sorted(boundaries, key=lambda t: t[0]))
    _, labels, _ = predictor.force_decode(
        tokens, parent_span=predicate_span, parent_label=frame, child_spans=boundaries
    )
    out = []
    for bnd, lab in zip(boundaries, labels):
        if bnd == predicate_span and lab == "Target":
            continue
        out.append((bnd, lab))
    return out


def eval_argument_classification(data_file, predictor, run=1, verbose=False):
    assert run in [1, 2, 3]

    true_pos = 0
    false_pos = 0
    false_neg = 0

    true_pos_tok = 0
    false_pos_tok = 0
    false_neg_tok = 0

    with open(data_file, encoding="utf-8") as f:
        for sent_id, sent in enumerate(f):
            sent_data = json.loads(sent)

            tokens = sent_data["tokens"]
            annotation = sent_data["annotations"][0]

            predicate_span = tuple(annotation["span"])
            predicate = tokens[predicate_span[0] : predicate_span[1] + 1]

            # gold or predicted frames?
            if run == 1:
                frame = predict_frame(predictor, tokens, predicate_span)
            else:
                frame = annotation["label"]

            # gold or predicted argument boundaries?
            if run in [1, 2]:
                boundaries = set(
                    predict_boundaries(predictor, tokens, predicate_span, frame)
                )
            else:
                boundaries = get_gold_boundaries(annotation, predicate_span)

            pred_arguments = predict_arguments(
                predictor, tokens, predicate_span, frame, boundaries
            )
            gold_arguments = {
                (tuple(c["span"]), c["label"])
                for c in annotation["children"]
                if not (tuple(c["span"]) == predicate_span and c["label"] == "Target")
            }

            if verbose:
                print(f"Sentence #{sent_id:03}: {' '.join(tokens)}")
                print(f"\tpredicate: {predicate}")
                print(f"\t    frame: {frame}")
                print(f"\t     gold: {gold_arguments}")
                print(f"\tpredicted: {pred_arguments}")
                print()

            # -- full spans version
            for g_bnd, g_label in gold_arguments:
                # true positive: found the span and labeled it correctly
                if (g_bnd, g_label) in pred_arguments:
                    true_pos += 1
                # false negative: missed this argument
                else:
                    false_neg += 1
            for p_bnd, p_label in pred_arguments:
                # all predictions that are not true positives are false positives
                if (p_bnd, p_label) not in gold_arguments:
                    false_pos += 1

            # -- token based
            tok_gold_labels = {
                (token, label)
                for ((bnd_start, bnd_end), label) in gold_arguments
                for token in range(bnd_start, bnd_end + 1)
            }
            tok_pred_labels = {
                (token, label)
                for ((bnd_start, bnd_end), label) in pred_arguments
                for token in range(bnd_start, bnd_end + 1)
            }
            for g_tok, g_tok_label in tok_gold_labels:
                if (g_tok, g_tok_label) in tok_pred_labels:
                    true_pos_tok += 1
                else:
                    false_neg_tok += 1
            for p_tok, p_tok_label in tok_pred_labels:
                if (p_tok, p_tok_label) not in tok_gold_labels:
                    false_pos_tok += 1

    prec = true_pos / (true_pos + false_pos)
    rec = true_pos / (true_pos + false_neg)
    f1_score = 2 * ((prec * rec) / (prec + rec))

    print(f"P/R/F=\n{prec}\t{rec}\t{f1_score}")

    tok_prec = true_pos_tok / (true_pos_tok + false_pos_tok)
    tok_rec = true_pos_tok / (true_pos_tok + false_neg_tok)
    tok_f1 = 2 * ((tok_prec * tok_rec) / (tok_prec + tok_rec))

    print(f"Pt/Rt/Ft=\n{tok_prec}\t{tok_rec}\t{tok_f1}")


if __name__ == "__main__":
    main()
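Note: all P/R/F numbers printed above are micro-averages over the whole data file; as a compact reference, the final reductions amount to (a sketch mirroring the inlined arithmetic):

def micro_prf(tp: int, fp: int, fn: int):
    # precision, recall, and their harmonic mean (F1)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    return prec, rec, 2 * prec * rec / (prec + rec)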
sociolome/lome_wrapper.py
ADDED
@@ -0,0 +1,83 @@
from sftp import SpanPredictor
import spacy

import sys
import dataclasses
from typing import List, Optional, Dict, Any


predictor = SpanPredictor.from_path("model.mod.tar.gz")
nlp = spacy.load("xx_sent_ud_sm")


@dataclasses.dataclass
class FrameAnnotation:
    tokens: List[str] = dataclasses.field(default_factory=list)
    pos: List[str] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class MultiLabelAnnotation(FrameAnnotation):
    frame_list: List[List[str]] = dataclasses.field(default_factory=list)
    lu_list: List[Optional[str]] = dataclasses.field(default_factory=list)

    def to_txt(self):
        for i, tok in enumerate(self.tokens):
            yield f"{tok} {self.pos[i]} {'|'.join(self.frame_list[i]) or '_'} {self.lu_list[i] or '_'}"


# reused from "combine_predictions.py" (cloned/lome/src/spanfinder/sociolome)
def convert_to_seq_labels(sentence: List[str], structures: Dict[int, Dict[str, Any]]) -> List[List[str]]:
    labels = [[] for _ in sentence]

    for struct_id, struct in structures.items():
        tgt_span = struct["target"]
        frame = struct["frame"]

        for i in range(tgt_span[0], tgt_span[1] + 1):
            labels[i].append(f"T:{frame}@{struct_id:02}")
        for role in struct["roles"]:
            role_span = role["boundary"]
            role_label = role["label"]
            for i in range(role_span[0], role_span[1] + 1):
                prefix = "B" if i == role_span[0] else "I"
                labels[i].append(f"{prefix}:{frame}:{role_label}@{struct_id:02}")
    return labels

def make_prediction(sentence, spacy_model, predictor):
    spacy_doc = spacy_model(sentence)
    tokens = [t.text for t in spacy_doc]
    tgt_spans, fr_labels, _ = predictor.force_decode(tokens)

    frame_structures = {}

    for i, (tgt, frm) in enumerate(sorted(zip(tgt_spans, fr_labels), key=lambda t: t[0][0])):
        arg_spans, arg_labels, _ = predictor.force_decode(tokens, parent_span=tgt, parent_label=frm)

        frame_structures[i] = {
            "target": tgt,
            "frame": frm,
            "roles": [
                {"boundary": bnd, "label": label}
                for bnd, label in zip(arg_spans, arg_labels)
                if label != "Target"
            ]
        }

    return MultiLabelAnnotation(
        tokens=tokens,
        pos=[t.pos_ for t in spacy_doc],
        frame_list=convert_to_seq_labels(tokens, frame_structures),
        lu_list=[None for _ in tokens]
    )


def analyze(text):
    analyses = []
    for sentence in text.split("\n"):
        analyses.append(make_prediction(sentence, nlp, predictor))

    return {
        "result": "OK",
        "analyses": [dataclasses.asdict(an) for an in analyses]
    }
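Note: a quick usage sketch for the wrapper (the input sentence is illustrative; the model archive and spaCy model loaded at module level must be available):

result = analyze("Alice traveled to Rome.")
print(result["analyses"][0]["frame_list"])  # per-token frame/role tags, one list per token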