Bram Vanroy committed
Commit 05de9a6 • 1 Parent(s): 4d85339

Make stateful


This also gets rid of this issue: https://github.com/streamlit/streamlit/issues/6451
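For context, here is a minimal sketch (not the full app) of the stateful pattern the commit adopts: seed st.session_state with defaults once, bind widgets to those keys, and gate submission on the stored value instead of wrapping everything in st.form.

import streamlit as st

# Seed a default once; the keyed widget below reads and writes this entry,
# so the value survives Streamlit reruns without needing st.form.
if "text" not in st.session_state:
    st.session_state["text"] = ""

text = st.text_input(label="Input text", key="text")

# Gate on the stored value rather than on a form-submit flag.
if st.session_state["text"]:
    if st.button("Submit"):
        st.write(f"Submitted: {st.session_state['text']}")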

Files changed (3)
  1. app.py +47 -65
  2. requirements.txt +7 -4
  3. utils.py +64 -26
app.py CHANGED
@@ -3,10 +3,7 @@ from collections import Counter
 
 import graphviz
 import penman
-from mbart_amr.data.linearization import linearized2penmanstr
-from penman.models.noop import NoOpModel
-import streamlit as st
-from transformers import LogitsProcessorList
+from multi_amr.data.postprocessing_graph import ParsedStatus
 
 from utils import get_resources, LANGUAGES, translate
 
@@ -17,43 +14,41 @@ st.set_page_config(
     page_icon="👩‍💻"
 )
 
-st.title("👩‍💻 Multilingual text to AMR ᵇᵉᵗᵃ")
+st.title("👩‍💻 Multilingual text to AMR")
+
+if "text" not in st.session_state:
+    st.session_state["text"] = ""
+if "language" not in st.session_state:
+    st.session_state["language"] = "English"
+if "use_multilingual" not in st.session_state:
+    st.session_state["use_multilingual"] = False
 
-with st.form("input data"):
-    text_col, lang_col = st.columns((4, 1))
-    text = text_col.text_input(label="Input text")
-    src_lang = lang_col.selectbox(label="Language", options=list(LANGUAGES.keys()), index=0)
-    submitted = st.form_submit_button("Submit")
+text_col, lang_col = st.columns((4, 1))
+text = text_col.text_input(label="Input text", key="text")
+src_lang = lang_col.selectbox(label="Language", options=list(LANGUAGES.keys()), index=0, key="language")
+multilingual = st.checkbox("Use multilingual model", label_visibility="visible", key="use_multilingual",
+                           help="Whether to use a single multilingual model that was trained on English, Spanish and"
+                                " Dutch together, or (if not checked) language-specific models. Enabling this will"
+                                " result in worse performance but can be of interest for research purposes.")
 
 error_ct = st.empty()
-if submitted:
-    text = text.strip()
-    if not text:
-        error_ct.error("Text cannot be empty!", icon="⚠️")
-    else:
+if st.session_state["text"]:
+    if st.button("Submit"):
+        text = text.strip()
         error_ct.info("Generating abstract meaning representation (AMR)...", icon="💻")
-        multilingual = src_lang != "English"
-        model, tokenizer, logitsprocessor = get_resources(multilingual)
+        model, tokenizer = get_resources(multilingual, src_lang)
        gen_kwargs = {
-            "max_length": model.config.max_length,
-            "num_beams": model.config.num_beams,
-            "logits_processor": LogitsProcessorList([logitsprocessor])
+            "max_new_tokens": 512,
+            "num_beams": 5,
        }
 
-        linearized = translate(text, src_lang, model, tokenizer, **gen_kwargs)
-        penman_str = linearized2penmanstr(linearized)
+        outputs = translate(text, src_lang, model, tokenizer, **gen_kwargs)
        error_ct.empty()
 
-        try:
-            graph = penman.decode(penman_str, model=NoOpModel())
-        except Exception as exc:
-            st.write(f"The generated graph is not valid so it cannot be visualized correctly. Below is the closest attempt"
-                     f" to a valid graph but note that this is invalid Penman.")
-            st.code(penman_str)
-
-            with st.expander("Error trace"):
-                st.write(exc)
+        if outputs["status"][0] == ParsedStatus.BACKOFF:
+            st.write("The system could not generate a valid graph no matter how hard it tried.")
        else:
+            graph = outputs["graph"][0]
            visualized = graphviz.Digraph(node_attr={"color": "#3aafa9", "style": "rounded,filled", "shape": "box",
                                                     "fontcolor": "white"})
 
@@ -74,40 +69,27 @@ if submitted:
            def get_node_name(item: str):
                return nodenames[item] if item in nodenames else item
 
-            try:
-                for triple in graph.triples:
-                    if triple[1] == ":instance":
-                        continue
-                    else:
-                        visualized.edge(get_node_name(triple[0]), get_node_name(triple[2]), label=triple[1])
-            except Exception as exc:
-                st.write("The generated graph is not valid so it cannot be visualized correctly. Below is the closest attempt"
-                         " to a valid graph but note that this is probably invalid Penman.")
-                st.code(penman_str)
-                st.write("The initial linearized output of the model was:")
-                st.code(linearized)
-
-                with st.expander("Error trace"):
-                    st.write(exc)
-            else:
-                st.subheader("Graph visualization")
-                st.graphviz_chart(visualized, use_container_width=True)
-
-                # Download link
-                def create_download_link(img_bytes: bytes):
-                    encoded = base64.b64encode(img_bytes).decode("utf-8")
-                    return f'<a href="data:image/png;charset=utf-8;base64,{encoded}" download="amr-graph.png">Download graph</a>'
-
-                img = visualized.pipe(format="png")
-                st.markdown(create_download_link(img), unsafe_allow_html=True)
-
-                # Additional info
-                st.subheader("Model output and Penman graph")
-                st.write("The linearized output of the model (after some post-processing) is:")
-                st.code(linearized)
-                st.write("When converted into Penman, it looks like this:")
-                st.code(penman.encode(graph))
-
+            for triple in graph.triples:
+                if triple[1] == ":instance":
+                    continue
+                else:
+                    visualized.edge(get_node_name(triple[0]), get_node_name(triple[2]), label=triple[1])
+            st.subheader("Graph visualization")
+            st.graphviz_chart(visualized, use_container_width=True)
+
+            # Download link
+            def create_download_link(img_bytes: bytes):
+                encoded = base64.b64encode(img_bytes).decode("utf-8")
+                return f'<a href="data:image/png;charset=utf-8;base64,{encoded}" download="amr-graph.png">Download graph</a>'
+
+            img = visualized.pipe(format="png")
+            st.markdown(create_download_link(img), unsafe_allow_html=True)
+
+            # Additional info
+            st.subheader("PENMAN representation")
+            st.code(penman.encode(graph))
+else:
+    error_ct.warning("Text cannot be empty!", icon="⚠️")
 
 ########################
 # Information, socials #
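The new control flow hinges on the ParsedStatus returned by translate(): only BACKOFF blocks visualization. Below is a hypothetical, non-Streamlit sketch of consuming the helpers defined in utils.py further down; the input sentence and generation settings are illustrative only.

from multi_amr.data.postprocessing_graph import ParsedStatus

from utils import get_resources, translate

# Load the English-only model and its tokenizer wrapper.
model, tokenizer = get_resources(multilingual=False, src_lang="English")

# num_beams is required by translate() for its per-sample beam grouping.
outputs = translate("The boy wants to go.", "English", model, tokenizer,
                    max_new_tokens=512, num_beams=5)

if outputs["status"][0] == ParsedStatus.BACKOFF:
    print("No valid graph could be generated.")
else:
    print(outputs["graph"][0])  # a penman.Graph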
requirements.txt CHANGED
@@ -1,7 +1,10 @@
+accelerate==0.22.0
 altair==4.2.2
 graphviz==0.20.1
-optimum==1.7.1
+multi_amr @ git+https://github.com/BramVanroy/multilingual-text-to-amr@v1.0.0-alpha
+optimum==1.10.1
 penman==1.2.2
-streamlit==1.19.0
-torch==1.13.1
-git+https://github.com/BramVanroy/multilingual-text-to-amr@5859af0d870acd2f76d71e5a7d12fa35a7a2059b#egg=mbart-amr
+streamlit==1.26.0
+torch==2.0.1
+transformers==4.33.1
+wheel
utils.py CHANGED
@@ -1,63 +1,101 @@
-from typing import Tuple
+from typing import Tuple, Union, Dict, List
 
+from multi_amr.data.postprocessing_graph import ParsedStatus
+from multi_amr.data.tokenization import AMRTokenizerWrapper
 from optimum.bettertransformer import BetterTransformer
-from mbart_amr.constraints.constraints import AMRLogitsProcessor
-from mbart_amr.data.tokenization import AMRMBartTokenizer
 import streamlit as st
 import torch
 from torch.quantization import quantize_dynamic
 from torch import nn, qint8
-from transformers import MBartForConditionalGeneration
+from transformers import MBartForConditionalGeneration, AutoConfig
+import penman
 
 
 @st.cache_resource(show_spinner=False)
-def get_resources(multilingual: bool, quantize: bool = True, no_cuda: bool = False) -> Tuple[MBartForConditionalGeneration, AMRMBartTokenizer, AMRLogitsProcessor]:
+def get_resources(multilingual: bool, src_lang: str, quantize: bool = True, no_cuda: bool = False) -> Tuple[MBartForConditionalGeneration, AMRTokenizerWrapper]:
    """Get the relevant model, tokenizer and logits_processor. The loaded model depends on whether the multilingual
    model is requested, or not. If not, an English-only model is loaded. The model can be optionally quantized
    for better performance.
 
-    :param multilingual: whether or not to load the multilingual model. If not, loads the English-only model
+    :param multilingual: whether to load the multilingual model or not
+    :param src_lang: source language
    :param quantize: whether to quantize the model with PyTorch's 'quantize_dynamic'
    :param no_cuda: whether to disable CUDA, even if it is available
-    :return: the loaded model, tokenizer, and logits processor
+    :return: the loaded model, and tokenizer wrapper
    """
-    if multilingual:
-        # Tokenizer src_lang is reset during translation to the right language
-        tokenizer = AMRMBartTokenizer.from_pretrained("BramVanroy/mbart-en-es-nl-to-amr", src_lang="nl_XX")
-        model = MBartForConditionalGeneration.from_pretrained("BramVanroy/mbart-en-es-nl-to-amr")
-    else:
-        tokenizer = AMRMBartTokenizer.from_pretrained("BramVanroy/mbart-en-to-amr", src_lang="en_XX")
-        model = MBartForConditionalGeneration.from_pretrained("BramVanroy/mbart-en-to-amr")
+    model_name = "BramVanroy/mbart-large-cc25-ft-amr30-en_es_nl"
+    if not multilingual:
+        if src_lang == "English":
+            model_name = "BramVanroy/mbart-large-cc25-ft-amr30-en"
+        elif src_lang == "Spanish":
+            model_name = "BramVanroy/mbart-large-cc25-ft-amr30-es"
+        elif src_lang == "Dutch":
+            model_name = "BramVanroy/mbart-large-cc25-ft-amr30-nl"
+        else:
+            raise ValueError(f"Language {src_lang} not supported")
+
+    # Tokenizer src_lang is reset during translation to the right language
+    tok_wrapper = AMRTokenizerWrapper.from_pretrained(model_name, src_lang="en_XX")
+
+    config = AutoConfig.from_pretrained(model_name)
+    config.decoder_start_token_id = tok_wrapper.amr_token_id
+
+    model = MBartForConditionalGeneration.from_pretrained(model_name, config=config)
+    model.eval()
+
+    embedding_size = model.get_input_embeddings().weight.shape[0]
+    if len(tok_wrapper.tokenizer) > embedding_size:
+        model.resize_token_embeddings(len(tok_wrapper.tokenizer))
 
    model = BetterTransformer.transform(model, keep_original_model=False)
-    model.resize_token_embeddings(len(tokenizer))
 
    if torch.cuda.is_available() and not no_cuda:
        model = model.to("cuda")
    elif quantize:  # Quantization not supported on CUDA
        model = quantize_dynamic(model, {nn.Linear, nn.Dropout, nn.LayerNorm}, dtype=qint8)
 
-    logits_processor = AMRLogitsProcessor(tokenizer, model.config.max_length)
+    return model, tok_wrapper
 
-    return model, tokenizer, logits_processor
 
-
-def translate(text: str, src_lang: str, model: MBartForConditionalGeneration, tokenizer: AMRMBartTokenizer, **gen_kwargs) -> str:
+def translate(texts: List[str], src_lang: str, model: MBartForConditionalGeneration, tok_wrapper: AMRTokenizerWrapper, **gen_kwargs) -> Dict[str, List[Union[penman.Graph, ParsedStatus]]]:
    """Translates a given text of a given source language with a given model and tokenizer. The generation is guided by
    potential keyword-arguments, which can include arguments such as max length, logits processors, etc.
 
-    :param text: source text to translate
+    :param texts: source text to translate (potentially a batch)
    :param src_lang: source language
    :param model: MBART model
-    :param tokenizer: MBART tokenizer
+    :param tok_wrapper: MBART tokenizer wrapper
    :param gen_kwargs: potential keyword arguments for the generation process
    :return: the translation (linearized AMR graph)
    """
-    tokenizer.src_lang = LANGUAGES[src_lang]
-    encoded = tokenizer(text, return_tensors="pt")
-    encoded = {k: v.to(model.device) for k, v in encoded.items()}
-    generated = model.generate(**encoded, **gen_kwargs).cpu()
-    return tokenizer.decode_and_fix(generated)[0]
+    if isinstance(texts, str):
+        texts = [texts]
+
+    tok_wrapper.src_lang = LANGUAGES[src_lang]
+    encoded = tok_wrapper(texts, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        generated = model.generate(**encoded, output_scores=True, return_dict_in_generate=True, **gen_kwargs)
+
+    generated["sequences"] = generated["sequences"].cpu()
+    generated["sequences_scores"] = generated["sequences_scores"].cpu()
+    best_scoring_results = {"graph": [], "status": []}
+    beam_size = gen_kwargs["num_beams"]
+
+    # Select the best item from the beam: the sequence with the best status and the highest score
+    for sample_idx in range(0, len(generated["sequences_scores"]), beam_size):
+        sequences = generated["sequences"][sample_idx: sample_idx + beam_size]
+        scores = generated["sequences_scores"][sample_idx: sample_idx + beam_size].tolist()
+        outputs = tok_wrapper.batch_decode_amr_ids(sequences)
+        statuses = outputs["status"]
+        graphs = outputs["graph"]
+        zipped = zip(statuses, scores, graphs)
+        # Lowest status first (OK=0, FIXED=1, BACKOFF=2), highest score second
+        best = sorted(zipped, key=lambda item: (item[0].value, -item[1]))[0]
+        best_scoring_results["graph"].append(best[2])
+        best_scoring_results["status"].append(best[0])
+
+    # Returns a dictionary with "graph" and "status" keys
+    return best_scoring_results
 
 
 LANGUAGES = {
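To make the beam re-ranking in translate() concrete, here is a toy re-enactment of its sort key: candidates are ordered by status first (lower ParsedStatus value wins) and by sequence score second (higher wins). The statuses, scores and graph names below are made up for illustration, and the enum is a stand-in for multi_amr's own.

from enum import Enum

class ParsedStatus(Enum):  # stand-in for multi_amr's enum
    OK = 0
    FIXED = 1
    BACKOFF = 2

candidates = [
    (ParsedStatus.FIXED, -0.4, "graph-a"),    # only valid after fixing
    (ParsedStatus.OK, -0.9, "graph-b"),       # valid as-is, lower score
    (ParsedStatus.OK, -0.5, "graph-c"),       # valid as-is, best OK score -> winner
    (ParsedStatus.BACKOFF, -0.1, "graph-d"),  # backoff graph, best score but worst status
]

# Lowest status value first, then highest score (hence the negation).
best = sorted(candidates, key=lambda item: (item[0].value, -item[1]))[0]
print(best[2])  # graph-c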