Alexander Seifert committed
Commit: 554bac5
Parent: 8778b89

improve docs

src/data.py CHANGED
@@ -46,7 +46,16 @@ def get_collator(tokenizer) -> DataCollatorForTokenClassification:
     return DataCollatorForTokenClassification(tokenizer)
 
 
-def create_word_ids_from_tokens(tokenizer, input_ids: list[int]):
+def create_word_ids_from_input_ids(tokenizer, input_ids: list[int]) -> list[int]:
+    """Takes a list of input_ids and returns the corresponding word_ids.
+
+    Args:
+        tokenizer: The tokenizer that was used to obtain the input ids.
+        input_ids (list[int]): List of token ids.
+
+    Returns:
+        list[int]: Word ids corresponding to the input ids.
+    """
     word_ids = []
     wid = -1
     tokens = [tokenizer.convert_ids_to_tokens(i) for i in input_ids]
@@ -65,16 +74,27 @@ def create_word_ids_from_tokens(tokenizer, input_ids: list[int]):
     return word_ids
 
 
-def tokenize_and_align_labels(examples, tokenizer):
-    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
+def tokenize(batch, tokenizer) -> dict:
+    """Tokenizes a batch of examples.
+
+    Args:
+        batch: The examples to tokenize
+        tokenizer: The tokenizer to use
+
+    Returns:
+        dict: The tokenized batch
+    """
+    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
     labels = []
     wids = []
 
-    for idx, label in enumerate(examples["ner_tags"]):
+    for idx, label in enumerate(batch["ner_tags"]):
         try:
             word_ids = tokenized_inputs.word_ids(batch_index=idx)
         except ValueError:
-            word_ids = create_word_ids_from_tokens(tokenizer, tokenized_inputs["input_ids"][idx])
+            word_ids = create_word_ids_from_input_ids(
+                tokenizer, tokenized_inputs["input_ids"][idx]
+            )
         previous_word_idx = None
         label_ids = []
         for word_idx in word_ids:
@@ -119,7 +139,7 @@ def encode_dataset(split: Dataset, tokenizer):
     remove_columns = split.column_names
     ids = split["id"]
     split = split.map(
-        partial(tokenize_and_align_labels, tokenizer=tokenizer),
+        partial(tokenize, tokenizer=tokenizer),
        batched=True,
        remove_columns=remove_columns,
    )
@@ -128,6 +148,18 @@ def encode_dataset(split: Dataset, tokenizer):
 
 
 def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
+    """Runs the forward pass for a batch of examples.
+
+    Args:
+        batch: The batch to process
+        model: The model to process the batch with
+        collator: A data collator
+        num_classes (int): Number of classes
+
+    Returns:
+        dict: A dictionary containing `losses`, `preds` and `hidden_states`
+    """
+
     # Convert dict of lists to list of dicts suitable for data collator
     features = [dict(zip(batch, t)) for t in zip(*batch.values())]
 
@@ -159,19 +191,20 @@ def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
     return {"losses": loss, "preds": preds, "hidden_states": hidden_states}
 
 
-def get_split_df(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
-    """Turns a Dataset into a pandas dataframe.
+def predict(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
+    """Generates predictions for a given dataset split and returns the results as a dataframe.
 
     Args:
-        split_encoded (Dataset): _description_
-        model (_type_): _description_
-        tokenizer (_type_): _description_
-        collator (_type_): _description_
-        tags (_type_): _description_
+        split_encoded (Dataset): The dataset to process
+        model: The model to process the dataset with
+        tokenizer: The tokenizer to process the dataset with
+        collator: The data collator to use
+        tags: The tags used in the dataset
 
     Returns:
-        pd.DataFrame: _description_
+        pd.DataFrame: A dataframe containing token-level predictions.
     """
+
     split_encoded = split_encoded.map(
         partial(
             forward_pass_with_label,
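
The renamed helpers keep the same call pattern as before: `tokenize` is mapped over a split in batches (as `encode_dataset` does) and `predict` then turns the encoded split into a token-level dataframe. Below is a minimal sketch of that flow, assuming a fast Hugging Face tokenizer and a toy split with the `tokens`/`ner_tags` columns these functions expect; the checkpoint and sample data are illustrative only.

# Minimal sketch of the renamed pipeline (illustrative names and data).
from functools import partial

from datasets import Dataset
from transformers import AutoTokenizer

from src.data import tokenize, predict  # formerly tokenize_and_align_labels / get_split_df

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # any fast tokenizer

# A toy split with the columns `tokenize` expects.
toy_split = Dataset.from_dict(
    {
        "tokens": [["Alexander", "lives", "in", "Vienna"]],
        "ner_tags": [[1, 0, 0, 2]],
    }
)

# Batched tokenization + label alignment, exactly as in `encode_dataset`.
encoded = toy_split.map(
    partial(tokenize, tokenizer=tokenizer),
    batched=True,
    remove_columns=toy_split.column_names,
)

# With a model, collator and tag list in hand (see src/load.py below), the encoded
# split is turned into a token-level dataframe:
# df = predict(encoded, model, tokenizer, collator, tags)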
src/load.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import streamlit as st
 from datasets import Dataset  # type: ignore
 
-from src.data import encode_dataset, get_collator, get_data, get_split_df
+from src.data import encode_dataset, get_collator, get_data, predict
 from src.model import get_encoder, get_model, get_tokenizer
 from src.subpages import Context
 from src.utils import align_sample, device, explode_df
@@ -68,7 +68,7 @@ def load_context(
     split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
 
     # transform into dataframe
-    df = get_split_df(split_encoded, model, tokenizer, collator, tags)
+    df = predict(split_encoded, model, tokenizer, collator, tags)
     df["word_ids"] = word_ids
     df["ids"] = ids
 
src/subpages/attention.py CHANGED
@@ -7,7 +7,7 @@ from streamlit.components.v1 import html
 
 from src.subpages.page import Context, Page  # type: ignore
 
-SETUP_HTML = """
+_SETUP_HTML = """
 <script src="https://requirejs.org/docs/release/2.3.6/minified/require.js"></script>
 <script>
 var ecco_url = 'https://storage.googleapis.com/ml-intro/ecco/'
@@ -70,17 +70,6 @@ SETUP_HTML = """
 <div id="basic"></div>
 """
 
-JS_TEMPLATE = """requirejs(['basic', 'ecco'], function(basic, ecco){{
-    const viz_id = basic.init()
-
-    ecco.interactiveTokensAndFactorSparklines(viz_id, {}, {{
-        'hltrCFG': {{'tokenization_config': {{'token_prefix': '', 'partial_token_prefix': '##'}}
-        }}
-    }})
-}}, function (err) {{
-    console.log(err);
-}})"""
-
 
 @st.cache(allow_output_mutation=True)
 def _load_ecco_model():
@@ -160,10 +149,10 @@ class AttentionPage(Page):
         output = lm(inputs)
         nmf = output.run_nmf(n_components=n_components, from_layer=from_layer, to_layer=to_layer)
         data = nmf.explore(returnData=True)
-        JS_TEMPLATE = f"""<script>requirejs(['basic', 'ecco'], function(basic, ecco){{
+        _JS_TEMPLATE = f"""<script>requirejs(['basic', 'ecco'], function(basic, ecco){{
             const viz_id = basic.init()
             ecco.interactiveTokensAndFactorSparklines(viz_id, {data}, {{ 'hltrCFG': {{'tokenization_config': {{'token_prefix': '', 'partial_token_prefix': '##'}} }} }})
         }}, function (err) {{
             console.log(err);
         }})</script>"""
-        html(SETUP_HTML + JS_TEMPLATE, height=800, scrolling=True)
+        html(_SETUP_HTML + _JS_TEMPLATE, height=800, scrolling=True)
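
The renamed constants keep the pattern this page relies on: a static setup snippet defined once at module level, plus a per-render script that interpolates the NMF data before both are handed to Streamlit's `html` component. The following is a stripped-down sketch of that pattern with placeholder markup (the real `_SETUP_HTML` loads require.js and the ecco assets), not the page's actual code.

# Sketch of the setup-HTML + per-render-script pattern (placeholder content only).
from streamlit.components.v1 import html

_SETUP_HTML = """
<script>/* load require.js and visualization assets here */</script>
<div id="basic"></div>
"""


def render(data: dict) -> None:
    # Built per render because it interpolates `data` into the script.
    js = f"<script>console.log({data});</script>"
    html(_SETUP_HTML + js, height=800, scrolling=True)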
src/subpages/hidden_states.py CHANGED
@@ -10,7 +10,19 @@ from src.subpages.page import Context, Page
 
 
 @st.cache
-def reduce_dim_svd(X, n_iter, random_state=42):
+def reduce_dim_svd(X, n_iter: int, random_state=42):
+    """Dimensionality reduction using truncated SVD (aka LSA).
+
+    This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with sparse matrices efficiently.
+
+    Args:
+        X: Training data
+        n_iter (int): Number of iterations for the randomized SVD solver.
+        random_state (int, optional): Used during randomized svd. Pass an int for reproducible results across multiple function calls. Defaults to 42.
+
+    Returns:
+        ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
+    """
     from sklearn.decomposition import TruncatedSVD
 
     svd = TruncatedSVD(n_components=2, n_iter=n_iter, random_state=random_state)
@@ -19,6 +31,17 @@ def reduce_dim_svd(X, n_iter, random_state=42):
 
 @st.cache
 def reduce_dim_pca(X, random_state=42):
+    """Principal component analysis (PCA).
+
+    Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.
+
+    Args:
+        X: Training data
+        random_state (int, optional): Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. Defaults to 42.
+
+    Returns:
+        ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
+    """
     from sklearn.decomposition import PCA
 
     return PCA(n_components=2, random_state=random_state).fit_transform(X)
@@ -26,6 +49,19 @@ def reduce_dim_pca(X, random_state=42):
 
 @st.cache
 def reduce_dim_umap(X, n_neighbors=5, min_dist=0.1, metric="euclidean"):
+    """Uniform Manifold Approximation and Projection (UMAP).
+
+    Finds a low dimensional embedding of the data that approximates an underlying manifold.
+
+    Args:
+        X: Training data
+        n_neighbors (int, optional): The size of the local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general, values should be in the range 2 to 100. Defaults to 5.
+        min_dist (float, optional): The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result in a more even dispersal of points. The value should be set relative to the `spread` value, which determines the scale at which embedded points will be spread out. Defaults to 0.1.
+        metric (str, optional): The metric to use to compute distances in high dimensional space (see the UMAP docs for options). Defaults to "euclidean".
+
+    Returns:
+        ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
+    """
     from umap import UMAP
 
     return UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric).fit_transform(X)
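
Apart from the `@st.cache` decorator, the three reducers are thin wrappers around their estimators, so their behaviour can be checked in isolation. Here is a standalone sketch with random data standing in for real hidden states (it assumes scikit-learn and umap-learn are installed; the array shape is illustrative).

# Standalone sketch of the three 2-D reducers documented above.
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from umap import UMAP

X = np.random.default_rng(0).normal(size=(200, 768))  # e.g. 200 tokens x 768 hidden dims

X_svd = TruncatedSVD(n_components=2, n_iter=5, random_state=42).fit_transform(X)
X_pca = PCA(n_components=2, random_state=42).fit_transform(X)
X_umap = UMAP(n_neighbors=5, min_dist=0.1, metric="euclidean").fit_transform(X)

print(X_svd.shape, X_pca.shape, X_umap.shape)  # each is (200, 2)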
src/utils.py CHANGED
@@ -34,6 +34,7 @@ classmap = {
 
 def aggrid_interactive_table(df: pd.DataFrame) -> dict:
     """Creates an st-aggrid interactive table based on a dataframe.
+
     Args:
         df (pd.DataFrame]): Source dataframe
     Returns:
@@ -60,6 +61,8 @@ def aggrid_interactive_table(df: pd.DataFrame) -> dict:
 
 
 def explode_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Takes a dataframe and explodes all the fields."""
+
     df_tokens = df.apply(pd.Series.explode)
     if "losses" in df.columns:
         df_tokens["losses"] = df_tokens["losses"].astype(float)
@@ -67,7 +70,7 @@ def explode_df(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def align_sample(row: pd.Series):
-    """Use word_ids to align all lists in a sample."""
+    """Uses word_ids to align all lists in a sample."""
 
     columns = row.axes[0].to_list()
     indices = [i for i, id in enumerate(row.word_ids) if id >= 0 and id != row.word_ids[i - 1]]
@@ -113,7 +116,17 @@ def align_sample(row: pd.Series):
     hash_funcs=tokenizer_hash_funcs,
 )
 def tag_text(text: str, tokenizer, model, device: torch.device) -> pd.DataFrame:
-    """Create an (exploded) DataFrame with the predicted labels and probabilities."""
+    """Tags a given text and creates an (exploded) DataFrame with the predicted labels and probabilities.
+
+    Args:
+        text (str): The text to be processed
+        tokenizer: Tokenizer to use
+        model: Model to use
+        device (torch.device): The device we want pytorch to use for its calculations.
+
+    Returns:
+        pd.DataFrame: A data frame holding the tagged text.
+    """
 
     tokens = tokenizer(text).tokens()
     tokenized = tokenizer(text, return_tensors="pt")
@@ -137,21 +150,31 @@ def tag_text(text: str, tokenizer, model, device: torch.device) -> pd.DataFrame:
     return explode_df(merged_df).reset_index().drop(columns=["index"])
 
 
-def get_bg_color(label):
+def get_bg_color(label: str):
+    """Retrieves a label's color from the session state."""
     return st.session_state[f"color_{label}"]
 
 
-def get_fg_color(hex_color: str) -> str:
-    """Adapted from https://gomakethings.com/dynamically-changing-the-text-color-based-on-background-color-contrast-with-vanilla-js/"""
-    r = int(hex_color[1:3], 16)
-    g = int(hex_color[3:5], 16)
-    b = int(hex_color[5:7], 16)
+def get_fg_color(bg_color_hex: str) -> str:
+    """Chooses the proper (foreground) text color (black/white) for a given background color, maximizing contrast.
+
+    Adapted from https://gomakethings.com/dynamically-changing-the-text-color-based-on-background-color-contrast-with-vanilla-js/
+
+    Args:
+        bg_color_hex (str): The background color given as a HEX string.
+
+    Returns:
+        str: Either "black" or "white".
+    """
+    r = int(bg_color_hex[1:3], 16)
+    g = int(bg_color_hex[3:5], 16)
+    b = int(bg_color_hex[5:7], 16)
     yiq = ((r * 299) + (g * 587) + (b * 114)) / 1000
     return "black" if (yiq >= 128) else "white"
 
 
 def colorize_classes(df: pd.DataFrame) -> pd.DataFrame:
-    """Colorize the errors in the dataframe."""
+    """Colorizes the errors in the dataframe."""
 
     def colorize_row(row):
         return [
@@ -175,6 +198,14 @@ def colorize_classes(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def htmlify_labeled_example(example: pd.DataFrame) -> str:
+    """Builds an HTML (string) representation of a single example.
+
+    Args:
+        example (pd.DataFrame): The example to process.
+
+    Returns:
+        str: An HTML string representation of a single example.
+    """
     html = []
 
     for _, row in example.iterrows():
@@ -215,18 +246,8 @@ def htmlify_labeled_example(example: pd.DataFrame) -> str:
     return " ".join(html)
 
 
-def htmlify_example(example: pd.DataFrame) -> str:
-    corr_html = " ".join(
-        [
-            f", {row.tokens}" if row.labels == "B-COMMA" else row.tokens
-            for _, row in example.iterrows()
-        ]
-    ).strip()
-    return f"<em>{corr_html}</em>"
-
-
 def color_map_color(value: float, cmap_name="Set1", vmin=0, vmax=1) -> str:
-    """Turn a value into a color using a color map."""
+    """Turns a value into a color using a color map."""
     norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
     cmap = cm.get_cmap(cmap_name)  # PiYG
     rgba = cmap(norm(abs(value)))
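
The `get_fg_color` docstring describes a YIQ-based contrast rule: the hex color is split into RGB, converted to a perceived-brightness value between 0 and 255, and the text color flips at the midpoint. A quick standalone check with two hypothetical background colors:

# Standalone illustration of the YIQ contrast rule used by get_fg_color.
def fg_for_bg(bg_color_hex: str) -> str:
    r = int(bg_color_hex[1:3], 16)
    g = int(bg_color_hex[3:5], 16)
    b = int(bg_color_hex[5:7], 16)
    yiq = (r * 299 + g * 587 + b * 114) / 1000  # perceived brightness, 0-255
    return "black" if yiq >= 128 else "white"


print(fg_for_bg("#ffde59"))  # light yellow background -> "black" text
print(fg_for_bg("#1f3b73"))  # dark blue background    -> "white" text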