Alexander Seifert committed on
Commit 9556889
1 Parent(s): 2918df9

improve docs

Files changed (4):
  1. src/data.py +53 -2
  2. src/load.py +14 -0
  3. src/subpages/attention.py +2 -2
  4. src/subpages/page.py +10 -0
src/data.py CHANGED
@@ -11,7 +11,19 @@ from utils import device, tokenizer_hash_funcs
 
 
 @st.cache(allow_output_mutation=True)
-def get_data(ds_name, config_name, split_name, split_sample_size) -> Dataset:
+def get_data(ds_name: str, config_name: str, split_name: str, split_sample_size: int) -> Dataset:
+    """Loads a dataset from the HF hub (if not already loaded) and returns a Dataset object.
+    Uses datasets.load_dataset to load the dataset (see its documentation for additional details).
+
+    Args:
+        ds_name (str): Path or name of the dataset.
+        config_name (str): Name of the dataset configuration.
+        split_name (str): Which split of the data to load.
+        split_sample_size (int): The number of examples to load from the split.
+
+    Returns:
+        Dataset: The loaded dataset split.
+    """
     ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True).shuffle(seed=0)  # type: ignore
     split = ds[split_name].select(range(split_sample_size))
     return split
@@ -22,6 +34,14 @@ def get_data(ds_name, config_name, split_name, split_sample_size) -> Dataset:
     hash_funcs=tokenizer_hash_funcs,
 )
 def get_collator(tokenizer) -> DataCollatorForTokenClassification:
+    """Returns a data collator that dynamically pads the inputs as well as the labels.
+
+    Args:
+        tokenizer (PreTrainedTokenizer or PreTrainedTokenizerFast): The tokenizer used for encoding the data.
+
+    Returns:
+        DataCollatorForTokenClassification: The collator object.
+    """
     return DataCollatorForTokenClassification(tokenizer)
 
 
@@ -70,10 +90,29 @@ def tokenize_and_align_labels(examples, tokenizer):
 
 
 def stringify_ner_tags(batch, tags):
+    """Stringifies a dataset batch's NER tags.
+
+    Args:
+        batch (dict): A batch of examples with an "ner_tags" column of integer tag indices.
+        tags (ClassLabel): The ClassLabel feature used to map tag indices to strings.
+
+    Returns:
+        dict: A mapping with a new "ner_tags_str" column holding the stringified tags.
+    """
     return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}
 
 
-def encode_dataset(split, tokenizer):
+def encode_dataset(split: Dataset, tokenizer):
+    """Encodes a dataset split.
+
+    Args:
+        split (Dataset): A Dataset object.
+        tokenizer: A PreTrainedTokenizer object.
+
+    Returns:
+        Dataset: A Dataset object with the encoded inputs.
+    """
+
     tags = split.features["ner_tags"].feature
     split = split.map(partial(stringify_ner_tags, tags=tags), batched=True)
     remove_columns = split.column_names
@@ -120,6 +159,18 @@ def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
 
 
 def get_split_df(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
+    """Turns an encoded Dataset into a pandas dataframe.
+
+    Args:
+        split_encoded (Dataset): The encoded dataset split.
+        model: The NER model used for the forward pass.
+        tokenizer: The tokenizer used for encoding.
+        collator: The data collator used to batch and pad the examples.
+        tags (ClassLabel): The ClassLabel feature mapping tag indices to strings.
+
+    Returns:
+        pd.DataFrame: The dataset split as a dataframe, enriched with the outputs of a forward pass.
+    """
     split_encoded = split_encoded.map(
         partial(
             forward_pass_with_label,
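
Taken together, the helpers documented above compose into a small pipeline. The following is a minimal usage sketch, not code from this repo: the dataset, config, and tokenizer names are illustrative assumptions, and get_data requires a Hugging Face auth token since it calls load_dataset with use_auth_token=True.

from transformers import AutoTokenizer

from data import encode_dataset, get_collator, get_data

# All names below are assumptions for illustration only.
split = get_data("conll2003", "conll2003", "validation", split_sample_size=100)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
collator = get_collator(tokenizer)  # pads inputs and labels per batch
split_encoded = encode_dataset(split, tokenizer)  # tokenized, label-aligned split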
src/load.py CHANGED
@@ -39,6 +39,20 @@ def load_context(
     split_sample_size: int,
     **kw_args,
 ) -> Context:
+    """Utility function that loads (almost) everything the application needs.
+    It exists as a separate function only so that its results can be cached.
+
+    Args:
+        encoder_model_name (str): Name of the sentence encoder to load.
+        model_name (str): Name of the NER model to load.
+        ds_name (str): Dataset name or path.
+        ds_config_name (str): Dataset config name.
+        ds_split_name (str): Dataset split name.
+        split_sample_size (int): Number of examples to load from the split.
+
+    Returns:
+        Context: An object containing everything the application needs.
+    """
 
     sentence_encoder, model, tokenizer = _load_models_and_tokenizer(
         encoder_model_name=encoder_model_name,
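
For orientation, a call site for load_context might look like the sketch below; every argument value is a placeholder chosen for illustration, not the app's actual configuration:

from load import load_context

context = load_context(
    encoder_model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumption
    model_name="my-user/my-ner-model",  # hypothetical model id
    ds_name="conll2003",  # assumption
    ds_config_name="conll2003",
    ds_split_name="validation",
    split_sample_size=512,
)
model, tokenizer = context.model, context.tokenizer  # Context fields, see src/subpages/page.py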
src/subpages/attention.py CHANGED
@@ -80,7 +80,7 @@ JS_TEMPLATE = """requirejs(['basic', 'ecco'], function(basic, ecco){{
 
 
 @st.cache(allow_output_mutation=True)
-def load_ecco_model():
+def _load_ecco_model():
     model_config = {
         "embedding": "embeddings.word_embeddings",
         "type": "mlm",
@@ -115,7 +115,7 @@ class AttentionPage(Page):
             "A group of neurons tend to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model."
         )
 
-        lm = load_ecco_model()
+        lm = _load_ecco_model()
 
         col1, _, col2 = st.columns([1.5, 0.5, 4])
        with col1:
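
The rename gives the cached loader a leading underscore, marking it module-private. The st.cache(allow_output_mutation=True) pattern it uses generalizes to any expensive model load; here is a minimal sketch using an assumed transformers checkpoint rather than the page's ecco-specific setup:

import streamlit as st
from transformers import AutoModel

@st.cache(allow_output_mutation=True)
def _load_model():
    # allow_output_mutation=True skips Streamlit's hash check on the returned
    # object across reruns -- the usual pattern for large, mutable model objects.
    return AutoModel.from_pretrained("distilbert-base-uncased")  # assumed checkpoint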
src/subpages/page.py CHANGED
@@ -10,6 +10,8 @@ from transformers import AutoTokenizer  # type: ignore
 
 @dataclass
 class Context:
+    """This object facilitates passing the application's state around between different pages."""
+
     model: AutoModelForSequenceClassification
     tokenizer: AutoTokenizer
     sentence_encoder: SentenceTransformer
@@ -27,11 +29,19 @@ class Context:
 
 
 class Page:
+    """Base class for all pages."""
+
     name: str
     icon: str
 
     def get_widget_defaults(self):
+        """Holds the default settings for all of this page's widgets.
+
+        Returns:
+            dict: A dictionary of widget defaults, where the keys are the widget names and the values are the default values.
+        """
         return {}
 
     def render(self, context):
+        """Renders the page."""
        ...
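
Context and Page together form a simple plugin contract: each subpage declares its widget defaults and renders itself against the shared application state. A hypothetical subclass might look like the sketch below; the name, icon, and widget are invented for illustration:

import streamlit as st

from subpages.page import Context, Page


class ExamplePage(Page):
    name = "Example"  # invented for illustration
    icon = "star"

    def get_widget_defaults(self):
        # Keys are widget names, values are their default settings.
        return {"example_threshold": 0.5}

    def render(self, context: Context):
        threshold = st.slider("Threshold", 0.0, 1.0, key="example_threshold")
        st.write(f"{context.model.__class__.__name__} @ threshold {threshold}")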