Spaces:

clarin-pl
/

datasets-explorer

Runtime error

Mariusz Kossakowski committed on Aug 30, 2022

Commit

f10673c

•

1 Parent(s): 8eb9cdc

Black formatting

Files changed (1) hide show

clarin_datasets/kpwr_ner_datasets.py CHANGED Viewed

@@ -72,7 +72,9 @@ class KpwrNerDataset(DatasetToShow):
         full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
         tokens_all = full_dataframe["tokens"].tolist()
         tokens_all = [x for subarray in tokens_all for x in subarray]
-        labels_all = pd.concat(self.data_dict_named.values(), axis="rows")["ner"].tolist()
         labels_all = [x for subarray in labels_all for x in subarray]
         with dataframe_head:
@@ -93,9 +95,9 @@ class KpwrNerDataset(DatasetToShow):
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
-                    .sort_index()
-                    .reset_index()
-                    .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(
@@ -117,7 +119,10 @@ class KpwrNerDataset(DatasetToShow):
                     "ner": labels_all,
                 }
             )
-            full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["ner"] != "O" and not full_df_unzipped["ner"].str.starstwith("I-")]
             possible_options = sorted(full_df_unzipped["ner"].unique())
             with most_common_tokens:
                 st.header("10 most common tokens from selected class (without 'O')")

         full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
         tokens_all = full_dataframe["tokens"].tolist()
         tokens_all = [x for subarray in tokens_all for x in subarray]
+        labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
+            "ner"
+        ].tolist()
         labels_all = [x for subarray in labels_all for x in subarray]
         with dataframe_head:
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(
                     "ner": labels_all,
                 }
             )
+            full_df_unzipped = full_df_unzipped.loc[
+                (full_df_unzipped["ner"] != "O")
+                & (full_df_unzipped["ner"].str.starstwith("I-"))
+            ]
             possible_options = sorted(full_df_unzipped["ner"].unique())
             with most_common_tokens:
                 st.header("10 most common tokens from selected class (without 'O')")