Mariusz Kossakowski committed on
Commit
f10673c
1 Parent(s): 8eb9cdc

Black formatting

Browse files
clarin_datasets/kpwr_ner_datasets.py CHANGED
@@ -72,7 +72,9 @@ class KpwrNerDataset(DatasetToShow):
72
  full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
73
  tokens_all = full_dataframe["tokens"].tolist()
74
  tokens_all = [x for subarray in tokens_all for x in subarray]
75
- labels_all = pd.concat(self.data_dict_named.values(), axis="rows")["ner"].tolist()
 
 
76
  labels_all = [x for subarray in labels_all for x in subarray]
77
 
78
  with dataframe_head:
@@ -93,9 +95,9 @@ class KpwrNerDataset(DatasetToShow):
93
  all_labels_from_subset = pd.Series(all_labels_from_subset)
94
  class_distribution_dict[subset] = (
95
  all_labels_from_subset.value_counts(normalize=True)
96
- .sort_index()
97
- .reset_index()
98
- .rename({"index": "class", 0: subset}, axis="columns")
99
  )
100
 
101
  class_distribution_df = pd.merge(
@@ -117,7 +119,10 @@ class KpwrNerDataset(DatasetToShow):
117
  "ner": labels_all,
118
  }
119
  )
120
- full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["ner"] != "O" and not full_df_unzipped["ner"].str.starstwith("I-")]
 
 
 
121
  possible_options = sorted(full_df_unzipped["ner"].unique())
122
  with most_common_tokens:
123
  st.header("10 most common tokens from selected class (without 'O')")
 
72
  full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
73
  tokens_all = full_dataframe["tokens"].tolist()
74
  tokens_all = [x for subarray in tokens_all for x in subarray]
75
+ labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
76
+ "ner"
77
+ ].tolist()
78
  labels_all = [x for subarray in labels_all for x in subarray]
79
 
80
  with dataframe_head:
 
95
  all_labels_from_subset = pd.Series(all_labels_from_subset)
96
  class_distribution_dict[subset] = (
97
  all_labels_from_subset.value_counts(normalize=True)
98
+ .sort_index()
99
+ .reset_index()
100
+ .rename({"index": "class", 0: subset}, axis="columns")
101
  )
102
 
103
  class_distribution_df = pd.merge(
 
119
  "ner": labels_all,
120
  }
121
  )
122
+ full_df_unzipped = full_df_unzipped.loc[
123
+ (full_df_unzipped["ner"] != "O")
124
+ & (full_df_unzipped["ner"].str.starstwith("I-"))
125
+ ]
126
  possible_options = sorted(full_df_unzipped["ner"].unique())
127
  with most_common_tokens:
128
  st.header("10 most common tokens from selected class (without 'O')")