meg-huggingface committed
Commit f9936fb
1 Parent(s): deefca3
Update from rollback
data_measurements/dataset_statistics.py
CHANGED
@@ -303,6 +303,7 @@ class DatasetStatisticsCacheClass:
         self.node_list_fid = pjoin(self.cache_path, "node_list.th")
         # Needed for UI
         self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
+        self.zipf_counts = None
 
         self.live = False
 
@@ -366,6 +367,7 @@ class DatasetStatisticsCacheClass:
         """
         # Text length figure
         if (self.use_cache and exists(self.fig_tok_length_fid)):
+            self.fig_tok_length_png = mpimg.imread(self.fig_tok_length_fid)
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
         else:
             if not self.live:
@@ -709,6 +711,8 @@ class DatasetStatisticsCacheClass:
                 zipf_dict = json.load(f)
             self.z = Zipf()
             self.z.load(zipf_dict)
+            # TODO: Should this be cached?
+            self.zipf_counts = self.z.calc_zipf_counts(self.vocab_counts_df)
             self.zipf_fig = read_plotly(self.zipf_fig_fid)
         elif self.use_cache and exists(self.zipf_fid):
             # TODO: Read zipf data so that the vocab is there.
@@ -771,26 +775,30 @@ class nPMIStatisticsCacheClass:
             and exists(self.npmi_terms_fid)
             and json.load(open(self.npmi_terms_fid))["available terms"] != []
         ):
-            available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
+            self.available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
         else:
-            … (18 removed lines; their content is not rendered in this view)
+            if not self.live:
+                if self.dstats.vocab_counts_df is None:
+                    self.dstats.load_or_prepare_vocab()
+
+                true_false = [
+                    term in self.dstats.vocab_counts_df.index for term in self.termlist
+                ]
+                word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
+                true_false_counts = [
+                    self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
+                    for word in word_list_tmp
+                ]
+                available_terms = [
+                    word for word, y in zip(word_list_tmp, true_false_counts) if y
+                ]
+                logs.info(available_terms)
+                with open(self.npmi_terms_fid, "w+") as f:
+                    json.dump({"available terms": available_terms}, f)
+                self.available_terms = available_terms
+        return self.available_terms
+
+    def load_or_prepare_joint_npmi(self, subgroup_pair, save=True):
         """
         Run on-the fly, while the app is already open,
         as it depends on the subgroup terms that the user chooses
@@ -824,12 +832,14 @@ class nPMIStatisticsCacheClass:
             joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
                 subgroup_pair, subgroup_files
             )
-            … (6 removed lines; their content is not rendered in this view)
+            if save:
+                if joint_npmi_df is not None:
+                    # Cache new results
+                    logs.info("Writing out.")
+                    for subgroup in subgroup_pair:
+                        write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
+                    with open(joint_npmi_fid, "w+") as f:
+                        joint_npmi_df.to_csv(f)
         else:
             joint_npmi_df = pd.DataFrame()
         logs.info("The joint npmi df is")
@@ -871,7 +881,7 @@ class nPMIStatisticsCacheClass:
                 subgroup_dict[subgroup] = cached_results
         logs.info("Calculating for subgroup list")
         joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
-        return joint_npmi_df
+        return joint_npmi_df, subgroup_dict
 
     # TODO: Update pairwise assumption
     def do_npmi(self, subgroup_pair, subgroup_dict):
@@ -882,6 +892,7 @@ class nPMIStatisticsCacheClass:
        :return: Selected identity term's co-occurrence counts with
                 other words, pmi per word, and nPMI per word.
        """
+        no_results = False
        logs.info("Initializing npmi class")
        npmi_obj = self.set_npmi_obj()
        # Canonical ordering used
@@ -889,18 +900,26 @@ class nPMIStatisticsCacheClass:
         # Calculating nPMI statistics
         for subgroup in subgroup_pair:
             # If the subgroup data is already computed, grab it.
-            # TODO: Should we set idx and column names similarly to
+            # TODO: Should we set idx and column names similarly to
+            # how we set them for cached files?
             if subgroup not in subgroup_dict:
                 logs.info("Calculating statistics for %s" % subgroup)
                 vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
-                … (8 removed lines; their content is not rendered in this view)
+                if vocab_cooc_df is None:
+                    no_results = True
+                else:
+                    # Store the nPMI information for the current subgroups
+                    subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
+        if no_results:
+            logs.warning("Couldn't grap the npmi files -- Under construction")
+            return None, None
+        else:
+            # Pair the subgroups together, indexed by all words that
+            # co-occur between them.
+            logs.info("Computing pairwise npmi bias")
+            paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
+            UI_results = make_npmi_fig(paired_results, subgroup_pair)
+            return UI_results.dropna(), subgroup_dict
 
     def set_npmi_obj(self):
         """
@@ -1291,3 +1310,4 @@ def write_zipf_data(z, zipf_fid):
     zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
     with open(zipf_fid, "w+", encoding="utf-8") as f:
         json.dump(zipf_dict, f)
+
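
A note on the nPMI term filtering added above: the new `load_or_prepare_npmi_terms` body keeps an identity term only when it appears in the vocabulary index and its count clears `self.min_vocab_count`. A minimal standalone sketch of that filter over a toy pandas vocabulary (`CNT` and `min_vocab_count` here are stand-ins for the module's column constant and the class attribute, not the app's actual objects):

import pandas as pd

CNT = "count"          # stand-in for the module's count-column constant
min_vocab_count = 20   # stand-in for self.min_vocab_count

# Toy vocabulary shaped like vocab_counts_df: term index, one count column.
vocab_counts_df = pd.DataFrame(
    {CNT: [150, 12, 87]}, index=["woman", "nonbinary", "man"]
)
termlist = ["woman", "man", "nonbinary", "unseen-term"]

# Two-pass filter, as in the diff: vocabulary membership, then count threshold.
true_false = [term in vocab_counts_df.index for term in termlist]
word_list_tmp = [x for x, y in zip(termlist, true_false) if y]
true_false_counts = [
    vocab_counts_df.loc[word, CNT] >= min_vocab_count for word in word_list_tmp
]
available_terms = [word for word, y in zip(word_list_tmp, true_false_counts) if y]
print(available_terms)  # -> ['woman', 'man']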
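The hunks above all follow the class's load-or-prepare convention: serve a cached artifact when `use_cache` is set and the file exists, avoid starting fresh computation when `live` is True (the app is already open), and write results back only when `save` is requested. A hedged sketch of that control flow under those assumptions, with hypothetical names (`cache_fid`, `compute`), not the class's actual method:

import json
from os.path import exists

def load_or_prepare(cache_fid, compute, live=False, use_cache=True, save=True):
    """Cache-first pattern used throughout dataset_statistics.py (sketch)."""
    if use_cache and exists(cache_fid):
        # Cached result is available: read it instead of recomputing.
        with open(cache_fid, "r", encoding="utf-8") as f:
            return json.load(f)
    if live:
        # App is already serving; signal "not ready" rather than blocking.
        return None
    result = compute()
    if save and result is not None:
        # Cache new results for the next load.
        with open(cache_fid, "w+", encoding="utf-8") as f:
            json.dump(result, f)
    return result

# Example: load_or_prepare("cache/word_counts.json", lambda: {"the": 512})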