# Computes and caches text-duplicate statistics via the `evaluate` library.
import evaluate | |
import logging | |
import os | |
import pandas as pd | |
import plotly.express as px | |
import utils | |
import utils.dataset_utils as ds_utils | |
from collections import Counter | |
from os.path import exists, isdir | |
from os.path import join as pjoin | |
# Name of the dataset column holding the raw text to deduplicate.
TEXT = "text"

# These are string constants defined in the evaluate library.
# They may need to be updated if the evaluate library changes these strings
DUPS_FRAC = "duplicate_fraction"
# Evaluate calls the dictionary a "list"
DUPS_DICT = "duplicates_dict"
# This isn't in the evaluate measurement, but TODO to add that...
# DUPS_SUM = "duplicate_sum"

# Module-level logger configured by the project's logging helper.
logs = utils.prepare_logging(__file__)
class DMTHelper:
    """Helper class for the Data Measurements Tool.

    This allows us to keep all variables and functions related to labels
    in one file.
    Does caching and using the evaluate library for computation.
    """

    def __init__(self, dstats, load_only, save):
        """
        Args:
            dstats: Parent statistics object providing ``text_dset`` (a
                HuggingFace Dataset with a ``text`` column), ``use_cache``,
                ``dataset_cache_dir``, and ``load_or_prepare_text_dset()``.
            load_only (bool): If True, only read cached results; never compute.
            save (bool): If True, write newly computed results to cache.
        """
        # BUG FIX: the original indexed dstats.text_dset[TEXT] *before*
        # checking for None (raising TypeError when text_dset was unset),
        # and its reload path stored the whole dataset rather than the text
        # column. Check first, then extract the text column exactly once.
        if dstats.text_dset is None:
            dstats.load_or_prepare_text_dset()
        # Input HuggingFace Dataset: just the text column.
        self.dset = dstats.text_dset[TEXT]
        self.use_cache = dstats.use_cache
        # Note: This is None as it can be called different times with different
        # settings, and so we want fresh results each time. With the evaluate
        # integration, results are different depending on whether
        # list_duplicates is set.
        self.duplicates_results = None
        self.cache_dir = dstats.dataset_cache_dir
        self.save = save
        self.load_only = load_only
        # Filenames for the cached JSON (statistics) and HTML (widget) output.
        self.dups_dir = "text_duplicates"
        dups_json = "text_duplicates.json"
        dups_html = "text_duplicates.html"
        self.dups_result_json_fid = pjoin(self.cache_dir, self.dups_dir, dups_json)
        self.dups_result_html_fid = pjoin(self.cache_dir, self.dups_dir, dups_html)

    def run_DMT_processing(self, list_duplicates=True):
        """Calls functions to do the main work.

        DMT uses the full duplicates list in a widget,
        so it is set to default True.

        Args:
            list_duplicates (bool): Whether to ask the evaluate measurement
                for the per-string duplicates dictionary as well as the
                aggregate fraction.
        """
        # First look to see what we can load from cache.
        if self.use_cache:
            self.duplicates_results = self._load_duplicates_cache()
            if self.duplicates_results:
                logs.info("Loaded cached text duplicate results.")
        # Compute only on a cache miss, and only if we're allowed to compute.
        if not self.duplicates_results and not self.load_only:
            self.duplicates_results = self._prepare_duplicates(
                list_duplicates=list_duplicates)
            logs.info("Prepared duplicates.")
            # Writing inside the compute branch avoids rewriting an
            # unchanged cache when results were just loaded from it.
            if self.save:
                self._write_duplicates_cache()

    def _prepare_duplicates(self, list_duplicates=True):
        """Wraps the evaluate library's `text_duplicates` measurement."""
        duplicates = evaluate.load("text_duplicates")
        results = duplicates.compute(data=self.dset, list_duplicates=list_duplicates)
        return results

    def _load_duplicates_cache(self):
        """Loads previously computed results from cache.

        Returns:
            dict: Cached results, or an empty dict when no cache file exists.
        """
        results = {}
        if exists(self.dups_result_json_fid):
            results = ds_utils.read_json(self.dups_result_json_fid)
        return results

    def _write_duplicates_cache(self):
        """Writes newly computed results to cache (JSON and HTML)."""
        ds_utils.make_path(pjoin(self.cache_dir, self.dups_dir))
        if self.duplicates_results:
            ds_utils.write_json(self.duplicates_results, self.dups_result_json_fid)
            # TODO: Use df_to_html rather than write_json_as_html;
            # this will make it possible to order the results.
            # But they must first be turned into a dataframe.
            ds_utils.write_json_as_html(self.duplicates_results, self.dups_result_html_fid)

    def get_duplicates_filenames(self):
        """Returns the cache file paths keyed by artifact kind."""
        dups_fid_dict = {"statistics": self.dups_result_json_fid,
                         "html": self.dups_result_html_fid}
        return dups_fid_dict