File size: 3,879 Bytes
46df0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import evaluate
import logging
import os
import pandas as pd
import plotly.express as px
import utils
import utils.dataset_utils as ds_utils
from collections import Counter
from os.path import exists, isdir
from os.path import join as pjoin

TEXT = "text"
# These are string constants defined in the evaluate library.
# They may need to be updated if the evaluate library changes these strings
DUPS_FRAC = "duplicate_fraction"
# Evaluate calls the dictionary a "list"
DUPS_DICT = "duplicates_dict"
# This isn't in the evaluate measurement, but TODO to add that...
# DUPS_SUM = "duplicate_sum"

logs = utils.prepare_logging(__file__)

class DMTHelper:
    """Helper class for the Data Measurements Tool.
    This allows us to keep all variables and functions related to labels
    in one file.
    Does caching and using the evaluate library for computation.
    """

    def __init__(self, dstats, load_only, save):
        # Input HuggingFace Dataset.
        self.dset = dstats.text_dset[TEXT]
        if self.dset is None:
            dstats.load_or_prepare_text_dset()
            self.dset = dstats.text_dset
        self.use_cache = dstats.use_cache
        # Note: This is None as it can be called different times with different
        # settings, and so we want fresh results each time. With the evaluate
        # integration, results are different depending on whether
        # list_duplicates is set.
        self.duplicates_results = None
        self.cache_dir = dstats.dataset_cache_dir
        self.save = save
        self.load_only = load_only
        # Filenames
        self.dups_dir = "text_duplicates"
        dups_json = "text_duplicates.json"
        dups_html = "text_duplicates.html"
        self.dups_result_json_fid = pjoin(self.cache_dir, self.dups_dir, dups_json)
        self.dups_result_html_fid = pjoin(self.cache_dir, self.dups_dir, dups_html)

    def run_DMT_processing(self, list_duplicates=True):
        """Calls functions to do the main work.
        DMT uses the full duplicates list in a widget,
        so it is set to default True.
        """
        # First look to see what we can load from cache.
        if self.use_cache:
            self.duplicates_results = self._load_duplicates_cache()
            if self.duplicates_results:
                logs.info("Loaded cached text duplicate results.")
        if not self.duplicates_results and not self.load_only:
            self.duplicates_results = self._prepare_duplicates(list_duplicates=list_duplicates)
            logs.info("Prepared duplicates.")
            if self.save:
                self._write_duplicates_cache()

    def _prepare_duplicates(self, list_duplicates=True):
        """Wraps the evaluate library."""
        duplicates = evaluate.load("text_duplicates")
        results = duplicates.compute(data=self.dset, list_duplicates=list_duplicates)
        return results

    def _load_duplicates_cache(self):
        """Loads previously computed results from cache."""
        results = {}
        if exists(self.dups_result_json_fid):
            results = ds_utils.read_json(self.dups_result_json_fid)
        return results

    def _write_duplicates_cache(self):
        """Writes newly computed results to cache."""
        ds_utils.make_path(pjoin(self.cache_dir, self.dups_dir))
        if self.duplicates_results:
            ds_utils.write_json(self.duplicates_results, self.dups_result_json_fid)
            # TODO: Use df_to_html rather than write_json_as_html;
            # this will make it possible to order the results.
            # But they must first be turned into a dataframe.
            ds_utils.write_json_as_html(self.duplicates_results, self.dups_result_html_fid)

    def get_duplicates_filenames(self):
        dups_fid_dict = {"statistics": self.dups_result_json_fid, "html":self.dups_result_html_fid}
        return dups_fid_dict