File size: 3,990 Bytes
7e3e85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import json
import datasets


"""QMsum dataset."""


# BibTeX entry for the paper that introduced the dataset.
_CITATION = """
@inproceedings{zhong2021qmsum,
   title={{QMS}um: {A} {N}ew {B}enchmark for {Q}uery-based {M}ulti-domain {M}eeting {S}ummarization},
   author={Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Hassan Awadallah, Ahmed and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and Radev, Dragomir},
   booktitle={North American Association for Computational Linguistics (NAACL)},
   year={2021}
}
"""

# Short human-readable summary surfaced through the dataset info.
_DESCRIPTION = """
QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task, \
which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
"""

# Upstream project page.
_HOMEPAGE = "https://github.com/Yale-LILY/QMSum"

# Directory holding one JSON Lines file per split.
_BASE_URL = "https://raw.githubusercontent.com/Yale-LILY/QMSum/main/data/ALL/jsonl"

# Maps each split label to the URL of its ``<label>.jsonl`` file.
_URLs = {
    split_name: _BASE_URL + "/" + split_name + ".jsonl"
    for split_name in ("train", "val", "test")
}


class SummertimeQmsum(datasets.GeneratorBasedBuilder):
    """Dataset builder for the QMSum meeting summarization data.

    Each example corresponds to one line of a split's JSON Lines file and
    carries the meeting transcript together with its general and specific
    query lists.
    """

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` holding the example schema and metadata."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "meeting_transcripts": [
                    {
                        "speaker": datasets.Value("string"),
                        "content": datasets.Value("string"),
                    }
                ],
                "general_query_list": [
                    {
                        "query": datasets.Value("string"),
                        "answer": datasets.Value("string"),
                    }
                ],
                "specific_query_list": [
                    {
                        "query": datasets.Value("string"),
                        "answer": datasets.Value("string"),
                        # Nested list of strings: a list of spans, each span
                        # itself a list of string values.
                        "relevant_text_span": [[datasets.Value("string")]],
                    }
                ],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=None,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the split files and return one ``SplitGenerator`` per split.

        Args:
            dl_manager: Download manager used to fetch the URLs in ``_URLs``.

        Returns:
            A list with the train, validation and test split generators, in
            that order.
        """
        downloaded_files = dl_manager.download_and_extract(_URLs)

        # The gen_kwargs of each generator are passed to _generate_examples.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded_files["val"], "split": "val"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded_files["test"], "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs read from one JSON Lines file.

        Args:
            filepath: Path of the local file to read, one JSON object per line.
            split: Split label ("train", "val" or "test") used as key prefix.

        Yields:
            Tuples ``(entry_number, example)`` where ``entry_number`` is
            ``"<split>_<line index>"`` and ``example`` holds the
            ``entry_number``, ``meeting_transcripts``, ``general_query_list``
            and ``specific_query_list`` fields.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the expected fields.
        """
        # Decode explicitly as UTF-8 (the standard JSON encoding) so that
        # parsing does not depend on the platform's default locale encoding.
        with open(filepath, encoding="utf-8") as f:
            for i, line in enumerate(f):
                # Tolerate blank lines (e.g. at the end of the file). The line
                # index is kept as-is so keys of the other lines do not shift.
                if not line.strip():
                    continue

                instance = json.loads(line)
                entry_number = split + "_" + str(i)

                yield entry_number, {
                    "entry_number": entry_number,
                    "meeting_transcripts": instance["meeting_transcripts"],
                    "general_query_list": instance["general_query_list"],
                    "specific_query_list": instance["specific_query_list"],
                }