Spaces:
No application file
No application file
import csv | |
import json | |
import os | |
from .examples import MultipleChoiceExample, TextExample, TokensExample | |
class DataProcessor:
    """Base class for data converters for sequence classification data sets.

    Subclasses implement the ``get_*_examples`` and ``get_labels`` hooks.
    The ``read_*`` helpers load raw files as UTF-8 text and can be called on
    either the class or an instance.
    """

    def __init__(self, data_dir):
        """Store ``data_dir``, the directory the data set files live under."""
        self.data_dir = data_dir

    def get_examples(self, lang, mode):
        """Return the examples of split ``mode`` for language ``lang``.

        Args:
            lang: language identifier forwarded to the split-specific hook.
            mode: one of ``'train'``, ``'dev'`` or ``'test'``.

        Raises:
            ValueError: if ``mode`` is none of the supported split names
                (previously this case silently returned ``None``).
        """
        if mode == 'train':
            return self.get_train_examples(lang)
        if mode == 'dev':
            return self.get_dev_examples(lang)
        if mode == 'test':
            return self.get_test_examples(lang)
        raise ValueError(
            "Unknown mode {!r}; expected 'train', 'dev' or 'test'".format(mode)
        )

    def modes(self):
        """Return the split names this processor provides."""
        return ['train', 'dev', 'test']

    def get_train_examples(self, lang):
        """Gets a collection of :class:`InputExample` for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, lang):
        """Gets a collection of :class:`InputExample` for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, lang):
        """Gets a collection of :class:`InputExample` for the test set."""
        raise NotImplementedError()

    def get_labels(self, lang):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def read_csv(cls, input_file, quotechar=None):
        """Read a comma separated value file into a list of rows.

        Args:
            input_file: path of the CSV file (read as UTF-8).
            quotechar: optional quote character; when ``None`` the csv
                module's default quoting rules are used.

        Returns:
            A list with one list of string fields per CSV record.
        """
        # Only forward quotechar when given: passing None explicitly would
        # switch the csv module to QUOTE_NONE and change the default parsing.
        options = {} if quotechar is None else {'quotechar': quotechar}
        # newline='' lets the csv module handle line endings itself, as its
        # documentation requires for quoted fields that contain newlines.
        with open(input_file, encoding='utf-8', newline='') as fp:
            return list(csv.reader(fp, delimiter=',', **options))

    @classmethod
    def read_json(cls, input_file):
        """Read a JSON file and return the decoded object."""
        with open(input_file, encoding='utf-8') as fp:
            return json.load(fp)

    @classmethod
    def readlines(cls, filepath):
        """Return the lines of a UTF-8 text file, line endings included."""
        with open(filepath, encoding='utf-8') as fp:
            return fp.readlines()

    @classmethod
    def read_jsonl(cls, filepath):
        """Read a JSON-lines file and return the decoded records as a list.

        Blank (whitespace-only) lines are skipped so that a trailing empty
        line does not make ``json.loads`` fail.
        """
        with open(filepath, 'r', encoding='utf-8') as fp:
            return [json.loads(line) for line in fp if line.strip()]
class IndicNLPHeadlines(DataProcessor):
    """Processor for the Headline Prediction dataset.

    Every split is a JSON file named ``<lang>/<lang>-<split>.json`` under
    ``data_dir``. Each item supplies ``content``, the four candidates
    ``optionA`` .. ``optionD`` and the answer in ``correctOption``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def _read_split(self, lang, name_template, set_type):
        """Load the JSON file built from ``name_template`` and convert it."""
        split_path = os.path.join(
            self.data_dir, name_template.format(lang, lang))
        return self._create_examples(self.read_json(split_path), set_type)

    def get_train_examples(self, lang):
        """See base class."""
        return self._read_split(lang, '{}/{}-train.json', 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        return self._read_split(lang, '{}/{}-valid.json', 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        return self._read_split(lang, '{}/{}-test.json', 'test')

    def get_labels(self, lang):
        """See base class."""
        return list('ABCD')

    def _create_examples(self, items, set_type):
        """Turn raw items into multiple-choice examples.

        The article text is repeated as the context of each of the four
        candidate headlines; ``set_type`` is accepted but not used.
        """
        examples = []
        for position, record in enumerate(items):
            article = record['content']
            candidates = [record['optionA'], record['optionB'],
                          record['optionC'], record['optionD']]
            examples.append(
                MultipleChoiceExample(
                    example_id=position,
                    question='',
                    contexts=[article] * 4,
                    endings=candidates,
                    label=record['correctOption'],
                )
            )
        return examples
class WikiCloze(DataProcessor):
    """Processor for Wiki Cloze QA dataset.

    Only a test split exists; it is read from ``<lang>.json`` under
    ``data_dir`` and the items are taken from its ``cloze_data`` entry.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def modes(self):
        """Only the test split is available."""
        return ['test']

    def get_test_examples(self, lang):
        """See base class."""
        json_path = os.path.join(self.data_dir, '{}.json'.format(lang))
        payload = self.read_json(json_path)
        return self._create_examples(payload['cloze_data'], 'test')

    def get_labels(self, lang):
        """See base class."""
        return [0, 1, 2, 3]

    def _create_examples(self, items, set_type):
        """Build multiple-choice examples, skipping items with a blank option.

        ``example_id`` is the item's position in ``items``, so ids have gaps
        wherever an item was skipped. The label is the index of the answer
        within the item's options. ``set_type`` is accepted but not used.
        """
        examples = []
        for position, record in enumerate(items):
            stripped_options = [choice.strip() for choice in record['options']]
            if '' not in stripped_options:
                masked_question = record['question'].replace('<MASK>', '[MASK]')
                answer_index = record['options'].index(record['answer'])
                examples.append(
                    MultipleChoiceExample(
                        example_id=position,
                        question=masked_question,
                        contexts=[],
                        endings=record['options'],
                        label=answer_index,
                    )
                )
        return examples
class IndicNLPGenre(DataProcessor):
    """Processor for the Article Genre Classification data set.

    Splits are CSV files named ``<lang>/<lang>-<split>.csv`` under
    ``data_dir``; column 0 of each row is the label and column 1 the text.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def get_train_examples(self, lang):
        """See base class."""
        fname = '{}/{}-train.csv'.format(lang, lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_csv(fpath), 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        fname = '{}/{}-valid.csv'.format(lang, lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_csv(fpath), 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        fname = '{}/{}-test.csv'.format(lang, lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_csv(fpath), 'test')

    def get_labels(self, lang):
        """Return the distinct labels found in the training file, sorted.

        Sorting makes the label order (and therefore any label-to-index
        mapping derived from it) identical across interpreter runs; plain
        ``list(set(...))`` order varies with string hash randomization.
        """
        filename = '{}/{}-train.csv'.format(lang, lang)
        lines = self.read_csv(os.path.join(self.data_dir, filename))
        return sorted({line[0] for line in lines})

    def _create_examples(self, lines, set_type):
        """Create one ``TextExample`` per CSV row.

        The guid is ``<set_type>-<row index>``.
        """
        return [
            TextExample(
                guid=('%s-%s' % (set_type, i)),
                text_a=line[1],
                label=line[0],
            )
            for i, line in enumerate(lines)
        ]
class WikiNER(DataProcessor):
    """Processor for token-level NER data stored as ``<lang>/<mode>.txt``.

    Each non-blank line holds a word followed (optionally) by
    space-separated columns, the last of which is the label. Blank lines and
    ``-DOCSTART-`` lines separate sentences.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def get_examples(self, lang, mode):
        """Read the ``mode`` split for ``lang`` into ``TokensExample`` objects.

        ``'dev'`` is mapped to the file name ``valid.txt``. Guids are
        ``<mode>-<n>`` with ``n`` counting emitted sentences from 1.
        """
        mode = 'valid' if mode == 'dev' else mode
        file_path = os.path.join(self.data_dir, lang, f'{mode}.txt')
        examples = []
        words = []
        labels = []
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                if line.startswith('-DOCSTART-') or line == '' or line == '\n':
                    # Sentence boundary: emit what has been collected so far.
                    if words:
                        examples.append(TokensExample(
                            guid=f'{mode}-{len(examples) + 1}',
                            words=words,
                            labels=labels,
                        ))
                        words = []
                        labels = []
                    continue
                # Drop the line ending before splitting so that a line with
                # only a word (no label) does not keep '\n' in the word.
                splits = line.rstrip('\n').split(' ')
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1])
                else:
                    # Examples could have no label for mode = 'test'
                    labels.append('O')
        # Flush the last sentence when the file does not end with a blank line.
        if words:
            examples.append(TokensExample(
                guid=f'{mode}-{len(examples) + 1}',
                words=words,
                labels=labels,
            ))
        return examples

    def get_labels(self, lang):
        """Return the labels listed in ``<lang>/labels.txt``.

        ``'O'`` is prepended when the file does not contain it. The file is
        read as UTF-8, like every other file this module opens.
        """
        path = os.path.join(self.data_dir, lang, 'labels.txt')
        with open(path, 'r', encoding='utf-8') as f:
            labels = f.read().splitlines()
        if 'O' not in labels:
            labels = ['O'] + labels
        return labels
class WikiSectionTitles(DataProcessor):
    """Processor for the Wikipedia Section Title Prediction dataset.

    Splits are JSON files named ``<lang>/<lang>-<split>.json``. Each item
    carries ``sectionText``, four candidates ``titleA`` .. ``titleD`` and the
    answer in ``correctTitle``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def _examples_from(self, lang, name_template, set_type):
        """Read the JSON split named by ``name_template`` and convert it."""
        relative_name = name_template.format(lang, lang)
        raw_items = self.read_json(os.path.join(self.data_dir, relative_name))
        return self._create_examples(raw_items, set_type)

    def get_train_examples(self, lang):
        """See base class."""
        return self._examples_from(lang, '{}/{}-train.json', 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        return self._examples_from(lang, '{}/{}-valid.json', 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        return self._examples_from(lang, '{}/{}-test.json', 'test')

    def get_labels(self, lang):
        """See base class."""
        labels = ['titleA', 'titleB', 'titleC', 'titleD']
        return labels

    def _create_examples(self, items, set_type):
        """Convert raw items into four-way multiple-choice examples.

        The section text is used as the context of every candidate title;
        ``set_type`` is accepted but not used.
        """
        examples = []
        for position, record in enumerate(items):
            section = record['sectionText']
            titles = [record['titleA'], record['titleB'],
                      record['titleC'], record['titleD']]
            example = MultipleChoiceExample(
                example_id=position,
                question='',
                contexts=[section, section, section, section],
                endings=titles,
                label=record['correctTitle'],
            )
            examples.append(example)
        return examples
class ManKiBaat(DataProcessor):
    """Processor for Man ki Baat dataset.

    The "modes" here are the two sides of a line-aligned text pair stored in
    ``en-<lang>/``: ``'en'`` reads ``mkb.en`` and ``'in'`` reads
    ``mkb.<lang>``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def modes(self):
        """Return the two sides available: English and the Indian language."""
        return ['en', 'in']

    def get_examples(self, lang, mode):
        """Return the examples for side ``mode``; ``None`` for other modes."""
        if mode == 'en':
            return self.get_examples_en(lang)
        if mode == 'in':
            return self.get_examples_in(lang)
        return None

    def get_examples_en(self, lang):
        """Get examples of English language"""
        english_path = os.path.join(self.data_dir, 'en-{}/mkb.en'.format(lang))
        return self._create_examples(self.readlines(english_path), 'en')

    def get_examples_in(self, lang):
        """Get examples of the Indian language"""
        indic_path = os.path.join(
            self.data_dir, 'en-{}/mkb.{}'.format(lang, lang))
        return self._create_examples(self.readlines(indic_path), 'in')

    def _create_examples(self, lines, set_type):
        """Wrap every line in a ``TextExample`` labelled with its line index."""
        return [
            TextExample(
                guid=('%s-%s' % (set_type, row_number)),
                text_a=text,
                label=row_number,
            )
            for row_number, text in enumerate(lines)
        ]

    def get_labels(self, lang):
        """Return placeholder labels ``0 .. 9999``."""
        # return dummy value greater than number of examples
        return list(range(10000))
class ACTSA(IndicNLPGenre):
    """ACTSA processor; inherits all behaviour from ``IndicNLPGenre``."""
class BBCNews(IndicNLPGenre):
    """BBC News processor; the dev split is read from the test CSV file."""

    def get_dev_examples(self, lang):
        """See base class."""
        test_csv = os.path.join(
            self.data_dir, '{}/{}-test.csv'.format(lang, lang))
        rows = self.read_csv(test_csv)
        return self._create_examples(rows, 'dev')
class INLTKHeadlines(IndicNLPGenre):
    """INLTK Headlines processor; inherits everything from ``IndicNLPGenre``."""
class SohamArticles(IndicNLPGenre):
    """Soham Articles processor; inherits everything from ``IndicNLPGenre``."""
class IITPMovies(IndicNLPGenre):
    """IITP Movies processor; inherits everything from ``IndicNLPGenre``."""
class IITProducts(IndicNLPGenre):
    """IIT Products processor; inherits everything from ``IndicNLPGenre``."""
class AmritaParaphraseExact(IndicNLPGenre):
    """Processor for sentence-pair CSV data.

    Rows hold the first sentence in column 0, the second in column 1 and the
    label in column 2. The dev split is read from the test CSV file.
    """

    def get_dev_examples(self, lang):
        """See base class."""
        fname = '{}/{}-test.csv'.format(lang, lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_csv(fpath), 'dev')

    def get_labels(self, lang):
        """Return the distinct labels found in the training file, sorted.

        Sorting keeps the label order stable across interpreter runs; plain
        ``list(set(...))`` order varies with string hash randomization.
        """
        filename = '{}/{}-train.csv'.format(lang, lang)
        lines = self.read_csv(os.path.join(self.data_dir, filename))
        return sorted({line[2] for line in lines})

    def _create_examples(self, lines, set_type):
        """Create one two-sentence ``TextExample`` per CSV row.

        The guid is ``<set_type>-<row index>``.
        """
        return [
            TextExample(
                guid=('%s-%s' % (set_type, i)),
                text_a=line[0],
                text_b=line[1],
                label=line[2],
            )
            for i, line in enumerate(lines)
        ]
class AmritaParaphraseFuzzy(AmritaParaphraseExact):
    """Fuzzy variant; inherits everything from ``AmritaParaphraseExact``."""
class MidasDiscourse(DataProcessor):
    """Processor for the Midas Discourse data set.

    Splits are JSON files ``<lang>/train.json``, ``<lang>/val.json`` and
    ``<lang>/test.json``. Each item provides the text under ``'Sentence'``
    and its label under ``'Discourse Mode'``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def get_train_examples(self, lang):
        """See base class."""
        fname = '{}/train.json'.format(lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_json(fpath), 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        fname = '{}/val.json'.format(lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_json(fpath), 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        fname = '{}/test.json'.format(lang)
        fpath = os.path.join(self.data_dir, fname)
        return self._create_examples(self.read_json(fpath), 'test')

    def get_labels(self, lang):
        """Return the distinct labels found in the training file, sorted.

        Sorting keeps the label order stable across interpreter runs; plain
        ``list(set(...))`` order varies with string hash randomization.
        """
        filename = '{}/train.json'.format(lang)
        lines = self.read_json(os.path.join(self.data_dir, filename))
        # key=str keeps the sort well-defined even if a label decoded from
        # JSON is not a string (e.g. null).
        return sorted({line['Discourse Mode'] for line in lines}, key=str)

    def _create_examples(self, lines, set_type):
        """Create one ``TextExample`` per JSON item.

        The guid is ``<set_type>-<item index>``.
        """
        return [
            TextExample(
                guid=('%s-%s' % (set_type, i)),
                text_a=line['Sentence'],
                label=line['Discourse Mode'],
            )
            for i, line in enumerate(lines)
        ]
class WNLI(DataProcessor):
    """Processor for the WNLI data set (GLUE version).

    Reads ``<lang>/train.csv`` and ``<lang>/dev.csv``; the test split is
    also read from ``dev.csv``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def _rows(self, lang, name_template):
        """Return the CSV rows of the file named by ``name_template``."""
        return self.read_csv(
            os.path.join(self.data_dir, name_template.format(lang)))

    def get_train_examples(self, lang):
        """See base class."""
        return self._create_examples(self._rows(lang, '{}/train.csv'), 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        return self._create_examples(self._rows(lang, '{}/dev.csv'), 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        return self._create_examples(self._rows(lang, '{}/dev.csv'), 'test')

    def get_labels(self, lang):
        """See base class."""
        return [str(value) for value in (0, 1)]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets.

        The first row is skipped as a header. For the remaining rows column 0
        feeds the guid, columns 1 and 2 are the two sentences and the last
        column is the label.
        """
        rows = iter(lines)
        next(rows, None)  # discard the header row
        return [
            TextExample(
                guid="%s-%s" % (set_type, row[0]),
                text_a=row[1],
                text_b=row[2],
                label=row[-1],
            )
            for row in rows
        ]
class COPA(DataProcessor):
    """Processor for the COPA two-choice data set.

    Reads JSON-lines files ``<lang>/train.jsonl`` and ``<lang>/val.jsonl``;
    the test split is also read from ``val.jsonl``. Each record provides
    ``premise``, ``choice1``, ``choice2`` and ``label``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def _records(self, lang, name_template):
        """Return the decoded records of the file named by ``name_template``."""
        return self.read_jsonl(
            os.path.join(self.data_dir, name_template.format(lang)))

    def get_train_examples(self, lang):
        """See base class."""
        return self._create_examples(
            self._records(lang, '{}/train.jsonl'), 'train')

    def get_dev_examples(self, lang):
        """See base class."""
        return self._create_examples(
            self._records(lang, '{}/val.jsonl'), 'dev')

    def get_test_examples(self, lang):
        """See base class."""
        return self._create_examples(
            self._records(lang, '{}/val.jsonl'), 'test')

    def get_labels(self, lang):
        """See base class."""
        return list(range(2))

    def _create_examples(self, items, set_type):
        """Convert records into two-way multiple-choice examples.

        The premise is the context of both choices; ``set_type`` is accepted
        but not used.
        """
        examples = []
        for position, record in enumerate(items):
            premise = record['premise']
            examples.append(
                MultipleChoiceExample(
                    example_id=position,
                    question='',
                    contexts=[premise, premise],
                    endings=[record['choice1'], record['choice2']],
                    label=record['label'],
                )
            )
        return examples