|
from . import InputExample |
|
import csv |
|
import gzip |
|
import os |
|
import gzip |
|
|
|
class PairedFilesReader(object):
    """
    Reads a paired dataset that is split across several line-aligned files.

    Line ``i`` of every file contributes one text to example ``i``. Paths
    ending in ``.gz`` are opened with gzip; every file is read as UTF-8 text.
    """

    def __init__(self, filepaths):
        """
        :param filepaths: iterable of file paths (strings), one per side of
            the pair. The order of the paths is the order of the texts in
            each example.
        """
        self.filepaths = filepaths

    def get_examples(self, max_examples=0):
        """
        Read the files in lockstep and build one InputExample per line.

        Reading stops as soon as any file is exhausted, so the number of
        examples equals the line count of the shortest file. A completely
        empty line (no newline character at all) only occurs at end of file;
        blank lines inside a file are kept as ``"\\n"``.

        :param max_examples: maximum number of examples to return. Values
            <= 0 mean "no limit".
        :return: list of InputExample objects with ``guid`` set to the
            zero-based example index as a string, ``texts`` set to the raw
            lines (trailing newline included, exactly as returned by
            ``readline``) and ``label`` fixed to 1. Returns an empty list
            when ``filepaths`` is empty.
        """
        handles = []
        examples = []
        try:
            # Open inside the try block so that a failure on a later path
            # still closes the files that were already opened.
            for filepath in self.filepaths:
                if filepath.endswith('.gz'):
                    handle = gzip.open(filepath, 'rt', encoding='utf-8')
                else:
                    handle = open(filepath, encoding='utf-8')
                handles.append(handle)

            # With no files there is no end-of-file signal, so the read loop
            # below would never terminate; there is simply nothing to read.
            if not handles:
                return examples

            while True:
                texts = [handle.readline() for handle in handles]

                # readline() returns '' only at end of file; the shortest
                # file decides where the dataset ends.
                if '' in texts:
                    break

                examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1))
                if 0 < max_examples <= len(examples):
                    break
        finally:
            for handle in handles:
                handle.close()

        return examples