|
import collections |
|
import functools |
|
import math |
|
import re |
|
from typing import Callable, Mapping, Optional, Sequence, Union |
|
import uuid |
|
|
|
from absl import logging |
|
import babel |
|
import gin |
|
import seqio |
|
import tensorflow.compat.v2 as tf |
|
|
|
import json |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
# tf.data's autotune sentinel, kept under a short module-level name.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Type alias for a single example: feature name -> tensor.
FeatureType = Mapping[str, tf.Tensor]

# Module-level aliases for the seqio preprocessors of the same name.
rekey = seqio.preprocessors.rekey
tokenize = seqio.preprocessors.tokenize
|
|
|
|
|
@seqio.map_over_dataset
def parse_tsv(line, field_names=None, field_delim='\t'):
  """Splits TSV lines into dict examples mapping field name to string value.

  Args:
    line: an example containing a comma/tab-delimited string.
    field_names: a list of strings, the ordered names of the TSV fields.
      Defaults to "inputs" and "targets".
    field_delim: a string, the delimiter to split on e.g. ',' for csv.

  Returns:
    A feature dict mapping field name to string value.
  """
  field_names = field_names or ['inputs', 'targets']
  # One '' default per field makes every column decode as a string.
  # use_quote_delim=False treats quote characters as ordinary data.
  columns = tf.io.decode_csv(
      line,
      record_defaults=[''] * len(field_names),
      field_delim=field_delim,
      use_quote_delim=False)
  return dict(zip(field_names, columns))
|
|
|
|
|
@seqio.map_over_dataset
def parse_json(line, field_delim='\t'):
  """Splits a JSON line into a dict example mapping field name to string value.

  The JSON record is loaded with pandas, re-serialized as a single delimited
  row, and then decoded with `tf.io.decode_csv` so every value comes back as a
  string tensor keyed by its JSON field name.

  Args:
    line: an example containing valid json (JSON-lines format).
    field_delim: a single-character string used to delimit the intermediate
      row. It should not occur inside any field value, because quoting is
      disabled when the row is decoded.

  Returns:
    A feature dict mapping field name to string value.
  """
  # NOTE(review): pd.read_json needs a Python str / file-like object. If this
  # runs inside tf.data.Dataset.map, `line` is presumably a symbolic tensor --
  # confirm the call context (or wrap in tf.py_function) before relying on it.
  frame = pd.read_json(line, lines=True)
  field_names = list(frame.columns)

  # Write the row with the same delimiter decode_csv will split on. Only the
  # line terminator is removed: a plain strip() would also eat leading or
  # trailing delimiters (e.g. tabs) and drop empty first/last fields.
  row = frame.to_csv(header=False, index=False, sep=field_delim)
  row = row.rstrip('\r\n')

  columns = tf.io.decode_csv(
      row,
      record_defaults=[''] * len(field_names),
      field_delim=field_delim,
      use_quote_delim=False)
  return dict(zip(field_names, columns))
|
|