pere
/

t5-parliament-categorisation

Model card Files Files and versions Community

t5-parliament-categorisation / my_preprocessors.py

pere

more small changes

0173bf7 about 2 years ago

raw history blame contribute delete

No virus

1.96 kB

	import collections
	import functools
	import math
	import re
	from typing import Callable, Mapping, Optional, Sequence, Union
	import uuid

	from absl import logging
	import babel
	import gin
	import seqio
	import tensorflow.compat.v2 as tf

	import json
	import pandas as pd

	# We disable no-value-for-parameter since the seqio.map_over_dataset leads to
	# a false positive when seeds are provided.
	# pylint:disable=no-value-for-parameter
	AUTOTUNE = tf.data.experimental.AUTOTUNE

	FeatureType = Mapping[str, tf.Tensor]

	rekey = seqio.preprocessors.rekey
	tokenize = seqio.preprocessors.tokenize


	@seqio.map_over_dataset
	def parse_tsv(line, field_names=None, field_delim='\t'):
	"""Splits TSV lines into dict examples mapping field name to string value.
	Args:
	line: an example containing a comma/tab-delimited string.
	field_names: a list of strings, the ordered names of the TSV fields.
	Defaults to "inputs" and "targets".
	field_delim: a string, the delimiter to split on e.g. ',' for csv.
	Returns:
	A feature dict mapping field name to string value.
	"""
	breakpoint()
	field_names = field_names or ['inputs', 'targets']
	return dict(
	zip(field_names,
	tf.io.decode_csv(
	line,
	record_defaults=[''] * len(field_names),
	field_delim=field_delim,
	use_quote_delim=False)))


	@seqio.map_over_dataset
	def parse_json(line,field_delim='\t'):
	"""Splits JSON lines into dict examples mapping.
	Args:
	line: an example containing valid json
	Returns:
	A feature dict mapping field name to string value.
	"""
	mydf = pd.read_json(line, lines=True)
	line = mydf.to_csv(header=False, index=False,sep="\t").strip()
	field_names = list(mydf.columns)

	return dict(
	zip(field_names,
	tf.io.decode_csv(
	line,
	record_defaults=[''] * len(field_names),
	field_delim=field_delim,
	use_quote_delim=False)))