| """ | |
| Tests encoding functionality during parsing | |
| for all of the parsers defined in parsers.py | |
| """ | |
| from io import ( | |
| BytesIO, | |
| TextIOWrapper, | |
| ) | |
| import os | |
| import tempfile | |
| import uuid | |
| import numpy as np | |
| import pytest | |
| from pandas import ( | |
| DataFrame, | |
| read_csv, | |
| ) | |
| import pandas._testing as tm | |
| pytestmark = pytest.mark.filterwarnings( | |
| "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | |
| ) | |
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | |
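
# ``skip_pyarrow`` requests a ``pyarrow_skip`` fixture assumed to be defined in
# this package's conftest.py. A minimal sketch of such a fixture (an
# assumption, not necessarily the actual conftest implementation) could be:
#
#     @pytest.fixture
#     def pyarrow_skip(request):
#         if "all_parsers" in request.fixturenames:
#             parser = request.getfixturevalue("all_parsers")
#             if parser.engine == "pyarrow":
#                 pytest.skip("pyarrow engine does not support this case")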


def test_bytes_io_input(all_parsers):
    encoding = "cp1255"
    parser = all_parsers
    data = BytesIO("שלום:1234\n562:123".encode(encoding))

    result = parser.read_csv(data, sep=":", encoding=encoding)
    # The first line is inferred as the header, so only one data row remains.
    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_read_csv_unicode(all_parsers):
    parser = all_parsers
    data = BytesIO("\u0141aski, Jan;1".encode())

    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
    expected = DataFrame([["\u0141aski, Jan", 1]])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    # see gh-2298
    parser = all_parsers
    data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
        ",", sep
    )
    path = f"__{uuid.uuid4()}__.csv"
    kwargs = {"sep": sep, "skiprows": 2}
    utf8 = "utf-8"

    with tm.ensure_clean(path) as path:
        bytes_data = data.encode(encoding)

        with open(path, "wb") as f:
            f.write(bytes_data)

        with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
            result = parser.read_csv(path, encoding=encoding, **kwargs)
            expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
        tm.assert_frame_equal(result, expected)
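

# For reference: the generic "utf-16" codec prepends a byte order mark, while
# the endian-specific variants do not, e.g.:
#
#     "A".encode("utf-16")    # b'\xff\xfeA\x00' (BOM; little-endian on most platforms)
#     "A".encode("utf-16le")  # b'A\x00'
#     "A".encode("utf-16be")  # b'\x00A'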


def test_utf16_example(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    result = parser.read_csv(path, encoding="utf-16", sep="\t")
    assert len(result) == 50


def test_unicode_encoding(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers

    result = parser.read_csv(path, header=None, encoding="latin-1")
    result = result.set_index(0)
    got = result[1][1632]

    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
    assert got == expected


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        ("a\n1", {}, DataFrame({"a": [1]})),  # basic
        ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),  # regular quoting
        ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),  # BOM in a data row
        ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
        (
            "\n1",
            {"names": ["a"], "skip_blank_lines": False},
            DataFrame({"a": [np.nan, 1]}),
        ),
    ],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
    # see gh-4793
    parser = all_parsers
    bom = "\ufeff"
    utf8 = "utf-8"

    def _encode_data_with_bom(_data):
        bom_data = (bom + _data).encode(utf8)
        return BytesIO(bom_data)

    if (
        parser.engine == "pyarrow"
        and data == "\n1"
        and kwargs.get("skip_blank_lines", True)
    ):
        # CSV parse error: Empty CSV file or block: cannot infer number of columns
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
    tm.assert_frame_equal(result, expected)
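

# For reference: the BOM code point U+FEFF encodes to three bytes in UTF-8,
# which read_csv must strip before inferring the header:
#
#     "\ufeff".encode("utf-8")  # b'\xef\xbb\xbf'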


def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
    # see gh-13549
    # utf_value and encoding_fmt are fixtures from this package's conftest.py
    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
    parser = all_parsers

    encoding = encoding_fmt.format(utf_value)
    data = "mb_num,multibyte\n4.8,test".encode(encoding)

    result = parser.read_csv(BytesIO(data), encoding=encoding)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "file_path,encoding",
    [
        (("io", "data", "csv", "test1.csv"), "utf-8"),
        (("io", "parser", "data", "unicode_series.csv"), "latin-1"),
        (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shift-jis"),
    ],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
    # gh-23779: Python csv engine shouldn't error on files opened in binary.
    # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
    parser = all_parsers

    fpath = datapath(*file_path)
    expected = parser.read_csv(fpath, encoding=encoding)

    with open(fpath, encoding=encoding) as fa:
        result = parser.read_csv(fa)
        assert not fa.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb") as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb", buffering=0) as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)
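

# For reference: ``open(path, mode="rb", buffering=0)`` returns a raw,
# unbuffered ``io.FileIO`` object rather than an ``io.BufferedReader``, which
# is the "raw binary" case that gh-31575 covers.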


@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
    # see gh-24130
    parser = all_parsers
    encoding = encoding_fmt.format(utf_value)

    if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
        # FIXME: this is bad!
        pytest.skip("These cases freeze")

    expected = DataFrame({"foo": ["bar"]})

    with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
        f.write("foo\nbar")
        f.seek(0)

        result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
        tm.assert_frame_equal(result, expected)


def test_encoding_named_temp_file(all_parsers):
    # see gh-31819
    parser = all_parsers
    encoding = "shift-jis"

    title = "てすと"
    data = "こむ"

    expected = DataFrame({title: [data]})

    with tempfile.NamedTemporaryFile() as f:
        f.write(f"{title}\n{data}".encode(encoding))

        f.seek(0)

        result = parser.read_csv(f, encoding=encoding)
        tm.assert_frame_equal(result, expected)
        assert not f.closed


@pytest.mark.parametrize("encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le"])
def test_parse_encoded_special_characters(encoding):
    # GH16218 Verify parsing of data with encoded special characters
    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
    data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"  # noqa: RUF001
    encoded_data = BytesIO(data.encode(encoding))
    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)

    expected = DataFrame(
        data=[["：foo", 0], ["bar", 1], ["baz", 2]],  # noqa: RUF001
        columns=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)
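

# For reference: U+FF1A is visually close to the ASCII colon, which is why the
# lines above carry ``noqa: RUF001`` (ruff's ambiguous-unicode check):
#
#     "\uff1a" == "："  # True
#     ord("：")         # 0xFF1A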


@pytest.mark.parametrize(
    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "latin-1"]
)
def test_encoding_memory_map(all_parsers, encoding):
    # GH40986
    parser = all_parsers
    expected = DataFrame(
        {
            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
            "mask": ["red", "purple", "orange", "blue"],
            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
        }
    )
    with tm.ensure_clean() as file:
        expected.to_csv(file, index=False, encoding=encoding)

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(file, encoding=encoding, memory_map=True)
            return

        df = parser.read_csv(file, encoding=encoding, memory_map=True)
    tm.assert_frame_equal(df, expected)


def test_chunk_splits_multibyte_char(all_parsers):
    """
    Chunk splits a multibyte character with memory_map=True
    GH 43540
    """
    parser = all_parsers
    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
    # Each of the first 2047 rows is 127 "a" chars plus a newline (128 bytes),
    # so the final row's 127 "a" chars end at byte offset 262143 and the
    # two-byte character below straddles the 262144-byte chunk boundary.
    df = DataFrame(data=["a" * 127] * 2048)

    # Put two-bytes utf-8 encoded character "ą" at the end of chunk
    # utf-8 encoding of "ą" is b'\xc4\x85'
    df.iloc[2047] = "a" * 127 + "ą"
    with tm.ensure_clean("bug-gh43540.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True)
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True)
    tm.assert_frame_equal(dfr, df)


def test_readcsv_memmap_utf8(all_parsers):
    """
    GH 43787
    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
    """
    lines = []
    line_length = 128
    start_char = " "
    end_char = "\U00010080"
    # This for loop creates a list of 128-char strings
    # consisting of consecutive Unicode chars
    for lnum in range(ord(start_char), ord(end_char), line_length):
        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
        try:
            line.encode("utf-8")
        except UnicodeEncodeError:
            # Skip blocks that fall in the surrogate range
            # (U+D800-U+DFFF), which cannot be encoded as UTF-8.
            continue
        lines.append(line)
    parser = all_parsers
    df = DataFrame(lines)
    with tm.ensure_clean("utf8test.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
    tm.assert_frame_equal(df, dfr)


@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
    # GH43439
    parser = all_parsers
    content = b"abcd"
    if "t" in mode:
        content = "abcd"
    with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
        handle.write(content)
        handle.seek(0)
        df = parser.read_csv(handle)
    expected = DataFrame([], columns=["abcd"])
    tm.assert_frame_equal(df, expected)
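

# For reference: this test uses SpooledTemporaryFile because, before
# Python 3.11, it did not fully implement the io interface (e.g. it lacked a
# working ``readable()``), so read_csv must not assume those methods exist.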