Spaces:
Running
Running
Update workflow/dataloading/dataloading_core.py
Browse files
workflow/dataloading/dataloading_core.py
CHANGED
|
@@ -85,36 +85,57 @@ def read_data_from_file(
|
|
| 85 |
return df
|
| 86 |
|
| 87 |
if encoding is None:
|
| 88 |
-
|
| 89 |
-
encoding =
|
| 90 |
-
sample = data_bytes[:10_000].decode(encoding, errors='ignore')
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
use_whitespace = False
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
read_kwargs = {
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
'on_bad_lines': 'skip',
|
| 107 |
}
|
| 108 |
-
if use_whitespace:
|
| 109 |
-
read_kwargs['delim_whitespace'] = True
|
| 110 |
-
else:
|
| 111 |
-
read_kwargs['sep'] = detected_sep
|
| 112 |
|
| 113 |
if col_names is None:
|
| 114 |
-
read_kwargs[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
else:
|
| 116 |
-
read_kwargs[
|
| 117 |
-
read_kwargs['names'] = col_names
|
| 118 |
|
| 119 |
return pd.read_csv(io.BytesIO(data_bytes), **read_kwargs)
|
| 120 |
|
|
|
|
| 85 |
return df
|
| 86 |
|
| 87 |
if encoding is None:
|
| 88 |
+
det = chardet.detect(data_bytes)
|
| 89 |
+
encoding = det.get("encoding", "utf-8")
|
|
|
|
| 90 |
|
| 91 |
+
if encoding.lower() in ("utf-16", "utf-16le", "utf-16be",
|
| 92 |
+
"utf-32", "utf-32le", "utf-32be"):
|
| 93 |
+
text = data_bytes.decode(encoding, errors="ignore")
|
| 94 |
+
data_bytes = text.encode("utf-8")
|
| 95 |
+
encoding = "utf-8"
|
| 96 |
+
|
| 97 |
+
sample = data_bytes[:10000].decode(encoding, errors="ignore")
|
| 98 |
+
|
| 99 |
+
first_line = sample.splitlines()[0].strip()
|
| 100 |
+
|
| 101 |
+
if sep is not None:
|
| 102 |
+
detected_sep = sep
|
| 103 |
use_whitespace = False
|
| 104 |
+
|
| 105 |
+
elif "," in first_line:
|
| 106 |
+
detected_sep = ","
|
| 107 |
+
use_whitespace = False
|
| 108 |
+
|
| 109 |
+
else:
|
| 110 |
+
try:
|
| 111 |
+
dialect = csv.Sniffer().sniff(
|
| 112 |
+
sample,
|
| 113 |
+
delimiters=[",", ";", "\t", "|"]
|
| 114 |
+
)
|
| 115 |
+
detected_sep = dialect.delimiter
|
| 116 |
+
use_whitespace = False
|
| 117 |
+
except csv.Error:
|
| 118 |
+
detected_sep = None
|
| 119 |
+
use_whitespace = True # fallback
|
| 120 |
|
| 121 |
read_kwargs = {
|
| 122 |
+
"engine": "python",
|
| 123 |
+
"encoding": encoding,
|
| 124 |
+
"na_values": na_values,
|
| 125 |
+
"skipinitialspace": True,
|
| 126 |
+
"on_bad_lines": "skip",
|
|
|
|
| 127 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
if col_names is None:
|
| 130 |
+
read_kwargs["header"] = 0
|
| 131 |
+
else:
|
| 132 |
+
read_kwargs["header"] = None
|
| 133 |
+
read_kwargs["names"] = col_names
|
| 134 |
+
|
| 135 |
+
if use_whitespace:
|
| 136 |
+
read_kwargs["delim_whitespace"] = True
|
| 137 |
else:
|
| 138 |
+
read_kwargs["sep"] = detected_sep
|
|
|
|
| 139 |
|
| 140 |
return pd.read_csv(io.BytesIO(data_bytes), **read_kwargs)
|
| 141 |
|