ElvisWang111 committed on
Commit
e31c0e4
·
verified ·
1 Parent(s): a38eb92

Update workflow/dataloading/dataloading_core.py

Browse files
workflow/dataloading/dataloading_core.py CHANGED
@@ -85,36 +85,57 @@ def read_data_from_file(
85
  return df
86
 
87
  if encoding is None:
88
- detected = chardet.detect(data_bytes)
89
- encoding = detected.get('encoding', 'utf-8')
90
- sample = data_bytes[:10_000].decode(encoding, errors='ignore')
91
 
92
- try:
93
- dialect = csv.Sniffer().sniff(sample, delimiters=[',',';','\t','|'])
94
- detected_sep = dialect.delimiter
 
 
 
 
 
 
 
 
 
95
  use_whitespace = False
96
- except csv.Error:
97
- detected_sep = None
98
- use_whitespace = True
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  read_kwargs = {
101
- 'engine': 'python',
102
- 'encoding': encoding,
103
- 'na_values': na_values,
104
- 'comment': '|',
105
- 'skipinitialspace': True,
106
- 'on_bad_lines': 'skip',
107
  }
108
- if use_whitespace:
109
- read_kwargs['delim_whitespace'] = True
110
- else:
111
- read_kwargs['sep'] = detected_sep
112
 
113
  if col_names is None:
114
- read_kwargs['header'] = 0
 
 
 
 
 
 
115
  else:
116
- read_kwargs['header'] = None
117
- read_kwargs['names'] = col_names
118
 
119
  return pd.read_csv(io.BytesIO(data_bytes), **read_kwargs)
120
 
 
85
  return df
86
 
87
  if encoding is None:
88
+ det = chardet.detect(data_bytes)
89
+ encoding = det.get("encoding", "utf-8")
 
90
 
91
+ if encoding.lower() in ("utf-16", "utf-16le", "utf-16be",
92
+ "utf-32", "utf-32le", "utf-32be"):
93
+ text = data_bytes.decode(encoding, errors="ignore")
94
+ data_bytes = text.encode("utf-8")
95
+ encoding = "utf-8"
96
+
97
+ sample = data_bytes[:10000].decode(encoding, errors="ignore")
98
+
99
+ first_line = sample.splitlines()[0].strip()
100
+
101
+ if sep is not None:
102
+ detected_sep = sep
103
  use_whitespace = False
104
+
105
+ elif "," in first_line:
106
+ detected_sep = ","
107
+ use_whitespace = False
108
+
109
+ else:
110
+ try:
111
+ dialect = csv.Sniffer().sniff(
112
+ sample,
113
+ delimiters=[",", ";", "\t", "|"]
114
+ )
115
+ detected_sep = dialect.delimiter
116
+ use_whitespace = False
117
+ except csv.Error:
118
+ detected_sep = None
119
+ use_whitespace = True # fallback
120
 
121
  read_kwargs = {
122
+ "engine": "python",
123
+ "encoding": encoding,
124
+ "na_values": na_values,
125
+ "skipinitialspace": True,
126
+ "on_bad_lines": "skip",
 
127
  }
 
 
 
 
128
 
129
  if col_names is None:
130
+ read_kwargs["header"] = 0
131
+ else:
132
+ read_kwargs["header"] = None
133
+ read_kwargs["names"] = col_names
134
+
135
+ if use_whitespace:
136
+ read_kwargs["delim_whitespace"] = True
137
  else:
138
+ read_kwargs["sep"] = detected_sep
 
139
 
140
  return pd.read_csv(io.BytesIO(data_bytes), **read_kwargs)
141