File size: 1,465 Bytes
66cdad4 5d61750 66cdad4 5d61750 66cdad4 316112a 5d61750 316112a 5d61750 66cdad4 316112a 66cdad4 316112a 66cdad4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
from datasets import Dataset, DatasetDict
import pandas as pd
def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
    """Convert a ChatDoctor-style JSON dump into a pandas DataFrame and,
    optionally, push it to the Hugging Face Hub as a ``train`` split.

    The file is parsed line by line rather than with ``json.load`` because
    some ChatDoctor dumps are not strictly valid JSON: structural lines
    (``[``, ``{``, ``}``, ``},``, ``]``) delimit records, and every other
    line is treated as a ``"key": "value"`` pair.

    :param data_path: path to the raw ChatDoctor data file.
    :param header: column names, in the order the fields appear per record.
    :param hf_data_path: optional Hub repo id; when given, the dataset is
        pushed there as the ``train`` split.
    :return: ``pd.DataFrame`` with one row per successfully parsed record.
    """
    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        # Split on the FIRST colon only, so values that themselves contain
        # ':' (e.g. "Hi doctor: I have a headache") are not truncated.
        value = text.strip().split(":", 1)[1].strip()
        # Drop the trailing comma and the surrounding quotes.
        return value.strip(",").strip("\"").strip()

    with open(data_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            try:
                if stripped in ("[", "{", "]"):
                    continue
                if stripped in ("},", "}"):
                    # End of a record: keep it only when every expected
                    # field was seen; reset the buffer either way so one
                    # malformed record cannot corrupt the next.
                    if len(buffer) == len(header):
                        rows.append(buffer)
                    buffer = []
                else:
                    buffer.append(clean_line(line))
            except Exception as e:
                # Best-effort parsing: count and report, keep going.
                print("Error in processing line. Detail: {}".format(e))
                errors += 1

    df_train = pd.DataFrame(rows, columns=header)
    if hf_data_path is not None:
        # Build the Hugging Face dataset only when it is actually pushed.
        hf_data = DatasetDict({'train': Dataset.from_pandas(df_train)})
        hf_data.push_to_hub(hf_data_path)
    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))
    return df_train
|