from datasets import Dataset, DatasetDict
import pandas as pd


def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
    """
    Convert raw ChatDoctor data to a Hugging Face DatasetDict.

    The raw file is parsed line by line (rather than with ``json.load``)
    on the assumption that the dump is JSON-like but not reliably valid
    JSON.  Each record is a run of ``"key": "value"`` lines terminated by
    a ``}`` or ``},`` line; a record is kept only when it yields exactly
    ``len(header)`` fields, otherwise it is silently dropped.

    :param data_path: path to the raw ChatDoctor data file
    :param header: list of column names, one per field of a record
    :param hf_data_path: optional Hub repo id; when given, the resulting
        DatasetDict is uploaded via ``push_to_hub``
    :return: pandas DataFrame of the successfully parsed rows
    """
    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        """Extract the value part of a raw '"key": "value",' line."""
        text = text.strip()
        # Split on the FIRST colon only: dialogue values frequently
        # contain colons themselves ("Doctor: ...", timestamps), and
        # split(":")[1] would truncate everything after the second colon.
        text = text.split(":", 1)[1].strip()
        # Peel off the trailing comma and the surrounding quotes left
        # over from the raw JSON-like format.
        text = text.strip(",").strip("\"")
        text = text.lstrip(",").lstrip("\"")
        return text.strip()

    # Explicit UTF-8: the dumps contain non-ASCII dialogue text, and the
    # platform default encoding is not guaranteed to handle it.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                if line.strip() in ['[', '{', ']']:
                    continue  # structural tokens, not data
                if line.strip() in ["},", "}"]:
                    # End of a record: keep it only if it is complete.
                    if len(buffer) == len(header):
                        rows.append(buffer)
                    buffer = []
                else:
                    buffer.append(clean_line(line))
            except Exception as e:
                # Best-effort parsing: count and report, keep going.
                print("Error in processing line. Detail: {}".format(e))
                errors += 1
    df_train = pd.DataFrame(rows, columns=header)
    hf_data_train = Dataset.from_pandas(df_train)
    hf_data = DatasetDict({'train': hf_data_train})

    if hf_data_path is not None:
        hf_data.push_to_hub(hf_data_path)

    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))

    return df_train