from datasets import Dataset, DatasetDict
import pandas as pd


def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
    """
    Convert raw ChatDoctor data to a Hugging Face DatasetDict.

    The raw file is parsed line by line (rather than with ``json.load``)
    on the assumption that the dump is JSON-like but not reliably valid
    JSON.  Each record is a run of ``"key": "value"`` lines terminated by
    a ``}`` or ``},`` line; a record is kept only when it yields exactly
    ``len(header)`` fields, otherwise it is silently dropped.

    :param data_path: path to the raw ChatDoctor data file
    :param header: list of column names, one per field of a record
    :param hf_data_path: optional Hub repo id; when given, the resulting
        DatasetDict is uploaded via ``push_to_hub``
    :return: pandas DataFrame of the successfully parsed rows
    """
    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        """Extract the value part of a raw '"key": "value",' line."""
        text = text.strip()
        # Split on the FIRST colon only: dialogue values frequently
        # contain colons themselves ("Doctor: ...", timestamps), and
        # split(":")[1] would truncate everything after the second colon.
        text = text.split(":", 1)[1].strip()
        # Peel off the trailing comma and the surrounding quotes left
        # over from the raw JSON-like format.
        text = text.strip(",").strip("\"")
        text = text.lstrip(",").lstrip("\"")
        return text.strip()

    # Explicit UTF-8: the dumps contain non-ASCII dialogue text, and the
    # platform default encoding is not guaranteed to handle it.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                if line.strip() in ['[', '{', ']']:
                    continue  # structural tokens, not data
                if line.strip() in ["},", "}"]:
                    # End of a record: keep it only if it is complete.
                    if len(buffer) == len(header):
                        rows.append(buffer)
                    buffer = []
                else:
                    buffer.append(clean_line(line))
            except Exception as e:
                # Best-effort parsing: count and report, keep going.
                print("Error in processing line. Detail: {}".format(e))
                errors += 1
    df_train = pd.DataFrame(rows, columns=header)
    hf_data_train = Dataset.from_pandas(df_train)
    hf_data = DatasetDict({'train': hf_data_train})

    if hf_data_path is not None:
        hf_data.push_to_hub(hf_data_path)

    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))

    return df_train