from datasets import Dataset, DatasetDict
import pandas as pd


def process_ChatDoctor_data(data_path, hf_data_path=None):
    """
    Convert a ChatDoctor JSON dump into a Hugging Face dataset.

    The file is scanned line by line instead of being parsed with ``json``.
    It is expected to be pretty-printed with every ``"key": "value"`` pair on
    its own line and every ``[``, ``{``, ``}``/``},`` and ``]`` on its own
    line. The values of each object are collected in file order and mapped
    positionally onto the columns ``instruction``, ``input`` and ``output``;
    the key names themselves are not inspected.

    Objects that do not yield exactly three values, and non-structural lines
    that contain no ``:`` separator, are skipped and counted.

    NOTE(review): JSON escape sequences inside values (``\\"``, ``\\n``, ...)
    are not decoded by this line-based parser — confirm that is acceptable
    for the target data.

    :param data_path: path to the ChatDoctor JSON file (read as UTF-8)
    :param hf_data_path: Hugging Face Hub repository id; when not ``None``
        the resulting ``DatasetDict`` (single ``train`` split) is pushed there
    :return: ``pandas.DataFrame`` with the columns ``instruction``,
        ``input`` and ``output``
    """
    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        """Return the value part of a ``"key": "value",`` line, unquoted."""
        # Split on the FIRST colon only: the value itself may contain colons
        # (e.g. "Patient: ..."), which split(":")[1] used to truncate.
        _, _, value = text.strip().partition(":")
        value = value.strip()
        value = value.strip(",").strip("\"")
        value = value.lstrip(",").lstrip("\"")
        return value.strip()

    # Explicit encoding so the result does not depend on the platform default.
    with open(data_path, 'r', encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no data (previously raised IndexError).
                continue
            if stripped in ['[', '{', ']']:
                continue
            if stripped in ["},", "}"]:
                # End of one object: keep it only if it is a complete record.
                if len(buffer) == 3:
                    rows.append(buffer)
                else:
                    errors += 1
                buffer = []
            elif ":" not in stripped:
                # Not a key/value line; skip it instead of crashing.
                errors += 1
            else:
                buffer.append(clean_line(stripped))

    df_train = pd.DataFrame(rows, columns=["instruction", "input", "output"])
    hf_data_train = Dataset.from_pandas(df_train)
    hf_data = DatasetDict({'train': hf_data_train})

    if hf_data_path is not None:
        hf_data.push_to_hub(hf_data_path)

    print("processed data points: {}".format(len(df_train)))
    if errors:
        print("skipped malformed lines/records: {}".format(errors))
    return df_train