from datasets import Dataset, DatasetDict
import pandas as pd


def _clean_line(text):
    """Extract the value from one ``"key": "value",`` line of the data file.

    Only the FIRST colon separates key from value; any further colons belong
    to the value itself (e.g. ``"input": "Hi doctor: I have ..."``) and are
    kept. Surrounding commas and double quotes are stripped. JSON escape
    sequences inside the value (``\\n``, ``\\"`` ...) are left as written.

    :param text: a single line from the data file
    :return: the cleaned value string
    :raises ValueError: if the line contains no ``:`` separator
    """
    _, separator, value = text.strip().partition(":")
    if not separator:
        raise ValueError("no ':' separator in line: {!r}".format(text.strip()))
    value = value.strip()
    value = value.strip(",").strip("\"")
    value = value.lstrip(",").lstrip("\"")
    return value.strip()


def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
    """
    Convert a ChatDoctor data file into a pandas DataFrame and, optionally,
    push it to the Hugging Face Hub as a DatasetDict with a single 'train'
    split.

    The file is read line by line and is expected to be laid out with one
    ``"key": "value"`` pair per line, each record closed by a line containing
    only ``}`` or ``},``. Field values are assigned to columns purely by
    position, so a record is accepted only when it yields exactly
    ``len(header)`` values; any other record is dropped and counted as failed.

    :param data_path: path to the ChatDoctor data file (read as UTF-8)
    :param header: a list of header (column) names, in the order the fields
        appear inside each record
    :param hf_data_path: Hub repository id to push the dataset to; nothing is
        pushed when None
    :return: pandas DataFrame with one row per successfully parsed record
    """
    rows = []
    buffer = []
    errors = 0
    record_failed = False  # True once any line of the current record failed

    # Explicit encoding: the default is platform dependent and the data may
    # contain non-ASCII characters.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()

            # Structural lines and blank lines carry no field data.
            if stripped in ('', '[', '{', ']'):
                continue

            # End of a record: keep it only if it is complete, then ALWAYS
            # start from a clean buffer so a malformed record cannot leak
            # stale fields into the records that follow it.
            if stripped in ('},', '}'):
                if not record_failed and len(buffer) == len(header):
                    rows.append(buffer)
                else:
                    errors += 1
                buffer = []
                record_failed = False
                continue

            try:
                buffer.append(_clean_line(stripped))
            except ValueError as e:
                print("Error in processing line. Detail: {}".format(e))
                record_failed = True

    # A record left open at end of file (truncated input) is also a failure.
    if buffer or record_failed:
        errors += 1

    df_train = pd.DataFrame(rows, columns=header)

    # Build the Hugging Face dataset only when it is actually going to be used.
    if hf_data_path is not None:
        hf_data = DatasetDict({'train': Dataset.from_pandas(df_train)})
        hf_data.push_to_hub(hf_data_path)

    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))
    return df_train