medical-question-answering-datasets / src /data_preprocessing.py
phosseini's picture
adding method for converting ChatDoctor
66cdad4
raw
history blame
1.2 kB
import json

import pandas as pd
from datasets import Dataset, DatasetDict
def process_ChatDoctor_data(data_path, hf_data_path=None):
    """
    Convert a ChatDoctor JSON file to a pandas DataFrame and, optionally,
    push it to the Hugging Face Hub as a dataset with a single ``train`` split.

    The file is expected to contain a JSON array of objects, each carrying
    the keys ``"instruction"``, ``"input"`` and ``"output"``. Records that are
    not objects or that miss any of those keys are skipped and counted.

    :param data_path: path to the ChatDoctor JSON file
    :param hf_data_path: optional Hub repository id; when not ``None`` the
        data is wrapped in a ``DatasetDict`` and pushed there
    :return: DataFrame with the columns ``instruction``, ``input``, ``output``
    :raises ValueError: if the file is not valid JSON or its top-level value
        is not a JSON array
    """
    columns = ["instruction", "input", "output"]

    # Use a real JSON parser instead of splitting lines on ":" so that values
    # containing colons, escaped quotes or escape sequences are kept intact
    # and the result does not depend on how the file happens to be indented.
    with open(data_path, "r", encoding="utf-8") as file:
        records = json.load(file)

    if not isinstance(records, list):
        raise ValueError(
            "expected a JSON array of records in {}, got {}".format(
                data_path, type(records).__name__
            )
        )

    rows = []
    errors = 0
    for record in records:
        if isinstance(record, dict) and all(key in record for key in columns):
            rows.append([record[key] for key in columns])
        else:
            # Best effort: keep going, but remember how many were dropped.
            errors += 1

    df_train = pd.DataFrame(rows, columns=columns)

    # Only build the Hugging Face objects when they are actually pushed;
    # they are not part of the return value.
    if hf_data_path is not None:
        hf_data_train = Dataset.from_pandas(df_train)
        hf_data = DatasetDict({'train': hf_data_train})
        hf_data.push_to_hub(hf_data_path)

    print("processed data points: {}".format(len(df_train)))
    if errors:
        print("skipped malformed records: {}".format(errors))
    return df_train