Spaces:

lavita
/

medical-question-answering-datasets

adding method for converting ChatDoctor

66cdad4 about 1 year ago

1.2 kB

	from datasets import Dataset, DatasetDict
	import pandas as pd


	def process_ChatDoctor_data(data_path, hf_data_path=None):
	    """
	    Convert a ChatDoctor data file into a Hugging Face dataset.

	    The file is scanned line by line instead of being parsed as JSON. It is
	    expected to be pretty-printed with one ``"key": "value"`` pair per line,
	    and every record is expected to hold exactly three fields, in the order
	    instruction, input, output. Records with a different number of fields
	    are dropped and reported on stdout.

	    :param data_path: path of the ChatDoctor file to read
	    :param hf_data_path: Hub repository id the resulting ``DatasetDict``
	        (single ``train`` split) is pushed to; nothing is pushed when ``None``
	    :return: pandas DataFrame with the columns ``instruction``, ``input``
	        and ``output``, one row per complete record
	    """
	    columns = ["instruction", "input", "output"]
	    record_open_close = ('[', '{', ']')
	    record_end = ("},", "}")

	    buffer = []
	    rows = []
	    skipped = 0

	    def clean_line(text):
	        # Cut at the FIRST colon only: the value itself may contain colons
	        # ("Dose: 5 mg at 10:30"), and everything after the key belongs to it.
	        _, _, value = text.strip().partition(":")
	        value = value.strip()
	        # Drop the trailing field separator and the surrounding quotes.
	        # JSON escapes such as \n are left as written; they are not decoded.
	        value = value.strip(",").strip("\"")
	        value = value.lstrip(",").lstrip("\"")
	        return value.strip()

	    # JSON text is UTF-8; do not depend on the platform's default encoding.
	    with open(data_path, 'r', encoding="utf-8") as file:
	        for line in file:
	            stripped = line.strip()
	            # Blank lines and structural brackets carry no field data.
	            if not stripped or stripped in record_open_close:
	                continue
	            if stripped in record_end:
	                if len(buffer) == len(columns):
	                    rows.append(buffer)
	                else:
	                    skipped += 1
	                # Always start the next record from a clean buffer so one
	                # malformed record cannot corrupt the ones that follow.
	                buffer = []
	            elif ":" in stripped:
	                buffer.append(clean_line(stripped))
	            # Lines without a key/value separator are ignored; the record
	            # they belong to is still validated by its field count above.

	    df_train = pd.DataFrame(rows, columns=columns)
	    if hf_data_path is not None:
	        # Build the Hub objects only when they are actually pushed.
	        hf_data_train = Dataset.from_pandas(df_train)
	        hf_data = DatasetDict({'train': hf_data_train})
	        hf_data.push_to_hub(hf_data_path)
	    print("processed data points: {}".format(len(df_train)))
	    if skipped:
	        print("skipped incomplete records: {}".format(skipped))
	    return df_train