Spaces:

lavita
/

medical-question-answering-datasets

phosseini committed on Sep 9, 2023

Commit

66cdad4

•

1 Parent(s): 71d6a9c

adding method for converting ChatDoctor

Files changed (1) hide show

src/data_preprocessing.py ADDED Viewed

+from datasets import Dataset, DatasetDict
+import pandas as pd
+def process_ChatDoctor_data(data_path, hf_data_path=None):
+    """
+    converting ChatDoctor data to hugging face Dataset
+    :param data_path:
+    :param hf_data_path:
+    :return:
+    """
+    buffer = []
+    rows = []
+    errors = 0
+    def clean_line(text):
+        text = text.strip()
+        text = text.split(":")[1].strip()
+        text = text.strip(",").strip("\"")
+        text = text.lstrip(",").lstrip("\"")
+        return text.strip()
+    with open(data_path, 'r') as file:
+        for line in file:
+            if line.strip() in ['[', '{', ']']:
+                continue
+            if line.strip() in ["},", "}"]:
+                if len(buffer) == 3:
+                    rows.append(buffer)
+                buffer = []
+            else:
+                buffer.append(clean_line(line))
+    df_train = pd.DataFrame(rows, columns=["instruction", "input", "output"])
+    hf_data_train = Dataset.from_pandas(df_train)
+    hf_data = DatasetDict({'train': hf_data_train})
+    if hf_data_path is not None:
+        hf_data.push_to_hub(hf_data_path)
+    print("processed data points: {}".format(len(df_train)))
+    return df_train