phosseini committed on
Commit
66cdad4
1 Parent(s): 71d6a9c

adding method for converting ChatDoctor

Browse files
Files changed (1) hide show
  1. src/data_preprocessing.py +39 -0
src/data_preprocessing.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset, DatasetDict
2
+ import pandas as pd
3
+
4
+
5
def process_ChatDoctor_data(data_path, hf_data_path=None):
    """
    Convert a ChatDoctor data file to a pandas DataFrame and, optionally,
    push it to the Hugging Face Hub as a ``DatasetDict`` with a ``train`` split.

    The file is scanned line by line: lines consisting only of ``[``, ``{`` or
    ``]`` (and blank lines) are skipped, a ``}`` / ``},`` line closes the
    current record, and every other line is treated as one ``"key": value``
    field. Fields are assigned to columns by position (instruction, input,
    output); the key text itself is not inspected. Records that do not have
    exactly three fields are dropped and counted.

    :param data_path: path of the ChatDoctor file to read (opened as UTF-8)
    :param hf_data_path: if not None, passed to ``push_to_hub`` as the target
        repository for the converted dataset
    :return: DataFrame with columns ``instruction``, ``input``, ``output``
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import json

    columns = ["instruction", "input", "output"]
    structural_lines = {"[", "{", "]"}
    record_end_lines = {"},", "}"}

    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        # Split on the FIRST colon only, so colons inside the value
        # (e.g. "10:30", "Note: ...") are preserved instead of truncating it.
        _, _, value = text.strip().partition(":")
        value = value.strip()
        # Drop the field separator that follows every field but the last one.
        if value.endswith(","):
            value = value[:-1].rstrip()
        # Prefer real JSON decoding so escapes (\" \n \uXXXX) are resolved.
        try:
            decoded = json.loads(value)
        except ValueError:
            decoded = None
        if isinstance(decoded, str):
            return decoded.strip()
        # Fallback for values that are not a valid JSON string: best-effort
        # removal of surrounding commas and quotes.
        value = value.strip(",").strip("\"")
        value = value.lstrip(",").lstrip("\"")
        return value.strip()

    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped or stripped in structural_lines:
                continue
            if stripped in record_end_lines:
                if len(buffer) == len(columns):
                    rows.append(buffer)
                else:
                    errors += 1
                # Always start the next record from a clean buffer; otherwise
                # one malformed record would poison every record after it.
                buffer = []
            else:
                buffer.append(clean_line(stripped))

    # Fields left over without a closing brace form an incomplete record.
    if buffer:
        errors += 1

    df_train = pd.DataFrame(rows, columns=columns)

    # Only build the Hugging Face dataset when it is actually going to be used.
    if hf_data_path is not None:
        hf_data = DatasetDict({'train': Dataset.from_pandas(df_train)})
        hf_data.push_to_hub(hf_data_path)

    print("processed data points: {}".format(len(df_train)))
    if errors:
        print("skipped malformed records: {}".format(errors))
    return df_train