phosseini committed on
Commit
316112a
1 Parent(s): fb946e8

adding iCliniq data + test methods

Browse files
src/data_preprocessing.py CHANGED
@@ -2,10 +2,11 @@ from datasets import Dataset, DatasetDict
2
  import pandas as pd
3
 
4
 
5
- def process_ChatDoctor_data(data_path, hf_data_path=None):
6
  """
7
  converting ChatDoctor data to hugging face Dataset
8
  :param data_path:
 
9
  :param hf_data_path:
10
  :return:
11
  """
@@ -22,18 +23,25 @@ def process_ChatDoctor_data(data_path, hf_data_path=None):
22
 
23
  with open(data_path, 'r') as file:
24
  for line in file:
25
- if line.strip() in ['[', '{', ']']:
26
- continue
27
- if line.strip() in ["},", "}"]:
28
- if len(buffer) == 3:
29
- rows.append(buffer)
30
- buffer = []
31
- else:
32
- buffer.append(clean_line(line))
33
- df_train = pd.DataFrame(rows, columns=["instruction", "input", "output"])
 
 
 
 
34
  hf_data_train = Dataset.from_pandas(df_train)
35
  hf_data = DatasetDict({'train': hf_data_train})
 
36
  if hf_data_path is not None:
37
  hf_data.push_to_hub(hf_data_path)
38
- print("processed data points: {}".format(len(df_train)))
 
 
39
  return df_train
 
2
  import pandas as pd
3
 
4
 
5
+ def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
6
  """
7
  converting ChatDoctor data to hugging face Dataset
8
  :param data_path:
9
+ :param headers: a list of header names
10
  :param hf_data_path:
11
  :return:
12
  """
 
23
 
24
  with open(data_path, 'r') as file:
25
  for line in file:
26
+ try:
27
+ if line.strip() in ['[', '{', ']']:
28
+ continue
29
+ if line.strip() in ["},", "}"]:
30
+ if len(buffer) == len(headers):
31
+ rows.append(buffer)
32
+ buffer = []
33
+ else:
34
+ buffer.append(clean_line(line))
35
+ except Exception as e:
36
+ print("Error in processing line. Detail: {}".format(e))
37
+ errors += 1
38
+ df_train = pd.DataFrame(rows, columns=headers)
39
  hf_data_train = Dataset.from_pandas(df_train)
40
  hf_data = DatasetDict({'train': hf_data_train})
41
+
42
  if hf_data_path is not None:
43
  hf_data.push_to_hub(hf_data_path)
44
+
45
+ print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))
46
+
47
  return df_train
tests/test_data_conversion.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ path = os.getcwd()
5
+ sys.path.append('{}/src'.format('/'.join(path.split('/')[:-1])))
6
+
7
+ import unittest
8
+ from data_preprocessing import process_ChatDoctor_data
9
+
10
+
11
+ class TestMethods(unittest.TestCase):
12
+ def test_a_process_ChatDoctor_data(self):
13
+ data_path = "../data/"
14
+ data_dict = {"HealthCareMagic-100k.json": ["instruction", "input", "output"],
15
+ "iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
16
+ }
17
+ for data_name, headers in data_dict.items():
18
+ process_ChatDoctor_data(data_path + data_name, headers=headers)
19
+
20
+
21
+ if __name__ == '__main__':
22
+ unittest.main()