phosseini committed on
Commit
5d61750
1 Parent(s): 316112a

Rename the `headers` parameter of `process_ChatDoctor_data` to `header`

Browse files
src/data_preprocessing.py CHANGED
@@ -2,11 +2,11 @@ from datasets import Dataset, DatasetDict
2
  import pandas as pd
3
 
4
 
5
- def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
6
  """
7
  converting ChatDoctor data to hugging face Dataset
8
  :param data_path:
9
- :param headers: a list of header names
10
  :param hf_data_path:
11
  :return:
12
  """
@@ -27,7 +27,7 @@ def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
27
  if line.strip() in ['[', '{', ']']:
28
  continue
29
  if line.strip() in ["},", "}"]:
30
- if len(buffer) == len(headers):
31
  rows.append(buffer)
32
  buffer = []
33
  else:
@@ -35,7 +35,7 @@ def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
35
  except Exception as e:
36
  print("Error in processing line. Detail: {}".format(e))
37
  errors += 1
38
- df_train = pd.DataFrame(rows, columns=headers)
39
  hf_data_train = Dataset.from_pandas(df_train)
40
  hf_data = DatasetDict({'train': hf_data_train})
41
 
 
2
  import pandas as pd
3
 
4
 
5
+ def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
6
  """
7
  converting ChatDoctor data to hugging face Dataset
8
  :param data_path:
9
+ :param header: a list of header names
10
  :param hf_data_path:
11
  :return:
12
  """
 
27
  if line.strip() in ['[', '{', ']']:
28
  continue
29
  if line.strip() in ["},", "}"]:
30
+ if len(buffer) == len(header):
31
  rows.append(buffer)
32
  buffer = []
33
  else:
 
35
  except Exception as e:
36
  print("Error in processing line. Detail: {}".format(e))
37
  errors += 1
38
+ df_train = pd.DataFrame(rows, columns=header)
39
  hf_data_train = Dataset.from_pandas(df_train)
40
  hf_data = DatasetDict({'train': hf_data_train})
41
 
tests/test_data_conversion.py CHANGED
@@ -15,7 +15,7 @@ class TestMethods(unittest.TestCase):
15
  "iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
16
  }
17
  for data_name, headers in data_dict.items():
18
- process_ChatDoctor_data(data_path + data_name, headers=headers)
19
 
20
 
21
  if __name__ == '__main__':
 
15
  "iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
16
  }
17
  for data_name, headers in data_dict.items():
18
+ process_ChatDoctor_data(data_path + data_name, header=headers)
19
 
20
 
21
  if __name__ == '__main__':