changing parameter name
Browse files
src/data_preprocessing.py
CHANGED
@@ -2,11 +2,11 @@ from datasets import Dataset, DatasetDict
|
|
2 |
import pandas as pd
|
3 |
|
4 |
|
5 |
-
def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
|
6 |
"""
|
7 |
converting ChatDoctor data to hugging face Dataset
|
8 |
:param data_path:
|
9 |
-
:param headers: a list of header names
|
10 |
:param hf_data_path:
|
11 |
:return:
|
12 |
"""
|
@@ -27,7 +27,7 @@ def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
|
|
27 |
if line.strip() in ['[', '{', ']']:
|
28 |
continue
|
29 |
if line.strip() in ["},", "}"]:
|
30 |
-
if len(buffer) == len(headers):
|
31 |
rows.append(buffer)
|
32 |
buffer = []
|
33 |
else:
|
@@ -35,7 +35,7 @@ def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
|
|
35 |
except Exception as e:
|
36 |
print("Error in processing line. Detail: {}".format(e))
|
37 |
errors += 1
|
38 |
-
df_train = pd.DataFrame(rows, columns=headers)
|
39 |
hf_data_train = Dataset.from_pandas(df_train)
|
40 |
hf_data = DatasetDict({'train': hf_data_train})
|
41 |
|
|
|
2 |
import pandas as pd
|
3 |
|
4 |
|
5 |
+
def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
|
6 |
"""
|
7 |
converting ChatDoctor data to hugging face Dataset
|
8 |
:param data_path:
|
9 |
+
:param header: a list of header names
|
10 |
:param hf_data_path:
|
11 |
:return:
|
12 |
"""
|
|
|
27 |
if line.strip() in ['[', '{', ']']:
|
28 |
continue
|
29 |
if line.strip() in ["},", "}"]:
|
30 |
+
if len(buffer) == len(header):
|
31 |
rows.append(buffer)
|
32 |
buffer = []
|
33 |
else:
|
|
|
35 |
except Exception as e:
|
36 |
print("Error in processing line. Detail: {}".format(e))
|
37 |
errors += 1
|
38 |
+
df_train = pd.DataFrame(rows, columns=header)
|
39 |
hf_data_train = Dataset.from_pandas(df_train)
|
40 |
hf_data = DatasetDict({'train': hf_data_train})
|
41 |
|
tests/test_data_conversion.py
CHANGED
@@ -15,7 +15,7 @@ class TestMethods(unittest.TestCase):
|
|
15 |
"iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
|
16 |
}
|
17 |
for data_name, headers in data_dict.items():
|
18 |
-
process_ChatDoctor_data(data_path + data_name, headers=headers)
|
19 |
|
20 |
|
21 |
if __name__ == '__main__':
|
|
|
15 |
"iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
|
16 |
}
|
17 |
for data_name, headers in data_dict.items():
|
18 |
+
process_ChatDoctor_data(data_path + data_name, header=headers)
|
19 |
|
20 |
|
21 |
if __name__ == '__main__':
|