Spaces:

lavita
/

medical-question-answering-datasets

Running

phosseini committed on Sep 11, 2023

Commit

316112a

1 Parent(s): fb946e8

adding iCliniq data + test methods

Files changed (2) hide show

src/data_preprocessing.py CHANGED Viewed

@@ -2,10 +2,11 @@ from datasets import Dataset, DatasetDict
 import pandas as pd
-def process_ChatDoctor_data(data_path, hf_data_path=None):
     """
     converting ChatDoctor data to hugging face Dataset
     :param data_path:
     :param hf_data_path:
     :return:
     """
@@ -22,18 +23,25 @@ def process_ChatDoctor_data(data_path, hf_data_path=None):
     with open(data_path, 'r') as file:
         for line in file:
-            if line.strip() in ['[', '{', ']']:
-                continue
-            if line.strip() in ["},", "}"]:
-                if len(buffer) == 3:
-                    rows.append(buffer)
-                buffer = []
-            else:
-                buffer.append(clean_line(line))
-    df_train = pd.DataFrame(rows, columns=["instruction", "input", "output"])
     hf_data_train = Dataset.from_pandas(df_train)
     hf_data = DatasetDict({'train': hf_data_train})
     if hf_data_path is not None:
         hf_data.push_to_hub(hf_data_path)
-    print("processed data points: {}".format(len(df_train)))
     return df_train

 import pandas as pd
+def process_ChatDoctor_data(data_path: str, headers: list, hf_data_path=None):
     """
     converting ChatDoctor data to hugging face Dataset
     :param data_path:
+    :param headers: a list of header names
     :param hf_data_path:
     :return:
     """
     with open(data_path, 'r') as file:
         for line in file:
+            try:
+                if line.strip() in ['[', '{', ']']:
+                    continue
+                if line.strip() in ["},", "}"]:
+                    if len(buffer) == len(headers):
+                        rows.append(buffer)
+                    buffer = []
+                else:
+                    buffer.append(clean_line(line))
+            except Exception as e:
+                print("Error in processing line. Detail: {}".format(e))
+                errors += 1
+    df_train = pd.DataFrame(rows, columns=headers)
     hf_data_train = Dataset.from_pandas(df_train)
     hf_data = DatasetDict({'train': hf_data_train})
     if hf_data_path is not None:
         hf_data.push_to_hub(hf_data_path)
+    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))
     return df_train

tests/test_data_conversion.py ADDED Viewed

+import os
+import sys
+path = os.getcwd()
+sys.path.append('{}/src'.format('/'.join(path.split('/')[:-1])))
+import unittest
+from data_preprocessing import process_ChatDoctor_data
+class TestMethods(unittest.TestCase):
+    def test_a_process_ChatDoctor_data(self):
+        data_path = "../data/"
+        data_dict = {"HealthCareMagic-100k.json": ["instruction", "input", "output"],
+                     "iCliniq.json": ["input", "answer_icliniq", "answer_chatgpt", "answer_chatdoctor"]
+                     }
+        for data_name, headers in data_dict.items():
+            process_ChatDoctor_data(data_path + data_name, headers=headers)
+if __name__ == '__main__':
+    unittest.main()