from datasets import Dataset, DatasetDict
import pandas as pd


def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
    """
    Convert raw ChatDoctor data into a Hugging Face Dataset.

    :param data_path: path to the raw ChatDoctor JSON-style file
    :param header: a list of column names, in the order the fields appear in each record
    :param hf_data_path: optional Hugging Face Hub repository id; if provided, the dataset is pushed there
    :return: a pandas DataFrame containing the parsed records
    """
|
    buffer = []
    rows = []
    errors = 0

    def clean_line(text):
        # Keep everything after the first ':' so values that themselves contain colons are preserved.
        text = text.strip()
        text = text.split(":", 1)[1].strip()
        # Drop the trailing comma and the surrounding JSON-style quotes.
        text = text.strip(",").strip("\"")
        return text.strip()
|
|
|
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                # Skip purely structural JSON lines.
                if line.strip() in ['[', '{', ']']:
                    continue
                if line.strip() in ["},", "}"]:
                    # End of a record: keep it only if every expected field was collected.
                    if len(buffer) == len(header):
                        rows.append(buffer)
                    buffer = []
                else:
                    buffer.append(clean_line(line))
            except Exception as e:
                print("Error processing line. Detail: {}".format(e))
                errors += 1

    df_train = pd.DataFrame(rows, columns=header)
    hf_data_train = Dataset.from_pandas(df_train)
    hf_data = DatasetDict({'train': hf_data_train})

    if hf_data_path is not None:
        hf_data.push_to_hub(hf_data_path)

    print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))

    return df_train


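# Minimal usage sketch. The file path, the header fields, and the Hub repo id below are
# placeholder assumptions rather than values taken from this project; adjust them to your setup.
if __name__ == "__main__":
    df = process_ChatDoctor_data(
        data_path="data/chatdoctor.json",           # hypothetical local path to the raw file
        header=["instruction", "input", "output"],  # assumed field order within each record
        hf_data_path=None,                          # set to e.g. "your-username/chatdoctor" to push to the Hub
    )
    print(df.head())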