Spaces:

lavita
/

medical-question-answering-datasets

Running

changing parameter name

5d61750 over 1 year ago

1.47 kB

	from datasets import Dataset, DatasetDict
	import pandas as pd


	def process_ChatDoctor_data(data_path: str, header: list, hf_data_path=None):
	"""
	converting ChatDoctor data to hugging face Dataset
	:param data_path:
	:param header: a list of header names
	:param hf_data_path:
	:return:
	"""
	buffer = []
	rows = []
	errors = 0

	def clean_line(text):
	text = text.strip()
	text = text.split(":")[1].strip()
	text = text.strip(",").strip("\"")
	text = text.lstrip(",").lstrip("\"")
	return text.strip()

	with open(data_path, 'r') as file:
	for line in file:
	try:
	if line.strip() in ['[', '{', ']']:
	continue
	if line.strip() in ["},", "}"]:
	if len(buffer) == len(header):
	rows.append(buffer)
	buffer = []
	else:
	buffer.append(clean_line(line))
	except Exception as e:
	print("Error in processing line. Detail: {}".format(e))
	errors += 1
	df_train = pd.DataFrame(rows, columns=header)
	hf_data_train = Dataset.from_pandas(df_train)
	hf_data = DatasetDict({'train': hf_data_train})

	if hf_data_path is not None:
	hf_data.push_to_hub(hf_data_path)

	print("Processed data points:\nSuccessful: {}, Failed: {}".format(len(df_train), errors))

	return df_train