import os
import argparse
import json
import logging

import pandas as pd

COMET_REF_MODELS = ["wmt20-comet-da", "wmt21-comet-mqm", "wmt22-comet-da"]
COMET_SRC_MODELS = ["wmt20-comet-qe-da", "wmt21-comet-qe-mqm", "wmt22-cometkiwi-da"]
DOC_SCORING_SCRIPT = "./doc_score.py"
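
# Evaluates MT hypotheses against references/sources with sacrebleu (BLEU, chrF)
# and COMET, at sentence level and at document level (documents are rebuilt from
# a doc-ids file). Per-direction scores are logged and aggregated into a JSON
# file and a plain-text table per hypothesis directory.
#
# Example invocation (illustrative script name and paths, not from a fixed setup):
#   python evaluate.py \
#       --testset data/testsets --docids data/docids \
#       --hypotheses outputs/model_a outputs/model_b \
#       --directions en-de de-en \
#       --metrics bleu chrf comet doc-bleu doc-chrf doc-comet \
#       --comet-models wmt22-comet-da wmt22-cometkiwi-da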
def count_lines(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count
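
# Reads only the tail of the file to return its last line; used below to grab
# the final line of comet-score output, whose last whitespace-separated token
# is parsed as the system-level score.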
def read_last_line(fname):
    with open(fname, 'rb') as f:
        try:  # catch OSError in case of a one line file
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            f.seek(0)
        last_line = f.readline().decode()
    return last_line
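
# True when the sentence at idx is the last one of its document, i.e. the next
# entry in the doc-ids list differs from the current one, or idx is the final
# line. E.g. doc_ids = ["d1", "d1", "d2"] -> boundaries at idx 1 and idx 2.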
def is_doc_boundary(doc_ids, idx):
    after_idx = min(len(doc_ids) - 1, idx + 1)
    return (doc_ids[after_idx] != doc_ids[idx]) or (idx == len(doc_ids) - 1)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--testset', type=str, required=True, help='Path to the test set directory containing references and sources for each language pair. Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang} and {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{src_lang}')
    parser.add_argument('--docids', type=str, required=False, help='Path to the directory containing doc-ids corresponding to the test set for each language pair. Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.docids')
    parser.add_argument('--hypotheses', type=str, nargs='+', required=True, help='One or more paths to model output directories. Each must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang}')
    parser.add_argument('--directions', type=str, required=True, nargs='+', help='Language directions to evaluate on, e.g. "en-de de-en"')
    parser.add_argument('--comet-models', type=str, required=False, nargs='+', help='A list of COMET models to use for evaluation')
    parser.add_argument('--gpus', type=int, required=False, default=1, help='Number of GPUs to use with COMET')
    parser.add_argument('--metrics', type=str, required=True, nargs='+', help='A list of metrics to use for evaluation; options: ["bleu", "comet", "doc-comet", "chrf", "doc-bleu", "doc-chrf"]')
    parser.add_argument('--save-name', type=str, required=False, default='scores', help='Name of the output files/folders')
    parser.add_argument('--sliding-window', type=int, required=False, default=1, help='Stride of the sliding window over each document (doc-comet only)')
    parser.add_argument('--context-length', type=int, required=False, default=4, help='Number of sentences in a single context window (doc-comet only)')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
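
    # Score every hypothesis directory on every requested language direction.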
    for hypotheses in args.hypotheses:
        scores = {}
        for direction in args.directions:
            src = direction.split('-')[0]
            tgt = direction.split('-')[1]
            logging.info(f"Evaluating {direction}")
            os.makedirs(f"{hypotheses}/{src}{tgt}/{args.save_name}", exist_ok=True)
            hyp_file = f"{hypotheses}/{src}{tgt}/test.{direction}.{tgt}"
            src_file = f"{args.testset}/{src}{tgt}/test.{direction}.{src}"
            ref_file = f"{args.testset}/{src}{tgt}/test.{direction}.{tgt}"
            hyp_line_count = count_lines(hyp_file)
            src_line_count = count_lines(src_file)
            ref_line_count = count_lines(ref_file)
            assert (ref_line_count == hyp_line_count) and (ref_line_count == src_line_count), \
                f"ref_file = {ref_line_count}, hyp_file = {hyp_line_count}, src_file = {src_line_count} - source/reference/hypothesis line counts must match"
            scores[direction] = {
                "references": ref_file,
                "hypotheses": hyp_file,
                "sources": src_file
            }
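            # Sentence-level BLEU/chrF via the sacrebleu CLI; its JSON output is
            # written to {save_name}/{metric}.scores and read back in.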
for m in ["chrf", "bleu"]:
tokenizer = "ja-mecab" if tgt == "ja" else "zh" if tgt == "zh" else "13a"
if m in args.metrics:
command = f"sacrebleu -m {m} -tok {tokenizer} {ref_file} < {hyp_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores", 'r') as score_file:
score = json.load(score_file)
# Logging detailed evaluation
logging.info(f"{direction} {m} scores: {json.dumps(score, indent=2)}")
scores[direction][m] = score
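                # Document-level BLEU/chrF: sentences sharing a doc id are joined
                # into a single line per document, then scored with the same
                # sacrebleu call as above.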
if f"doc-{m}" in args.metrics:
assert args.docids, f'document ids directory must be probided to calculate doc-{m}'
docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
scores[direction]["docids"] = docids_file
docids_line_count = count_lines(docids_file)
assert docids_line_count == src_line_count, "Doc Ids file line count is not matching"
with open(src_file, 'r') as f_src, open(ref_file, 'r') as f_ref, open(hyp_file, 'r') as f_hyp, open(docids_file, 'r') as f_docids:
lines_src = [x.strip() for x in f_src.readlines()]
lines_ref = [x.strip() for x in f_ref.readlines()]
lines_hyp = [x.strip() for x in f_hyp.readlines()]
docid_lines = [x.strip() for x in f_docids.readlines()]
assert len(lines_src) == len(docid_lines), "Doc id file lines are not matching"
docs_src, docs_ref, docs_hyp = [], [], []
current_doc = []
i = 0
while i < len(lines_src):
current_doc.append({
'source': lines_src[i],
'reference': lines_ref[i],
'hypothesis': lines_hyp[i]
})
if is_doc_boundary(docid_lines, i):
docs_src.append([current_doc[j]['source'] for j in range(len(current_doc))])
docs_ref.append([current_doc[j]['reference'] for j in range(len(current_doc))])
docs_hyp.append([current_doc[j]['hypothesis'] for j in range(len(current_doc))])
current_doc = []
i += 1
assert len(docs_src) == len(docs_ref) and len(docs_src) == len(docs_hyp), "docs reconstruction failed"
tmp_dir = f"{hypotheses}/{src}{tgt}/{args.save_name}/tmp"
os.makedirs(tmp_dir, exist_ok=True)
with open(f"{tmp_dir}/test.{direction}.docsnt.src.{src}", 'w') as src_tmp_out, open(f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}", 'w') as ref_tmp_out, open(f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}", 'w') as hyp_tmp_out:
for s_doc, r_doc, h_doc in zip(docs_src, docs_ref, docs_hyp):
s = ' '.join([x.strip() for x in s_doc]).strip()
r = ' '.join([x.strip() for x in r_doc]).strip()
h = ' '.join([x.strip() for x in h_doc]).strip()
src_tmp_out.write(s + '\n')
ref_tmp_out.write(r + '\n')
hyp_tmp_out.write(h + '\n')
tmp_ref_path = f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}"
tmp_hyp_path = f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}"
command = f"sacrebleu -m {m} -tok {tokenizer} {tmp_ref_path} < {tmp_hyp_path} > {hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores", 'r') as score_file:
score = json.load(score_file)
logging.info(f"{direction} doc-{m} scores: {json.dumps(score, indent=2)}")
scores[direction][f'doc-{m}'] = score
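            # Sentence-level COMET: reference-based models get -r {ref_file},
            # while QE models score hypotheses from the source alone.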
if "comet" in args.metrics:
scores[direction]['comet'] = {}
for model in args.comet_models:
if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
logging.info(f"Skipping evaluation using {model} since it is not available")
continue
if model in COMET_REF_MODELS:
command = f"comet-score -s {src_file} -t {hyp_file} -r {ref_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
logging.info(command)
os.system(command)
score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
score = float(score_line.split()[-1])
scores[direction]['comet'][model] = score
elif model in COMET_SRC_MODELS:
command = f"comet-score -s {src_file} -t {hyp_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
logging.info(command)
os.system(command)
score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
score = float(score_line.split()[-1])
scores[direction]['comet'][model] = score
logging.info(f"{direction} comet scores: {json.dumps(scores[direction]['comet'], indent=2)}")
if "doc-comet" in args.metrics:
assert args.docids, 'document ids directory must be probided to calculate doc-comet'
docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
scores[direction]["docids"] = docids_file
docids_line_count = count_lines(docids_file)
assert docids_line_count == src_line_count, "Doc Ids file line count is not matching"
scores[direction]['doc-comet'] = {}
for model in ["wmt22-cometkiwi-da"]:
if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
logging.info(f"Skipping evaluation using {model} since it is not available")
continue
if model in COMET_REF_MODELS:
command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} -ref {ref_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
score_json = json.load(score_file)
score = score_json['score']
scores[direction]['doc-comet'][model] = score
elif model in COMET_SRC_MODELS:
command =f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
score_json = json.load(score_file)
score = score_json['score']
scores[direction]['doc-comet'][model] = score
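
        # Persist the full per-direction results as JSON, then flatten them into
        # a one-row-per-direction table.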
with open(f"{hypotheses}/{args.save_name}.json", 'w') as score_file:
score_file.write(json.dumps(scores, indent=2))
scores_csv = {
'langs': args.directions,
}
for metric in args.metrics:
if metric == 'comet':
for model in args.comet_models:
scores_csv[f"{model}"] = []
for lang in args.directions:
scores_csv[f"{model}"].append(scores[lang][metric][model])
elif metric == 'doc-comet':
for model in ["wmt22-cometkiwi-da"]:
scores_csv[f"doc-{model}"] = []
for lang in args.directions:
scores_csv[f"doc-{model}"].append(scores[lang][metric][model])
else:
scores_csv[metric] = []
for lang in args.directions:
scores_csv[f"{metric}"].append(scores[lang][metric]["score"])
df = pd.DataFrame(scores_csv)
logging.info(f"Scores:\n{df}")
with open(f"{hypotheses}/{args.save_name}.txt", 'w') as score_file:
print(df, file=score_file)


if __name__ == "__main__":
    main()