import os
import argparse
import json
import logging

import pandas as pd

COMET_REF_MODELS = ["wmt20-comet-da", "wmt21-comet-mqm", "wmt22-comet-da"]
COMET_SRC_MODELS = ["wmt20-comet-qe-da", "wmt21-comet-qe-mqm", "wmt22-cometkiwi-da"]
DOC_SCORING_SCRIPT = "./doc_score.py"


def count_lines(fname):
    """Count the newline characters in a file without loading it into memory."""
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def read_last_line(fname):
    """Return the last line of a file by seeking backwards from its end."""
    with open(fname, 'rb') as f:
        try:
            # catch OSError in case of a one line file
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            f.seek(0)
        last_line = f.readline().decode()
    return last_line


def is_doc_boundary(doc_ids, idx):
    """Return True if the sentence at `idx` is the last sentence of its document."""
    after_idx = min(len(doc_ids) - 1, idx + 1)
    return (not doc_ids[after_idx] == doc_ids[idx]) or (idx == len(doc_ids) - 1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--testset', type=str, required=True,
                        help='Path to the test set directory containing references and sources for each language pair. '
                             'Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang} and '
                             '{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{src_lang}')
    parser.add_argument('--docids', type=str, required=False,
                        help='Path to the directory containing doc-ids corresponding to the test set for each language pair. '
                             'Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.docids')
    parser.add_argument('--hypotheses', type=str, nargs='+', required=True,
                        help='Paths to the model output directories. Each must contain '
                             '{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang}')
    parser.add_argument('--directions', type=str, required=True, nargs='+',
                        help='Language directions to evaluate on, e.g. "en-de de-en"')
    parser.add_argument('--comet-models', type=str, required=False, nargs='+',
                        help='A list of COMET models to use for evaluation')
    parser.add_argument('--gpus', type=int, required=False, default=1,
                        help='Number of GPUs to use with COMET')
    parser.add_argument('--metrics', type=str, required=True, nargs='+',
                        help='A list of metrics to use for evaluation, options '
                             '["bleu", "comet", "doc-comet", "chrf", "doc-bleu", "doc-chrf"]')
    parser.add_argument('--save-name', type=str, required=False, default='scores',
                        help='Name of the output files/folders')
    parser.add_argument('--sliding-window', type=int, required=False, default=1,
                        help='Stride of the sliding window over a document')
    parser.add_argument('--context-length', type=int, required=False, default=4,
                        help='Number of sentences in a single context window')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    for hypotheses in args.hypotheses:
        scores = {}
        for direction in args.directions:
            src = direction.split('-')[0]
            tgt = direction.split('-')[1]
            logging.info(f"Evaluating {direction}")

            os.makedirs(f"{hypotheses}/{src}{tgt}/{args.save_name}", exist_ok=True)
            hyp_file = f"{hypotheses}/{src}{tgt}/test.{direction}.{tgt}"
            src_file = f"{args.testset}/{src}{tgt}/test.{direction}.{src}"
            ref_file = f"{args.testset}/{src}{tgt}/test.{direction}.{tgt}"

            hyp_line_count = count_lines(hyp_file)
            src_line_count = count_lines(src_file)
            ref_line_count = count_lines(ref_file)
            assert (ref_line_count == hyp_line_count) and (ref_line_count == src_line_count), \
                f"ref_file = {ref_line_count}, hyp_file = {hyp_line_count}, src_file = {src_line_count} " \
                f"- src/ref/hyp line counts must match"

            scores[direction] = {
                "references": ref_file,
                "hypotheses": hyp_file,
                "sources": src_file
            }

            # Sentence-level and document-level BLEU/chrF via sacrebleu
            for m in ["chrf", "bleu"]:
                tokenizer = "ja-mecab" if tgt == "ja" else "zh" if tgt == "zh" else "13a"

                if m in args.metrics:
                    command = f"sacrebleu -m {m} -tok {tokenizer} {ref_file} < {hyp_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores"
                    logging.info(command)
                    os.system(command)
                    with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores", 'r') as score_file:
                        score = json.load(score_file)
                    # Logging detailed evaluation
                    logging.info(f"{direction} {m} scores: {json.dumps(score, indent=2)}")
                    scores[direction][m] = score

                if f"doc-{m}" in args.metrics:
                    assert args.docids, f'document ids directory must be provided to calculate doc-{m}'
                    docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
                    scores[direction]["docids"] = docids_file
                    docids_line_count = count_lines(docids_file)
                    assert docids_line_count == src_line_count, "Doc ids file line count does not match"

                    with open(src_file, 'r') as f_src, open(ref_file, 'r') as f_ref, \
                            open(hyp_file, 'r') as f_hyp, open(docids_file, 'r') as f_docids:
                        lines_src = [x.strip() for x in f_src.readlines()]
                        lines_ref = [x.strip() for x in f_ref.readlines()]
                        lines_hyp = [x.strip() for x in f_hyp.readlines()]
                        docid_lines = [x.strip() for x in f_docids.readlines()]
                    assert len(lines_src) == len(docid_lines), "Doc id file lines do not match"

                    # Reconstruct documents by grouping consecutive sentences that share a doc id
                    docs_src, docs_ref, docs_hyp = [], [], []
                    current_doc = []
                    i = 0
                    while i < len(lines_src):
                        current_doc.append({
                            'source': lines_src[i],
                            'reference': lines_ref[i],
                            'hypothesis': lines_hyp[i]
                        })
                        if is_doc_boundary(docid_lines, i):
                            docs_src.append([current_doc[j]['source'] for j in range(len(current_doc))])
                            docs_ref.append([current_doc[j]['reference'] for j in range(len(current_doc))])
                            docs_hyp.append([current_doc[j]['hypothesis'] for j in range(len(current_doc))])
                            current_doc = []
                        i += 1
                    assert len(docs_src) == len(docs_ref) and len(docs_src) == len(docs_hyp), "docs reconstruction failed"

                    # Write one concatenated line per document, then score with sacrebleu
                    tmp_dir = f"{hypotheses}/{src}{tgt}/{args.save_name}/tmp"
                    os.makedirs(tmp_dir, exist_ok=True)
                    with open(f"{tmp_dir}/test.{direction}.docsnt.src.{src}", 'w') as src_tmp_out, \
                            open(f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}", 'w') as ref_tmp_out, \
                            open(f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}", 'w') as hyp_tmp_out:
                        for s_doc, r_doc, h_doc in zip(docs_src, docs_ref, docs_hyp):
                            s = ' '.join([x.strip() for x in s_doc]).strip()
                            r = ' '.join([x.strip() for x in r_doc]).strip()
                            h = ' '.join([x.strip() for x in h_doc]).strip()
                            src_tmp_out.write(s + '\n')
                            ref_tmp_out.write(r + '\n')
                            hyp_tmp_out.write(h + '\n')

                    tmp_ref_path = f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}"
                    tmp_hyp_path = f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}"
                    command = f"sacrebleu -m {m} -tok {tokenizer} {tmp_ref_path} < {tmp_hyp_path} > {hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores"
                    logging.info(command)
                    os.system(command)
                    with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores", 'r') as score_file:
                        score = json.load(score_file)
                    logging.info(f"{direction} doc-{m} scores: {json.dumps(score, indent=2)}")
                    scores[direction][f'doc-{m}'] = score

            # Sentence-level COMET (reference-based and reference-free models)
            if "comet" in args.metrics:
                scores[direction]['comet'] = {}
                for model in args.comet_models:
                    if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
                        logging.info(f"Skipping evaluation using {model} since it is not available")
                        continue
                    if model in COMET_REF_MODELS:
                        command = f"comet-score -s {src_file} -t {hyp_file} -r {ref_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
                        logging.info(command)
                        os.system(command)
                        score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
                        score = float(score_line.split()[-1])
                        scores[direction]['comet'][model] = score
                    elif model in COMET_SRC_MODELS:
                        command = f"comet-score -s {src_file} -t {hyp_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
                        logging.info(command)
                        os.system(command)
                        score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
                        score = float(score_line.split()[-1])
                        scores[direction]['comet'][model] = score
                logging.info(f"{direction} comet scores: {json.dumps(scores[direction]['comet'], indent=2)}")

            # Document-level COMET via the external doc scoring script
            if "doc-comet" in args.metrics:
                assert args.docids, 'document ids directory must be provided to calculate doc-comet'
                docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
                scores[direction]["docids"] = docids_file
                docids_line_count = count_lines(docids_file)
                assert docids_line_count == src_line_count, "Doc ids file line count does not match"
                scores[direction]['doc-comet'] = {}
                for model in ["wmt22-cometkiwi-da"]:
                    if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
                        logging.info(f"Skipping evaluation using {model} since it is not available")
                        continue
                    if model in COMET_REF_MODELS:
                        command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} -ref {ref_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
                        logging.info(command)
                        os.system(command)
                        with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
                            score_json = json.load(score_file)
                        score = score_json['score']
                        scores[direction]['doc-comet'][model] = score
                    elif model in COMET_SRC_MODELS:
                        command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
                        logging.info(command)
                        os.system(command)
                        with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
                            score_json = json.load(score_file)
                        score = score_json['score']
                        scores[direction]['doc-comet'][model] = score

        with open(f"{hypotheses}/{args.save_name}.json", 'w') as score_file:
            score_file.write(json.dumps(scores, indent=2))

        # Flatten the per-direction scores into a table and save it alongside the JSON
        scores_csv = {
            'langs': args.directions,
        }
        for metric in args.metrics:
            if metric == 'comet':
                for model in args.comet_models:
                    scores_csv[f"{model}"] = []
                    for lang in args.directions:
                        scores_csv[f"{model}"].append(scores[lang][metric][model])
            elif metric == 'doc-comet':
                for model in ["wmt22-cometkiwi-da"]:
                    scores_csv[f"doc-{model}"] = []
                    for lang in args.directions:
                        scores_csv[f"doc-{model}"].append(scores[lang][metric][model])
            else:
                scores_csv[metric] = []
                for lang in args.directions:
                    scores_csv[metric].append(scores[lang][metric]["score"])

        df = pd.DataFrame(scores_csv)
        logging.info(f"Scores:\n{df}")
        with open(f"{hypotheses}/{args.save_name}.txt", 'w') as score_file:
            print(df, file=score_file)


if __name__ == "__main__":
    main()
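

# Illustrative invocation (a sketch, not part of the script's behavior): the script
# name "evaluate.py" and all directory paths below are hypothetical placeholders;
# only the flags are taken from the argparse definitions above. Note that --docids
# is needed for the doc-* metrics and --comet-models for "comet".
#
#   python evaluate.py \
#       --testset /path/to/testset \
#       --docids /path/to/docids \
#       --hypotheses /path/to/model_outputs \
#       --directions en-de de-en \
#       --metrics bleu chrf comet doc-bleu \
#       --comet-models wmt22-comet-da \
#       --gpus 1 \
#       --save-name scores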