import os
import argparse
import json
import logging

import pandas as pd

COMET_REF_MODELS = ["wmt20-comet-da", "wmt21-comet-mqm", "wmt22-comet-da"]
COMET_SRC_MODELS = ["wmt20-comet-qe-da", "wmt21-comet-qe-mqm", "wmt22-cometkiwi-da"]
DOC_SCORING_SCRIPT = "./doc_score.py"
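
# Evaluates MT hypotheses against references/sources with sacrebleu (BLEU, chrF)
# and COMET, at sentence level and at document level (documents are rebuilt from
# a doc-ids file). Per-direction scores are logged and aggregated into a JSON
# file and a plain-text table per hypothesis directory.
#
# Example invocation (illustrative script name and paths, not from a fixed setup):
#   python evaluate.py \
#       --testset data/testsets --docids data/docids \
#       --hypotheses outputs/model_a outputs/model_b \
#       --directions en-de de-en \
#       --metrics bleu chrf comet doc-bleu doc-chrf doc-comet \
#       --comet-models wmt22-comet-da wmt22-cometkiwi-da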
def count_lines(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count
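
# Reads only the tail of the file to return its last line; used below to grab
# the final line of comet-score output, whose last whitespace-separated token
# is parsed as the system-level score.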
def read_last_line(fname):
    with open(fname, 'rb') as f:
        try:  # catch OSError in case of a one line file
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            f.seek(0)
        last_line = f.readline().decode()
    return last_line
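
# True when the sentence at idx is the last one of its document, i.e. the next
# entry in the doc-ids list differs from the current one, or idx is the final
# line. E.g. doc_ids = ["d1", "d1", "d2"] -> boundaries at idx 1 and idx 2.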
def is_doc_boundary(doc_ids, idx):
    after_idx = min(len(doc_ids) - 1, idx + 1)
    return (doc_ids[after_idx] != doc_ids[idx]) or (idx == len(doc_ids) - 1)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--testset', type=str, required=True, help='Path to the test set directory containing references and sources for each language pair. Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang} and {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{src_lang}')
    parser.add_argument('--docids', type=str, required=False, help='Path to the directory containing doc-ids corresponding to the test set for each language pair. Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.docids')
    parser.add_argument('--hypotheses', type=str, nargs='+', required=True, help='One or more paths to model output directories. Each must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang}')
    parser.add_argument('--directions', type=str, required=True, nargs='+', help='Language directions to evaluate on, e.g. "en-de de-en"')
    parser.add_argument('--comet-models', type=str, required=False, nargs='+', help='A list of COMET models to use for evaluation')
    parser.add_argument('--gpus', type=int, required=False, default=1, help='Number of GPUs to use with COMET')
    parser.add_argument('--metrics', type=str, required=True, nargs='+', help='A list of metrics to use for evaluation; options: ["bleu", "comet", "doc-comet", "chrf", "doc-bleu", "doc-chrf"]')
    parser.add_argument('--save-name', type=str, required=False, default='scores', help='Name of the output files/folders')
    parser.add_argument('--sliding-window', type=int, required=False, default=1, help='Stride of the sliding window over each document (doc-comet only)')
    parser.add_argument('--context-length', type=int, required=False, default=4, help='Number of sentences in a single context window (doc-comet only)')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
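
    # Score every hypothesis directory on every requested language direction.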
    for hypotheses in args.hypotheses:
        scores = {}
        for direction in args.directions:
            src = direction.split('-')[0]
            tgt = direction.split('-')[1]
            logging.info(f"Evaluating {direction}")
            os.makedirs(f"{hypotheses}/{src}{tgt}/{args.save_name}", exist_ok=True)
            hyp_file = f"{hypotheses}/{src}{tgt}/test.{direction}.{tgt}"
            src_file = f"{args.testset}/{src}{tgt}/test.{direction}.{src}"
            ref_file = f"{args.testset}/{src}{tgt}/test.{direction}.{tgt}"
            hyp_line_count = count_lines(hyp_file)
            src_line_count = count_lines(src_file)
            ref_line_count = count_lines(ref_file)
            assert (ref_line_count == hyp_line_count) and (ref_line_count == src_line_count), \
                f"ref_file = {ref_line_count}, hyp_file = {hyp_line_count}, src_file = {src_line_count} - source/reference/hypothesis line counts must match"
            scores[direction] = {
                "references": ref_file,
                "hypotheses": hyp_file,
                "sources": src_file
            }
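            # Sentence-level BLEU/chrF via the sacrebleu CLI; its JSON output is
            # written to {save_name}/{metric}.scores and read back in.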
for m in ["chrf", "bleu"]:
tokenizer = "ja-mecab" if tgt == "ja" else "zh" if tgt == "zh" else "13a"
if m in args.metrics:
command = f"sacrebleu -m {m} -tok {tokenizer} {ref_file} < {hyp_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores", 'r') as score_file:
score = json.load(score_file)
# Logging detailed evaluation
logging.info(f"{direction} {m} scores: {json.dumps(score, indent=2)}")
scores[direction][m] = score
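                # Document-level BLEU/chrF: sentences sharing a doc id are joined
                # into a single line per document, then scored with the same
                # sacrebleu call as above.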
if f"doc-{m}" in args.metrics:
assert args.docids, f'document ids directory must be probided to calculate doc-{m}'
docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
scores[direction]["docids"] = docids_file
docids_line_count = count_lines(docids_file)
assert docids_line_count == src_line_count, "Doc Ids file line count is not matching"
with open(src_file, 'r') as f_src, open(ref_file, 'r') as f_ref, open(hyp_file, 'r') as f_hyp, open(docids_file, 'r') as f_docids:
lines_src = [x.strip() for x in f_src.readlines()]
lines_ref = [x.strip() for x in f_ref.readlines()]
lines_hyp = [x.strip() for x in f_hyp.readlines()]
docid_lines = [x.strip() for x in f_docids.readlines()]
assert len(lines_src) == len(docid_lines), "Doc id file lines are not matching"
docs_src, docs_ref, docs_hyp = [], [], []
current_doc = []
i = 0
while i < len(lines_src):
current_doc.append({
'source': lines_src[i],
'reference': lines_ref[i],
'hypothesis': lines_hyp[i]
})
if is_doc_boundary(docid_lines, i):
docs_src.append([current_doc[j]['source'] for j in range(len(current_doc))])
docs_ref.append([current_doc[j]['reference'] for j in range(len(current_doc))])
docs_hyp.append([current_doc[j]['hypothesis'] for j in range(len(current_doc))])
current_doc = []
i += 1
assert len(docs_src) == len(docs_ref) and len(docs_src) == len(docs_hyp), "docs reconstruction failed"
tmp_dir = f"{hypotheses}/{src}{tgt}/{args.save_name}/tmp"
os.makedirs(tmp_dir, exist_ok=True)
with open(f"{tmp_dir}/test.{direction}.docsnt.src.{src}", 'w') as src_tmp_out, open(f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}", 'w') as ref_tmp_out, open(f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}", 'w') as hyp_tmp_out:
for s_doc, r_doc, h_doc in zip(docs_src, docs_ref, docs_hyp):
s = ' '.join([x.strip() for x in s_doc]).strip()
r = ' '.join([x.strip() for x in r_doc]).strip()
h = ' '.join([x.strip() for x in h_doc]).strip()
src_tmp_out.write(s + '\n')
ref_tmp_out.write(r + '\n')
hyp_tmp_out.write(h + '\n')
tmp_ref_path = f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}"
tmp_hyp_path = f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}"
command = f"sacrebleu -m {m} -tok {tokenizer} {tmp_ref_path} < {tmp_hyp_path} > {hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores", 'r') as score_file:
score = json.load(score_file)
logging.info(f"{direction} doc-{m} scores: {json.dumps(score, indent=2)}")
scores[direction][f'doc-{m}'] = score
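            # Sentence-level COMET: reference-based models get -r {ref_file},
            # while QE models score hypotheses from the source alone.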
if "comet" in args.metrics:
scores[direction]['comet'] = {}
for model in args.comet_models:
if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
logging.info(f"Skipping evaluation using {model} since it is not available")
continue
if model in COMET_REF_MODELS:
command = f"comet-score -s {src_file} -t {hyp_file} -r {ref_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
logging.info(command)
os.system(command)
score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
score = float(score_line.split()[-1])
scores[direction]['comet'][model] = score
elif model in COMET_SRC_MODELS:
command = f"comet-score -s {src_file} -t {hyp_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
logging.info(command)
os.system(command)
score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
score = float(score_line.split()[-1])
scores[direction]['comet'][model] = score
logging.info(f"{direction} comet scores: {json.dumps(scores[direction]['comet'], indent=2)}")
if "doc-comet" in args.metrics:
assert args.docids, 'document ids directory must be probided to calculate doc-comet'
docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
scores[direction]["docids"] = docids_file
docids_line_count = count_lines(docids_file)
assert docids_line_count == src_line_count, "Doc Ids file line count is not matching"
scores[direction]['doc-comet'] = {}
for model in ["wmt22-cometkiwi-da"]:
if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
logging.info(f"Skipping evaluation using {model} since it is not available")
continue
if model in COMET_REF_MODELS:
command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} -ref {ref_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
score_json = json.load(score_file)
score = score_json['score']
scores[direction]['doc-comet'][model] = score
elif model in COMET_SRC_MODELS:
command =f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
logging.info(command)
os.system(command)
with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
score_json = json.load(score_file)
score = score_json['score']
scores[direction]['doc-comet'][model] = score
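
        # Persist the full per-direction results as JSON, then flatten them into
        # a one-row-per-direction table.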
with open(f"{hypotheses}/{args.save_name}.json", 'w') as score_file:
score_file.write(json.dumps(scores, indent=2))
scores_csv = {
'langs': args.directions,
}
for metric in args.metrics:
if metric == 'comet':
for model in args.comet_models:
scores_csv[f"{model}"] = []
for lang in args.directions:
scores_csv[f"{model}"].append(scores[lang][metric][model])
elif metric == 'doc-comet':
for model in ["wmt22-cometkiwi-da"]:
scores_csv[f"doc-{model}"] = []
for lang in args.directions:
scores_csv[f"doc-{model}"].append(scores[lang][metric][model])
else:
scores_csv[metric] = []
for lang in args.directions:
scores_csv[f"{metric}"].append(scores[lang][metric]["score"])
df = pd.DataFrame(scores_csv)
logging.info(f"Scores:\n{df}")
with open(f"{hypotheses}/{args.save_name}.txt", 'w') as score_file:
print(df, file=score_file)


if __name__ == "__main__":
    main()