import os
import argparse
import json
import logging

import pandas as pd

COMET_REF_MODELS = ["wmt20-comet-da", "wmt21-comet-mqm", "wmt22-comet-da"]
COMET_SRC_MODELS = ["wmt20-comet-qe-da", "wmt21-comet-qe-mqm", "wmt22-cometkiwi-da"]
DOC_SCORING_SCRIPT = "./doc_score.py"


def count_lines(fname):
    """Count the newline characters in a file without loading it into memory."""
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def read_last_line(fname):
    """Return the last line of a file by seeking backwards from its end."""
    with open(fname, 'rb') as f:
        try:
            # catch OSError in case of a one line file
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            f.seek(0)
        last_line = f.readline().decode()
    return last_line


def is_doc_boundary(doc_ids, idx):
    """Return True if the sentence at `idx` is the last sentence of its document."""
    after_idx = min(len(doc_ids) - 1, idx + 1)
    return (not doc_ids[after_idx] == doc_ids[idx]) or (idx == len(doc_ids) - 1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--testset', type=str, required=True,
                        help='Path to the test set directory containing references and sources for each language pair. '
                             'Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang} and '
                             '{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{src_lang}')
    parser.add_argument('--docids', type=str, required=False,
                        help='Path to the directory containing doc-ids corresponding to the test set for each language pair. '
                             'Must contain {src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.docids')
    parser.add_argument('--hypotheses', type=str, nargs='+', required=True,
                        help='Paths to the model output directories. Each must contain '
                             '{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{tgt_lang}')
    parser.add_argument('--directions', type=str, required=True, nargs='+',
                        help='Language directions to evaluate on, e.g. "en-de de-en"')
    parser.add_argument('--comet-models', type=str, required=False, nargs='+',
                        help='A list of COMET models to use for evaluation')
    parser.add_argument('--gpus', type=int, required=False, default=1,
                        help='Number of GPUs to use with COMET')
    parser.add_argument('--metrics', type=str, required=True, nargs='+',
                        help='A list of metrics to use for evaluation, options '
                             '["bleu", "comet", "doc-comet", "chrf", "doc-bleu", "doc-chrf"]')
    parser.add_argument('--save-name', type=str, required=False, default='scores',
                        help='Name of the output files/folders')
    parser.add_argument('--sliding-window', type=int, required=False, default=1,
                        help='Stride of the sliding window over a document')
    parser.add_argument('--context-length', type=int, required=False, default=4,
                        help='Number of sentences in a single context window')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    for hypotheses in args.hypotheses:
        scores = {}
        for direction in args.directions:
            src = direction.split('-')[0]
            tgt = direction.split('-')[1]
            logging.info(f"Evaluating {direction}")

            os.makedirs(f"{hypotheses}/{src}{tgt}/{args.save_name}", exist_ok=True)
            hyp_file = f"{hypotheses}/{src}{tgt}/test.{direction}.{tgt}"
            src_file = f"{args.testset}/{src}{tgt}/test.{direction}.{src}"
            ref_file = f"{args.testset}/{src}{tgt}/test.{direction}.{tgt}"

            hyp_line_count = count_lines(hyp_file)
            src_line_count = count_lines(src_file)
            ref_line_count = count_lines(ref_file)
            assert (ref_line_count == hyp_line_count) and (ref_line_count == src_line_count), \
                f"ref_file = {ref_line_count}, hyp_file = {hyp_line_count}, src_file = {src_line_count} " \
                f"- src/ref/hyp line counts must match"

            scores[direction] = {
                "references": ref_file,
                "hypotheses": hyp_file,
                "sources": src_file
            }

            # Sentence-level and document-level BLEU/chrF via sacrebleu
            for m in ["chrf", "bleu"]:
                tokenizer = "ja-mecab" if tgt == "ja" else "zh" if tgt == "zh" else "13a"

                if m in args.metrics:
                    command = f"sacrebleu -m {m} -tok {tokenizer} {ref_file} < {hyp_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores"
                    logging.info(command)
                    os.system(command)
                    with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{m}.scores", 'r') as score_file:
                        score = json.load(score_file)
                    # Logging detailed evaluation
                    logging.info(f"{direction} {m} scores: {json.dumps(score, indent=2)}")
                    scores[direction][m] = score

                if f"doc-{m}" in args.metrics:
                    assert args.docids, f'document ids directory must be provided to calculate doc-{m}'
                    docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
                    scores[direction]["docids"] = docids_file
                    docids_line_count = count_lines(docids_file)
                    assert docids_line_count == src_line_count, "Doc ids file line count does not match"

                    with open(src_file, 'r') as f_src, open(ref_file, 'r') as f_ref, \
                            open(hyp_file, 'r') as f_hyp, open(docids_file, 'r') as f_docids:
                        lines_src = [x.strip() for x in f_src.readlines()]
                        lines_ref = [x.strip() for x in f_ref.readlines()]
                        lines_hyp = [x.strip() for x in f_hyp.readlines()]
                        docid_lines = [x.strip() for x in f_docids.readlines()]
                    assert len(lines_src) == len(docid_lines), "Doc id file lines do not match"

                    # Reconstruct documents by grouping consecutive sentences that share a doc id
                    docs_src, docs_ref, docs_hyp = [], [], []
                    current_doc = []
                    i = 0
                    while i < len(lines_src):
                        current_doc.append({
                            'source': lines_src[i],
                            'reference': lines_ref[i],
                            'hypothesis': lines_hyp[i]
                        })
                        if is_doc_boundary(docid_lines, i):
                            docs_src.append([current_doc[j]['source'] for j in range(len(current_doc))])
                            docs_ref.append([current_doc[j]['reference'] for j in range(len(current_doc))])
                            docs_hyp.append([current_doc[j]['hypothesis'] for j in range(len(current_doc))])
                            current_doc = []
                        i += 1
                    assert len(docs_src) == len(docs_ref) and len(docs_src) == len(docs_hyp), "docs reconstruction failed"

                    # Write one concatenated line per document, then score with sacrebleu
                    tmp_dir = f"{hypotheses}/{src}{tgt}/{args.save_name}/tmp"
                    os.makedirs(tmp_dir, exist_ok=True)
                    with open(f"{tmp_dir}/test.{direction}.docsnt.src.{src}", 'w') as src_tmp_out, \
                            open(f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}", 'w') as ref_tmp_out, \
                            open(f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}", 'w') as hyp_tmp_out:
                        for s_doc, r_doc, h_doc in zip(docs_src, docs_ref, docs_hyp):
                            s = ' '.join([x.strip() for x in s_doc]).strip()
                            r = ' '.join([x.strip() for x in r_doc]).strip()
                            h = ' '.join([x.strip() for x in h_doc]).strip()
                            src_tmp_out.write(s + '\n')
                            ref_tmp_out.write(r + '\n')
                            hyp_tmp_out.write(h + '\n')

                    tmp_ref_path = f"{tmp_dir}/test.{direction}.docsnt.ref.{tgt}"
                    tmp_hyp_path = f"{tmp_dir}/test.{direction}.docsnt.hyp.{tgt}"
                    command = f"sacrebleu -m {m} -tok {tokenizer} {tmp_ref_path} < {tmp_hyp_path} > {hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores"
                    logging.info(command)
                    os.system(command)
                    with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/doc-{m}.scores", 'r') as score_file:
                        score = json.load(score_file)
                    logging.info(f"{direction} doc-{m} scores: {json.dumps(score, indent=2)}")
                    scores[direction][f'doc-{m}'] = score

            # Sentence-level COMET (reference-based and reference-free models)
            if "comet" in args.metrics:
                scores[direction]['comet'] = {}
                for model in args.comet_models:
                    if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
                        logging.info(f"Skipping evaluation using {model} since it is not available")
                        continue
                    if model in COMET_REF_MODELS:
                        command = f"comet-score -s {src_file} -t {hyp_file} -r {ref_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
                        logging.info(command)
                        os.system(command)
                        score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
                        score = float(score_line.split()[-1])
                        scores[direction]['comet'][model] = score
                    elif model in COMET_SRC_MODELS:
                        command = f"comet-score -s {src_file} -t {hyp_file} --gpus {args.gpus} --model {model} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores"
                        logging.info(command)
                        os.system(command)
                        score_line = read_last_line(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.scores")
                        score = float(score_line.split()[-1])
                        scores[direction]['comet'][model] = score
                logging.info(f"{direction} comet scores: {json.dumps(scores[direction]['comet'], indent=2)}")

            # Document-level COMET via the external doc scoring script
            if "doc-comet" in args.metrics:
                assert args.docids, 'document ids directory must be provided to calculate doc-comet'
                docids_file = f"{args.docids}/{src}{tgt}/test.{direction}.docids"
                scores[direction]["docids"] = docids_file
                docids_line_count = count_lines(docids_file)
                assert docids_line_count == src_line_count, "Doc ids file line count does not match"
                scores[direction]['doc-comet'] = {}
                for model in ["wmt22-cometkiwi-da"]:
                    if model not in COMET_REF_MODELS + COMET_SRC_MODELS:
                        logging.info(f"Skipping evaluation using {model} since it is not available")
                        continue
                    if model in COMET_REF_MODELS:
                        command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} -ref {ref_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
                        logging.info(command)
                        os.system(command)
                        with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
                            score_json = json.load(score_file)
                        score = score_json['score']
                        scores[direction]['doc-comet'][model] = score
                    elif model in COMET_SRC_MODELS:
                        command = f"python {DOC_SCORING_SCRIPT} -src {src_file} -hyp {hyp_file} --model {model} --sliding-window {args.sliding_window} --context-length {args.context_length} -doc {docids_file} > {hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores"
                        logging.info(command)
                        os.system(command)
                        with open(f"{hypotheses}/{src}{tgt}/{args.save_name}/{model}.doclevel.scores", 'r') as score_file:
                            score_json = json.load(score_file)
                        score = score_json['score']
                        scores[direction]['doc-comet'][model] = score

        with open(f"{hypotheses}/{args.save_name}.json", 'w') as score_file:
            score_file.write(json.dumps(scores, indent=2))

        # Flatten the per-direction scores into a table and save it alongside the JSON
        scores_csv = {
            'langs': args.directions,
        }
        for metric in args.metrics:
            if metric == 'comet':
                for model in args.comet_models:
                    scores_csv[f"{model}"] = []
                    for lang in args.directions:
                        scores_csv[f"{model}"].append(scores[lang][metric][model])
            elif metric == 'doc-comet':
                for model in ["wmt22-cometkiwi-da"]:
                    scores_csv[f"doc-{model}"] = []
                    for lang in args.directions:
                        scores_csv[f"doc-{model}"].append(scores[lang][metric][model])
            else:
                scores_csv[metric] = []
                for lang in args.directions:
                    scores_csv[metric].append(scores[lang][metric]["score"])

        df = pd.DataFrame(scores_csv)
        logging.info(f"Scores:\n{df}")
        with open(f"{hypotheses}/{args.save_name}.txt", 'w') as score_file:
            print(df, file=score_file)


if __name__ == "__main__":
    main()
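

# Illustrative invocation (a sketch, not part of the script's behavior): the script
# name "evaluate.py" and all directory paths below are hypothetical placeholders;
# only the flags are taken from the argparse definitions above. Note that --docids
# is needed for the doc-* metrics and --comet-models for "comet".
#
#   python evaluate.py \
#       --testset /path/to/testset \
#       --docids /path/to/docids \
#       --hypotheses /path/to/model_outputs \
#       --directions en-de de-en \
#       --metrics bleu chrf comet doc-bleu \
#       --comet-models wmt22-comet-da \
#       --gpus 1 \
#       --save-name scores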