|
|
|
|
|
|
|
import sys |
|
import json |
|
import os |
|
|
|
|
|
def main(file_path, src_lang, tgt_lang):
    """Merge parallel text files into JSON-lines files, one per data split.

    For each split in ``train``, ``valid`` and ``test`` this reads
    ``<file_path>/<split>.<src_lang>`` and ``<file_path>/<split>.<tgt_lang>``
    line by line in lockstep and writes
    ``<file_path>/<split>.<src_lang>_<tgt_lang>``. Every output line is a
    JSON object ``{"src": ..., "tgt": ...}`` holding the whitespace-stripped
    source and target lines. Progress messages are written to stderr.

    Args:
        file_path: Directory containing the input files; the output files
            are written to the same directory.
        src_lang: Extension of the source-side files (e.g. ``"en"``).
        tgt_lang: Extension of the target-side files (e.g. ``"de"``).

    Raises:
        OSError: If an input file cannot be opened or an output file cannot
            be written.
    """
    # Local import so this function does not depend on the file's import block.
    from itertools import zip_longest

    file_list = ["train", "valid", "test"]

    for filename in file_list:
        sys.stderr.write("**** Start processing {} ... ****\n".format(filename))

        src_full_path = os.path.join(file_path, ".".join((filename, src_lang)))
        tgt_full_path = os.path.join(file_path, ".".join((filename, tgt_lang)))
        writer_full_path = os.path.join(
            file_path, ".".join((filename, src_lang + "_" + tgt_lang))
        )

        # Context managers guarantee all three handles are flushed and closed,
        # even if an error occurs mid-file. The encoding is pinned to UTF-8
        # because ensure_ascii=False emits raw non-ASCII characters, which
        # would fail or be garbled under a non-UTF-8 locale default.
        with open(src_full_path, "r", encoding="utf-8") as src_reader, \
                open(tgt_full_path, "r", encoding="utf-8") as tgt_reader, \
                open(writer_full_path, "w", encoding="utf-8") as writer:
            for row_src, row_tgt in zip_longest(src_reader, tgt_reader):
                if row_src is None or row_tgt is None:
                    # One side ran out first: keep only the common prefix
                    # (same output as a plain zip) but make the misalignment
                    # visible instead of truncating silently.
                    sys.stderr.write(
                        "**** Warning: {} and {} differ in line count; "
                        "extra lines ignored ****\n".format(
                            src_full_path, tgt_full_path
                        )
                    )
                    break

                combine_line = {
                    "src": row_src.strip(),
                    "tgt": row_tgt.strip(),
                }
                json.dump(combine_line, writer, ensure_ascii=False)
                writer.write('\n')

        sys.stderr.write(f"**** Done change {filename} format **** \n")
|
|
|
|
|
if __name__ == "__main__":
    # Expected invocation: <script> <data_dir> <src_lang>-<tgt_lang>
    # The second argument is a single dash-separated language pair.
    usage = "usage: {} <data_dir> <src_lang>-<tgt_lang>".format(sys.argv[0])

    # Fail with a readable message instead of an IndexError traceback when
    # arguments are missing.
    if len(sys.argv) < 3:
        sys.exit(usage)

    file_path = sys.argv[1]

    # The pair must split into exactly two parts; anything else previously
    # surfaced as an opaque unpacking ValueError.
    lang_pair = sys.argv[2].split("-")
    if len(lang_pair) != 2:
        sys.exit(usage)
    src_lang, tgt_lang = lang_pair

    main(file_path, src_lang, tgt_lang)
|
|