File size: 1,278 Bytes
8ebda9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json
import os


def main(file_path, src_lang, tgt_lang):
    """Merge parallel source/target text files into JSON-lines files.

    For each split name in ``("train", "valid", "test")`` this reads
    ``<file_path>/<split>.<src_lang>`` and ``<file_path>/<split>.<tgt_lang>``
    in lockstep and writes ``<file_path>/<split>.<src_lang>_<tgt_lang>``,
    one JSON object per line pair: ``{"src": ..., "tgt": ...}``.
    Leading and trailing whitespace is stripped from every input line.

    Args:
        file_path: Directory holding the input files; outputs are written
            to the same directory.
        src_lang: File extension of the source-side files.
        tgt_lang: File extension of the target-side files.

    Raises:
        OSError: If an input file cannot be opened or an output file
            cannot be created.
    """
    for filename in ("train", "valid", "test"):
        sys.stderr.write("**** Start processing {} ... ****\n".format(filename))
        src_full_path = os.path.join(file_path, ".".join((filename, src_lang)))
        tgt_full_path = os.path.join(file_path, ".".join((filename, tgt_lang)))
        writer_full_path = os.path.join(
            file_path, ".".join((filename, src_lang + "_" + tgt_lang))
        )

        # Context managers guarantee all three handles are flushed and closed
        # even if a read/write fails midway. The encoding is pinned to UTF-8
        # because the JSON is written with ensure_ascii=False, so the output
        # must not depend on the platform's default locale encoding.
        with open(src_full_path, "r", encoding="utf-8") as src_reader, \
                open(tgt_full_path, "r", encoding="utf-8") as tgt_reader, \
                open(writer_full_path, "w", encoding="utf-8") as writer:
            # NOTE: zip() stops at the shorter file, so any surplus lines in
            # the longer one are not written.
            for row_src, row_tgt in zip(src_reader, tgt_reader):
                combine_line = {"src": row_src.strip(), "tgt": row_tgt.strip()}
                json.dump(combine_line, writer, ensure_ascii=False)
                writer.write("\n")
        sys.stderr.write(f"**** Done change {filename} format **** \n")


if __name__ == "__main__":
    # Expected invocation: <script> <data_dir> <src_lang>-<tgt_lang>
    # Extra trailing arguments are ignored.
    usage = "Usage: {} <data_dir> <src_lang>-<tgt_lang>\n".format(sys.argv[0])
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.stderr.write(usage)
        sys.exit(1)

    file_path = sys.argv[1]
    lang_pair = sys.argv[2].split("-")
    if len(lang_pair) != 2:
        # The pair must contain exactly one "-" so it unpacks into two codes.
        sys.stderr.write(usage)
        sys.exit(1)
    src_lang, tgt_lang = lang_pair

    main(file_path, src_lang, tgt_lang)