#!/usr/bin/env python
# -*- coding: utf-8 -*-
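"""Data preparation script for the fengshen translation example (prepare_dataset.py).

For each split (train/valid/test), pairs the lines of ``<split>.<src_lang>``
and ``<split>.<tgt_lang>`` and writes ``<split>.<src_lang>_<tgt_lang>`` in
JSON-lines format, one {"src": ..., "tgt": ...} object per line.

Example invocation (the path and language codes below are illustrative,
not taken from the repository):

    python prepare_dataset.py /path/to/data en-zh
"""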
import sys
import json
import os
def main(file_path, src_lang, tgt_lang):
    file_list = ["train", "valid", "test"]
    for filename in file_list:
        sys.stderr.write(f"**** Start processing {filename} ... ****\n")
        src_full_path = os.path.join(file_path, ".".join((filename, src_lang)))
        tgt_full_path = os.path.join(file_path, ".".join((filename, tgt_lang)))
        writer_full_path = os.path.join(file_path, ".".join((filename, src_lang + "_" + tgt_lang)))
        # Pair the source and target files line by line and write one
        # {"src": ..., "tgt": ...} JSON object per line (stops at the shorter file).
        with open(src_full_path, "r", encoding="utf-8") as src_reader, \
             open(tgt_full_path, "r", encoding="utf-8") as tgt_reader, \
             open(writer_full_path, "w", encoding="utf-8") as writer:
            for row_src, row_tgt in zip(src_reader, tgt_reader):
                combine_line = {"src": row_src.strip(), "tgt": row_tgt.strip()}
                json.dump(combine_line, writer, ensure_ascii=False)
                writer.write("\n")
        sys.stderr.write(f"**** Done converting {filename} ****\n")
if __name__ == "__main__":
    # argv[1]: data directory; argv[2]: language pair in the form "<src>-<tgt>"
    file_path = sys.argv[1]
    src_lang, tgt_lang = sys.argv[2].split("-")
    main(file_path, src_lang, tgt_lang)