aarontseng committed on
Commit
4519f43
·
verified ·
1 Parent(s): a418fea

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +88 -3
README.md CHANGED
@@ -1,3 +1,88 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - zh
5
+ - en
6
+ pipeline_tag: translation
7
+ tags:
8
+ - text2text-generation
9
+ ---
10
+
11
+ # Zero-mt
12
+
13
+ [https://github.com/zape-aat/zero-mt](https://github.com/zape-aat/zero-mt)
14
+
15
+ ## Metrics
16
+
17
+ |Testset|BLEU|chrF++|COMET-22|
18
+ |:-------------:|:---------------:|:---------:|:---------:|
19
+ |flores200-dev|41.37|65.13|0.867|
20
+ |flores200-devtest|63.06|53.57|0.868|
21
+ |newstest2019|14.96|36.16|0.843|
22
+ |wmt-22|?|?|0.775|
23
+ |wmt-23|22.65|41.22|0.777|
24
+
25
+ ## How to use
26
+
27
+ ```
28
+ git lfs install
29
+ git clone https://huggingface.co/aarontseng/zero-mt-zh_hant-en
30
+ ```
31
+
32
+ ```
33
+ pip install ctranslate2
34
+ pip install sentencepiece
35
+ ```
36
+ ## Basic Usage
37
+
38
+ ```
39
+ import ctranslate2
40
+ import sentencepiece
41
+
42
+ src_model = sentencepiece.SentencePieceProcessor()
43
+ src_model.load("zero-mt-zh_hant-en/source.model")
44
+ tgt_model = sentencepiece.SentencePieceProcessor()
45
+ tgt_model.load("zero-mt-zh_hant-en/target.model")
46
+
47
+ translator = ctranslate2.Translator("zero-mt-zh_hant-en", device="cuda") # "cpu" or "cuda"
48
+
49
+ encoded_line = src_model.encode_as_pieces("在世界上的許多地方,揮手都是一種表示「你好」的友善手勢。")
50
+
51
+ results = translator.translate_batch([encoded_line], batch_type="tokens", max_batch_size=1024)
52
+
53
+ decoded_line = tgt_model.decode(results[0].hypotheses[0])
54
+
55
+ print(decoded_line) # In many places around the world, waving is a friendly gesture of "hello".
56
+ ```
57
+
58
+ ## Batch translation
59
+ ```
60
+ import ctranslate2
61
+ import sentencepiece
62
+
63
+ src_path = "dev.cmn_Hant"
64
+ tgt_path = "translated.txt"
65
+
66
+ src_model = sentencepiece.SentencePieceProcessor()
67
+ src_model.load("zero-mt-zh_hant-en/source.model")
68
+ tgt_model = sentencepiece.SentencePieceProcessor()
69
+ tgt_model.load("zero-mt-zh_hant-en/target.model")
70
+
71
+ translator = ctranslate2.Translator("zero-mt-zh_hant-en", device="cuda") # "cpu" or "cuda"
72
+
73
+ src_file = open(src_path, 'r', encoding="utf-8")
74
+ src_lines = src_file.readlines()
75
+
76
+ encoded_lines = src_model.encode_as_pieces(src_lines)
77
+
78
+ results = translator.translate_batch(encoded_lines, batch_type="tokens", max_batch_size=1024)
79
+ translations = [translation.hypotheses[0] for translation in results]
80
+
81
+ decoded_lines = tgt_model.decode(translations)
82
+
83
+ tgt_file = open(tgt_path, "w", encoding="utf-8", newline='')
84
+
85
+ for line in decoded_lines:
86
+     tgt_file.write(line)
87
+     tgt_file.write('\n')
88
+ ```