glamprou committed
Commit 25f90bf · verified · 1 Parent(s): e90f0a4

Upload 2 files


Custom classes to use until merge with main

configuration_switch_transformers.py ADDED
@@ -0,0 +1,192 @@
+ # coding=utf-8
+ # Copyright 2022, Google and HuggingFace Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Switch Transformers model configuration"""
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ SWITCH_TRANSFORMERS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "google/switch-base-8": "https://huggingface.co/google/switch-base-8/blob/main/config.json",
+ }
+
+
+ class SwitchTransformersConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`SwitchTransformersModel`]. It is used to
+     instantiate a SwitchTransformers model according to the specified arguments, defining the model architecture.
+     Instantiating a configuration with the defaults will yield a similar configuration to that of the
+     SwitchTransformers [google/switch-base-8](https://huggingface.co/google/switch-base-8) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Arguments:
+         vocab_size (`int`, *optional*, defaults to 32128):
+             Vocabulary size of the SwitchTransformers model. Defines the number of different tokens that can be
+             represented by the `inputs_ids` passed when calling [`SwitchTransformersModel`].
+         d_model (`int`, *optional*, defaults to 768):
+             Size of the encoder layers and the pooler layer.
+         d_kv (`int`, *optional*, defaults to 64):
+             Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
+             num_heads`.
+         d_ff (`int`, *optional*, defaults to 2048):
+             Size of the intermediate feed forward layer in each `SwitchTransformersBlock`.
+         expert_capacity (`int`, *optional*, defaults to 64):
+             Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular
+             Transformer.
+         num_layers (`int`, *optional*, defaults to 12):
+             Number of dense hidden layers in the Transformer encoder.
+         num_sparse_encoder_layers (`int`, *optional*, defaults to 3):
+             Number of sparse (MoE) layers in the Transformer encoder.
+         num_decoder_layers (`int`, *optional*, defaults to 12):
+             Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
+         num_sparse_decoder_layers (`int`, *optional*, defaults to 3):
+             Number of sparse (MoE) layers in the Transformer decoder.
+         num_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_experts (`int`, *optional*, defaults to 8):
+             Number of experts for each SwitchTransformers layer.
+         router_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the router.
+         router_jitter_noise (`float`, *optional*, defaults to 0.01):
+             Amount of noise to add to the router.
+         router_dtype (`str`, *optional*, defaults to `"float32"`):
+             The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the
+             *selective precision* discussion in [the paper](https://arxiv.org/abs/2101.03961).
+         router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`):
+             Whether to ignore padding tokens when routing.
+         relative_attention_num_buckets (`int`, *optional*, defaults to 32):
+             The number of buckets to use for each attention layer.
+         relative_attention_max_distance (`int`, *optional*, defaults to 128):
+             The maximum distance of the longer sequences for the bucket separation.
+         dropout_rate (`float`, *optional*, defaults to 0.1):
+             The ratio for all dropout layers.
+         classifier_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the classifier.
+         layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         router_z_loss_coef (`float`, *optional*, defaults to 0.001):
+             The z loss factor for the total loss.
+         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+             The auxiliary loss factor for the total loss.
+         initializer_factor (`float`, *optional*, defaults to 1.0):
+             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+             testing).
+         dense_act_fn (`string`, *optional*, defaults to `"relu"`):
+             Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. SwitchTransformers
+             v1.1 uses the `"gated-gelu"` feed forward projection. The original SwitchTransformers uses `"relu"`.
+         add_router_probs (`bool`, *optional*, defaults to `False`):
+             Whether to output router probabilities to compute the router auxiliary loss.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+     """
+
+     model_type = "switch_transformers"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
+
+     def __init__(
+         self,
+         vocab_size=32128,
+         d_model=768,
+         d_kv=64,
+         d_ff=2048,
+         expert_capacity=64,
+         num_layers=12,
+         num_sparse_encoder_layers=3,
+         num_decoder_layers=12,
+         num_sparse_decoder_layers=3,
+         num_heads=12,
+         num_experts=8,
+         router_bias=False,
+         router_jitter_noise=0.01,
+         router_dtype="float32",
+         router_ignore_padding_tokens=False,
+         relative_attention_num_buckets=32,
+         relative_attention_max_distance=128,
+         dropout_rate=0.1,
+         classifier_dropout=0.0,
+         layer_norm_epsilon=1e-6,
+         router_z_loss_coef=0.001,
+         router_aux_loss_coef=0.001,
+         initializer_factor=1.0,
+         dense_act_fn="relu",
+         is_encoder_decoder=True,
+         add_router_probs=False,
+         use_cache=True,
+         pad_token_id=0,
+         eos_token_id=1,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.d_kv = d_kv
+         self.d_ff = d_ff
+
+         self.num_sparse_encoder_layers = num_sparse_encoder_layers
+
+         self.num_layers = num_layers
+         self.num_decoder_layers = (
+             num_decoder_layers if num_decoder_layers is not None else self.num_layers
+         )  # default = symmetry
+         self.num_sparse_decoder_layers = num_sparse_decoder_layers
+
+         # How often a sparse layer appears in the encoder: one sparse layer every `encoder_sparse_step` layers.
+         if self.num_sparse_encoder_layers > 0:
+             self.encoder_sparse_step = self.num_layers // self.num_sparse_encoder_layers
+         else:
+             self.encoder_sparse_step = self.num_layers  # HACK: this will create 0 sparse layers
+
+         # How often a sparse layer appears in the decoder: one sparse layer every `decoder_sparse_step` layers.
+         if self.num_sparse_decoder_layers > 0:
+             self.decoder_sparse_step = self.num_decoder_layers // self.num_sparse_decoder_layers
+         else:
+             self.decoder_sparse_step = self.num_decoder_layers  # HACK: this will create 0 sparse layers
+
+         self.num_heads = num_heads
+         self.num_experts = num_experts
+         self.expert_capacity = expert_capacity
+         self.router_bias = router_bias
+         self.router_jitter_noise = router_jitter_noise
+         if router_dtype not in ["float32", "float16", "bfloat16"]:
+             raise ValueError(f"`router_dtype` must be one of 'float32', 'float16' or 'bfloat16', got {router_dtype}")
+         self.router_dtype = router_dtype
+
+         self.router_ignore_padding_tokens = router_ignore_padding_tokens
+         self.relative_attention_num_buckets = relative_attention_num_buckets
+         self.relative_attention_max_distance = relative_attention_max_distance
+
+         self.dropout_rate = dropout_rate
+         if classifier_dropout is not None:
+             self.classifier_dropout = classifier_dropout
+         else:
+             self.classifier_dropout = 0.0
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_factor = initializer_factor
+         self.use_cache = use_cache
+         self.add_router_probs = add_router_probs
+
+         self.router_z_loss_coef = router_z_loss_coef
+         self.router_aux_loss_coef = router_aux_loss_coef
+         self.dense_act_fn = dense_act_fn
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             eos_token_id=eos_token_id,
+             is_encoder_decoder=is_encoder_decoder,
+             **kwargs,
+         )
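
For reference, a minimal sketch of how the configuration class above can be exercised; it assumes configuration_switch_transformers.py from this commit is on the Python path, and the printed values just follow the integer division in `__init__`:

# Minimal sketch: instantiate the config from the file added in this commit.
# Assumes configuration_switch_transformers.py is importable from the working directory.
from configuration_switch_transformers import SwitchTransformersConfig

config = SwitchTransformersConfig()  # google/switch-base-8-like defaults

# With the defaults, a sparse layer appears every 4th layer: 12 // 3 == 4.
print(config.num_layers, config.num_sparse_encoder_layers)  # 12 3
print(config.encoder_sparse_step)  # 4
print(config.decoder_sparse_step)  # 4

# Overriding architecture arguments works like any other PretrainedConfig.
small = SwitchTransformersConfig(num_layers=6, num_sparse_encoder_layers=0)
print(small.encoder_sparse_step)  # 6 -> the HACK branch, i.e. no sparse encoder layers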
modeling_switch_transformers.py ADDED
The diff for this file is too large to render. See raw diff
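
Since the commit message says these custom classes are meant to be used from this repo until they are merged into main, one plausible way to consume them is to download the file from the Hub and import it directly. The repo id below is a hypothetical placeholder, not taken from this page:

# Hedged sketch: fetch the custom configuration file from the Hub and import it locally.
# "<namespace>/<repo>" is a placeholder; substitute the actual repo id hosting these files.
import importlib.util
from huggingface_hub import hf_hub_download

path = hf_hub_download("<namespace>/<repo>", "configuration_switch_transformers.py")
spec = importlib.util.spec_from_file_location("configuration_switch_transformers", path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

config = module.SwitchTransformersConfig()
print(config.model_type)  # "switch_transformers"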