Safetensors · mpt · custom_code

krutrim-admin committed (verified) · Commit 251e248 · 1 parent: d0dd892

Push model files
attention.py ADDED
@@ -0,0 +1,387 @@
+"""Attention layers."""
+import math
+import warnings
+from typing import Any, Optional
+import torch
+import torch.nn as nn
+import transformers
+from einops import rearrange
+from packaging import version
+from torch import nn
+from .fc import FC_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+
+def is_flash_v2_installed(v2_version: str='2.0.0'):
+    assert version.parse(v2_version) >= version.parse('2.0.0')
+    try:
+        import flash_attn as flash_attn
+    except:
+        return False
+    return version.parse(flash_attn.__version__) >= version.parse(v2_version)
+
+def is_flash_v1_installed():
+    try:
+        import flash_attn as flash_attn
+    except:
+        return False
+    return version.parse(flash_attn.__version__) < version.parse('2.0.0')
+
+def is_transformers_version_gte(hf_version: str) -> bool:
+    return version.parse(transformers.__version__) >= version.parse(hf_version)
+
+def check_alibi_support(attention_impl: str) -> bool:
+    return attention_impl != 'flash' or is_flash_v2_installed(v2_version='v2.4.2')
+if is_flash_v1_installed():
+    import transformers
+    transformers.utils.is_flash_attn_available = lambda: False
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
+def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool:
+    if original_is_causal and num_query_tokens != num_key_tokens:
+        if num_query_tokens != 1:
+            raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
+        else:
+            return False
+    return original_is_causal
+
+def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Perform repeat of kv heads along a particular dimension.
+
+    hidden.shape expected to be: (batch size, seq len, kv_n_heads, head_dim)
+    n_rep: amount of repetitions of kv_n_heads
+    Unlike torch.repeat_interleave, this function avoids allocating new memory.
+    """
+    if n_rep == 1:
+        return hidden
+    b, s, kv_n_heads, d = hidden.shape
+    hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
+    return hidden.reshape(b, s, kv_n_heads * n_rep, d)
+
+def scaled_multihead_dot_product_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
+    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            k = torch.cat([past_key_value[0], k], dim=3)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        past_key_value = (k, v)
+    b, _, s_q, d = q.shape
+    s_k = k.size(-1)
+    if kv_n_heads > 1 and kv_n_heads < n_heads:
+        k = repeat_kv_for_gqa(k.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
+        v = repeat_kv_for_gqa(v.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(d)
+    attn_weight = q.matmul(k) * softmax_scale
+    if attn_bias is not None:
+        _s_q = max(0, attn_bias.size(2) - s_q)
+        _s_k = max(0, attn_bias.size(3) - s_k)
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+        if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
+            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
+        attn_weight = attn_weight + attn_bias
+    min_val = torch.finfo(q.dtype).min
+    if key_padding_mask is not None:
+        if attn_bias is not None:
+            warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
+    if is_causal and (not q.size(2) == 1):
+        s = max(s_q, s_k)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
+        causal_mask = causal_mask.tril()
+        causal_mask = causal_mask.to(torch.bool)
+        causal_mask = ~causal_mask
+        causal_mask = causal_mask[-s_q:, -s_k:]
+        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
+    out = attn_weight.to(v.dtype).matmul(v)
+    out = rearrange(out, 'b h s d -> b s (h d)')
+    if needs_weights:
+        return (out, attn_weight, past_key_value)
+    return (out, None, past_key_value)
+
+def check_valid_inputs(*tensors: torch.Tensor, valid_dtypes: Optional[list[torch.dtype]]=None):
+    if valid_dtypes is None:
+        valid_dtypes = [torch.float16, torch.bfloat16]
+    for tensor in tensors:
+        if tensor.dtype not in valid_dtypes:
+            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
+        if not tensor.is_cuda:
+            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
+
+def flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False, multiquery: bool=False, should_repeat_kv_for_gqa: Optional[bool]=True, sliding_window_size: int=-1, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    if key_padding_mask is not None:
+        raise ValueError('key_padding_mask should be None for flash attn.')
+    del key_padding_mask
+    if flash_attn_padding_info is None:
+        raise ValueError('flash_attn_padding_info is required for flash attn.')
+    try:
+        from flash_attn import bert_padding, flash_attn_interface
+    except:
+        raise RuntimeError('Please install flash-attn==1.0.9 or flash-attn==2.3.6')
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        raise NotImplementedError(f'attn_bias not implemented for flash attn.')
+    batch_size, seqlen = query.shape[:2]
+    indices_q = flash_attn_padding_info['indices_q']
+    indices_k = flash_attn_padding_info['indices_k']
+    indices_v = flash_attn_padding_info['indices_v']
+    cu_seqlens_q = flash_attn_padding_info['cu_seqlens_q']
+    cu_seqlens_k = flash_attn_padding_info['cu_seqlens_k']
+    max_seqlen_q = flash_attn_padding_info['max_seqlen_q']
+    max_seqlen_k = flash_attn_padding_info['max_seqlen_k']
+    query_unpad = bert_padding.index_first_axis(rearrange(query, 'b s ... -> (b s) ...'), indices_q)
+    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    key_unpad = bert_padding.index_first_axis(rearrange(key, 'b s ... -> (b s) ...'), indices_k)
+    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
+    value_unpad = bert_padding.index_first_axis(rearrange(value, 'b s ... -> (b s) ...'), indices_v)
+    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
+    if kv_n_heads < n_heads and (not is_flash_v2_installed()) and (not should_repeat_kv_for_gqa):
+        raise ValueError('For Grouped Query Attention or Multi Query Attention, should_repeat_kv_for_gqa should be set to True if not using Flash Attention v2.')
+    if should_repeat_kv_for_gqa:
+        if kv_n_heads == 1:
+            key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
+            value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
+        elif kv_n_heads < n_heads:
+            key_unpad = repeat_kv_for_gqa(key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1)
+            value_unpad = repeat_kv_for_gqa(value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1)
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    if is_flash_v1_installed():
+        output_unpad = flash_attn_interface.flash_attn_unpadded_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
+    elif is_flash_v2_installed():
+        alibi_kwargs = {}
+        if check_alibi_support('flash'):
+            alibi_kwargs = {'alibi_slopes': alibi_slopes}
+        elif alibi_slopes is not None:
+            raise ValueError('alibi_slopes is only supported for flash-attn>=2.4.2')
+        output_unpad = flash_attn_interface.flash_attn_varlen_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), **alibi_kwargs)
+    else:
+        raise RuntimeError('flash-attn==1.0.9 or flash-attn==2.4.2 is required.')
+    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
+    return (output, None, past_key_value)
+
+def triton_flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    try:
+        from .flash_attn_triton import flash_attn_func
+    except:
+        _installed = False
+        if version.parse(torch.__version__) < version.parse('2.0.0'):
+            _installed = True
+            try:
+                from flash_attn.flash_attn_triton import flash_attn_func
+            except:
+                _installed = False
+        if not _installed:
+            raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU ' + 'and `pip install .[gpu]` if installing from llm-foundry source or ' + '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` ' + 'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). ' + 'Note: (1) requires you have CMake and PyTorch already installed.')
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+    if dropout_p:
+        raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
+    dropout_p = dropout_p if training else 0.0
+    if needs_weights:
+        raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
+    if key_padding_mask is not None:
+        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+        b_size, s_k = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
+    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
+    key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
+    value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
+    if kv_n_heads == 1:
+        key = key.repeat(1, 1, n_heads, 1)
+        value = value.repeat(1, 1, n_heads, 1)
+    elif kv_n_heads < n_heads:
+        key = repeat_kv_for_gqa(key, n_heads // kv_n_heads)
+        value = repeat_kv_for_gqa(value, n_heads // kv_n_heads)
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return (output, None, past_key_value)
+
+class GroupedQueryAttention(nn.Module):
+    """Grouped Query Attention (GQA) is a generalization of Multi-head (MHA).
+
+    and Multi-query attention (MQA).
+
+    This allows the user to set a variable of number of kv_n_heads, rather than
+    just n_heads or 1, as in MHA and MQA. Using torch or triton attention
+    implementation enables user to also use additive bias.
+    """
+
+    def __init__(self, d_model: int, n_heads: int, kv_n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.qk_gn = qk_gn
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.kv_n_heads = kv_n_heads
+        self.sliding_window_size = sliding_window_size
+        self.head_dim = d_model // n_heads
+        if self.kv_n_heads <= 0:
+            raise ValueError('kv_n_heads should be greater than zero.')
+        if self.kv_n_heads > self.n_heads:
+            raise ValueError('The number of KV heads should be less than or equal to Q heads.')
+        if self.n_heads % self.kv_n_heads != 0:
+            raise ValueError('Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.')
+        if qk_ln and qk_gn:
+            raise ValueError('Only one of qk_ln and qk_gn can be set to True.')
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.attn_dropout_p = attn_pdrop
+        fc_kwargs: dict[str, Any] = {'bias': bias}
+        fc_kwargs['device'] = device
+        self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
+        fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
+        self.Wqkv._fused = (0, fuse_splits)
+        if self.qk_ln or self.qk_gn:
+            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+            norm_size = self.head_dim if qk_gn else d_model
+            self.q_ln = norm_class(norm_size, device=device)
+            if qk_ln:
+                norm_size = self.head_dim * kv_n_heads
+            self.k_ln = norm_class(norm_size, device=device)
+        if self.attn_impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+        elif self.attn_impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+        else:
+            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+        self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
+        self.out_proj._is_residual = True
+
+    def forward(self, x: torch.Tensor, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[dict]=None, is_causal: bool=True, needs_weights: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
+        key_padding_mask = attention_mask
+        if self.qk_ln or self.qk_gn:
+            q_shape, k_shape = (query.shape, key.shape)
+            if self.qk_gn:
+                b, s = query.shape[:2]
+                query = query.view(b, s, self.n_heads, -1)
+                key = key.view(b, s, self.kv_n_heads, -1)
+            dtype = query.dtype
+            query = self.q_ln(query).to(dtype).view(q_shape)
+            key = self.k_ln(key).to(dtype).view(k_shape)
+        if rotary_emb_w_meta_info is not None:
+            rotary_emb = rotary_emb_w_meta_info['rotary_emb']
+            seq_len = rotary_emb_w_meta_info['seq_len']
+            offset_info = rotary_emb_w_meta_info['offset_info']
+            bsz, seqlen = query.shape[:2]
+            query = query.view(bsz, seqlen, -1, self.head_dim)
+            key = key.view(bsz, seqlen, -1, self.head_dim)
+            if rotary_emb_w_meta_info['impl'] == 'dail':
+                value = value.view(bsz, seqlen, -1, self.head_dim)
+                kv = torch.stack([key, value], dim=2)
+                query, kv = rotary_emb(query, kv, seqlen_offset=offset_info, max_seqlen=seq_len)
+                [key, value] = torch.unbind(kv, dim=2)
+                value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+            elif rotary_emb_w_meta_info['impl'] == 'hf':
+                cos, sin = rotary_emb(value, seq_len)
+                if is_transformers_version_gte('4.36'):
+                    query, key = apply_rotary_pos_emb(query, key, cos, sin, offset_info, unsqueeze_dim=2)
+                else:
+                    query = query.transpose(1, 2)
+                    key = key.transpose(1, 2)
+                    query, key = apply_rotary_pos_emb(query, key, cos, sin, offset_info)
+                    query = query.transpose(1, 2)
+                    key = key.transpose(1, 2)
+            query = query.view(bsz, seqlen, self.d_model)
+            key = key.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+        extra_attn_kwargs = {}
+        if self.attn_impl == 'flash':
+            key_padding_mask = None
+            extra_attn_kwargs = {'should_repeat_kv_for_gqa': not is_flash_v2_installed(), 'sliding_window_size': self.sliding_window_size, 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info}
+        context, attn_weights, past_key_value = self.attn_fn(query, key, value, self.n_heads, self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, **extra_attn_kwargs)
+        return (self.out_proj(context), attn_weights, past_key_value)
+
+class MultiheadAttention(GroupedQueryAttention):
+    """Multi-head self attention.
+
+    Using torch or triton attention implementation enables user to also use
+    additive bias.
+    """
+
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=n_heads, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
+
+class MultiQueryAttention(GroupedQueryAttention):
+    """Multi-Query self attention.
+
+    Using torch or triton attention implementation enables user to also use
+    additive bias.
+    """
+
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=1, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
+
+def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, prefix_lm: bool, causal: bool, use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            if (prefix_lm or not causal) or use_sequence_id:
+                return (1, n_heads, seq_len, seq_len)
+            return (1, n_heads, 1, seq_len)
+        elif prefix_lm or use_sequence_id:
+            return (1, 1, seq_len, seq_len)
+        return None
+    else:
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+
+def build_attn_bias(attn_impl: str, attn_bias: torch.Tensor, n_heads: int, seq_len: int, causal: bool=False, alibi: bool=False, alibi_bias_max: int=8) -> Optional[torch.Tensor]:
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            device, dtype = (attn_bias.device, attn_bias.dtype)
+            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
+        return attn_bias
+    else:
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+
+def gen_slopes(n_heads: int, alibi_bias_max: int=8, device: Optional[torch.device]=None, return_1d: bool=False) -> torch.Tensor:
+    _n_heads = 2 ** math.ceil(math.log2(n_heads))
+    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
+    m = m.mul(alibi_bias_max / _n_heads)
+    slopes = 1.0 / torch.pow(2, m)
+    if _n_heads != n_heads:
+        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
+    if return_1d:
+        return slopes
+    return slopes.view(1, n_heads, 1, 1)
+
+def build_alibi_bias(n_heads: int, seq_len: int, full: bool=False, alibi_bias_max: int=8, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None) -> torch.Tensor:
+    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
+    if full:
+        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
+        alibi_bias = alibi_bias.abs().mul(-1)
+    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
+    alibi_bias = alibi_bias * slopes
+    return alibi_bias.to(dtype=dtype)
+ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention, 'grouped_query_attention': GroupedQueryAttention}
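
The ALiBi helpers at the end of this file supply the model's positional information (the config below enables alibi with n_heads=48 and alibi_bias_max=8). A minimal standalone sketch of what gen_slopes and build_alibi_bias compute for that setting, using only PyTorch and not part of the commit itself:

import math
import torch

n_heads, alibi_bias_max, seq_len = 48, 8, 4
# gen_slopes: per-head slope magnitudes, derived from the next power of two (64)
_n_heads = 2 ** math.ceil(math.log2(n_heads))
m = torch.arange(1, _n_heads + 1, dtype=torch.float32) * (alibi_bias_max / _n_heads)
slopes = 1.0 / torch.pow(2, m)
slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]  # interleave since 48 != 64
# build_alibi_bias (full=False): one row of non-positive relative positions per head
bias = torch.arange(1 - seq_len, 1, dtype=torch.float32).view(1, 1, 1, seq_len)
bias = bias * slopes.view(1, n_heads, 1, 1)  # shape (1, 48, 1, 4); the latest key gets bias 0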
blocks.py ADDED
@@ -0,0 +1,55 @@
+"""GPT Blocks used for the GPT Model."""
+from typing import Any, Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from .ffn import FFN_CLASS_REGISTRY, build_ffn
+from .norm import NORM_CLASS_REGISTRY
+try:
+    from flash_attn.bert_padding import unpad_input, pad_input
+except:
+    unpad_input, pad_input = (None, None)
+attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'sliding_window_size': -1, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, 'rope_theta': 10000, 'rope_impl': 'dail', 'rope_dail_config': {'type': 'original', 'pos_idx_in_fp32': True, 'xpos_scale_base': 512}, 'rope_hf_config': {'type': 'no_scaling', 'factor': 1.0}}
+
+class MPTBlock(nn.Module):
+
+    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Optional[Dict]=None, ffn_config: Optional[Dict]=None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, no_bias: bool=False, use_pad_tok_in_ffn: bool=True, **kwargs: Any):
+        if attn_config is None:
+            attn_config = attn_config_defaults
+        if ffn_config is None:
+            ffn_config = {'ffn_type': 'mptmlp'}
+        del kwargs
+        super().__init__()
+        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+        assert isinstance(attn_config['attn_type'], str)
+        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
+        args_to_exclude_in_attn_class = {'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id', 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', 'rope_dail_config', 'rope_hf_config'}
+        attn_config_subset_for_attn_class = {k: v for k, v in attn_config.items() if k not in args_to_exclude_in_attn_class}
+        self.norm_1 = norm_class(d_model, device=device)
+        self.attn = attn_class(d_model=d_model, n_heads=n_heads, fc_type=fc_type, device=device, **attn_config_subset_for_attn_class, bias=not no_bias)
+        self.norm_2 = None
+        if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm', False):
+            self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = build_ffn(d_model=d_model, expansion_ratio=expansion_ratio, device=device, bias=not no_bias, **ffn_config)
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
+        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
+
+    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[Dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        a = self.norm_1(x)
+        b, attn_weights, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
+        x = x + self.resid_attn_dropout(b)
+        m = x
+        if self.norm_2 is not None:
+            m = self.norm_2(x)
+        batch_size, seq_len = m.size()[:2]
+        indices = None
+        if not self.use_pad_tok_in_ffn:
+            assert unpad_input is not None
+            m, indices, _, _ = unpad_input(m, attention_mask)
+        n = self.ffn(m)
+        if not self.use_pad_tok_in_ffn:
+            assert pad_input is not None
+            n = pad_input(n, indices, batch_size, seq_len)
+        x = x + self.resid_ffn_dropout(n)
+        return (x, attn_weights, past_key_value)
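
MPTBlock.forward is a pre-norm residual block: normalize, run the sub-module, then add the dropout-regularized result back onto the stream. A dependency-free sketch of that wiring, with the attention and FFN sub-modules stubbed out as identity maps (illustrative only, not part of the commit):

import torch
import torch.nn as nn

d_model = 8
norm_1, norm_2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
attn = ffn = nn.Identity()       # stand-ins for self.attn / self.ffn
resid_dropout = nn.Dropout(0.0)  # this checkpoint sets resid_pdrop to 0.0

x = torch.randn(2, 4, d_model)
x = x + resid_dropout(attn(norm_1(x)))  # attention sub-block
x = x + resid_dropout(ffn(norm_2(x)))   # feed-forward sub-block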
config.json ADDED
@@ -0,0 +1,84 @@
+{
+  "_name_or_path": "/home/user/shahrukh/models/responder_v2_mpt",
+  "architectures": [
+    "LlavaMPTForCausalLM"
+  ],
+  "attn_config": {
+    "alibi": true,
+    "alibi_bias_max": 8,
+    "attn_impl": "flash",
+    "attn_pdrop": 0.0,
+    "attn_type": "grouped_query_attention",
+    "attn_uses_sequence_id": false,
+    "clip_qkv": 6,
+    "kv_n_heads": 8,
+    "prefix_lm": false,
+    "qk_gn": false,
+    "qk_ln": false,
+    "rope": false,
+    "rope_dail_config": {
+      "pos_idx_in_fp32": true,
+      "type": "original",
+      "xpos_scale_base": 512
+    },
+    "rope_hf_config": {
+      "factor": 1.0,
+      "type": "no_scaling"
+    },
+    "rope_impl": "dail",
+    "rope_theta": 10000,
+    "sliding_window_size": -1,
+    "softmax_scale": null
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_mpt.MPTConfig",
+    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
+  },
+  "d_model": 4608,
+  "emb_pdrop": 0.0,
+  "embedding_fraction": 1.0,
+  "expansion_ratio": 4,
+  "fc_type": "torch",
+  "ffn_config": {
+    "fc_type": "torch",
+    "ffn_type": "mptmlp"
+  },
+  "freeze_mm_mlp_adapter": false,
+  "hidden_size": 4608,
+  "image_aspect_ratio": "pad",
+  "image_grid_pinpoints": null,
+  "init_config": {
+    "emb_init_std": null,
+    "emb_init_uniform_lim": null,
+    "fan_mode": "fan_in",
+    "init_div_is_residual": true,
+    "init_gain": 0.0,
+    "init_nonlinearity": "relu",
+    "init_std": null,
+    "name": "kaiming_normal_"
+  },
+  "init_device": "cpu",
+  "learned_pos_emb": false,
+  "logit_scale": null,
+  "max_seq_len": 4096,
+  "mm_hidden_size": 1152,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "google/siglip-so400m-patch14-384",
+  "model_type": "mpt",
+  "n_heads": 48,
+  "n_layers": 32,
+  "no_bias": true,
+  "norm_type": "low_precision_layernorm",
+  "resid_pdrop": 0.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.37.0",
+  "tune_mm_mlp_adapter": false,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_pad_tok_in_ffn": true,
+  "vocab_size": 70400
+}
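
A few derived numbers: d_model=4608 with n_heads=48 gives head_dim=96, and kv_n_heads=8 makes the attention 6-way grouped-query; clip_qkv=6 clamps the fused QKV activations to [-6, 6]. Because auto_map routes AutoConfig and AutoModelForCausalLM to the bundled configuration_mpt.py and modeling_mpt.py, loading goes through trust_remote_code. A hypothetical loading sketch (the path is a placeholder, not a real model id):

from transformers import AutoConfig, AutoModelForCausalLM

repo = "path/to/this/checkpoint"  # placeholder
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype="auto",  # the config declares bfloat16 weights
    trust_remote_code=True,
)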
configuration_mpt.py ADDED
@@ -0,0 +1,183 @@
+"""A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Dict, Optional, Union
+from transformers import PretrainedConfig
+from .attention import check_alibi_support, is_flash_v1_installed, is_flash_v2_installed
+from .blocks import attn_config_defaults
+from .fc import FC_CLASS_REGISTRY
+from .norm import LPLayerNorm
+from .ffn import FFN_CLASS_REGISTRY
+from .warnings import VersionedDeprecationWarning
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
+init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
+
+class MPTConfig(PretrainedConfig):
+    model_type = 'mpt'
+
+    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: Union[int, float]=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', tie_word_embeddings: bool=True, use_pad_tok_in_ffn: bool=True, **kwargs: Any):
+        """The MPT configuration class.
+
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (Union[int, float]): The ratio of the up/down scale in the ffn.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict): A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                sliding_window_size (int): Window size for sliding window local attention. Defaults to -1, which means no sliding window. Query at position i will only attend to keys between [i + seqlen_k - seqlen_q - window_size, i + seqlen_k - seqlen_q + window_size] inclusive. Only works for flash attention v2.3.0 or higher.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+                rope (bool): Whether to use rotary positional embeddings.
+                rope_theta (int): The base frequency for rope.
+                rope_impl (str): The implementation of rope to use. One of 'hf' (to use the implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) or 'dail' (to use the implementation from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py).
+                rope_dail_config (Dict): The configuration for the dail implementation of rope.
+                    type (str): The type of rotary position embedding to use. Options: 'original' (for https://arxiv.org/pdf/2104.09864.pdf), 'xpos' (for https://arxiv.org/pdf/2212.10554.pdf).
+                    pos_idx_in_fp32 (bool): If True, the position indices [0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. A consequence could be, for example, that bf16 rounds position 1995 to 2000, which leads to them having the same positional embedding.
+                    xpos_scale_base (float): The scale base for XPos (if using XPos).
+                rope_hf_config (Dict): A dictionary used to configure rope's scaling behavior (when scaling beyond the training length).
+                    type (str): Can be one of 'no_scaling', 'linear', or 'dynamic'. 'no_scaling' uses the default implementation for rotary embeddings, 'linear' uses linear scaling as proposed by the Reddit user /u/kaiokendev, and 'dynamic' uses Dynamic NTK scaling as proposed by the Reddit users /u/bloc97 and /u/emozilla.
+                    factor (float): Scaling factor to use if using 'linear' or 'dynamic' as rope_scaling.type.
+                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+            ffn_config (Dict): A dictionary used to configure the model's ffn module:
+                ffn_type (str): type of ffn to use. Options: mptmlp, mptglu, te_ln_mlp
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to use bias in all layers.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
+            tie_word_embeddings (bool): Whether to tie the input embedding and output layers.
+            use_pad_tok_in_ffn (bool): Whether to forward the pad token in the feedforward networks.
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.ffn_config = ffn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.fc_type = fc_type
+        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False) or self.attn_config.get('rope', False):
+            self.learned_pos_emb = False
+            warnings.warn(f'alibi or rope is turned on, setting `learned_pos_emb` to `False.`')
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self._validate_config()
+
+    def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
+        for k, v in config_defaults.items():
+            if k not in config:
+                config[k] = v
+            elif isinstance(v, dict):
+                config[k] = self._set_config_defaults(config[k] if config[k] is not None else {}, v)
+        return config
+
+    def _validate_config(self) -> None:
+        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
+            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
+            warnings.warn(VersionedDeprecationWarning('Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.', remove_version='0.6.0'))
+        if self.attn_config['attn_impl'] == 'triton' and (not self.attn_config['prefix_lm']):
+            warnings.warn(UserWarning('If not using a Prefix Language Model, we recommend setting "attn_impl" to "flash" instead of "triton".'))
+        # if self.attn_config['alibi'] and (not check_alibi_support(self.attn_config['attn_impl'])):
+        #     raise NotImplementedError('alibi only implemented with torch, triton, and flash (v2.4.2 or higher) attention.')
+        if self.attn_config['attn_uses_sequence_id'] and (not (self.attn_config['attn_impl'] in ['torch', 'triton'] or (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')))):
+            raise NotImplementedError('attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] not in ['dail', 'hf']:
+            raise ValueError('If rope is being used then rope_impl should be either "dail", or "hf".')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'hf' and (self.attn_config['rope_hf_config']['type'] not in ['no_scaling', 'linear', 'dynamic']):
+            raise ValueError('If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'dail':
+            if self.attn_config['rope_dail_config']['type'] not in ['original', 'xpos']:
+                raise ValueError('If using the dail implementation of rope, the type should be one of "original" or "xpos".')
+            if not is_flash_v2_installed(v2_version='2.0.1'):
+                raise ImportError('If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support')
+        if self.attn_config['sliding_window_size'] != -1 and (not (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.3.0'))):
+            raise NotImplementedError('sliding window only implemented with flash attention v2.3.0 or higher.')
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
+        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+        if self.init_config.get('name', None) is None:
+            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
+        if not (self.learned_pos_emb or self.attn_config['alibi'] or self.attn_config['rope']):
+            warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi or rope.')
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                import transformer_engine.pytorch as te
+                del te
+            except:
+                raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' + 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
+        if self.ffn_config['ffn_type'] == 'mptgeglu':
+            raise ValueError('API CHANGE: `ffn_type=="mptgeglu"` changed to `ffn_type=="mptglu"`. ' + 'See [#829](https://github.com/mosaicml/llm-foundry/pull/829) for details.')
+        elif self.ffn_config['ffn_type'] in ['mptmlp', 'mptglu']:
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
+            if 'ffn_act_fn' in self.ffn_config.keys():
+                raise ValueError(f'Transformer Engine block does not support custom activation functions.')
+        if not self.use_pad_tok_in_ffn:
+            try:
+                from flash_attn.bert_padding import unpad_input, pad_input
+            except:
+                raise ImportError('In order to set `use_pad_tok_in_ffn=False`, please install flash-attn==1.0.9 or flash-attn==2.3.6')
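
_set_config_defaults above is a recursive dict merge: user-supplied keys win, missing keys fall back to the defaults, and nested dicts (the rope configs) are merged rather than replaced wholesale. A standalone sketch of that behavior, with names local to the example:

from typing import Any, Dict

def set_config_defaults(config: Dict[str, Any], defaults: Dict[str, Any]) -> Dict[str, Any]:
    # mirror of MPTConfig._set_config_defaults, lifted out of the class for illustration
    for k, v in defaults.items():
        if k not in config:
            config[k] = v
        elif isinstance(v, dict):
            config[k] = set_config_defaults(config[k] if config[k] is not None else {}, v)
    return config

user_cfg = {'attn_impl': 'flash', 'rope_dail_config': {'type': 'xpos'}}
defaults = {'attn_impl': 'triton', 'alibi': False, 'rope_dail_config': {'type': 'original', 'xpos_scale_base': 512}}
print(set_config_defaults(user_cfg, defaults))
# {'attn_impl': 'flash', 'rope_dail_config': {'type': 'xpos', 'xpos_scale_base': 512}, 'alibi': False}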
fc.py ADDED
@@ -0,0 +1,7 @@
+from torch import nn
+FC_CLASS_REGISTRY = {'torch': nn.Linear}
+try:
+    import transformer_engine.pytorch as te
+    FC_CLASS_REGISTRY['te'] = te.Linear
+except:
+    pass
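
The registry is how fc_type strings from the config become layer classes: 'torch' is always present, 'te' only when TransformerEngine imports cleanly. A usage sketch, duplicating the registry locally so it runs without the package:

from torch import nn

FC_CLASS_REGISTRY = {'torch': nn.Linear}
# this checkpoint uses fc_type='torch' with no_bias=true
proj = FC_CLASS_REGISTRY['torch'](4608, 4608, bias=False)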
ffn.py ADDED
@@ -0,0 +1,96 @@
+"""MPT Blocks used for the MPT Model."""
+import logging
+from copy import deepcopy
+from functools import partial
+from typing import Any, Callable, Optional, Union
+import torch
+import torch.nn as nn
+from .fc import FC_CLASS_REGISTRY
+try:
+    import transformer_engine.pytorch as te
+except:
+    te = None
+log = logging.getLogger(__name__)
+_FFN_ACT_FN_DEFAULT = {'name': 'gelu', 'approximate': 'none'}
+
+def resolve_ffn_act_fn(config: Optional[dict]=None) -> Callable[[torch.Tensor], torch.Tensor]:
+    """Resolve the activation function for the feed-forward network.
+
+    Args:
+        config (Optional[dict]): The configuration dictionary for the activation function.
+            The dict config must specify the 'name' of a torch.nn.functional activation
+            function. All of other key values pairs are bound to the function as a partial.
+
+    Returns:
+        Callable[[torch.Tensor], torch.Tensor]: The activation function.
+    """
+    if config is None:
+        config = _FFN_ACT_FN_DEFAULT
+    config = deepcopy(config)
+    name = config.pop('name')
+    if not hasattr(torch.nn.functional, name):
+        raise ValueError(f'Unrecognised activation function name ({name}).')
+    act = getattr(torch.nn.functional, name)
+    return partial(act, **config)
+_DEFAULT_ACT_FN = resolve_ffn_act_fn(_FFN_ACT_FN_DEFAULT)
+
+def resolve_ffn_hidden_size(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None) -> int:
+    """Resolve the hidden size of the feed-forward network.
+
+    Args:
+        d_model (int): The dimension of the input and output of the feed-forward network.
+        expansion_ratio (Union[int, float]): The expansion ratio of the feed-forward network.
+        ffn_hidden_size (Optional[int]): The hidden size of the feed-forward network.
+
+    Returns:
+        int: The hidden size of the feed-forward network.
+    """
+    if ffn_hidden_size is not None:
+        log.info(f'`expansion_ratio` (={expansion_ratio}) ignored when `ffn_hidden_size` (={ffn_hidden_size}) is specified.')
+    else:
+        ffn_hidden_size = int(d_model * expansion_ratio)
+        if ffn_hidden_size != d_model * expansion_ratio:
+            raise ValueError(f'`d_model * expansion_ratio` must be an integer (d_model={d_model!r}; expansion_ratio={expansion_ratio!r}; d_model * expansion_ratio={d_model * expansion_ratio!r}).')
+    return ffn_hidden_size
+
+class MPTMLP(nn.Module):
+
+    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
+        super().__init__()
+        ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
+        self.fc_kwargs: dict[str, Any] = {'bias': bias}
+        self.fc_kwargs['device'] = device
+        self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, ffn_hidden_size, **self.fc_kwargs)
+        self.act = act_fn
+        self.down_proj = FC_CLASS_REGISTRY[fc_type](ffn_hidden_size, d_model, **self.fc_kwargs)
+        self.down_proj._is_residual = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act(self.up_proj(x)))
+
+class MPTGLU(MPTMLP):
+
+    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
+        super().__init__(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, act_fn=act_fn, device=device, bias=bias)
+        self.gate_proj = FC_CLASS_REGISTRY[fc_type](d_model, self.up_proj.out_features, **self.fc_kwargs)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
+FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP, 'mptglu': MPTGLU}
+if te is not None:
+    te.LayerNormMLP._has_norm = True
+    FFN_CLASS_REGISTRY['te_ln_mlp'] = te.LayerNormMLP
+
+def build_ffn(d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
+    ffn_type = kwargs.pop('ffn_type')
+    if ffn_type in ['mptmlp', 'mptglu']:
+        if len(kwargs) > 0:
+            raise ValueError(f'MPTMLP (or MPTGLU) got an unexpected keyword argument: {kwargs}')
+        return FFN_CLASS_REGISTRY[ffn_type](d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, act_fn=resolve_ffn_act_fn(ffn_act_fn), ffn_hidden_size=ffn_hidden_size, device=device, bias=bias)
+    elif ffn_type == 'te_ln_mlp':
+        assert te is not None
+        ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
+        if ffn_act_fn is not None:
+            raise ValueError(f'Transformer Engine block does not support custom activation functions.')
+        return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=ffn_hidden_size, bias=bias, **kwargs)
+    raise ValueError(f'ffn_type={ffn_type!r} not recognized.')
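
The two FFN variants differ by one projection: MPTMLP computes down_proj(act(up_proj(x))), while MPTGLU gates the up branch, down_proj(act(gate_proj(x)) * up_proj(x)). A standalone sketch of the GLU forward rule with the default gelu activation (dimensions illustrative, not part of the commit):

import torch
import torch.nn as nn
import torch.nn.functional as F

d_model, ffn_hidden = 8, 32
gate = nn.Linear(d_model, ffn_hidden)
up = nn.Linear(d_model, ffn_hidden)
down = nn.Linear(ffn_hidden, d_model)

x = torch.randn(2, 4, d_model)
y = down(F.gelu(gate(x), approximate='none') * up(x))  # mptglu forward
assert y.shape == (2, 4, d_model)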
flash_attn_triton.py ADDED
@@ -0,0 +1,484 @@
+ """
+ Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
+ Updated imports to use 'triton_pre_mlir'.
+
+ *Experimental* implementation of FlashAttention in Triton.
+ Tested with triton==2.0.0.dev20221202.
+ Triton 2.0 has a new backend (MLIR), but it doesn't yet seem to work for head dimensions
+ other than 64:
+ https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
+ We'll update this implementation with the new Triton backend once this is fixed.
+
+ We use the FlashAttention implementation from Phil Tillet as a starting point.
+ https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
+
+ Changes:
+ - Implement both causal and non-causal attention.
+ - Implement both self-attention and cross-attention.
+ - Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
+ - Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
+ - Support attention bias.
+ - Speed up the forward pass a bit, and only store the LSE instead of m and l.
+ - Make the backward for d=128 much faster by reducing register spilling.
+ - Optionally parallelize the backward pass across seqlen_k, to deal with the case of
+   small batch size * nheads.
+
+ Caution:
+ - This is an *experimental* implementation. The forward pass should be quite robust, but
+   I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
+ - This implementation has only been tested on A100.
+ - If you plan to use a headdim other than 64 and 128, you should test for race conditions
+   (due to the Triton compiler), as done in tests/test_flash_attn.py
+   "test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
+   for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
+   that there are none left for other head dimensions.
+
+ Differences between this Triton version and the CUDA version:
+ - Triton version doesn't support dropout.
+ - Triton forward is generally faster than CUDA forward, while Triton backward is
+   generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
+   than CUDA forward + backward.
+ - Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
+ - Triton version supports attention bias, while CUDA version doesn't.
+ """
+ import math
+ import torch
+ import triton_pre_mlir as triton
+ import triton_pre_mlir.language as tl
+
+ @triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
+ @triton.jit
+ def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+     start_m = tl.program_id(0)
+     off_hb = tl.program_id(1)
+     off_b = off_hb // nheads
+     off_h = off_hb % nheads
+     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     offs_n = tl.arange(0, BLOCK_N)
+     offs_d = tl.arange(0, BLOCK_HEADDIM)
+     q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
+     k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
+     v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
+     if BIAS_TYPE == 'vector':
+         b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
+     elif BIAS_TYPE == 'matrix':
+         b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
+     t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
+     lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
+     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
+     acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
+     if EVEN_M & EVEN_N:
+         if EVEN_HEADDIM:
+             q = tl.load(q_ptrs)
+         else:
+             q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+     elif EVEN_HEADDIM:
+         q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
+     else:
+         q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+     end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
+     for start_n in range(0, end_n, BLOCK_N):
+         start_n = tl.multiple_of(start_n, BLOCK_N)
+         if EVEN_N & EVEN_M:
+             if EVEN_HEADDIM:
+                 k = tl.load(k_ptrs + start_n * stride_kn)
+             else:
+                 k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
+         elif EVEN_HEADDIM:
+             k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
+         else:
+             k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+         qk += tl.dot(q, k, trans_b=True)
+         if not EVEN_N:
+             qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float('-inf'))
+         if IS_CAUSAL:
+             qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float('-inf'))
+         if BIAS_TYPE != 'none':
+             if BIAS_TYPE == 'vector':
+                 if EVEN_N:
+                     bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                 else:
+                     bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
+                 bias = bias[None, :]
+             elif BIAS_TYPE == 'matrix':
+                 if EVEN_M & EVEN_N:
+                     bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                 else:
+                     bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
+             qk = qk * softmax_scale + bias
+             m_ij = tl.maximum(tl.max(qk, 1), lse_i)
+             p = tl.exp(qk - m_ij[:, None])
+         else:
+             m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
+             p = tl.exp(qk * softmax_scale - m_ij[:, None])
+         l_ij = tl.sum(p, 1)
+         acc_o_scale = tl.exp(m_i - m_ij)
+         tl.store(t_ptrs, acc_o_scale)  # store then immediately reload through TMP (works around a Triton compiler bug)
+         acc_o_scale = tl.load(t_ptrs)
+         acc_o = acc_o * acc_o_scale[:, None]
+         if EVEN_N & EVEN_M:
+             if EVEN_HEADDIM:
+                 v = tl.load(v_ptrs + start_n * stride_vn)
+             else:
+                 v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
+         elif EVEN_HEADDIM:
+             v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
+         else:
+             v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+         p = p.to(v.dtype)
+         acc_o += tl.dot(p, v)
+         m_i = m_ij
+         l_i_new = tl.exp(lse_i - m_ij) + l_ij
+         lse_i = m_ij + tl.log(l_i_new)
+     o_scale = tl.exp(m_i - lse_i)
+     tl.store(t_ptrs, o_scale)  # same TMP store/load workaround as above
+     o_scale = tl.load(t_ptrs)
+     acc_o = acc_o * o_scale[:, None]
+     start_m = tl.program_id(0)
+     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
+     tl.store(lse_ptrs, lse_i)
+     offs_d = tl.arange(0, BLOCK_HEADDIM)
+     out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
+     if EVEN_M:
+         if EVEN_HEADDIM:
+             tl.store(out_ptrs, acc_o)
+         else:
+             tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
+     elif EVEN_HEADDIM:
+         tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
+     else:
+         tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
+
+ @triton.jit
+ def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
+     start_m = tl.program_id(0)
+     off_hb = tl.program_id(1)
+     off_b = off_hb // nheads
+     off_h = off_hb % nheads
+     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     offs_d = tl.arange(0, BLOCK_HEADDIM)
+     o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
+     do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
+     delta = tl.sum(o * do, axis=1)
+     tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
+
+ @triton.jit
+ def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
+     if EVEN_N & EVEN_M:
+         if EVEN_HEADDIM:
+             tl.store(dv_ptrs, dv)
+             tl.store(dk_ptrs, dk)
+         else:
+             tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
+             tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
+     elif EVEN_HEADDIM:
+         tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
+         tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
+     else:
+         tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
+         tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
+
+ @triton.jit
+ def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+     begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
+     offs_qm = begin_m + tl.arange(0, BLOCK_M)
+     offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+     offs_m = tl.arange(0, BLOCK_M)
+     offs_d = tl.arange(0, BLOCK_HEADDIM)
+     q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
+     k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
+     v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
+     do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
+     dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
+     if BIAS_TYPE == 'vector':
+         b_ptrs = Bias + offs_n
+     elif BIAS_TYPE == 'matrix':
+         b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
+     dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
+     dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
+     if begin_m >= seqlen_q:
+         dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+         dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+         _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
+         return
+     if EVEN_N & EVEN_M:
+         if EVEN_HEADDIM:
+             k = tl.load(k_ptrs)
+             v = tl.load(v_ptrs)
+         else:
+             k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+             v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+     elif EVEN_HEADDIM:
+         k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+         v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+     else:
+         k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+         v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+     num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
+     for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
+         start_m = tl.multiple_of(start_m, BLOCK_M)
+         offs_m_curr = start_m + offs_m
+         if EVEN_M & EVEN_HEADDIM:
+             q = tl.load(q_ptrs)
+         elif EVEN_HEADDIM:
+             q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
+         else:
+             q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+         qk = tl.dot(q, k, trans_b=True)
+         if not EVEN_N:
+             qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
+         if IS_CAUSAL:
+             qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float('-inf'))
+         if BIAS_TYPE != 'none':
+             tl.debug_barrier()
+             if BIAS_TYPE == 'vector':
+                 if EVEN_N:
+                     bias = tl.load(b_ptrs).to(tl.float32)
+                 else:
+                     bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
+                 bias = bias[None, :]
+             elif BIAS_TYPE == 'matrix':
+                 if EVEN_M & EVEN_N:
+                     bias = tl.load(b_ptrs).to(tl.float32)
+                 else:
+                     bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
+             qk = qk * softmax_scale + bias
+         if not EVEN_M & EVEN_HEADDIM:
+             tl.debug_barrier()
+         lse_i = tl.load(LSE + offs_m_curr)
+         if BIAS_TYPE == 'none':
+             p = tl.exp(qk * softmax_scale - lse_i[:, None])
+         else:
+             p = tl.exp(qk - lse_i[:, None])
+         if EVEN_M & EVEN_HEADDIM:
+             do = tl.load(do_ptrs)
+         else:
+             do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+         dv += tl.dot(p.to(do.dtype), do, trans_a=True)
+         if not EVEN_M & EVEN_HEADDIM:
+             tl.debug_barrier()
+         dp = tl.dot(do, v, trans_b=True)
+         if not EVEN_HEADDIM:
+             tl.debug_barrier()
+         Di = tl.load(D + offs_m_curr)
+         ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
+         dk += tl.dot(ds, q, trans_a=True)
+         if not EVEN_M & EVEN_HEADDIM:
+             tl.debug_barrier()
+         if not ATOMIC_ADD:
+             if EVEN_M & EVEN_HEADDIM:
+                 dq = tl.load(dq_ptrs, eviction_policy='evict_last')
+                 dq += tl.dot(ds, k)
+                 tl.store(dq_ptrs, dq, eviction_policy='evict_last')
+             elif EVEN_HEADDIM:
+                 dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy='evict_last')
+                 dq += tl.dot(ds, k)
+                 tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy='evict_last')
+             else:
+                 dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy='evict_last')
+                 dq += tl.dot(ds, k)
+                 tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy='evict_last')
+         else:
+             dq = tl.dot(ds, k)
+             if EVEN_M & EVEN_HEADDIM:
+                 tl.atomic_add(dq_ptrs, dq)
+             elif EVEN_HEADDIM:
+                 tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
+             else:
+                 tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
+         dq_ptrs += BLOCK_M * stride_dqm
+         q_ptrs += BLOCK_M * stride_qm
+         do_ptrs += BLOCK_M * stride_dom
+         if BIAS_TYPE == 'matrix':
+             b_ptrs += BLOCK_M * stride_bm
+     dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+     dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+     _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
+
+ def init_to_zero(name):
+     return lambda nargs: nargs[name].zero_()
+
+ @triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
+ @triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
+ @triton.jit
+ def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+     off_hb = tl.program_id(1)
+     off_b = off_hb // nheads
+     off_h = off_hb % nheads
+     Q += off_b * stride_qb + off_h * stride_qh
+     K += off_b * stride_kb + off_h * stride_kh
+     V += off_b * stride_vb + off_h * stride_vh
+     DO += off_b * stride_dob + off_h * stride_doh
+     DQ += off_b * stride_dqb + off_h * stride_dqh
+     DK += off_b * stride_dkb + off_h * stride_dkh
+     DV += off_b * stride_dvb + off_h * stride_dvh
+     if BIAS_TYPE != 'none':
+         Bias += off_b * stride_bb + off_h * stride_bh
+     D += off_hb * seqlen_q_rounded
+     LSE += off_hb * seqlen_q_rounded
+     if not SEQUENCE_PARALLEL:
+         num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
+         for start_n in range(0, num_block_n):
+             _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
+     else:
+         start_n = tl.program_id(0)
+         _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=True, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
+
+ def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
+     batch, seqlen_q, nheads, d = q.shape
+     _, seqlen_k, _, _ = k.shape
+     assert k.shape == (batch, seqlen_k, nheads, d)
+     assert v.shape == (batch, seqlen_k, nheads, d)
+     assert d <= 128, 'FlashAttention only supports head dimensions up to 128'
+     assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
+     assert q.dtype in [torch.float16, torch.bfloat16], 'Only supports fp16 and bf16'
+     assert q.is_cuda and k.is_cuda and v.is_cuda
+     softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+     has_bias = bias is not None
+     bias_type = 'none'
+     if has_bias:
+         assert bias.dtype in [q.dtype, torch.float]
+         assert bias.is_cuda
+         assert bias.dim() == 4
+         if bias.stride(-1) != 1:
+             bias = bias.contiguous()
+         if bias.shape[2:] == (1, seqlen_k):
+             bias_type = 'vector'
+         elif bias.shape[2:] == (seqlen_q, seqlen_k):
+             bias_type = 'matrix'
+         else:
+             raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
+         bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+     bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+     seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+     lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
+     tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
+     o = torch.empty_like(q)
+     BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+     BLOCK = 128
+     num_warps = 4 if d <= 64 else 8
+     grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
+     _fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
+     return (o, lse, softmax_scale)
+
+ def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
+     if do.stride(-1) != 1:
+         do = do.contiguous()
+     batch, seqlen_q, nheads, d = q.shape
+     _, seqlen_k, _, _ = k.shape
+     assert d <= 128
+     seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+     assert lse.shape == (batch, nheads, seqlen_q_rounded)
+     assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
+     assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
+     softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+     dq_accum = torch.empty_like(q, dtype=torch.float32)
+     delta = torch.empty_like(lse)
+     BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+     grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
+     _bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
+     has_bias = bias is not None
+     bias_type = 'none'
+     if has_bias:
+         assert bias.dtype in [q.dtype, torch.float]
+         assert bias.is_cuda
+         assert bias.dim() == 4
+         assert bias.stride(-1) == 1
+         if bias.shape[2:] == (1, seqlen_k):
+             bias_type = 'vector'
+         elif bias.shape[2:] == (seqlen_q, seqlen_k):
+             bias_type = 'matrix'
+         else:
+             raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
+         bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+     bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+     grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
+     _bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM)
+     dq.copy_(dq_accum)
+
+ class FlashAttnQKVPackedFunc(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
+         """
+         qkv: (batch, seqlen, 3, nheads, headdim)
+         bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
+             For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
+             ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
+         """
+         if qkv.stride(-1) != 1:
+             qkv = qkv.contiguous()
+         o, lse, ctx.softmax_scale = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
+         ctx.save_for_backward(qkv, o, lse, bias)
+         ctx.causal = causal
+         return o
+
+     @staticmethod
+     def backward(ctx, do):
+         qkv, o, lse, bias = ctx.saved_tensors
+         assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet'
+         with torch.inference_mode():
+             dqkv = torch.empty_like(qkv)
+             _flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+         return (dqkv, None, None, None)
+ flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
+
+ class FlashAttnKVPackedFunc(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
+         """
+         q: (batch, seqlen_q, nheads, headdim)
+         kv: (batch, seqlen_k, 2, nheads, headdim)
+         bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
+             For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+             ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+         """
+         q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
+         o, lse, ctx.softmax_scale = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
+         ctx.save_for_backward(q, kv, o, lse, bias)
+         ctx.causal = causal
+         return o
+
+     @staticmethod
+     def backward(ctx, do):
+         q, kv, o, lse, bias = ctx.saved_tensors
+         if len(ctx.needs_input_grad) >= 3:
+             assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet'
+         with torch.inference_mode():
+             dq = torch.empty_like(q)
+             dkv = torch.empty_like(kv)
+             _flash_attn_backward(do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+         return (dq, dkv, None, None, None)
+ flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
+
+ class FlashAttnFunc(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
+         """
+         q: (batch_size, seqlen_q, nheads, headdim)
+         k, v: (batch_size, seqlen_k, nheads, headdim)
+         bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
+             For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+             ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+         """
+         q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
+         o, lse, ctx.softmax_scale = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
+         ctx.save_for_backward(q, k, v, o, lse, bias)
+         ctx.causal = causal
+         return o
+
+     @staticmethod
+     def backward(ctx, do):
+         q, k, v, o, lse, bias = ctx.saved_tensors
+         assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet'
+         with torch.inference_mode():
+             dq = torch.empty_like(q)
+             dk = torch.empty_like(k)
+             dv = torch.empty_like(v)
+             _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+         return (dq, dk, dv, None, None, None)
+ flash_attn_func = FlashAttnFunc.apply
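Editor's note: a minimal usage sketch of the `flash_attn_func` export defined above (illustrative only, not part of the committed file). The toy shapes and the all-zero 'matrix' bias are assumptions; fp16 CUDA tensors with a contiguous last dimension are required by the asserts in `_flash_attn_forward`, and `Function.apply` takes positional arguments only.

# Illustrative sketch under assumed toy sizes; not part of the commit.
import math
import torch

batch, seqlen, nheads, headdim = 2, 256, 8, 64
q = torch.randn(batch, seqlen, nheads, headdim, device='cuda', dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q).requires_grad_()
v = torch.randn_like(q).requires_grad_()
# Optional additive 'matrix' bias: last two dims are (seqlen_q, seqlen_k).
bias = torch.zeros(1, nheads, seqlen, seqlen, device='cuda', dtype=torch.float16)
out = flash_attn_func(q, k, v, bias, True, 1.0 / math.sqrt(headdim))  # causal=True
out.sum().backward()  # dq, dk, dv come from the Triton backward kernels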
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.37.0",
+   "use_cache": false
+ }
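Editor's note: a hypothetical loading sketch for the files added in this commit; the repo id below is a placeholder, and `trust_remote_code=True` is needed because the modeling code (e.g. attention.py) ships with the checkpoint.

# Sketch only; "<org>/<model>" is a placeholder repo id.
from transformers import AutoModelForCausalLM, GenerationConfig

repo_id = "<org>/<model>"
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
gen_config = GenerationConfig.from_pretrained(repo_id)
print(gen_config.use_cache)  # False, matching generation_config.json above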
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea0f4892cf6a2da609fc78fab6176e0588072596b483deaa8ea7f9bd98b0ec7d
+ size 4867418872
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:094da751a895042020c3533ae688a3f7f4c63d939a67c4b82f89719692e44701
+ size 4997208808
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:286837a01694d84b56617f317434bd9d062670df73d9eb287f43a236aba7b843
+ size 4993407560
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc9e74be0a34c858abda23911f8b01b278242c7b46308419a7751ee3ccffc2fc
+ size 743548904
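Editor's note: each pointer above records the SHA-256 digest (`oid`) and byte size of the actual shard stored in Git LFS. A small sketch for checking a downloaded shard against its pointer (digest and size copied from the model-00004 pointer above; the `sha256_of` helper is ours):

import hashlib
import os

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading ~5 GB shards into memory.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

path = 'model-00004-of-00004.safetensors'
assert os.path.getsize(path) == 743548904
assert sha256_of(path) == 'fc9e74be0a34c858abda23911f8b01b278242c7b46308419a7751ee3ccffc2fc'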
model.safetensors.index.json ADDED
@@ -0,0 +1,653 @@
+ {
+   "metadata": {
+     "total_size": 15601489024
+   },
+   "weight_map": {
+     "transformer.blocks.0.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.0.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.0.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.0.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.0.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.0.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.1.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.10.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.10.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.10.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.10.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.10.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.10.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.11.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.12.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.13.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.14.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.15.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.16.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.17.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.18.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.19.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.2.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.2.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.2.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.2.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.2.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.2.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.20.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.20.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.20.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.20.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.20.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.20.norm_2.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.21.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.21.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.21.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.21.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.21.norm_1.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.21.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.22.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.23.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.24.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.25.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.26.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.27.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.28.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.29.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.3.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.3.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.3.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.3.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.3.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.3.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.30.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.30.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.30.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.30.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.30.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.30.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.norm_1.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.31.norm_2.weight": "model-00003-of-00004.safetensors",
+     "transformer.blocks.4.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.4.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.4.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.4.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.4.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.4.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.5.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.6.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.7.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.8.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.9.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.9.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.9.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+     "transformer.blocks.9.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.9.norm_1.weight": "model-00001-of-00004.safetensors",
+     "transformer.blocks.9.norm_2.weight": "model-00001-of-00004.safetensors",
+     "transformer.mm_projector.0.bias": "model-00004-of-00004.safetensors",
+     "transformer.mm_projector.0.weight": "model-00004-of-00004.safetensors",
+     "transformer.mm_projector.2.bias": "model-00004-of-00004.safetensors",
+     "transformer.mm_projector.2.weight": "model-00004-of-00004.safetensors",
+     "transformer.norm_f.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+     "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
328
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
329
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
330
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
331
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
332
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
333
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
334
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
335
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
336
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
337
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
338
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
339
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
340
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
341
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
342
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
343
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
344
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
345
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
346
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
347
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
348
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
349
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
350
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
351
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
352
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
353
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
354
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
355
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
356
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
357
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
358
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
359
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
360
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
361
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
362
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
363
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
364
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
365
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
366
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
367
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
368
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
369
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
370
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
371
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
372
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
373
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
374
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
375
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
376
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
377
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
378
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
379
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
380
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
381
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
382
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
383
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
384
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
385
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
386
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
387
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
388
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
389
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
390
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
391
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
392
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
393
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
394
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
395
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
396
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
397
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
398
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
399
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
400
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
401
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
402
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
403
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
404
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
405
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
406
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
407
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
408
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
409
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
410
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
411
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
412
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
413
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
414
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
415
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
416
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
417
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
418
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
419
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
420
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
421
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
422
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
423
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
424
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
425
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
426
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
427
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
428
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
429
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
430
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
431
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
432
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
433
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
434
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
435
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
436
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
437
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
438
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
439
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
440
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
441
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
442
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
443
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
444
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
445
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
446
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
447
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
448
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
449
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
450
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
451
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
452
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
453
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
454
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
455
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
456
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
457
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
458
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
459
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
460
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
461
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
462
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
463
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
464
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
465
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
466
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
467
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
468
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
469
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
470
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
471
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
472
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
473
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
474
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
475
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
476
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
477
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
478
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
479
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
480
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
481
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
482
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
483
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
484
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
485
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
486
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
487
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
488
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
489
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
490
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
491
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
492
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
493
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
494
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
495
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
496
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
497
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
498
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
499
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
500
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
501
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
502
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
503
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
504
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
505
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
506
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
507
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
508
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
509
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
510
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00004-of-00004.safetensors",
511
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00004-of-00004.safetensors",
512
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00004-of-00004.safetensors",
513
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00004-of-00004.safetensors",
514
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00004-of-00004.safetensors",
515
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00004-of-00004.safetensors",
516
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00004-of-00004.safetensors",
517
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00004-of-00004.safetensors",
518
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
519
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
520
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
521
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
522
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
523
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
524
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
525
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
526
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
527
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
528
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
529
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
530
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
531
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
532
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
533
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
534
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
535
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
536
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
537
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
538
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
539
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
540
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
541
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
542
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
543
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
544
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
545
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
546
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
547
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
548
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
549
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
550
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
551
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
552
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
553
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
554
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
555
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
556
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
557
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
558
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
559
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
560
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
561
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
562
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
563
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
564
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
565
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
566
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
567
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
568
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
569
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
570
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
571
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
572
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
573
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
574
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
575
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
576
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
577
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
578
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
579
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
580
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
581
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
582
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
583
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
584
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
585
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
586
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
587
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
588
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
589
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
590
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
591
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
592
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
593
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
594
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
595
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
596
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
597
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
598
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
599
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
600
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
601
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
602
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
603
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
604
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
605
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
606
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
607
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
608
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
609
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
610
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
611
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
612
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
613
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
614
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
615
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
616
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
617
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
618
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
619
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
620
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
621
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
622
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
623
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
624
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
625
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
626
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
627
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
628
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
629
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
630
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
631
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
632
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
633
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
634
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
635
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
636
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
637
+ "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
638
+ "transformer.vision_tower.vision_tower.vision_model.head.attention.in_proj_bias": "model-00004-of-00004.safetensors",
639
+ "transformer.vision_tower.vision_tower.vision_model.head.attention.in_proj_weight": "model-00004-of-00004.safetensors",
640
+ "transformer.vision_tower.vision_tower.vision_model.head.attention.out_proj.bias": "model-00004-of-00004.safetensors",
641
+ "transformer.vision_tower.vision_tower.vision_model.head.attention.out_proj.weight": "model-00004-of-00004.safetensors",
642
+ "transformer.vision_tower.vision_tower.vision_model.head.layernorm.bias": "model-00004-of-00004.safetensors",
643
+ "transformer.vision_tower.vision_tower.vision_model.head.layernorm.weight": "model-00004-of-00004.safetensors",
644
+ "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc1.bias": "model-00004-of-00004.safetensors",
645
+ "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc1.weight": "model-00004-of-00004.safetensors",
646
+ "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc2.bias": "model-00004-of-00004.safetensors",
647
+ "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc2.weight": "model-00004-of-00004.safetensors",
648
+ "transformer.vision_tower.vision_tower.vision_model.head.probe": "model-00004-of-00004.safetensors",
649
+ "transformer.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
650
+ "transformer.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors",
651
+ "transformer.wte.weight": "model-00001-of-00004.safetensors"
652
+ }
653
+ }
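The weight_map above pins every tensor name to one of four safetensors shards: the vision tower sits mostly in shards 3 and 4, while the token embedding transformer.wte.weight lives in shard 1. For reference, a minimal sketch of resolving a single tensor through this index, assuming model.safetensors.index.json and the shard files have been downloaded to the working directory:

    import json
    from safetensors import safe_open

    # "weight_map" maps each tensor name to the shard file that stores it.
    with open("model.safetensors.index.json") as f:
        index = json.load(f)

    name = "transformer.wte.weight"    # token embedding; shard 1 per the map above
    shard = index["weight_map"][name]  # -> "model-00001-of-00004.safetensors"

    # Open only that shard and read the single tensor lazily.
    with safe_open(shard, framework="pt") as f:
        tensor = f.get_tensor(name)
    print(name, tuple(tensor.shape), shard)

Loaders such as transformers perform this lookup automatically; the sketch only illustrates what the index encodes.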
norm.py ADDED
@@ -0,0 +1,57 @@
+ from typing import Dict, List, Optional, Type, Union
+ import torch
+
+ # Cast a tensor to the active autocast dtype (CUDA or CPU) so norm math can run in low precision.
+ def _cast_if_autocast_enabled(tensor: torch.Tensor) -> torch.Tensor:
+     if torch.is_autocast_enabled():
+         if tensor.device.type == 'cuda':
+             dtype = torch.get_autocast_gpu_dtype()
+         elif tensor.device.type == 'cpu':
+             dtype = torch.get_autocast_cpu_dtype()
+         else:
+             raise NotImplementedError()
+         return tensor.to(dtype=dtype)
+     return tensor
+
+ class LPLayerNorm(torch.nn.LayerNorm):
+
+     def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, elementwise_affine: bool=True, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None):
+         super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         module_device = x.device
+         downcast_x = _cast_if_autocast_enabled(x)
+         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
+         downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
+         with torch.autocast(enabled=False, device_type=module_device.type):
+             return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
+
+ # RMSNorm: scale by the reciprocal root-mean-square over the last dimension, with an optional learned weight.
+ def rms_norm(x: torch.Tensor, weight: Optional[torch.Tensor]=None, eps: float=1e-05) -> torch.Tensor:
+     output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+     if weight is not None:
+         return output * weight
+     return output
+
+ class RMSNorm(torch.nn.Module):
+
+     def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
+         super().__init__()
+         self.eps = eps
+         if weight:
+             self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
+         else:
+             self.register_parameter('weight', None)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
+
+ class LPRMSNorm(RMSNorm):
+
+     def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
+         super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         downcast_x = _cast_if_autocast_enabled(x)
+         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
+         with torch.autocast(enabled=False, device_type=x.device.type):
+             return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
+
+ # Lookup table mapping config strings to norm implementations; imported elsewhere in the package (e.g. attention.py).
+ NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
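As a quick sanity check of the registry above, a minimal usage sketch (the top-level import path is hypothetical for illustration; inside the package this module is imported as .norm):

    import torch
    from norm import NORM_CLASS_REGISTRY  # hypothetical import path for a standalone test

    # Pick a norm implementation by its config string, as the model code does.
    norm = NORM_CLASS_REGISTRY["rmsnorm"](normalized_shape=4096, eps=1e-05)

    x = torch.randn(2, 16, 4096, dtype=torch.float16)
    y = norm(x)
    print(y.dtype)  # torch.float16: rms_norm computes in fp32, then casts back to the input dtype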
special_tokens_map.json ADDED
@@ -0,0 +1,36 @@
+ {
+ "additional_special_tokens": [
+ "<|SYSTEM|>",
+ "<|USER|>",
+ "<|RESPONSE|>"
+ ],
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
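The map declares <s>/</s> as BOS/EOS, reuses <unk> as the padding token, and registers three chat-role markers as additional special tokens. A hedged sketch of how these surface through transformers (the checkpoint path is a placeholder):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/this/checkpoint", trust_remote_code=True)

    print(tok.bos_token, tok.eos_token, tok.pad_token)  # <s> </s> <unk>
    print(tok.additional_special_tokens)                # ['<|SYSTEM|>', '<|USER|>', '<|RESPONSE|>']

    # Registered special tokens are matched atomically, so the role markers
    # should each map to a single id rather than being split by the BPE model.
    ids = tok("<|USER|>Hello<|RESPONSE|>").input_ids
    print(tok.convert_ids_to_tokens(ids))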
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,1758 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70000": {
+ "content": "<unused0>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70001": {
+ "content": "<unused1>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70002": {
+ "content": "<unused2>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70003": {
+ "content": "<unused3>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70004": {
+ "content": "<unused4>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70005": {
+ "content": "<unused5>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70006": {
+ "content": "<unused6>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70007": {
+ "content": "<unused7>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70008": {
+ "content": "<unused8>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70009": {
+ "content": "<unused9>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70010": {
+ "content": "<unused10>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70011": {
+ "content": "<unused11>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70012": {
+ "content": "<unused12>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70013": {
+ "content": "<unused13>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70014": {
+ "content": "<unused14>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70015": {
+ "content": "<unused15>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70016": {
+ "content": "<unused16>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70017": {
+ "content": "<unused17>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70018": {
+ "content": "<unused18>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70019": {
+ "content": "<unused19>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70020": {
+ "content": "<unused20>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70021": {
+ "content": "<unused21>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70022": {
+ "content": "<unused22>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70023": {
+ "content": "<unused23>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70024": {
+ "content": "<unused24>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70025": {
+ "content": "<unused25>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70026": {
+ "content": "<unused26>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70027": {
+ "content": "<unused27>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70028": {
+ "content": "<unused28>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "70029": {
+ "content": "<unused29>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
282
+ },
283
+ "70030": {
284
+ "content": "<unused30>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "70031": {
292
+ "content": "<unused31>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "70032": {
300
+ "content": "<unused32>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "70033": {
308
+ "content": "<unused33>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "70034": {
316
+ "content": "<unused34>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "70035": {
324
+ "content": "<unused35>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "70036": {
332
+ "content": "<unused36>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "70037": {
340
+ "content": "<unused37>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "70038": {
348
+ "content": "<unused38>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "70039": {
356
+ "content": "<unused39>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "70040": {
364
+ "content": "<unused40>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "70041": {
372
+ "content": "<unused41>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "70042": {
380
+ "content": "<unused42>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "70043": {
388
+ "content": "<unused43>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "70044": {
396
+ "content": "<unused44>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "70045": {
404
+ "content": "<unused45>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "70046": {
412
+ "content": "<unused46>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "70047": {
420
+ "content": "<unused47>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "70048": {
428
+ "content": "<unused48>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "70049": {
436
+ "content": "<unused49>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "70050": {
444
+ "content": "<unused50>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "70051": {
452
+ "content": "<unused51>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "70052": {
460
+ "content": "<unused52>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "70053": {
468
+ "content": "<unused53>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "70054": {
476
+ "content": "<unused54>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "70055": {
484
+ "content": "<unused55>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "70056": {
492
+ "content": "<unused56>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "70057": {
500
+ "content": "<unused57>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "70058": {
508
+ "content": "<unused58>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "70059": {
516
+ "content": "<unused59>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "70060": {
524
+ "content": "<unused60>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "70061": {
532
+ "content": "<unused61>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "70062": {
540
+ "content": "<unused62>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "70063": {
548
+ "content": "<unused63>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "70064": {
556
+ "content": "<unused64>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "70065": {
564
+ "content": "<unused65>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "70066": {
572
+ "content": "<unused66>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "70067": {
580
+ "content": "<unused67>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "70068": {
588
+ "content": "<unused68>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "70069": {
596
+ "content": "<unused69>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "70070": {
604
+ "content": "<unused70>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "70071": {
612
+ "content": "<unused71>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "70072": {
620
+ "content": "<unused72>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "70073": {
628
+ "content": "<unused73>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "70074": {
636
+ "content": "<unused74>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "70075": {
644
+ "content": "<unused75>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "70076": {
652
+ "content": "<unused76>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "70077": {
660
+ "content": "<unused77>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "70078": {
668
+ "content": "<unused78>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "70079": {
676
+ "content": "<unused79>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "70080": {
684
+ "content": "<unused80>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "70081": {
692
+ "content": "<unused81>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "70082": {
700
+ "content": "<unused82>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "70083": {
708
+ "content": "<unused83>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "70084": {
716
+ "content": "<unused84>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "70085": {
724
+ "content": "<unused85>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "70086": {
732
+ "content": "<unused86>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "70087": {
740
+ "content": "<unused87>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "70088": {
748
+ "content": "<unused88>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "70089": {
756
+ "content": "<unused89>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "70090": {
764
+ "content": "<unused90>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "70091": {
772
+ "content": "<unused91>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "70092": {
780
+ "content": "<unused92>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "70093": {
788
+ "content": "<unused93>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "70094": {
796
+ "content": "<unused94>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "70095": {
804
+ "content": "<unused95>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "70096": {
812
+ "content": "<unused96>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "70097": {
820
+ "content": "<unused97>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "70098": {
828
+ "content": "<unused98>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "70099": {
836
+ "content": "<unused99>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "70100": {
844
+ "content": "<unused100>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "70101": {
852
+ "content": "<unused101>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "70102": {
860
+ "content": "<unused102>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "70103": {
868
+ "content": "<unused103>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "70104": {
876
+ "content": "<unused104>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "70105": {
884
+ "content": "<unused105>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "70106": {
892
+ "content": "<unused106>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "70107": {
900
+ "content": "<unused107>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "70108": {
908
+ "content": "<unused108>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "70109": {
916
+ "content": "<unused109>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "70110": {
924
+ "content": "<unused110>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "70111": {
932
+ "content": "<unused111>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "70112": {
940
+ "content": "<unused112>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "70113": {
948
+ "content": "<unused113>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "70114": {
956
+ "content": "<unused114>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "70115": {
964
+ "content": "<unused115>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "70116": {
972
+ "content": "<unused116>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "70117": {
980
+ "content": "<unused117>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "70118": {
988
+ "content": "<unused118>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "70119": {
996
+ "content": "<unused119>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "70120": {
1004
+ "content": "<unused120>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "70121": {
1012
+ "content": "<unused121>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "70122": {
1020
+ "content": "<unused122>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "70123": {
1028
+ "content": "<unused123>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "70124": {
1036
+ "content": "<unused124>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "70125": {
1044
+ "content": "<unused125>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "70126": {
1052
+ "content": "<unused126>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "70127": {
1060
+ "content": "<unused127>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "70128": {
1068
+ "content": "<unused128>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "70129": {
1076
+ "content": "<unused129>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "70130": {
1084
+ "content": "<unused130>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "70131": {
1092
+ "content": "<unused131>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "70132": {
1100
+ "content": "<unused132>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "70133": {
1108
+ "content": "<unused133>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "70134": {
1116
+ "content": "<unused134>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "70135": {
1124
+ "content": "<unused135>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "70136": {
1132
+ "content": "<unused136>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "70137": {
1140
+ "content": "<unused137>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "70138": {
1148
+ "content": "<unused138>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "70139": {
1156
+ "content": "<unused139>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "70140": {
1164
+ "content": "<unused140>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "70141": {
1172
+ "content": "<unused141>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "70142": {
1180
+ "content": "<unused142>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "70143": {
1188
+ "content": "<unused143>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "70144": {
1196
+ "content": "<unused144>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "70145": {
1204
+ "content": "<unused145>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "70146": {
1212
+ "content": "<unused146>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "70147": {
1220
+ "content": "<unused147>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "70148": {
1228
+ "content": "<unused148>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "70149": {
1236
+ "content": "<unused149>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "70150": {
1244
+ "content": "<unused150>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "70151": {
1252
+ "content": "<unused151>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "70152": {
1260
+ "content": "<unused152>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "70153": {
1268
+ "content": "<unused153>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "70154": {
1276
+ "content": "<unused154>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "70155": {
1284
+ "content": "<unused155>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "70156": {
1292
+ "content": "<unused156>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "70157": {
1300
+ "content": "<unused157>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "70158": {
1308
+ "content": "<unused158>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "70159": {
1316
+ "content": "<unused159>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "70160": {
1324
+ "content": "<unused160>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "70161": {
1332
+ "content": "<unused161>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "70162": {
1340
+ "content": "<unused162>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "70163": {
1348
+ "content": "<unused163>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "70164": {
1356
+ "content": "<unused164>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "70165": {
1364
+ "content": "<unused165>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "70166": {
1372
+ "content": "<unused166>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "70167": {
1380
+ "content": "<unused167>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "70168": {
1388
+ "content": "<unused168>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "70169": {
1396
+ "content": "<unused169>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "70170": {
1404
+ "content": "<unused170>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "70171": {
1412
+ "content": "<unused171>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "70172": {
1420
+ "content": "<unused172>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "70173": {
1428
+ "content": "<unused173>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "70174": {
1436
+ "content": "<unused174>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "70175": {
1444
+ "content": "<unused175>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "70176": {
1452
+ "content": "<unused176>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "70177": {
1460
+ "content": "<unused177>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "70178": {
1468
+ "content": "<unused178>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "70179": {
1476
+ "content": "<unused179>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "70180": {
1484
+ "content": "<unused180>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "70181": {
1492
+ "content": "<unused181>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "70182": {
1500
+ "content": "<unused182>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "70183": {
1508
+ "content": "<unused183>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "70184": {
1516
+ "content": "<unused184>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "70185": {
1524
+ "content": "<unused185>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "70186": {
1532
+ "content": "<unused186>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "70187": {
1540
+ "content": "<unused187>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "70188": {
1548
+ "content": "<unused188>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "70189": {
1556
+ "content": "<unused189>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "70190": {
1564
+ "content": "<unused190>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "70191": {
1572
+ "content": "<unused191>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "70192": {
1580
+ "content": "<unused192>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "70193": {
1588
+ "content": "<unused193>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "70194": {
1596
+ "content": "<unused194>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "70195": {
1604
+ "content": "<unused195>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "70196": {
1612
+ "content": "<unused196>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "70197": {
1620
+ "content": "<unused197>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "70198": {
1628
+ "content": "<unused198>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "70199": {
1636
+ "content": "<unused199>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "70200": {
1644
+ "content": "<unused200>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "70201": {
1652
+ "content": "<unused201>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "70202": {
1660
+ "content": "<unused202>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "70203": {
1668
+ "content": "<unused203>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "70204": {
1676
+ "content": "<unused204>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "70205": {
1684
+ "content": "<unused205>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "70206": {
1692
+ "content": "<unused206>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "70207": {
1700
+ "content": "<unused207>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "70208": {
1708
+ "content": "<unused208>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "70209": {
1716
+ "content": "<|SYSTEM|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "70210": {
1724
+ "content": "<|USER|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "70211": {
1732
+ "content": "<|RESPONSE|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ }
1739
+ },
1740
+ "additional_special_tokens": [
1741
+ "<|SYSTEM|>",
1742
+ "<|USER|>",
1743
+ "<|RESPONSE|>"
1744
+ ],
1745
+ "bos_token": "<s>",
1746
+ "clean_up_tokenization_spaces": true,
1747
+ "eos_token": "</s>",
1748
+ "mask_token": "<mask>",
1749
+ "model_input_names": [
1750
+ "input_ids",
1751
+ "attention_mask"
1752
+ ],
1753
+ "model_max_length": 4096,
1754
+ "pad_token": "<unk>",
1755
+ "padding_side": "right",
1756
+ "tokenizer_class": "PreTrainedTokenizerFast",
1757
+ "unk_token": "<unk>"
1758
+ }
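The configuration above (the standard tokenizer-config layout) registers five base special tokens (<unk>, <s>, </s>, <pad>, <mask>), a bank of <unused0> through <unused208> placeholders at ids 70000-70208, and three chat-role markers (<|SYSTEM|>, <|USER|>, <|RESPONSE|>) at ids 70209-70211, with <unk> doubling as the pad token, right-side padding, and a 4096-token context. A minimal usage sketch follows; the repo id "org/model" is a placeholder, and trust_remote_code=True is assumed because the repo ships custom MPT code:

    from transformers import AutoTokenizer

    # "org/model" is a hypothetical repo id -- substitute the actual path.
    tok = AutoTokenizer.from_pretrained("org/model", trust_remote_code=True)

    # The chat-role markers are registered as special tokens, so each maps
    # to a single id and is never split by the underlying tokenizer.
    assert tok.convert_tokens_to_ids("<|USER|>") == 70210
    prompt = "<|SYSTEM|>You are helpful.<|USER|>Hi!<|RESPONSE|>"
    ids = tok(prompt, return_tensors="pt")

    # <unk> doubles as the pad token, and padding_side is "right".
    batch = tok(["short", "a much longer example"], padding=True, return_tensors="pt")
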
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10d9c29c7c9fa9e524674f6c2d91ad5229f2a072802808e65fcdedc6648a1eac
3
+ size 6392
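training_args.bin is committed as a Git LFS pointer: the three lines above are the pointer file itself (spec version, content sha256, and a 6392-byte payload size), not the binary payload. By Hugging Face Trainer convention this file is a pickled TrainingArguments object saved with torch.save; a hedged sketch of inspecting it under that assumption, after `git lfs pull` has materialized the real blob:

    import torch

    # Pickled objects need weights_only=False on recent torch versions,
    # which also means this should only be done for trusted checkpoints.
    args = torch.load("training_args.bin", weights_only=False)
    print(args.learning_rate, args.per_device_train_batch_size)
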
warnings.py ADDED
@@ -0,0 +1,22 @@
1
+ class VersionedDeprecationWarning(DeprecationWarning):
2
+ """A custom deprecation warning class that includes version information.
3
+
4
+ Attributes:
5
+ message (str): The deprecation message describing why the feature is deprecated.
6
+ remove_version (str): The version in which the feature will be removed.
7
+
8
+ Example:
9
+ >>> import warnings
+ >>> def deprecated_function():
10
+ ... warnings.warn(
11
+ ... VersionedDeprecationWarning(
12
+ ... "Function XYZ is deprecated.",
13
+ ... remove_version="2.0.0"
14
+ ... )
15
+ ... )
16
+ ...
17
+ >>> deprecated_function()
18
+ DeprecationWarning: Function XYZ is deprecated. It will be removed in version 2.0.0.
19
+ """
20
+
21
+ def __init__(self, message: str, remove_version: str) -> None:
22
+ super().__init__(message + f' It will be removed in version {remove_version}.')
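A short usage sketch of the class above: the constructor folds the removal version into the message once, at construction time, so the emitted warning carries both pieces of information (importing VersionedDeprecationWarning from this repo's warnings module is assumed):

    import warnings

    # The suffix is appended by __init__, not at emit time.
    w = VersionedDeprecationWarning("Function XYZ is deprecated.", remove_version="2.0.0")
    print(str(w))  # Function XYZ is deprecated. It will be removed in version 2.0.0.

    # Typical call site: emit it like any other DeprecationWarning.
    warnings.warn(VersionedDeprecationWarning("old_flag is deprecated.", remove_version="2.0.0"))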