# File size: 3,854 Bytes
# commit: 3fce0e3
name: Text2Image_Diffusion_R1024R256R64RND_T5XL_Detailed_PTV2W
dataset_config: configs/datasets/cc12m.yaml
# sampler_arguments:
min_examples: 10000
sample_dir: /mnt/data/samples
# batch-size: 8
sample_image_size: 1024
test_file_list: validation.tsv
# reader-config-file: configs/datasets/reader_config_eval.yaml
# shared_arguments:
output_dir: /mnt/data/outputs
num_diffusion_steps: 1000
reproject_signal: false
model_output_scale: 0
prediction_type: V_PREDICTION
loss_target_type: DDPM
schedule_type: DEEPFLOYD
prediction_length: 129
use_vdm_loss_weights: false
use_double_loss: true
no_use_residual: true
num_training_steps: 1000000
avg_lm_steps: 0
categorical_conditioning: 0
rescale_signal: 1
schedule_shifted: true
schedule_shifted_power: 2
skip_normalization: true
random_low_noise: true
vocab_file: t5.vocab
text_model: google/flan-t5-xl
model: nested2_unet
vision_model: nested2_unet
unet_config:
  attention_levels: []
  conditioning_feature_dim: -1
  conditioning_feature_proj_dim: 2048
  freeze_inner_unet: false
  # checkpoint ID — quoted so an ID-like token is always read as a string
  initialize_inner_with_pretrained: '8rwvbg85tt'
  inner_config:
    attention_levels: []
    conditioning_feature_dim: -1
    conditioning_feature_proj_dim: 2048
    freeze_inner_unet: false
    initialize_inner_with_pretrained: null
    inner_config:
      attention_levels: [1, 2]
      conditioning_feature_dim: -1
      conditioning_feature_proj_dim: 2048
      masked_cross_attention: 0
      micro_conditioning: 'scale:64'
      nesting: true
      num_attention_layers: [0, 1, 5]
      num_lm_head_layers: 0
      num_resnets_per_resolution: [2, 2, 2]
      num_temporal_attention_layers: null
      resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1, use_attention_ffn: true}
      resolution_channels: [256, 512, 768]
      skip_cond_emb: false
      skip_mid_blocks: false
      temporal_dim: null
      temporal_mode: false
      temporal_positional_encoding: false
      temporal_spatial_ds: false
    interp_conditioning: false
    masked_cross_attention: 1
    micro_conditioning: 'scale:256'
    nesting: true
    num_attention_layers: [0, 0, 0]
    num_lm_head_layers: 0
    num_resnets_per_resolution: [2, 2, 1]
    num_temporal_attention_layers: null
    resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1, use_attention_ffn: false}
    resolution_channels: [64, 128, 256]
    skip_cond_emb: true
    skip_inner_unet_input: false
    skip_mid_blocks: true
    skip_normalization: false
    temporal_dim: 1024
    temporal_mode: false
    temporal_positional_encoding: false
    temporal_spatial_ds: false
  interp_conditioning: false
  masked_cross_attention: 1
  micro_conditioning: 'scale:1024'
  nesting: false
  num_attention_layers: [0, 0, 0]
  num_lm_head_layers: 0
  num_resnets_per_resolution: [2, 2, 1]
  num_temporal_attention_layers: null
  resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1, use_attention_ffn: false}
  resolution_channels: [32, 32, 64]
  skip_cond_emb: true
  skip_inner_unet_input: false
  skip_mid_blocks: true
  skip_normalization: true
  temporal_dim: 1024
  temporal_mode: false
  temporal_positional_encoding: false
  temporal_spatial_ds: false
# import defaults
# reader-config-file: configs/datasets/reader_config.yaml
# add overrides
reader_config:
  image_size: 1024
  smaller_side_size: 1024
  random_crop: false
  # NOTE(review): the original listed max_caption_length twice (-1, then 512);
  # duplicate keys are invalid YAML and most parsers silently keep the last
  # value, so the effective 512 is kept as the single entry here
  max_caption_length: 512
  max_token_length: 128
  reader_buffer_size: 64
  shuffle_buffer_size: 9600
  use_lm_mask: 1
# torchmetrics_arguments:
metrics: fid,clip
# trainer_arguments:
use_precomputed_text_embeddings: 0
batch_size: 4
multi_res_weights: '16:4:1'
gradient_clip_norm: 2
loss_factor: 1
num_gradient_accumulations: 1
warmup_steps: 10000
log_freq: 50
save_freq: 5000
lr: 5.0e-05
fp16: 1