---
name: Text2Image_Diffusion_R1024R256R64RND_T5XL_Detailed_PTV2W
dataset_config: configs/datasets/cc12m.yaml
# sampler_arguments:
min_examples: 10000
sample_dir: /mnt/data/samples
# batch-size: 8
sample_image_size: 1024
test_file_list: validation.tsv
# reader-config-file: configs/datasets/reader_config_eval.yaml
#  shared_arguments:
output_dir: /mnt/data/outputs
num_diffusion_steps: 1000
reproject_signal: false
model_output_scale: 0
prediction_type: V_PREDICTION
loss_target_type: DDPM
schedule_type: DEEPFLOYD
prediction_length: 129
use_vdm_loss_weights: false
use_double_loss: true
no_use_residual: true
num_training_steps: 1000000
avg_lm_steps: 0
categorical_conditioning: 0
rescale_signal: 1
schedule_shifted: true
schedule_shifted_power: 2
skip_normalization: true
random_low_noise: true
vocab_file: t5.vocab
text_model: google/flan-t5-xl
model: nested2_unet
vision_model: nested2_unet

unet_config:
  attention_levels: []
  conditioning_feature_dim: -1
  conditioning_feature_proj_dim: 2048
  freeze_inner_unet: false
  initialize_inner_with_pretrained: '8rwvbg85tt'  # checkpoint/run ID — quoted so it always parses as a string
  inner_config:
    attention_levels: []
    conditioning_feature_dim: -1
    conditioning_feature_proj_dim: 2048
    freeze_inner_unet: false
    initialize_inner_with_pretrained: null
    inner_config:
      attention_levels: [1, 2]
      conditioning_feature_dim: -1
      conditioning_feature_proj_dim: 2048
      masked_cross_attention: 0
      micro_conditioning: scale:64
      nesting: true
      num_attention_layers: [0, 1, 5]
      num_lm_head_layers: 0
      num_resnets_per_resolution: [2, 2, 2]
      num_temporal_attention_layers: null
      resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
        use_attention_ffn: true}
      resolution_channels: [256, 512, 768]
      skip_cond_emb: false
      skip_mid_blocks: false
      temporal_dim: null
      temporal_mode: false
      temporal_positional_encoding: false
      temporal_spatial_ds: false
    interp_conditioning: false
    masked_cross_attention: 1
    micro_conditioning: scale:256
    nesting: true
    num_attention_layers: [0, 0, 0]
    num_lm_head_layers: 0
    num_resnets_per_resolution: [2, 2, 1]
    num_temporal_attention_layers: null
    resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
      use_attention_ffn: false}
    resolution_channels: [64, 128, 256]
    skip_cond_emb: true
    skip_inner_unet_input: false
    skip_mid_blocks: true
    skip_normalization: false
    temporal_dim: 1024
    temporal_mode: false
    temporal_positional_encoding: false
    temporal_spatial_ds: false
  interp_conditioning: false
  masked_cross_attention: 1
  micro_conditioning: scale:1024
  nesting: false
  num_attention_layers: [0, 0, 0]
  num_lm_head_layers: 0
  num_resnets_per_resolution: [2, 2, 1]
  num_temporal_attention_layers: null
  resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
    use_attention_ffn: false}
  resolution_channels: [32, 32, 64]
  skip_cond_emb: true
  skip_inner_unet_input: false
  skip_mid_blocks: true
  skip_normalization: true
  temporal_dim: 1024
  temporal_mode: false
  temporal_positional_encoding: false
  temporal_spatial_ds: false

# import defaults
# reader-config-file: configs/datasets/reader_config.yaml
# add overrides
reader_config:
  image_size: 1024
  smaller_side_size: 1024
  random_crop: false
  max_caption_length: 512  # overrides the reader default of -1 (unlimited)
  max_token_length: 128
  reader_buffer_size: 64
  shuffle_buffer_size: 9600
use_lm_mask: 1
#  torchmetrics_arguments:
metrics: fid,clip
#  trainer_arguments:
use_precomputed_text_embeddings: 0
batch_size: 4
multi_res_weights: '16:4:1'
gradient_clip_norm: 2
loss_factor: 1
num_gradient_accumulations: 1
warmup_steps: 10000
log_freq: 50
save_freq: 5000
lr: 5.0e-05
fp16: 1