{
"architectures": [
"VTPModel"
],
"decoder_depth": 12,
"decoder_embed_dim": 384,
"decoder_ffn_layer": "swiglu",
"decoder_init_values": null,
"decoder_norm_layer": "layernorm",
"decoder_num_heads": 6,
"decoder_use_qk_norm": false,
"image_size": 256,
"init_logit_bias": null,
"init_logit_scale": null,
"model_type": "vtp",
"nonscalar_logit_scale": false,
"text_context_length": 77,
"text_depth": 12,
"text_embed_cls": false,
"text_embed_dim": 768,
"text_ls_init_value": null,
"text_mlp_ratio": 4.0,
"text_no_causal_mask": false,
"text_num_heads": 12,
"text_output_tokens": false,
"text_pad_id": 0,
"text_pool_type": "argmax",
"text_proj_bias": false,
"text_proj_type": "linear",
"text_quick_gelu": false,
"text_vocab_size": 49408,
"torch_dtype": "float32",
"train_clip": true,
"train_reconstruction": true,
"transformers_version": "4.55.4",
"vision_bottleneck_ae_only": true,
"vision_clip_feat": "cls",
"vision_depth": 12,
"vision_embed_dim": 384,
"vision_feature_bottleneck": 64,
"vision_ffn_layer": "swiglu",
"vision_init_values": null,
"vision_mlp_ratio": 4.0,
"vision_norm_layer": "rmsnorm",
"vision_num_heads": 6,
"vision_patch_size": 16,
"vision_use_qk_norm": false
}