| { | |
| "architectures": [ | |
| "VTPModel" | |
| ], | |
| "decoder_depth": 12, | |
| "decoder_embed_dim": 384, | |
| "decoder_ffn_layer": "swiglu", | |
| "decoder_init_values": null, | |
| "decoder_norm_layer": "layernorm", | |
| "decoder_num_heads": 6, | |
| "decoder_use_qk_norm": false, | |
| "image_size": 256, | |
| "init_logit_bias": null, | |
| "init_logit_scale": null, | |
| "model_type": "vtp", | |
| "nonscalar_logit_scale": false, | |
| "text_context_length": 77, | |
| "text_depth": 12, | |
| "text_embed_cls": false, | |
| "text_embed_dim": 768, | |
| "text_ls_init_value": null, | |
| "text_mlp_ratio": 4.0, | |
| "text_no_causal_mask": false, | |
| "text_num_heads": 12, | |
| "text_output_tokens": false, | |
| "text_pad_id": 0, | |
| "text_pool_type": "argmax", | |
| "text_proj_bias": false, | |
| "text_proj_type": "linear", | |
| "text_quick_gelu": false, | |
| "text_vocab_size": 49408, | |
| "torch_dtype": "float32", | |
| "train_clip": true, | |
| "train_reconstruction": true, | |
| "transformers_version": "4.55.4", | |
| "vision_bottleneck_ae_only": true, | |
| "vision_clip_feat": "cls", | |
| "vision_depth": 12, | |
| "vision_embed_dim": 384, | |
| "vision_feature_bottleneck": 64, | |
| "vision_ffn_layer": "swiglu", | |
| "vision_init_values": null, | |
| "vision_mlp_ratio": 4, | |
| "vision_norm_layer": "rmsnorm", | |
| "vision_num_heads": 6, | |
| "vision_patch_size": 16, | |
| "vision_use_qk_norm": false | |
| } | |