{
"architectures": [
"VTPModel"
],
"decoder_depth": 12,
"decoder_embed_dim": 384,
"decoder_ffn_layer": "swiglu",
"decoder_init_values": null,
"decoder_norm_layer": "layernorm",
"decoder_num_heads": 6,
"decoder_use_qk_norm": false,
"image_size": 256,
"init_logit_bias": null,
"init_logit_scale": null,
"model_type": "vtp",
"nonscalar_logit_scale": false,
"text_context_length": 77,
"text_depth": 12,
"text_embed_cls": false,
"text_embed_dim": 768,
"text_ls_init_value": null,
"text_mlp_ratio": 4.0,
"text_no_causal_mask": false,
"text_num_heads": 12,
"text_output_tokens": false,
"text_pad_id": 0,
"text_pool_type": "argmax",
"text_proj_bias": false,
"text_proj_type": "linear",
"text_quick_gelu": false,
"text_vocab_size": 49408,
"torch_dtype": "float32",
"train_clip": true,
"train_reconstruction": true,
"transformers_version": "4.55.4",
"vision_bottleneck_ae_only": true,
"vision_clip_feat": "cls",
"vision_depth": 12,
"vision_embed_dim": 384,
"vision_feature_bottleneck": 64,
"vision_ffn_layer": "swiglu",
"vision_init_values": null,
"vision_mlp_ratio": 4.0,
"vision_norm_layer": "rmsnorm",
"vision_num_heads": 6,
"vision_patch_size": 16,
"vision_use_qk_norm": false
}