{ "architectures": [ "Sam3VideoModel" ], "assoc_iou_thresh": 0.1, "decrease_trk_keep_alive_for_empty_masklets": false, "det_nms_thresh": 0.1, "detector_config": { "detr_decoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "box_rpb_mode": "log", "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.1, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "relu", "hidden_dropout": 0.0, "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "intermediate_size": 2048, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_detr_decoder", "no_repeat_ngram_size": 0, "num_attention_heads": 8, "num_beam_groups": 1, "num_beams": 1, "num_layers": 6, "num_queries": 200, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "use_presence_token": true }, "detr_encoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.1, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "relu", "hidden_dropout": 0.0, "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "intermediate_size": 2048, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_detr_encoder", "no_repeat_ngram_size": 0, "num_attention_heads": 8, "num_beam_groups": 1, "num_beams": 1, "num_layers": 6, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 }, "geometry_encoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.1, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "relu", "hidden_dropout": 0.0, "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "intermediate_size": 2048, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_geometry_encoder", "no_repeat_ngram_size": 0, "num_attention_heads": 8, "num_beam_groups": 1, "num_beams": 1, "num_layers": 3, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "roi_size": 7, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 }, "initializer_range": 0.02, "mask_decoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.0, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_mask_decoder", "no_repeat_ngram_size": 0, "num_attention_heads": 8, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "num_upsampling_stages": 3, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 }, "model_type": "sam3", "text_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 49406, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": 49407, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 1024, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 4096, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-05, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 32, "min_length": 0, "model_type": "clip_text_model", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 24, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": 1, "prefix": null, "problem_type": null, "projection_dim": 512, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "vocab_size": 49408 }, "vision_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "backbone_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "global_attn_indexes": [ 7, 15, 23, 31 ], "hidden_act": "gelu", "hidden_dropout": 0.0, "hidden_size": 1024, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 1008, "initializer_range": 0.02, "intermediate_size": 4736, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "layer_scale_init_value": null, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_vit_model", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 32, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "pretrain_image_size": 336, "problem_type": null, "qkv_bias": true, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "rope_theta": 10000.0, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "window_size": 24 }, "backbone_feature_sizes": [ [ 288, 288 ], [ 144, 144 ], [ 72, 72 ] ], "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "fpn_hidden_size": 256, "fpn_kernel_size": 2, "fpn_stride": 2, "hidden_act": "gelu", "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_vision_model", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_feature_levels": 3, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "scale_factors": [ 4.0, 2.0, 1.0, 0.5 ], "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 } }, "dtype": "float32", "fill_hole_area": 16, "high_conf_thresh": 0.8, "high_iou_thresh": 0.8, "hotstart_delay": 15, "hotstart_dup_thresh": 8, "hotstart_unmatch_thresh": 8, "init_trk_keep_alive": 30, "initializer_range": 0.02, "low_res_mask_size": 288, "max_num_objects": 10000, "max_trk_keep_alive": 30, "min_trk_keep_alive": -1, "model_type": "sam3_video", "new_det_thresh": 0.7, "recondition_every_nth_frame": 16, "recondition_on_trk_masks": false, "score_threshold_detection": 0.5, "suppress_overlapping_based_on_recent_occlusion_threshold": 0.7, "suppress_unmatched_only_within_hotstart": true, "tracker_config": { "enable_occlusion_spatial_embedding": true, "enable_temporal_pos_encoding_for_object_pointers": true, "image_size": 1008, "initializer_range": 0.02, "mask_decoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_downsample_rate": 2, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "dynamic_multimask_stability_delta": 0.05, "dynamic_multimask_stability_thresh": 0.98, "dynamic_multimask_via_stability": true, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "iou_head_depth": 3, "iou_head_hidden_dim": 256, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "mlp_dim": 2048, "model_type": "", "no_repeat_ngram_size": 0, "num_attention_heads": 8, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 2, "num_multimask_outputs": 3, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 }, "mask_downsampler_embed_dim": 256, "mask_downsampler_hidden_act": "gelu", "mask_downsampler_kernel_size": 3, "mask_downsampler_padding": 1, "mask_downsampler_stride": 2, "mask_downsampler_total_stride": 16, "max_cond_frame_num": 4, "max_object_pointers_in_encoder": 16, "memory_attention_downsample_rate": 1, "memory_attention_dropout": 0.1, "memory_attention_feed_forward_hidden_act": "relu", "memory_attention_feed_forward_hidden_size": 2048, "memory_attention_hidden_size": 256, "memory_attention_num_attention_heads": 1, "memory_attention_num_layers": 4, "memory_attention_rope_dropout": 0.1, "memory_attention_rope_feat_sizes": [ 72, 72 ], "memory_attention_rope_theta": 10000, "memory_encoder_hidden_size": 256, "memory_encoder_output_channels": 64, "memory_fuser_embed_dim": 256, "memory_fuser_hidden_act": "gelu", "memory_fuser_intermediate_dim": 1024, "memory_fuser_kernel_size": 7, "memory_fuser_layer_scale_init_value": 1e-06, "memory_fuser_num_layers": 2, "memory_fuser_padding": 3, "model_type": "sam3_tracker_video", "multimask_max_pt_num": 1, "multimask_min_pt_num": 0, "multimask_output_for_tracking": true, "multimask_output_in_sam": true, "num_maskmem": 7, "prompt_encoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 1008, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "mask_input_channels": 16, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_point_embeddings": 4, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "scale": 1, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 }, "sigmoid_bias_for_mem_enc": -10.0, "sigmoid_scale_for_mem_enc": 20.0, "vision_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "backbone_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "global_attn_indexes": [ 7, 15, 23, 31 ], "hidden_act": "gelu", "hidden_dropout": 0.0, "hidden_size": 1024, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 1008, "initializer_range": 0.02, "intermediate_size": 4736, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "layer_scale_init_value": null, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_vit_model", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 32, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "pretrain_image_size": 336, "problem_type": null, "qkv_bias": true, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "rope_theta": 10000.0, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "window_size": 24 }, "backbone_feature_sizes": [ [ 288, 288 ], [ 144, 144 ], [ 72, 72 ] ], "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "fpn_hidden_size": 256, "fpn_kernel_size": 2, "fpn_stride": 2, "hidden_act": "gelu", "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "sam3_vision_model", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_feature_levels": 3, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "scale_factors": [ 4.0, 2.0, 1.0, 0.5 ], "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "typical_p": 1.0 } }, "transformers_version": "5.0.0.dev0", "trk_assoc_iou_thresh": 0.5 }