Gemma4ForConditionalGeneration(
(model): Gemma4Model(
(vision_tower): Gemma4VisionModel(
(patch_embedder): Gemma4VisionPatchEmbedder(
(input_proj): Linear(in_features=768, out_features=8, bias=False)
)
(encoder): Gemma4VisionEncoder(
(rotary_emb): Gemma4VisionRotaryEmbedding()
(layers): ModuleList(
(0-1): 2 x Gemma4VisionEncoderLayer(
(self_attn): Gemma4VisionAttention(
(q_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=8, out_features=128, bias=False)
)
(k_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=8, out_features=128, bias=False)
)
(v_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=8, out_features=128, bias=False)
)
(o_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=128, out_features=8, bias=False)
)
(q_norm): Gemma4RMSNorm()
(k_norm): Gemma4RMSNorm()
(v_norm): Gemma4RMSNorm()
)
(mlp): Gemma4VisionMLP(
(gate_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=8, out_features=64, bias=False)
)
(up_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=8, out_features=64, bias=False)
)
(down_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=8, bias=False)
)
(act_fn): GELUTanh()
)
(input_layernorm): Gemma4RMSNorm()
(post_attention_layernorm): Gemma4RMSNorm()
(pre_feedforward_layernorm): Gemma4RMSNorm()
(post_feedforward_layernorm): Gemma4RMSNorm()
)
)
)
(pooler): Gemma4VisionPooler()
)
(language_model): Gemma4TextModel(
(embed_tokens): Gemma4TextScaledWordEmbedding(262144, 8, padding_idx=0)
(layers): ModuleList(
(0): Gemma4TextDecoderLayer(
(self_attn): Gemma4TextAttention(
(q_proj): Linear(in_features=8, out_features=256, bias=False)
(q_norm): Gemma4RMSNorm()
(k_norm): Gemma4RMSNorm()
(v_norm): Gemma4RMSNorm()
(k_proj): Linear(in_features=8, out_features=128, bias=False)
(v_proj): Linear(in_features=8, out_features=128, bias=False)
(o_proj): Linear(in_features=256, out_features=8, bias=False)
)
(mlp): Gemma4TextMLP(
(gate_proj): Linear(in_features=8, out_features=64, bias=False)
(up_proj): Linear(in_features=8, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=8, bias=False)
(act_fn): GELUTanh()
)
(input_layernorm): Gemma4RMSNorm()
(post_attention_layernorm): Gemma4RMSNorm()
(pre_feedforward_layernorm): Gemma4RMSNorm()
(post_feedforward_layernorm): Gemma4RMSNorm()
(act_fn): GELUTanh()
(per_layer_input_gate): Linear(in_features=8, out_features=2, bias=False)
(per_layer_projection): Linear(in_features=2, out_features=8, bias=False)
(post_per_layer_input_norm): Gemma4RMSNorm()
)
(1): Gemma4TextDecoderLayer(
(self_attn): Gemma4TextAttention(
(q_proj): Linear(in_features=8, out_features=512, bias=False)
(q_norm): Gemma4RMSNorm()
(k_norm): Gemma4RMSNorm()
(v_norm): Gemma4RMSNorm()
(k_proj): Linear(in_features=8, out_features=256, bias=False)
(v_proj): Linear(in_features=8, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=8, bias=False)
)
(mlp): Gemma4TextMLP(
(gate_proj): Linear(in_features=8, out_features=64, bias=False)
(up_proj): Linear(in_features=8, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=8, bias=False)
(act_fn): GELUTanh()
)
(input_layernorm): Gemma4RMSNorm()
(post_attention_layernorm): Gemma4RMSNorm()
(pre_feedforward_layernorm): Gemma4RMSNorm()
(post_feedforward_layernorm): Gemma4RMSNorm()
(act_fn): GELUTanh()
(per_layer_input_gate): Linear(in_features=8, out_features=2, bias=False)
(per_layer_projection): Linear(in_features=2, out_features=8, bias=False)
(post_per_layer_input_norm): Gemma4RMSNorm()
)
(2): Gemma4TextDecoderLayer(
(self_attn): Gemma4TextAttention(
(q_proj): Linear(in_features=8, out_features=256, bias=False)
(q_norm): Gemma4RMSNorm()
(o_proj): Linear(in_features=256, out_features=8, bias=False)
)
(mlp): Gemma4TextMLP(
(gate_proj): Linear(in_features=8, out_features=64, bias=False)
(up_proj): Linear(in_features=8, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=8, bias=False)
(act_fn): GELUTanh()
)
(input_layernorm): Gemma4RMSNorm()
(post_attention_layernorm): Gemma4RMSNorm()
(pre_feedforward_layernorm): Gemma4RMSNorm()
(post_feedforward_layernorm): Gemma4RMSNorm()
(act_fn): GELUTanh()
(per_layer_input_gate): Linear(in_features=8, out_features=2, bias=False)
(per_layer_projection): Linear(in_features=2, out_features=8, bias=False)
(post_per_layer_input_norm): Gemma4RMSNorm()
)
(3): Gemma4TextDecoderLayer(
(self_attn): Gemma4TextAttention(
(q_proj): Linear(in_features=8, out_features=512, bias=False)
(q_norm): Gemma4RMSNorm()
(o_proj): Linear(in_features=512, out_features=8, bias=False)
)
(mlp): Gemma4TextMLP(
(gate_proj): Linear(in_features=8, out_features=64, bias=False)
(up_proj): Linear(in_features=8, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=8, bias=False)
(act_fn): GELUTanh()
)
(input_layernorm): Gemma4RMSNorm()
(post_attention_layernorm): Gemma4RMSNorm()
(pre_feedforward_layernorm): Gemma4RMSNorm()
(post_feedforward_layernorm): Gemma4RMSNorm()
(act_fn): GELUTanh()
(per_layer_input_gate): Linear(in_features=8, out_features=2, bias=False)
(per_layer_projection): Linear(in_features=2, out_features=8, bias=False)
(post_per_layer_input_norm): Gemma4RMSNorm()
)
)
(norm): Gemma4RMSNorm()
(rotary_emb): Gemma4TextRotaryEmbedding()
(embed_tokens_per_layer): Gemma4TextScaledWordEmbedding(262144, 8, padding_idx=0)
(per_layer_model_projection): Linear(in_features=8, out_features=8, bias=False)
(per_layer_projection_norm): Gemma4RMSNorm()
)
(audio_tower): Gemma4AudioModel(
(subsample_conv_projection): Gemma4AudioSubSampleConvProjection(
(layer0): Gemma4AudioSubSampleConvProjectionLayer(
(conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
(act): ReLU()
)
(layer1): Gemma4AudioSubSampleConvProjectionLayer(
(conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(norm): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
(act): ReLU()
)
(input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
)
(rel_pos_enc): Gemma4AudioRelPositionalEncoding()
(layers): ModuleList(
(0-1): 2 x Gemma4AudioLayer(
(feed_forward1): Gemma4AudioFeedForward(
(ffw_layer_1): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=256, bias=False)
)
(ffw_layer_2): Gemma4ClippableLinear(
(linear): Linear(in_features=256, out_features=64, bias=False)
)
(pre_layer_norm): Gemma4RMSNorm()
(post_layer_norm): Gemma4RMSNorm()
(act_fn): SiLUActivation()
)
(feed_forward2): Gemma4AudioFeedForward(
(ffw_layer_1): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=256, bias=False)
)
(ffw_layer_2): Gemma4ClippableLinear(
(linear): Linear(in_features=256, out_features=64, bias=False)
)
(pre_layer_norm): Gemma4RMSNorm()
(post_layer_norm): Gemma4RMSNorm()
(act_fn): SiLUActivation()
)
(self_attn): Gemma4AudioAttention(
(q_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=64, bias=False)
)
(k_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=64, bias=False)
)
(v_proj): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=64, bias=False)
)
(post): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=64, bias=False)
)
(relative_k_proj): Linear(in_features=64, out_features=64, bias=False)
)
(lconv1d): Gemma4AudioLightConv1d(
(linear_start): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=128, bias=False)
)
(linear_end): Gemma4ClippableLinear(
(linear): Linear(in_features=64, out_features=64, bias=False)
)
(depthwise_conv1d): Gemma4AudioCausalConv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64, bias=False)
(pre_layer_norm): Gemma4RMSNorm()
(conv_norm): Gemma4RMSNorm()
(act_fn): SiLUActivation()
)
(norm_pre_attn): Gemma4RMSNorm()
(norm_post_attn): Gemma4RMSNorm()
(norm_out): Gemma4RMSNorm()
)
)
(output_proj): Linear(in_features=64, out_features=32, bias=True)
)
(embed_vision): Gemma4MultimodalEmbedder(
(embedding_projection): Linear(in_features=8, out_features=8, bias=False)
(embedding_pre_projection_norm): Gemma4RMSNorm()
)
(embed_audio): Gemma4MultimodalEmbedder(
(embedding_projection): Linear(in_features=32, out_features=8, bias=False)
(embedding_pre_projection_norm): Gemma4RMSNorm()
)
)
(lm_head): Linear(in_features=8, out_features=262144, bias=False)
)