读SAM代码

def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,  # note: (27, 80) trainable parameter, zero-initialized, in the traced run
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],  # note: (14, 14) in the traced run
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950

    The height and width relative-position terms are computed separately from
    the query and then broadcast-added onto the attention map.

    Args:
        attn (Tensor): attention map, viewable as (B, q_h, q_w, k_h, k_w).
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings,
            shape (B, q_h * q_w, k_h * k_w).
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    # Per-axis lookup tables, indexed as (query position, key position, C).
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
    # Note: in the traced run both Rh and Rw come out as (14, 14, 80).

    B, _, dim = q.shape
    # Restore the 2-D spatial layout of the query tokens.
    r_q = q.reshape(B, q_h, q_w, dim)
    # Note: r_q is (400, 14, 14, 80) in the traced run.

    # Contract the channel axis of the query with each axis' table.
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
    # Note: rel_h and rel_w are each (400, 14, 14, 14) in the traced run.

    # rel_h broadcasts over the key-width axis and rel_w over the key-height
    # axis, so every (k_h, k_w) pair receives the sum of both terms.
    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn


def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Get relative positional embeddings according to the relative positions of
        query and key sizes.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions,
        indexed as (q_size, k_size, C).
    """
    # Number of distinct relative offsets between a query and a key position.
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed, i.e. when the length of the supplied
    # table differs from the maximum q/k relative distance.
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel pos.
        # The table is rearranged to (1, C, L) so the length axis is last,
        # which is the axis resized by the linear interpolation.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        # Back to (max_rel_dist, C).
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    # Shift the offsets so the most negative one maps to index 0.
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    # Truncate to integer indices and gather one embedding per (q, k) pair.
    return rel_pos_resized[relative_coords.long()]

用的是default模型,大概2G

输入的图片是(480, 640, 3)的ndarray
transform成768,1024,3再改成1,3,768,1024的tensor
手写归一化后再pad成1,3,1024,1024
Sam(
  (image_encoder): ImageEncoderViT(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1280, kernel_size=(16, 16), stride=(16, 16))
      得到1,1280,64,64,并permute成1,64,64,1280。随后再加上1,64,64,1280位置编码(一个训练参数)
    )
    (blocks): ModuleList(
      (0): Block(
      拷贝一份当前输入x为shortcut
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (window_partition)
        	先把x给pad成1,70,70,1280
        	再view成1,5,14,5,14,1280
        	permute得到25,14,14,1280 x,和pad_hw=(70,70)返回
        (attn): Attention(输入x
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          得到25,14,14,3840,再reshape、permute得到3,25,16,196,80 qkv,再拆成q、k、v三个400,196,80
          计算q@k后得到attn 400,196,196后通过add_decomposed_rel_pos得到400,196,196 attn
          计算attn@v后再转转尺寸得到25,14,14,1280 x
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          得到25,14,14,1280 x
        )
        (window_unpartition)
        	把x各种调整尺寸得到1,64,64,1280 x
        x = shortcut + x
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
        还是1,64,64,1280 x
      )
      (1): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (2): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (3): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (4): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (5): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (6): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (7): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (8): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (9): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (10): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (11): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (12): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (13): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (14): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (15): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (16): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (17): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (18): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (19): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (20): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (21): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (22): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (23): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (24): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (25): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (26): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (27): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (28): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (29): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (30): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
      (31): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU()
        )
      )
    )
    (neck): Sequential(
      (0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): LayerNorm2d()
      (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (3): LayerNorm2d()
    )
    得到1,256,64,64 features
  )
  (prompt_encoder): PromptEncoder(
  	没有点提示或bbox提示,所以稀疏嵌入为1,0,256的空张量
    (pe_layer): PositionEmbeddingRandom()
    	里面有一个2,128的训练参数,叫做高斯位置编码矩阵
    	基于一个64,64的全一grid计算cumsum得到也是64,64的y_embed和x_embed
    	堆叠后有64,64,2 coords送入_pe_encoding
	    	coords = 2 * coords - 1
	        coords = coords @ self.positional_encoding_gaussian_matrix
	        coords = 2 * np.pi * coords
	        # outputs d_1 x ... x d_n x C shape
	        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
	        得到64,64,256 pe 再permute成1,256,64,64
    (point_embeddings): ModuleList(
      (0): Embedding(1, 256)
      (1): Embedding(1, 256)
      (2): Embedding(1, 256)
      (3): Embedding(1, 256)
    )
    (not_a_point_embed): Embedding(1, 256)
    (mask_downscaling): Sequential(
      (0): Conv2d(1, 4, kernel_size=(2, 2), stride=(2, 2))
      (1): LayerNorm2d()
      (2): GELU()
      (3): Conv2d(4, 16, kernel_size=(2, 2), stride=(2, 2))
      (4): LayerNorm2d()
      (5): GELU()
      (6): Conv2d(16, 256, kernel_size=(1, 1), stride=(1, 1))
    )
    没有提供mask提示,所以稠密嵌入为训练参数self.no_mask_embed 1,256 reshape再expand得到的1,256,64,64
        (no_mask_embed): Embedding(1, 256)
  )
  (mask_decoder): MaskDecoder(
  	这里有个输入image_pe来自提示编码器的get_dense_pe,其实就是pe_layer
  	(predict_masks)1,256的iou_token和4,256的mask_tokens拼接得到5,256,再修正一下尺寸得到1,5,256的output_tokens
  		再与sparse_prompt_embeddings拼接得到tokens 1,5,256
  		将features复制元素到1,256,64,64,再加上dense_prompt_embeddings得到src
  		将1,256,64,64的pe复制元素得到pos_src 1,256,64,64
	    (transformer): TwoWayTransformer(
	      输入src、pos_src、token(表示点提示)
	      src和pos_src分别permute成1,4096,256
	      token做query
	      (layers): ModuleList(
	        (0): TwoWayAttentionBlock(
	          (self_attn): Attention(
	            对query做自注意力
	            (q_proj): Linear(in_features=256, out_features=256, bias=True)
	            (k_proj): Linear(in_features=256, out_features=256, bias=True)
	            (v_proj): Linear(in_features=256, out_features=256, bias=True)
	            得到三个1,5,256
	            再分别分8头,即1,8,5,32
	            计算自注意力公式后再合头得到1,5,256	            
	            (out_proj): Linear(in_features=256, out_features=256, bias=True)
	          )
	          这样更新后的query经过norm1后再加上一开始的query得到q
	          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
			  k = src + pos_src
	          (cross_attn_token_to_image): Attention(
	            输入q,k,以及src作为v
	            (q_proj): Linear(in_features=256, out_features=128, bias=True)
	            (k_proj): Linear(in_features=256, out_features=128, bias=True)
	            (v_proj): Linear(in_features=256, out_features=128, bias=True)
	            k和v是分头得到1,8,4096,16
	            (out_proj): Linear(in_features=128, out_features=256, bias=True)
	          )
	          前面更新后的query加上交叉注意力的结果得到新的query,送入norm2
	          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          (mlp): MLPBlock(
	            (lin1): Linear(in_features=256, out_features=2048, bias=True)
	            (lin2): Linear(in_features=2048, out_features=256, bias=True)
	            (act): ReLU()
	          )
	          前面最新的query加上mlp的结果得到query送入norm3
	          (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          此时更新后的query再加上一开始的query得到q
	          k = src + pos_src
	          (cross_attn_image_to_token): Attention(
	            输入k作为q,q作为k,最新更新后的query作为v
	            (q_proj): Linear(in_features=256, out_features=128, bias=True)
	            (k_proj): Linear(in_features=256, out_features=128, bias=True)
	            (v_proj): Linear(in_features=256, out_features=128, bias=True)
	            (out_proj): Linear(in_features=128, out_features=256, bias=True)
	          )
	          src加上交叉注意力的结果得到新的src再送入norm4
	          (norm4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          输入最新的query 1,5,256,和key=最新的src 1,4096,256
	        )
	        (1): TwoWayAttentionBlock(
	          (self_attn): Attention(
	            (q_proj): Linear(in_features=256, out_features=256, bias=True)
	            (k_proj): Linear(in_features=256, out_features=256, bias=True)
	            (v_proj): Linear(in_features=256, out_features=256, bias=True)
	            (out_proj): Linear(in_features=256, out_features=256, bias=True)
	          )
	          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          (cross_attn_token_to_image): Attention(
	            (q_proj): Linear(in_features=256, out_features=128, bias=True)
	            (k_proj): Linear(in_features=256, out_features=128, bias=True)
	            (v_proj): Linear(in_features=256, out_features=128, bias=True)
	            (out_proj): Linear(in_features=128, out_features=256, bias=True)
	          )
	          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          (mlp): MLPBlock(
	            (lin1): Linear(in_features=256, out_features=2048, bias=True)
	            (lin2): Linear(in_features=2048, out_features=256, bias=True)
	            (act): ReLU()
	          )
	          (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          (norm4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	          (cross_attn_image_to_token): Attention(
	            (q_proj): Linear(in_features=256, out_features=128, bias=True)
	            (k_proj): Linear(in_features=256, out_features=128, bias=True)
	            (v_proj): Linear(in_features=256, out_features=128, bias=True)
	            (out_proj): Linear(in_features=128, out_features=256, bias=True)
	          )
	        )
	      )
	      q=最终得到的query + token
	      k=最终得到的keys + pos_src
	      (final_attn_token_to_image): Attention(
	        输入q,k,keys做v
	        (q_proj): Linear(in_features=256, out_features=128, bias=True)
	        (k_proj): Linear(in_features=256, out_features=128, bias=True)
	        (v_proj): Linear(in_features=256, out_features=128, bias=True)
	        (out_proj): Linear(in_features=128, out_features=256, bias=True)
	      )
	      query+=交叉注意力的结果,再输入给下面的LN
	      (norm_final_attn): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
	      输出hs=query,src=key
	    )
        iou_token_out = hs[:, 0, :]
        1,256
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]	
        1,4,256    
        src view成1,256,64,64后给下面
        (output_upscaling): Sequential(
	      (0): ConvTranspose2d(256, 64, kernel_size=(2, 2), stride=(2, 2))
	      (1): LayerNorm2d()
	      (2): GELU()
	      (3): ConvTranspose2d(64, 32, kernel_size=(2, 2), stride=(2, 2))
	      (4): GELU()
	    )得到1,32,256,256 upscaled_embedding
		(output_hypernetworks_mlps): ModuleList(
		  把mask_tokens_out拆成4个1,1,256,分别输入给下面的每一层
	      (0): MLP(
	        (layers): ModuleList(
	          (0): Linear(in_features=256, out_features=256, bias=True)
	          (1): Linear(in_features=256, out_features=256, bias=True)
	          (2): Linear(in_features=256, out_features=32, bias=True)
	        )
	      )
	      (1): MLP(
	        (layers): ModuleList(
	          (0): Linear(in_features=256, out_features=256, bias=True)
	          (1): Linear(in_features=256, out_features=256, bias=True)
	          (2): Linear(in_features=256, out_features=32, bias=True)
	        )
	      )
	      (2): MLP(
	        (layers): ModuleList(
	          (0): Linear(in_features=256, out_features=256, bias=True)
	          (1): Linear(in_features=256, out_features=256, bias=True)
	          (2): Linear(in_features=256, out_features=32, bias=True)
	        )
	      )
	      (3): MLP(
	        (layers): ModuleList(
	          (0): Linear(in_features=256, out_features=256, bias=True)
	          (1): Linear(in_features=256, out_features=256, bias=True)
	          (2): Linear(in_features=256, out_features=32, bias=True)
	        )
	      )
	      从而有四个1,1,32组成的list堆叠得到1,4,32的hyper_in
	      hyper_in和upscaled_embedding@乘法后再view得到masks 1,4,256,256
	    )
	    (iou_prediction_head): MLP(
	      输入iou_token_out
	      (layers): ModuleList(
	        (0): Linear(in_features=256, out_features=256, bias=True)
	        (1): Linear(in_features=256, out_features=256, bias=True)
	        (2): Linear(in_features=256, out_features=4, bias=True)
      	)得到iou_pred 1,4
        返回得到masks和iou_pred
        mask_slice = slice(1, None)
        得到一个slice(1,None,None)
        masks = masks[:, mask_slice, :, :]
        iou_pred = iou_pred[:, mask_slice]
    )
	返回low_res_masks 1,3,256,256和iou_predictions 1,3
	(postprocess_masks)
		一次双线性插值
		masks = masks[..., : input_size[0], : input_size[1]]
		二次双线性插值
		得到1,3,480,640
		
  
  )
  
)
顺便记一下保存特征图
import cv2
import numpy as np
image_array = x.squeeze()[:3].permute(1, 2, 0).cpu().numpy()
# image_array = x.squeeze()[:,:,:3].cpu().numpy()
# 将值缩放到0-255范围
image_array = (image_array * 255).astype(np.uint8)
# 保存图像
cv2.imwrite('3_image_encoder(1,1280,64,64).jpg', image_array)
sam = sam_model_registry["default"](checkpoint="sam_vit_h_4b8939.pth")
2023-12-04 16:09:13.740 | INFO     | __main__:<module>:11 - 读取模型  (9.14878 s)
predictor = SamPredictor(sam)
2023-12-04 16:09:13.740 | INFO     | __main__:<module>:17 - 创建模型  (0.03 ms)
predictor.set_image(image)
2023-12-04 16:09:53.530 | INFO     | __main__:<module>:25 - 预处理图片  (39.78268 s)
masks, _, _ = predictor.predict()
2023-12-04 16:09:53.614 | INFO     | __main__:<module>:31 - 分割  (83.36 ms)
cnm=masks.transpose(1,2,0)
cv2.imwrite(f"aaa.png",cnm.astype(np.uint8)*255)
2023-12-04 16:09:53.626 | INFO     | __main__:<module>:41 - 保存剪影  (11.11 ms)

主要是预处理那里很慢

在这里插入图片描述
原图1,3,480,640
请添加图片描述
刚transform 1,3,768,1024
请添加图片描述
网络的输入 1,3,1024,1024
请添加图片描述

网络中的参数pos_embed(1,64,64,1280)
请添加图片描述
pos_embed(1,1280,64,64)
请添加图片描述
neck(1,256,64,64)
请添加图片描述
upscaled_embedding(1,32,256,256)(可视化的后三通道)
请添加图片描述
阈值过滤之前的masks(1,3,480,640)
请添加图片描述
过滤后的masks[1],应该是关注前景([0]全黑)
请添加图片描述
过滤后的masks[0]

2023-12-04 17:15:37.883 | INFO     | __main__:<module>:11 - 读取模型  (12205.30 ms)
2023-12-04 17:15:37.883 | INFO     | __main__:<module>:17 - 创建模型  (0.02 ms)
2023-12-04 17:16:19.440 | INFO     | __main__:<module>:25 - 预处理图片  (41550.37 ms)
2023-12-04 17:16:19.515 | INFO     | __main__:<module>:35 - 点提示分割  (74.76 ms)
2023-12-04 17:16:19.529 | INFO     | __main__:<module>:47 - 保存剪影  (13.80 ms)
2023-12-04 17:16:19.619 | INFO     | __main__:<module>:56 - box提示分割  (88.85 ms)
2023-12-04 17:16:19.633 | INFO     | __main__:<module>:68 - 保存剪影  (13.65 ms)
2023-12-04 17:16:19.727 | INFO     | __main__:<module>:77 - box提示分割  (93.93 ms)
2023-12-04 17:16:19.743 | INFO     | __main__:<module>:89 - 保存剪影  (16.10 ms)

随机选取下面这两个点作为提示
请添加图片描述
分割结果
请添加图片描述
随机选取下面这个框作为提示
请添加图片描述
分割结果
请添加图片描述
随机选取下面的剪影作为提示
请添加图片描述
分割结果
请添加图片描述

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:/a/217455.html

如若内容造成侵权/违法违规/事实不符,请联系我们进行投诉反馈qq邮箱809451989@qq.com,一经查实,立即删除!

相关文章

Taro 学习教程 - - - - - 开发环境的安装 helloworld

一、Taro脚手架安装 npm install @tarojs/cli -g // or yarn add @tarojs/cli -g // or cnpm install @tarojs/cli -g 1.1 如何判断taro安装成功 taro -v 正常安装成功之后显示如图: 1.2 环境变量配置(自行判断是否需要手动配置) 如果遇到如下问题,则是需要…

基于stm32的LCD1602与无线蓝牙温湿度显示

这一篇博客是为了实现温湿度的显示,温湿度传感器将数据传给单片机,单片机又把数据送给LCD1602和蓝牙,让温度和湿度可以在LCD1602显示屏和手机上显示,它的执行逻辑和C51那里基本一样,就是要修改程序,在程序上…

【Linux20.04-qt5.12.4软件安装与初步使用-qt在Linux使用-记录-笔记】

【Linux-qt软件安装与初步使用-qt在Linux使用-记录-笔记】 1、概述2、环境说明3、步骤总结1、了解并选择自己想要安装的版本2、访问 Qt 官方网站3、在 Qt 网站上找到下载部分(自己想下载)4、下载完成后,给安装程序文件赋予执行权限。5、自动配…

单显卡插槽安装英伟达Tesla P4 AI加速卡

Tesla P4是专业AI显卡,只有70瓦功耗,可以作为AI入门使用。 安装时碰到的几个问题: 首先因为单显卡插槽,就需要先安装好机器,然后ssh登录进行相关配置。安装的时候来回插拔了好多次! 其次就是安装驱动时…

微信聊天记录年度报告

记忆恢复 若运行代码,执行下列命令安装 git clone https://github.com/LC044/WeChatMsg cd WeChatMsg pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 一、登录微信 切记需要先登录要提取的微信号的微信。 手机端使用聊天记录迁移功…

【电路笔记】-电阻器额定功率

电阻器额定功率 文章目录 电阻器额定功率1、概述2、电阻功率(P)3、功率电阻器4、电阻器额定功率示例15、电阻器额定功率示例2 电能被电阻吸收,因为它是电压和电流的乘积,一些电阻将这种电能转化为热量。 1、概述 当电流由于电阻器…

基础堆溢出原理与DWORD SHOOT实现

堆介绍 堆的数据结构与管理策略 程序员在使用堆时只需要做三件事情:申请一定大小的内存,使用内存,释放内存。 对于堆管理系统来说,响应程序的内存使用申请就意味着要在"杂乱"的堆区中"辨别"出哪些内存是正在…

实用篇 | 利用Flask+Postman为深度学习模型进行快速测试(超详细)

利用Flask+Postman为深度学习模型进行快速测试,以及算法中的一些实例,以后会更新一些新的模板~~ #本文环境:服务器Ubuntu20.04(docker) 目录 1.下载postman 2.编写flask的app文件 3.在postman发送请求 4.实例 在服务器创建app.py文件 …

12月2号作业

#include <iostream>using namespace std; class Sofa{ private:string setting;string *lying new string;public:Sofa(){cout << "Sofa::无参构造函数" << endl;}Sofa(string setting,string lying):setting(setting),lying(new string (lying)…

【shell】

shell 一、shell简介二、shell脚本的执行方式三、shell变量3.1 shell变量介绍3.2 shell变量的定义3.2.1 基本语法3.2.2 定义变量的规则3.2.3 将命令的返回值赋予变量 四、环境变量的设置4.1 基本语法: 五、位置参数变量5.1 基本介绍5.2 基本语法 六、预定义变量6.1 …

金蝶云星空表单插件单据体批量删除,序号自增

文章目录 金蝶云星空表单插件单据体批量删除,序号自增字段标识说明表单插件获取单据体数据包移除物料为空的行其他移除物料为空的行的方式,但是测试不通过,不建议使用序号重新生成测试 金蝶云星空表单插件单据体批量删除,序号自增…

新的 BLUFFS 攻击导致蓝牙连接不再私密

蓝牙是一种连接我们设备的低功耗无线技术,有一个新的漏洞需要解决。 中间的攻击者可以使用新的 BLUFFS 攻击轻松窥探您的通信。 法国研究中心 EURECOM 的研究员 Daniele Antonioli 演示了六种新颖的攻击,这些攻击被定义为 BLUFFS(蓝牙转发和…

合并两个有序链表[简单]

优质博文:IT-BLOG-CN 一、题目 将两个升序链表合并为一个新的升序链表并返回。新链表是通过拼接给定的两个链表的所有节点组成的。 示例 1: 输入:l1 = [1,2,4], l2 = [1,3,4] 输出:[1,1,2,3,4,4] 示例 2: 输入:…

java常用知识点记忆

类的继承与多态 类的继承不支持多重继承非private 方法才可以被覆盖覆盖的方法要求,子类中的方法的名字,参数列表,返回类型与父类相同方法的重载是在一个类中定义方法名字相同,但是参数列表不同的方法要是在子类中定义了与父类名字…

IDEA 下载mysql驱动下载在不下来

结合一下 https://www.cnblogs.com/dadian/p/11936056.html 并且下载的 在idea改名 加入 加入到库 等待一会就要你输入sql的root和密码了,就OK

深入理解强化学习——马尔可夫决策过程:蒙特卡洛方法-[基础知识]

分类目录:《深入理解强化学习》总目录 蒙特卡洛方法(Monte-Carlo Methods)也被称为统计模拟方法,是一种基于概率统计的数值计算方法。运用蒙特卡洛方法时,我们通常使用重复随机抽样,然后运用概率统计方法来…

整数的立方和

系列文章目录 进阶的卡莎C++_睡觉觉觉得的博客-CSDN博客数1的个数_睡觉觉觉得的博客-CSDN博客双精度浮点数的输入输出_睡觉觉觉得的博客-CSDN博客足球联赛积分_睡觉觉觉得的博客-CSDN博客大减价(一级)_睡觉觉觉得的博客-CSDN博客小写字母的判断_睡觉觉觉得的博客-CSDN博客纸币(…

在线直线度测量仪在圆形轧钢中的重要性

在线直线度测量仪在圆形轧钢中的重要性 在现代轧钢生产中,在线直线度测量仪是一种非常重要的工具,它可以帮助工人和产线进行高精度的直线度和直径测量,从而保证产品质量的稳定性和精度。以下是详细介绍直线度测量仪的重要性和应用。 一、测…

【Java基础】几种拼接字符串的方法

几种拼接字符串的方法 1.使用 "+" 运算符拼接字符串2.使用 StringBuilder 或 StringBuffer 类3.使用 StringJoiner 类4.使用 String 类 join 方法5.使用 StringUtils 类6.使用 String 类 concat 方法7.使用 String.format() 方法格式化字符串8.使用 Stream 实现9.总结…

http代理如何设置手机上网?http代理起到了哪些作用

本文将详细介绍如何设置手机上网使用HTTP代理,以及HTTP代理所起到的作用。 一、HTTP代理是什么? HTTP代理是一种网络协议,它允许客户端与服务器之间进行数据传输。它是一种常用的代理服务,可以帮助用户通过HTTP协议访问被封锁的网…