add yolo v10 and modify pipeline

2025-03-28 13:19:54 +08:00
parent 183299c06b
commit 798c596acc
471 changed files with 19109 additions and 7342 deletions
--- a/ultralytics/models/sam/modules/tiny_encoder.py
+++ b/ultralytics/models/sam/modules/tiny_encoder.py
@ -21,19 +21,27 @@ from ultralytics.utils.instance import to_2tuple


 class Conv2d_BN(torch.nn.Sequential):
+    """A sequential container that performs 2D convolution followed by batch normalization."""

    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
+        """Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
+        drop path.
+        """
        super().__init__()
-        self.add_module('c', torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False))
+        self.add_module("c", torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False))
        bn = torch.nn.BatchNorm2d(b)
        torch.nn.init.constant_(bn.weight, bn_weight_init)
        torch.nn.init.constant_(bn.bias, 0)
-        self.add_module('bn', bn)
+        self.add_module("bn", bn)


 class PatchEmbed(nn.Module):
+    """Embeds images into patches and projects them into a specified embedding dimension."""

    def __init__(self, in_chans, embed_dim, resolution, activation):
+        """Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
+        function.
+        """
        super().__init__()
        img_size: Tuple[int, int] = to_2tuple(resolution)
        self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
@ -48,12 +56,17 @@ class PatchEmbed(nn.Module):
        )

    def forward(self, x):
+        """Runs input tensor 'x' through the PatchMerging model's sequence of operations."""
        return self.seq(x)


 class MBConv(nn.Module):
+    """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture."""

    def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
+        """Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
+        function.
+        """
        super().__init__()
        self.in_chans = in_chans
        self.hidden_chans = int(in_chans * expand_ratio)
@ -73,6 +86,7 @@ class MBConv(nn.Module):
        self.drop_path = nn.Identity()

    def forward(self, x):
+        """Implements the forward pass for the model architecture."""
        shortcut = x
        x = self.conv1(x)
        x = self.act1(x)
@ -85,8 +99,12 @@ class MBConv(nn.Module):


 class PatchMerging(nn.Module):
+    """Merges neighboring patches in the feature map and projects to a new dimension."""

    def __init__(self, input_resolution, dim, out_dim, activation):
+        """Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
+        optional parameters.
+        """
        super().__init__()

        self.input_resolution = input_resolution
@ -99,6 +117,7 @@ class PatchMerging(nn.Module):
        self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)

    def forward(self, x):
+        """Applies forward pass on the input utilizing convolution and activation layers, and returns the result."""
        if x.ndim == 3:
            H, W = self.input_resolution
            B = len(x)
@ -115,6 +134,11 @@ class PatchMerging(nn.Module):


 class ConvLayer(nn.Module):
+    """
+    Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
+
+    Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
+    """

    def __init__(
        self,
@ -122,41 +146,69 @@ class ConvLayer(nn.Module):
        input_resolution,
        depth,
        activation,
-        drop_path=0.,
+        drop_path=0.0,
        downsample=None,
        use_checkpoint=False,
        out_dim=None,
-        conv_expand_ratio=4.,
+        conv_expand_ratio=4.0,
    ):
+        """
+        Initializes the ConvLayer with the given dimensions and settings.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): The resolution of the input image.
+            depth (int): The number of MBConv layers in the block.
+            activation (Callable): Activation function applied after each convolution.
+            drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
+            downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
+            use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
+            out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
+            conv_expand_ratio (float): Expansion ratio for the MBConv layers.
+        """
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

-        # build blocks
-        self.blocks = nn.ModuleList([
-            MBConv(
-                dim,
-                dim,
-                conv_expand_ratio,
-                activation,
-                drop_path[i] if isinstance(drop_path, list) else drop_path,
-            ) for i in range(depth)])
+        # Build blocks
+        self.blocks = nn.ModuleList(
+            [
+                MBConv(
+                    dim,
+                    dim,
+                    conv_expand_ratio,
+                    activation,
+                    drop_path[i] if isinstance(drop_path, list) else drop_path,
+                )
+                for i in range(depth)
+            ]
+        )

-        # patch merging layer
-        self.downsample = None if downsample is None else downsample(
-            input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+        # Patch merging layer
+        self.downsample = (
+            None
+            if downsample is None
+            else downsample(input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+        )

    def forward(self, x):
+        """Processes the input through a series of convolutional layers and returns the activated output."""
        for blk in self.blocks:
            x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
        return x if self.downsample is None else self.downsample(x)


 class Mlp(nn.Module):
+    """
+    Multi-layer Perceptron (MLP) for transformer architectures.

-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+    This layer takes an input with in_features, applies layer normalization and two fully-connected layers.
+    """
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
+        """Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
@ -167,6 +219,7 @@ class Mlp(nn.Module):
        self.drop = nn.Dropout(drop)

    def forward(self, x):
+        """Applies operations on input x and returns modified x, runs downsample if not None."""
        x = self.norm(x)
        x = self.fc1(x)
        x = self.act(x)
@ -176,20 +229,41 @@ class Mlp(nn.Module):


 class Attention(torch.nn.Module):
+    """
+    Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
+    resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
+    grid.
+
+    Attributes:
+        ab (Tensor, optional): Cached attention biases for inference, deleted during training.
+    """

    def __init__(
-            self,
-            dim,
-            key_dim,
-            num_heads=8,
-            attn_ratio=4,
-            resolution=(14, 14),
+        self,
+        dim,
+        key_dim,
+        num_heads=8,
+        attn_ratio=4,
+        resolution=(14, 14),
    ):
+        """
+        Initializes the Attention module.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            key_dim (int): The dimensionality of the keys and queries.
+            num_heads (int, optional): Number of attention heads. Default is 8.
+            attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
+            resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).
+
+        Raises:
+            AssertionError: If `resolution` is not a tuple of length 2.
+        """
        super().__init__()
-        # (h, w)
+
        assert isinstance(resolution, tuple) and len(resolution) == 2
        self.num_heads = num_heads
-        self.scale = key_dim ** -0.5
+        self.scale = key_dim**-0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
@ -212,18 +286,20 @@ class Attention(torch.nn.Module):
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
-        self.register_buffer('attention_bias_idxs', torch.LongTensor(idxs).view(N, N), persistent=False)
+        self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(N, N), persistent=False)

    @torch.no_grad()
    def train(self, mode=True):
+        """Sets the module in training mode and handles attribute 'ab' based on the mode."""
        super().train(mode)
-        if mode and hasattr(self, 'ab'):
+        if mode and hasattr(self, "ab"):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

-    def forward(self, x):  # x (B,N,C)
-        B, N, _ = x.shape
+    def forward(self, x):  # x
+        """Performs forward pass over the input tensor 'x' by applying normalization and querying keys/values."""
+        B, N, _ = x.shape  # B, N, C

        # Normalization
        x = self.norm(x)
@ -237,28 +313,16 @@ class Attention(torch.nn.Module):
        v = v.permute(0, 2, 1, 3)
        self.ab = self.ab.to(self.attention_biases.device)

-        attn = ((q @ k.transpose(-2, -1)) * self.scale +
-                (self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab))
+        attn = (q @ k.transpose(-2, -1)) * self.scale + (
+            self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
+        )
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
        return self.proj(x)


 class TinyViTBlock(nn.Module):
-    """
-    TinyViT Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int, int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-    """
+    """TinyViT Block that applies self-attention and a local convolution to the input."""

    def __init__(
        self,
@ -266,17 +330,35 @@ class TinyViTBlock(nn.Module):
        input_resolution,
        num_heads,
        window_size=7,
-        mlp_ratio=4.,
-        drop=0.,
-        drop_path=0.,
+        mlp_ratio=4.0,
+        drop=0.0,
+        drop_path=0.0,
        local_conv_size=3,
        activation=nn.GELU,
    ):
+        """
+        Initializes the TinyViTBlock.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            num_heads (int): Number of attention heads.
+            window_size (int, optional): Window size for attention. Default is 7.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float, optional): Stochastic depth rate. Default is 0.
+            local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+
+        Raises:
+            AssertionError: If `window_size` is not greater than 0.
+            AssertionError: If `dim` is not divisible by `num_heads`.
+        """
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
-        assert window_size > 0, 'window_size must be greater than 0'
+        assert window_size > 0, "window_size must be greater than 0"
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio

@ -284,7 +366,7 @@ class TinyViTBlock(nn.Module):
        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.drop_path = nn.Identity()

-        assert dim % num_heads == 0, 'dim must be divisible by num_heads'
+        assert dim % num_heads == 0, "dim must be divisible by num_heads"
        head_dim = dim // num_heads

        window_resolution = (window_size, window_size)
@ -298,9 +380,12 @@ class TinyViTBlock(nn.Module):
        self.local_conv = Conv2d_BN(dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)

    def forward(self, x):
+        """Applies attention-based transformation or padding to input 'x' before passing it through a local
+        convolution.
+        """
        H, W = self.input_resolution
        B, L, C = x.shape
-        assert L == H * W, 'input feature has wrong size'
+        assert L == H * W, "input feature has wrong size"
        res_x = x
        if H == self.window_size and W == self.window_size:
            x = self.attn(x)
@ -316,11 +401,14 @@ class TinyViTBlock(nn.Module):
            pH, pW = H + pad_b, W + pad_r
            nH = pH // self.window_size
            nW = pW // self.window_size
-            # window partition
-            x = x.view(B, nH, self.window_size, nW, self.window_size,
-                       C).transpose(2, 3).reshape(B * nH * nW, self.window_size * self.window_size, C)
+            # Window partition
+            x = (
+                x.view(B, nH, self.window_size, nW, self.window_size, C)
+                .transpose(2, 3)
+                .reshape(B * nH * nW, self.window_size * self.window_size, C)
+            )
            x = self.attn(x)
-            # window reverse
+            # Window reverse
            x = x.view(B, nH, nW, self.window_size, self.window_size, C).transpose(2, 3).reshape(B, pH, pW, C)

            if padding:
@ -337,29 +425,17 @@ class TinyViTBlock(nn.Module):
        return x + self.drop_path(self.mlp(x))

    def extra_repr(self) -> str:
-        return f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, ' \
-               f'window_size={self.window_size}, mlp_ratio={self.mlp_ratio}'
+        """Returns a formatted string representing the TinyViTBlock's parameters: dimension, input resolution, number of
+        attentions heads, window size, and MLP ratio.
+        """
+        return (
+            f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
+            f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"
+        )


 class BasicLayer(nn.Module):
-    """
-    A basic TinyViT layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-        out_dim (int | optional): the output dimension of the layer. Default: None
-    """
+    """A basic TinyViT layer for one stage in a TinyViT architecture."""

    def __init__(
        self,
@ -368,57 +444,90 @@ class BasicLayer(nn.Module):
        depth,
        num_heads,
        window_size,
-        mlp_ratio=4.,
-        drop=0.,
-        drop_path=0.,
+        mlp_ratio=4.0,
+        drop=0.0,
+        drop_path=0.0,
        downsample=None,
        use_checkpoint=False,
        local_conv_size=3,
        activation=nn.GELU,
        out_dim=None,
    ):
+        """
+        Initializes the BasicLayer.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            depth (int): Number of TinyViT blocks.
+            num_heads (int): Number of attention heads.
+            window_size (int): Local window size.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
+            downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
+            use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
+            local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+            out_dim (int | None, optional): The output dimension of the layer. Default is None.
+
+        Raises:
+            ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
+        """
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

-        # build blocks
-        self.blocks = nn.ModuleList([
-            TinyViTBlock(
-                dim=dim,
-                input_resolution=input_resolution,
-                num_heads=num_heads,
-                window_size=window_size,
-                mlp_ratio=mlp_ratio,
-                drop=drop,
-                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                local_conv_size=local_conv_size,
-                activation=activation,
-            ) for i in range(depth)])
+        # Build blocks
+        self.blocks = nn.ModuleList(
+            [
+                TinyViTBlock(
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    num_heads=num_heads,
+                    window_size=window_size,
+                    mlp_ratio=mlp_ratio,
+                    drop=drop,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    local_conv_size=local_conv_size,
+                    activation=activation,
+                )
+                for i in range(depth)
+            ]
+        )

-        # patch merging layer
-        self.downsample = None if downsample is None else downsample(
-            input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+        # Patch merging layer
+        self.downsample = (
+            None
+            if downsample is None
+            else downsample(input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+        )

    def forward(self, x):
+        """Performs forward propagation on the input tensor and returns a normalized tensor."""
        for blk in self.blocks:
            x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
        return x if self.downsample is None else self.downsample(x)

    def extra_repr(self) -> str:
-        return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
+        """Returns a string representation of the extra_repr function with the layer's parameters."""
+        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"


 class LayerNorm2d(nn.Module):
+    """A PyTorch implementation of Layer Normalization in 2D."""

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        """Initialize LayerNorm2d with the number of channels and an optional epsilon."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform a forward pass, normalizing the input tensor."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
@ -426,6 +535,30 @@ class LayerNorm2d(nn.Module):


 class TinyViT(nn.Module):
+    """
+    The TinyViT architecture for vision tasks.
+
+    Attributes:
+        img_size (int): Input image size.
+        in_chans (int): Number of input channels.
+        num_classes (int): Number of classification classes.
+        embed_dims (List[int]): List of embedding dimensions for each layer.
+        depths (List[int]): List of depths for each layer.
+        num_heads (List[int]): List of number of attention heads for each layer.
+        window_sizes (List[int]): List of window sizes for each layer.
+        mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
+        drop_rate (float): Dropout rate for drop layers.
+        drop_path_rate (float): Drop path rate for stochastic depth.
+        use_checkpoint (bool): Use checkpointing for efficient memory usage.
+        mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
+        local_conv_size (int): Local convolution kernel size.
+        layer_lr_decay (float): Layer-wise learning rate decay.
+
+    Note:
+        This implementation is generalized to accept a list of depths, attention heads,
+        embedding dimensions and window sizes, which allows you to create a
+        "stack" of TinyViT models of varying configurations.
+    """

    def __init__(
        self,
@ -436,14 +569,33 @@ class TinyViT(nn.Module):
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_sizes=[7, 7, 14, 7],
-        mlp_ratio=4.,
-        drop_rate=0.,
+        mlp_ratio=4.0,
+        drop_rate=0.0,
        drop_path_rate=0.1,
        use_checkpoint=False,
        mbconv_expand_ratio=4.0,
        local_conv_size=3,
        layer_lr_decay=1.0,
    ):
+        """
+        Initializes the TinyViT model.
+
+        Args:
+            img_size (int, optional): The input image size. Defaults to 224.
+            in_chans (int, optional): Number of input channels. Defaults to 3.
+            num_classes (int, optional): Number of classification classes. Defaults to 1000.
+            embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
+            depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
+            num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
+            window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
+            mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
+            drop_rate (float, optional): Dropout rate. Defaults to 0.
+            drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
+            use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
+            mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
+            local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
+            layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
+        """
        super().__init__()
        self.img_size = img_size
        self.num_classes = num_classes
@ -453,50 +605,52 @@ class TinyViT(nn.Module):

        activation = nn.GELU

-        self.patch_embed = PatchEmbed(in_chans=in_chans,
-                                      embed_dim=embed_dims[0],
-                                      resolution=img_size,
-                                      activation=activation)
+        self.patch_embed = PatchEmbed(
+            in_chans=in_chans, embed_dim=embed_dims[0], resolution=img_size, activation=activation
+        )

        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

-        # stochastic depth
+        # Stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

-        # build layers
+        # Build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            kwargs = dict(
                dim=embed_dims[i_layer],
-                input_resolution=(patches_resolution[0] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
-                                  patches_resolution[1] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer))),
+                input_resolution=(
+                    patches_resolution[0] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
+                    patches_resolution[1] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
+                ),
                #   input_resolution=(patches_resolution[0] // (2 ** i_layer),
                #                     patches_resolution[1] // (2 ** i_layer)),
                depth=depths[i_layer],
-                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint,
-                out_dim=embed_dims[min(i_layer + 1,
-                                       len(embed_dims) - 1)],
+                out_dim=embed_dims[min(i_layer + 1, len(embed_dims) - 1)],
                activation=activation,
            )
            if i_layer == 0:
                layer = ConvLayer(conv_expand_ratio=mbconv_expand_ratio, **kwargs)
            else:
-                layer = BasicLayer(num_heads=num_heads[i_layer],
-                                   window_size=window_sizes[i_layer],
-                                   mlp_ratio=self.mlp_ratio,
-                                   drop=drop_rate,
-                                   local_conv_size=local_conv_size,
-                                   **kwargs)
+                layer = BasicLayer(
+                    num_heads=num_heads[i_layer],
+                    window_size=window_sizes[i_layer],
+                    mlp_ratio=self.mlp_ratio,
+                    drop=drop_rate,
+                    local_conv_size=local_conv_size,
+                    **kwargs,
+                )
            self.layers.append(layer)

        # Classifier head
        self.norm_head = nn.LayerNorm(embed_dims[-1])
        self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()

-        # init weights
+        # Init weights
        self.apply(self._init_weights)
        self.set_layer_lr_decay(layer_lr_decay)
        self.neck = nn.Sequential(
@ -518,13 +672,15 @@ class TinyViT(nn.Module):
        )

    def set_layer_lr_decay(self, layer_lr_decay):
+        """Sets the learning rate decay for each layer in the TinyViT model."""
        decay_rate = layer_lr_decay

-        # layers -> blocks (depth)
+        # Layers -> blocks (depth)
        depth = sum(self.depths)
        lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]

        def _set_lr_scale(m, scale):
+            """Sets the learning rate scale for each layer in the model based on the layer's depth."""
            for p in m.parameters():
                p.lr_scale = scale

@ -544,12 +700,14 @@ class TinyViT(nn.Module):
            p.param_name = k

        def _check_lr_scale(m):
+            """Checks if the learning rate scale attribute is present in module's parameters."""
            for p in m.parameters():
-                assert hasattr(p, 'lr_scale'), p.param_name
+                assert hasattr(p, "lr_scale"), p.param_name

        self.apply(_check_lr_scale)

    def _init_weights(self, m):
+        """Initializes weights for linear layers and layer normalization in the given module."""
        if isinstance(m, nn.Linear):
            # NOTE: This initialization is needed only for training.
            # trunc_normal_(m.weight, std=.02)
@ -561,11 +719,12 @@ class TinyViT(nn.Module):

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
-        return {'attention_biases'}
+        """Returns a dictionary of parameter names where weight decay should not be applied."""
+        return {"attention_biases"}

    def forward_features(self, x):
-        # x: (N, C, H, W)
-        x = self.patch_embed(x)
+        """Runs the input through the model layers and returns the transformed output."""
+        x = self.patch_embed(x)  # x input is (N, C, H, W)

        x = self.layers[0](x)
        start_i = 1
@ -573,10 +732,11 @@ class TinyViT(nn.Module):
        for i in range(start_i, len(self.layers)):
            layer = self.layers[i]
            x = layer(x)
-        B, _, C = x.size()
+        B, _, C = x.shape
        x = x.view(B, 64, 64, C)
        x = x.permute(0, 3, 1, 2)
        return self.neck(x)

    def forward(self, x):
+        """Executes a forward pass on the input tensor through the constructed model layers."""
        return self.forward_features(x)