2D Models

k3im.cait.CaiTModel

Create a Class-Attention in Image Transformer (CaiT) model.

Parameters:

    `image_size`: tuple of (height, width) of the image (required)
    `patch_size`: tuple of (height, width) of the patch (required)
    `num_classes`: output classes for classification (required)
    `dim`: dimension of the model (required)
    `depth`: depth of the model (required)
    `heads`: number of heads in the model (required)
    `mlp_dim`: dimension of the MLP (required)
    `cls_depth`: depth of the class-attention (cls token) layers (required)
    `channels`: number of channels in the image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `aug`: augmentation layer (default: None)
Source code in k3im/cait.py
def CaiTModel(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    cls_depth,
    channels=3,
    dim_head=64,
    aug=None,
):
    """ Create a Class-Attention in Image Transformer (CaiT) model.

    Args:
        `image_size`: tuple of (height, width) of the image
        `patch_size`: tuple of (height, width) of the patch
        `num_classes`: output classes for classification
        `dim`: dimension of the model
        `depth`: depth of the model
        `heads`: number of heads in the model
        `mlp_dim`: dimension of the mlp
        `cls_depth`: depth of the cls token
        `channels`: number of channels in the image
        `dim_head`: dimension of the head
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."
    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    num_patches = ops.shape(patches)[1]
    patches = PositionEmb(num_patches, dim)(patches)
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    _, cls_token = CLS_Token(dim)(patches)
    cls_token = Transformer(dim, cls_depth, heads, dim_head, mlp_dim)(
        cls_token, context=patches
    )
    if num_classes is None:
        model = keras.Model(inputs=i_p, outputs=cls_token)
        return model

    cls_token = ops.squeeze(cls_token, axis=1)
    o_p = layers.Dense(num_classes)(cls_token)

    return keras.Model(inputs=i_p, outputs=o_p)
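
A minimal construction sketch (not taken from the library docs): it assumes k3im is installed alongside Keras 3, and the hyperparameter values are illustrative placeholders rather than recommended settings. The classification head returns raw logits, so a from_logits loss is used.

import keras
from k3im.cait import CaiTModel

# Build a small CaiT for 32x32 RGB images split into 8x8 patches.
model = CaiTModel(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    cls_depth=2,
)
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
model.summary()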

k3im.cct.CCT

Instantiates the Compact Convolutional Transformer architecture.

Parameters:

    input_shape: tuple of (height, width, channels) (required)
    num_heads: number of attention heads (required)
    projection_dim: projection dimension (required)
    kernel_size: kernel size for the first convolutional layer (required)
    stride: stride for the first convolutional layer (required)
    padding: padding for the first convolutional layer (required)
    transformer_units: list of units for the transformer blocks (required)
    stochastic_depth_rate: dropout rate for the stochastic depth (required)
    transformer_layers: number of transformer blocks (required)
    num_classes: number of output classes (required)
    positional_emb: boolean, whether to use positional embeddings (default: False)
    aug: data augmentation (default: None)
Source code in k3im/cct.py
def CCT(
    input_shape,
    num_heads,
    projection_dim,
    kernel_size,
    stride,
    padding,
    transformer_units,
    stochastic_depth_rate,
    transformer_layers,
    num_classes,
    positional_emb=False,
    aug=None
):
    """ Instantiates the Compact Convolutional Transformer architecture.

    Args:
        input_shape: tuple of (height, width, channels)
        num_heads: number of attention heads
        projection_dim: projection dimension
        kernel_size: kernel size for the first convolutional layer
        stride: stride for the first convolutional layer
        padding: padding for the first convolutional layer
        transformer_units: list of units for the transformer blocks
        stochastic_depth_rate: dropout rate for the stochastic depth
        transformer_layers: number of transformer blocks
        num_classes: number of output classes
        positional_emb: boolean, whether to use positional embeddings
        aug: data augmentation

    """
    inputs = layers.Input(input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    # Encode patches.

    cct_tokenizer = CCTTokenizer(
        kernel_size,
        stride,
        padding,
        n_output_channels=[64, projection_dim],
        n_conv_layers=2,
    )
    encoded_patches = cct_tokenizer(img)

    # Apply positional embedding.
    if positional_emb:
        sequence_length = encoded_patches.shape[1]
        encoded_patches += PositionEmbedding(sequence_length=sequence_length)(
            encoded_patches
        )

    # Calculate Stochastic Depth probabilities.
    dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)]

    # Create multiple layers of the Transformer block.
    for i in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)

        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)

        # Skip connection 1.
        attention_output = StochasticDepth(dpr[i])(attention_output)
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-5)(x2)

        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)

        # Skip connection 2.
        x3 = StochasticDepth(dpr[i])(x3)
        encoded_patches = layers.Add()([x3, x2])
    if num_classes is None:
        model = keras.Model(inputs=inputs, outputs=encoded_patches)
        return model

    # Apply sequence pooling.
    representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)
    weighted_representation = SequencePooling()(representation)

    # Classify outputs.
    logits = layers.Dense(num_classes)(weighted_representation)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model
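
A usage sketch under stated assumptions: the values below are illustrative, the last entry of transformer_units matches projection_dim so the residual additions line up, and padding=1 assumes the tokenizer zero-pads by an integer amount as in the reference Keras CCT example; check CCTTokenizer if your version expects a different padding format.

from k3im.cct import CCT

model = CCT(
    input_shape=(32, 32, 3),
    num_heads=2,
    projection_dim=64,
    kernel_size=3,
    stride=1,
    padding=1,                  # assumed integer padding, see note above
    transformer_units=[64, 64], # last unit equals projection_dim
    stochastic_depth_rate=0.1,
    transformer_layers=2,
    num_classes=10,
)
model.summary()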

k3im.convmixer.ConvMixer

Instantiates the ConvMixer architecture.

Parameters:

    image_size: Input image size (default: 32)
    filters: Number of filters (default: 256)
    depth: Depth of the network (default: 8)
    kernel_size: Kernel size (default: 5)
    patch_size: Patch size (default: 2)
    num_classes: Number of classes (default: 10)
    num_channels: Number of input channels (default: 3)
    aug: Augmentation layer (default: None)
Source code in k3im/convmixer.py
def ConvMixer(
    image_size=32,
    filters=256,
    depth=8,
    kernel_size=5,
    patch_size=2,
    num_classes=10,
    num_channels=3,
    aug=None
):
    """Instantiates the ConvMixer architecture.

    Args:
        image_size: Input image size.
        filters: Number of filters.
        depth: Depth of the network.
        kernel_size: Kernel size.
        patch_size: Patch size.
        num_classes: Number of classes.
        num_channels: Number of input channels.
        aug: Augmentation layer.
    """
    inputs = keras.Input((image_size, image_size, num_channels))
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    x = layers.Rescaling(scale=1.0 / 255)(img)

    # Extract patch embeddings.
    x = conv_stem(x, filters, patch_size)

    # ConvMixer blocks.
    for _ in range(depth):
        x = conv_mixer_block(x, filters, kernel_size)

    if num_classes is None:
        model = keras.Model(inputs=inputs, outputs=x)
        return model


    # Classification block.
    x = layers.GlobalAvgPool2D()(x)
    outputs = layers.Dense(num_classes)(x)

    return keras.Model(inputs, outputs)
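
A minimal sketch with illustrative sizes; note that the model rescales inputs by 1/255 internally, so images in the raw 0-255 range can be fed directly.

from k3im.convmixer import ConvMixer

model = ConvMixer(
    image_size=32,
    filters=128,
    depth=4,
    kernel_size=5,
    patch_size=2,
    num_classes=10,
    num_channels=3,
)
model.summary()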

k3im.eanet.EANet

Instantiates the EANet architecture.

Parameters:

    input_shape: tuple of (height, width, channels) (required)
    patch_size: size of the patch (required)
    embedding_dim: dimension of the embedding (required)
    num_transformer_blocks: number of transformer blocks (required)
    mlp_dim: dimension of the mlp (required)
    num_heads: number of heads (required)
    dim_coefficient: dimension coefficient (required)
    attention_dropout: dropout rate for attention (required)
    projection_dropout: dropout rate for projection (required)
    num_classes: number of classes (required)
    aug: augmentation layer (default: None)
Source code in k3im/eanet.py
def EANet(
    input_shape,
    patch_size,
    embedding_dim,
    num_transformer_blocks,
    mlp_dim,
    num_heads,
    dim_coefficient,
    attention_dropout,
    projection_dropout,
    num_classes,
    aug=None,
):
    """ Instantiates the EANet architecture.

    Args:
        input_shape: tuple of (height, width, channels)
        patch_size: size of the patch
        embedding_dim: dimension of the embedding
        num_transformer_blocks: number of transformer blocks
        mlp_dim: dimension of the mlp
        num_heads: number of heads
        dim_coefficient: dimension coefficient
        attention_dropout: dropout rate for attention
        projection_dropout: dropout rate for projection
        num_classes: number of classes
        aug: augmentation layer

    """
    inputs = layers.Input(shape=input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    num_patches = (input_shape[0] // patch_size) ** 2  # Number of patch

    # Extract patches.
    x = PatchExtract(patch_size)(img)
    # Create patch embedding.
    x = PatchEmbedding(num_patches, embedding_dim)(x)
    # Create Transformer block.
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(
            x,
            embedding_dim,
            mlp_dim,
            num_heads,
            dim_coefficient,
            attention_dropout,
            projection_dropout,
        )
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
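
An illustrative sketch; the image height must be divisible by patch_size, since the patch count is computed as (height // patch_size) ** 2 and the implementation assumes square images.

from k3im.eanet import EANet

model = EANet(
    input_shape=(32, 32, 3),
    patch_size=4,
    embedding_dim=64,
    num_transformer_blocks=2,
    mlp_dim=64,
    num_heads=4,
    dim_coefficient=2,
    attention_dropout=0.1,
    projection_dropout=0.1,
    num_classes=10,
)
model.summary()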

k3im.fnet.FNetModel

Instantiates the FNet architecture.

Parameters:

    image_size: Image size (required)
    patch_size: Patch size (required)
    embedding_dim: Size of the embedding dimension (required)
    num_blocks: Number of blocks (required)
    dropout_rate: Dropout rate (required)
    num_classes: Number of classes to classify images into (required)
    positional_encoding: Whether to include positional encoding (default: False)
    num_channels: Number of image channels (default: 3)
    aug: Image augmentation (default: None)
Source code in k3im/fnet.py
def FNetModel(
    image_size,
    patch_size,
    embedding_dim,
    num_blocks,
    dropout_rate,
    num_classes,
    positional_encoding=False,
    num_channels=3,
    aug=None,
):
    """Instantiates the FNet architecture.

    Args:
        image_size: Image size.
        patch_size: Patch size.
        embedding_dim: Size of the embedding dimension.
        num_blocks: Number of blocks.
        dropout_rate: Dropout rate.
        num_classes: Number of classes to classify images into.
        positional_encoding: Whether to include positional encoding.
        num_channels: Number of image channels.
        aug: Image augmentation.

    """
    image_size = pair(image_size)
    patch_size = pair(patch_size)
    input_shape = (image_size[0], image_size[1], num_channels)
    inputs = layers.Input(shape=input_shape)
    img = aug(inputs) if aug else inputs
    num_patches = (image_size[0] // patch_size[0]) * (
        image_size[1] // patch_size[1]
    )  # Size of the data array.

    # Augment data.
    # Create patches.
    patches = Patches(patch_size)(img)
    # Encode patches to generate a [batch_size, num_patches, embedding_dim] tensor.
    x = layers.Dense(units=embedding_dim)(patches)
    if positional_encoding:
        x = x + PositionEmbedding(sequence_length=num_patches)(x)
    # Process x using the module blocks.
    for _ in range(num_blocks):
        x = FNetLayer(embedding_dim, dropout_rate)(x)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    # Apply global average pooling to generate a [batch_size, embedding_dim] representation tensor.
    representation = layers.GlobalAveragePooling1D()(x)
    # Apply dropout.
    representation = layers.Dropout(rate=dropout_rate)(representation)
    # Compute logits outputs.
    logits = layers.Dense(num_classes)(representation)
    # Create the Keras model.
    return keras.Model(inputs=inputs, outputs=logits)
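
A minimal sketch with placeholder sizes; image_size and patch_size may be ints or (height, width) tuples, since both are passed through pair.

from k3im.fnet import FNetModel

model = FNetModel(
    image_size=32,
    patch_size=4,
    embedding_dim=64,
    num_blocks=2,
    dropout_rate=0.1,
    num_classes=10,
    positional_encoding=True,
)
model.summary()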

k3im.focalnet.FocalNet

Instantiates the FocalNet architecture.

Parameters:

    img_size: Image size (default: 224)
    patch_size: Patch size (default: 4)
    num_classes: Number of classes (default: 1000)
    embed_dim: Embedding dimension of the first stage (default: 128)
    depths: Depths of each stage (default: [2, 2, 6, 2])
    mlp_ratio: Ratio of MLP hidden dim to embedding dim (default: 4.0)
    drop_rate: Dropout rate (default: 0.0)
    drop_path_rate: Stochastic depth rate (default: 0.1)
    norm_layer: Normalization layer (default: keras.layers.LayerNormalization)
    patch_norm: Whether to normalize the patch embedding (default: True)
    focal_levels: Number of focal levels for each stage (default: [2, 2, 3, 2])
    focal_windows: Focal window sizes for each stage (default: [3, 2, 3, 2])
    use_conv_embed: Whether to use convolutional patch embedding (default: False)
    use_layerscale: Whether to use layer scale (default: False)
    layerscale_value: Value for layer scale (default: 1e-4)
    use_postln: Whether to use post layer norm (default: False)
    use_postln_in_modulation: Whether to use post layer norm in modulation (default: False)
    normalize_modulator: Whether to normalize the modulator (default: False)
Source code in k3im/focalnet.py
def FocalNet(
    img_size=224,
    patch_size=4,
    num_classes=1000,
    embed_dim=128,
    depths=[2, 2, 6, 2],
    mlp_ratio=4.0,
    drop_rate=0.0,
    drop_path_rate=0.1,
    norm_layer=keras.layers.LayerNormalization,
    patch_norm=True,
    focal_levels=[2, 2, 3, 2],
    focal_windows=[3, 2, 3, 2],
    use_conv_embed=False,
    use_layerscale=False,
    layerscale_value=1e-4,
    use_postln=False,
    use_postln_in_modulation=False,
    normalize_modulator=False,
):
    """Instantiates the FocalNet architecture.

    Args:
        img_size: Image size.
        patch_size: Patch size.
        num_classes: Number of classes.
        embed_dim: Embedding dimension.
        depths: Depths of each stage.
        mlp_ratio: Ratio of mlp hidden dim to embedding dim.
        drop_rate: Dropout rate.
        drop_path_rate: Stochastic depth rate.
        norm_layer: Normalization layer.
        patch_norm: Whether to use patch norm.
        focal_levels: Number of focal levels.
        focal_windows: Focal window sizes.
        use_conv_embed: Whether to use conv embed.
        use_layerscale: Whether to use layer scale.
        layerscale_value: Value for layer scale.
        use_postln: Whether to use post layer norm.
        use_postln_in_modulation: Whether to use post layer norm in modulation.
        normalize_modulator: Whether to normalize modulator.
    """
    num_layers = len(depths)
    embed_dim = [embed_dim * (2**i) for i in range(num_layers)]
    dpr = [
        ops.convert_to_numpy(x) for x in ops.linspace(0.0, drop_path_rate, sum(depths))
    ]  # stochastic depth decay rule

    def _apply(x):
        nonlocal num_classes
        x, *patches_resolution = PatchEmbed(
            img_size=(img_size, img_size),
            patch_size=patch_size,
            # in_chans=in_chans,
            embed_dim=embed_dim[0],
            use_conv_embed=use_conv_embed,
            norm_layer=norm_layer if patch_norm else None,
            is_stem=True,
        )(x, img_size, img_size)
        H, W = patches_resolution[0], patches_resolution[1]
        x = keras.layers.Dropout(drop_rate)(x)
        for i_layer in range(num_layers):
            x, H, W = BasicLayer(
                dim=embed_dim[i_layer],
                out_dim=embed_dim[i_layer + 1] if (i_layer < num_layers - 1) else None,
                input_resolution=(
                    patches_resolution[0] // (2**i_layer),
                    patches_resolution[1] // (2**i_layer),
                ),
                depth=depths[i_layer],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchEmbed if (i_layer < num_layers - 1) else None,
                focal_level=focal_levels[i_layer],
                focal_window=focal_windows[i_layer],
                use_conv_embed=use_conv_embed,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
            )(x, H, W)
        # if num_classes is None return model without classification head   
        if num_classes is None:
            return x
        x = norm_layer(name="norm")(x)  # B L C
        x = keras.layers.GlobalAveragePooling1D()(x)  #
        x = keras.layers.Flatten()(x)
        num_classes = num_classes if num_classes > 0 else None
        x = keras.layers.Dense(num_classes, name="head")(x)
        return x

    return _apply
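
Note that FocalNet does not return a keras.Model; it returns the inner _apply closure, which builds the graph when called on an input tensor. A sketch of wiring it manually (FocalNetModel below does the same thing for you), with illustrative argument values:

import keras
from k3im.focalnet import FocalNet

inputs = keras.Input((224, 224, 3))
outputs = FocalNet(img_size=224, num_classes=10, embed_dim=96)(inputs)
model = keras.Model(inputs, outputs)
model.summary()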

k3im.focalnet.FocalNetModel

Instantiates the FocalNet architecture.

Parameters:

Name Type Description Default
img_size

Image size.

required
in_channels

Number of input channels.

3
aug

Augmentation layer.

None
**kw

Other keyword arguments.

{}
Source code in k3im/focalnet.py
def FocalNetModel(img_size, in_channels=3, aug=None, **kw) -> keras.Model:
    """Instantiates the FocalNet architecture.

    Args:
        img_size: Image size.
        in_channels: Number of input channels.
        aug: Augmentation layer.
        **kw: Other keyword arguments.

    """
    focalnet_model = FocalNet(img_size=img_size, **kw)

    inputs = keras.Input((img_size, img_size, in_channels))
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    outputs = focalnet_model(img)
    final_model = keras.Model(inputs, outputs)

    return final_model
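
A usage sketch; the extra keyword arguments (here num_classes, embed_dim, and depths, all illustrative values) are forwarded to FocalNet via **kw.

from k3im.focalnet import FocalNetModel

model = FocalNetModel(
    img_size=224,
    in_channels=3,
    num_classes=10,       # forwarded to FocalNet
    embed_dim=96,         # forwarded to FocalNet
    depths=[2, 2, 6, 2],  # forwarded to FocalNet
)
model.summary()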

k3im.focalnet.focalnet_tiny_srf

FocalNet-Tiny-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_srf(img_size=224, **kwargs):
    """FocalNet-Tiny-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 6, 2], embed_dim=96, **kwargs)
    return model

k3im.focalnet.focalnet_small_srf

FocalNet-Small-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_srf(img_size=224, **kwargs):
    """FocalNet-Small-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 18, 2], embed_dim=96, **kwargs)
    return model

k3im.focalnet.focalnet_base_srf

FocalNet-Base-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_srf(img_size=224, **kwargs):
    """FocalNet-Base-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 18, 2], embed_dim=128, **kwargs)
    return model

k3im.focalnet.focalnet_tiny_lrf

FocalNet-Tiny-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_lrf(img_size=224, **kwargs):
    """FocalNet-Tiny-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(
        img_size, depths=[2, 2, 6, 2], embed_dim=96, focal_levels=[3, 3, 3, 3], **kwargs
    )
    return model

k3im.focalnet.focalnet_small_lrf

FocalNet-Small-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_lrf(img_size=224, **kwargs):

    """FocalNet-Small-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(
        img_size,
        depths=[2, 2, 18, 2],
        embed_dim=96,
        focal_levels=[3, 3, 3, 3],
        **kwargs,
    )

    return model

k3im.focalnet.focalnet_base_lrf

FocalNet-Base-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_lrf(img_size=224, **kwargs):
    """FocalNet-Base-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """

    model = FocalNetModel(
        img_size,
        depths=[2, 2, 18, 2],
        embed_dim=128,
        focal_levels=[3, 3, 3, 3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_tiny_iso_16

FocalNet-Tiny-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_iso_16(img_size=224, **kwargs):
    """FocalNet-Tiny-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=192,
        focal_levels=[3],
        focal_windows=[3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_small_iso_16

FocalNet-Small-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_iso_16(img_size=224, **kwargs):
    """FocalNet-Small-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=384,
        focal_levels=[3],
        focal_windows=[3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_base_iso_16

FocalNet-Base-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_iso_16(img_size=224, **kwargs):
    """FocalNet-Base-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=768,
        focal_levels=[3],
        focal_windows=[3],
        use_layerscale=True,
        use_postln=True,
        **kwargs,
    )
    return model
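
The preset constructors above only pin depths, embed_dim, and the focal configuration; everything else still flows through **kwargs to FocalNetModel. A sketch with an illustrative class count:

from k3im.focalnet import focalnet_tiny_srf

model = focalnet_tiny_srf(img_size=224, num_classes=10)
model.summary()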

k3im.gmlp.gMLPModel

Instantiates the gMLP architecture.

Parameters:

    image_size: Image size (required)
    patch_size: Patch size (required)
    embedding_dim: Size of the embedding dimension (required)
    num_blocks: Number of blocks (required)
    dropout_rate: Dropout rate (required)
    num_classes: Number of classes to classify images into (required)
    positional_encoding: Whether to include positional encoding (default: False)
    num_channels: Number of image channels (default: 3)
    aug: Image augmentation (default: None)
Source code in k3im/gmlp.py
def gMLPModel(
    image_size,
    patch_size,
    embedding_dim,
    num_blocks,
    dropout_rate,
    num_classes,
    positional_encoding=False,
    num_channels=3,
    aug=None,
):  
    """Instantiates the gMLP architecture.

    Args:
        image_size: Image size.
        patch_size: Patch size.
        embedding_dim: Size of the embedding dimension.
        num_blocks: Number of blocks.
        dropout_rate: Dropout rate.
        num_classes: Number of classes to classify images into.
        positional_encoding: Whether to include positional encoding.
        num_channels: Number of image channels.
        aug: Image augmentation.

    """
    image_size = pair(image_size)
    patch_size = pair(patch_size)
    input_shape = (image_size[0], image_size[1], num_channels)
    inputs = layers.Input(shape=input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    num_patches = (image_size[0] // patch_size[0]) * (
        image_size[1] // patch_size[1]
    )  # Size of the data array.

    # Augment data.
    # Create patches.
    patches = Patches(patch_size)(img)
    # Encode patches to generate a [batch_size, num_patches, embedding_dim] tensor.
    x = layers.Dense(units=embedding_dim)(patches)
    if positional_encoding:
        x = x + PositionEmbedding(sequence_length=num_patches)(x)
    # Process x using the module blocks.
    for _ in range(num_blocks):
        x = gMLPLayer(num_patches, embedding_dim, dropout_rate)(x)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    # Apply global average pooling to generate a [batch_size, embedding_dim] representation tensor.
    representation = layers.GlobalAveragePooling1D()(x)
    # Apply dropout.
    representation = layers.Dropout(rate=dropout_rate)(representation)
    # Compute logits outputs.
    logits = layers.Dense(num_classes)(representation)
    # Create the Keras model.
    return keras.Model(inputs=inputs, outputs=logits)
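
A minimal sketch mirroring the FNetModel example, since gMLPModel shares the same signature; the values are illustrative.

from k3im.gmlp import gMLPModel

model = gMLPModel(
    image_size=32,
    patch_size=4,
    embedding_dim=64,
    num_blocks=2,
    dropout_rate=0.1,
    num_classes=10,
)
model.summary()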

k3im.mlp_mixer.MlpMixer

MLP-Mixer

Parameters:

Name Type Description Default
num_classes

number of classes for classification head

1000
img_size

input image size

224
in_chans

number of input channels

3
patch_size

patch size

16
num_blocks

number of blocks

8
embed_dim

embedding dimension

512
mlp_ratio

ratio of mlp hidden dim to embedding dim

(0.5, 4.0)
block_layer

block layer type (e.g. MixerBlock, ResMLPBlock, ConvMLPBlock)

MixerBlock
mlp_layer

mlp layer type (e.g. Mlp, ConvMlp)

Mlp
norm_layer

normalization layer type (default: partial(layers.LayerNormalization, epsilon=1e-6))

partial(LayerNormalization, epsilon=1e-06)
act_layer

activation layer type (default: keras.activations.gelu)

gelu
drop_rate

dropout rate

0.0
proj_drop_rate

stochastic depth rate for projection

0.0
drop_path_rate

stochastic depth rate for block layers

0.0
stem_norm

whether to apply normalization to stem

False
global_pool

global pooling type, one of 'avg', 'max' or None

'avg'
Source code in k3im/mlp_mixer.py
def MlpMixer(num_classes=1000,
            img_size=224,
            in_chans=3,
            patch_size=16,
            num_blocks=8,
            embed_dim=512,
            mlp_ratio=(0.5, 4.0),
            block_layer=MixerBlock,
            mlp_layer=Mlp,
            norm_layer=partial(layers.LayerNormalization, epsilon=1e-6),
            act_layer=ops.gelu,
            drop_rate=0.,
            proj_drop_rate=0.,
            drop_path_rate=0.,
            stem_norm=False,
            global_pool='avg',):
    """ MLP-Mixer

        Args:
            num_classes: number of classes for classification head
            img_size: input image size
            in_chans: number of input channels
            patch_size: patch size
            num_blocks: number of blocks
            embed_dim: embedding dimension
            mlp_ratio: ratio of mlp hidden dim to embedding dim
            block_layer: block layer type (e.g. MixerBlock, ResMLPBlock, ConvMLPBlock)
            mlp_layer: mlp layer type (e.g. Mlp, ConvMlp)
            norm_layer: normalization layer type (default: partial(layers.LayerNormalization, epsilon=1e-6))
            act_layer: activation layer type (default: keras.activations.gelu)
            drop_rate: dropout rate
            proj_drop_rate: dropout rate for projections inside the blocks
            drop_path_rate: stochastic depth rate for block layers
            stem_norm: whether to apply normalization to stem
            global_pool: global pooling type, one of 'avg', 'max' or None
    """
    img_size = pair(img_size)
    input_shape = (img_size[0], img_size[1], in_chans)
    inputs = layers.Input(input_shape)
    x = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if stem_norm else None,
            name='stem'
        )(inputs) # stem
    num_patches = ops.shape(x)[1]
    for i in range(num_blocks):
        x = block_layer(
                embed_dim,
                num_patches,
                mlp_ratio,
                mlp_layer=mlp_layer,
                norm_layer=norm_layer,
                act_layer=act_layer,
                drop=proj_drop_rate,
                drop_path=drop_path_rate,
                name=f"blocks.{i}"
            )(x)
    x = norm_layer(name='norm')(x) # norm

    if global_pool == 'avg':
        x = ops.mean(x, axis=1)
    x = layers.Dropout(drop_rate)(x)
    if num_classes > 0:
        head = layers.Dense(num_classes, name='head') 
    else:
        head = layers.Identity() # head
    out = head(x)
    return keras.Model(inputs=inputs, outputs=out)
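
A sketch with illustrative settings; img_size must be divisible by patch_size for the stem's patch embedding, and passing num_classes=0 replaces the Dense head with an Identity layer.

from k3im.mlp_mixer import MlpMixer

model = MlpMixer(
    num_classes=10,
    img_size=64,
    patch_size=16,
    num_blocks=4,
    embed_dim=256,
)
model.summary()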

k3im.simple_vit.SimpleViT

Create a Simple Vision Transformer.

Parameters:

    `image_size`: tuple of (height, width) of the image (required)
    `patch_size`: tuple of (height, width) of the patch (required)
    `num_classes`: output classes for classification (required)
    `dim`: dimension of the model (required)
    `depth`: depth of the model (required)
    `heads`: number of heads in the model (required)
    `mlp_dim`: dimension of the MLP (required)
    `channels`: number of channels in the image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `pool`: pooling type, one of ("mean", "max") (default: "mean")
    `aug`: augmentation layer (default: None)
Source code in k3im/simple_vit.py
def SimpleViT(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    channels=3,
    dim_head=64,
    pool="mean",
    aug=None,
):
    """ Create a Simple Vision Transformer.

    Args:
        `image_size`: tuple of (height, width) of the image
        `patch_size`: tuple of (height, width) of the patch
        `num_classes`: output classes for classification
        `dim`: dimension of the model
        `depth`: depth of the model
        `heads`: number of heads in the model
        `mlp_dim`: dimension of the mlp
        `channels`: number of channels in the image
        `dim_head`: dimension of the head
        `pool`: pooling type, one of (`mean`, `max`)
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."

    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    pos_embedding = posemb_sincos_2d(
        h=image_height // patch_height,
        w=image_width // patch_width,
        dim=dim,
    )
    patches += pos_embedding
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=i_p, outputs=patches)

    if pool == "mean":
        patches = layers.GlobalAveragePooling1D(name="avg_pool")(patches)
    elif pool == "max":
        patches = layers.GlobalMaxPooling1D(name="max_pool")(patches)

    o_p = layers.Dense(num_classes)(patches)

    return keras.Model(inputs=i_p, outputs=o_p)
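
An illustrative sketch; the image dimensions must be divisible by the patch dimensions, as the assertion in the source enforces.

from k3im.simple_vit import SimpleViT

model = SimpleViT(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    pool="mean",
)
model.summary()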

k3im.vit.ViT

Create a Vision Transformer for 2D data.

Parameters:

    `image_size`: tuple of ints (height, width) specifying the image dimensions (required)
    `patch_size`: tuple of ints (height, width) specifying the patch dimensions (required)
    `num_classes`: number of classes (required)
    `dim`: dimension of the transformer (required)
    `depth`: number of transformer layers (required)
    `heads`: number of attention heads (required)
    `mlp_dim`: dimension of the MLP (required)
    `channels`: number of channels in the input image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `pool`: type of pooling at the end of the network, one of ("cls", "mean") (default: "mean")
    `aug`: augmentation layer (default: None)
Source code in k3im/vit.py
def ViT(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    channels=3,
    dim_head=64,
    pool="mean",
    aug=None,
):
    """ Create a Vision Transformer for 2D data.

    Args:
        `image_size`: tuple of ints (height, width) specifying the image dimensions
        `patch_size`: tuple of ints (height, width) specifying the patch dimensions
        `num_classes`: number of classes
        `dim`: dimension of the transformer
        `depth`: number of transformer layers
        `heads`: number of attention heads
        `mlp_dim`: dimension of the mlp
        `channels`: number of channels in the input image
        `dim_head`: dimension of the head
        `pool`: type of pooling at the end of the network
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."
    assert pool in {
        "cls",
        "mean",
    }, "pool type must be either cls (cls token) or mean (mean pooling)"
    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    num_patches = ops.shape(patches)[1]
    patches = ClassTokenPositionEmb(num_patches, dim)(patches)
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=i_p, outputs=patches)

    if pool == "cls":
        patches = patches[:, -1]
    elif pool == "mean":
        patches = layers.GlobalAveragePooling1D(name="max_pool")(patches)

    o_p = layers.Dense(num_classes)(patches)

    return keras.Model(inputs=i_p, outputs=o_p)
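
An illustrative sketch; pool="cls" classifies from the class token, while pool="mean" averages over the patch sequence.

from k3im.vit import ViT

model = ViT(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    pool="cls",
)
model.summary()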