2D Models

k3im.cait.CaiTModel

Create a Class-Attention in Image Transformer (CaiT) model.

Parameters:

    `image_size`: tuple of (height, width) of the image (required)
    `patch_size`: tuple of (height, width) of the patch (required)
    `num_classes`: output classes for classification (required)
    `dim`: dimension of the model (required)
    `depth`: depth of the model (required)
    `heads`: number of heads in the model (required)
    `mlp_dim`: dimension of the MLP (required)
    `cls_depth`: depth of the class-attention (cls token) layers (required)
    `channels`: number of channels in the image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `aug`: augmentation layer (default: None)
Source code in k3im/cait.py
def CaiTModel(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    cls_depth,
    channels=3,
    dim_head=64,
    aug=None,
):
    """ Create a Class-Attention in Image Transformer (CaiT) model.

    Args:
        `image_size`: tuple of (height, width) of the image
        `patch_size`: tuple of (height, width) of the patch
        `num_classes`: output classes for classification
        `dim`: dimension of the model
        `depth`: depth of the model
        `heads`: number of heads in the model
        `mlp_dim`: dimension of the mlp
        `cls_depth`: depth of the cls token
        `channels`: number of channels in the image
        `dim_head`: dimension of the head
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."
    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    num_patches = ops.shape(patches)[1]
    patches = PositionEmb(num_patches, dim)(patches)
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    _, cls_token = CLS_Token(dim)(patches)
    cls_token = Transformer(dim, cls_depth, heads, dim_head, mlp_dim)(
        cls_token, context=patches
    )
    if num_classes is None:
        model = keras.Model(inputs=i_p, outputs=cls_token)
        return model

    cls_token = ops.squeeze(cls_token, axis=1)
    o_p = layers.Dense(num_classes)(cls_token)

    return keras.Model(inputs=i_p, outputs=o_p)
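
A minimal construction sketch (not taken from the library docs): it assumes k3im is installed alongside Keras 3, and the hyperparameter values are illustrative placeholders rather than recommended settings. The classification head returns raw logits, so a from_logits loss is used.

import keras
from k3im.cait import CaiTModel

# Build a small CaiT for 32x32 RGB images split into 8x8 patches.
model = CaiTModel(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    cls_depth=2,
)
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
model.summary()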

k3im.cct.CCT

Instantiates the Compact Convolutional Transformer architecture.

Parameters:

    input_shape: tuple of (height, width, channels) (required)
    num_heads: number of attention heads (required)
    projection_dim: projection dimension (required)
    kernel_size: kernel size for the first convolutional layer (required)
    stride: stride for the first convolutional layer (required)
    padding: padding for the first convolutional layer (required)
    transformer_units: list of units for the transformer blocks (required)
    stochastic_depth_rate: dropout rate for the stochastic depth (required)
    transformer_layers: number of transformer blocks (required)
    num_classes: number of output classes (required)
    positional_emb: boolean, whether to use positional embeddings (default: False)
    aug: data augmentation (default: None)
Source code in k3im/cct.py
def CCT(
    input_shape,
    num_heads,
    projection_dim,
    kernel_size,
    stride,
    padding,
    transformer_units,
    stochastic_depth_rate,
    transformer_layers,
    num_classes,
    positional_emb=False,
    aug=None
):
    """ Instantiates the Compact Convolutional Transformer architecture.

    Args:
        input_shape: tuple of (height, width, channels)
        num_heads: number of attention heads
        projection_dim: projection dimension
        kernel_size: kernel size for the first convolutional layer
        stride: stride for the first convolutional layer
        padding: padding for the first convolutional layer
        transformer_units: list of units for the transformer blocks
        stochastic_depth_rate: dropout rate for the stochastic depth
        transformer_layers: number of transformer blocks
        num_classes: number of output classes
        positional_emb: boolean, whether to use positional embeddings
        aug: data augmentation

    """
    inputs = layers.Input(input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    # Encode patches.

    cct_tokenizer = CCTTokenizer(
        kernel_size,
        stride,
        padding,
        n_output_channels=[64, projection_dim],
        n_conv_layers=2,
    )
    encoded_patches = cct_tokenizer(img)

    # Apply positional embedding.
    if positional_emb:
        sequence_length = encoded_patches.shape[1]
        encoded_patches += PositionEmbedding(sequence_length=sequence_length)(
            encoded_patches
        )

    # Calculate Stochastic Depth probabilities.
    dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)]

    # Create multiple layers of the Transformer block.
    for i in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)

        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)

        # Skip connection 1.
        attention_output = StochasticDepth(dpr[i])(attention_output)
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-5)(x2)

        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)

        # Skip connection 2.
        x3 = StochasticDepth(dpr[i])(x3)
        encoded_patches = layers.Add()([x3, x2])
    if num_classes is None:
        model = keras.Model(inputs=inputs, outputs=encoded_patches)
        return model

    # Apply sequence pooling.
    representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)
    weighted_representation = SequencePooling()(representation)

    # Classify outputs.
    logits = layers.Dense(num_classes)(weighted_representation)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model
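
A usage sketch under stated assumptions: the values below are illustrative, the last entry of transformer_units matches projection_dim so the residual additions line up, and padding=1 assumes the tokenizer zero-pads by an integer amount as in the reference Keras CCT example; check CCTTokenizer if your version expects a different padding format.

from k3im.cct import CCT

model = CCT(
    input_shape=(32, 32, 3),
    num_heads=2,
    projection_dim=64,
    kernel_size=3,
    stride=1,
    padding=1,                  # assumed integer padding, see note above
    transformer_units=[64, 64], # last unit equals projection_dim
    stochastic_depth_rate=0.1,
    transformer_layers=2,
    num_classes=10,
)
model.summary()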

k3im.convmixer.ConvMixer

Instantiates the ConvMixer architecture.

Parameters:

    image_size: Input image size (default: 32)
    filters: Number of filters (default: 256)
    depth: Depth of the network (default: 8)
    kernel_size: Kernel size (default: 5)
    patch_size: Patch size (default: 2)
    num_classes: Number of classes (default: 10)
    num_channels: Number of input channels (default: 3)
    aug: Augmentation layer (default: None)
Source code in k3im/convmixer.py
def ConvMixer(
    image_size=32,
    filters=256,
    depth=8,
    kernel_size=5,
    patch_size=2,
    num_classes=10,
    num_channels=3,
    aug=None
):
    """Instantiates the ConvMixer architecture.

    Args:
        image_size: Input image size.
        filters: Number of filters.
        depth: Depth of the network.
        kernel_size: Kernel size.
        patch_size: Patch size.
        num_classes: Number of classes.
        num_channels: Number of input channels.
        aug: Augmentation layer.
    """
    inputs = keras.Input((image_size, image_size, num_channels))
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    x = layers.Rescaling(scale=1.0 / 255)(img)

    # Extract patch embeddings.
    x = conv_stem(x, filters, patch_size)

    # ConvMixer blocks.
    for _ in range(depth):
        x = conv_mixer_block(x, filters, kernel_size)

    if num_classes is None:
        model = keras.Model(inputs=inputs, outputs=x)
        return model


    # Classification block.
    x = layers.GlobalAvgPool2D()(x)
    outputs = layers.Dense(num_classes)(x)

    return keras.Model(inputs, outputs)
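
A minimal sketch with illustrative sizes; note that the model rescales inputs by 1/255 internally, so images in the raw 0-255 range can be fed directly.

from k3im.convmixer import ConvMixer

model = ConvMixer(
    image_size=32,
    filters=128,
    depth=4,
    kernel_size=5,
    patch_size=2,
    num_classes=10,
    num_channels=3,
)
model.summary()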

k3im.eanet.EANet

Instantiates the EANet architecture.

Parameters:

    input_shape: tuple of (height, width, channels) (required)
    patch_size: size of the patch (required)
    embedding_dim: dimension of the embedding (required)
    num_transformer_blocks: number of transformer blocks (required)
    mlp_dim: dimension of the mlp (required)
    num_heads: number of heads (required)
    dim_coefficient: dimension coefficient (required)
    attention_dropout: dropout rate for attention (required)
    projection_dropout: dropout rate for projection (required)
    num_classes: number of classes (required)
    aug: augmentation layer (default: None)
Source code in k3im/eanet.py
def EANet(
    input_shape,
    patch_size,
    embedding_dim,
    num_transformer_blocks,
    mlp_dim,
    num_heads,
    dim_coefficient,
    attention_dropout,
    projection_dropout,
    num_classes,
    aug=None,
):
    """ Instantiates the EANet architecture.

    Args:
        input_shape: tuple of (height, width, channels)
        patch_size: size of the patch
        embedding_dim: dimension of the embedding
        num_transformer_blocks: number of transformer blocks
        mlp_dim: dimension of the mlp
        num_heads: number of heads
        dim_coefficient: dimension coefficient
        attention_dropout: dropout rate for attention
        projection_dropout: dropout rate for projection
        num_classes: number of classes
        aug: augmentation layer

    """
    inputs = layers.Input(shape=input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    num_patches = (input_shape[0] // patch_size) ** 2  # Number of patch

    # Extract patches.
    x = PatchExtract(patch_size)(img)
    # Create patch embedding.
    x = PatchEmbedding(num_patches, embedding_dim)(x)
    # Create Transformer block.
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(
            x,
            embedding_dim,
            mlp_dim,
            num_heads,
            dim_coefficient,
            attention_dropout,
            projection_dropout,
        )
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
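
An illustrative sketch; the image height must be divisible by patch_size, since the patch count is computed as (height // patch_size) ** 2 and the implementation assumes square images.

from k3im.eanet import EANet

model = EANet(
    input_shape=(32, 32, 3),
    patch_size=4,
    embedding_dim=64,
    num_transformer_blocks=2,
    mlp_dim=64,
    num_heads=4,
    dim_coefficient=2,
    attention_dropout=0.1,
    projection_dropout=0.1,
    num_classes=10,
)
model.summary()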

k3im.fnet.FNetModel

Instantiates the FNet architecture.

Parameters:

    image_size: Image size (required)
    patch_size: Patch size (required)
    embedding_dim: Size of the embedding dimension (required)
    num_blocks: Number of blocks (required)
    dropout_rate: Dropout rate (required)
    num_classes: Number of classes to classify images into (required)
    positional_encoding: Whether to include positional encoding (default: False)
    num_channels: Number of image channels (default: 3)
    aug: Image augmentation (default: None)
Source code in k3im/fnet.py
def FNetModel(
    image_size,
    patch_size,
    embedding_dim,
    num_blocks,
    dropout_rate,
    num_classes,
    positional_encoding=False,
    num_channels=3,
    aug=None,
):
    """Instantiates the FNet architecture.

    Args:
        image_size: Image size.
        patch_size: Patch size.
        embedding_dim: Size of the embedding dimension.
        num_blocks: Number of blocks.
        dropout_rate: Dropout rate.
        num_classes: Number of classes to classify images into.
        positional_encoding: Whether to include positional encoding.
        num_channels: Number of image channels.
        aug: Image augmentation.

    """
    image_size = pair(image_size)
    patch_size = pair(patch_size)
    input_shape = (image_size[0], image_size[1], num_channels)
    inputs = layers.Input(shape=input_shape)
    img = aug(inputs) if aug else inputs
    num_patches = (image_size[0] // patch_size[0]) * (
        image_size[1] // patch_size[1]
    )  # Size of the data array.

    # Augment data.
    # Create patches.
    patches = Patches(patch_size)(img)
    # Encode patches to generate a [batch_size, num_patches, embedding_dim] tensor.
    x = layers.Dense(units=embedding_dim)(patches)
    if positional_encoding:
        x = x + PositionEmbedding(sequence_length=num_patches)(x)
    # Process x using the module blocks.
    for _ in range(num_blocks):
        x = FNetLayer(embedding_dim, dropout_rate)(x)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    # Apply global average pooling to generate a [batch_size, embedding_dim] representation tensor.
    representation = layers.GlobalAveragePooling1D()(x)
    # Apply dropout.
    representation = layers.Dropout(rate=dropout_rate)(representation)
    # Compute logits outputs.
    logits = layers.Dense(num_classes)(representation)
    # Create the Keras model.
    return keras.Model(inputs=inputs, outputs=logits)
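
A minimal sketch with placeholder sizes; image_size and patch_size may be ints or (height, width) tuples, since both are passed through pair.

from k3im.fnet import FNetModel

model = FNetModel(
    image_size=32,
    patch_size=4,
    embedding_dim=64,
    num_blocks=2,
    dropout_rate=0.1,
    num_classes=10,
    positional_encoding=True,
)
model.summary()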

k3im.focalnet.FocalNet

Instantiates the FocalNet architecture.

Parameters:

    img_size: Image size (default: 224)
    patch_size: Patch size (default: 4)
    num_classes: Number of classes (default: 1000)
    embed_dim: Embedding dimension of the first stage (default: 128)
    depths: Depths of each stage (default: [2, 2, 6, 2])
    mlp_ratio: Ratio of MLP hidden dim to embedding dim (default: 4.0)
    drop_rate: Dropout rate (default: 0.0)
    drop_path_rate: Stochastic depth rate (default: 0.1)
    norm_layer: Normalization layer (default: keras.layers.LayerNormalization)
    patch_norm: Whether to normalize the patch embedding (default: True)
    focal_levels: Number of focal levels for each stage (default: [2, 2, 3, 2])
    focal_windows: Focal window sizes for each stage (default: [3, 2, 3, 2])
    use_conv_embed: Whether to use convolutional patch embedding (default: False)
    use_layerscale: Whether to use layer scale (default: False)
    layerscale_value: Value for layer scale (default: 1e-4)
    use_postln: Whether to use post layer norm (default: False)
    use_postln_in_modulation: Whether to use post layer norm in modulation (default: False)
    normalize_modulator: Whether to normalize the modulator (default: False)
Source code in k3im/focalnet.py
def FocalNet(
    img_size=224,
    patch_size=4,
    num_classes=1000,
    embed_dim=128,
    depths=[2, 2, 6, 2],
    mlp_ratio=4.0,
    drop_rate=0.0,
    drop_path_rate=0.1,
    norm_layer=keras.layers.LayerNormalization,
    patch_norm=True,
    focal_levels=[2, 2, 3, 2],
    focal_windows=[3, 2, 3, 2],
    use_conv_embed=False,
    use_layerscale=False,
    layerscale_value=1e-4,
    use_postln=False,
    use_postln_in_modulation=False,
    normalize_modulator=False,
):
    """Instantiates the FocalNet architecture.

    Args:
        img_size: Image size.
        patch_size: Patch size.
        num_classes: Number of classes.
        embed_dim: Embedding dimension.
        depths: Depths of each stage.
        mlp_ratio: Ratio of mlp hidden dim to embedding dim.
        drop_rate: Dropout rate.
        drop_path_rate: Stochastic depth rate.
        norm_layer: Normalization layer.
        patch_norm: Whether to use patch norm.
        focal_levels: Number of focal levels.
        focal_windows: Focal window sizes.
        use_conv_embed: Whether to use conv embed.
        use_layerscale: Whether to use layer scale.
        layerscale_value: Value for layer scale.
        use_postln: Whether to use post layer norm.
        use_postln_in_modulation: Whether to use post layer norm in modulation.
        normalize_modulator: Whether to normalize modulator.
    """
    num_layers = len(depths)
    embed_dim = [embed_dim * (2**i) for i in range(num_layers)]
    dpr = [
        ops.convert_to_numpy(x) for x in ops.linspace(0.0, drop_path_rate, sum(depths))
    ]  # stochastic depth decay rule

    def _apply(x):
        nonlocal num_classes
        x, *patches_resolution = PatchEmbed(
            img_size=(img_size, img_size),
            patch_size=patch_size,
            # in_chans=in_chans,
            embed_dim=embed_dim[0],
            use_conv_embed=use_conv_embed,
            norm_layer=norm_layer if patch_norm else None,
            is_stem=True,
        )(x, img_size, img_size)
        H, W = patches_resolution[0], patches_resolution[1]
        x = keras.layers.Dropout(drop_rate)(x)
        for i_layer in range(num_layers):
            x, H, W = BasicLayer(
                dim=embed_dim[i_layer],
                out_dim=embed_dim[i_layer + 1] if (i_layer < num_layers - 1) else None,
                input_resolution=(
                    patches_resolution[0] // (2**i_layer),
                    patches_resolution[1] // (2**i_layer),
                ),
                depth=depths[i_layer],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchEmbed if (i_layer < num_layers - 1) else None,
                focal_level=focal_levels[i_layer],
                focal_window=focal_windows[i_layer],
                use_conv_embed=use_conv_embed,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
            )(x, H, W)
        # if num_classes is None return model without classification head   
        if num_classes is None:
            return x
        x = norm_layer(name="norm")(x)  # B L C
        x = keras.layers.GlobalAveragePooling1D()(x)  #
        x = keras.layers.Flatten()(x)
        num_classes = num_classes if num_classes > 0 else None
        x = keras.layers.Dense(num_classes, name="head")(x)
        return x

    return _apply
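
Note that FocalNet does not return a keras.Model; it returns the inner _apply closure, which builds the graph when called on an input tensor. A sketch of wiring it manually (FocalNetModel below does the same thing for you), with illustrative argument values:

import keras
from k3im.focalnet import FocalNet

inputs = keras.Input((224, 224, 3))
outputs = FocalNet(img_size=224, num_classes=10, embed_dim=96)(inputs)
model = keras.Model(inputs, outputs)
model.summary()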

k3im.focalnet.FocalNetModel

Instantiates the FocalNet architecture.

Parameters:

Name Type Description Default
img_size

Image size.

required
in_channels

Number of input channels.

3
aug

Augmentation layer.

None
**kw

Other keyword arguments.

{}
Source code in k3im/focalnet.py
def FocalNetModel(img_size, in_channels=3, aug=None, **kw) -> keras.Model:
    """Instantiates the FocalNet architecture.

    Args:
        img_size: Image size.
        in_channels: Number of input channels.
        aug: Augmentation layer.
        **kw: Other keyword arguments.

    """
    focalnet_model = FocalNet(img_size=img_size, **kw)

    inputs = keras.Input((img_size, img_size, in_channels))
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    outputs = focalnet_model(img)
    final_model = keras.Model(inputs, outputs)

    return final_model
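
A usage sketch; the extra keyword arguments (here num_classes, embed_dim, and depths, all illustrative values) are forwarded to FocalNet via **kw.

from k3im.focalnet import FocalNetModel

model = FocalNetModel(
    img_size=224,
    in_channels=3,
    num_classes=10,       # forwarded to FocalNet
    embed_dim=96,         # forwarded to FocalNet
    depths=[2, 2, 6, 2],  # forwarded to FocalNet
)
model.summary()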

k3im.focalnet.focalnet_tiny_srf

FocalNet-Tiny-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_srf(img_size=224, **kwargs):
    """FocalNet-Tiny-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 6, 2], embed_dim=96, **kwargs)
    return model

k3im.focalnet.focalnet_small_srf

FocalNet-Small-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_srf(img_size=224, **kwargs):
    """FocalNet-Small-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 18, 2], embed_dim=96, **kwargs)
    return model

k3im.focalnet.focalnet_base_srf

FocalNet-Base-SRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_srf(img_size=224, **kwargs):
    """FocalNet-Base-SRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(img_size, depths=[2, 2, 18, 2], embed_dim=128, **kwargs)
    return model

k3im.focalnet.focalnet_tiny_lrf

FocalNet-Tiny-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_lrf(img_size=224, **kwargs):
    """FocalNet-Tiny-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(
        img_size, depths=[2, 2, 6, 2], embed_dim=96, focal_levels=[3, 3, 3, 3], **kwargs
    )
    return model

k3im.focalnet.focalnet_small_lrf

FocalNet-Small-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_lrf(img_size=224, **kwargs):

    """FocalNet-Small-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """
    model = FocalNetModel(
        img_size,
        depths=[2, 2, 18, 2],
        embed_dim=96,
        focal_levels=[3, 3, 3, 3],
        **kwargs,
    )

    return model

k3im.focalnet.focalnet_base_lrf

FocalNet-Base-LRF model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_lrf(img_size=224, **kwargs):
    """FocalNet-Base-LRF model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.
    """

    model = FocalNetModel(
        img_size,
        depths=[2, 2, 18, 2],
        embed_dim=128,
        focal_levels=[3, 3, 3, 3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_tiny_iso_16

FocalNet-Tiny-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_tiny_iso_16(img_size=224, **kwargs):
    """FocalNet-Tiny-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=192,
        focal_levels=[3],
        focal_windows=[3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_small_iso_16

FocalNet-Small-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_small_iso_16(img_size=224, **kwargs):
    """FocalNet-Small-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=384,
        focal_levels=[3],
        focal_windows=[3],
        **kwargs,
    )
    return model

k3im.focalnet.focalnet_base_iso_16

FocalNet-Base-ISO-16 model.

Parameters:

    img_size: Image size (default: 224)
    **kwargs: Other keyword arguments, forwarded to FocalNetModel
Source code in k3im/focalnet.py
def focalnet_base_iso_16(img_size=224, **kwargs):
    """FocalNet-Base-ISO-16 model.

    Args:
        img_size: Image size.
        **kwargs: Other keyword arguments.

    """
    model = FocalNetModel(
        img_size,
        depths=[12],
        patch_size=16,
        embed_dim=768,
        focal_levels=[3],
        focal_windows=[3],
        use_layerscale=True,
        use_postln=True,
        **kwargs,
    )
    return model
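
The preset constructors above only pin depths, embed_dim, and the focal configuration; everything else still flows through **kwargs to FocalNetModel. A sketch with an illustrative class count:

from k3im.focalnet import focalnet_tiny_srf

model = focalnet_tiny_srf(img_size=224, num_classes=10)
model.summary()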

k3im.gmlp.gMLPModel

Instantiates the gMLP architecture.

Parameters:

    image_size: Image size (required)
    patch_size: Patch size (required)
    embedding_dim: Size of the embedding dimension (required)
    num_blocks: Number of blocks (required)
    dropout_rate: Dropout rate (required)
    num_classes: Number of classes to classify images into (required)
    positional_encoding: Whether to include positional encoding (default: False)
    num_channels: Number of image channels (default: 3)
    aug: Image augmentation (default: None)
Source code in k3im/gmlp.py
def gMLPModel(
    image_size,
    patch_size,
    embedding_dim,
    num_blocks,
    dropout_rate,
    num_classes,
    positional_encoding=False,
    num_channels=3,
    aug=None,
):  
    """Instantiates the gMLP architecture.

    Args:
        image_size: Image size.
        patch_size: Patch size.
        embedding_dim: Size of the embedding dimension.
        num_blocks: Number of blocks.
        dropout_rate: Dropout rate.
        num_classes: Number of classes to classify images into.
        positional_encoding: Whether to include positional encoding.
        num_channels: Number of image channels.
        aug: Image augmentation.

    """
    image_size = pair(image_size)
    patch_size = pair(patch_size)
    input_shape = (image_size[0], image_size[1], num_channels)
    inputs = layers.Input(shape=input_shape)
    if aug is not None:
        img = aug(inputs)
    else:
        img = inputs
    num_patches = (image_size[0] // patch_size[0]) * (
        image_size[1] // patch_size[1]
    )  # Size of the data array.

    # Augment data.
    # Create patches.
    patches = Patches(patch_size)(img)
    # Encode patches to generate a [batch_size, num_patches, embedding_dim] tensor.
    x = layers.Dense(units=embedding_dim)(patches)
    if positional_encoding:
        x = x + PositionEmbedding(sequence_length=num_patches)(x)
    # Process x using the module blocks.
    for _ in range(num_blocks):
        x = gMLPLayer(num_patches, embedding_dim, dropout_rate)(x)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=inputs, outputs=x)
    # Apply global average pooling to generate a [batch_size, embedding_dim] representation tensor.
    representation = layers.GlobalAveragePooling1D()(x)
    # Apply dropout.
    representation = layers.Dropout(rate=dropout_rate)(representation)
    # Compute logits outputs.
    logits = layers.Dense(num_classes)(representation)
    # Create the Keras model.
    return keras.Model(inputs=inputs, outputs=logits)
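
A minimal sketch mirroring the FNetModel example, since gMLPModel shares the same signature; the values are illustrative.

from k3im.gmlp import gMLPModel

model = gMLPModel(
    image_size=32,
    patch_size=4,
    embedding_dim=64,
    num_blocks=2,
    dropout_rate=0.1,
    num_classes=10,
)
model.summary()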

k3im.mlp_mixer.MlpMixer

MLP-Mixer

Parameters:

Name Type Description Default
num_classes

number of classes for classification head

1000
img_size

input image size

224
in_chans

number of input channels

3
patch_size

patch size

16
num_blocks

number of blocks

8
embed_dim

embedding dimension

512
mlp_ratio

ratio of mlp hidden dim to embedding dim

(0.5, 4.0)
block_layer

block layer type (e.g. MixerBlock, ResMLPBlock, ConvMLPBlock)

MixerBlock
mlp_layer

mlp layer type (e.g. Mlp, ConvMlp)

Mlp
norm_layer

normalization layer type (default: partial(layers.LayerNormalization, epsilon=1e-6))

partial(LayerNormalization, epsilon=1e-06)
act_layer

activation layer type (default: keras.activations.gelu)

gelu
drop_rate

dropout rate

0.0
proj_drop_rate

stochastic depth rate for projection

0.0
drop_path_rate

stochastic depth rate for block layers

0.0
stem_norm

whether to apply normalization to stem

False
global_pool

global pooling type, one of 'avg', 'max' or None

'avg'
Source code in k3im/mlp_mixer.py
def MlpMixer(num_classes=1000,
            img_size=224,
            in_chans=3,
            patch_size=16,
            num_blocks=8,
            embed_dim=512,
            mlp_ratio=(0.5, 4.0),
            block_layer=MixerBlock,
            mlp_layer=Mlp,
            norm_layer=partial(layers.LayerNormalization, epsilon=1e-6),
            act_layer=ops.gelu,
            drop_rate=0.,
            proj_drop_rate=0.,
            drop_path_rate=0.,
            stem_norm=False,
            global_pool='avg',):
    """ MLP-Mixer

        Args:
            num_classes: number of classes for classification head
            img_size: input image size
            in_chans: number of input channels
            patch_size: patch size
            num_blocks: number of blocks
            embed_dim: embedding dimension
            mlp_ratio: ratio of mlp hidden dim to embedding dim
            block_layer: block layer type (e.g. MixerBlock, ResMLPBlock, ConvMLPBlock)
            mlp_layer: mlp layer type (e.g. Mlp, ConvMlp)
            norm_layer: normalization layer type (default: partial(layers.LayerNormalization, epsilon=1e-6))
            act_layer: activation layer type (default: keras.activations.gelu)
            drop_rate: dropout rate
            proj_drop_rate: dropout rate for projections inside the blocks
            drop_path_rate: stochastic depth rate for block layers
            stem_norm: whether to apply normalization to stem
            global_pool: global pooling type, one of 'avg', 'max' or None
    """
    img_size = pair(img_size)
    input_shape = (img_size[0], img_size[1], in_chans)
    inputs = layers.Input(input_shape)
    x = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if stem_norm else None,
            name='stem'
        )(inputs) # stem
    num_patches = ops.shape(x)[1]
    for i in range(num_blocks):
        x = block_layer(
                embed_dim,
                num_patches,
                mlp_ratio,
                mlp_layer=mlp_layer,
                norm_layer=norm_layer,
                act_layer=act_layer,
                drop=proj_drop_rate,
                drop_path=drop_path_rate,
                name=f"blocks.{i}"
            )(x)
    x = norm_layer(name='norm')(x) # norm

    if global_pool == 'avg':
        x = ops.mean(x, axis=1)
    x = layers.Dropout(drop_rate)(x)
    if num_classes > 0:
        head = layers.Dense(num_classes, name='head') 
    else:
        head = layers.Identity() # head
    out = head(x)
    return keras.Model(inputs=inputs, outputs=out)
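
A sketch with illustrative settings; img_size must be divisible by patch_size for the stem's patch embedding, and passing num_classes=0 replaces the Dense head with an Identity layer.

from k3im.mlp_mixer import MlpMixer

model = MlpMixer(
    num_classes=10,
    img_size=64,
    patch_size=16,
    num_blocks=4,
    embed_dim=256,
)
model.summary()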

k3im.simple_vit.SimpleViT

Create a Simple Vision Transformer.

Parameters:

    `image_size`: tuple of (height, width) of the image (required)
    `patch_size`: tuple of (height, width) of the patch (required)
    `num_classes`: output classes for classification (required)
    `dim`: dimension of the model (required)
    `depth`: depth of the model (required)
    `heads`: number of heads in the model (required)
    `mlp_dim`: dimension of the MLP (required)
    `channels`: number of channels in the image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `pool`: pooling type, one of ("mean", "max") (default: "mean")
    `aug`: augmentation layer (default: None)
Source code in k3im/simple_vit.py
def SimpleViT(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    channels=3,
    dim_head=64,
    pool="mean",
    aug=None,
):
    """ Create a Simple Vision Transformer.

    Args:
        `image_size`: tuple of (height, width) of the image
        `patch_size`: tuple of (height, width) of the patch
        `num_classes`: output classes for classification
        `dim`: dimension of the model
        `depth`: depth of the model
        `heads`: number of heads in the model
        `mlp_dim`: dimension of the mlp
        `channels`: number of channels in the image
        `dim_head`: dimension of the head
        `pool`: pooling type, one of (`mean`, `max`)
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."

    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    pos_embedding = posemb_sincos_2d(
        h=image_height // patch_height,
        w=image_width // patch_width,
        dim=dim,
    )
    patches += pos_embedding
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=i_p, outputs=patches)

    if pool == "mean":
        patches = layers.GlobalAveragePooling1D(name="avg_pool")(patches)
    elif pool == "max":
        patches = layers.GlobalMaxPooling1D(name="max_pool")(patches)

    o_p = layers.Dense(num_classes)(patches)

    return keras.Model(inputs=i_p, outputs=o_p)
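
An illustrative sketch; the image dimensions must be divisible by the patch dimensions, as the assertion in the source enforces.

from k3im.simple_vit import SimpleViT

model = SimpleViT(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    pool="mean",
)
model.summary()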

k3im.vit.ViT

Create a Vision Transformer for 2D data.

Parameters:

    `image_size`: tuple of ints (height, width) specifying the image dimensions (required)
    `patch_size`: tuple of ints (height, width) specifying the patch dimensions (required)
    `num_classes`: number of classes (required)
    `dim`: dimension of the transformer (required)
    `depth`: number of transformer layers (required)
    `heads`: number of attention heads (required)
    `mlp_dim`: dimension of the MLP (required)
    `channels`: number of channels in the input image (default: 3)
    `dim_head`: dimension of each attention head (default: 64)
    `pool`: type of pooling at the end of the network, one of ("cls", "mean") (default: "mean")
    `aug`: augmentation layer (default: None)
Source code in k3im/vit.py
def ViT(
    image_size,
    patch_size,
    num_classes,
    dim,
    depth,
    heads,
    mlp_dim,
    channels=3,
    dim_head=64,
    pool="mean",
    aug=None,
):
    """ Create a Vision Transformer for 2D data.

    Args:
        `image_size`: tuple of ints (height, width) specifying the image dimensions
        `patch_size`: tuple of ints (height, width) specifying the patch dimensions
        `num_classes`: number of classes
        `dim`: dimension of the transformer
        `depth`: number of transformer layers
        `heads`: number of attention heads
        `mlp_dim`: dimension of the mlp
        `channels`: number of channels in the input image
        `dim_head`: dimension of the head
        `pool`: type of pooling at the end of the network
        `aug`: augmentation layer
    """
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert (
        image_height % patch_height == 0 and image_width % patch_width == 0
    ), "Image dimensions must be divisible by the patch size."
    assert pool in {
        "cls",
        "mean",
    }, "pool type must be either cls (cls token) or mean (mean pooling)"
    patch_dim = channels * patch_height * patch_width

    i_p = layers.Input((image_height, image_width, channels))
    if aug is not None:
        img = aug(i_p)
    else:
        img = i_p
    patches = ops.image.extract_patches(img, (patch_height, patch_width))
    patches = layers.Reshape((-1, patch_dim))(patches)
    patches = layers.LayerNormalization()(patches)
    patches = layers.Dense(dim)(patches)
    patches = layers.LayerNormalization()(patches)
    num_patches = ops.shape(patches)[1]
    patches = ClassTokenPositionEmb(num_patches, dim)(patches)
    patches = Transformer(dim, depth, heads, dim_head, mlp_dim)(patches)
    # if num_classes is None return model without classification head
    if num_classes is None:
        return keras.Model(inputs=i_p, outputs=patches)

    if pool == "cls":
        patches = patches[:, -1]
    elif pool == "mean":
        patches = layers.GlobalAveragePooling1D(name="max_pool")(patches)

    o_p = layers.Dense(num_classes)(patches)

    return keras.Model(inputs=i_p, outputs=o_p)
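
An illustrative sketch; pool="cls" classifies from the class token, while pool="mean" averages over the patch sequence.

from k3im.vit import ViT

model = ViT(
    image_size=(32, 32),
    patch_size=(8, 8),
    num_classes=10,
    dim=64,
    depth=4,
    heads=4,
    mlp_dim=128,
    pool="cls",
)
model.summary()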