Deep Learning with Python

by François Chollet

Deep Learning with Python

Advanced Deep Learning

Advanced Architectures

Residual Networks (ResNets)

ResNets introduce skip connections to enable training of very deep networks:

PYTHON

from tensorflow.keras import layers, Model

# Residual block: two same-padded convs with a skip connection
def residual_block(x, filters, kernel_size=3):
    """Apply Conv-BN-ReLU-Conv-BN, add the (possibly projected) input, ReLU."""
    shortcut = x

    # Main path
    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)

    # Project the shortcut with a 1x1 conv only when channel counts differ
    if shortcut.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, 1, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    merged = layers.Add()([x, shortcut])
    return layers.ReLU()(merged)

# Build a small ResNet-style classifier (stem, residual stages, head)
inputs = layers.Input(shape=(224, 224, 3))
x = layers.Conv2D(64, 7, strides=2, padding='same')(inputs)
x = layers.BatchNormalization()(x)
x = layers.ReLU()(x)
x = layers.MaxPooling2D(3, strides=2, padding='same')(x)

for stage_filters in (64, 64, 128, 128):
    x = residual_block(x, stage_filters)

x = layers.GlobalAveragePooling2D()(x)
outputs = layers.Dense(1000, activation='softmax')(x)

model = Model(inputs, outputs)
    

Inception Networks

Inception modules use multiple filter sizes in parallel:

PYTHON

def inception_module(x, filters):
    """GoogLeNet-style Inception block: four parallel branches whose
    outputs are concatenated along the channel axis.

    filters: (f_1x1, f_3x3_reduce, f_3x3, f_5x5_reduce, f_5x5, f_pool_proj)
    """
    def conv(tensor, n_filters, size):
        # Every branch uses same-padded ReLU convolutions
        return layers.Conv2D(n_filters, size, padding='same',
                             activation='relu')(tensor)

    branch_a = conv(x, filters[0], 1)

    branch_b = conv(x, filters[1], 1)   # 1x1 bottleneck before the 3x3
    branch_b = conv(branch_b, filters[2], 3)

    branch_c = conv(x, filters[3], 1)   # 1x1 bottleneck before the 5x5
    branch_c = conv(branch_c, filters[4], 5)

    branch_d = layers.MaxPooling2D(3, strides=1, padding='same')(x)
    branch_d = conv(branch_d, filters[5], 1)  # pool projection

    return layers.Concatenate()([branch_a, branch_b, branch_c, branch_d])
    

Dense Networks

DenseNet connects each layer to all subsequent layers, so every layer receives the concatenated feature maps of all preceding layers:

PYTHON

def dense_block(x, blocks, growth_rate):
    """Stack `blocks` conv blocks with dense connectivity.

    Each conv block emits `growth_rate` new feature maps that are
    concatenated onto the running feature map, so every layer receives
    the outputs of all preceding layers — the defining DenseNet property.
    (The original overwrote x with each block's output, which reduced
    this to a plain sequential stack with no dense connections.)
    """
    for _ in range(blocks):
        new_features = conv_block(x, growth_rate)
        x = layers.Concatenate()([x, new_features])
    return x

def conv_block(x, growth_rate):
    """BN -> ReLU -> 3x3 same-padded conv producing `growth_rate` maps."""
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Conv2D(growth_rate, 3, padding='same')(x)
    return x
    

Attention Mechanisms

Self-Attention

PYTHON

class SelfAttention(layers.Layer):
    """Single-head scaled dot-product self-attention over the last axis."""

    def __init__(self, units):
        super(SelfAttention, self).__init__()
        self.units = units  # projection width shared by Q, K, V

    def build(self, input_shape):
        # One dense projection per attention role
        self.query = layers.Dense(self.units)
        self.key = layers.Dense(self.units)
        self.value = layers.Dense(self.units)

    def call(self, inputs):
        q = self.query(inputs)
        k = self.key(inputs)
        v = self.value(inputs)

        # Attention(Q, K, V) = softmax(Q K^T / sqrt(d)) V
        scale = tf.math.sqrt(tf.cast(self.units, tf.float32))
        weights = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / scale)
        return tf.matmul(weights, v)
    

Multi-Head Attention

PYTHON

class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention with an output projection."""

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # per-head feature size

        # Q/K/V input projections and the final output projection
        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)
        self.wo = layers.Dense(d_model)

    def split_heads(self, x):
        # (batch, seq, d_model) -> (batch, heads, seq, depth)
        reshaped = tf.reshape(x, (-1, x.shape[1], self.num_heads, self.depth))
        return tf.transpose(reshaped, perm=[0, 2, 1, 3])

    def call(self, q, k, v):
        q = self.split_heads(self.wq(q))
        k = self.split_heads(self.wk(k))
        v = self.split_heads(self.wv(v))

        # Scaled dot-product attention, computed per head in parallel
        logits = tf.matmul(q, k, transpose_b=True)
        logits = logits / tf.math.sqrt(tf.cast(self.depth, tf.float32))
        attended = tf.matmul(tf.nn.softmax(logits), v)

        # (batch, heads, seq, depth) -> (batch, seq, d_model)
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        attended = tf.reshape(attended, (-1, attended.shape[1], self.d_model))

        return self.wo(attended)
    

Generative Models

Variational Autoencoders (VAEs)

PYTHON

# Encoder: two strided convs downsample 28x28 -> 7x7, then flatten
encoder_inputs = layers.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation='relu', strides=2, padding='same')(encoder_inputs)
x = layers.Conv2D(64, 3, activation='relu', strides=2, padding='same')(x)
x = layers.Flatten()(x)

# Latent space: parameters of a 2-D diagonal Gaussian
z_mean = layers.Dense(2)(x)
z_log_var = layers.Dense(2)(x)

# Reparameterization trick: z = mu + sigma * epsilon, epsilon ~ N(0, I),
# keeping the sampling step differentiable w.r.t. mu and log-var.
def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.keras.backend.random_normal(shape=tf.shape(z_mean))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = layers.Lambda(sampling)([z_mean, z_log_var])

# Decoder: mirror the encoder back up to a 28x28x1 image in [0, 1]
decoder_inputs = layers.Input(shape=(2,))
x = layers.Dense(7 * 7 * 64, activation='relu')(decoder_inputs)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation='relu', strides=2, padding='same')(x)
x = layers.Conv2DTranspose(32, 3, activation='relu', strides=2, padding='same')(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation='sigmoid', padding='same')(x)

# Models. The encoder also exposes the sampled z so the VAE can decode it.
encoder = Model(encoder_inputs, [z_mean, z_log_var, z])
decoder = Model(decoder_inputs, decoder_outputs)

# VAE model: decode the SAMPLED latent z (output index 2).
# BUG FIX: the original decoded encoder(...)[1], i.e. z_log_var —
# the log-variance vector — instead of the sampled latent code.
vae_outputs = decoder(encoder(encoder_inputs)[2])
vae = Model(encoder_inputs, vae_outputs)
    

Generative Adversarial Networks (GANs)

PYTHON

# Generator: map a 100-D noise vector to a 28x28x1 image in [-1, 1]
generator = models.Sequential()
generator.add(layers.Dense(7 * 7 * 256, use_bias=False, input_shape=(100,)))
generator.add(layers.BatchNormalization())
generator.add(layers.ReLU())
generator.add(layers.Reshape((7, 7, 256)))
generator.add(layers.Conv2DTranspose(128, 5, use_bias=False, strides=1, padding='same'))
generator.add(layers.BatchNormalization())
generator.add(layers.ReLU())
generator.add(layers.Conv2DTranspose(64, 5, strides=2, padding='same'))
generator.add(layers.BatchNormalization())
generator.add(layers.ReLU())
generator.add(layers.Conv2DTranspose(1, 5, strides=2, padding='same', activation='tanh'))

# Discriminator: binary real/fake classifier over 28x28x1 images
discriminator = models.Sequential()
discriminator.add(layers.Conv2D(64, 5, strides=2, padding='same', input_shape=[28, 28, 1]))
discriminator.add(layers.LeakyReLU(0.2))
discriminator.add(layers.Dropout(0.3))
discriminator.add(layers.Conv2D(128, 5, strides=2, padding='same'))
discriminator.add(layers.LeakyReLU(0.2))
discriminator.add(layers.Dropout(0.3))
discriminator.add(layers.Flatten())
discriminator.add(layers.Dense(1, activation='sigmoid'))

# Compile the discriminator on its own, then freeze it inside the stacked
# GAN: trainability is captured at compile time, so the standalone
# discriminator still trains while the combined model only updates the
# generator.
discriminator.compile(optimizer='adam', loss='binary_crossentropy')
discriminator.trainable = False

gan_input = layers.Input(shape=(100,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')
    

Diffusion Models

PYTHON

class DiffusionModel(tf.keras.Model):
    """Minimal noise-prediction wrapper for a diffusion network.

    The wrapped `network` is expected to predict the noise added to an
    image given (noisy image, timestep) — TODO confirm against the
    network's actual signature.
    """

    def __init__(self, network):
        super().__init__()
        self.network = network
        self.num_timesteps = 1000  # length of the discrete diffusion chain

    def call(self, images):
        # Sample Gaussian noise and one integer timestep per image.
        # BUG FIX 1: the original sampled t as floats; timesteps index a
        # discrete schedule and must be integers in [0, num_timesteps).
        noise = tf.random.normal(tf.shape(images))
        t = tf.random.uniform(
            (tf.shape(images)[0],), maxval=self.num_timesteps, dtype=tf.int32
        )

        # BUG FIX 2: the original generated `noise` but never applied it,
        # so the network saw clean images. Feed it the noised images.
        # (A full DDPM mixes via the variance schedule:
        # sqrt(alpha_bar_t) * x + sqrt(1 - alpha_bar_t) * noise.)
        noisy_images = images + noise
        predicted_noise = self.network([noisy_images, t])
        return predicted_noise
    

Reinforcement Learning with Deep Networks

Deep Q-Networks (DQN)

PYTHON

class DQN(tf.keras.Model):
    """Two-layer MLP mapping a state to one Q-value per action."""

    def __init__(self, num_actions):
        super(DQN, self).__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.values = layers.Dense(num_actions)  # linear Q-value head

    def call(self, inputs):
        hidden = self.dense2(self.dense1(inputs))
        return self.values(hidden)
    

Policy Gradients

PYTHON

class PolicyNetwork(tf.keras.Model):
    """Two-layer MLP producing unnormalized action logits for a policy."""

    def __init__(self, num_actions):
        super(PolicyNetwork, self).__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.action_logits = layers.Dense(num_actions)  # linear logit head

    def call(self, inputs):
        hidden = self.dense2(self.dense1(inputs))
        return self.action_logits(hidden)
    

Meta-Learning

Model-Agnostic Meta-Learning (MAML)

PYTHON

class MAML(tf.keras.Model):
    """Model-Agnostic Meta-Learning wrapper providing inner-loop adaptation."""

    def __init__(self, model):
        super(MAML, self).__init__()
        self.model = model  # the base learner being meta-trained

    def adapt(self, support_data, learning_rate=0.01):
        """Return the weights after one gradient step on the support set.

        The wrapped model's variables are not modified in place; the
        adapted weights are returned as a new list for use in the
        outer-loop (meta) objective.
        """
        with tf.GradientTape() as tape:
            loss = self.compute_loss(support_data)
        grads = tape.gradient(loss, self.model.trainable_variables)

        # One manual SGD step: w' = w - lr * grad
        return [
            weight - learning_rate * grad
            for weight, grad in zip(self.model.trainable_variables, grads)
        ]
    

Advanced Training Techniques

Learning Rate Scheduling

PYTHON

# Cosine annealing: the learning rate follows a cosine curve from 0.001
# down toward alpha * 0.001 (here 0) over 1000 steps.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    alpha=0.0,
)

# Pass the schedule object directly; Keras evaluates it each step.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    

Mixed Precision Training

PYTHON

from tensorflow.keras import mixed_precision

# Run compute in float16 where safe while keeping variables in float32;
# applies globally to layers created after this call.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
    

Distributed Training

PYTHON

# Synchronous data-parallel training across the local devices
strategy = tf.distribute.MirroredStrategy()

# Model variables must be created inside the strategy scope so they are
# mirrored onto every replica.
with strategy.scope():
    model = create_model()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    

Custom Layers and Losses

Custom Layer Example

PYTHON

class CustomLayer(layers.Layer):
    """Trainable linear projection without bias: y = x @ W."""

    def __init__(self, units):
        super(CustomLayer, self).__init__()
        self.units = units  # output feature width

    def build(self, input_shape):
        # Weight is created lazily, once the input feature size is known
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer='random_normal',
            trainable=True,
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w)
    

Custom Loss Function

PYTHON

def custom_loss(y_true, y_pred):
    """Mean squared error between targets and predictions."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)
    

Best Practices

  1. Architecture Selection: Choose appropriate architecture for the task
  2. Regularization: Use dropout, batch norm, weight decay
  3. Optimization: Use appropriate optimizer and learning rate
  4. Monitoring: Track training and validation metrics
  5. Experimentation: Try different hyperparameters

Current Research Directions

  1. Efficient Transformers: Reducing computational cost
  2. Self-Supervised Learning: Learning without labels
  3. Neural Architecture Search: Automated architecture design
  4. Explainable AI: Understanding model decisions
  5. Continual Learning: Learning from new data without forgetting