Deep Learning with Python
by François Chollet
Deep Learning with Python
Share this page
Advanced Deep Learning
Advanced Architectures
Residual Networks (ResNets)
ResNets introduce skip connections to enable training of very deep networks:
PYTHON
1
from tensorflow.keras import layers, Model


def residual_block(x, filters, kernel_size=3, strides=1):
    """Residual block: two 3x3 convs plus an identity/projection shortcut.

    Args:
        x: Input feature-map tensor.
        filters: Number of output channels.
        kernel_size: Convolution kernel size (default 3).
        strides: Stride of the first convolution; values > 1 downsample.
            Defaults to 1, preserving the original behavior.

    Returns:
        Output tensor after the skip connection and final ReLU.
    """
    shortcut = x

    x = layers.Conv2D(filters, kernel_size, strides=strides, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)

    # Project the shortcut with a 1x1 conv whenever the channel count or the
    # spatial resolution changes, so the Add() operands have matching shapes.
    if shortcut.shape[-1] != filters or strides != 1:
        shortcut = layers.Conv2D(filters, 1, strides=strides, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    x = layers.Add()([x, shortcut])
    x = layers.ReLU()(x)
    return x


# Build a small ResNet-style classifier (ImageNet-like 224x224x3 input,
# 1000-way softmax head).
inputs = layers.Input(shape=(224, 224, 3))
x = layers.Conv2D(64, 7, strides=2, padding='same')(inputs)
x = layers.BatchNormalization()(x)
x = layers.ReLU()(x)
x = layers.MaxPooling2D(3, strides=2, padding='same')(x)

x = residual_block(x, 64)
x = residual_block(x, 64)
x = residual_block(x, 128)
x = residual_block(x, 128)

x = layers.GlobalAveragePooling2D()(x)
outputs = layers.Dense(1000, activation='softmax')(x)

model = Model(inputs, outputs)
Inception Networks
Inception modules use multiple filter sizes in parallel:
PYTHON
1
def inception_module(x, filters):
    """Inception module: four parallel branches concatenated on channels.

    `filters` holds, in order: 1x1 width, 3x3 reduce, 3x3 width,
    5x5 reduce, 5x5 width, pool-projection width.
    """
    # Branch 1: plain 1x1 convolution.
    b1 = layers.Conv2D(filters[0], 1, padding='same', activation='relu')(x)

    # Branch 2: 1x1 channel reduction, then 3x3 convolution.
    b3 = layers.Conv2D(filters[1], 1, padding='same', activation='relu')(x)
    b3 = layers.Conv2D(filters[2], 3, padding='same', activation='relu')(b3)

    # Branch 3: 1x1 channel reduction, then 5x5 convolution.
    b5 = layers.Conv2D(filters[3], 1, padding='same', activation='relu')(x)
    b5 = layers.Conv2D(filters[4], 5, padding='same', activation='relu')(b5)

    # Branch 4: 3x3 max-pool, then 1x1 projection.
    bp = layers.MaxPooling2D(3, strides=1, padding='same')(x)
    bp = layers.Conv2D(filters[5], 1, padding='same', activation='relu')(bp)

    # Stack all branch outputs along the channel axis.
    return layers.Concatenate()([b1, b3, b5, bp])
Dense Networks
DenseNet connects each layer to every other layer:
PYTHON
1
def dense_block(x, blocks, growth_rate):
    """DenseNet dense block: stack `blocks` conv blocks, each of which
    concatenates its new features onto everything before it, so the channel
    count grows by `growth_rate` per block.
    """
    for _ in range(blocks):
        x = conv_block(x, growth_rate)
    return x


def conv_block(x, growth_rate):
    """BN -> ReLU -> 3x3 Conv producing `growth_rate` new feature maps.

    Returns the input concatenated with the new features. The original
    version returned only the new features, which discards all preceding
    feature maps and therefore is not dense connectivity at all.
    """
    features = layers.BatchNormalization()(x)
    features = layers.ReLU()(features)
    features = layers.Conv2D(growth_rate, 3, padding='same')(features)
    # Dense connectivity: each layer sees every preceding layer's output.
    return layers.Concatenate()([x, features])
Attention Mechanisms
Self-Attention
PYTHON
1
class SelfAttention(layers.Layer):
    """Single-head scaled dot-product self-attention.

    Projects the input into query/key/value spaces of width `units` and
    returns softmax(QK^T / sqrt(units)) V.
    """

    def __init__(self, units):
        super(SelfAttention, self).__init__()
        self.units = units  # projection width for Q, K and V

    def build(self, input_shape):
        # The three linear projections are created lazily on first call.
        self.query = layers.Dense(self.units)
        self.key = layers.Dense(self.units)
        self.value = layers.Dense(self.units)

    def call(self, inputs):
        q = self.query(inputs)
        k = self.key(inputs)
        v = self.value(inputs)

        # Scale the dot products by sqrt(d) to keep softmax logits in a
        # well-conditioned range.
        scale = tf.math.sqrt(tf.cast(self.units, tf.float32))
        weights = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / scale)

        return tf.matmul(weights, v)
Multi-Head Attention
PYTHON
1
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention.

    Splits the `d_model`-wide Q/K/V projections into `num_heads` heads of
    width `d_model // num_heads`, attends per head, then recombines and
    applies an output projection.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # per-head width

        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)
        self.wo = layers.Dense(d_model)  # final output projection

    def split_heads(self, x):
        # (batch, seq, d_model) -> (batch, heads, seq, depth)
        reshaped = tf.reshape(x, (-1, x.shape[1], self.num_heads, self.depth))
        return tf.transpose(reshaped, perm=[0, 2, 1, 3])

    def call(self, q, k, v):
        q = self.split_heads(self.wq(q))
        k = self.split_heads(self.wk(k))
        v = self.split_heads(self.wv(v))

        # Per-head scaled dot-product attention.
        scale = tf.math.sqrt(tf.cast(self.depth, tf.float32))
        attn_weights = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / scale)
        context = tf.matmul(attn_weights, v)

        # (batch, heads, seq, depth) -> (batch, seq, d_model)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(context, (-1, context.shape[1], self.d_model))

        return self.wo(context)
Generative Models
Variational Autoencoders (VAEs)
PYTHON
1
# --- Encoder: 28x28x1 image -> 2-D latent distribution parameters ---
encoder_inputs = layers.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation='relu', strides=2, padding='same')(encoder_inputs)
x = layers.Conv2D(64, 3, activation='relu', strides=2, padding='same')(x)
x = layers.Flatten()(x)

# Latent space (2-D so the latent manifold is easy to visualize).
z_mean = layers.Dense(2)(x)
z_log_var = layers.Dense(2)(x)


def sampling(args):
    """Reparameterization trick: z = mean + sigma * epsilon, epsilon ~ N(0, I)."""
    z_mean, z_log_var = args
    epsilon = tf.keras.backend.random_normal(shape=tf.shape(z_mean))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon


z = layers.Lambda(sampling)([z_mean, z_log_var])

# --- Decoder: 2-D latent vector -> reconstructed 28x28x1 image ---
decoder_inputs = layers.Input(shape=(2,))
x = layers.Dense(7 * 7 * 64, activation='relu')(decoder_inputs)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation='relu', strides=2, padding='same')(x)
x = layers.Conv2DTranspose(32, 3, activation='relu', strides=2, padding='same')(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation='sigmoid', padding='same')(x)

# --- Models ---
# The encoder must expose the sampled z as well, so the VAE can decode it.
encoder = Model(encoder_inputs, [z_mean, z_log_var, z])
decoder = Model(decoder_inputs, decoder_outputs)

# VAE: decode the sampled latent z (output index 2). The original decoded
# output index 1, which is z_log_var — not a latent sample at all.
vae_outputs = decoder(encoder(encoder_inputs)[2])
vae = Model(encoder_inputs, vae_outputs)
# NOTE(review): a complete VAE also needs the KL-divergence term added to the
# training loss (e.g. via vae.add_loss) — omitted in this snippet.
Generative Adversarial Networks (GANs)
PYTHON
1
# --- Generator: latent vector (100,) -> 28x28x1 image in [-1, 1] ---
generator = models.Sequential([
    layers.Dense(7 * 7 * 256, use_bias=False, input_shape=(100,)),
    layers.BatchNormalization(),
    layers.ReLU(),
    layers.Reshape((7, 7, 256)),
    layers.Conv2DTranspose(128, 5, use_bias=False, strides=1, padding='same'),
    layers.BatchNormalization(),
    layers.ReLU(),
    layers.Conv2DTranspose(64, 5, strides=2, padding='same'),  # 7x7 -> 14x14
    layers.BatchNormalization(),
    layers.ReLU(),
    # tanh matches images normalized to [-1, 1]; 14x14 -> 28x28.
    layers.Conv2DTranspose(1, 5, strides=2, padding='same', activation='tanh'),
])

# --- Discriminator: 28x28x1 image -> probability the image is real ---
discriminator = models.Sequential([
    layers.Conv2D(64, 5, strides=2, padding='same', input_shape=[28, 28, 1]),
    layers.LeakyReLU(0.2),
    layers.Dropout(0.3),
    layers.Conv2D(128, 5, strides=2, padding='same'),
    layers.LeakyReLU(0.2),
    layers.Dropout(0.3),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])

# Compile the discriminator for its own training phase, then freeze it so
# that the combined model below only updates the generator's weights.
discriminator.compile(optimizer='adam', loss='binary_crossentropy')
discriminator.trainable = False

# Combined model: noise -> generated image -> discriminator verdict.
gan_input = layers.Input(shape=(100,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')
Diffusion Models
PYTHON
1
class DiffusionModel(tf.keras.Model):
    """Minimal diffusion-model skeleton.

    Draws a random timestep per example and asks `network` to predict the
    noise for that (image, timestep) pair.
    """

    def __init__(self, network):
        super().__init__()
        self.network = network        # noise-prediction net taking [images, t]
        self.num_timesteps = 1000     # length of the diffusion schedule

    def call(self, images):
        # Gaussian noise with the same shape as the input batch.
        noise = tf.random.normal(tf.shape(images))
        # Sample one timestep per example. The original drew float32
        # timesteps; a discrete schedule is indexed with integers, so
        # sample int32 in [0, num_timesteps).
        t = tf.random.uniform(
            (tf.shape(images)[0],), maxval=self.num_timesteps, dtype=tf.int32
        )

        # NOTE(review): `noise` is sampled but never mixed into `images`
        # here — presumably the forward process q(x_t | x_0) happens inside
        # `network` or was elided for brevity; confirm before relying on it.
        predicted_noise = self.network([images, t])
        return predicted_noise
Reinforcement Learning with Deep Networks
Deep Q-Networks (DQN)
PYTHON
1
class DQN(tf.keras.Model):
    """Deep Q-network: maps a state vector to one Q-value per action."""

    def __init__(self, num_actions):
        super(DQN, self).__init__()
        # Two hidden ReLU layers followed by a linear Q-value head.
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.values = layers.Dense(num_actions)

    def call(self, inputs):
        hidden = self.dense2(self.dense1(inputs))
        return self.values(hidden)
Policy Gradients
PYTHON
1
class PolicyNetwork(tf.keras.Model):
    """Policy network: maps a state vector to unnormalized action logits."""

    def __init__(self, num_actions):
        super(PolicyNetwork, self).__init__()
        # Two hidden ReLU layers followed by a linear logit head
        # (apply softmax outside when a distribution is needed).
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.action_logits = layers.Dense(num_actions)

    def call(self, inputs):
        hidden = self.dense2(self.dense1(inputs))
        return self.action_logits(hidden)
Meta-Learning
Model-Agnostic Meta-Learning (MAML)
PYTHON
1
class MAML(tf.keras.Model):
    """Model-Agnostic Meta-Learning wrapper around an inner model.

    `adapt` performs the inner-loop update: one SGD step on support data,
    returning the adapted weight tensors without touching the wrapped
    model's own variables.
    """

    def __init__(self, model):
        super(MAML, self).__init__()
        self.model = model

    def adapt(self, support_data, learning_rate=0.01):
        """Return weights after one gradient step on `support_data`.

        NOTE(review): relies on `compute_loss(support_data)` being defined
        by a subclass or elsewhere — not visible in this snippet; confirm.
        """
        with tf.GradientTape() as tape:
            loss = self.compute_loss(support_data)
        gradients = tape.gradient(loss, self.model.trainable_variables)

        # One explicit SGD step per variable; the originals stay unchanged.
        adapted_weights = [
            weight - learning_rate * grad
            for weight, grad in zip(self.model.trainable_variables, gradients)
        ]
        return adapted_weights
Advanced Training Techniques
Learning Rate Scheduling
PYTHON
1
# Cosine annealing: decay the learning rate from 0.001 toward
# alpha * 0.001 over 1000 steps following a cosine curve.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    alpha=0.0,  # anneal all the way down to zero
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
Mixed Precision Training
PYTHON
1
from tensorflow.keras import mixed_precision

# Compute in float16 while keeping variables in float32 for stability.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
Distributed Training
PYTHON
1
# Synchronous data-parallel training across all visible GPUs on this host.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Variables created inside the scope are mirrored on every replica.
    model = create_model()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
Custom Layers and Losses
Custom Layer Example
PYTHON
1
class CustomLayer(layers.Layer):
    """Linear projection layer without bias: output = inputs @ W."""

    def __init__(self, units):
        super(CustomLayer, self).__init__()
        self.units = units  # output dimensionality

    def build(self, input_shape):
        # Weight matrix is created lazily once the input width is known.
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer='random_normal',
            trainable=True,
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w)
Custom Loss Function
PYTHON
1
def custom_loss(y_true, y_pred):
    """Mean squared error between targets and predictions."""
    return tf.reduce_mean(tf.square(y_true - y_pred))
Best Practices
- Architecture Selection: Choose appropriate architecture for the task
- Regularization: Use dropout, batch norm, weight decay
- Optimization: Use appropriate optimizer and learning rate
- Monitoring: Track training and validation metrics
- Experimentation: Try different hyperparameters
Current Research Directions
- Efficient Transformers: Reducing computational cost
- Self-Supervised Learning: Learning without labels
- Neural Architecture Search: Automated architecture design
- Explainable AI: Understanding model decisions
- Continual Learning: Learning from new data without forgetting