diff --git a/appendix-D/01_main-chapter-code/appendix-D.ipynb b/appendix-D/01_main-chapter-code/appendix-D.ipynb
index 8b28e67..03e62a0 100644
--- a/appendix-D/01_main-chapter-code/appendix-D.ipynb
+++ b/appendix-D/01_main-chapter-code/appendix-D.ipynb
@@ -552,6 +552,8 @@
    "source": [
     "from previous_chapters import evaluate_model, generate_and_print_sample\n",
     "\n",
+    "BOOK_VERSION = True\n",
+    "\n",
     "\n",
     "def train_model(model, train_loader, val_loader, optimizer, device,\n",
     "                n_epochs, eval_freq, eval_iter, start_context, tokenizer,\n",
@@ -595,9 +597,14 @@
     "            loss.backward()\n",
     "\n",
     "            # Apply gradient clipping after the warmup phase to avoid exploding gradients\n",
-    "            if global_step > warmup_steps:\n",
-    "                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
-    "        \n",
+    "\n",
+    "            if BOOK_VERSION:\n",
+    "                if global_step > warmup_steps:\n",
+    "                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) \n",
+    "            else:\n",
+    "                if global_step >= warmup_steps:  # the book originally used global_step > warmup_steps, which led to a skipped clipping step after warmup\n",
+    "                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
+    "            \n",
     "            optimizer.step()\n",
     "            tokens_seen += input_batch.numel()\n",
     "\n",
@@ -691,8 +698,8 @@
     "model = GPTModel(GPT_CONFIG_124M)\n",
     "model.to(device)\n",
     "\n",
-    "peak_lr = 5e-4\n",
-    "optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.1)\n",
+    "peak_lr = 0.001  # this was originally set to 5e-4 in the book by mistake\n",
+    "optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1)  # the book accidentally omitted the lr assignment\n",
     "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
     "\n",
     "n_epochs = 15\n",
@@ -817,7 +824,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
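
For context, the diff boils down to two fixes: pass lr=peak_lr to AdamW explicitly, and clip gradients once global_step >= warmup_steps so the first step after warmup is no longer skipped. The sketch below is not taken from the notebook; the tiny torch.nn.Linear model, the random input batches, and the warmup_steps = 3 value are placeholders standing in for the notebook's GPTModel, data loader, and warmup schedule.

import torch

# Stand-in model and data; the notebook trains a GPTModel over a DataLoader instead.
model = torch.nn.Linear(4, 4)

peak_lr = 0.001
# Pass lr explicitly; without it, AdamW silently uses its own default learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1)

warmup_steps = 3   # placeholder value, not the notebook's setting
global_step = -1

for _ in range(10):  # stand-in for iterating over training batches
    global_step += 1
    optimizer.zero_grad()
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()

    # ">=" also clips on the first step after warmup; the book's original ">"
    # skipped clipping for exactly that one step.
    if global_step >= warmup_steps:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()

With ">" instead of ">=", the step where global_step == warmup_steps would call optimizer.step() on unclipped gradients.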