import time
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision.models import alexnet
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        outputs = model(imgs)
        loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        optim.zero_grad()
        loss_train.backward()
        optim.step()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
for imgs, targets in dataloader:
    imgs, targets = imgs.cuda(), targets.cuda()
    ...
    # forward pass and loss computation run under autocast
    with autocast():
        outputs = model(imgs)
        loss = loss_fn(outputs, targets)
    ...
    # backward pass and optimizer step stay outside the autocast region
    optim.zero_grad()
    loss.backward()
    optim.step()
    ...
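As a self-contained illustration of this pattern, here is a minimal runnable sketch; the toy model, tensor shapes, and hyperparameters are placeholders chosen only for the example, and a CUDA device is assumed:

import torch
from torch import nn
from torch.cuda.amp import autocast

# toy model and data, for illustration only
model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10)).cuda()
loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=0.01)

imgs = torch.randn(8, 32).cuda()
targets = torch.randint(0, 10, (8,)).cuda()

# forward and loss under autocast; backward and step outside it
with autocast():
    outputs = model(imgs)
    loss = loss_fn(outputs, targets)

optim.zero_grad()
loss.backward()
optim.step()
print(loss.item())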
train_autocast_without.py
import time
import torch
import torchvision
from torch import nn
from torch.cuda.amp import autocast
from torchvision import transforms
from torchvision.models import alexnet
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        with autocast():
            outputs = model(imgs)
            loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        optim.zero_grad()
        loss_train.backward()
        optim.step()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
scaler = GradScaler()
for imgs, targets in dataloader:
    imgs, targets = imgs.cuda(), targets.cuda()
    ...
    optim.zero_grad()
    ...
    with autocast():
        outputs = model(imgs)
        loss = loss_fn(outputs, targets)
    # the backward pass, optimizer step, and scale update go through the scaler
    scaler.scale(loss).backward()
    scaler.step(optim)
    scaler.update()
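If gradient clipping is needed, the gradients must be unscaled before clipping so that the threshold applies to the true gradient values. A minimal sketch of this pattern (the max_norm of 1.0 is an arbitrary placeholder):

scaler.scale(loss).backward()
scaler.unscale_(optim)  # unscale gradients in place before clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optim)      # the step is skipped automatically if grads contain inf/nan
scaler.update()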
import time
import torch
import torchvision
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torchvision import transforms
from torchvision.models import alexnet
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)
scaler = GradScaler()

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        optim.zero_grad()
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        with autocast():
            outputs = model(imgs)
            loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        # scaled backward pass; the step is skipped automatically on inf/nan gradients
        scaler.scale(loss_train).backward()
        scaler.step(optim)
        scaler.update()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
import time
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision.models import alexnet
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
if args.cuda:
    model = torch.nn.DataParallel(model).cuda()
else:
    model = torch.nn.DataParallel(model)
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        outputs = model(imgs)
        loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        optim.zero_grad()
        loss_train.backward()
        optim.step()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
import torch
import torch.nn as nn
# NOTE: in recent torchvision releases this helper moved to torch.hub
from torchvision.models.utils import load_state_dict_from_url
from torch.cuda.amp import autocast
from typing import Any
# autocast state is thread-local, so for DataParallel the forward method
# itself must run under autocast; decorating it achieves this
@autocast()
def forward(self, x: torch.Tensor) -> torch.Tensor:
    x = self.features(x)
    x = self.avgpool(x)
    x = torch.flatten(x, 1)
    x = self.classifier(x)
    return x
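Equivalently, the autocast region can be opened inside the method body instead of via the decorator; a sketch of the same forward written that way:

def forward(self, x: torch.Tensor) -> torch.Tensor:
    # open the autocast region inside the method, so it is active
    # in whichever thread DataParallel runs this replica on
    with autocast():
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
    return x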
def alexnet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> AlexNet:
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    model = AlexNet(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls["alexnet"], progress=progress)
        model.load_state_dict(state_dict)
    return model
train_DP_autocast.py (imports our own alexnet.py)
import time
import torch
from alexnet import alexnet
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
if args.cuda:
    model = torch.nn.DataParallel(model).cuda()
else:
    model = torch.nn.DataParallel(model)
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        # the imported alexnet's forward is itself decorated with @autocast(),
        # which DataParallel needs because autocast state is thread-local
        with autocast():
            outputs = model(imgs)
            loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        optim.zero_grad()
        loss_train.backward()
        optim.step()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
import time
import torch
from alexnet import alexnet
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
if args.cuda:
    model = torch.nn.DataParallel(model).cuda()
else:
    model = torch.nn.DataParallel(model)
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)
scaler = GradScaler()

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        optim.zero_grad()
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        with autocast():
            outputs = model(imgs)
            loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        scaler.scale(loss_train).backward()
        scaler.step(optim)
        scaler.update()
        iter = iter + 1
        # (logging, scheduler.step, and epoch timing are identical to the
        # GradScaler training loop shown earlier)
import time
import torch
from torchvision.models.alexnet import alexnet
import torchvision
from torch import nn
import torch.distributed as dist
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
if args.cuda:
    model = model.cuda()
    model = torch.nn.parallel.DistributedDataParallel(model)
else:
    model = torch.nn.parallel.DistributedDataParallel(model)
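Note that DistributedDataParallel requires an initialized process group before the model is wrapped, which this excerpt omits. A minimal single-node sketch, assuming the script is launched with torchrun (which sets LOCAL_RANK); train_dataset and args.batch_size are placeholders from the surrounding script:

import os
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler

dist.init_process_group(backend="nccl")      # one process per GPU
local_rank = int(os.environ["LOCAL_RANK"])   # set by the launcher
torch.cuda.set_device(local_rank)
model = model.cuda(local_rank)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

# each process should see a distinct shard of the data
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler)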
# 6. Create loss
cross_entropy_loss = nn.CrossEntropyLoss()
# 7. Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=3, verbose=True)

# 8. Set some parameters to control the loop
iter = 0
t0 = time.time()
for epoch in range(args.epochs):
    t1 = time.time()
    print("----------------- training epoch {} -----------------".format(epoch))
    model.train()
    losses = []  # per-epoch record of batch losses
    for data in train_dataloader:
        imgs, targets = data
        if args.cuda:
            cross_entropy_loss = cross_entropy_loss.cuda()
            imgs, targets = imgs.cuda(), targets.cuda()
        outputs = model(imgs)
        loss_train = cross_entropy_loss(outputs, targets)
        losses.append(loss_train.item())
        if args.tensorboard:
            writer.add_scalar("train_loss", loss_train.item(), iter)

        optim.zero_grad()
        loss_train.backward()
        optim.step()
        iter = iter + 1
        if iter % 100 == 0:
            print("Epoch: {} | Iteration: {} | lr: {} | loss: {} | mean loss: {}"
                  .format(epoch, iter, optim.param_groups[0]['lr'],
                          loss_train.item(), np.mean(losses)))
    if args.tensorboard:
        writer.add_scalar("lr", optim.param_groups[0]['lr'], epoch)
    scheduler.step(np.mean(losses))
    t2 = time.time()
    h = (t2 - t1) // 3600
    m = ((t2 - t1) % 3600) // 60
    s = ((t2 - t1) % 3600) % 60
    print("epoch {} is finished, and time is {}h{}m{}s".format(epoch, int(h), int(m), int(s)))
# 5. Set some parameters for testing the network
total_accuracy = 0
# test
model.eval()
device = torch.device('cpu')
# load the trained weights once, before looping over the test set
model_load = torch.load("{}/AlexNet.pth".format(args.checkpoint), map_location=device)
model.load_state_dict(model_load)
with torch.no_grad():
    for data in test_dataloader:
        imgs, targets = data
        imgs, targets = imgs.to(device), targets.to(device)
        outputs = model(imgs)
        accuracy = (outputs.argmax(1) == targets).sum()
        total_accuracy = total_accuracy + accuracy
    accuracy = total_accuracy / test_dataset_size
    print("the total accuracy is {}".format(accuracy))
if __name__ == "__main__":
    local_size = torch.cuda.device_count()
    print("local_size: {}".format(local_size))
    eval()  # run the evaluation routine defined above
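The evaluation above runs on the CPU, so autocast is not applied there; on a GPU, inference can also run under autocast. A minimal sketch, assuming the model and inputs are on a CUDA device:

model.eval()
with torch.no_grad():
    # mixed-precision inference: no GradScaler is needed without backward
    with autocast():
        outputs = model(imgs)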
Results:
(2) Training and evaluation source code for DDP with autocast:
Training source code:
train_DDP_autocast.py (imports our own alexnet.py)
import time
import torch
from alexnet import alexnet
import torchvision
from torch import nn
import torch.distributed as dist
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse