import heapq
import os
import random
import re
from datetime import datetime
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch import hub
from torch.backends import cudnn
from config import Config
from training import models
from training.datasets import get_dataloader, get_test_dataloader
from training.tools.logger import get_logger
from training.tools.train_utils import parse_args, validate, train, create_optimizer
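# Project-wide settings: TORCH_HOME redirects torch.hub's pretrained-weight cache,
# and cudnn.benchmark lets cuDNN autotune convolution kernels (safe while input sizes stay fixed).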
CONFIG = Config()
hub.set_dir(CONFIG['TORCH_HOME'])
torch.backends.cudnn.benchmark = True
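# main_worker runs once per process. Under a multiprocessing-distributed launch each
# process drives a single GPU (`gpu` is the local device index); otherwise it runs once
# on the device(s) selected via args.gpu or DataParallel.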
def main_worker(gpu, ngpus_per_node: int, args):
args.gpu = gpu
log_name = 'logs/{}_{}'.format(args.arch, args.prefix)
logger = get_logger(save_log=True, log_name=log_name)
logger.info(args)
if args.gpu is not None:
logger.debug("Use GPU: {} for training".format(args.gpu))
if args.distributed:
if args.dist_url == "env://" and args.rank == -1:
args.rank = int(os.environ["RANK"])
if args.multiprocessing_distributed:
# For multiprocessing distributed training, rank needs to be the
# global rank among all the processes
args.rank = args.rank * ngpus_per_node + gpu
dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
logger.debug("Initializing Networks")
model = models.__dict__[args.arch](num_classes=2, pretrained=True)
model_cfg = model.default_cfg
logger.debug("Initializing Distribution")
if not torch.cuda.is_available():
logger.debug('Using CPU, this will be slow')
elif args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
if args.gpu is not None:
torch.cuda.set_device(args.gpu)
model.cuda(args.gpu)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
args.batch_size = int(args.batch_size / ngpus_per_node)
args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
else:
model.cuda()
# DistributedDataParallel will divide and allocate batch_size to all
# available GPUs if device_ids are not set
model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None:
torch.cuda.set_device(args.gpu)
model = model.cuda(args.gpu)
else:
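# No specific GPU and no distributed setup: fall back to single-node DataParallel
# across all visible devices.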
model = torch.nn.DataParallel(model).cuda()
logger.debug("Initializing Data Loader")
train_sampler, train_loader, val_loader = get_dataloader(model_cfg, args)
test_loader = get_test_dataloader(model_cfg, args)
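# Loss terms handed to train(): cross-entropy for the real/fake classifier, plus two map
# losses that, judging by their names, supervise a binary manipulation mask (cross-entropy)
# and a soft manipulation map (MSE). How each target is built is defined in train().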
loss_functions = {
"classifier_loss": nn.CrossEntropyLoss().cuda(args.gpu),
"binary_map_loss": nn.CrossEntropyLoss().cuda(args.gpu),
"map_loss": nn.MSELoss().cuda(args.gpu)
}
optimizer, scheduler = create_optimizer(model, args)
start_epoch = 1
acc1 = 0.
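# GradScaler performs dynamic loss scaling for mixed-precision training; with
# enabled=False it becomes a transparent no-op, so the same code path serves both modes.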
scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
if args.resume:
if os.path.isfile(args.resume):
checkpoint = torch.load(args.resume, map_location='cpu')
start_epoch = checkpoint['epoch'] + 1
acc1 = checkpoint['acc1']
state_dict = checkpoint['state_dict']
if args.distributed:
model.load_state_dict(state_dict, strict=False)
else:
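# Checkpoints written from DataParallel/DDP models prefix every key with "module.";
# strip that prefix when loading into a non-wrapped model.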
model.load_state_dict({re.sub(r"^module\.", "", k): v for k, v in state_dict.items()}, strict=False)
optimizer.load_state_dict(checkpoint["optimizer"])
scaler.load_state_dict(checkpoint["scaler"])
logger.debug("Loaded checkpoint '{}' (epoch {}, acc {})".format(args.resume, start_epoch - 1, acc1))
else:
logger.debug("No checkpoint found at '{}'".format(args.resume))
if args.evaluate:
validate(val_loader, model, logger, args)
return
logger.debug("Start Training")
is_main_node = not args.multiprocessing_distributed or (
args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)
better_acc, min_heap = False, []
for epoch in range(start_epoch, args.epochs + 1):
if is_main_node:
logger.info('{}, Optimizer {}, Learning Rate {}',
datetime.now(), type(optimizer).__name__, optimizer.param_groups[0]['lr'])
if args.distributed:
train_sampler.set_epoch(epoch)
train(train_loader, model, scaler, optimizer, loss_functions, epoch, logger, args)
if scheduler is not None:
scheduler.step()
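# Validate every epoch; `epoch % 1 == 0` is always true and is presumably left as a
# knob for switching to a sparser validation schedule.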
if (epoch % 1 == 0 or epoch == args.epochs) and is_main_node:
acc1, miou = validate(val_loader, model, logger, args)
validate(test_loader, model, logger, args)
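# Keep the two best validation accuracies in a min-heap; the first two epochs only seed
# the heap, after that a checkpoint is saved only when the current epoch beats the
# weaker of the two.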
if len(min_heap) < 2:
heapq.heappush(min_heap, acc1)
else:
mini_acc = min_heap[0]
if acc1 > mini_acc:
heapq.heapreplace(min_heap, acc1)
better_acc = True
save_model = better_acc and is_main_node
# save_model = is_main_node
if save_model:
logger.info('Save model, Top 2 ACC: {}', min_heap)
torch.save({
'epoch': epoch,
'arch': args.arch,
'acc1': acc1,
'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict(),
"scaler": scaler.state_dict()
}, os.path.join('weights', '{}_{}_{}.pt'.format(args.arch, args.prefix, epoch)))
better_acc = False
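# main() parses the CLI arguments, seeds the RNGs if requested, and decides between a
# single-process run and a torch.multiprocessing spawn with one worker per GPU.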
def main():
args = parse_args()
os.makedirs('weights', exist_ok=True)
os.makedirs('logs', exist_ok=True)
logger = get_logger()
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
cudnn.deterministic = True
logger.warning('You have chosen to seed training. ' +
'This will turn on the CUDNN deterministic setting, ' +
'which can slow down your training considerably! ' +
'You may see unexpected behavior when restarting from checkpoints.')
if args.gpu is not None:
logger.warning('You have chosen a specific GPU. This will completely disable data parallelism.')
if args.dist_url == "env://" and args.world_size == -1:
args.world_size = int(os.environ["WORLD_SIZE"])
args.distributed = args.world_size > 1 or args.multiprocessing_distributed
ngpus_per_node = torch.cuda.device_count()
if args.multiprocessing_distributed:
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(fn=main_worker, args=(ngpus_per_node, args), nprocs=ngpus_per_node)
else:
# Simply call main_worker function
main_worker(args.gpu, ngpus_per_node, args)
if __name__ == '__main__':
main()