diff --git a/EAutodet.ipynb b/EAutodet.ipynb new file mode 100644 index 0000000..09a9f0d --- /dev/null +++ b/EAutodet.ipynb @@ -0,0 +1,165 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "private_outputs": true, + "provenance": [], + "machine_shape": "hm", + "gpuType": "A100", + "authorship_tag": "ABX9TyMvTTM6qJIF9fbb5iHAFlMx", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9eP4BfF8OdIX" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/aashikrasool/EAutoDet.git" + ] + }, + { + "cell_type": "code", + "source": [ + "cd EAutoDet" + ], + "metadata": { + "id": "qbCniNOHOnDH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ls\n" + ], + "metadata": { + "id": "gm3JophGOxeE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install thop" + ], + "metadata": { + "id": "NMBr0rxhY_sy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install wandb" + ], + "metadata": { + "id": "Dio2_oXWboTt" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!bash data/scripts/get_coco.sh" + ], + "metadata": { + "id": "8Lg26SaJO99O" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!touch logs/EAutoDet-s_test1.log\n" + ], + "metadata": { + "id": "M6YrDoKPcI0z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!bash scripts/search.sh 0" + ], + "metadata": { + "id": "8hnodOlxPFOI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python -u train_search.py --data $data --cfg $cfg --weights '' --epochs 50 --batch-size $BATCHSIZE > $LOG_DIR/${cfg_file}_test1.log 2>&1 &\n", + "\n" + ], + "metadata": { + "id": "LruE8zGJPNuY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!!python -u train_search.py --data $data --cfg $cfg --weights '' --epochs 50 --batch-size $BATCHSIZE > $LOG_DIR/${cfg_file}_test1.log 2>&1\n" + ], + "metadata": { + "id": "Yq7qlZBtRrpq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!bash scripts/full_train.sh 0" + ], + "metadata": { + "id": "2xK2W_B9SFw7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "RRnUZSD4SlNK" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/data/scripts/get_coco.sh b/data/scripts/get_coco.sh index 96d0199..e643321 100755 --- a/data/scripts/get_coco.sh +++ b/data/scripts/get_coco.sh @@ -8,20 +8,20 @@ # /yolov5 # Download/unzip labels -d='../' # unzip directory +d='./' # unzip directory url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ -f='coco2017labels.zip' # or 'coco2017labels-segments.zip', 68 MB +f='coco2017labels-segments.zip' # or 'coco2017labels.zip', 68 MB echo 'Downloading' $url$f ' ...' curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background -## Download/unzip images -#d='../coco/images' # unzip directory -#url=http://images.cocodataset.org/zips/ -#f1='train2017.zip' # 19G, 118k images -#f2='val2017.zip' # 1G, 5k images -#f3='test2017.zip' # 7G, 41k images (optional) -#for f in $f1 $f2; do -# echo 'Downloading' $url$f '...' -# curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background -#done +# Download/unzip images +d='./coco/images' # unzip directory +url=http://images.cocodataset.org/zips/ +f1='train2017.zip' # 19G, 118k images +f2='val2017.zip' # 1G, 5k images +f3='test2017.zip' # 7G, 41k images (optional) +for f in $f1 $f2 $f3; do + echo 'Downloading' $url$f '...' + curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background +done wait # finish background tasks diff --git a/train.py b/train.py index 9667244..21e3525 100644 --- a/train.py +++ b/train.py @@ -680,4 +680,3 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): plot_evolution(yaml_file) print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}') - diff --git a/train_search.py b/train_search.py index a50749f..7d22298 100644 --- a/train_search.py +++ b/train_search.py @@ -26,9 +26,10 @@ from models.yolo_search import Model, parse_model from utils.autoanchor import check_anchors from utils.datasets import create_dataloader_search, create_dataloader -from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ - fitness, strip_optimizer_search, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \ - check_requirements, print_mutation, set_logging, one_cycle, colorstr +from utils.general import (labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, + fitness, strip_optimizer_search, get_latest_run, check_dataset, check_file, + check_git_status, check_img_size, check_requirements, print_mutation, set_logging, + one_cycle, colorstr) from utils.google_utils import attempt_download from utils.loss import ComputeLoss from utils.plots import plot_images, plot_labels, plot_results, plot_evolution @@ -39,18 +40,19 @@ logger = logging.getLogger(__name__) -def train(hyp, opt, device, tb_writer=None, wandb=None): +def train(hyp, opt, device, tb_writer=None, wandb_run=None): logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) - save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + save_dir, epochs, batch_size, total_batch_size, weights, rank = ( Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + ) # Directories wdir = save_dir / 'weights' - wdir.mkdir(parents=True, exist_ok=True) # make dir + wdir.mkdir(parents=True, exist_ok=True) geno_dir = save_dir / 'genotypes' - geno_dir.mkdir(parents=True, exist_ok=True) # make dir + geno_dir.mkdir(parents=True, exist_ok=True) alpha_dir = save_dir / 'alphas' - alpha_dir.mkdir(parents=True, exist_ok=True) # make dir + alpha_dir.mkdir(parents=True, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' @@ -62,232 +64,198 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): yaml.dump(vars(opt), f, sort_keys=False) # Configure - plots = not opt.evolve # create plots + plots = not opt.evolve cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: - data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict + data_dict = yaml.load(f, Loader=yaml.SafeLoader) with torch_distributed_zero_first(rank): - check_dataset(data_dict) # check + check_dataset(data_dict) train_path = data_dict['train'] test_path = data_dict['val'] - nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes - names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names - assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + nc = 1 if opt.single_cls else int(data_dict['nc']) + names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): - attempt_download(weights) # download if not found locally - ckpt = torch.load(weights, map_location=device) # load checkpoint - model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create - exclude = ['anchor'] if (opt.cfg or hyp.get('anchors')) and not opt.resume else [] # exclude keys - state_dict = ckpt['model'].float().state_dict() # to FP32 - state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect - model.load_state_dict(state_dict, strict=False) # load - logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + attempt_download(weights) + ckpt = torch.load(weights, map_location=device) + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) + exclude = ['anchor'] if (opt.cfg or hyp.get('anchors')) and not opt.resume else [] + state_dict = ckpt['model'].float().state_dict() + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) + model.load_state_dict(state_dict, strict=False) + logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) else: - model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create -# model.update_arch_parameters() # Since to(device) will not update model._arch_parameters -# for alpha in model.arch_parameters(): -# alpha.requires_grad_(True) + model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) - # Freeze - freeze = [] # parameter names to freeze (full or partial) + # Freeze (if needed) + freeze = [] # List parameter names to freeze (if any) for k, v in model.named_parameters(): - v.requires_grad = True # train all layers + v.requires_grad = True if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer -# nbs = 64 # nominal batch size -# nbs = total_batch_size # nominal batch size nbs = max(total_batch_size, 64) # nominal batch size - accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing - hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + accumulate = max(round(nbs / total_batch_size), 1) + hyp['weight_decay'] *= total_batch_size * accumulate / nbs logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") - pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + pg0, pg1, pg2 = [], [], [] for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): - pg2.append(v.bias) # biases + pg2.append(v.bias) if isinstance(v, nn.BatchNorm2d) and hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): - pg0.append(v.weight) # no decay + pg0.append(v.weight) elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): - pg1.append(v.weight) # apply decay + pg1.append(v.weight) elif hasattr(v, 'depth_weight') and isinstance(v.depth_weight, nn.Parameter): - pg1.append(v.depth_weight) # apply decay - if hasattr(v, 'point_weight') and isinstance(v.point_weight, nn.Parameter): pg1.append(v.point_weight) # apply decay + pg1.append(v.depth_weight) + if hasattr(v, 'point_weight') and isinstance(v.point_weight, nn.Parameter): + pg1.append(v.point_weight) elif hasattr(v, 'depth_weight1') and isinstance(v.depth_weight1, nn.Parameter): - pg1.append(v.depth_weight1) # apply decay + pg1.append(v.depth_weight1) elif hasattr(v, 'depth_weight2') and isinstance(v.depth_weight2, nn.Parameter): - pg1.append(v.depth_weight2) # apply decay + pg1.append(v.depth_weight2) - tt = 0 - for w in model.parameters(): tt+=1 - assert (tt == len(pg0)+len(pg1)+len(pg2)) -# # Test genotype code -# geno, model_yaml = model.genotype() -# parse_model(model_yaml, [3]) -# print(geno) -# assert 0 + tt = sum(1 for _ in model.parameters()) + assert (tt == len(pg0) + len(pg1) + len(pg2)) if opt.adam: - optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) - optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay - optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) + optimizer.add_param_group({'params': pg2}) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 - # Scheduler https://arxiv.org/pdf/1812.01187.pdf - # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: - lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear + lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] else: - lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] + lf = one_cycle(1, hyp['lrf'], epochs) scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) - # plot_lr_scheduler(optimizer, scheduler, epochs) - - # Logging - if rank in [-1, 0] and wandb and wandb.run is None: - opt.hyp = hyp # add hyperparameters - wandb_run = wandb.init(config=opt, resume="allow", - project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, - name=save_dir.stem, - entity=opt.entity, - id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) - loggers = {'wandb': wandb} # loggers dict + + # Initialize WandB logging if available + if rank in [-1, 0] and wandb_run is None and wandb is not None: + opt.hyp = hyp # add hyperparameters to config + try: + wandb_run = wandb.init( + config=opt, + resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=Path(opt.save_dir).stem, + entity=opt.entity, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None + ) + except wandb.errors.UsageError as e: + print(f"Wandb init failed: {e}. Disabling wandb logging.") + wandb_run = None + loggers = {'wandb': wandb_run} # EMA -# ema = ModelEMA(model) if rank in [-1, 0] else None ema = EMA(model) if rank in [-1, 0] else None - # Resume + # Resume from checkpoint if applicable start_epoch, best_fitness = 0, 0.0 if pretrained: - # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] - - # EMA if ema and ckpt.get('ema'): -# ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.shadow = ckpt['ema'] ema.updates = ckpt['updates'] - - # Results if ckpt.get('training_results') is not None: - results_file.write_text(ckpt['training_results']) # write results.txt - - # Epochs + results_file.write_text(ckpt['training_results']) start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) - epochs += ckpt['epoch'] # finetune additional epochs - - del ckpt, state_dict + epochs += ckpt['epoch'] + del ckpt - # Image sizes - gs = max(int(model.stride.max()), 32) # grid size (max stride) - nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) - imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + gs = max(int(model.stride.max()), 32) + nl = model.model[-1].nl + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] - # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) - - # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') - - # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) - # Trainloader - dataloader, dataloader_val, dataset, dataset_val = create_dataloader_search(train_path, imgsz, batch_size, gs, opt, - hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, - world_size=opt.world_size, workers=opt.workers, - image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), train_portion=opt.train_portion) - mlc = np.concatenate(dataset.dataset.labels, 0)[:, 0].max() # max label class. dataset is an instance of torch.utils.data.Subset - nb = len(dataloader) # number of batches + dataloader, dataloader_val, dataset, dataset_val = create_dataloader_search( + train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights, quad=opt.quad, + prefix=colorstr('train: '), train_portion=opt.train_portion + ) + mlc = np.concatenate(dataset.dataset.labels, 0)[:, 0].max() + nb = len(dataloader) assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) - # Process 0 if rank in [-1, 0]: - testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, opt, # testloader - hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, - world_size=opt.world_size, workers=opt.workers, - pad=0.5, prefix=colorstr('val: '))[0] - + testloader = create_dataloader( + test_path, imgsz_test, batch_size * 2, gs, opt, + hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, + world_size=opt.world_size, workers=opt.workers, + pad=0.5, prefix=colorstr('val: ') + )[0] if not opt.resume: labels = np.concatenate(dataset.dataset.labels, 0) - c = torch.tensor(labels[:, 0]) # classes - # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency - # model._initialize_biases(cf.to(device)) + c = torch.tensor(labels[:, 0]) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) - - # Anchors if not opt.noautoanchor: check_anchors(dataset.dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) -# model.half().float() # pre-reduce anchor precision - - # Model parameters - hyp['box'] *= 3. / nl # scale to layers - hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers - hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl # scale to image size and layers - model.nc = nc # attach number of classes to model - model.hyp = hyp # attach hyperparameters to model - model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) - model.class_weights = labels_to_class_weights(dataset.dataset.labels, nc).to(device) * nc # attach class weights + + hyp['box'] *= 3. / nl + hyp['cls'] *= nc / 80. * 3. / nl + hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl + model.nc = nc + model.hyp = hyp + model.gr = 1.0 + model.class_weights = labels_to_class_weights(dataset.dataset.labels, nc).to(device) * nc model.names = names - # Start training t0 = time.time() - nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) - # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training - maps = np.zeros(nc) # mAP per class - results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) - scheduler.last_epoch = start_epoch - 1 # do not move + nw = max(round(hyp['warmup_epochs'] * nb), 1000) + maps = np.zeros(nc) + results = (0, 0, 0, 0, 0, 0, 0) + scheduler.last_epoch = start_epoch - 1 scaler = amp.GradScaler(enabled=cuda) - compute_loss = ComputeLoss(model) # init loss class + compute_loss = ComputeLoss(model) logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') # Architect for NAS - architect = Architect(model, compute_loss, accumulate, device, opt, DDP=torch.cuda.device_count()>1) + architect = Architect(model, compute_loss, accumulate, device, opt, DDP=torch.cuda.device_count() > 1) ori_model = model.module if is_parallel(model) else model ori_model.display_alphas() -# torch.autograd.set_detect_anomaly(True) - - for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ + for epoch in range(start_epoch, epochs): model.train() - - # Update image weights (optional) if opt.image_weights: - # Generate indices if rank in [-1, 0]: - cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights - iw = labels_to_image_weights(dataset.dataset.labels, nc=nc, class_weights=cw) # image weights - dataset.indices = random.choices(range(len(dataset.indices)), weights=iw[:len(dataset.indices)], k=len(dataset.indices)) # rand weighted idx - dataset_val.indices = random.choices(range(len(dataset_val.indices)), weights=iw[-len(dataset_val.indices):], k=len(dataset_val.indices)) # rand weighted idx - # Broadcast if DDP + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc + iw = labels_to_image_weights(dataset.dataset.labels, nc=nc, class_weights=cw) + dataset.indices = random.choices(range(len(dataset.indices)), weights=iw[:len(dataset.indices)], k=len(dataset.indices)) + dataset_val.indices = random.choices(range(len(dataset_val.indices)), weights=iw[-len(dataset_val.indices):], k=len(dataset_val.indices)) if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(len(dataset.indices))).int() dist.broadcast(indices, 0) @@ -298,271 +266,206 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): if rank != 0: dataset_val.indices = indices.cpu().numpy() - # Update mosaic border - # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) - # dataset.mosaic_border = [b - imgsz, -b] # height, width borders - - mloss = torch.zeros(4, device=device) # mean losses + mloss = torch.zeros(4, device=device) if rank != -1: dataloader.sampler.set_epoch(epoch) dataloader_val.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: - pbar = tqdm(pbar, total=nb) # progress bar + pbar = tqdm(pbar, total=nb) optimizer.zero_grad() def valid_generator(): while True: - for x, t, path, shape in dataloader_val: - yield x, t, path, shape + for x, t, path, shape in dataloader_val: + yield x, t, path, shape valid_gen = valid_generator() - for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- - ni = i + nb * epoch # number integrated batches (since train start) - imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 - # Warmup + for i, (imgs, targets, paths, _) in pbar: + ni = i + nb * epoch + imgs = imgs.to(device, non_blocking=True).float() / 255.0 + if ni <= nw: - xi = [0, nw] # x interp - # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + xi = [0, nw] accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): - # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) - - # Multi-scale if opt.multi_scale: - sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size - sf = sz / max(imgs.shape[2:]) # scale factor + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs + sf = sz / max(imgs.shape[2:]) if sf != 1: - ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) + ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) - # architect if epoch >= opt.search_warmup: -# input_valid = imgs -# target_valid = targets - input_valid, target_valid, _, _ = next(valid_gen) - input_valid = input_valid.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 - # Multi-scale - if opt.multi_scale: - sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size - sf = sz / max(input_valid.shape[2:]) # scale factor - if sf != 1: - ns = [math.ceil(x * sf / gs) * gs for x in input_valid.shape[2:]] # new shape (stretched to gs-multiple) - input_valid = F.interpolate(input_valid, size=ns, mode='bilinear', align_corners=False) - architect.step(input_valid, target_valid) - - # Forward + input_valid, target_valid, _, _ = next(valid_gen) + input_valid = input_valid.to(device, non_blocking=True).float() / 255.0 + if opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs + sf = sz / max(input_valid.shape[2:]) + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in input_valid.shape[2:]] + input_valid = F.interpolate(input_valid, size=ns, mode='bilinear', align_corners=False) + architect.step(input_valid, target_valid) + with amp.autocast(enabled=cuda): - pred = model(imgs) # forward - loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size + pred = model(imgs) + loss, loss_items = compute_loss(pred, targets.to(device)) if rank != -1: - loss *= opt.world_size # gradient averaged between devices in DDP mode + loss *= opt.world_size if opt.quad: loss *= 4. - - # Backward -# scaler.scale(loss).backward() - grads = torch.autograd.grad(scaler.scale(loss), model.parameters(), grad_outputs=torch.ones_like(loss), allow_unused=True) -# for idx, (name, p) in enumerate(model.named_parameters()): -# if grads[idx] is None: print(name) -# else: print(grads[idx]) -# assert 0 + grads = torch.autograd.grad(scaler.scale(loss), model.parameters(), grad_outputs=torch.ones_like(loss), allow_unused=True) for v, g in zip(model.parameters(), grads): - if v.grad is None: - if not (g is None): - v.grad = torch.autograd.Variable(g.data) - else: - if not (g is None): - v.grad.data.add_(g.data) - - # Optimize + if v.grad is None: + if g is not None: + v.grad = torch.autograd.Variable(g.data) + else: + if g is not None: + v.grad.data.add_(g.data) if ni % accumulate == 0: - scaler.step(optimizer) # optimizer.step + scaler.step(optimizer) scaler.update() optimizer.zero_grad() if ema: -# ema.update(model) ema.update() - - # Print if rank in [-1, 0]: - mloss = (mloss * i + loss_items) / (i + 1) # update mean losses - mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + mloss = (mloss * i + loss_items) / (i + 1) + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) s = ('%10s' * 2 + '%10.4g' * 6) % ( - '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1] + ) pbar.set_description(s) - - # Plot if plots and ni < 3: - f = save_dir / f'train_batch{ni}.jpg' # filename + f = save_dir / f'train_batch{ni}.jpg' Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() - # if tb_writer: - # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) - # tb_writer.add_graph(model, imgs) # add model to tensorboard - elif plots and ni == 10 and wandb: - wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') - if x.exists()]}, commit=False) - -# # test whether the model in architect and ema is updated -# state_dict_m = model.state_dict() -# state_dict_a = architect.model.state_dict() -# state_dict_e = ema.model.state_dict() -# for key in state_dict_m.keys(): -# print((state_dict_m[key]-state_dict_a[key]).sum()) -# print((state_dict_m[key]-state_dict_e[key]).sum()) -# model.display_alphas() - - # end batch ------------------------------------------------------------------------------------------------ - # DDP process 0 or single-GPU + elif plots and ni == 10 and wandb_run: + wandb_run.log({ + "Mosaics": [wandb_run.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists()] + }, commit=False) if rank in [-1, 0]: - # save alpha - alpha_file = os.path.join(alpha_dir, '%d.yaml'%epoch) - alphas_yaml = {} - state_dict = ori_model.state_dict() - for key in state_dict.keys(): - if 'alpha' in key: - alphas_yaml[key] = state_dict[key].data.cpu().numpy().tolist() - with open(alpha_file, encoding='utf-8', mode='w') as f: - try: - yaml.dump(data=alphas_yaml, stream=f, allow_unicode=True) - except Exception as e: - print(e) - - geno, model_yaml = ori_model.genotype() - # save genotype - geno_file = os.path.join(geno_dir, '%d.yaml'%epoch) - with open(geno_file, encoding='utf-8', mode='w') as f: - try: - yaml.dump(data=model_yaml, stream=f, allow_unicode=True) - except Exception as e: - print(e) - print("==================") - print("normalized alphas:") - ori_model.display_alphas() - print('genotype:') - for g in geno: - print(g) - print("==================") + alpha_file = os.path.join(alpha_dir, f'{epoch}.yaml') + alphas_yaml = {} + state_dict = ori_model.state_dict() + for key in state_dict.keys(): + if 'alpha' in key: + alphas_yaml[key] = state_dict[key].data.cpu().numpy().tolist() + with open(alpha_file, encoding='utf-8', mode='w') as f: + try: + yaml.dump(data=alphas_yaml, stream=f, allow_unicode=True) + except Exception as e: + print(e) + geno, model_yaml = ori_model.genotype() + geno_file = os.path.join(geno_dir, f'{epoch}.yaml') + with open(geno_file, encoding='utf-8', mode='w') as f: + try: + yaml.dump(data=model_yaml, stream=f, allow_unicode=True) + except Exception as e: + print(e) + print("==================") + print("normalized alphas:") + ori_model.display_alphas() + print('genotype:') + for g in geno: + print(g) + print("==================") - # end epoch ---------------------------------------------------------------------------------------------------- - - # Scheduler - lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + lr = [x['lr'] for x in optimizer.param_groups] scheduler.step() - - # DDP process 0 or single-GPU if rank in [-1, 0]: - # mAP -# ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) - final_epoch = epoch + 1 == epochs - if not opt.notest or final_epoch: # Calculate mAP + if not opt.notest or (epoch + 1) == epochs: ema.apply_shadow() - results, maps, times = test.test(opt.data, - batch_size=batch_size * 2, - imgsz=imgsz_test, -# model=ema.ema, - model=model.module if is_parallel(model) else model, - single_cls=opt.single_cls, - dataloader=testloader, - save_dir=save_dir, - verbose=nc < 50 and final_epoch, - plots=plots and final_epoch, - log_imgs=opt.log_imgs if wandb else 0, - compute_loss=compute_loss) + results, maps, times = test.test( + opt.data, + batch_size=batch_size * 2, + imgsz=imgsz_test, + model=model.module if is_parallel(model) else model, + single_cls=opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + verbose=nc < 50 and (epoch + 1) == epochs, + plots=plots and (epoch + 1) == epochs, + log_imgs=opt.log_imgs if wandb_run else 0, + compute_loss=compute_loss + ) ema.restore() - - # Write with open(results_file, 'a') as f: - f.write(s + '%10.4g' * 7 % results + '\n') # append metrics, val_loss + f.write(s + '%10.4g' * 7 % results + '\n') if len(opt.name) and opt.bucket: - os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) - - # Log - tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss + os.system(f'gsutil cp {results_file} gs://{opt.bucket}/results/results{opt.name}.txt') + tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', - 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss - 'x/lr0', 'x/lr1', 'x/lr2'] # params + 'val/box_loss', 'val/obj_loss', 'val/cls_loss', + 'x/lr0', 'x/lr1', 'x/lr2'] for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: - tb_writer.add_scalar(tag, x, epoch) # tensorboard - if wandb: - wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B - - # Update best mAP - fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] + tb_writer.add_scalar(tag, x, epoch) + if wandb_run: + wandb_run.log({tag: x}, step=epoch, commit=tag == tags[-1]) + fi = fitness(np.array(results).reshape(1, -1)) if fi > best_fitness: best_fitness = fi - - # Save model - if (not opt.nosave) or (final_epoch and not opt.evolve): # if save - ckpt = {'epoch': epoch, - 'best_fitness': best_fitness, - 'training_results': results_file.read_text(), -# 'model': (model.module if is_parallel(model) else model).half(), - 'model': model.module if is_parallel(model) else model, -# 'ema': deepcopy(ema.ema).half(), - 'ema': ema.shadow, - 'updates': ema.updates, - 'optimizer': optimizer.state_dict(), - 'wandb_id': wandb_run.id if wandb else None} - # Save last, best and delete + if (not opt.nosave) or ((epoch + 1) == epochs and not opt.evolve): + ckpt = { + 'epoch': epoch, + 'best_fitness': best_fitness, + 'training_results': results_file.read_text(), + 'model': model.module if is_parallel(model) else model, + 'ema': ema.shadow, + 'updates': ema.updates, + 'optimizer': optimizer.state_dict(), + 'wandb_id': wandb_run.id if wandb_run else None + } torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt model.float() - - # end epoch ---------------------------------------------------------------------------------------------------- - # end training + scheduler.step() if rank in [-1, 0]: - # Strip optimizers - final = best if best.exists() else last # final model - for f in last, best: + final = best if best.exists() else last + for f in [last, best]: if f.exists(): - tmp = f.with_name('stripped_%s'%f.name) + tmp = f.with_name('stripped_%s' % f.name) strip_optimizer_search(f, s=tmp) if opt.bucket: - os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload - - # Plots + os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') if plots: - plot_results(save_dir=save_dir) # save as results.png - if wandb: + plot_results(save_dir=save_dir) + if wandb_run: files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]] - wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files - if (save_dir / f).exists()]}) + wandb_run.log({ + "Results": [wandb_run.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists()] + }) if opt.log_artifacts: - wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) - - # Test best.pt + wandb_run.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) -# best = best.with_name('stripped_%s'%best.name) - if opt.data.endswith('coco.yaml') and nc == 80: # if COCO - for m in (last, best) if best.exists() else (last): # speed, mAP tests - stripped_m = m.with_name('stripped_%s'%m.name) + if opt.data.endswith('coco.yaml') and nc == 80: + for m in (last, best) if best.exists() else (last,): + stripped_m = m.with_name('stripped_%s' % m.name) print(stripped_m) - results, _, _ = test.test(opt.data, - batch_size=batch_size * 2, - imgsz=imgsz_test, - conf_thres=0.001, - iou_thres=0.7, -# model=attempt_load(stripped_m, device).half(), - model=attempt_load(stripped_m, device), - single_cls=opt.single_cls, - dataloader=testloader, - save_dir=save_dir, - save_json=True, - plots=False) - + results, _, _ = test.test( + opt.data, + batch_size=batch_size * 2, + imgsz=imgsz_test, + conf_thres=0.001, + iou_thres=0.7, + model=attempt_load(stripped_m, device), + single_cls=opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + save_json=True, + plots=False + ) else: dist.destroy_process_group() - wandb.run.finish() if wandb and wandb.run else None + if wandb_run: + wandb_run.finish() torch.cuda.empty_cache() return results @@ -600,59 +503,47 @@ def valid_generator(): parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--quad', action='store_true', help='quad dataloader') parser.add_argument('--linear-lr', action='store_true', help='linear LR') - - # For NAS parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding') parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') parser.add_argument('--search_warmup', type=int, default=0, help='Epoch to Warmup the operation weights') parser.add_argument('--train_portion', type=float, default=0.5, help='portion to split the train set and search set') - opt = parser.parse_args() opt.project = '{}-{}'.format(opt.project, time.strftime("%Y%m%d-%H%M%S")) - print("Experiments dir: %s"%opt.project) - print("cfg file: %s"%opt.cfg) - - # Set DDP variables + print("Experiments dir: %s" % opt.project) + print("cfg file: %s" % opt.cfg) opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() check_requirements() - - # Resume - if opt.resume: # resume an interrupted run - ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path + if opt.resume: + ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' apriori = opt.global_rank, opt.local_rank with open(Path(ckpt).parent.parent / 'opt.yaml') as f: - opt = argparse.Namespace(**yaml.load(f, Loader=yaml.SafeLoader)) # replace - opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = '', ckpt, True, opt.total_batch_size, *apriori # reinstate + opt = argparse.Namespace(**yaml.load(f, Loader=yaml.SafeLoader)) + opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = ( + '', ckpt, True, opt.total_batch_size, *apriori + ) logger.info('Resuming training from %s' % ckpt) else: - # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') - opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files + opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' - opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) + opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) opt.name = 'evolve' if opt.evolve else opt.name - opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run - - # DDP mode + opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) opt.total_batch_size = opt.batch_size device = select_device(opt.device, batch_size=opt.batch_size) if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device('cuda', opt.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') # distributed backend + dist.init_process_group(backend='nccl', init_method='env://') assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size - - # Hyperparameters with open(opt.hyp) as f: - hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps - - # Train + hyp = yaml.load(f, Loader=yaml.SafeLoader) logger.info(opt) try: import wandb @@ -660,92 +551,88 @@ def valid_generator(): wandb = None prefix = colorstr('wandb: ') logger.info(f"{prefix}Install Weights & Biases for YOLOv5 logging with 'pip install wandb' (recommended)") + wandb_run = None + if wandb is not None and not opt.evolve and opt.global_rank in [-1, 0]: + try: + wandb_run = wandb.init( + config=opt, + resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=Path(opt.save_dir).stem, + entity=opt.entity, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None + ) + except wandb.errors.UsageError as e: + print(f"Wandb init failed: {e}. Disabling wandb logging.") + wandb_run = None if not opt.evolve: - tb_writer = None # init loggers + tb_writer = None if opt.global_rank in [-1, 0]: logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/') - tb_writer = SummaryWriter(opt.save_dir) # Tensorboard - train(hyp, opt, device, tb_writer, wandb) - - # Evolve hyperparameters (optional) + tb_writer = SummaryWriter(opt.save_dir) + train(hyp, opt, device, tb_writer, wandb_run) else: - # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) - meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) - 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) - 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 - 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay - 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) - 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum - 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr - 'box': (1, 0.02, 0.2), # box loss gain - 'cls': (1, 0.2, 4.0), # cls loss gain - 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight - 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) - 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight - 'iou_t': (0, 0.1, 0.7), # IoU training threshold - 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold - 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) - 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) - 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) - 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) - 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) - 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) - 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) - 'scale': (1, 0.0, 0.9), # image scale (+/- gain) - 'shear': (1, 0.0, 10.0), # image shear (+/- deg) - 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 - 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) - 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) - 'mosaic': (1, 0.0, 1.0), # image mixup (probability) - 'mixup': (1, 0.0, 1.0)} # image mixup (probability) - + meta = {'lr0': (1, 1e-5, 1e-1), + 'lrf': (1, 0.01, 1.0), + 'momentum': (0.3, 0.6, 0.98), + 'weight_decay': (1, 0.0, 0.001), + 'warmup_epochs': (1, 0.0, 5.0), + 'warmup_momentum': (1, 0.0, 0.95), + 'warmup_bias_lr': (1, 0.0, 0.2), + 'box': (1, 0.02, 0.2), + 'cls': (1, 0.2, 4.0), + 'cls_pw': (1, 0.5, 2.0), + 'obj': (1, 0.2, 4.0), + 'obj_pw': (1, 0.5, 2.0), + 'iou_t': (0, 0.1, 0.7), + 'anchor_t': (1, 2.0, 8.0), + 'anchors': (2, 2.0, 10.0), + 'fl_gamma': (0, 0.0, 2.0), + 'hsv_h': (1, 0.0, 0.1), + 'hsv_s': (1, 0.0, 0.9), + 'hsv_v': (1, 0.0, 0.9), + 'degrees': (1, 0.0, 45.0), + 'translate': (1, 0.0, 0.9), + 'scale': (1, 0.0, 0.9), + 'shear': (1, 0.0, 10.0), + 'perspective': (0, 0.0, 0.001), + 'flipud': (1, 0.0, 1.0), + 'fliplr': (0, 0.0, 1.0), + 'mosaic': (1, 0.0, 1.0), + 'mixup': (1, 0.0, 1.0)} assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' - opt.notest, opt.nosave = True, True # only test/save final epoch - # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices - yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here + opt.notest, opt.nosave = True, True + yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' if opt.bucket: - os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists - - for _ in range(300): # generations to evolve - if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate - # Select parent(s) - parent = 'single' # parent selection method: 'single' or 'weighted' + os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) + for _ in range(300): + if Path('evolve.txt').exists(): + parent = 'single' x = np.loadtxt('evolve.txt', ndmin=2) - n = min(5, len(x)) # number of previous results to consider - x = x[np.argsort(-fitness(x))][:n] # top n mutations - w = fitness(x) - fitness(x).min() # weights + n = min(5, len(x)) + x = x[np.argsort(-fitness(x))][:n] + w = fitness(x) - fitness(x).min() if parent == 'single' or len(x) == 1: - # x = x[random.randint(0, n - 1)] # random selection - x = x[random.choices(range(n), weights=w)[0]] # weighted selection + x = x[random.choices(range(n), weights=w)[0]] elif parent == 'weighted': - x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination - - # Mutate - mp, s = 0.8, 0.2 # mutation probability, sigma + x = (x * w.reshape(n, 1)).sum(0) / w.sum() + mp, s = 0.8, 0.2 npr = np.random npr.seed(int(time.time())) - g = np.array([x[0] for x in meta.values()]) # gains 0-1 + g = np.array([x[0] for x in meta.values()]) ng = len(meta) v = np.ones(ng) - while all(v == 1): # mutate until a change occurs (prevent duplicates) + while all(v == 1): v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) - for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) - hyp[k] = float(x[i + 7] * v[i]) # mutate - - # Constrain to limits + for i, k in enumerate(hyp.keys()): + hyp[k] = float(x[i + 7] * v[i]) for k, v in meta.items(): - hyp[k] = max(hyp[k], v[1]) # lower limit - hyp[k] = min(hyp[k], v[2]) # upper limit - hyp[k] = round(hyp[k], 5) # significant digits - - # Train mutation + hyp[k] = max(hyp[k], v[1]) + hyp[k] = min(hyp[k], v[2]) + hyp[k] = round(hyp[k], 5) results = train(hyp.copy(), opt, device, wandb=wandb) - - # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) - - # Plot results plot_evolution(yaml_file) print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}') - + diff --git a/utils/datasets.py b/utils/datasets.py index 1506682..524ef10 100644 --- a/utils/datasets.py +++ b/utils/datasets.py @@ -453,7 +453,8 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r x[:, 0] = 0 n = len(shapes) # number of images - bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + bi = np.floor(np.arange(n) / batch_size).astype(int) # batch index + # batch index nb = bi[-1] + 1 # number of batches self.batch = bi # batch index of image self.n = n @@ -481,7 +482,7 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r elif mini > 1: shapes[i] = [1, 1 / mini] - self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(int) * stride # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) self.imgs = [None] * n diff --git a/utils/loss.py b/utils/loss.py index 080b51c..9557309 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -208,7 +208,7 @@ def build_targets(self, p, targets): # Append a = t[:, 6].long() # anchor indices - indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1))) # image, anchor, grid indices + indices.append((b, a, gj.clamp_(0, int(gain[3].item()) - 1), gi.clamp_(0, int(gain[2].item()) - 1))) # image, anchor, grid indices # image, anchor, grid indices tbox.append(torch.cat((gxy - gij, gwh), 1)) # box anch.append(anchors[a]) # anchors tcls.append(c) # class