In the constructor in torchvision/models/detection/faster_rcnn.py, the image mean/std are specified; the reasons for these values were covered in earlier notes, so they are not repeated here. The constructor also specifies the minimum and maximum image side lengths, here 800 and 1333: the shorter side of the transformed image is scaled towards 800, while the longer side may not exceed 1333, so the resized image stays within 800x1333 (or 1333x800).
class FasterRCNN(GeneralizedRCNN):
    def __init__(self, backbone, num_classes=None,
                 # transform parameters
                 min_size=800, max_size=1333,
                 ......
        ......
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
        ......
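As a quick sanity check, these transform parameters can also be passed explicitly when building the model. A minimal sketch, using the torchvision builder that matches the code above (the values shown are simply the defaults):

from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Build the detector with the transform parameters made explicit
# (800 / 1333 are just the defaults discussed above).
model = fasterrcnn_resnet50_fpn(pretrained=False, min_size=800, max_size=1333)
print(model.transform)  # the GeneralizedRCNNTransform holding min_size/max_size and mean/std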
Before data is fed to the network, this transform module is applied to both the inputs and the targets.

class GeneralizedRCNNTransform(nn.Module):
    def forward(self,
                images,       # type: List[Tensor]
                targets=None  # type: Optional[List[Dict[str, Tensor]]]
                ):
        ......
        for i in range(len(images)):
            ......
            image = self.normalize(image)
            image, target_index = self.resize(image, target_index)
            ......
        image_sizes = [img.shape[-2:] for img in images]
        images = self.batch_images(images)
        ......

The input is a list of images and the output is the batch of transformed image tensors. The first step is to normalize each image:
    def normalize(self, image):
        dtype, device = image.dtype, image.device
        mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
        std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
        return (image - mean[:, None, None]) / std[:, None, None]

After this step, every pixel value is distributed around 0, which is easier for the network to handle.
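A minimal sketch of how this broadcasting works, using a made-up 3x4x4 image:

import torch

image = torch.rand(3, 4, 4)                  # dummy CHW image with values in [0, 1]
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])

# mean[:, None, None] has shape (3, 1, 1), so it broadcasts over H and W.
out = (image - mean[:, None, None]) / std[:, None, None]
print(out.mean(dim=(1, 2)))                  # per-channel means, now roughly centered around 0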
Next comes the resize step, which first works out whether the scale factor is determined by the shorter side (to reach min_size) or by the longer side (to stay within max_size).
def _resize_image_and_masks(image, self_min_size, self_max_size, target):
    # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
    im_shape = torch.tensor(image.shape[-2:])
    min_size = float(torch.min(im_shape))
    max_size = float(torch.max(im_shape))
    scale_factor = self_min_size / min_size
    if max_size * scale_factor > self_max_size:
        scale_factor = self_max_size / max_size
    image = torch.nn.functional.interpolate(
        image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True,
        align_corners=False)[0]

    if target is None:
        return image, target
    ......
    return image, target


class GeneralizedRCNNTransform(nn.Module):
    ......
    def resize(self, image, target):
        # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
        h, w = image.shape[-2:]
        if self.training:
            size = float(self.torch_choice(self.min_size))
        else:
            # FIXME assume for now that testing uses the largest scale
            size = float(self.min_size[-1])
        if torchvision._is_tracing():
            image, target = _resize_image_and_masks_onnx(image, size, float(self.max_size), target)
        else:
            image, target = _resize_image_and_masks(image, size, float(self.max_size), target)

First the image height and width are read, here (599, 900):
    im_shape = torch.tensor(image.shape[-2:])

Then the minimum and maximum of the two sides are taken (599 and 900):
    min_size = float(torch.min(im_shape))
    max_size = float(torch.max(im_shape))

Based on the required minimum side length, the scale factor is computed (800 / 599 ≈ 1.3356):
    scale_factor = self_min_size / min_size

The code then checks whether the longer side scaled by this factor (900 * 800 / 599 ≈ 1202.0) would exceed the configured maximum (1333). If it would, the scale factor is recomputed from the longer side instead; here it does not, so this branch is skipped:
    if max_size * scale_factor > self_max_size:
        scale_factor = self_max_size / max_size

torch.nn.functional.interpolate then performs bilinear resizing, yielding a (3, 800, 1202) image tensor:
    image = torch.nn.functional.interpolate(
        image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True,
        align_corners=False)[0]
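The whole resize computation can be replayed with plain numbers. A small sketch using this example's values (599x900 input, min_size 800, max_size 1333):

h, w = 599, 900
min_side, max_side = 800.0, 1333.0

scale = min_side / min(h, w)           # 800 / 599 ≈ 1.3356
if max(h, w) * scale > max_side:       # 900 * 1.3356 ≈ 1202.0 < 1333, so the cap is not applied here
    scale = max_side / max(h, w)

print(round(h * scale), round(w * scale))   # -> 800 1202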
batch_images then takes the largest value along each dimension across the batch (for example 3, 800, 1202) together with the configured stride (size_divisible, 32 by default):

    def batch_images(self, images, size_divisible=32):
        # type: (List[Tensor], int) -> Tensor
        if torchvision._is_tracing():
            # batch_images() does not export well to ONNX
            # call _onnx_batch_images() instead
            return self._onnx_batch_images(images, size_divisible)

        max_size = self.max_by_axis([list(img.shape) for img in images])
        stride = float(size_divisible)

Height and width are then rounded up to multiples of the stride: 800 is already divisible by 32 and stays unchanged, while 1202 is rounded up to 1216:
        max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
        max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

Each image is then copied into the top-left corner of a larger zero-filled canvas, so padding (black pixels) is added on the right and/or at the bottom; this yields a (3, 800, 1216) image tensor.
        batch_shape = [len(images)] + max_size
        batched_imgs = images[0].new_full(batch_shape, 0)
        for img, pad_img in zip(images, batched_imgs):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
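A minimal sketch of the stride alignment and zero padding for a single image (stride 32, sizes from this example):

import math
import torch

img = torch.rand(3, 800, 1202)                          # the resized image from above
stride = 32
padded_h = int(math.ceil(800 / stride) * stride)        # 800 (already a multiple of 32)
padded_w = int(math.ceil(1202 / stride) * stride)       # 1216

batched = img.new_full((1, 3, padded_h, padded_w), 0)   # zero-filled (black) canvas
batched[0, :, :img.shape[1], :img.shape[2]].copy_(img)  # original image in the top-left corner
print(batched.shape)                                    # torch.Size([1, 3, 800, 1216])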
This is why three different image sizes show up in the code:

original_image_sizes: (599, 900), the coordinates that the final bounding boxes are mapped back to
image_sizes: (800, 1202), the coordinates in which the generated proposals and boxes live
feature maps: based on (800, 1216); the feature map sizes at all levels are then:

name            size
input image     800x1216
conv1/maxpool   200x304
conv2_x         100x152
conv3_x         50x76
conv4_x         25x38
conv5_x         13x19
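These sizes follow from dividing the padded input by each level's stride and rounding up. A small sketch, with the per-level strides inferred from the table itself (an assumption, not taken from the source):

import math

# Assumed strides per level; the padded input is 800x1216 as above.
strides = {'conv1/maxpool': 4, 'conv2_x': 8, 'conv3_x': 16, 'conv4_x': 32, 'conv5_x': 64}
for name, s in strides.items():
    print(name, math.ceil(800 / s), 'x', math.ceil(1216 / s))
# conv1/maxpool 200 x 304 ... conv5_x 13 x 19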
In the RoI heads, the proposals are finally clipped to the region corresponding to image_size.

def clip_boxes_to_image(boxes, size):
    dim = boxes.dim()
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        boxes_x = boxes_x.clamp(min=0, max=width)
        boxes_y = boxes_y.clamp(min=0, max=height)

    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
    return clipped_boxes.reshape(boxes.shape)


class RoIHeads(torch.nn.Module):
    def postprocess_detections(self,
                               class_logits,    # type: Tensor
                               box_regression,  # type: Tensor
                               proposals,       # type: List[Tensor]
                               image_shapes     # type: List[Tuple[int, int]]
                               ):
        ......
        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)
            ......
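A quick usage sketch of clip_boxes_to_image with the image_size from this example (the box values are hypothetical):

import torch
from torchvision.ops import clip_boxes_to_image

# Boxes in (x1, y1, x2, y2) form; the second one sticks out past the image border.
boxes = torch.tensor([[100.0, 50.0, 400.0, 300.0],
                      [1100.0, 700.0, 1300.0, 900.0]])
clipped = clip_boxes_to_image(boxes, (800, 1202))   # size is (height, width)
print(clipped)   # the second box is clamped to x <= 1202 and y <= 800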
Finally, the coordinates in image_sizes are converted back to original_image_sizes. This happens in transform.postprocess, which maps boxes from image_sizes (800, 1202) back to original_image_sizes (599, 900):

class GeneralizedRCNN(nn.Module):
    def forward(self, images, targets=None):
        ......
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([('0', features)])
        proposals, proposal_losses = self.rpn(images, features, targets)
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
        detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
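For the boxes, this postprocessing amounts to scaling each coordinate by the ratio between the original and the resized image size. A minimal sketch with this example's sizes (scale_boxes is a hypothetical helper, and the box values are made up):

import torch

def scale_boxes(boxes, from_size, to_size):
    # from_size / to_size are (height, width); boxes are (x1, y1, x2, y2)
    ratio_h = to_size[0] / from_size[0]
    ratio_w = to_size[1] / from_size[1]
    x1, y1, x2, y2 = boxes.unbind(dim=1)
    return torch.stack((x1 * ratio_w, y1 * ratio_h, x2 * ratio_w, y2 * ratio_h), dim=1)

# A hypothetical detection in image_sizes coordinates (800, 1202), mapped back to (599, 900).
boxes = torch.tensor([[100.0, 50.0, 400.0, 300.0]])
print(scale_boxes(boxes, (800, 1202), (599, 900)))   # ≈ [[74.9, 37.4, 299.5, 224.6]]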