The RPN selects 1000 proposals from the feature maps output by the FPN, along with a foreground score for each proposal. Let's first look at the overall architecture[^1]. The RPN consists of these main parts:
- **RPN Head**: a convolutional layer followed by two heads, one producing foreground-score logits (classification) and one regressing the bounding-box delta values
- **Anchor generator**: produces anchors for each level of the feature maps
- **Reshape**: concatenates the per-level outputs of the RPN Head
- **BoxCoder decoder**: converts the delta values output by the conv layers, together with the anchors, into boxes given by top-left and bottom-right corner coordinates
- **Proposal filter**: selects the 1000 proposals (box coordinates plus foreground scores)

The 1000 proposal boxes it produces look roughly like this:
The corresponding code is:
```python
import torch
from torch import nn
from torch.nn import functional as F

class RPNHead(nn.Module):
    """
    Adds a simple RPN Head with classification and regression heads

    Arguments:
        in_channels (int): number of channels of the input feature
        num_anchors (int): number of anchors to be predicted
    """

    def __init__(self, in_channels, num_anchors):
        super(RPNHead, self).__init__()
        # 3x3 conv shared by both heads
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )
        # 1x1 conv: one foreground logit per anchor per location
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        # 1x1 conv: four box deltas per anchor per location
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

        for layer in self.children():
            torch.nn.init.normal_(layer.weight, std=0.01)
            torch.nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        # type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        logits = []
        bbox_reg = []
        for feature in x:
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return logits, bbox_reg
```

For the underlying theory, see the detailed explanation of how the Faster R-CNN RPN is trained and what its training parameters are.
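As a quick sanity check, here is a minimal sketch (assuming 256-channel FPN features and 3 anchors per location, matching the configuration discussed below) of the RPN Head output shapes:

```python
# Assumed setup: feature-map sizes from the 800x1216 example used throughout.
head = RPNHead(in_channels=256, num_anchors=3)
sizes = [(200, 304), (100, 152), (50, 76), (25, 38), (13, 19)]
feats = [torch.randn(1, 256, h, w) for h, w in sizes]
logits, deltas = head(feats)
print(logits[0].shape)  # torch.Size([1, 3, 200, 304]):  1 logit per anchor
print(deltas[0].shape)  # torch.Size([1, 12, 200, 304]): 4 deltas per anchor
```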
First, the AnchorGenerator produces a set of base anchors, 15 in total (3 per level across 5 levels; all anchors on a level share the same area, e.g. $32^2$ on the first level, $64^2$ on the second, and so on).
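For reference, this is a sketch of how those per-level sizes and aspect ratios would be configured with torchvision's AnchorGenerator (the exact tuples here are my assumption, chosen to match the table below; the import path varies slightly across torchvision versions):

```python
from torchvision.models.detection.rpn import AnchorGenerator

# one size per FPN level, the same three aspect ratios on every level
anchor_generator = AnchorGenerator(
    sizes=((32,), (64,), (128,), (256,), (512,)),
    aspect_ratios=((0.5, 1.0, 2.0),) * 5,
)
```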
The resulting 15 base anchors are:

```python
# Feature Level#0 (size: 32, aspect-ratio [2:1, 1:1, 1:2])
[ -23.,  -11.,   23.,   11.],
[ -16.,  -16.,   16.,   16.],
[ -11.,  -23.,   11.,   23.],
# Feature Level#1 (size: 64, aspect-ratio [2:1, 1:1, 1:2])
[ -45.,  -23.,   45.,   23.],
[ -32.,  -32.,   32.,   32.],
[ -23.,  -45.,   23.,   45.],
# Feature Level#2 (size: 128, aspect-ratio [2:1, 1:1, 1:2])
[ -91.,  -45.,   91.,   45.],
[ -64.,  -64.,   64.,   64.],
[ -45.,  -91.,   45.,   91.],
# Feature Level#3 (size: 256, aspect-ratio [2:1, 1:1, 1:2])
[-181.,  -91.,  181.,   91.],
[-128., -128.,  128.,  128.],
[ -91., -181.,   91.,  181.],
# Feature Level#4 (size: 512, aspect-ratio [2:1, 1:1, 1:2])
[-362., -181.,  362.,  181.],
[-256., -256.,  256.,  256.],
[-181., -362.,  181.,  362.]
```

| Layer | anchor_area | aspect_ratios | Anchors per location | Feature-map size | Number of anchors |
|---|---|---|---|---|---|
| Feature-Map#0 | $32^2$ | {2:1, 1:1, 1:2} | 3 | 200x304 | 182400 |
| Feature-Map#1 | $64^2$ | {2:1, 1:1, 1:2} | 3 | 100x152 | 45600 |
| Feature-Map#2 | $128^2$ | {2:1, 1:1, 1:2} | 3 | 50x76 | 11400 |
| Feature-Map#3 | $256^2$ | {2:1, 1:1, 1:2} | 3 | 25x38 | 2850 |
| Feature-Map#4 | $512^2$ | {2:1, 1:1, 1:2} | 3 | 13x19 | 741 |
| Total | | | | | 242991 |

How are these values generated from a size and a set of aspect ratios?
```python
def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device="cpu"):
    # type: (List[int], List[float], int, Device) -> Tensor  # noqa: F821
    scales = torch.as_tensor(scales, dtype=dtype, device=device)
    aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
    h_ratios = torch.sqrt(aspect_ratios)
    w_ratios = 1 / h_ratios

    # outer product: every (ratio, scale) pair gives one (w, h)
    ws = (w_ratios[:, None] * scales[None, :]).view(-1)
    hs = (h_ratios[:, None] * scales[None, :]).view(-1)

    # zero-centered (x1, y1, x2, y2) boxes
    base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
    return base_anchors.round()
```

For example, with aspect_ratios [0.5, 1.0, 2.0] and scale 32, the first anchor is

$$\left[-\frac{1}{\sqrt{0.5}}\times\frac{32}{2},\ -\sqrt{0.5}\times\frac{32}{2},\ \frac{1}{\sqrt{0.5}}\times\frac{32}{2},\ \sqrt{0.5}\times\frac{32}{2}\right] \to [-23., -11., 23., 11.]$$

and its area is

$$[23-(-23)]\times[11-(-11)] = 46\times22 = 1012 \approx 32^2$$
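A few lines of plain Python (a standalone sketch, not library code) reproduce that first anchor:

```python
import math

scale, ratio = 32, 0.5                   # first level, 2:1 (w:h) anchor
h_ratio = math.sqrt(ratio)               # ~0.707
w_ratio = 1 / h_ratio                    # ~1.414
w, h = w_ratio * scale, h_ratio * scale  # ~45.3 x ~22.6, area close to 32^2
print([round(-w / 2), round(-h / 2), round(w / 2), round(h / 2)])
# [-23, -11, 23, 11]
```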
The 15 base anchors above are replicated at every location of every feature level, and the anchors they produce are all expressed in the unified input-image coordinate system, which is where the notion of strides comes in:
```python
[[tensor(4), tensor(4)],
 [tensor(8), tensor(8)],
 [tensor(16), tensor(16)],
 [tensor(32), tensor(32)],
 [tensor(61), tensor(64)]]
```

On each level's feature map, the grid positions are multiplied by that level's strides, and the base_anchors are then added to obtain the final anchors. Here is the corresponding code:
```python
# For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
# output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
def grid_anchors(self, grid_sizes, strides):
    # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
    anchors = []
    cell_anchors = self.cell_anchors
    assert cell_anchors is not None

    for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
        grid_height, grid_width = size
        stride_height, stride_width = stride
        device = base_anchors.device

        # For output anchor, compute [x_center, y_center, x_center, y_center]
        shifts_x = torch.arange(
            0, grid_width, dtype=torch.float32, device=device
        ) * stride_width
        shifts_y = torch.arange(
            0, grid_height, dtype=torch.float32, device=device
        ) * stride_height
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

        # For every (base anchor, output anchor) pair,
        # offset each zero-centered base anchor by the center of the output anchor.
        anchors.append(
            (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
        )

    return anchors
```
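To make the meshgrid step concrete, here is a small sketch for the coarsest level, assuming the 13x19 grid and [61, 64] strides from the table above:

```python
import torch

# 19 x-offsets and 13 y-offsets, in input-image coordinates
shifts_x = torch.arange(0, 19, dtype=torch.float32) * 64
shifts_y = torch.arange(0, 13, dtype=torch.float32) * 61
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x, shift_y = shift_x.reshape(-1), shift_y.reshape(-1)
shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
print(shifts.shape)  # torch.Size([247, 4]); 247 centers x 3 base anchors = 741
```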
Each image thus gets a list of five anchor tensors, one per level, which need to be concatenated so that they correspond to the RPN Head outputs:

```python
class AnchorGenerator(nn.Module):
    ......

    def forward(self, image_list, feature_maps):
        # type: (ImageList, List[Tensor]) -> List[Tensor]
        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
        image_size = image_list.tensors.shape[-2:]
        dtype, device = feature_maps[0].dtype, feature_maps[0].device
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)]
                   for g in grid_sizes]
        self.set_cell_anchors(dtype, device)
        anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)
        anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
            anchors_in_image = []
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                anchors_in_image.append(anchors_per_feature_map)
            anchors.append(anchors_in_image)
        anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
        # Clear the cache in case that memory leaks.
        self._cache.clear()
        return anchors
```

The objectness and pred_bbox_deltas produced by the RPN Head above are likewise lists of five tensors; they need to be concatenated and converted into tensors with the same format and layout as the anchors.
```python
def concat_box_prediction_layers(box_cls, box_regression):
    # type: (List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    box_cls_flattened = []
    box_regression_flattened = []
    # for each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness and the box_regression
    for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
        N, AxC, H, W = box_cls_per_level.shape
        Ax4 = box_regression_per_level.shape[1]
        A = Ax4 // 4
        C = AxC // A
        box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
        box_cls_flattened.append(box_cls_per_level)

        box_regression_per_level = permute_and_flatten(
            box_regression_per_level, N, A, 4, H, W
        )
        box_regression_flattened.append(box_regression_per_level)
    # concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2)
    box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
    return box_cls, box_regression
```

The resulting anchors, objectness scores (foreground classification), and box delta regressions all end up in the same per-level, per-location, per-anchor order.
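For reference, the permute_and_flatten helper used above (as found in torchvision's detection utilities) moves the anchor dimension next to the spatial ones, so that flattening preserves exactly that order:

```python
def permute_and_flatten(layer, N, A, C, H, W):
    # type: (Tensor, int, int, int, int, int) -> Tensor
    layer = layer.view(N, -1, C, H, W)    # (N, A, C, H, W)
    layer = layer.permute(0, 3, 4, 1, 2)  # (N, H, W, A, C)
    layer = layer.reshape(N, -1, C)       # (N, H*W*A, C)
    return layer
```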
What the RPN Head outputs are the predicted delta values $[d_x(p), d_y(p), d_w(p), d_h(p)]$. In the figure below, the box with the black border is the anchor and the box with the red border is the predicted proposal; the decoder turns the deltas into the proposal's corner coordinates $(\color{red}{x_{bbox\_top\_left}, y_{bbox\_top\_left}, x_{bbox\_bottom\_right}, y_{bbox\_bottom\_right}})$.
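Concretely, for an anchor with center $(x_a, y_a)$, width $w_a$ and height $h_a$, the standard R-CNN decoding implemented below is

$$x = d_x(p)\,w_a + x_a \qquad y = d_y(p)\,h_a + y_a \qquad w = w_a\,e^{d_w(p)} \qquad h = h_a\,e^{d_h(p)}$$

and the corner coordinates are then $(x - w/2,\ y - h/2,\ x + w/2,\ y + h/2)$.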
```python
class BoxCoder(object):
    ......

    def decode(self, rel_codes, boxes):
        # type: (Tensor, List[Tensor]) -> Tensor
        assert isinstance(boxes, (list, tuple))
        assert isinstance(rel_codes, torch.Tensor)
        boxes_per_image = [b.size(0) for b in boxes]
        concat_boxes = torch.cat(boxes, dim=0)
        box_sum = 0
        for val in boxes_per_image:
            box_sum += val
        pred_boxes = self.decode_single(
            rel_codes.reshape(box_sum, -1), concat_boxes
        )
        return pred_boxes.reshape(box_sum, -1, 4)

    def decode_single(self, rel_codes, boxes):
        """
        From a set of original boxes and encoded relative box offsets,
        get the decoded boxes.

        Arguments:
            rel_codes (Tensor): encoded boxes
            boxes (Tensor): reference boxes.
        """
        boxes = boxes.to(rel_codes.dtype)

        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        ctr_x = boxes[:, 0] + 0.5 * widths
        ctr_y = boxes[:, 1] + 0.5 * heights

        wx, wy, ww, wh = self.weights
        dx = rel_codes[:, 0::4] / wx
        dy = rel_codes[:, 1::4] / wy
        dw = rel_codes[:, 2::4] / ww
        dh = rel_codes[:, 3::4] / wh

        # Prevent sending too large values into torch.exp()
        dw = torch.clamp(dw, max=self.bbox_xform_clip)
        dh = torch.clamp(dh, max=self.bbox_xform_clip)

        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
        pred_w = torch.exp(dw) * widths[:, None]
        pred_h = torch.exp(dh) * heights[:, None]

        pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
        pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
        pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
        pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
        return pred_boxes
```

Next, for each of the five feature levels, the top 1000 highest-scoring boxes are selected:
| Layer | Anchors per location | Feature-map size | Number of bboxes | min(1000, num_of_bboxes) |
|---|---|---|---|---|
| Feature-Map#0 | 3 | 200x304 | 182400 | 1000 |
| Feature-Map#1 | 3 | 100x152 | 45600 | 1000 |
| Feature-Map#2 | 3 | 50x76 | 11400 | 1000 |
| Feature-Map#3 | 3 | 25x38 | 2850 | 1000 |
| Feature-Map#4 | 3 | 13x19 | 741 | 741 |
| Total | | | 242991 | 4741 |

So this step selects 4741 bboxes in total.
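This per-level top-k selection mirrors _get_top_n_idx in torchvision; here is a small sketch using assumed dummy scores:

```python
import torch

num_anchors_per_level = [182400, 45600, 11400, 2850, 741]
objectness = torch.randn(1, sum(num_anchors_per_level))  # dummy scores, shape (N, 242991)

offset, top_idx = 0, []
for n in num_anchors_per_level:
    k = min(1000, n)  # keep at most 1000 boxes per level
    _, idx = objectness[:, offset:offset + n].topk(k, dim=1)
    top_idx.append(idx + offset)  # shift back to global anchor indices
    offset += n
top_idx = torch.cat(top_idx, dim=1)
print(top_idx.shape)  # torch.Size([1, 4741]) = 4 * 1000 + 741
```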
This step clips the proposal boxes from the padded 800x1216 region down to the actual 800x1202 image area, which avoids including the padded borders. The clipping is a simple coordinate clamp (see the sketch below); after that, boxes that are too small are removed[^2].
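A simplified sketch of box_ops.clip_boxes_to_image, which clamps x coordinates to [0, width] and y coordinates to [0, height]:

```python
import torch

def clip_boxes_to_image(boxes, size):
    # boxes: Tensor[N, 4] in (x1, y1, x2, y2); size: (height, width)
    height, width = size
    boxes_x = boxes[..., 0::2].clamp(min=0, max=width)   # x1, x2
    boxes_y = boxes[..., 1::2].clamp(min=0, max=height)  # y1, y2
    clipped = torch.stack((boxes_x, boxes_y), dim=boxes.dim())
    return clipped.reshape(boxes.shape)
```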
remove_small_boxes then drops any box whose width or height falls below min_size:

```python
def remove_small_boxes(boxes, min_size):
    # type: (Tensor, float) -> Tensor
    """
    Remove boxes which contains at least one side smaller than min_size.

    Arguments:
        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
        min_size (float): minimum size

    Returns:
        keep (Tensor[K]): indices of the boxes that have both sides
            larger than min_size
    """
    ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
    keep = (ws >= min_size) & (hs >= min_size)
    keep = keep.nonzero().squeeze(1)
    return keep
```

Next, non-maximum suppression (NMS) is applied to the proposal boxes and scores independently per feature level, filtering the proposals further; if more than 1000 proposals survive, only the top 1000 are kept. The default nms_thresh here is 0.7.
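The per-level independence comes from box_ops.batched_nms, which offsets each box by its level index times a value larger than any coordinate, so boxes from different levels can never overlap during a single NMS pass. A simplified sketch:

```python
import torch
import torchvision

def batched_nms(boxes, scores, idxs, iou_threshold):
    # boxes: Tensor[N, 4]; idxs: Tensor[N], feature-level index of each box
    max_coordinate = boxes.max()
    offsets = idxs.to(boxes) * (max_coordinate + 1)  # push levels apart
    boxes_for_nms = boxes + offsets[:, None]
    return torchvision.ops.nms(boxes_for_nms, scores, iou_threshold)
```

The filter_proposals method ties all of these steps together: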
```python
def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
    # type: (Tensor, Tensor, List[Tuple[int, int]], List[int]) -> Tuple[List[Tensor], List[Tensor]]
    num_images = proposals.shape[0]
    device = proposals.device
    # do not backprop through objectness
    objectness = objectness.detach()
    objectness = objectness.reshape(num_images, -1)

    levels = [
        torch.full((n,), idx, dtype=torch.int64, device=device)
        for idx, n in enumerate(num_anchors_per_level)
    ]
    levels = torch.cat(levels, 0)
    levels = levels.reshape(1, -1).expand_as(objectness)

    # select top_n boxes independently per level before applying nms
    top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)

    image_range = torch.arange(num_images, device=device)
    batch_idx = image_range[:, None]

    objectness = objectness[batch_idx, top_n_idx]
    levels = levels[batch_idx, top_n_idx]
    proposals = proposals[batch_idx, top_n_idx]

    final_boxes = []
    final_scores = []
    for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
        boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
        keep = box_ops.remove_small_boxes(boxes, self.min_size)
        boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
        # non-maximum suppression, independently done per level
        keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
        # keep only topk scoring predictions
        keep = keep[:self.post_nms_top_n()]
        boxes, scores = boxes[keep], scores[keep]
        final_boxes.append(boxes)
        final_scores.append(scores)
    return final_boxes, final_scores
```

Writing this up was no small effort; a like would be much appreciated! 😃
[^1]: Assuming the input image is 599 pixels high and 900 pixels wide; after the transform, the feature maps are based on an (800x1216) input.
[^2]: By default, boxes whose width or height is below the threshold (0.01) are discarded.