# Source code for rofunc.utils.visualab.segment.vlpart.vlpart_fast_rcnn

# Copyright (c) Facebook, Inc. and its affiliates.
# VLPart: Going denser with open-vocabulary part segmentation 
# Written by Peize Sun and Shoufa Chen
import torch
from detectron2.layers import ShapeSpec, cat
from torch import nn
from torch.nn import functional as F


class TexeEmbedClassifier(nn.Module):
    """Score features against text embeddings via a learned linear projection.

    The input features are projected to ``zs_weight_dim`` channels by a single
    ``nn.Linear`` layer, optionally L2-normalized and scaled by
    ``norm_temperature``, and finally matrix-multiplied with the supplied
    text embeddings.

    Args:
        input_shape: Shape of the incoming features. A plain ``int`` is also
            accepted and is treated as the channel count. The projection's
            input size is ``channels * (width or 1) * (height or 1)``.
        zs_weight_dim: Output size of the linear projection. It has to match
            the first dimension of the ``text_embed`` passed to ``forward``,
            because the two are multiplied with ``torch.mm``.
        norm_weight: When true, the projected features are L2-normalized
            along dim 1 and multiplied by ``norm_temperature``.
        norm_temperature: Scale applied to the normalized features.
    """

    def __init__(
        self,
        input_shape: ShapeSpec,
        zs_weight_dim: int = 1024,
        norm_weight: bool = True,
        norm_temperature: float = 50.0,
    ):
        super().__init__()
        if isinstance(input_shape, int):  # some backward compatibility
            input_shape = ShapeSpec(channels=input_shape)
        # Missing (None/0) spatial dims count as 1, i.e. already-flat features.
        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
        self.norm_weight = norm_weight
        self.norm_temperature = norm_temperature

        self.linear = nn.Linear(input_size, zs_weight_dim)

    def forward(self, x, text_embed):
        """Return the product of the projected features and ``text_embed``.

        Args:
            x: 2-D tensor of shape ``(N, input_size)`` (``torch.mm`` only
                accepts matrices).
            text_embed: 2-D tensor of shape ``(zs_weight_dim, K)``;
                presumably one column per text/category embedding -- confirm
                against the caller.

        Returns:
            Tensor of shape ``(N, K)`` holding the raw scores.
        """
        x = self.linear(x)
        if self.norm_weight:
            # Unit-length rows, rescaled by the temperature.
            x = self.norm_temperature * F.normalize(x, p=2, dim=1)
        return torch.mm(x, text_embed)


class VLMFastRCNNOutputLayers(nn.Module):
    """Box-head output layers: text-embedding classifier plus box regressor.

    Two heads run on the same (flattened) features:

    * ``cls_score`` -- a ``TexeEmbedClassifier`` that scores the features
      against the text embeddings given to ``forward``.
    * ``bbox_pred`` -- a two-layer MLP (Linear -> ReLU -> Linear) with
      4 outputs per input row.

    Args:
        input_shape: Shape of the incoming features. A plain ``int`` is also
            accepted and is treated as the channel count.
        box2box_transform: Object exposing
            ``apply_deltas(deltas, boxes)``; used by ``predict_boxes``.
        use_sigmoid_ce: If true ``predict_probs`` applies a sigmoid to the
            scores, otherwise a softmax over the last dimension.
        test_score_thresh: Stored on the instance; not read by any method
            of this class.
        test_nms_thresh: Stored on the instance; not read by any method of
            this class.
        test_topk_per_image: Stored on the instance; not read by any method
            of this class.
    """

    def __init__(
        self,
        input_shape: ShapeSpec,
        box2box_transform,
        use_sigmoid_ce: bool = True,
        test_score_thresh: float = 0.0,
        test_nms_thresh: float = 0.5,
        test_topk_per_image: int = 100,
    ):
        super().__init__()
        if isinstance(input_shape, int):  # some backward compatibility
            input_shape = ShapeSpec(channels=input_shape)

        self.box2box_transform = box2box_transform
        self.use_sigmoid_ce = use_sigmoid_ce
        self.test_score_thresh = test_score_thresh
        self.test_nms_thresh = test_nms_thresh
        self.test_topk_per_image = test_topk_per_image

        # Missing (None/0) spatial dims count as 1, i.e. already-flat features.
        input_size = (
            input_shape.channels
            * (input_shape.width or 1)
            * (input_shape.height or 1)
        )

        # bbox_pred
        self.bbox_pred = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.ReLU(inplace=True),
            nn.Linear(input_size, 4),
        )
        # cls_score
        self.cls_score = TexeEmbedClassifier(input_shape)

    def forward(self, x, text_embed):
        """Compute classification scores and box deltas for the features.

        Args:
            x: Feature tensor; anything with more than two dimensions is
                flattened from dim 1 onwards before use.
            text_embed: Text embeddings forwarded unchanged to
                ``cls_score``.

        Returns:
            A ``(cls_scores, proposal_deltas)`` tuple, where
            ``proposal_deltas`` has 4 values per input row.
        """
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        cls_scores = self.cls_score(x, text_embed)
        proposal_deltas = self.bbox_pred(x)

        return cls_scores, proposal_deltas

    def predict_boxes(self, predictions, proposals):
        """Apply the predicted deltas to the proposal boxes, split per image.

        Args:
            predictions: The ``(cls_scores, proposal_deltas)`` pair returned
                by ``forward``; only the deltas are used.
            proposals: Sequence with one entry per image. Each entry must
                support ``len()`` and expose ``proposal_boxes.tensor``.

        Returns:
            An empty list when ``proposals`` is empty; otherwise the result
            of ``box2box_transform.apply_deltas`` split into one chunk per
            image.
        """
        if not len(proposals):
            return []
        _, proposal_deltas = predictions
        num_prop_per_image = [len(p) for p in proposals]
        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)

        predict_boxes = self.box2box_transform.apply_deltas(
            proposal_deltas,
            proposal_boxes,
        )  # Nx(KxB)
        return predict_boxes.split(num_prop_per_image)

    def predict_probs(self, predictions, proposals):
        """Turn raw classification scores into per-image probabilities.

        Args:
            predictions: The ``(cls_scores, proposal_deltas)`` pair returned
                by ``forward``; only the scores are used.
            proposals: Sequence with one entry per image; only ``len()`` of
                each entry is used, to split the scores along dim 0.

        Returns:
            A list with one tensor per image: ``sigmoid`` of the scores when
            ``use_sigmoid_ce`` is true, otherwise a softmax over the last
            dimension.
        """
        cls_scores, _ = predictions
        num_inst_per_image = [len(p) for p in proposals]
        per_image_scores = cls_scores.split(num_inst_per_image, dim=0)

        return [
            scores.sigmoid() if self.use_sigmoid_ce else F.softmax(scores, dim=-1)
            for scores in per_image_scores
        ]