Add CLIP fine-tuning pipeline for logo recognition

Implement contrastive learning with LoRA to fine-tune CLIP's vision
encoder on LogoDet-3K dataset for improved logo embedding similarity.

New training module (training/):
- config.py: TrainingConfig dataclass with all hyperparameters
- dataset.py: LogoContrastiveDataset with logo-level splits
- model.py: LogoFineTunedCLIP wrapper with LoRA support
- losses.py: InfoNCE, TripletLoss, SupConLoss implementations
- trainer.py: Training loop with mixed precision and checkpointing
- evaluation.py: EmbeddingEvaluator for validation metrics
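
A rough sketch of how these pieces are meant to compose (only losses.py is
shown in the diff below, so the constructor arguments and the trainer class
name here are illustrative assumptions, not the exact API):

    from training.config import TrainingConfig
    from training.dataset import LogoContrastiveDataset
    from training.model import LogoFineTunedCLIP
    from training.losses import get_loss_function
    from training.trainer import Trainer  # name assumed; trainer.py holds the loop

    config = TrainingConfig()                                  # hyperparameters
    dataset = LogoContrastiveDataset(config)                   # logo-level splits
    model = LogoFineTunedCLIP(config)                          # CLIP vision encoder + LoRA
    loss_fn = get_loss_function("infonce", temperature=0.07)   # from losses.py below
    Trainer(model, dataset, loss_fn, config).train()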

New scripts:
- train_clip_logo.py: Main training entry point
- export_model.py: Export to HuggingFace-compatible format

Configurations:
- configs/jetson_orin.yaml: Optimized for Jetson Orin AGX
- configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs
- configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs

Documentation:
- CLIP_FINETUNING.md: Training guide and usage instructions
- CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates

Modified:
- logo_detection_detr.py: Add fine-tuned model loading support
- pyproject.toml: Add peft, pyyaml, torchvision dependencies
Author: Rick McEwen
Date: 2026-01-04 13:45:25 -05:00
parent 1551360028
commit 44e8b6ae7d
16 changed files with 3334 additions and 12 deletions

training/losses.py (new file, 326 lines)
@@ -0,0 +1,326 @@
"""
Loss functions for contrastive learning on logo embeddings.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional


class InfoNCELoss(nn.Module):
"""
Normalized Temperature-scaled Cross Entropy Loss (InfoNCE).
    A supervised variant of the contrastive loss used to train CLIP. It
    maximizes similarity between embeddings of the same logo class while
    minimizing similarity to embeddings of different classes.

    For a batch with N samples:
    - Each sample is an anchor
    - Positive pairs: samples with the same label
    - Negative pairs: samples with different labels

    The loss for each anchor is averaged over its positives p:
        -log(exp(sim(anchor, p)/temp) / sum_{k != anchor} exp(sim(anchor, k)/temp))
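
    Example (illustrative only; batch size and embedding dimension are
    arbitrary, not tied to a particular CLIP checkpoint):
        >>> loss_fn = InfoNCELoss(temperature=0.07)
        >>> emb = F.normalize(torch.randn(8, 512), dim=-1)
        >>> labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
        >>> loss = loss_fn(emb, labels)  # scalar tensor, backprop-ready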
"""
def __init__(self, temperature: float = 0.07):
"""
Initialize InfoNCE loss.
Args:
temperature: Scaling factor for similarities (0.05-0.1 typical).
Lower temperature makes the distribution sharper.
"""
super().__init__()
self.temperature = temperature
def forward(
self,
embeddings: torch.Tensor,
labels: torch.Tensor,
) -> torch.Tensor:
"""
Compute InfoNCE loss for a batch of embeddings.
Args:
embeddings: [N, D] L2-normalized embeddings
labels: [N] integer logo class labels
Returns:
Scalar loss value
"""
device = embeddings.device
batch_size = embeddings.shape[0]
if batch_size <= 1:
return torch.tensor(0.0, device=device, requires_grad=True)
# Compute similarity matrix [N, N]
# Since embeddings are L2-normalized, dot product = cosine similarity
similarity = embeddings @ embeddings.T / self.temperature
# Create positive mask: same label = 1, different = 0
labels_col = labels.unsqueeze(0) # [1, N]
labels_row = labels.unsqueeze(1) # [N, 1]
positive_mask = (labels_row == labels_col).float() # [N, N]
# Remove self-similarity from positives (diagonal)
identity = torch.eye(batch_size, device=device)
positive_mask = positive_mask - identity
# Count positives per anchor (avoid division by zero)
num_positives = positive_mask.sum(dim=1)
has_positives = num_positives > 0
# If no positives exist for any anchor, return zero loss
if not has_positives.any():
return torch.tensor(0.0, device=device, requires_grad=True)
# Mask out self-similarity with large negative value
similarity = similarity - identity * 1e9
# Compute log-softmax over similarities
log_softmax = F.log_softmax(similarity, dim=1)
# Sum log probabilities of positive pairs
positive_log_probs = (log_softmax * positive_mask).sum(dim=1)
# Average over number of positives (only for anchors with positives)
loss_per_anchor = torch.zeros(batch_size, device=device)
loss_per_anchor[has_positives] = (
-positive_log_probs[has_positives] / num_positives[has_positives]
)
return loss_per_anchor.mean()


class SupConLoss(nn.Module):
"""
Supervised Contrastive Loss.
    Similar to InfoNCE above, but follows the SupCon formulation of
    Khosla et al.: the anchor is excluded from the denominator, the
    log-probability is averaged over each anchor's positives, and the
    batch loss is taken only over anchors that have at least one positive.
Reference: https://arxiv.org/abs/2004.11362
"""
def __init__(self, temperature: float = 0.07):
super().__init__()
self.temperature = temperature
def forward(
self,
embeddings: torch.Tensor,
labels: torch.Tensor,
) -> torch.Tensor:
"""
Compute Supervised Contrastive loss.
Args:
embeddings: [N, D] L2-normalized embeddings
labels: [N] integer logo class labels
Returns:
Scalar loss value
"""
device = embeddings.device
batch_size = embeddings.shape[0]
if batch_size <= 1:
return torch.tensor(0.0, device=device, requires_grad=True)
# Compute similarity matrix
similarity = embeddings @ embeddings.T / self.temperature
# Create masks
labels_col = labels.unsqueeze(0)
labels_row = labels.unsqueeze(1)
positive_mask = (labels_row == labels_col).float()
identity = torch.eye(batch_size, device=device)
# Remove self from positives
positive_mask = positive_mask - identity
# Number of positives per anchor
num_positives = positive_mask.sum(dim=1)
has_positives = num_positives > 0
if not has_positives.any():
return torch.tensor(0.0, device=device, requires_grad=True)
# For numerical stability, subtract max similarity
sim_max, _ = similarity.max(dim=1, keepdim=True)
similarity = similarity - sim_max.detach()
# Compute exp(similarity) with self masked out
exp_sim = torch.exp(similarity) * (1 - identity)
# Denominator: sum of exp over all pairs except self
log_prob = similarity - torch.log(exp_sim.sum(dim=1, keepdim=True) + 1e-8)
# Mean of log-prob over positive pairs
mean_log_prob_pos = (positive_mask * log_prob).sum(dim=1) / (
num_positives + 1e-8
)
# Loss is negative mean log probability
loss = -mean_log_prob_pos[has_positives].mean()
return loss


class TripletLoss(nn.Module):
"""
Triplet loss with online hard mining.
For each anchor:
- Hardest positive: most distant sample with same label
- Hardest negative: closest sample with different label
Loss = max(0, d(anchor, hardest_pos) - d(anchor, hardest_neg) + margin)
    This is an alternative to InfoNCE when batch sizes are too small to
    provide enough in-batch negatives.
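
    Example (illustrative shapes; any L2-normalized embeddings work):
        >>> loss_fn = TripletLoss(margin=0.3)
        >>> emb = F.normalize(torch.randn(6, 512), dim=-1)
        >>> labels = torch.tensor([0, 0, 1, 1, 2, 2])
        >>> loss = loss_fn(emb, labels)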
"""
def __init__(self, margin: float = 0.3):
"""
Initialize Triplet loss.
Args:
margin: Minimum required gap between positive and negative distances
"""
super().__init__()
self.margin = margin
def forward(
self,
embeddings: torch.Tensor,
labels: torch.Tensor,
) -> torch.Tensor:
"""
Compute triplet loss with online hard mining.
Args:
embeddings: [N, D] L2-normalized embeddings
labels: [N] integer logo class labels
Returns:
Scalar loss value
"""
device = embeddings.device
batch_size = embeddings.shape[0]
if batch_size <= 1:
return torch.tensor(0.0, device=device, requires_grad=True)
# Compute pairwise cosine distances (1 - cosine_similarity)
# For normalized vectors: distance = 1 - dot_product
similarity = embeddings @ embeddings.T
distances = 1 - similarity
# Create masks
labels_col = labels.unsqueeze(0)
labels_row = labels.unsqueeze(1)
positive_mask = (labels_row == labels_col).float()
negative_mask = 1 - positive_mask
# Remove self from positives (diagonal)
identity = torch.eye(batch_size, device=device)
positive_mask = positive_mask - identity
# Check if we have any valid triplets
has_positives = positive_mask.sum(dim=1) > 0
has_negatives = negative_mask.sum(dim=1) > 0
valid_anchors = has_positives & has_negatives
if not valid_anchors.any():
return torch.tensor(0.0, device=device, requires_grad=True)
# For each anchor, find hardest positive (max distance among positives)
# Set negatives to -inf so they don't affect max
pos_distances = distances.clone()
pos_distances[positive_mask == 0] = float("-inf")
hardest_positive, _ = pos_distances.max(dim=1)
# For each anchor, find hardest negative (min distance among negatives)
# Set positives to inf so they don't affect min
neg_distances = distances.clone()
neg_distances[negative_mask == 0] = float("inf")
hardest_negative, _ = neg_distances.min(dim=1)
# Triplet loss: want positive to be closer than negative by margin
triplet_loss = F.relu(
hardest_positive - hardest_negative + self.margin
)
# Average over valid anchors only
loss = triplet_loss[valid_anchors].mean()
return loss


class CombinedLoss(nn.Module):
"""
Combined loss function with weighted InfoNCE and Triplet losses.
Can help stabilize training by combining the benefits of both losses.
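
    Example (illustrative; the weights shown are the defaults):
        >>> loss_fn = CombinedLoss(temperature=0.07, triplet_margin=0.3)
        >>> emb = F.normalize(torch.randn(8, 512), dim=-1)
        >>> labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
        >>> loss = loss_fn(emb, labels)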
"""
def __init__(
self,
temperature: float = 0.07,
triplet_margin: float = 0.3,
infonce_weight: float = 1.0,
triplet_weight: float = 0.5,
):
super().__init__()
self.infonce = InfoNCELoss(temperature=temperature)
self.triplet = TripletLoss(margin=triplet_margin)
self.infonce_weight = infonce_weight
self.triplet_weight = triplet_weight
def forward(
self,
embeddings: torch.Tensor,
labels: torch.Tensor,
) -> torch.Tensor:
infonce_loss = self.infonce(embeddings, labels)
triplet_loss = self.triplet(embeddings, labels)
return (
self.infonce_weight * infonce_loss +
self.triplet_weight * triplet_loss
)


def get_loss_function(
loss_type: str = "infonce",
temperature: float = 0.07,
triplet_margin: float = 0.3,
) -> nn.Module:
"""
Factory function to create loss function.
Args:
loss_type: One of "infonce", "supcon", "triplet", or "combined"
temperature: Temperature for InfoNCE/SupCon
triplet_margin: Margin for triplet loss
Returns:
Loss function module
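
    Example:
        >>> loss_fn = get_loss_function("supcon", temperature=0.05)
        >>> isinstance(loss_fn, SupConLoss)
        True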
"""
if loss_type == "infonce":
return InfoNCELoss(temperature=temperature)
elif loss_type == "supcon":
return SupConLoss(temperature=temperature)
elif loss_type == "triplet":
return TripletLoss(margin=triplet_margin)
elif loss_type == "combined":
return CombinedLoss(
temperature=temperature,
triplet_margin=triplet_margin,
)
else:
raise ValueError(f"Unknown loss type: {loss_type}")