Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder on the LogoDet-3K dataset for improved logo embedding similarity.

New training module (training/):
- config.py: TrainingConfig dataclass with all hyperparameters
- dataset.py: LogoContrastiveDataset with logo-level splits
- model.py: LogoFineTunedCLIP wrapper with LoRA support
- losses.py: InfoNCE, TripletLoss, SupConLoss implementations (see the sketch below)
- trainer.py: Training loop with mixed precision and checkpointing
- evaluation.py: EmbeddingEvaluator for validation metrics

New scripts:
- train_clip_logo.py: Main training entry point
- export_model.py: Export to HuggingFace-compatible format

Configurations:
- configs/jetson_orin.yaml: Optimized for Jetson Orin AGX
- configs/cloud_rtx4090.yaml: Optimized for 24 GB cloud GPUs
- configs/cloud_a100.yaml: Optimized for 80 GB cloud GPUs

Documentation:
- CLIP_FINETUNING.md: Training guide and usage instructions
- CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates

Modified:
- logo_detection_detr.py: Add fine-tuned model loading support
- pyproject.toml: Add peft, pyyaml, torchvision dependencies
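For orientation, the contrastive objective at the core of this change is an InfoNCE-style loss over L2-normalized logo embeddings: two crops of the same logo are pulled together while every other crop in the batch acts as a negative. The sketch below is a minimal, self-contained illustration of that idea only; it is not the implementation in training/losses.py, and the function name, the paired-view batch layout, and the temperature default are assumptions made for the example.

```python
import torch
import torch.nn.functional as F


def info_nce_loss(z_a: torch.Tensor, z_b: torch.Tensor, temperature: float = 0.07) -> torch.Tensor:
    """InfoNCE over paired views: z_a[i] and z_b[i] embed the same logo.

    Both inputs are [batch, dim]; they are L2-normalized here so the dot
    products below are cosine similarities.
    """
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)

    # Similarity of every view in z_a against every view in z_b: [batch, batch].
    logits = z_a @ z_b.t() / temperature

    # The positive for row i is column i; all other columns are negatives.
    targets = torch.arange(z_a.size(0), device=z_a.device)
    return F.cross_entropy(logits, targets)
```

A symmetric variant averages this loss over both directions (rows and columns); SupConLoss generalizes the same idea to multiple positives per logo class.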
"""
|
|
Fine-tunable CLIP model wrapper with LoRA support.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
from transformers import CLIPModel, CLIPProcessor
|
|
|
|
# Check if peft is available for LoRA
|
|
try:
|
|
from peft import LoraConfig, get_peft_model, PeftModel
|
|
PEFT_AVAILABLE = True
|
|
except ImportError:
|
|
PEFT_AVAILABLE = False
|
|
LoraConfig = None
|
|
get_peft_model = None
|
|
PeftModel = None
|
|
|
|
|
|
class LogoFineTunedCLIP(nn.Module):
|
|
"""
|
|
CLIP vision encoder fine-tuned for logo similarity.
|
|
|
|
Preserves embedding interface for compatibility with DetectLogosDETR:
|
|
- Same embedding dimensionality (768 for ViT-L/14)
|
|
- L2 normalized outputs
|
|
- Works with existing get_image_features() pattern
|
|
|
|
Supports:
|
|
- LoRA for memory-efficient fine-tuning
|
|
- Layer freezing for transfer learning
|
|
- Gradient checkpointing for memory optimization
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
vision_model: nn.Module,
|
|
lora_r: int = 16,
|
|
lora_alpha: int = 32,
|
|
lora_dropout: float = 0.1,
|
|
freeze_layers: int = 12,
|
|
use_gradient_checkpointing: bool = True,
|
|
add_projection_head: bool = True,
|
|
):
|
|
"""
|
|
Initialize the fine-tunable CLIP wrapper.
|
|
|
|
Args:
|
|
vision_model: CLIP vision model (CLIPVisionModel)
|
|
lora_r: Rank of LoRA low-rank matrices (0 to disable)
|
|
lora_alpha: LoRA scaling factor
|
|
lora_dropout: Dropout for LoRA layers
|
|
freeze_layers: Number of transformer layers to freeze (from bottom)
|
|
use_gradient_checkpointing: Enable gradient checkpointing
|
|
add_projection_head: Add trainable projection head
|
|
"""
|
|
super().__init__()
|
|
|
|
self.vision_model = vision_model
|
|
self.embedding_dim = vision_model.config.hidden_size
|
|
self.freeze_layers = freeze_layers
|
|
self.lora_r = lora_r
|
|
self.lora_alpha = lora_alpha
|
|
|
|
# Enable gradient checkpointing for memory efficiency
|
|
if use_gradient_checkpointing:
|
|
if hasattr(self.vision_model, "gradient_checkpointing_enable"):
|
|
self.vision_model.gradient_checkpointing_enable()
|
|
|
|
# Freeze lower layers
|
|
self._freeze_layers(freeze_layers)
|
|
|
|
# Apply LoRA to attention layers in upper blocks
|
|
self.peft_applied = False
|
|
if PEFT_AVAILABLE and lora_r > 0:
|
|
self._apply_lora(lora_r, lora_alpha, lora_dropout)
|
|
self.peft_applied = True
|
|
elif lora_r > 0 and not PEFT_AVAILABLE:
|
|
print(
|
|
"Warning: peft not installed. LoRA disabled. "
|
|
"Install with: pip install peft"
|
|
)
|
|
|
|
# Optional projection head for fine-tuning
|
|
self.add_projection_head = add_projection_head
|
|
if add_projection_head:
|
|
self.projection = nn.Sequential(
|
|
nn.Linear(self.embedding_dim, self.embedding_dim),
|
|
nn.LayerNorm(self.embedding_dim),
|
|
)
|
|
else:
|
|
self.projection = nn.Identity()
|
|
|
|
def _freeze_layers(self, num_layers: int) -> None:
|
|
"""Freeze the first N transformer layers and embeddings."""
|
|
if num_layers <= 0:
|
|
return
|
|
|
|
# Freeze embeddings
|
|
if hasattr(self.vision_model, "embeddings"):
|
|
for param in self.vision_model.embeddings.parameters():
|
|
param.requires_grad = False
|
|
|
|
# Freeze specified number of encoder layers
|
|
if hasattr(self.vision_model, "encoder"):
|
|
for i, layer in enumerate(self.vision_model.encoder.layers):
|
|
if i < num_layers:
|
|
for param in layer.parameters():
|
|
param.requires_grad = False
|
|
|
|
def _apply_lora(
|
|
self,
|
|
r: int,
|
|
alpha: int,
|
|
dropout: float,
|
|
) -> None:
|
|
"""Apply LoRA adapters to attention layers."""
|
|
if not PEFT_AVAILABLE:
|
|
return
|
|
|
|
# Configure LoRA for vision transformer
|
|
lora_config = LoraConfig(
|
|
r=r,
|
|
lora_alpha=alpha,
|
|
lora_dropout=dropout,
|
|
target_modules=["q_proj", "v_proj"],
|
|
bias="none",
|
|
modules_to_save=[], # Don't save any full modules
|
|
)
|
|
|
|
self.vision_model = get_peft_model(self.vision_model, lora_config)
|
|
|
|
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
|
"""
|
|
Extract normalized embeddings for logo images.
|
|
|
|
Args:
|
|
pixel_values: [batch, 3, 224, 224] preprocessed images
|
|
|
|
Returns:
|
|
embeddings: [batch, embedding_dim] L2-normalized
|
|
"""
|
|
# Get vision features
|
|
outputs = self.vision_model(pixel_values=pixel_values)
|
|
|
|
# Use pooler output (CLS token projection) if available
|
|
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
|
features = outputs.pooler_output
|
|
else:
|
|
# Fall back to CLS token from last hidden state
|
|
features = outputs.last_hidden_state[:, 0, :]
|
|
|
|
# Apply projection head
|
|
features = self.projection(features)
|
|
|
|
# L2 normalize for cosine similarity
|
|
features = F.normalize(features, dim=-1)
|
|
|
|
return features
|
|
|
|
def get_image_features(self, **kwargs) -> torch.Tensor:
|
|
"""
|
|
Compatibility method matching CLIP's interface.
|
|
|
|
Used by DetectLogosDETR._get_embedding_pil().
|
|
"""
|
|
return self.forward(kwargs["pixel_values"])
|
|
|
|
def get_trainable_parameters(self) -> List[torch.nn.Parameter]:
|
|
"""Return list of trainable parameters."""
|
|
return [p for p in self.parameters() if p.requires_grad]
|
|
|
|
def get_parameter_count(self) -> Dict[str, int]:
|
|
"""Return count of trainable and total parameters."""
|
|
total = sum(p.numel() for p in self.parameters())
|
|
trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
|
return {
|
|
"total": total,
|
|
"trainable": trainable,
|
|
"frozen": total - trainable,
|
|
"trainable_percent": 100 * trainable / total if total > 0 else 0,
|
|
}
|
|
|
|
def save_pretrained(self, output_dir: str) -> None:
|
|
"""
|
|
Save model in HuggingFace-compatible format.
|
|
|
|
Args:
|
|
output_dir: Directory to save model files
|
|
"""
|
|
output_path = Path(output_dir)
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Save model weights
|
|
if self.peft_applied and PEFT_AVAILABLE:
|
|
# Save LoRA weights separately
|
|
self.vision_model.save_pretrained(output_path / "vision_lora")
|
|
# Save projection head
|
|
torch.save(
|
|
self.projection.state_dict(),
|
|
output_path / "projection_head.bin",
|
|
)
|
|
else:
|
|
# Save full model state
|
|
torch.save(self.state_dict(), output_path / "pytorch_model.bin")
|
|
|
|
# Save config
|
|
config = {
|
|
"model_type": "clip_logo_finetuned",
|
|
"embedding_dim": self.embedding_dim,
|
|
"lora_r": self.lora_r,
|
|
"lora_alpha": self.lora_alpha,
|
|
"freeze_layers": self.freeze_layers,
|
|
"add_projection_head": self.add_projection_head,
|
|
"peft_applied": self.peft_applied,
|
|
}
|
|
|
|
with open(output_path / "config.json", "w") as f:
|
|
json.dump(config, f, indent=2)
|
|
|
|
    @classmethod
    def from_pretrained(
        cls,
        model_path: str,
        base_model: str = "openai/clip-vit-large-patch14",
        device: Optional[torch.device] = None,
    ) -> "LogoFineTunedCLIP":
        """
        Load a fine-tuned model from saved weights.

        Args:
            model_path: Path to saved model directory
            base_model: Base CLIP model name (for architecture)
            device: Device to load model on

        Returns:
            Loaded LogoFineTunedCLIP model
        """
        model_path = Path(model_path)

        # Load config
        with open(model_path / "config.json", "r") as f:
            config = json.load(f)

        # Load base CLIP model
        clip_model = CLIPModel.from_pretrained(base_model)

        peft_applied = config.get("peft_applied", False) and PEFT_AVAILABLE

        # Create model instance. When LoRA adapters were saved, pass lora_r=0 so
        # __init__ does not wrap the vision model with fresh adapters; the trained
        # adapters are attached below via PeftModel.from_pretrained instead.
        model = cls(
            vision_model=clip_model.vision_model,
            lora_r=0 if peft_applied else config.get("lora_r", 0),
            lora_alpha=config.get("lora_alpha", 1),
            freeze_layers=config.get("freeze_layers", 12),
            add_projection_head=config.get("add_projection_head", True),
            use_gradient_checkpointing=False,  # Not needed for inference
        )

        # Load weights
        if peft_applied:
            # Load LoRA adapter weights on top of the base vision model
            lora_path = model_path / "vision_lora"
            if lora_path.exists():
                model.vision_model = PeftModel.from_pretrained(
                    model.vision_model, lora_path
                )
                model.peft_applied = True
                model.lora_r = config.get("lora_r", 0)
                model.lora_alpha = config.get("lora_alpha", 1)
            # Load projection head
            proj_path = model_path / "projection_head.bin"
            if proj_path.exists():
                model.projection.load_state_dict(
                    torch.load(proj_path, map_location="cpu")
                )
        else:
            # Load full model state
            weights_path = model_path / "pytorch_model.bin"
            if weights_path.exists():
                model.load_state_dict(torch.load(weights_path, map_location="cpu"))

        if device is not None:
            model = model.to(device)

        return model

def create_model(
    base_model: str = "openai/clip-vit-large-patch14",
    lora_r: int = 16,
    lora_alpha: int = 32,
    lora_dropout: float = 0.1,
    freeze_layers: int = 12,
    use_gradient_checkpointing: bool = True,
    device: Optional[torch.device] = None,
) -> Tuple[LogoFineTunedCLIP, CLIPProcessor]:
    """
    Create a fine-tunable CLIP model and processor.

    Args:
        base_model: HuggingFace model name or path
        lora_r: LoRA rank (0 to disable)
        lora_alpha: LoRA scaling factor
        lora_dropout: LoRA dropout
        freeze_layers: Number of layers to freeze
        use_gradient_checkpointing: Enable gradient checkpointing
        device: Device to load model on

    Returns:
        Tuple of (model, processor)
    """
    # Load base CLIP model
    clip_model = CLIPModel.from_pretrained(base_model)
    processor = CLIPProcessor.from_pretrained(base_model)

    # Create fine-tunable wrapper
    model = LogoFineTunedCLIP(
        vision_model=clip_model.vision_model,
        lora_r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        freeze_layers=freeze_layers,
        use_gradient_checkpointing=use_gradient_checkpointing,
    )

    if device is not None:
        model = model.to(device)

    # Print parameter info
    param_info = model.get_parameter_count()
    print("Model created:")
    print(f"  Total parameters: {param_info['total']:,}")
    print(f"  Trainable: {param_info['trainable']:,} ({param_info['trainable_percent']:.2f}%)")
    print(f"  Frozen: {param_info['frozen']:,}")

    return model, processor
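To show how this wrapper's public surface fits together, the sketch below strings the functions defined above into one flow: build a LoRA-wrapped model with create_model(), embed a logo crop through the CLIP-compatible get_image_features() hook that DetectLogosDETR relies on, save the adapters with save_pretrained(), and reload them with LogoFineTunedCLIP.from_pretrained(). Everything outside this file is a placeholder assumption: the training/model import path, the image file, and the checkpoint directory; the training loop itself is elided.

```python
import torch
from PIL import Image

# Assumed import path based on the PR layout (training/model.py).
from training.model import LogoFineTunedCLIP, create_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the fine-tunable wrapper plus the matching CLIP processor.
model, processor = create_model(lora_r=16, freeze_layers=12, device=device)

# Embed a single logo crop the same way the detector would.
image = Image.open("logo_crop.png").convert("RGB")  # placeholder image path
inputs = processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
    embedding = model.get_image_features(pixel_values=inputs["pixel_values"])
print(embedding.shape)  # [1, model.embedding_dim], L2-normalized

# ... contrastive training loop would go here ...

# Persist LoRA adapters + projection head, then reload for inference.
model.save_pretrained("checkpoints/clip_logo")  # placeholder output directory
reloaded = LogoFineTunedCLIP.from_pretrained("checkpoints/clip_logo", device=device)
reloaded.eval()
```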
|