Add CLIP fine-tuning pipeline for logo recognition
Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder
on the LogoDet-3K dataset for improved logo embedding similarity.

New training module (training/):
- config.py: TrainingConfig dataclass with all hyperparameters
- dataset.py: LogoContrastiveDataset with logo-level splits
- model.py: LogoFineTunedCLIP wrapper with LoRA support
- losses.py: InfoNCE, TripletLoss, SupConLoss implementations
- trainer.py: Training loop with mixed precision and checkpointing
- evaluation.py: EmbeddingEvaluator for validation metrics

New scripts:
- train_clip_logo.py: Main training entry point
- export_model.py: Export to HuggingFace-compatible format

Configurations:
- configs/jetson_orin.yaml: Optimized for Jetson Orin AGX
- configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs
- configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs

Documentation:
- CLIP_FINETUNING.md: Training guide and usage instructions
- CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates

Modified:
- logo_detection_detr.py: Add fine-tuned model loading support
- pyproject.toml: Add peft, pyyaml, torchvision dependencies
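For orientation, here is a minimal sketch of the two core ingredients the commit message names: attaching LoRA adapters to CLIP's vision encoder via peft, and an InfoNCE contrastive objective like the one losses.py lists. The hyperparameters, target modules, and function signatures below are illustrative assumptions, not the repo's actual training/ code.

# Illustrative sketch only -- the real implementations live in
# training/model.py and training/losses.py; names and values are assumed.
import torch
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model
from transformers import CLIPVisionModel

# Attach low-rank adapters to the attention projections of the vision
# encoder; the base weights stay frozen and only the adapters train.
vision = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.1,
                      target_modules=["q_proj", "v_proj"])
vision = get_peft_model(vision, lora_cfg)
vision.print_trainable_parameters()  # adapters are a tiny fraction of CLIP

def info_nce(anchor: torch.Tensor, positive: torch.Tensor,
             temperature: float = 0.07) -> torch.Tensor:
    """InfoNCE over a batch of embedding pairs: row i of `positive` is the
    positive for row i of `anchor`; every other row acts as a negative."""
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    logits = anchor @ positive.t() / temperature  # (B, B) cosine similarities
    targets = torch.arange(anchor.size(0), device=anchor.device)
    return F.cross_entropy(logits, targets)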
@@ -13,6 +13,7 @@ Supported embedding models:
 - DINOv2 models (facebook/dinov2-*): Self-supervised, excellent for visual similarity
 """

+import json
 import os
 import torch
 import torch.nn.functional as F
@@ -100,16 +101,20 @@ class DetectLogosDETR:
             embedding_model, default_embedding_dir, "Embedding"
         )

-        # Detect model type and initialize accordingly
-        self.model_type = self._detect_model_type(embedding_model)
-        self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")
+        # Check if this is a fine-tuned model
+        if self._is_finetuned_model(embedding_model_path):
+            self._load_finetuned_embedding_model(embedding_model_path)
+        else:
+            # Detect model type and initialize accordingly
+            self.model_type = self._detect_model_type(embedding_model)
+            self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")

-        if self.model_type == "clip":
-            self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
-            self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
-        else:  # dinov2 or other transformer models
-            self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
-            self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)
+            if self.model_type == "clip":
+                self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
+                self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
+            else:  # dinov2 or other transformer models
+                self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
+                self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)

         self.logger.info("DetectLogosDETR initialization complete")
@@ -124,6 +129,62 @@ class DetectLogosDETR:
         # Default to generic transformer for unknown models
         return "transformer"

+    def _is_finetuned_model(self, model_path: str) -> bool:
+        """Check if a model path points to a fine-tuned CLIP model."""
+        config_path = Path(model_path) / "config.json"
+        if config_path.exists():
+            try:
+                with open(config_path, "r") as f:
+                    config = json.load(f)
+                return config.get("model_type") == "clip_logo_finetuned"
+            except (json.JSONDecodeError, IOError):
+                pass
+        return False
+
+    def _load_finetuned_embedding_model(self, model_path: str) -> None:
+        """
+        Load a fine-tuned CLIP model from the training module.
+
+        Args:
+            model_path: Path to the fine-tuned model directory
+        """
+        # Import the fine-tuned model class
+        try:
+            from training.model import LogoFineTunedCLIP
+        except ImportError as e:
+            self.logger.error(
+                f"Cannot import training.model for fine-tuned model: {e}"
+            )
+            raise ImportError(
+                "Fine-tuned model requires the training module. "
+                "Ensure the training/ directory is in your Python path."
+            ) from e
+
+        # Load config
+        config_path = Path(model_path) / "config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        base_model = config.get("base_model", "openai/clip-vit-large-patch14")
+
+        self.logger.info(f"Loading fine-tuned CLIP model from: {model_path}")
+        self.logger.info(f"  Base model: {base_model}")
+
+        # Load model using the from_pretrained method
+        self.embedding_model = LogoFineTunedCLIP.from_pretrained(
+            model_path,
+            base_model=base_model,
+            device=self.device,
+        )
+        self.embedding_model.eval()
+
+        # Load processor from base model
+        self.embedding_processor = CLIPProcessor.from_pretrained(base_model)
+
+        # Set model type for embedding extraction
+        self.model_type = "clip_finetuned"
+        self.logger.info("Fine-tuned CLIP model loaded successfully")
+
     def _resolve_model_path(
         self, model_name_or_path: str, default_local_dir: str, model_type: str
     ) -> str:
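The two helpers above define a small on-disk contract: a fine-tuned export directory carries a config.json whose model_type is "clip_logo_finetuned" and which records the base checkpoint. Presumably export_model.py writes this marker; the sketch below reconstructs it from the reader side shown in the diff, so the helper name and any extra keys are assumptions.

# Hedged illustration of the config.json marker that _is_finetuned_model()
# looks for; the real export likely stores more metadata (LoRA rank, etc.).
import json
from pathlib import Path

def write_marker_config(model_dir: str,
                        base_model: str = "openai/clip-vit-large-patch14") -> None:
    config = {
        "model_type": "clip_logo_finetuned",  # sentinel checked by _is_finetuned_model()
        "base_model": base_model,             # read back by _load_finetuned_embedding_model()
    }
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    with open(Path(model_dir) / "config.json", "w") as f:
        json.dump(config, f, indent=2)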
@@ -345,7 +406,7 @@ class DetectLogosDETR:
         """
         Internal method to get embedding from PIL image.

-        Handles both CLIP and DINOv2 model types.
+        Handles CLIP, fine-tuned CLIP, and DINOv2 model types.

         Args:
             pil_image: PIL Image (RGB format)
@@ -360,6 +421,9 @@ class DetectLogosDETR:
         if self.model_type == "clip":
             # CLIP has a dedicated method for image features
             features = self.embedding_model.get_image_features(**inputs)
+        elif self.model_type == "clip_finetuned":
+            # Fine-tuned CLIP uses get_image_features or forward with pixel_values
+            features = self.embedding_model.get_image_features(**inputs)
         else:
             # DINOv2 and other transformers use the CLS token or pooled output
             outputs = self.embedding_model(**inputs)
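training/model.py is not part of this diff, so the exact behavior of LogoFineTunedCLIP.get_image_features() is an assumption; the normalization skip in the next hunk implies it returns unit-length embeddings. A plausible shape, assuming it wraps the base CLIP image features:

# Illustrative only -- this is NOT the repo's LogoFineTunedCLIP, just a
# guess at why callers can skip F.normalize() for "clip_finetuned".
import torch
import torch.nn.functional as F

class FineTunedCLIPSketch(torch.nn.Module):
    def __init__(self, clip_model: torch.nn.Module):
        super().__init__()
        self.clip = clip_model  # base CLIP with LoRA adapters attached

    def get_image_features(self, **inputs) -> torch.Tensor:
        feats = self.clip.get_image_features(**inputs)
        return F.normalize(feats, dim=-1)  # normalized here, not by the caller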
@@ -370,8 +434,9 @@ class DetectLogosDETR:
             # Use CLS token from last_hidden_state
             features = outputs.last_hidden_state[:, 0, :]

-        # Normalize for cosine similarity
-        features = F.normalize(features, dim=-1)
+        # Normalize for cosine similarity (fine-tuned model already normalizes)
+        if self.model_type != "clip_finetuned":
+            features = F.normalize(features, dim=-1)

         return features
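Putting it together, loading a fine-tuned export is intended to be transparent to callers. A hypothetical usage follows; the constructor's full signature is not shown in this diff, so the embedding_model keyword and the local path are assumptions inferred from the hunks above:

from logo_detection_detr import DetectLogosDETR

# Point the detector at a fine-tuned export directory instead of a hub ID.
# _is_finetuned_model() finds the config.json marker, the LoRA-adapted
# weights load via LogoFineTunedCLIP.from_pretrained(), and model_type
# becomes "clip_finetuned" (embeddings come back already normalized).
detector = DetectLogosDETR(embedding_model="models/clip-logo-finetuned")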