diff --git a/logo_detection_detr.py b/logo_detection_detr.py
index 06e42b6..345426a 100644
--- a/logo_detection_detr.py
+++ b/logo_detection_detr.py
@@ -23,7 +23,6 @@ import cv2
 import numpy as np
 from pathlib import Path
 from typing import List, Tuple, Dict, Optional, Any
-from difflib import SequenceMatcher
 
 
 class DetectLogosDETR:
@@ -765,311 +764,4 @@ class DetectLogosDETR:
             f"(threshold: {similarity_threshold})"
         )
 
-        return matched_detections
-
-    # =========================================================================
-    # Hybrid Text + CLIP Matching
-    # =========================================================================
-
-    def set_text_detector(self, text_detector) -> None:
-        """
-        Set an optional text detector for hybrid matching.
-
-        Args:
-            text_detector: Instance of DetectText class from text_recognition.py
-        """
-        self.text_detector = text_detector
-        self.logger.info("Text detector enabled for hybrid matching")
-
-    def extract_text(self, image: np.ndarray, min_confidence: float = 0.3) -> List[str]:
-        """
-        Extract text from an image using the text detector.
-
-        Args:
-            image: OpenCV image (BGR format)
-            min_confidence: Minimum OCR confidence to accept text
-
-        Returns:
-            List of detected text strings (lowercased, stripped)
-        """
-        if not hasattr(self, 'text_detector') or self.text_detector is None:
-            return []
-
-        try:
-            results, _ = self.text_detector.detect(image)
-            # Filter by confidence and normalize text
-            texts = []
-            for text, confidence in results:
-                if confidence >= min_confidence:
-                    # Normalize: lowercase, strip surrounding whitespace
-                    normalized = text.lower().strip()
-                    if len(normalized) >= 2:  # Ignore single characters
-                        texts.append(normalized)
-            return texts
-        except Exception as e:
-            self.logger.warning(f"Text extraction failed: {e}")
-            return []
-
-    def extract_text_pil(self, pil_image: Image.Image, min_confidence: float = 0.3) -> List[str]:
-        """
-        Extract text from a PIL image.
-
-        Args:
-            pil_image: PIL Image (RGB format)
-            min_confidence: Minimum OCR confidence
-
-        Returns:
-            List of detected text strings
-        """
-        # Convert PIL to OpenCV format
-        cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
-        return self.extract_text(cv_image, min_confidence)
-
-    @staticmethod
-    def compute_text_similarity(text1_list: List[str], text2_list: List[str]) -> float:
-        """
-        Compute fuzzy text similarity between two lists of text strings.
-
-        Uses a combination of exact matches and fuzzy matching to handle
-        OCR variations like case differences, spacing, and minor errors.
-
-        Args:
-            text1_list: List of text strings from first image
-            text2_list: List of text strings from second image
-
-        Returns:
-            Similarity score between 0 and 1
-        """
-        if not text1_list or not text2_list:
-            return 0.0
-
-        # Combine all text into single strings for overall comparison
-        text1_combined = " ".join(sorted(text1_list))
-        text2_combined = " ".join(sorted(text2_list))
-
-        # Method 1: Sequence matching on combined text
-        seq_similarity = SequenceMatcher(None, text1_combined, text2_combined).ratio()
-
-        # Method 2: Token overlap (Jaccard-like)
-        # Split into tokens
-        tokens1 = set(text1_combined.split())
-        tokens2 = set(text2_combined.split())
-
-        if tokens1 and tokens2:
-            intersection = len(tokens1 & tokens2)
-            union = len(tokens1 | tokens2)
-            token_similarity = intersection / union if union > 0 else 0
-        else:
-            token_similarity = 0
-
-        # Method 3: Best pairwise match for each text in list1
-        pairwise_scores = []
-        for t1 in text1_list:
-            best_match = 0
-            for t2 in text2_list:
-                score = SequenceMatcher(None, t1, t2).ratio()
-                best_match = max(best_match, score)
-            pairwise_scores.append(best_match)
-
-        pairwise_similarity = sum(pairwise_scores) / len(pairwise_scores) if pairwise_scores else 0
-
-        # Combine methods (weighted average)
-        combined = (seq_similarity * 0.3 + token_similarity * 0.3 + pairwise_similarity * 0.4)
-
-        return combined
-
-    @staticmethod
-    def texts_match(
-        ref_texts: List[str],
-        det_texts: List[str],
-        threshold: float = 0.5
-    ) -> Tuple[bool, float]:
-        """
-        Determine if texts match above a threshold.
-
-        Args:
-            ref_texts: Text from reference logo
-            det_texts: Text from detected region
-            threshold: Minimum similarity to consider a match
-
-        Returns:
-            Tuple of (is_match, similarity_score)
-        """
-        if not ref_texts:
-            # Reference has no text - can't match on text
-            return (False, 0.0)
-
-        if not det_texts:
-            # Reference has text but detection doesn't - no text match
-            return (False, 0.0)
-
-        similarity = DetectLogosDETR.compute_text_similarity(ref_texts, det_texts)
-        return (similarity >= threshold, similarity)
-
-    def find_best_match_hybrid(
-        self,
-        detected_embedding: torch.Tensor,
-        detected_image: np.ndarray,
-        reference_data: Dict[str, Dict[str, Any]],
-        clip_threshold: float = 0.70,
-        clip_threshold_with_text: float = 0.60,
-        clip_threshold_text_mismatch: float = 0.80,
-        text_similarity_threshold: float = 0.5,
-        margin: float = 0.05,
-        use_mean_similarity: bool = False,
-    ) -> Optional[Tuple[str, float, Dict[str, Any]]]:
-        """
-        Find best match using hybrid text + CLIP approach.
-
-        Strategy:
-        - If reference has text AND detection has matching text:
-          → Use lower CLIP threshold (text provides additional confidence)
-        - If reference has text but detection doesn't match:
-          → Use higher CLIP threshold (need more visual confidence)
-        - If reference has no text:
-          → Use standard CLIP threshold
-
-        Args:
-            detected_embedding: CLIP embedding from detected logo region
-            detected_image: OpenCV image of the detected region (for text extraction)
-            reference_data: Dict mapping logo name to:
-                {
-                    'embeddings': List[torch.Tensor],  # CLIP embeddings
-                    'texts': List[str],  # Extracted text from reference
-                }
-            clip_threshold: Standard CLIP threshold for no-text references
-            clip_threshold_with_text: Lower threshold when text matches
-            clip_threshold_text_mismatch: Higher threshold when reference text is missing or mismatched in the detection
-            text_similarity_threshold: Threshold for text matching
-            margin: Required margin between best and second-best
-            use_mean_similarity: Use mean vs max for multi-ref aggregation
-
-        Returns:
-            Tuple of (label, clip_similarity, match_info) or None
-            match_info contains: text_matched, text_similarity, threshold_used
-        """
-        if not reference_data:
-            return None
-
-        # Extract text from detected region
-        detected_texts = self.extract_text(detected_image)
-
-        # Calculate scores for all logos
-        logo_scores = []
-
-        for label, ref_info in reference_data.items():
-            ref_embeddings = ref_info.get('embeddings', [])
-            ref_texts = ref_info.get('texts', [])
-
-            if not ref_embeddings:
-                continue
-
-            # Calculate CLIP similarity
-            similarities = []
-            for ref_emb in ref_embeddings:
-                sim = self.compare_embeddings(detected_embedding, ref_emb)
-                similarities.append(sim)
-
-            if use_mean_similarity:
-                clip_score = sum(similarities) / len(similarities)
-            else:
-                clip_score = max(similarities)
-
-            # Determine text match status and appropriate threshold
-            has_ref_text = len(ref_texts) > 0
-            text_matched, text_sim = self.texts_match(
-                ref_texts, detected_texts, text_similarity_threshold
-            )
-
-            if has_ref_text:
-                if text_matched:
-                    # Text matches - use lower threshold, boost confidence
-                    threshold_used = clip_threshold_with_text
-                    match_type = "text_match"
-                else:
-                    # Reference has text but detection doesn't match
-                    # Require higher CLIP threshold
-                    threshold_used = clip_threshold_text_mismatch
-                    match_type = "text_mismatch"
-            else:
-                # No text in reference - standard matching
-                threshold_used = clip_threshold
-                match_type = "no_text"
-                text_sim = 0.0
-
-            # Check if CLIP score meets the appropriate threshold
-            if clip_score >= threshold_used:
-                logo_scores.append({
-                    'label': label,
-                    'clip_score': clip_score,
-                    'text_matched': text_matched,
-                    'text_similarity': text_sim,
-                    'threshold_used': threshold_used,
-                    'match_type': match_type,
-                    'has_ref_text': has_ref_text,
-                })
-
-        if not logo_scores:
-            return None
-
-        # Sort by CLIP score descending
-        logo_scores.sort(key=lambda x: x['clip_score'], reverse=True)
-
-        best = logo_scores[0]
-
-        # Check margin against second-best
-        if margin > 0 and len(logo_scores) > 1:
-            second_best_score = logo_scores[1]['clip_score']
-            if best['clip_score'] - second_best_score < margin:
-                return None
-
-        match_info = {
-            'text_matched': best['text_matched'],
-            'text_similarity': best['text_similarity'],
-            'threshold_used': best['threshold_used'],
-            'match_type': best['match_type'],
-            'has_ref_text': best['has_ref_text'],
-            'detected_texts': detected_texts,
-        }
-
-        return (best['label'], best['clip_score'], match_info)
-
-    def prepare_reference_data_hybrid(
-        self,
-        reference_images: Dict[str, List[np.ndarray]],
-        text_min_confidence: float = 0.3,
-    ) -> Dict[str, Dict[str, Any]]:
-        """
-        Prepare reference data for hybrid matching by computing embeddings and extracting text.
-
-        Args:
-            reference_images: Dict mapping logo name to list of reference images (OpenCV BGR)
-            text_min_confidence: Minimum confidence for text extraction
-
-        Returns:
-            Dict mapping logo name to {'embeddings': [...], 'texts': [...]}
-        """
-        reference_data = {}
-
-        for logo_name, images in reference_images.items():
-            embeddings = []
-            all_texts = set()
-
-            for img in images:
-                # Compute CLIP embedding
-                emb = self.get_embedding(img)
-                embeddings.append(emb)
-
-                # Extract text
-                texts = self.extract_text(img, text_min_confidence)
-                all_texts.update(texts)
-
-            reference_data[logo_name] = {
-                'embeddings': embeddings,
-                'texts': list(all_texts),
-            }
-
-            if all_texts:
-                self.logger.debug(f"Reference '{logo_name}' has text: {all_texts}")
-
-        return reference_data
\ No newline at end of file
+        return matched_detections
\ No newline at end of file
diff --git a/run_hybrid_test.sh b/run_hybrid_test.sh
deleted file mode 100755
index 99843d9..0000000
--- a/run_hybrid_test.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-#
-# Test the hybrid text+CLIP matching approach for logo detection.
-#
-# This approach uses text recognition to improve logo matching:
-# - If reference logo has text and detection matches it: use lower CLIP threshold
-# - If reference logo has text but detection doesn't match: use higher CLIP threshold
-# - If reference logo has no text: use standard CLIP threshold
-#
-# Usage:
-#   ./run_hybrid_test.sh
-#
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-OUTPUT_FILE="${SCRIPT_DIR}/test_results/hybrid_matching_results.txt"
-
-# Model - baseline CLIP
-MODEL="openai/clip-vit-large-patch14"
-
-# Fixed parameters
-NUM_LOGOS=20
-REFS_PER_LOGO=10
-POSITIVE_SAMPLES=20
-NEGATIVE_SAMPLES=100
-SEED=42
-
-# Create output directory if needed
-mkdir -p "${SCRIPT_DIR}/test_results"
-
-# Clear output file and write header
-cat > "$OUTPUT_FILE" << EOF
-Hybrid Text+CLIP Matching Test Results
-======================================
-Date: $(date)
-
-Model: ${MODEL}
-
-Fixed Parameters:
-  Number of logo brands: ${NUM_LOGOS}
-  Refs per logo: ${REFS_PER_LOGO}
-  Positive samples/logo: ${POSITIVE_SAMPLES}
-  Negative samples/logo: ${NEGATIVE_SAMPLES}
-  Seed: ${SEED}
-
-EOF
-
-echo "Hybrid Text+CLIP Matching Test"
-echo "==============================="
-echo "Model: ${MODEL}"
-echo ""
-
-# Test 1: Compare hybrid vs multi-ref baseline
-echo "=== Test 1: Multi-ref baseline (for comparison) ==="
-echo "" >> "$OUTPUT_FILE"
-echo "=== BASELINE: Multi-ref (max) at threshold 0.70 ===" >> "$OUTPUT_FILE"
-
-uv run python "$SCRIPT_DIR/test_logo_detection.py" \
-    --num-logos $NUM_LOGOS \
-    --refs-per-logo $REFS_PER_LOGO \
-    --positive-samples $POSITIVE_SAMPLES \
-    --negative-samples $NEGATIVE_SAMPLES \
-    --matching-method multi-ref \
-    --min-matching-refs 1 \
-    --use-max-similarity \
-    --threshold 0.70 \
-    --margin 0.05 \
-    --seed $SEED \
-    --embedding-model "$MODEL" \
-    --output-file "$OUTPUT_FILE" \
-    --no-cache
-
-echo ""
-
-# Test 2: Hybrid with default thresholds
-echo "=== Test 2: Hybrid with default thresholds ==="
-echo "" >> "$OUTPUT_FILE"
-echo "=== HYBRID: default thresholds (0.70/0.60/0.80) ===" >> "$OUTPUT_FILE"
-
-uv run python "$SCRIPT_DIR/test_logo_detection.py" \
-    --num-logos $NUM_LOGOS \
-    --refs-per-logo $REFS_PER_LOGO \
-    --positive-samples $POSITIVE_SAMPLES \
-    --negative-samples $NEGATIVE_SAMPLES \
-    --matching-method hybrid \
-    --threshold 0.70 \
-    --hybrid-text-threshold 0.60 \
-    --hybrid-no-text-threshold 0.80 \
-    --text-similarity-threshold 0.5 \
-    --margin 0.05 \
-    --seed $SEED \
-    --embedding-model "$MODEL" \
-    --output-file "$OUTPUT_FILE" \
-    --no-cache
-
-echo ""
-
-# Test 3: Hybrid with more aggressive text bonus
-echo "=== Test 3: Hybrid with lower text-match threshold ==="
-echo "" >> "$OUTPUT_FILE"
-echo "=== HYBRID: aggressive text bonus (0.70/0.55/0.80) ===" >> "$OUTPUT_FILE"
-
-uv run python "$SCRIPT_DIR/test_logo_detection.py" \
-    --num-logos $NUM_LOGOS \
-    --refs-per-logo $REFS_PER_LOGO \
-    --positive-samples $POSITIVE_SAMPLES \
-    --negative-samples $NEGATIVE_SAMPLES \
-    --matching-method hybrid \
-    --threshold 0.70 \
-    --hybrid-text-threshold 0.55 \
-    --hybrid-no-text-threshold 0.80 \
-    --text-similarity-threshold 0.5 \
-    --margin 0.05 \
-    --seed $SEED \
-    --embedding-model "$MODEL" \
-    --output-file "$OUTPUT_FILE" \
-    --no-cache
-
-echo ""
-
-# Test 4: Hybrid with stricter text mismatch penalty
-echo "=== Test 4: Hybrid with stricter text mismatch penalty ==="
-echo "" >> "$OUTPUT_FILE"
-echo "=== HYBRID: strict mismatch (0.70/0.60/0.85) ===" >> "$OUTPUT_FILE"
-
-uv run python "$SCRIPT_DIR/test_logo_detection.py" \
-    --num-logos $NUM_LOGOS \
-    --refs-per-logo $REFS_PER_LOGO \
-    --positive-samples $POSITIVE_SAMPLES \
-    --negative-samples $NEGATIVE_SAMPLES \
-    --matching-method hybrid \
-    --threshold 0.70 \
-    --hybrid-text-threshold 0.60 \
-    --hybrid-no-text-threshold 0.85 \
-    --text-similarity-threshold 0.5 \
-    --margin 0.05 \
-    --seed $SEED \
-    --embedding-model "$MODEL" \
-    --output-file "$OUTPUT_FILE" \
-    --no-cache
-
-echo ""
-
-# Test 5: Hybrid with lower text similarity threshold (more lenient OCR matching)
-echo "=== Test 5: Hybrid with lenient text matching ==="
-echo "" >> "$OUTPUT_FILE"
-echo "=== HYBRID: lenient text matching (text_sim=0.4) ===" >> "$OUTPUT_FILE"
-
-uv run python "$SCRIPT_DIR/test_logo_detection.py" \
-    --num-logos $NUM_LOGOS \
-    --refs-per-logo $REFS_PER_LOGO \
-    --positive-samples $POSITIVE_SAMPLES \
-    --negative-samples $NEGATIVE_SAMPLES \
-    --matching-method hybrid \
-    --threshold 0.70 \
-    --hybrid-text-threshold 0.60 \
-    --hybrid-no-text-threshold 0.80 \
-    --text-similarity-threshold 0.4 \
-    --margin 0.05 \
-    --seed $SEED \
-    --embedding-model "$MODEL" \
-    --output-file "$OUTPUT_FILE" \
-    --no-cache
-
-echo ""
-echo "======================================="
-echo "Tests complete!"
-echo "Results saved to: $OUTPUT_FILE" -echo "=======================================" diff --git a/test_logo_detection.py b/test_logo_detection.py index ec3580e..031ace3 100755 --- a/test_logo_detection.py +++ b/test_logo_detection.py @@ -243,12 +243,11 @@ def main(): parser.add_argument( "--matching-method", type=str, - choices=["simple", "margin", "multi-ref", "hybrid"], + choices=["simple", "margin", "multi-ref"], default="margin", help="Matching method: 'simple' returns all matches above threshold, " "'margin' requires confidence margin over 2nd best, " - "'multi-ref' aggregates scores across reference images, " - "'hybrid' combines text recognition with CLIP (default: margin)", + "'multi-ref' aggregates scores across reference images (default: margin)", ) parser.add_argument( "--min-matching-refs", @@ -261,25 +260,6 @@ def main(): action="store_true", help="For 'multi-ref' method: use max similarity instead of mean across references", ) - # Hybrid method arguments - parser.add_argument( - "--hybrid-text-threshold", - type=float, - default=0.60, - help="For 'hybrid' method: CLIP threshold when text matches (default: 0.60)", - ) - parser.add_argument( - "--hybrid-no-text-threshold", - type=float, - default=0.80, - help="For 'hybrid' method: CLIP threshold when text expected but not found (default: 0.80)", - ) - parser.add_argument( - "--text-similarity-threshold", - type=float, - default=0.5, - help="For 'hybrid' method: minimum text similarity to consider a match (default: 0.5)", - ) parser.add_argument( "-v", "--verbose", action="store_true", @@ -352,14 +332,6 @@ def main(): preprocess_mode=args.preprocess_mode, ) - # Initialize text detector for hybrid method - text_detector = None - if args.matching_method == "hybrid": - logger.info("Initializing text detector for hybrid matching...") - from text_recognition import DetectText - text_detector = DetectText(logger=logger, threshold=0.3) - detector.set_text_detector(text_detector) - # Load ground truth (both mappings) logger.info("Loading ground truth from database...") image_to_logos, logo_to_images = get_ground_truth(db_path) @@ -377,15 +349,10 @@ def main(): multi_ref_embeddings: Dict[str, List[torch.Tensor]] = {} # List for margin-based matching: (logo_name, embedding) tuples reference_embeddings: List[Tuple[str, torch.Tensor]] = [] - # Dict for hybrid matching: logo_name -> {'embeddings': [...], 'texts': [...]} - hybrid_reference_data: Dict[str, Dict[str, Any]] = {} total_refs = 0 - logos_with_text = 0 for logo_name, ref_filenames in tqdm(sampled_logos.items(), desc="Reference logos"): multi_ref_embeddings[logo_name] = [] - if args.matching_method == "hybrid": - hybrid_reference_data[logo_name] = {'embeddings': [], 'texts': set()} for ref_filename in ref_filenames: ref_path = reference_dir / ref_filename @@ -398,15 +365,12 @@ def main(): cache_key = f"ref:{ref_filename}" embedding = cache.get(cache_key) if cache else None - # Load image if needed (for embedding or text extraction) - img = None - if embedding is None or args.matching_method == "hybrid": + # Load image if needed for embedding + if embedding is None: img = load_image(ref_path) if img is None: logger.warning(f"Failed to load reference logo: {ref_path}") continue - - if embedding is None: embedding = detector.get_embedding(img) if cache: cache.put(cache_key, embedding) @@ -415,21 +379,7 @@ def main(): reference_embeddings.append((logo_name, embedding)) total_refs += 1 - # Extract text for hybrid method - if args.matching_method == "hybrid" and img is not None: - 
-                hybrid_reference_data[logo_name]['embeddings'].append(embedding)
-                texts = detector.extract_text(img, min_confidence=0.3)
-                hybrid_reference_data[logo_name]['texts'].update(texts)
-
-        # Convert text set to list for hybrid data
-        if args.matching_method == "hybrid":
-            hybrid_reference_data[logo_name]['texts'] = list(hybrid_reference_data[logo_name]['texts'])
-            if hybrid_reference_data[logo_name]['texts']:
-                logos_with_text += 1
-
     logger.info(f"Computed {total_refs} embeddings for {len(sampled_logos)} logos")
-    if args.matching_method == "hybrid":
-        logger.info(f"Extracted text from {logos_with_text}/{len(sampled_logos)} reference logos")
 
     # Build test set: for each logo, sample positive and negative images
     logger.info(f"Sampling test images: {args.positive_samples} positive, {args.negative_samples} negative per logo...")
@@ -504,14 +454,7 @@ def main():
             cache_key = f"det:{test_filename}"
             cached_detections = cache.get(cache_key) if cache else None
 
-            # For hybrid matching, we always need the original image for text extraction
             test_img = None
-            if args.matching_method == "hybrid":
-                test_img = load_image(test_path)
-                if test_img is None:
-                    logger.warning(f"Failed to load test image: {test_path}")
-                    continue
-
             if cached_detections is not None:
                 # Cached detections contain serialized box data and embeddings
                 detections = cached_detections
@@ -651,50 +594,6 @@ def main():
                         "correct": is_correct,
                     })
 
-            else:  # hybrid
-                # Hybrid matching: combines text recognition with CLIP
-                # Extract crop from original image for text extraction
-                box = detection["box"]
-                crop = test_img[
-                    int(box["ymin"]):int(box["ymax"]),
-                    int(box["xmin"]):int(box["xmax"])
-                ]
-
-                match_result = detector.find_best_match_hybrid(
-                    detected_embedding=detection["embedding"],
-                    detected_image=crop,
-                    reference_data=hybrid_reference_data,
-                    clip_threshold=args.threshold,
-                    clip_threshold_with_text=args.hybrid_text_threshold,
-                    clip_threshold_text_mismatch=args.hybrid_no_text_threshold,
-                    text_similarity_threshold=args.text_similarity_threshold,
-                    margin=args.margin,
-                    use_mean_similarity=not args.use_max_similarity,
-                )
-                if match_result:
-                    label, similarity, match_info = match_result
-                    matched_logos.add(label)
-
-                    is_correct = label in expected_logos
-                    if is_correct:
-                        true_positives += 1
-                        if args.similarity_details:
-                            similarity_details["true_positive_sims"].append(similarity)
-                    else:
-                        false_positives += 1
-                        if args.similarity_details:
-                            similarity_details["false_positive_sims"].append(similarity)
-
-                    results.append({
-                        "test_image": test_filename,
-                        "matched_logo": label,
-                        "similarity": similarity,
-                        "correct": is_correct,
-                        "text_matched": match_info.get("text_matched", False),
-                        "text_similarity": match_info.get("text_similarity", 0),
-                        "match_type": match_info.get("match_type", "unknown"),
-                    })
-
             # Count missed detections (false negatives)
             missed = expected_logos - matched_logos
             false_negatives += len(missed)
@@ -742,16 +641,11 @@ def main():
     print(f"  DETR confidence threshold: {args.detr_threshold}")
     print(f"  Preprocess mode: {args.preprocess_mode}")
     print(f"  Matching method: {args.matching_method}")
-    if args.matching_method in ("margin", "multi-ref", "hybrid"):
+    if args.matching_method in ("margin", "multi-ref"):
        print(f"  Matching margin: {args.margin}")
     if args.matching_method == "multi-ref":
         print(f"  Min matching refs: {args.min_matching_refs}")
         print(f"  Similarity aggregation: {'max' if args.use_max_similarity else 'mean'}")
-    if args.matching_method == "hybrid":
-        print(f"  CLIP threshold (text match): {args.hybrid_text_threshold}")
-        print(f"  CLIP threshold (no text): {args.hybrid_no_text_threshold}")
-        print(f"  Text similarity threshold: {args.text_similarity_threshold}")
-        print(f"  Refs with text: {logos_with_text}/{len(sampled_logos)}")
     if args.seed is not None:
         print(f"  Random seed: {args.seed}")
 
@@ -939,14 +833,9 @@ def write_results_to_file(
         method_desc = "Simple (all matches above threshold)"
     elif args.matching_method == "margin":
         method_desc = f"Margin-based (margin={args.margin})"
-    elif args.matching_method == "multi-ref":
+    else:  # multi-ref
         agg = "max" if args.use_max_similarity else "mean"
         method_desc = f"Multi-ref ({agg}, min_refs={args.min_matching_refs}, margin={args.margin})"
-    else:  # hybrid
-        method_desc = (
-            f"Hybrid (text+CLIP, text_thresh={args.hybrid_text_threshold}, "
-            f"no_text_thresh={args.hybrid_no_text_threshold}, margin={args.margin})"
-        )
 
     lines = [
         "=" * 70,
diff --git a/text_recognition.py b/text_recognition.py
deleted file mode 100644
index 41899f2..0000000
--- a/text_recognition.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import easyocr
-import cv2
-import os
-from pathlib import Path
-
-
-class DetectText():
-    def __init__(self, logger, threshold=0.0, allowlist=None, text_args=None):
-        # Set EasyOCR model storage directory (default: models/easyocr relative to this script)
-        default_model_dir = str(Path(__file__).parent / "models" / "easyocr")
-        model_storage_directory = os.environ.get('EASYOCR_MODEL_DIR', default_model_dir)
-
-        # This needs to run only once to load the model into memory
-        self.reader = easyocr.Reader(['en'], model_storage_directory=model_storage_directory)
-        self.threshold = threshold
-        self.logger = logger
-        self.allowlist = allowlist
-        self.text_args = text_args.split(',') if text_args else []
-
-    def detect(self, img):  # expects CV2 image
-
-        if 'threshold' in self.text_args:
-            ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
-
-        if 'blur' in self.text_args:
-            img = cv2.blur(img, (5, 5))
-
-        if 'grayscale' in self.text_args:
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
-        if 'mag2' in self.text_args:
-            mag_ratio = 2.0
-        else:
-            mag_ratio = 1.0
-
-        output = []
-        boxes = []
-        # run OCR
-        results = self.reader.readtext(img, allowlist=self.allowlist, mag_ratio=mag_ratio)
-
-        for res in results:
-            top_left = (int(res[0][0][0]), int(res[0][0][1]))
-            bottom_right = (int(res[0][2][0]), int(res[0][2][1]))
-
-            text = res[1]
-            confidence = res[2]
-
-            if confidence >= self.threshold:
-                output.append((text, confidence))
-                boxes.append([top_left, bottom_right])
-
-        return output, boxes
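
For context, the hybrid strategy removed above reduces to choosing one of three
CLIP acceptance thresholds based on OCR agreement between a reference logo and a
detected crop. The following is a minimal, self-contained sketch of that decision
rule, not code from this repository: the helper names and sample brand strings are
hypothetical, while the blend weights (0.3/0.3/0.4) and default thresholds
(0.70/0.60/0.80, text similarity >= 0.5) mirror the deleted defaults.

from difflib import SequenceMatcher
from typing import List


def text_similarity(ref_texts: List[str], det_texts: List[str]) -> float:
    # Blend of sequence ratio, token overlap, and best pairwise match,
    # following the removed DetectLogosDETR.compute_text_similarity.
    if not ref_texts or not det_texts:
        return 0.0
    ref_joined = " ".join(sorted(ref_texts))
    det_joined = " ".join(sorted(det_texts))
    seq = SequenceMatcher(None, ref_joined, det_joined).ratio()
    ref_tok, det_tok = set(ref_joined.split()), set(det_joined.split())
    tok = len(ref_tok & det_tok) / len(ref_tok | det_tok) if ref_tok | det_tok else 0.0
    pairwise = sum(
        max(SequenceMatcher(None, r, d).ratio() for d in det_texts)
        for r in ref_texts
    ) / len(ref_texts)
    return 0.3 * seq + 0.3 * tok + 0.4 * pairwise


def pick_clip_threshold(ref_texts: List[str], det_texts: List[str]) -> float:
    # Reference carries no text: fall back to the standard CLIP threshold.
    if not ref_texts:
        return 0.70
    # OCR agrees with the reference text: accept a weaker visual match.
    if text_similarity(ref_texts, det_texts) >= 0.5:
        return 0.60
    # Text expected but absent or different: demand a stronger visual match.
    return 0.80


if __name__ == "__main__":
    assert pick_clip_threshold(["acme tools"], ["acme tool"]) == 0.60  # OCR match
    assert pick_clip_threshold(["acme tools"], []) == 0.80             # text missing
    assert pick_clip_threshold([], ["anything"]) == 0.70               # no ref text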