# flywms/ware_detect_optimized.py

import argparse
import re
import sys
from pathlib import Path

import cv2
import numpy as np
import pytesseract


def parse_args():
    """Parse and validate the command-line options of the detector.

    Returns:
        argparse.Namespace: the parsed options (``video``, ``weights``,
        ``config``, ``labels``, ``min_confidence``, ``nms_threshold``,
        ``ocr_class``, ``ocr_every``, ``slot_cooldown``, ``slot_size``,
        ``max_ocr_boxes``, ``min_label_width``, ``min_label_height``,
        ``detect_width``, ``show_roi``, ``tesseract_cmd``, ``print_all``).

    Raises:
        SystemExit: with status 2 (via ``ArgumentParser.error``) on invalid
        options, including ``--ocr-every`` or ``--slot-size`` below 1.
    """
    ap = argparse.ArgumentParser()

    # Input source and model files.
    ap.add_argument(
        "-v", "--video",
        default=None,
        help="Percorso video. Se omesso usa la webcam 0",
    )
    ap.add_argument(
        "--weights",
        default="yolov2.weights",
        help="File pesi YOLO",
    )
    ap.add_argument(
        "--config",
        default="yolov2.cfg",
        help="File cfg YOLO",
    )
    ap.add_argument(
        "--labels",
        default="labels.txt",
        help="File classi YOLO",
    )

    # Detection thresholds.
    ap.add_argument(
        "--min-confidence",
        type=float,
        default=0.35,
        help="Soglia minima confidenza YOLO",
    )
    ap.add_argument(
        "--nms-threshold",
        type=float,
        default=0.40,
        help="Soglia NMS",
    )

    # OCR scheduling and de-duplication.
    ap.add_argument(
        "--ocr-class",
        default="etichetta",
        help="Classe YOLO su cui fare OCR",
    )
    ap.add_argument(
        "--ocr-every",
        type=int,
        default=5,
        help="Fai OCR ogni N frame",
    )
    ap.add_argument(
        "--slot-cooldown",
        type=int,
        default=15,
        help="Numero minimo di frame prima di rifare OCR sulla stessa zona",
    )
    ap.add_argument(
        "--slot-size",
        type=int,
        default=120,
        help="Dimensione griglia per deduplicare le etichette",
    )
    ap.add_argument(
        "--max-ocr-boxes",
        type=int,
        default=2,
        help="Numero massimo di ROI etichetta su cui fare OCR per frame",
    )
    ap.add_argument(
        "--min-label-width",
        type=int,
        default=60,
        help="Larghezza minima bbox etichetta",
    )
    ap.add_argument(
        "--min-label-height",
        type=int,
        default=25,
        help="Altezza minima bbox etichetta",
    )
    ap.add_argument(
        "--detect-width",
        type=int,
        default=960,
        help="Larghezza massima del frame usato per detection YOLO",
    )

    # Debugging aids and environment.
    ap.add_argument(
        "--show-roi",
        action="store_true",
        help="Mostra anche la ROI preprocessata per OCR",
    )
    ap.add_argument(
        "--tesseract-cmd",
        default=None,
        help="Percorso esplicito a tesseract.exe, se necessario",
    )
    ap.add_argument(
        "--print-all",
        action="store_true",
        help="Stampa anche OCR vuoti o corti",
    )

    args = ap.parse_args()

    # Both values are used as divisors later (frame modulo and slot grid),
    # so a value below 1 would crash mid-stream with ZeroDivisionError or
    # produce a meaningless grid. Fail early with a clear message instead.
    if args.ocr_every < 1:
        ap.error("--ocr-every deve essere >= 1")
    if args.slot_size < 1:
        ap.error("--slot-size deve essere >= 1")

    return args


def require_file(path_str, description):
    """Return ``path_str`` as a ``Path``, or terminate if nothing exists there.

    Args:
        path_str: path to check.
        description: human-readable name of the file, used in the error message.

    Returns:
        pathlib.Path: the checked path.

    Raises:
        SystemExit: with status 1 when the path does not exist.
    """
    path = Path(path_str)
    if path.exists():
        return path

    print(f"Errore: {description} non trovato: {path}")
    sys.exit(1)


def load_classes(labels_path):
    """Read the class names, one per line, from ``labels_path``.

    Blank lines are skipped and surrounding whitespace is stripped.

    Args:
        labels_path: path of the text file listing the class names.

    Returns:
        list[str]: the class names in file order.

    Raises:
        SystemExit: with status 1 when the file contains no class names.
    """
    # "utf-8-sig" reads plain UTF-8 too, but drops a leading byte-order mark.
    # With plain "utf-8" a BOM stays attached to the first class name (it is
    # not whitespace, so strip() keeps it) and that class never matches.
    with open(labels_path, "rt", encoding="utf-8-sig") as f:
        classes = [name for name in (line.strip() for line in f) if name]

    if not classes:
        print("Errore: labels.txt vuoto")
        sys.exit(1)

    return classes


def open_capture(video_arg):
    """Open the video source selected on the command line.

    Args:
        video_arg: ``None`` for camera 0, a string of digits for a camera
            index, or anything else to be passed to ``cv2.VideoCapture``
            unchanged (e.g. a file path).

    Returns:
        cv2.VideoCapture: the capture object; callers must check
        ``isOpened()`` themselves.
    """
    if video_arg is None:
        camera_index = 0
    elif str(video_arg).isdigit():
        camera_index = int(video_arg)
    else:
        # Not a camera index: let OpenCV interpret the argument directly.
        return cv2.VideoCapture(video_arg)

    # Cameras: try the CAP_DSHOW backend first, then OpenCV's default one.
    capture = cv2.VideoCapture(camera_index, cv2.CAP_DSHOW)
    if capture.isOpened():
        return capture
    return cv2.VideoCapture(camera_index)


def resize_for_detection(frame, max_width):
    """Downscale ``frame`` to at most ``max_width`` pixels wide, keeping aspect.

    Args:
        frame: image array whose first two dimensions are height and width.
        max_width: width limit; values <= 0 disable resizing.

    Returns:
        tuple: ``(image, scale_x, scale_y)`` where the scales convert
        coordinates of the returned image back to ``frame`` coordinates.
        When no resize is needed the original ``frame`` object is returned
        with both scales equal to 1.0.
    """
    src_h, src_w = frame.shape[:2]

    must_shrink = max_width > 0 and src_w > max_width
    if not must_shrink:
        return frame, 1.0, 1.0

    ratio = max_width / float(src_w)
    dst_w, dst_h = int(src_w * ratio), int(src_h * ratio)

    shrunk = cv2.resize(frame, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)

    # Derive the factors from the truncated target size so they are exact.
    return shrunk, src_w / float(dst_w), src_h / float(dst_h)


def clip_box(x1, y1, x2, y2, w, h):
    """Clamp the corners of a box to the valid pixel indices of a ``w`` x ``h`` image.

    Each x is limited to ``[0, w - 1]`` and each y to ``[0, h - 1]``.

    Returns:
        tuple: the clamped ``(x1, y1, x2, y2)``.
    """
    last_col = w - 1
    last_row = h - 1

    def clamp(value, upper):
        # The lower bound wins if ``upper`` is negative (degenerate image size).
        return max(0, min(value, upper))

    return (
        clamp(x1, last_col),
        clamp(y1, last_row),
        clamp(x2, last_col),
        clamp(y2, last_row),
    )


def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.10):
    """Grow a box by ``pad_ratio`` of its size on every side, clipped to the frame.

    Args:
        x1, y1, x2, y2: box corners.
        frame_w, frame_h: frame size used for clipping.
        pad_ratio: fraction of the box width/height added on each side.

    Returns:
        tuple: the padded and clipped ``(x1, y1, x2, y2)``.
    """
    margin_x = int((x2 - x1) * pad_ratio)
    margin_y = int((y2 - y1) * pad_ratio)

    return clip_box(
        x1 - margin_x,
        y1 - margin_y,
        x2 + margin_x,
        y2 + margin_y,
        frame_w,
        frame_h,
    )


def preprocess_for_ocr(roi):
    """Turn a BGR crop into a binarized, bordered image ready for OCR.

    Steps: grayscale conversion, 2x bicubic upscale, 3x3 Gaussian blur,
    Otsu binarization, then a 10-pixel white border on every side.

    Args:
        roi: BGR image crop.

    Returns:
        The processed single-channel image.
    """
    grayscale = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    enlarged = cv2.resize(
        grayscale, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC
    )

    smoothed = cv2.GaussianBlur(enlarged, (3, 3), 0)

    # With THRESH_OTSU the threshold argument (0) is ignored and computed
    # automatically; only the resulting image is needed.
    _, binary = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    return cv2.copyMakeBorder(
        binary,
        10, 10, 10, 10,
        borderType=cv2.BORDER_CONSTANT,
        value=255,
    )


def ocr_digits_only(roi):
    """Run Tesseract on a crop and keep only the digits it reads.

    Args:
        roi: BGR image crop containing the text.

    Returns:
        tuple: ``(digits, processed, raw_text)`` — the digit-only string, the
        preprocessed image that was passed to Tesseract, and Tesseract's raw
        output.
    """
    processed = preprocess_for_ocr(roi)

    raw_text = pytesseract.image_to_string(
        processed,
        config=r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789',
    )

    # Drop whitespace/newlines and anything else that is not a digit.
    return re.sub(r"\D+", "", raw_text), processed, raw_text


def detect_yolo(net, frame, classes, min_confidence, nms_threshold):
    """Run the YOLO network on ``frame`` and return the detections kept by NMS.

    Each output row is read as normalized ``center_x, center_y, width,
    height`` in its first four values and per-class scores from index 5
    onward (index 4 is not used here).

    Args:
        net: network object as returned by ``cv2.dnn.readNetFromDarknet``.
        frame: image to run detection on.
        classes: list of class names indexed by class id.
        min_confidence: minimum best-class score to keep a candidate; also
            passed to NMS as its score threshold.
        nms_threshold: overlap threshold for non-maximum suppression.

    Returns:
        list[dict]: one dict per kept detection with keys ``class_id``,
        ``label``, ``confidence`` and ``box`` (``(x1, y1, x2, y2)`` clipped
        to the frame, in ``frame`` pixel coordinates).
    """
    frame_h, frame_w = frame.shape[:2]

    net.setInput(
        cv2.dnn.blobFromImage(
            frame,
            scalefactor=1.0 / 255.0,
            size=(416, 416),
            swapRB=True,
            crop=False,
        )
    )
    layer_outputs = net.forward(net.getUnconnectedOutLayersNames())

    # Parallel lists, as required by cv2.dnn.NMSBoxes.
    boxes, confidences, class_ids = [], [], []

    for layer_output in layer_outputs:
        for row in layer_output:
            class_scores = row[5:]
            if class_scores.size == 0:
                continue

            best_class = int(np.argmax(class_scores))
            best_score = float(class_scores[best_class])
            if best_score < min_confidence:
                continue

            # Convert the normalized center/size into a pixel (x, y, w, h) box.
            center_x = int(row[0] * frame_w)
            center_y = int(row[1] * frame_h)
            box_w = int(row[2] * frame_w)
            box_h = int(row[3] * frame_h)
            left = int(center_x - box_w / 2)
            top = int(center_y - box_h / 2)

            boxes.append([left, top, box_w, box_h])
            confidences.append(best_score)
            class_ids.append(best_class)

    if not boxes:
        return []

    indices = cv2.dnn.NMSBoxes(boxes, confidences, min_confidence, nms_threshold)
    if len(indices) == 0:
        return []

    kept = []
    # flatten() copes with both flat and nested index results.
    for idx in np.array(indices).flatten():
        left, top, box_w, box_h = boxes[idx]
        clipped = clip_box(left, top, left + box_w, top + box_h, frame_w, frame_h)

        kept.append({
            "class_id": class_ids[idx],
            "label": classes[class_ids[idx]],
            "confidence": confidences[idx],
            "box": clipped,
        })

    return kept


def quantized_slot_key(x1, y1, x2, y2, slot_size):
    """Return the grid cell ``(column, row)`` containing the center of a box.

    The box center is computed with integer division, then each coordinate
    is floor-divided by ``slot_size``.
    """
    center = ((x1 + x2) // 2, (y1 + y2) // 2)
    return tuple(coord // slot_size for coord in center)


def draw_detection(frame, det, color=(255, 255, 255), text_extra=""):
    """Draw a detection's rectangle and caption onto ``frame`` in place.

    Args:
        frame: image to draw on (modified in place).
        det: detection dict with ``box``, ``label`` and ``confidence`` keys.
        color: BGR color used for both the rectangle and the caption.
        text_extra: optional text appended to the caption after " | ".
    """
    left, top, right, bottom = det["box"]
    label = det["label"]
    conf = det["confidence"]

    cv2.rectangle(frame, (left, top), (right, bottom), color, 2)

    caption = f"{label} {conf:.2f}"
    if text_extra:
        caption += f" | {text_extra}"

    # Place the caption just above the box, but never above y=20.
    caption_origin = (left, max(20, top - 8))

    cv2.putText(
        frame,
        caption,
        caption_origin,
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        color,
        2,
        cv2.LINE_AA,
    )


def _scale_detections(detections, scale_x, scale_y, frame_w, frame_h):
    """Return copies of ``detections`` with boxes mapped onto the full-size frame."""
    scaled = []
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        box = clip_box(
            int(x1 * scale_x),
            int(y1 * scale_y),
            int(x2 * scale_x),
            int(y2 * scale_y),
            frame_w,
            frame_h,
        )
        scaled.append({
            "class_id": det["class_id"],
            "label": det["label"],
            "confidence": det["confidence"],
            "box": box,
        })
    return scaled


def _ocr_labels(frame, label_detections, frame_idx, args, slot_size,
                slot_memory, printed_texts):
    """OCR the eligible label boxes of ``frame`` and update both caches in place.

    ``frame`` must not have been drawn on yet: the ROI is cropped from it
    with padding, so any overlay would end up in the OCR input.
    """
    frame_h, frame_w = frame.shape[:2]

    for det in label_detections:
        x1, y1, x2, y2 = det["box"]

        # Skip boxes too small to contain readable text.
        if (x2 - x1) < args.min_label_width or (y2 - y1) < args.min_label_height:
            continue

        slot_key = quantized_slot_key(x1, y1, x2, y2, slot_size)
        slot_info = slot_memory.get(slot_key)

        # Cooldown: do not OCR the same grid cell again too soon.
        if (
            slot_info is not None
            and frame_idx - slot_info["last_ocr_frame"] < args.slot_cooldown
        ):
            continue

        rx1, ry1, rx2, ry2 = expand_box(
            x1, y1, x2, y2,
            frame_w, frame_h,
            pad_ratio=0.10
        )

        roi = frame[ry1:ry2, rx1:rx2]
        if roi.size == 0:
            continue

        digits, processed, raw_text = ocr_digits_only(roi)

        if args.print_all:
            print(
                f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'"
            )

        if len(digits) >= 2:
            slot_memory[slot_key] = {
                "text": digits,
                "last_ocr_frame": frame_idx,
                "box": (x1, y1, x2, y2),
            }

            # Report the same number at most once every 30 frames.
            last_print_frame = printed_texts.get(digits, -999999)
            if frame_idx - last_print_frame > 30:
                print(f"[frame {frame_idx}] Etichetta letta: {digits}")
                printed_texts[digits] = frame_idx

            if args.show_roi:
                cv2.imshow("ROI OCR", processed)
        else:
            # Failed read: keep the previous text but restart the cooldown.
            slot_memory[slot_key] = {
                "text": slot_info["text"] if slot_info else "",
                "last_ocr_frame": frame_idx,
                "box": (x1, y1, x2, y2),
            }


def main():
    """Run the detection + OCR loop on the selected video source.

    Loads the YOLO model and class names, opens the video source, then for
    every frame: detects objects on a downscaled copy, periodically runs
    digit OCR on the highest-confidence boxes of the OCR class, draws all
    detections plus the remembered numbers, and shows the result. The loop
    ends at end of stream or when ``q`` is pressed.

    Raises:
        SystemExit: with status 1 if a required file is missing, the labels
        file is empty, or the video source cannot be opened.
    """
    args = parse_args()

    if args.tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd

    require_file(args.weights, "File pesi YOLO")
    require_file(args.config, "File cfg YOLO")
    require_file(args.labels, "File labels YOLO")

    classes = load_classes(args.labels)
    print(classes)

    ocr_class_lower = args.ocr_class.strip().lower()

    # Both values are used as divisors below; clamp them so a 0 on the
    # command line cannot raise ZeroDivisionError in the middle of a stream.
    ocr_every = max(1, args.ocr_every)
    slot_size = max(1, args.slot_size)

    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cap = open_capture(args.video)
    if not cap.isOpened():
        print("Errore: impossibile aprire la sorgente video")
        sys.exit(1)

    win_name = "YOLO + OCR etichette"

    frame_idx = 0

    # OCR memory keyed by label position (grid cell).
    slot_memory = {}

    # Last frame at which each number was printed, to avoid repeated output.
    printed_texts = {}

    # try/finally so the camera and the windows are released even when the
    # loop is interrupted (Ctrl+C) or OCR/detection raises.
    try:
        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)

        while True:
            grabbed, frame = cap.read()

            if not grabbed or frame is None:
                print("Fine stream o impossibile leggere il frame")
                break

            frame_idx += 1
            orig_h, orig_w = frame.shape[:2]

            det_frame, scale_x, scale_y = resize_for_detection(
                frame,
                args.detect_width
            )

            detections = detect_yolo(
                net,
                det_frame,
                classes,
                args.min_confidence,
                args.nms_threshold
            )

            # Map the boxes back onto the original frame.
            scaled_detections = _scale_detections(
                detections, scale_x, scale_y, orig_w, orig_h
            )

            # OCR candidates: only the OCR class, best confidence first.
            label_detections = sorted(
                (
                    d for d in scaled_detections
                    if d["label"].strip().lower() == ocr_class_lower
                ),
                key=lambda d: d["confidence"],
                reverse=True,
            )[:args.max_ocr_boxes]

            # OCR only every N frames (per-slot cooldown applied inside).
            # This must happen BEFORE anything is drawn: the padded ROI
            # overlaps the box outline and the "label 0.87" caption, and
            # those overlay digits would otherwise be read as label text.
            if frame_idx % ocr_every == 0:
                _ocr_labels(
                    frame,
                    label_detections,
                    frame_idx,
                    args,
                    slot_size,
                    slot_memory,
                    printed_texts,
                )

            # Draw every detection.
            for det in scaled_detections:
                label_lower = det["label"].strip().lower()

                if label_lower == ocr_class_lower:
                    color = (0, 255, 255)
                elif label_lower == "gaylord":
                    color = (0, 255, 0)
                else:
                    color = (255, 255, 255)

                draw_detection(frame, det, color=color)

            # Draw the remembered OCR text below each label box.
            for det in label_detections:
                x1, y1, x2, y2 = det["box"]

                slot_info = slot_memory.get(
                    quantized_slot_key(x1, y1, x2, y2, slot_size)
                )

                if slot_info and slot_info.get("text"):
                    cv2.putText(
                        frame,
                        f"NUM: {slot_info['text']}",
                        (x1, min(orig_h - 5, y2 + 22)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7,
                        (0, 255, 255),
                        2,
                        cv2.LINE_AA
                    )

            cv2.imshow(win_name, frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()