Initial import

2026-05-14 09:36:44 +02:00
commit 747298ac7a
1212 changed files with 56349 additions and 0 deletions
--- a/ware_detect_optimized.py
+++ b/ware_detect_optimized.py
@@ -0,0 +1,539 @@
+import argparse
+import re
+import sys
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pytesseract
+
+
+def _int_at_least(minimum):
+    """Return an argparse ``type`` callable accepting integers >= *minimum*.
+
+    The returned callable raises ``argparse.ArgumentTypeError`` for text that
+    is not an integer or that is below *minimum*, so argparse reports a usage
+    error instead of letting the value through.
+    """
+
+    def _parse(text):
+        try:
+            value = int(text)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                f"valore intero non valido: {text!r}"
+            ) from None
+        if value < minimum:
+            raise argparse.ArgumentTypeError(
+                f"deve essere >= {minimum} (ricevuto {value})"
+            )
+        return value
+
+    return _parse
+
+
+def parse_args():
+    """Parse the command-line options of the detection + OCR script.
+
+    Returns:
+        The ``argparse.Namespace`` with attributes ``video``, ``weights``,
+        ``config``, ``labels``, ``min_confidence``, ``nms_threshold``,
+        ``ocr_class``, ``ocr_every``, ``slot_cooldown``, ``slot_size``,
+        ``max_ocr_boxes``, ``min_label_width``, ``min_label_height``,
+        ``detect_width``, ``show_roi``, ``tesseract_cmd`` and ``print_all``.
+
+    Raises:
+        SystemExit: raised by argparse on invalid options, including
+            ``--ocr-every`` / ``--slot-size`` below 1 and a negative
+            ``--max-ocr-boxes``.
+    """
+    ap = argparse.ArgumentParser()
+
+    ap.add_argument(
+        "-v", "--video",
+        default=None,
+        help="Percorso video. Se omesso usa la webcam 0"
+    )
+    ap.add_argument(
+        "--weights",
+        default="yolov2.weights",
+        help="File pesi YOLO"
+    )
+    ap.add_argument(
+        "--config",
+        default="yolov2.cfg",
+        help="File cfg YOLO"
+    )
+    ap.add_argument(
+        "--labels",
+        default="labels.txt",
+        help="File classi YOLO"
+    )
+    ap.add_argument(
+        "--min-confidence",
+        type=float,
+        default=0.35,
+        help="Soglia minima confidenza YOLO"
+    )
+    ap.add_argument(
+        "--nms-threshold",
+        type=float,
+        default=0.40,
+        help="Soglia NMS"
+    )
+    ap.add_argument(
+        "--ocr-class",
+        default="etichetta",
+        help="Classe YOLO su cui fare OCR"
+    )
+    # Used as a modulo divisor on the frame counter: 0 would raise
+    # ZeroDivisionError on the first frame.
+    ap.add_argument(
+        "--ocr-every",
+        type=_int_at_least(1),
+        default=5,
+        help="Fai OCR ogni N frame"
+    )
+    ap.add_argument(
+        "--slot-cooldown",
+        type=int,
+        default=15,
+        help="Numero minimo di frame prima di rifare OCR sulla stessa zona"
+    )
+    # Used as a floor-division divisor when bucketing box centres: 0 would
+    # raise ZeroDivisionError.
+    ap.add_argument(
+        "--slot-size",
+        type=_int_at_least(1),
+        default=120,
+        help="Dimensione griglia per deduplicare le etichette"
+    )
+    # Used as a slice bound: a negative value would silently drop the
+    # lowest-confidence detections instead of limiting the count.
+    ap.add_argument(
+        "--max-ocr-boxes",
+        type=_int_at_least(0),
+        default=2,
+        help="Numero massimo di ROI etichetta su cui fare OCR per frame"
+    )
+    ap.add_argument(
+        "--min-label-width",
+        type=int,
+        default=60,
+        help="Larghezza minima bbox etichetta"
+    )
+    ap.add_argument(
+        "--min-label-height",
+        type=int,
+        default=25,
+        help="Altezza minima bbox etichetta"
+    )
+    ap.add_argument(
+        "--detect-width",
+        type=int,
+        default=960,
+        help="Larghezza massima del frame usato per detection YOLO"
+    )
+    ap.add_argument(
+        "--show-roi",
+        action="store_true",
+        help="Mostra anche la ROI preprocessata per OCR"
+    )
+    ap.add_argument(
+        "--tesseract-cmd",
+        default=None,
+        help="Percorso esplicito a tesseract.exe, se necessario"
+    )
+    ap.add_argument(
+        "--print-all",
+        action="store_true",
+        help="Stampa anche OCR vuoti o corti"
+    )
+
+    return ap.parse_args()
+
+
+def require_file(path_str, description):
+    """Return *path_str* as a ``Path``, exiting if it is not an existing file.
+
+    Args:
+        path_str: Path of the required file.
+        description: Human-readable name of the file, used in the error
+            message.
+
+    Returns:
+        ``Path(path_str)`` when it points at a regular file.
+
+    Raises:
+        SystemExit: with code 1 when the path is missing or is not a file.
+    """
+    path = Path(path_str)
+    # is_file() rather than exists(): a directory with the right name must
+    # not pass the check only to fail later inside the model loader.
+    if not path.is_file():
+        print(f"Errore: {description} non trovato: {path}")
+        sys.exit(1)
+    return path
+
+
+def load_classes(labels_path):
+    """Read the class names, one per line, from *labels_path*.
+
+    Blank lines are skipped and surrounding whitespace is stripped from each
+    name.
+
+    Args:
+        labels_path: Path of the text file listing the class names.
+
+    Returns:
+        The non-empty list of class names, in file order.
+
+    Raises:
+        SystemExit: with code 1 when the file contains no class names.
+    """
+    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM;
+    # with plain "utf-8" the BOM stays glued to the first class name and any
+    # later comparison against that name fails.
+    with open(labels_path, "rt", encoding="utf-8-sig") as f:
+        classes = [name for name in (line.strip() for line in f) if name]
+
+    if not classes:
+        # Report the actual path: the file is not necessarily "labels.txt".
+        print(f"Errore: file classi vuoto: {labels_path}")
+        sys.exit(1)
+
+    return classes
+
+
+def open_capture(video_arg):
+    """Open the video source described by *video_arg*.
+
+    ``None`` selects camera 0 and an all-digit value selects the camera with
+    that index; cameras are first opened with the ``cv2.CAP_DSHOW`` backend
+    and, if that capture is not opened, retried with the default backend.
+    Any other value is handed to ``cv2.VideoCapture`` unchanged.
+
+    Returns:
+        The ``cv2.VideoCapture`` object; the caller must check ``isOpened()``.
+    """
+    is_camera = video_arg is None or str(video_arg).isdigit()
+    if not is_camera:
+        return cv2.VideoCapture(video_arg)
+
+    device_index = 0 if video_arg is None else int(video_arg)
+
+    capture = cv2.VideoCapture(device_index, cv2.CAP_DSHOW)
+    if capture.isOpened():
+        return capture
+
+    # Preferred backend unavailable: let OpenCV choose one.
+    return cv2.VideoCapture(device_index)
+
+
+def resize_for_detection(frame, max_width):
+    """Shrink *frame* so that it is at most *max_width* pixels wide.
+
+    Args:
+        frame: Image array whose first two dimensions are height and width.
+        max_width: Width limit; a value <= 0 disables resizing.
+
+    Returns:
+        ``(image, scale_x, scale_y)``. When no resize is needed the original
+        frame object is returned with both scales equal to 1.0. Otherwise the
+        scales are the original-over-resized size ratios, i.e. the factors
+        that map coordinates of the resized image back onto *frame*.
+    """
+    src_h, src_w = frame.shape[:2]
+
+    must_shrink = max_width > 0 and src_w > max_width
+    if not must_shrink:
+        return frame, 1.0, 1.0
+
+    ratio = max_width / float(src_w)
+    dst_w, dst_h = int(src_w * ratio), int(src_h * ratio)
+
+    shrunk = cv2.resize(frame, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
+
+    # Derive the scales from the truncated integer size, not from `ratio`,
+    # so they match the image that was actually produced.
+    return shrunk, src_w / float(dst_w), src_h / float(dst_h)
+
+
+def clip_box(x1, y1, x2, y2, w, h):
+    """Clamp the corners of a box to the pixel range of a ``w`` x ``h`` image.
+
+    X coordinates are limited to ``[0, w - 1]`` and Y coordinates to
+    ``[0, h - 1]``.
+
+    Returns:
+        The clamped ``(x1, y1, x2, y2)`` tuple.
+    """
+    last_col = w - 1
+    last_row = h - 1
+    corners = ((x1, last_col), (y1, last_row), (x2, last_col), (y2, last_row))
+    return tuple(max(0, min(value, upper)) for value, upper in corners)
+
+
+def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.10):
+    """Grow a box by a fraction of its size on every side, then clip it.
+
+    Args:
+        x1, y1, x2, y2: Box corners.
+        frame_w, frame_h: Frame size passed to ``clip_box``.
+        pad_ratio: Fraction of the box width/height added on each side.
+
+    Returns:
+        The padded box as returned by ``clip_box``.
+    """
+    margin_x = int((x2 - x1) * pad_ratio)
+    margin_y = int((y2 - y1) * pad_ratio)
+
+    return clip_box(
+        x1 - margin_x,
+        y1 - margin_y,
+        x2 + margin_x,
+        y2 + margin_y,
+        frame_w,
+        frame_h,
+    )
+
+
+def preprocess_for_ocr(roi):
+    """Turn a BGR crop into a binarised, bordered image for OCR.
+
+    Steps: grayscale conversion, 2x bicubic upscaling, 3x3 Gaussian blur,
+    Otsu thresholding and a 10-pixel constant border of value 255 on every
+    side.
+
+    Returns:
+        The processed single-channel image.
+    """
+    grayscale = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
+
+    upscaled = cv2.resize(
+        grayscale, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC
+    )
+    smoothed = cv2.GaussianBlur(upscaled, (3, 3), 0)
+
+    # With THRESH_OTSU the threshold argument (0) is ignored and the level
+    # is chosen automatically; only the thresholded image is needed here.
+    _, binary = cv2.threshold(
+        smoothed, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
+    )
+
+    margin = 10
+    return cv2.copyMakeBorder(
+        binary,
+        margin, margin, margin, margin,
+        borderType=cv2.BORDER_CONSTANT,
+        value=255,
+    )
+
+
+def ocr_digits_only(roi):
+    """Run Tesseract on *roi* and keep only the digits it reads.
+
+    The crop is preprocessed with ``preprocess_for_ocr`` and recognised with
+    a digits-only character whitelist.
+
+    Returns:
+        ``(digits, processed, raw_text)``: the recognised text with every
+        non-digit run removed, the preprocessed image that was sent to
+        Tesseract, and the unmodified Tesseract output.
+    """
+    processed = preprocess_for_ocr(roi)
+
+    raw_text = pytesseract.image_to_string(
+        processed,
+        config=r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789',
+    )
+
+    # Also strips the whitespace/newlines Tesseract leaves in its output.
+    return re.sub(r"\D+", "", raw_text), processed, raw_text
+
+
+def detect_yolo(net, frame, classes, min_confidence, nms_threshold,
+                input_size=(416, 416)):
+    """Run the YOLO network on *frame* and return the NMS-filtered detections.
+
+    Args:
+        net: Network object exposing ``setInput``, ``forward`` and
+            ``getUnconnectedOutLayersNames``.
+        frame: Image to analyse; box coordinates are expressed in its pixels.
+        classes: Class names indexed by class id.
+        min_confidence: Minimum class score for a detection to be kept; also
+            passed to NMS as its score threshold.
+        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.
+        input_size: ``(width, height)`` of the network input blob. Defaults
+            to the 416x416 size this function previously hard-coded.
+
+    Returns:
+        A list of dicts with keys ``class_id``, ``label``, ``confidence`` and
+        ``box`` (``(x1, y1, x2, y2)`` clipped to the frame). Empty when
+        nothing passes the thresholds.
+    """
+    h, w = frame.shape[:2]
+
+    blob = cv2.dnn.blobFromImage(
+        frame,
+        scalefactor=1.0 / 255.0,
+        size=input_size,
+        swapRB=True,
+        crop=False
+    )
+    net.setInput(blob)
+    layer_outputs = net.forward(net.getUnconnectedOutLayersNames())
+
+    boxes = []
+    confidences = []
+    class_ids = []
+
+    for output in layer_outputs:
+        for detection in output:
+            # Elements 0-3 are the box, element 4 is skipped (presumably the
+            # objectness score of the Darknet layout), the rest are per-class
+            # scores.
+            scores = detection[5:]
+            if scores.size == 0:
+                continue
+
+            class_id = int(np.argmax(scores))
+            confidence = float(scores[class_id])
+            if confidence < min_confidence:
+                continue
+
+            # Centre and size are scaled by the frame size, then converted
+            # to a top-left corner + size box as expected by NMSBoxes.
+            center_x = int(detection[0] * w)
+            center_y = int(detection[1] * h)
+            box_w = int(detection[2] * w)
+            box_h = int(detection[3] * h)
+
+            boxes.append([
+                int(center_x - box_w / 2),
+                int(center_y - box_h / 2),
+                box_w,
+                box_h,
+            ])
+            confidences.append(confidence)
+            class_ids.append(class_id)
+
+    if not boxes:
+        return []
+
+    indices = cv2.dnn.NMSBoxes(
+        boxes,
+        confidences,
+        min_confidence,
+        nms_threshold
+    )
+    if len(indices) == 0:
+        return []
+
+    final_detections = []
+
+    # flatten() copes with both a flat index list and an Nx1 index array.
+    for i in np.array(indices).flatten():
+        i = int(i)
+        x, y, bw, bh = boxes[i]
+        class_id = class_ids[i]
+
+        # The labels file may list fewer classes than the network outputs;
+        # fall back to a synthetic name instead of raising IndexError.
+        if class_id < len(classes):
+            label = classes[class_id]
+        else:
+            label = f"class_{class_id}"
+
+        final_detections.append({
+            "class_id": class_id,
+            "label": label,
+            "confidence": confidences[i],
+            "box": clip_box(x, y, x + bw, y + bh, w, h),
+        })
+
+    return final_detections
+
+
+def quantized_slot_key(x1, y1, x2, y2, slot_size):
+    """Map a box to the grid cell that contains its centre.
+
+    The centre is computed with integer floor division and each coordinate
+    is then floor-divided by *slot_size*.
+
+    Returns:
+        The ``(column, row)`` cell index tuple.
+    """
+    center = ((x1 + x2) // 2, (y1 + y2) // 2)
+    column, row = (coordinate // slot_size for coordinate in center)
+    return (column, row)
+
+
+def draw_detection(frame, det, color=(255, 255, 255), text_extra=""):
+    """Draw the box and caption of one detection onto *frame*, in place.
+
+    Args:
+        frame: Image to draw on.
+        det: Dict with keys ``box`` (``(x1, y1, x2, y2)``), ``label`` and
+            ``confidence``.
+        color: Colour used for both the rectangle and the caption.
+        text_extra: Optional text appended to the caption after `` | ``.
+    """
+    left, top, right, bottom = det["box"]
+    label = det["label"]
+    conf = det["confidence"]
+
+    text = f"{label} {conf:.2f}"
+    if text_extra:
+        text += f" | {text_extra}"
+
+    cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
+
+    # Caption sits just above the box, but never higher than y = 20.
+    caption_origin = (left, max(20, top - 8))
+    cv2.putText(
+        frame,
+        text,
+        caption_origin,
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.6,
+        color,
+        2,
+        cv2.LINE_AA,
+    )
+
+
+def main():
+    """Run the YOLO detection + label OCR loop on a video source.
+
+    For every frame: detect objects on a possibly downscaled copy, map the
+    boxes back to the original frame, run digit OCR on the most confident
+    boxes of the ``--ocr-class`` class (every N frames, with a per-grid-cell
+    cooldown), draw the overlays and show the result. The loop ends when
+    the stream ends or ``q`` is pressed.
+
+    Raises:
+        SystemExit: when a required file is missing or the video source
+            cannot be opened.
+    """
+    args = parse_args()
+
+    if args.tesseract_cmd:
+        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd
+
+    require_file(args.weights, "File pesi YOLO")
+    require_file(args.config, "File cfg YOLO")
+    require_file(args.labels, "File labels YOLO")
+
+    classes = load_classes(args.labels)
+    print(classes)
+
+    ocr_class_lower = args.ocr_class.strip().lower()
+
+    # Both values are used as divisors below; 0 would raise
+    # ZeroDivisionError, so clamp them to at least 1.
+    ocr_every = max(1, args.ocr_every)
+    slot_size = max(1, args.slot_size)
+
+    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
+    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
+    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
+
+    cap = open_capture(args.video)
+    if not cap.isOpened():
+        print("Errore: impossibile aprire la sorgente video")
+        sys.exit(1)
+
+    win_name = "YOLO + OCR etichette"
+    frame_idx = 0
+
+    # OCR memory keyed by label position (grid cell).
+    slot_memory = {}
+
+    # Frame at which each number was last printed, to avoid printing the
+    # same number over and over.
+    printed_texts = {}
+
+    # try/finally so the capture and the windows are released even when the
+    # loop is interrupted by an exception or Ctrl+C.
+    try:
+        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)
+
+        while True:
+            grabbed, frame = cap.read()
+
+            if not grabbed or frame is None:
+                print("Fine stream o impossibile leggere il frame")
+                break
+
+            frame_idx += 1
+            orig_h, orig_w = frame.shape[:2]
+
+            det_frame, scale_x, scale_y = resize_for_detection(
+                frame,
+                args.detect_width
+            )
+
+            detections = detect_yolo(
+                net,
+                det_frame,
+                classes,
+                args.min_confidence,
+                args.nms_threshold
+            )
+
+            # Scale the boxes back onto the original frame.
+            scaled_detections = []
+            for det in detections:
+                x1, y1, x2, y2 = det["box"]
+                box = clip_box(
+                    int(x1 * scale_x),
+                    int(y1 * scale_y),
+                    int(x2 * scale_x),
+                    int(y2 * scale_y),
+                    orig_w,
+                    orig_h
+                )
+                scaled_detections.append({**det, "box": box})
+
+            # OCR candidates: only the label class, most confident first.
+            label_detections = sorted(
+                (
+                    d for d in scaled_detections
+                    if d["label"].strip().lower() == ocr_class_lower
+                ),
+                key=lambda d: d["confidence"],
+                reverse=True
+            )[:args.max_ocr_boxes]
+
+            # OCR only every N frames and with a per-slot cooldown. This runs
+            # BEFORE any overlay is drawn, so the ROI cut from `frame` holds
+            # only camera pixels and not the rectangles/captions.
+            if frame_idx % ocr_every == 0:
+                for det in label_detections:
+                    x1, y1, x2, y2 = det["box"]
+
+                    too_small = (
+                        (x2 - x1) < args.min_label_width
+                        or (y2 - y1) < args.min_label_height
+                    )
+                    if too_small:
+                        continue
+
+                    slot_key = quantized_slot_key(x1, y1, x2, y2, slot_size)
+                    slot_info = slot_memory.get(slot_key)
+
+                    if (
+                        slot_info is not None
+                        and frame_idx - slot_info["last_ocr_frame"] < args.slot_cooldown
+                    ):
+                        continue
+
+                    rx1, ry1, rx2, ry2 = expand_box(
+                        x1, y1, x2, y2,
+                        orig_w, orig_h,
+                        pad_ratio=0.10
+                    )
+
+                    roi = frame[ry1:ry2, rx1:rx2]
+                    if roi.size == 0:
+                        continue
+
+                    digits, processed, raw_text = ocr_digits_only(roi)
+
+                    if args.print_all:
+                        print(
+                            f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'"
+                        )
+
+                    if len(digits) >= 2:
+                        text = digits
+
+                        last_print_frame = printed_texts.get(digits, -999999)
+                        if frame_idx - last_print_frame > 30:
+                            print(f"[frame {frame_idx}] Etichetta letta: {digits}")
+                            printed_texts[digits] = frame_idx
+
+                        if args.show_roi:
+                            cv2.imshow("ROI OCR", processed)
+                    else:
+                        # Unusable reading: keep the previous text but still
+                        # restart the cooldown for this slot.
+                        text = slot_info["text"] if slot_info else ""
+
+                    slot_memory[slot_key] = {
+                        "text": text,
+                        "last_ocr_frame": frame_idx,
+                        "box": (x1, y1, x2, y2),
+                    }
+
+            # Draw every detection.
+            for det in scaled_detections:
+                label_lower = det["label"].strip().lower()
+
+                if label_lower == ocr_class_lower:
+                    color = (0, 255, 255)
+                elif label_lower == "gaylord":
+                    color = (0, 255, 0)
+                else:
+                    color = (255, 255, 255)
+
+                draw_detection(frame, det, color=color)
+
+            # Draw the stored OCR text next to the labels.
+            for det in label_detections:
+                x1, y1, x2, y2 = det["box"]
+
+                slot_info = slot_memory.get(
+                    quantized_slot_key(x1, y1, x2, y2, slot_size)
+                )
+
+                if slot_info and slot_info.get("text"):
+                    cv2.putText(
+                        frame,
+                        f"NUM: {slot_info['text']}",
+                        (x1, min(orig_h - 5, y2 + 22)),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        0.7,
+                        (0, 255, 255),
+                        2,
+                        cv2.LINE_AA
+                    )
+
+            cv2.imshow(win_name, frame)
+
+            key = cv2.waitKey(1) & 0xFF
+            if key == ord("q"):
+                break
+    finally:
+        cap.release()
+        cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    main()