Initial import

2026-05-14 09:36:44 +02:00
commit 747298ac7a
1212 changed files with 56349 additions and 0 deletions
--- a/ware_detect_optimized_v2.py
+++ b/ware_detect_optimized_v2.py
@@ -0,0 +1,382 @@
+import argparse
+import re
+import sys
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pytesseract
+
+
+def parse_args():
+    """Build the command-line parser and return the parsed options.
+
+    Arguments are read from ``sys.argv`` by ``argparse``.
+
+    ``--detect-every`` and ``--ocr-every-detect`` are used as modulo
+    divisors by the main loop, so they are validated here to be integers
+    >= 1; a value of 0 would otherwise raise ``ZeroDivisionError`` on the
+    first frame.
+
+    Returns:
+        argparse.Namespace: the parsed options.
+    """
+
+    def positive_int(text):
+        # argparse converts ArgumentTypeError into a usage error (exit code 2).
+        try:
+            value = int(text)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                f"valore intero non valido: {text!r}") from None
+        if value < 1:
+            raise argparse.ArgumentTypeError(
+                f"deve essere >= 1 (ricevuto {value})")
+        return value
+
+    ap = argparse.ArgumentParser()
+
+    ap.add_argument("-v", "--video", default=None,
+                    help="Percorso video. Se omesso usa webcam 0")
+    ap.add_argument("--weights", default="yolov2.weights",
+                    help="File pesi YOLOv2")
+    ap.add_argument("--config", default="yolov2.cfg",
+                    help="File config YOLOv2")
+    ap.add_argument("--labels", default="labels.txt",
+                    help="File labels classi")
+    ap.add_argument("--tesseract-cmd", default=None,
+                    help="Percorso esplicito a tesseract.exe")
+
+    # Performance
+    ap.add_argument("--input-size", type=int, default=320,
+                    help="Dimensione input YOLO (320 piu' veloce di 416)")
+    ap.add_argument("--detect-every", type=positive_int, default=4,
+                    help="Esegue YOLO ogni N frame")
+    ap.add_argument("--ocr-every-detect", type=positive_int, default=2,
+                    help="Esegue OCR ogni N cicli di detection")
+    ap.add_argument("--preview-width", type=int, default=1280,
+                    help="Larghezza massima finestra preview")
+    ap.add_argument("--max-ocr-boxes", type=int, default=1,
+                    help="Numero massimo di etichette da leggere per detection")
+
+    # Quality
+    ap.add_argument("--min-confidence", type=float, default=0.30,
+                    help="Soglia minima confidenza")
+    ap.add_argument("--label-class", default="etichetta",
+                    help="Nome classe etichetta")
+    ap.add_argument("--min-label-width", type=int, default=50,
+                    help="Larghezza minima bbox etichetta")
+    ap.add_argument("--min-label-height", type=int, default=20,
+                    help="Altezza minima bbox etichetta")
+    ap.add_argument("--ocr-min-digits", type=int, default=2,
+                    help="Numero minimo di cifre per considerare valida una lettura")
+    ap.add_argument("--show-roi", action="store_true",
+                    help="Mostra ROI preprocessata per OCR")
+    ap.add_argument("--print-all", action="store_true",
+                    help="Stampa anche OCR grezzi non validi")
+
+    return ap.parse_args()
+
+
+def require_file(path_str, description):
+    """Return ``path_str`` as a ``Path``, exiting if it is not a regular file.
+
+    Args:
+        path_str: path to validate.
+        description: human-readable name used in the error message.
+
+    Returns:
+        pathlib.Path: the validated path.
+
+    Prints an error and exits with status 1 when the path is missing or is
+    not a file.
+    """
+    path = Path(path_str)
+    # is_file() rather than exists(): a directory passes exists() and would
+    # only fail later, with a less helpful error, when it is opened.
+    if not path.is_file():
+        print(f"Errore: {description} non trovato: {path}")
+        sys.exit(1)
+    return path
+
+
+def load_classes(labels_path):
+    """Read class names from ``labels_path``, one per non-blank line.
+
+    Each line is stripped of surrounding whitespace; blank lines are
+    skipped. The file is decoded as ``utf-8-sig`` so that a leading UTF-8
+    byte-order mark is dropped instead of being glued to the first class
+    name (``str.strip`` does not remove it).
+
+    Returns:
+        list[str]: the class names in file order.
+
+    Prints an error and exits with status 1 when no class name is found.
+    """
+    with open(labels_path, "rt", encoding="utf-8-sig") as f:
+        classes = [line.strip() for line in f if line.strip()]
+
+    if not classes:
+        # Report the real path: the labels file name is configurable.
+        print(f"Errore: file labels vuoto: {labels_path}")
+        sys.exit(1)
+
+    return classes
+
+
+def open_capture(video_arg):
+    """Open the video source described by ``video_arg``.
+
+    ``None`` selects camera 0 and an all-digit value selects that camera
+    index; cameras are tried with the DirectShow backend first and then
+    with the default backend if that did not open. Any other value is
+    passed to ``cv2.VideoCapture`` as-is.
+
+    Returns the ``cv2.VideoCapture`` object; the caller checks
+    ``isOpened()``.
+    """
+    if video_arg is None:
+        camera_index = 0
+    elif str(video_arg).isdigit():
+        camera_index = int(video_arg)
+    else:
+        return cv2.VideoCapture(video_arg)
+
+    capture = cv2.VideoCapture(camera_index, cv2.CAP_DSHOW)
+    if capture.isOpened():
+        return capture
+    return cv2.VideoCapture(camera_index)
+
+
+def resize_preview(frame, max_width):
+    """Shrink ``frame`` to at most ``max_width`` pixels wide, keeping aspect.
+
+    The input frame is returned unchanged when ``max_width`` is not
+    positive or the frame is already narrow enough.
+    """
+    height, width = frame.shape[:2]
+    needs_shrink = max_width > 0 and width > max_width
+    if not needs_shrink:
+        return frame
+
+    ratio = max_width / float(width)
+    target_size = (int(width * ratio), int(height * ratio))
+    return cv2.resize(frame, target_size, interpolation=cv2.INTER_LINEAR)
+
+
+def clip_box(x1, y1, x2, y2, w, h):
+    """Clamp box corners to ``[0, w - 1]`` horizontally and ``[0, h - 1]`` vertically.
+
+    Returns the clamped ``(x1, y1, x2, y2)`` tuple.
+    """
+    max_x = w - 1
+    max_y = h - 1
+
+    def clamp(value, upper):
+        return max(0, min(value, upper))
+
+    return clamp(x1, max_x), clamp(y1, max_y), clamp(x2, max_x), clamp(y2, max_y)
+
+
+def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.08):
+    """Grow a box by ``pad_ratio`` of its size on every side, then clip it.
+
+    The margin is computed per axis from the box width/height and truncated
+    to an int; the enlarged box is clamped to the frame via ``clip_box``.
+    """
+    margin_x = int((x2 - x1) * pad_ratio)
+    margin_y = int((y2 - y1) * pad_ratio)
+
+    return clip_box(
+        x1 - margin_x,
+        y1 - margin_y,
+        x2 + margin_x,
+        y2 + margin_y,
+        frame_w,
+        frame_h,
+    )
+
+
+def preprocess_for_ocr(roi):
+    """Turn a BGR crop into a bordered black/white image for Tesseract.
+
+    Steps: grayscale, 1.7x cubic upscale, 3x3 Gaussian blur, Otsu
+    binarization, then an 8-pixel white border on every side.
+    """
+    mono = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
+
+    # Moderate upscale: enough for OCR while keeping the cost low.
+    upscaled = cv2.resize(mono, None, fx=1.7, fy=1.7,
+                          interpolation=cv2.INTER_CUBIC)
+
+    # Light denoising before thresholding.
+    smoothed = cv2.GaussianBlur(upscaled, (3, 3), 0)
+
+    # Otsu picks the threshold; the explicit 0 is ignored.
+    _, binary = cv2.threshold(smoothed, 0, 255,
+                              cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    # White frame around the binarized crop.
+    return cv2.copyMakeBorder(binary, 8, 8, 8, 8,
+                              borderType=cv2.BORDER_CONSTANT, value=255)
+
+
+def ocr_digits_only(roi):
+    """Run digit-only OCR on ``roi``.
+
+    Returns a ``(digits, processed, raw_text)`` tuple: the OCR output with
+    every non-digit run removed, the preprocessed image that was fed to
+    Tesseract, and the unfiltered OCR string.
+    """
+    ocr_input = preprocess_for_ocr(roi)
+
+    # Single text line (psm 7), restricted to the characters 0-9.
+    tesseract_config = r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789'
+    raw_text = pytesseract.image_to_string(ocr_input, config=tesseract_config)
+
+    return re.sub(r"\D+", "", raw_text), ocr_input, raw_text
+
+
+def detect_yolov2_original_style(net, frame, classes, min_confidence,
+                                 input_size=None):
+    """Run the network on ``frame`` and return detections above a threshold.
+
+    The parsing stays close to the original script on purpose: the generic
+    YOLOv3/4-style parsing could produce a very large number of boxes here
+    and slow everything down drastically.
+
+    The network output is flattened to 2-D and each row is read as
+    ``[x_center, y_center, width, height, <unused>, class scores...]``,
+    with the first four values multiplied by the frame width/height.
+
+    Args:
+        net: network object exposing ``setInput`` and ``forward``.
+        frame: image passed to ``cv2.dnn.blobFromImage``.
+        classes: class names indexed by class id.
+        min_confidence: a row is kept only if its best class score is
+            strictly greater than this value.
+        input_size: side of the square network input. When ``None`` the
+            module-level ``args.input_size`` set by ``main`` is used.
+
+    Returns:
+        list[dict]: one dict per kept row, in row order, with the keys
+        ``class_id``, ``label``, ``confidence`` and ``box``
+        (``(x1, y1, x2, y2)`` clipped to the frame).
+    """
+    if input_size is None:
+        # Backward-compatible fallback to the CLI namespace set by main().
+        input_size = args.input_size
+
+    h, w = frame.shape[:2]
+
+    blob = cv2.dnn.blobFromImage(
+        frame,
+        scalefactor=1.0 / 255.0,
+        size=(input_size, input_size),
+        mean=(0, 0, 0),
+        # NOTE(review): channels are left in BGR order; confirm this matches
+        # how the model was trained.
+        swapRB=False,
+        crop=False
+    )
+
+    net.setInput(blob)
+    predictions = np.array(net.forward())
+
+    if predictions.size == 0:
+        return []
+
+    # Normalize the output to (rows, values-per-row).
+    if predictions.ndim == 4:
+        # Keep the last axis as the row and fold everything else into the
+        # row index; same result as before whenever the old reshape worked.
+        predictions = predictions.reshape(-1, predictions.shape[-1])
+    elif predictions.ndim == 3:
+        predictions = predictions[0]
+
+    # Rows need at least one class score after the first five values.
+    if predictions.ndim != 2 or predictions.shape[1] <= 5:
+        return []
+
+    # Vectorized filter: best class and its score for every row at once.
+    scores = predictions[:, 5:]
+    best_class = scores.argmax(axis=1)
+    best_score = scores[np.arange(scores.shape[0]), best_class]
+    # Strict '>' also drops NaN scores.
+    kept_rows = np.flatnonzero(best_score > min_confidence)
+
+    detections = []
+    for i in kept_rows:
+        row = predictions[i]
+        class_index = int(best_class[i])
+
+        x_center = float(row[0]) * w
+        y_center = float(row[1]) * h
+        width_box = float(row[2]) * w
+        height_box = float(row[3]) * h
+
+        x1 = int(x_center - width_box * 0.5)
+        y1 = int(y_center - height_box * 0.5)
+        x2 = int(x_center + width_box * 0.5)
+        y2 = int(y_center + height_box * 0.5)
+
+        x1, y1, x2, y2 = clip_box(x1, y1, x2, y2, w, h)
+
+        # A labels file shorter than the model's class count must not crash
+        # the detection loop; fall back to a placeholder name.
+        if class_index < len(classes):
+            label = classes[class_index]
+        else:
+            label = f"class_{class_index}"
+
+        detections.append({
+            "class_id": class_index,
+            "label": label,
+            "confidence": float(best_score[i]),
+            "box": (x1, y1, x2, y2),
+        })
+
+    return detections
+
+
+def draw_detection(frame, det, extra_text=""):
+    """Draw one detection (rectangle plus caption) onto ``frame`` in place.
+
+    The color depends on the label: yellow for the configured label class
+    (``args.label_class``), green for "gaylord", white otherwise. The
+    caption is ``"<label> <confidence>"`` with ``extra_text`` appended
+    after a ``|`` separator when it is non-empty.
+    """
+    left, top, right, bottom = det["box"]
+    label = det["label"]
+    conf = det["confidence"]
+
+    lowered = label.lower()
+    if lowered == args.label_class.lower():
+        color = (0, 255, 255)
+    elif lowered == "gaylord":
+        color = (0, 255, 0)
+    else:
+        color = (255, 255, 255)
+
+    cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
+
+    text = f"{label} {conf:.2f}" + (f" | {extra_text}" if extra_text else "")
+
+    # Keep the caption inside the image when the box touches the top edge.
+    baseline_y = max(20, top - 8)
+    cv2.putText(frame, text, (left, baseline_y),
+                cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)
+
+
+def best_label_detections(detections, label_name, max_boxes):
+    """Pick the top ``max_boxes`` detections whose label matches ``label_name``.
+
+    Labels are compared case-insensitively (the detection label is also
+    stripped). Matches are ranked by confidence, then by box area, both
+    descending, so that more readable boxes come first.
+    """
+
+    def is_wanted(det):
+        return det["label"].strip().lower() == label_name.lower()
+
+    def rank(det):
+        left, top, right, bottom = det["box"]
+        return det["confidence"], (right - left) * (bottom - top)
+
+    ranked = sorted(filter(is_wanted, detections), key=rank, reverse=True)
+    return ranked[:max_boxes]
+
+
+def _box_iou(box_a, box_b):
+    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes."""
+    inter_w = min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])
+    inter_h = min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])
+    if inter_w <= 0 or inter_h <= 0:
+        return 0.0
+
+    inter = inter_w * inter_h
+    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+    union = area_a + area_b - inter
+    return inter / float(union) if union > 0 else 0.0
+
+
+def _text_for_box(box, text_by_box, min_iou=0.5):
+    """Return the OCR text stored for ``box`` or for its best-overlapping box.
+
+    An exact key match wins; otherwise the stored box with the highest IoU
+    of at least ``min_iou`` is used. Returns ``""`` when nothing matches.
+    """
+    if box in text_by_box:
+        return text_by_box[box]
+
+    best_text = ""
+    best_iou = min_iou
+    for known_box, text in text_by_box.items():
+        iou = _box_iou(box, known_box)
+        if iou >= best_iou:
+            best_text, best_iou = text, iou
+    return best_text
+
+
+def main():
+    """Run the detection + OCR preview loop until the stream ends or 'q'.
+
+    Parses the CLI options into the module-level ``args``, loads the
+    network and class names, then for every frame: runs detection every
+    ``--detect-every`` frames, runs OCR on the best label boxes every
+    ``--ocr-every-detect`` detection cycles, and draws the most recent
+    detections on the preview window.
+
+    Exits with status 1 if a required file is missing or the video source
+    cannot be opened.
+    """
+    # draw_detection / detect_yolov2_original_style read this global.
+    global args
+    args = parse_args()
+
+    if args.tesseract_cmd:
+        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd
+
+    require_file(args.weights, "File pesi YOLOv2")
+    require_file(args.config, "File config YOLOv2")
+    require_file(args.labels, "File labels")
+
+    classes = load_classes(args.labels)
+    print(classes)
+
+    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
+    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
+    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
+
+    cap = open_capture(args.video)
+    if not cap.isOpened():
+        print("Errore: impossibile aprire la sorgente video")
+        sys.exit(1)
+
+    # Both intervals are modulo divisors below; a value of 0 must not crash
+    # the loop. abs() keeps the previous behavior for negative values.
+    detect_every = max(1, abs(args.detect_every))
+    ocr_every_detect = max(1, abs(args.ocr_every_detect))
+    label_class = args.label_class.lower()
+
+    win_name = "YOLOv2 + OCR etichette (versione alleggerita)"
+
+    frame_idx = 0
+    detect_cycle = 0
+
+    last_detections = []
+    last_text_by_box = {}
+    printed_texts = {}
+
+    # try/finally so the capture device and windows are released even when
+    # the loop is interrupted by an exception or Ctrl-C.
+    try:
+        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)
+
+        while True:
+            grabbed, frame = cap.read()
+
+            if not grabbed or frame is None:
+                print("Fine stream o impossibile leggere il frame")
+                break
+
+            frame_idx += 1
+
+            # Detection does NOT run on every frame.
+            if frame_idx == 1 or (frame_idx % detect_every == 0):
+                detect_cycle += 1
+
+                last_detections = detect_yolov2_original_style(
+                    net=net,
+                    frame=frame,
+                    classes=classes,
+                    min_confidence=args.min_confidence
+                )
+
+                label_dets = best_label_detections(
+                    last_detections,
+                    args.label_class,
+                    args.max_ocr_boxes
+                )
+
+                # OCR only every N detection cycles.
+                if detect_cycle % ocr_every_detect == 0:
+                    new_text_by_box = {}
+
+                    for det in label_dets:
+                        x1, y1, x2, y2 = det["box"]
+                        bw = x2 - x1
+                        bh = y2 - y1
+
+                        if bw < args.min_label_width or bh < args.min_label_height:
+                            continue
+
+                        rx1, ry1, rx2, ry2 = expand_box(
+                            x1, y1, x2, y2, frame.shape[1], frame.shape[0])
+                        roi = frame[ry1:ry2, rx1:rx2]
+
+                        if roi.size == 0:
+                            continue
+
+                        digits, processed, raw_text = ocr_digits_only(roi)
+
+                        if args.print_all:
+                            print(f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'")
+
+                        if len(digits) >= args.ocr_min_digits:
+                            new_text_by_box[(x1, y1, x2, y2)] = digits
+
+                            # Do not print the same reading again within 30 frames.
+                            last_print_frame = printed_texts.get(digits, -999999)
+                            if frame_idx - last_print_frame > 30:
+                                print(f"[frame {frame_idx}] Etichetta letta: {digits}")
+                                printed_texts[digits] = frame_idx
+
+                            if args.show_roi:
+                                cv2.imshow("ROI OCR", processed)
+
+                    # Keep the previous readings when this cycle read nothing.
+                    if new_text_by_box:
+                        last_text_by_box = new_text_by_box
+
+            # Draw on EVERY frame using the most recent detections.
+            display = frame.copy()
+
+            for det in last_detections:
+                extra = ""
+
+                # Detection runs more often than OCR, so box coordinates
+                # drift between readings; match stored text by overlap
+                # instead of exact coordinates, and only for label boxes.
+                if det["label"].strip().lower() == label_class:
+                    number = _text_for_box(tuple(det["box"]), last_text_by_box)
+                    if number:
+                        extra = f"NUM: {number}"
+
+                draw_detection(display, det, extra_text=extra)
+
+            display = resize_preview(display, args.preview_width)
+            cv2.imshow(win_name, display)
+
+            key = cv2.waitKey(1) & 0xFF
+            if key == ord("q"):
+                break
+    finally:
+        cap.release()
+        cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    main()