"""Run YOLO object detection on a video stream and OCR the digits on label boxes.

Detection uses OpenCV's DNN module with a Darknet cfg/weights pair; OCR uses
Tesseract through ``pytesseract`` restricted to the digits 0-9.
"""

import argparse
import re
import sys
from pathlib import Path

import cv2
import numpy as np
import pytesseract

# Class (lower-case) that is drawn in green instead of the default white.
HIGHLIGHT_CLASS = "gaylord"

# An OCR result is accepted only if it contains at least this many digits.
MIN_DIGITS = 2

# The same digit string is re-printed only after more than this many frames.
PRINT_REPEAT_FRAMES = 30


def _positive_int(value: str) -> int:
    """argparse ``type=`` callable accepting only integers >= 1.

    Used for options that end up as a divisor (modulo / floor division), where
    0 would raise ``ZeroDivisionError`` in the middle of the video loop.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"intero non valido: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"deve essere un intero >= 1, ricevuto {number}"
        )
    return number


def parse_args():
    """Parse the command line and return the resulting ``argparse.Namespace``."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-v", "--video",
        default=None,
        help="Percorso video. Se omesso usa la webcam 0"
    )
    ap.add_argument(
        "--weights",
        default="yolov2.weights",
        help="File pesi YOLO"
    )
    ap.add_argument(
        "--config",
        default="yolov2.cfg",
        help="File cfg YOLO"
    )
    ap.add_argument(
        "--labels",
        default="labels.txt",
        help="File classi YOLO"
    )
    ap.add_argument(
        "--min-confidence",
        type=float,
        default=0.35,
        help="Soglia minima confidenza YOLO"
    )
    ap.add_argument(
        "--nms-threshold",
        type=float,
        default=0.40,
        help="Soglia NMS"
    )
    ap.add_argument(
        "--ocr-class",
        default="etichetta",
        help="Classe YOLO su cui fare OCR"
    )
    ap.add_argument(
        "--ocr-every",
        type=_positive_int,  # used as a modulo divisor: 0 would crash
        default=5,
        help="Fai OCR ogni N frame"
    )
    ap.add_argument(
        "--slot-cooldown",
        type=int,
        default=15,
        help="Numero minimo di frame prima di rifare OCR sulla stessa zona"
    )
    ap.add_argument(
        "--slot-size",
        type=_positive_int,  # used as a floor-division divisor: 0 would crash
        default=120,
        help="Dimensione griglia per deduplicare le etichette"
    )
    ap.add_argument(
        "--max-ocr-boxes",
        type=int,
        default=2,
        help="Numero massimo di ROI etichetta su cui fare OCR per frame"
    )
    ap.add_argument(
        "--min-label-width",
        type=int,
        default=60,
        help="Larghezza minima bbox etichetta"
    )
    ap.add_argument(
        "--min-label-height",
        type=int,
        default=25,
        help="Altezza minima bbox etichetta"
    )
    ap.add_argument(
        "--detect-width",
        type=int,
        default=960,
        help="Larghezza massima del frame usato per detection YOLO"
    )
    ap.add_argument(
        "--show-roi",
        action="store_true",
        help="Mostra anche la ROI preprocessata per OCR"
    )
    ap.add_argument(
        "--tesseract-cmd",
        default=None,
        help="Percorso esplicito a tesseract.exe, se necessario"
    )
    ap.add_argument(
        "--print-all",
        action="store_true",
        help="Stampa anche OCR vuoti o corti"
    )
    return ap.parse_args()


def require_file(path_str, description):
    """Return ``Path(path_str)``; print an error and exit(1) if it does not exist."""
    path = Path(path_str)
    if not path.exists():
        print(f"Errore: {description} non trovato: {path}")
        sys.exit(1)
    return path


def load_classes(labels_path):
    """Read class names, one per non-blank line; exit(1) if none are found."""
    with open(labels_path, "rt", encoding="utf-8") as f:
        classes = [line.strip() for line in f if line.strip()]

    if not classes:
        print("Errore: labels.txt vuoto")
        sys.exit(1)

    return classes


def open_capture(video_arg):
    """Open the video source and return the ``cv2.VideoCapture`` (maybe unopened).

    ``None`` selects camera 0 and an all-digit string selects that camera
    index; for cameras the DirectShow backend is tried first, then the default
    backend. Anything else is handed to OpenCV as a path/URL. The caller must
    check ``isOpened()``.
    """
    if video_arg is None or str(video_arg).isdigit():
        index = 0 if video_arg is None else int(video_arg)
        cap = cv2.VideoCapture(index, cv2.CAP_DSHOW)
        if not cap.isOpened():
            cap = cv2.VideoCapture(index)
        return cap

    return cv2.VideoCapture(video_arg)


def resize_for_detection(frame, max_width):
    """Downscale ``frame`` so its width does not exceed ``max_width``.

    Returns ``(image, scale_x, scale_y)`` where the scales map coordinates in
    the returned image back to the original frame. The frame is returned
    untouched with scales 1.0 when ``max_width <= 0`` or it is already narrow
    enough.
    """
    h, w = frame.shape[:2]
    if max_width <= 0 or w <= max_width:
        return frame, 1.0, 1.0

    scale = max_width / float(w)
    # Clamp to 1 px so a very wide, very short frame never yields a 0-sized
    # target, which cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Recompute from the truncated integer sizes so the mapping is exact.
    scale_x = w / float(new_w)
    scale_y = h / float(new_h)
    return resized, scale_x, scale_y


def clip_box(x1, y1, x2, y2, w, h):
    """Clamp both corners of a box to ``[0, w - 1]`` x ``[0, h - 1]``."""
    x1 = max(0, min(x1, w - 1))
    y1 = max(0, min(y1, h - 1))
    x2 = max(0, min(x2, w - 1))
    y2 = max(0, min(y2, h - 1))
    return x1, y1, x2, y2


def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.10):
    """Grow a box by ``pad_ratio`` of its size on every side, clipped to the frame."""
    pad_x = int((x2 - x1) * pad_ratio)
    pad_y = int((y2 - y1) * pad_ratio)
    return clip_box(
        x1 - pad_x, y1 - pad_y, x2 + pad_x, y2 + pad_y, frame_w, frame_h
    )


def preprocess_for_ocr(roi):
    """Turn a BGR crop into a binarised, upscaled, white-bordered image for OCR."""
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(
        gray, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC
    )
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    gray = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
    gray = cv2.copyMakeBorder(
        gray, 10, 10, 10, 10,
        borderType=cv2.BORDER_CONSTANT,
        value=255
    )
    return gray


def ocr_digits_only(roi):
    """Run Tesseract on ``roi`` and return ``(digits, processed_image, raw_text)``.

    ``digits`` is the raw OCR output with every non-digit character removed.
    """
    processed = preprocess_for_ocr(roi)
    config = r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789'
    raw_text = pytesseract.image_to_string(processed, config=config)
    digits = re.sub(r"\D+", "", raw_text)
    return digits, processed, raw_text


def detect_yolo(net, frame, classes, min_confidence, nms_threshold):
    """Run the network on ``frame`` and return the detections kept by NMS.

    Each output row is read as: values 0-3 are the box centre and size as
    fractions of the frame size, values from index 5 on are per-class scores.
    Rows whose best class score is below ``min_confidence`` are dropped.

    Returns a list of dicts with keys ``class_id``, ``label``, ``confidence``
    and ``box`` (``(x1, y1, x2, y2)`` in pixels of ``frame``, clipped).
    """
    h, w = frame.shape[:2]

    blob = cv2.dnn.blobFromImage(
        frame,
        scalefactor=1.0 / 255.0,
        size=(416, 416),
        swapRB=True,
        crop=False
    )
    net.setInput(blob)
    layer_outputs = net.forward(net.getUnconnectedOutLayersNames())

    boxes = []
    confidences = []
    class_ids = []

    for output in layer_outputs:
        for detection in output:
            scores = detection[5:]
            if scores.size == 0:
                continue

            class_id = int(np.argmax(scores))
            confidence = float(scores[class_id])
            if confidence < min_confidence:
                continue

            center_x = int(detection[0] * w)
            center_y = int(detection[1] * h)
            box_w = int(detection[2] * w)
            box_h = int(detection[3] * h)

            x = int(center_x - box_w / 2)
            y = int(center_y - box_h / 2)

            boxes.append([x, y, box_w, box_h])
            confidences.append(confidence)
            class_ids.append(class_id)

    if not boxes:
        return []

    indices = cv2.dnn.NMSBoxes(
        boxes, confidences, min_confidence, nms_threshold
    )
    # NMSBoxes may return a tuple or an ndarray depending on the OpenCV
    # version, so test the length rather than truthiness.
    if len(indices) == 0:
        return []

    final_detections = []
    for i in np.array(indices).flatten():
        x, y, bw, bh = boxes[i]
        class_id = class_ids[i]
        # The labels file may list fewer classes than the network predicts;
        # use a placeholder name instead of raising IndexError mid-stream.
        if 0 <= class_id < len(classes):
            label = classes[class_id]
        else:
            label = f"class_{class_id}"

        final_detections.append({
            "class_id": class_id,
            "label": label,
            "confidence": confidences[i],
            "box": clip_box(x, y, x + bw, y + bh, w, h),
        })

    return final_detections


def quantized_slot_key(x1, y1, x2, y2, slot_size):
    """Return the ``(column, row)`` grid cell containing the box centre.

    ``slot_size`` is the cell edge in pixels and must be non-zero.
    """
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2
    return (cx // slot_size, cy // slot_size)


def draw_detection(frame, det, color=(255, 255, 255), text_extra=""):
    """Draw the box and a ``"label conf"`` caption of ``det`` onto ``frame``."""
    x1, y1, x2, y2 = det["box"]
    label = det["label"]
    conf = det["confidence"]

    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

    text = f"{label} {conf:.2f}"
    if text_extra:
        text += f" | {text_extra}"

    # Keep the caption inside the image when the box touches the top edge.
    y_text = max(20, y1 - 8)
    cv2.putText(
        frame,
        text,
        (x1, y_text),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        color,
        2,
        cv2.LINE_AA
    )


def _scale_detections(detections, scale_x, scale_y, frame_w, frame_h):
    """Map detection boxes from the detection image back to the full frame."""
    scaled = []
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        box = clip_box(
            int(x1 * scale_x),
            int(y1 * scale_y),
            int(x2 * scale_x),
            int(y2 * scale_y),
            frame_w,
            frame_h,
        )
        scaled.append({
            "class_id": det["class_id"],
            "label": det["label"],
            "confidence": det["confidence"],
            "box": box,
        })
    return scaled


def _ocr_label(frame, det, frame_idx, args, slot_memory, printed_texts):
    """OCR one label detection and update the per-slot / per-text memories.

    ``frame`` must not have any overlay drawn on it yet. Detections that are
    too small, or whose grid slot was OCR'd fewer than ``args.slot_cooldown``
    frames ago, are skipped.
    """
    x1, y1, x2, y2 = det["box"]
    if x2 - x1 < args.min_label_width or y2 - y1 < args.min_label_height:
        return

    slot_key = quantized_slot_key(x1, y1, x2, y2, args.slot_size)
    slot_info = slot_memory.get(slot_key)
    if (
        slot_info is not None
        and frame_idx - slot_info["last_ocr_frame"] < args.slot_cooldown
    ):
        return

    frame_h, frame_w = frame.shape[:2]
    rx1, ry1, rx2, ry2 = expand_box(
        x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.10
    )
    roi = frame[ry1:ry2, rx1:rx2]
    if roi.size == 0:
        return

    digits, processed, raw_text = ocr_digits_only(roi)

    if args.print_all:
        print(
            f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'"
        )

    if len(digits) >= MIN_DIGITS:
        slot_memory[slot_key] = {
            "text": digits,
            "last_ocr_frame": frame_idx,
            "box": (x1, y1, x2, y2),
        }

        # Avoid printing the same number over and over on consecutive reads.
        last_print_frame = printed_texts.get(digits, -999999)
        if frame_idx - last_print_frame > PRINT_REPEAT_FRAMES:
            print(f"[frame {frame_idx}] Etichetta letta: {digits}")
            printed_texts[digits] = frame_idx

        if args.show_roi:
            cv2.imshow("ROI OCR", processed)
    else:
        # Failed read: keep the previously stored text (if any) but restart
        # the cooldown so the slot is not retried on every OCR frame.
        slot_memory[slot_key] = {
            "text": slot_info["text"] if slot_info else "",
            "last_ocr_frame": frame_idx,
            "box": (x1, y1, x2, y2),
        }


def main():
    """Entry point: detect objects frame by frame, OCR label boxes, display."""
    args = parse_args()

    if args.tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd

    require_file(args.weights, "File pesi YOLO")
    require_file(args.config, "File cfg YOLO")
    require_file(args.labels, "File labels YOLO")

    classes = load_classes(args.labels)
    print(classes)

    ocr_class_lower = args.ocr_class.strip().lower()
    # A negative value would otherwise slice as "all but the last N".
    max_ocr_boxes = max(0, args.max_ocr_boxes)

    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cap = open_capture(args.video)
    if not cap.isOpened():
        print("Errore: impossibile aprire la sorgente video")
        sys.exit(1)

    win_name = "YOLO + OCR etichette"

    # try/finally so the capture device and the windows are released even if
    # OCR or detection raises, or the user interrupts with Ctrl+C.
    try:
        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)

        frame_idx = 0

        # OCR memory keyed by the grid slot of the label position.
        slot_memory = {}

        # Last frame at which each digit string was printed.
        printed_texts = {}

        while True:
            grabbed, frame = cap.read()
            if not grabbed or frame is None:
                print("Fine stream o impossibile leggere il frame")
                break

            frame_idx += 1
            orig_h, orig_w = frame.shape[:2]

            det_frame, scale_x, scale_y = resize_for_detection(
                frame, args.detect_width
            )

            detections = detect_yolo(
                net,
                det_frame,
                classes,
                args.min_confidence,
                args.nms_threshold
            )

            # Scale the boxes back onto the original frame.
            scaled_detections = _scale_detections(
                detections, scale_x, scale_y, orig_w, orig_h
            )

            # Only the most confident label boxes are candidates for OCR.
            label_detections = sorted(
                (
                    d for d in scaled_detections
                    if d["label"].strip().lower() == ocr_class_lower
                ),
                key=lambda d: d["confidence"],
                reverse=True,
            )[:max_ocr_boxes]

            # OCR only every N frames, with a per-slot cooldown. This runs
            # BEFORE anything is drawn: the padded ROI overlaps the box
            # outline and caption, and the caption contains digits (the
            # confidence) that would contaminate a digits-only read.
            if frame_idx % args.ocr_every == 0:
                for det in label_detections:
                    _ocr_label(
                        frame, det, frame_idx, args,
                        slot_memory, printed_texts
                    )

            # Draw every detection.
            for det in scaled_detections:
                label_lower = det["label"].strip().lower()
                if label_lower == ocr_class_lower:
                    color = (0, 255, 255)
                elif label_lower == HIGHLIGHT_CLASS:
                    color = (0, 255, 0)
                else:
                    color = (255, 255, 255)
                draw_detection(frame, det, color=color)

            # Draw the remembered OCR text next to each label box.
            for det in label_detections:
                x1, y1, x2, y2 = det["box"]
                slot_key = quantized_slot_key(
                    x1, y1, x2, y2, args.slot_size
                )
                slot_info = slot_memory.get(slot_key)
                if slot_info and slot_info.get("text"):
                    cv2.putText(
                        frame,
                        f"NUM: {slot_info['text']}",
                        (x1, min(orig_h - 5, y2 + 22)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7,
                        (0, 255, 255),
                        2,
                        cv2.LINE_AA
                    )

            cv2.imshow(win_name, frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()