"""Detect labels in a video stream with YOLOv2 and read their digits via OCR.

The network is run through OpenCV's DNN module every few frames; the best
label boxes are cropped, binarised and passed to Tesseract (digits only).
The latest detections and readings are drawn on every displayed frame.
"""

import argparse
import re
import sys
from pathlib import Path

import cv2
import numpy as np
import pytesseract

# Defaults shared between the CLI and the helper functions so the helpers
# can be called without going through main().
DEFAULT_INPUT_SIZE = 320
DEFAULT_LABEL_CLASS = "etichetta"

# Minimum number of frames between two console prints of the same reading.
PRINT_COOLDOWN_FRAMES = 30

# Minimum overlap for a detection to inherit the text read on an older box.
MIN_TEXT_MATCH_IOU = 0.5

# Anything that is not an ASCII digit is dropped from the OCR output.
_NON_DIGITS = re.compile(r"[^0-9]+")


def _positive_int(value):
    """argparse type: parse *value* as an integer greater than or equal to 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"valore intero non valido: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"deve essere >= 1, ricevuto {number}")
    return number


def parse_args():
    """Parse the command-line options and return the argparse namespace.

    ``--input-size``, ``--detect-every`` and ``--ocr-every-detect`` must be
    >= 1: the last two are used as modulo divisors in the main loop.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-v", "--video", default=None,
                    help="Percorso video. Se omesso usa webcam 0")
    ap.add_argument("--weights", default="yolov2.weights",
                    help="File pesi YOLOv2")
    ap.add_argument("--config", default="yolov2.cfg",
                    help="File config YOLOv2")
    ap.add_argument("--labels", default="labels.txt",
                    help="File labels classi")
    ap.add_argument("--tesseract-cmd", default=None,
                    help="Percorso esplicito a tesseract.exe")

    # Performance
    ap.add_argument("--input-size", type=_positive_int,
                    default=DEFAULT_INPUT_SIZE,
                    help="Dimensione input YOLO (320 piu' veloce di 416)")
    ap.add_argument("--detect-every", type=_positive_int, default=4,
                    help="Esegue YOLO ogni N frame")
    ap.add_argument("--ocr-every-detect", type=_positive_int, default=2,
                    help="Esegue OCR ogni N cicli di detection")
    ap.add_argument("--preview-width", type=int, default=1280,
                    help="Larghezza massima finestra preview")
    ap.add_argument("--max-ocr-boxes", type=int, default=1,
                    help="Numero massimo di etichette da leggere per detection")

    # Quality
    ap.add_argument("--min-confidence", type=float, default=0.30,
                    help="Soglia minima confidenza")
    ap.add_argument("--label-class", default=DEFAULT_LABEL_CLASS,
                    help="Nome classe etichetta")
    ap.add_argument("--min-label-width", type=int, default=50,
                    help="Larghezza minima bbox etichetta")
    ap.add_argument("--min-label-height", type=int, default=20,
                    help="Altezza minima bbox etichetta")
    ap.add_argument("--ocr-min-digits", type=int, default=2,
                    help="Numero minimo di cifre per considerare valida una lettura")
    ap.add_argument("--show-roi", action="store_true",
                    help="Mostra ROI preprocessata per OCR")
    ap.add_argument("--print-all", action="store_true",
                    help="Stampa anche OCR grezzi non validi")
    return ap.parse_args()


def require_file(path_str, description):
    """Return ``Path(path_str)`` or exit with status 1 if it is not a file.

    A directory with the requested name is rejected as well, since every
    caller goes on to open the path as a regular file.
    """
    path = Path(path_str)
    if not path.is_file():
        print(f"Errore: {description} non trovato: {path}")
        sys.exit(1)
    return path


def load_classes(labels_path):
    """Read one class name per non-blank line; exit with status 1 if none."""
    with open(labels_path, "rt", encoding="utf-8") as f:
        classes = [line.strip() for line in f if line.strip()]
    if not classes:
        print("Errore: labels.txt vuoto")
        sys.exit(1)
    return classes


def _open_camera(index):
    """Open camera *index*, trying the DirectShow backend first."""
    cap = cv2.VideoCapture(index, cv2.CAP_DSHOW)
    if not cap.isOpened():
        # DirectShow is not available everywhere: retry with the default backend.
        cap = cv2.VideoCapture(index)
    return cap


def open_capture(video_arg):
    """Open the video source described by *video_arg*.

    ``None`` selects camera 0, a string of decimal digits selects that camera
    index, and anything else is handed to ``cv2.VideoCapture`` unchanged.
    The returned capture may be unopened; callers must check ``isOpened()``.
    """
    if video_arg is None:
        return _open_camera(0)
    # isdecimal() only accepts characters that int() can actually parse.
    if str(video_arg).isdecimal():
        return _open_camera(int(video_arg))
    return cv2.VideoCapture(video_arg)


def resize_preview(frame, max_width):
    """Downscale *frame* to at most *max_width* pixels wide, keeping aspect.

    The frame is returned untouched when *max_width* is not positive or the
    frame is already narrow enough.
    """
    h, w = frame.shape[:2]
    if max_width <= 0 or w <= max_width:
        return frame
    scale = max_width / float(w)
    # Clamp to 1 so extreme aspect ratios never produce a zero-sized image.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)


def clip_box(x1, y1, x2, y2, w, h):
    """Clamp the box corners into ``[0, w - 1]`` x ``[0, h - 1]``."""
    x1 = max(0, min(x1, w - 1))
    y1 = max(0, min(y1, h - 1))
    x2 = max(0, min(x2, w - 1))
    y2 = max(0, min(y2, h - 1))
    return x1, y1, x2, y2


def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.08):
    """Grow the box by *pad_ratio* of its size per side, clipped to the frame."""
    pad_x = int((x2 - x1) * pad_ratio)
    pad_y = int((y2 - y1) * pad_ratio)
    return clip_box(x1 - pad_x, y1 - pad_y, x2 + pad_x, y2 + pad_y,
                    frame_w, frame_h)


def preprocess_for_ocr(roi):
    """Turn a BGR crop into an upscaled, Otsu-binarised image with a border."""
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    # Moderate upscale: a trade-off between OCR input size and cost.
    gray = cv2.resize(gray, None, fx=1.7, fy=1.7, interpolation=cv2.INTER_CUBIC)
    # Light denoising before thresholding.
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    # Binarise with an automatically chosen (Otsu) threshold.
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # White border around the binarised crop.
    gray = cv2.copyMakeBorder(gray, 8, 8, 8, 8,
                              borderType=cv2.BORDER_CONSTANT, value=255)
    return gray


def ocr_digits_only(roi):
    """Run single-line digit OCR on *roi*.

    Returns:
        ``(digits, processed, raw_text)``: the ASCII digits found, the
        preprocessed image given to Tesseract, and Tesseract's raw output.
    """
    processed = preprocess_for_ocr(roi)
    config = r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789'
    raw_text = pytesseract.image_to_string(processed, config=config)
    # Strip everything that is not 0-9, whatever Tesseract returned.
    digits = _NON_DIGITS.sub("", raw_text)
    return digits, processed, raw_text


def detect_yolov2_original_style(net, frame, classes, min_confidence,
                                 input_size=DEFAULT_INPUT_SIZE):
    """Run the network on *frame* and return the detections above threshold.

    Each output row is read as ``[cx, cy, w, h, <skipped>, score_0, ...]``
    with the box expressed as fractions of the frame size. The value at
    index 4 is not used: the confidence is the highest per-class score.

    Args:
        net: network object providing ``setInput`` and ``forward``.
        frame: BGR image; its shape gives the pixel scale of the boxes.
        classes: class names indexed by class id.
        min_confidence: rows whose best class score is not strictly greater
            than this are discarded.
        input_size: side of the square network input, in pixels.

    Returns:
        A list of dicts with keys ``class_id``, ``label``, ``confidence`` and
        ``box`` (``(x1, y1, x2, y2)`` clipped to the frame).
    """
    h, w = frame.shape[:2]

    blob = cv2.dnn.blobFromImage(
        frame,
        scalefactor=1.0 / 255.0,
        size=(input_size, input_size),
        mean=(0, 0, 0),
        swapRB=False,
        crop=False,
    )
    net.setInput(blob)
    predictions = np.array(net.forward())
    if predictions.size == 0:
        return []

    # Normalise the output to a 2-D (rows, values) array.
    if predictions.ndim == 4:
        # Keep the last axis as the per-row values and flatten the rest.
        predictions = predictions.reshape(-1, predictions.shape[-1])
    elif predictions.ndim == 3:
        predictions = predictions[0]

    # Rows need at least one class score after the five leading values.
    if predictions.ndim != 2 or predictions.shape[1] <= 5:
        return []

    scores = predictions[:, 5:]
    class_ids = scores.argmax(axis=1)
    best_scores = scores[np.arange(scores.shape[0]), class_ids]
    # Compare in float64 so the threshold is not rounded to the array dtype.
    keep = np.flatnonzero(best_scores.astype(np.float64) > min_confidence)

    detections = []
    for row_idx in keep:
        row = predictions[row_idx]
        class_index = int(class_ids[row_idx])

        x_center = float(row[0]) * w
        y_center = float(row[1]) * h
        width_box = float(row[2]) * w
        height_box = float(row[3]) * h

        x1 = int(x_center - width_box * 0.5)
        y1 = int(y_center - height_box * 0.5)
        x2 = int(x_center + width_box * 0.5)
        y2 = int(y_center + height_box * 0.5)

        # The labels file may list fewer classes than the network emits.
        if class_index < len(classes):
            label = classes[class_index]
        else:
            label = f"class_{class_index}"

        detections.append({
            "class_id": class_index,
            "label": label,
            "confidence": float(best_scores[row_idx]),
            "box": clip_box(x1, y1, x2, y2, w, h),
        })

    return detections


def draw_detection(frame, det, extra_text="", label_class=DEFAULT_LABEL_CLASS):
    """Draw the box and caption of *det* on *frame* (modified in place).

    Boxes of *label_class* are yellow, "gaylord" boxes green, others white.
    *extra_text*, when given, is appended to the caption after a separator.
    """
    x1, y1, x2, y2 = det["box"]
    label = det["label"]

    lowered = label.lower()
    if lowered == label_class.lower():
        color = (0, 255, 255)
    elif lowered == "gaylord":
        color = (0, 255, 0)
    else:
        color = (255, 255, 255)

    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

    text = f"{label} {det['confidence']:.2f}"
    if extra_text:
        text += f" | {extra_text}"

    # Keep the caption inside the image when the box touches the top edge.
    y_text = max(20, y1 - 8)
    cv2.putText(frame, text, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX,
                0.6, color, 2, cv2.LINE_AA)


def best_label_detections(detections, label_name, max_boxes):
    """Return up to *max_boxes* detections of class *label_name*.

    Matching ignores case and surrounding whitespace of the detection label.
    Results are ordered by confidence, then by box area, both descending.
    """
    wanted = label_name.lower()
    labels = [d for d in detections if d["label"].strip().lower() == wanted]

    def rank(det):
        x1, y1, x2, y2 = det["box"]
        return det["confidence"], (x2 - x1) * (y2 - y1)

    labels.sort(key=rank, reverse=True)
    return labels[:max_boxes]


def _box_iou(box_a, box_b):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes."""
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b
    inter_w = min(ax2, bx2) - max(ax1, bx1)
    inter_h = min(ay2, by2) - max(ay1, by1)
    if inter_w <= 0 or inter_h <= 0:
        return 0.0
    inter = inter_w * inter_h
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / float(union) if union > 0 else 0.0


def _text_for_box(box, text_by_box):
    """Return the text stored for *box* or for the best-overlapping box.

    Detection runs more often than OCR, so the box of the same label usually
    shifts slightly between a reading and later detections; an exact key
    lookup alone would drop the reading. Returns "" when nothing matches.
    """
    if box in text_by_box:
        return text_by_box[box]
    best_text = ""
    best_iou = MIN_TEXT_MATCH_IOU
    for known_box, text in text_by_box.items():
        iou = _box_iou(box, known_box)
        if iou >= best_iou:
            best_text, best_iou = text, iou
    return best_text


def _read_labels(frame, frame_idx, label_dets, args, printed_texts):
    """OCR each detection in *label_dets* and return ``{box: digits}``.

    Boxes smaller than the configured minimum are skipped. A valid reading is
    printed only if the same digits were not printed within the last
    ``PRINT_COOLDOWN_FRAMES`` frames; *printed_texts* is updated in place.
    """
    frame_h, frame_w = frame.shape[:2]
    text_by_box = {}

    for det in label_dets:
        x1, y1, x2, y2 = det["box"]
        if (x2 - x1) < args.min_label_width or (y2 - y1) < args.min_label_height:
            continue

        rx1, ry1, rx2, ry2 = expand_box(x1, y1, x2, y2, frame_w, frame_h)
        roi = frame[ry1:ry2, rx1:rx2]
        if roi.size == 0:
            continue

        digits, processed, raw_text = ocr_digits_only(roi)

        if args.print_all:
            print(f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'")

        if len(digits) >= args.ocr_min_digits:
            text_by_box[det["box"]] = digits
            last_print_frame = printed_texts.get(digits)
            if (last_print_frame is None
                    or frame_idx - last_print_frame > PRINT_COOLDOWN_FRAMES):
                print(f"[frame {frame_idx}] Etichetta letta: {digits}")
                printed_texts[digits] = frame_idx

        if args.show_roi:
            cv2.imshow("ROI OCR", processed)

    return text_by_box


def main():
    """Run detection + OCR on the selected source until it ends or 'q' is hit."""
    args = parse_args()

    if args.tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd

    require_file(args.weights, "File pesi YOLOv2")
    require_file(args.config, "File config YOLOv2")
    require_file(args.labels, "File labels")

    classes = load_classes(args.labels)
    print(classes)

    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cap = open_capture(args.video)
    if not cap.isOpened():
        print("Errore: impossibile aprire la sorgente video")
        sys.exit(1)

    win_name = "YOLOv2 + OCR etichette (versione alleggerita)"

    frame_idx = 0
    detect_cycle = 0
    last_detections = []
    last_text_by_box = {}
    printed_texts = {}

    try:
        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)

        while True:
            grabbed, frame = cap.read()
            if not grabbed or frame is None:
                print("Fine stream o impossibile leggere il frame")
                break

            frame_idx += 1

            # Detection is NOT run on every frame.
            if frame_idx == 1 or frame_idx % args.detect_every == 0:
                detect_cycle += 1
                last_detections = detect_yolov2_original_style(
                    net=net,
                    frame=frame,
                    classes=classes,
                    min_confidence=args.min_confidence,
                    input_size=args.input_size,
                )

                # OCR only every N detection cycles.
                if detect_cycle % args.ocr_every_detect == 0:
                    label_dets = best_label_detections(
                        last_detections, args.label_class, args.max_ocr_boxes
                    )
                    new_text_by_box = _read_labels(
                        frame, frame_idx, label_dets, args, printed_texts
                    )
                    # Keep the previous readings when this pass found nothing.
                    if new_text_by_box:
                        last_text_by_box = new_text_by_box

            # Draw on EVERY frame, using the latest available detections.
            display = frame.copy()
            for det in last_detections:
                reading = _text_for_box(det["box"], last_text_by_box)
                extra = f"NUM: {reading}" if reading else ""
                draw_detection(display, det, extra_text=extra,
                               label_class=args.label_class)

            display = resize_preview(display, args.preview_width)
            cv2.imshow(win_name, display)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        # Release the source and close windows even if the loop raised.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()