# Source listing metadata: 540 lines, 13 KiB, Python.
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import pytesseract
|
|
|
|
|
|
def parse_args(argv=None):
    """Build the command-line interface and parse the arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace holding every option defined below.

    Exits:
        With status 2 (through ``ArgumentParser.error``) when ``--ocr-every``
        or ``--slot-size`` is smaller than 1.
    """
    ap = argparse.ArgumentParser()

    # Input source and model files.
    ap.add_argument(
        "-v", "--video",
        default=None,
        help="Percorso video. Se omesso usa la webcam 0",
    )
    ap.add_argument(
        "--weights",
        default="yolov2.weights",
        help="File pesi YOLO",
    )
    ap.add_argument(
        "--config",
        default="yolov2.cfg",
        help="File cfg YOLO",
    )
    ap.add_argument(
        "--labels",
        default="labels.txt",
        help="File classi YOLO",
    )

    # Detection thresholds.
    ap.add_argument(
        "--min-confidence",
        type=float,
        default=0.35,
        help="Soglia minima confidenza YOLO",
    )
    ap.add_argument(
        "--nms-threshold",
        type=float,
        default=0.40,
        help="Soglia NMS",
    )

    # OCR scheduling and de-duplication.
    ap.add_argument(
        "--ocr-class",
        default="etichetta",
        help="Classe YOLO su cui fare OCR",
    )
    ap.add_argument(
        "--ocr-every",
        type=int,
        default=5,
        help="Fai OCR ogni N frame",
    )
    ap.add_argument(
        "--slot-cooldown",
        type=int,
        default=15,
        help="Numero minimo di frame prima di rifare OCR sulla stessa zona",
    )
    ap.add_argument(
        "--slot-size",
        type=int,
        default=120,
        help="Dimensione griglia per deduplicare le etichette",
    )
    ap.add_argument(
        "--max-ocr-boxes",
        type=int,
        default=2,
        help="Numero massimo di ROI etichetta su cui fare OCR per frame",
    )
    ap.add_argument(
        "--min-label-width",
        type=int,
        default=60,
        help="Larghezza minima bbox etichetta",
    )
    ap.add_argument(
        "--min-label-height",
        type=int,
        default=25,
        help="Altezza minima bbox etichetta",
    )
    ap.add_argument(
        "--detect-width",
        type=int,
        default=960,
        help="Larghezza massima del frame usato per detection YOLO",
    )

    # Debugging / environment.
    ap.add_argument(
        "--show-roi",
        action="store_true",
        help="Mostra anche la ROI preprocessata per OCR",
    )
    ap.add_argument(
        "--tesseract-cmd",
        default=None,
        help="Percorso esplicito a tesseract.exe, se necessario",
    )
    ap.add_argument(
        "--print-all",
        action="store_true",
        help="Stampa anche OCR vuoti o corti",
    )

    args = ap.parse_args(argv)

    # Both values end up as divisors (frame_idx % ocr_every and
    # centre // slot_size), so reject 0 and negatives here instead of
    # crashing with ZeroDivisionError in the middle of the video loop.
    for flag, value in (
        ("--ocr-every", args.ocr_every),
        ("--slot-size", args.slot_size),
    ):
        if value < 1:
            ap.error(f"{flag} deve essere >= 1 (ricevuto {value})")

    return args
|
|
|
|
|
|
def require_file(path_str, description):
    """Return ``path_str`` as a ``Path``, or exit if it is not a regular file.

    Args:
        path_str: Path of the required file (string or path-like).
        description: Human-readable name of the file, used in the error
            message.

    Returns:
        pathlib.Path built from ``path_str``.

    Exits:
        With status 1, after printing an error, when the path is missing or
        is not a regular file.
    """
    path = Path(path_str)

    # is_file() instead of exists(): a directory with that name would pass an
    # existence check and only fail later, when it is opened as a file.
    if not path.is_file():
        print(f"Errore: {description} non trovato: {path}")
        sys.exit(1)

    return path
|
|
|
|
|
|
def load_classes(labels_path):
    """Read the class names, one per line, from ``labels_path``.

    Blank lines are skipped and surrounding whitespace is stripped.

    Args:
        labels_path: Path of the text file with the class names.

    Returns:
        List of class-name strings, in file order.

    Exits:
        With status 1, after printing an error, when the file contains no
        class names.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but additionally drops a
    # leading byte-order mark. With plain "utf-8" the BOM stays glued to the
    # first class name (str.strip() does not remove it), so that class would
    # never compare equal to the name given on the command line.
    with open(labels_path, "rt", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        classes = [name for name in stripped_lines if name]

    if not classes:
        print("Errore: labels.txt vuoto")
        sys.exit(1)

    return classes
|
|
|
|
|
|
def open_capture(video_arg):
    """Open the video source described by ``video_arg``.

    ``None`` selects camera 0 and an all-digit value selects the camera with
    that index; anything else is handed to ``cv2.VideoCapture`` unchanged.
    Cameras are first opened with the ``cv2.CAP_DSHOW`` backend and, if that
    capture does not report itself as opened, re-opened with the default
    backend.

    Returns:
        The ``cv2.VideoCapture`` object; the caller must check ``isOpened()``.
    """
    if video_arg is None:
        camera_index = 0
    elif str(video_arg).isdigit():
        camera_index = int(video_arg)
    else:
        # Not a camera index: treat it as a file path / stream address.
        return cv2.VideoCapture(video_arg)

    capture = cv2.VideoCapture(camera_index, cv2.CAP_DSHOW)
    if capture.isOpened():
        return capture

    # Preferred backend unavailable: let OpenCV pick one.
    return cv2.VideoCapture(camera_index)
|
|
|
|
|
|
def resize_for_detection(frame, max_width):
    """Downscale ``frame`` so that its width does not exceed ``max_width``.

    Args:
        frame: Image array whose first two dimensions are (height, width).
        max_width: Maximum width of the returned image. A value <= 0
            disables resizing.

    Returns:
        Tuple ``(image, scale_x, scale_y)``. ``scale_x`` / ``scale_y`` are the
        factors that map coordinates of the returned image back to the
        original frame. When no resize is needed the original ``frame``
        object is returned with both factors equal to 1.0.
    """
    h, w = frame.shape[:2]

    if max_width <= 0 or w <= max_width:
        return frame, 1.0, 1.0

    scale = max_width / float(w)

    # Clamp to one pixel: for a very wide and short frame int(h * scale) can
    # truncate to 0, which is not a valid resize target and would also make
    # the scale_y division below fail.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Computed from the actual (truncated) size rather than 1 / scale, so
    # boxes map back to the original frame without drift.
    scale_x = w / float(new_w)
    scale_y = h / float(new_h)

    return resized, scale_x, scale_y
|
|
|
|
|
|
def clip_box(x1, y1, x2, y2, w, h):
    """Clamp the corners of a box to the pixel range of a ``w`` x ``h`` frame.

    Each x coordinate is limited to ``[0, w - 1]`` and each y coordinate to
    ``[0, h - 1]``.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` with the clamped coordinates.
    """
    last_col = w - 1
    last_row = h - 1

    def _clamp(value, upper):
        # Upper bound first, then the floor at 0 (so the result is never
        # negative, even when ``upper`` is).
        return max(0, min(value, upper))

    return (
        _clamp(x1, last_col),
        _clamp(y1, last_row),
        _clamp(x2, last_col),
        _clamp(y2, last_row),
    )
|
|
|
|
|
|
def expand_box(x1, y1, x2, y2, frame_w, frame_h, pad_ratio=0.10):
    """Grow a box by ``pad_ratio`` of its size on every side, then clip it.

    The horizontal margin is ``pad_ratio`` times the box width and the
    vertical margin ``pad_ratio`` times the box height, both truncated to
    integers. The enlarged box is passed through ``clip_box`` so it stays
    inside a ``frame_w`` x ``frame_h`` frame.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` of the padded, clipped box.
    """
    margin_x = int((x2 - x1) * pad_ratio)
    margin_y = int((y2 - y1) * pad_ratio)

    return clip_box(
        x1 - margin_x,
        y1 - margin_y,
        x2 + margin_x,
        y2 + margin_y,
        frame_w,
        frame_h,
    )
|
|
|
|
|
|
def preprocess_for_ocr(roi):
    """Turn a colour crop into a binarised, padded image for OCR.

    Steps, in order: BGR -> grayscale conversion, 2x bicubic upscaling,
    3x3 Gaussian blur, Otsu binarisation and a 10-pixel constant border of
    value 255 on every side.

    Args:
        roi: Image crop accepted by ``cv2.COLOR_BGR2GRAY`` conversion.

    Returns:
        The processed single-channel image.
    """
    grayscale = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    upscaled = cv2.resize(
        grayscale,
        None,
        fx=2.0,
        fy=2.0,
        interpolation=cv2.INTER_CUBIC,
    )

    smoothed = cv2.GaussianBlur(upscaled, (3, 3), 0)

    # With THRESH_OTSU the threshold argument (0) is ignored and chosen
    # automatically; only the thresholded image is needed here.
    _, binary = cv2.threshold(
        smoothed,
        0,
        255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU,
    )

    border = 10
    return cv2.copyMakeBorder(
        binary,
        border, border, border, border,
        borderType=cv2.BORDER_CONSTANT,
        value=255,
    )
|
|
|
|
|
|
def ocr_digits_only(roi):
    """Run Tesseract on ``roi`` and keep only the digits it recognised.

    The crop is first passed through ``preprocess_for_ocr``. Tesseract is
    called with a digits-only character whitelist.

    Returns:
        Tuple ``(digits, processed, raw_text)``: the recognised text with
        every non-digit character removed, the preprocessed image that was
        given to Tesseract, and Tesseract's unfiltered output.
    """
    ocr_input = preprocess_for_ocr(roi)

    # --psm 7: page segmentation mode for a single text line;
    # --oem 3: default OCR engine mode.
    tesseract_options = r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789'
    recognised = pytesseract.image_to_string(ocr_input, config=tesseract_options)

    # Whitespace/newlines (and any stray symbol) are dropped here.
    only_digits = re.sub(r"\D+", "", recognised)

    return only_digits, ocr_input, recognised
|
|
|
|
|
|
def detect_yolo(net, frame, classes, min_confidence, nms_threshold):
    """Run the YOLO network on ``frame`` and return the filtered detections.

    Args:
        net: Network object exposing ``setInput``, ``forward`` and
            ``getUnconnectedOutLayersNames`` (an OpenCV DNN network).
        frame: Image to analyse; its first two dimensions are (height, width).
            Presumably BGR, since channels are swapped for the blob.
        classes: Sequence of class names indexed by class id.
        min_confidence: Detections whose best class score is lower are
            dropped; the same value is used as the NMS score threshold.
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        List of dicts with keys ``"class_id"``, ``"label"``, ``"confidence"``
        and ``"box"`` (``(x1, y1, x2, y2)`` clipped to the frame). The list is
        empty when nothing survives the filters. When a class id has no entry
        in ``classes`` the label falls back to ``"class_<id>"``.
    """
    h, w = frame.shape[:2]

    blob = cv2.dnn.blobFromImage(
        frame,
        scalefactor=1.0 / 255.0,
        size=(416, 416),
        swapRB=True,
        crop=False,
    )
    net.setInput(blob)
    layer_outputs = net.forward(net.getUnconnectedOutLayersNames())

    boxes = []
    confidences = []
    class_ids = []

    for output in layer_outputs:
        for detection in output:
            # Columns 0-3 hold centre x, centre y, width and height relative
            # to the frame; class scores start at column 5 (column 4 is not
            # used here).
            scores = detection[5:]
            if scores.size == 0:
                continue

            class_id = int(np.argmax(scores))
            confidence = float(scores[class_id])
            if confidence < min_confidence:
                continue

            center_x = int(detection[0] * w)
            center_y = int(detection[1] * h)
            box_w = int(detection[2] * w)
            box_h = int(detection[3] * h)

            # NMSBoxes expects top-left corner plus size.
            x = int(center_x - box_w / 2)
            y = int(center_y - box_h / 2)

            boxes.append([x, y, box_w, box_h])
            confidences.append(confidence)
            class_ids.append(class_id)

    if not boxes:
        return []

    indices = cv2.dnn.NMSBoxes(
        boxes,
        confidences,
        min_confidence,
        nms_threshold,
    )
    if len(indices) == 0:
        return []

    final_detections = []

    # Flatten so that both nested and flat index layouts returned by
    # NMSBoxes are handled the same way.
    for i in np.array(indices).flatten():
        x, y, bw, bh = boxes[i]
        x1, y1, x2, y2 = clip_box(x, y, x + bw, y + bh, w, h)

        class_id = class_ids[i]
        # A labels file shorter than the network's class list must not crash
        # the whole video loop with an IndexError.
        if class_id < len(classes):
            label = classes[class_id]
        else:
            label = f"class_{class_id}"

        final_detections.append({
            "class_id": class_id,
            "label": label,
            "confidence": confidences[i],
            "box": (x1, y1, x2, y2),
        })

    return final_detections
|
|
|
|
|
|
def quantized_slot_key(x1, y1, x2, y2, slot_size):
    """Map a box to the grid cell that contains its centre.

    Args:
        x1, y1, x2, y2: Box corners.
        slot_size: Side of a grid cell in pixels. Values below 1 are treated
            as 1 so that a zero size cannot raise ZeroDivisionError.

    Returns:
        Tuple ``(column, row)`` of the grid cell; boxes whose centres fall in
        the same cell get the same key.
    """
    cell = max(1, slot_size)

    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2

    return (cx // cell, cy // cell)
|
|
|
|
|
|
def draw_detection(frame, det, color=(255, 255, 255), text_extra=""):
    """Draw one detection (box plus caption) onto ``frame`` in place.

    Args:
        frame: Image that is drawn on.
        det: Detection dict with keys ``"box"`` (``(x1, y1, x2, y2)``),
            ``"label"`` and ``"confidence"``.
        color: Colour tuple used for both the rectangle and the text.
        text_extra: Optional text appended to the caption after ``" | "``.
    """
    x1, y1, x2, y2 = det["box"]
    label = det["label"]
    conf = det["confidence"]

    text = f"{label} {conf:.2f}"
    if text_extra:
        text += f" | {text_extra}"

    top_left = (x1, y1)
    bottom_right = (x2, y2)
    cv2.rectangle(frame, top_left, bottom_right, color, 2)

    # Caption sits 8 px above the box, but never higher than y = 20 so it
    # stays readable for boxes touching the top edge.
    caption_origin = (x1, max(20, y1 - 8))

    cv2.putText(
        frame,
        text,
        caption_origin,
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        color,
        2,
        cv2.LINE_AA,
    )
|
|
|
|
|
|
def main():
    """Run YOLO detection and label OCR on a video source until it ends.

    For every frame the function detects objects, scales the boxes back to
    the original frame, runs OCR on the most confident boxes of the OCR
    class (every ``--ocr-every`` frames and at most once per grid slot within
    ``--slot-cooldown`` frames), draws the detections together with the
    remembered numbers, and shows the result. Pressing ``q`` in the window
    stops the loop.

    Exits with status 1 when a required file is missing or the video source
    cannot be opened.
    """
    args = parse_args()

    if args.tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = args.tesseract_cmd

    require_file(args.weights, "File pesi YOLO")
    require_file(args.config, "File cfg YOLO")
    require_file(args.labels, "File labels YOLO")

    classes = load_classes(args.labels)
    print(classes)

    ocr_class_lower = args.ocr_class.strip().lower()

    # These two are divisors further down; clamp them so that a value of 0
    # on the command line cannot raise ZeroDivisionError inside the loop.
    ocr_every = max(1, args.ocr_every)
    slot_size = max(1, args.slot_size)
    # A negative slice bound would silently drop boxes from the end instead
    # of limiting how many are processed.
    max_ocr_boxes = max(0, args.max_ocr_boxes)

    net = cv2.dnn.readNetFromDarknet(args.config, args.weights)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cap = open_capture(args.video)
    if not cap.isOpened():
        print("Errore: impossibile aprire la sorgente video")
        sys.exit(1)

    win_name = "YOLO + OCR etichette"
    frame_idx = 0

    # OCR memory per label position: slot key -> {"text", "last_ocr_frame", "box"}.
    slot_memory = {}

    # Last frame at which each number was printed, to avoid printing the
    # same number continuously.
    printed_texts = {}

    try:
        cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)

        while True:
            grabbed, frame = cap.read()
            if not grabbed or frame is None:
                print("Fine stream o impossibile leggere il frame")
                break

            frame_idx += 1
            orig_h, orig_w = frame.shape[:2]

            det_frame, scale_x, scale_y = resize_for_detection(
                frame,
                args.detect_width,
            )
            detections = detect_yolo(
                net,
                det_frame,
                classes,
                args.min_confidence,
                args.nms_threshold,
            )

            # Scale the boxes back onto the original frame.
            scaled_detections = []
            for det in detections:
                x1, y1, x2, y2 = det["box"]
                box = clip_box(
                    int(x1 * scale_x),
                    int(y1 * scale_y),
                    int(x2 * scale_x),
                    int(y2 * scale_y),
                    orig_w,
                    orig_h,
                )
                scaled_detections.append({
                    "class_id": det["class_id"],
                    "label": det["label"],
                    "confidence": det["confidence"],
                    "box": box,
                })

            # OCR candidates: boxes of the OCR class, most confident first.
            label_detections = sorted(
                (
                    d for d in scaled_detections
                    if d["label"].strip().lower() == ocr_class_lower
                ),
                key=lambda d: d["confidence"],
                reverse=True,
            )[:max_ocr_boxes]

            # OCR runs BEFORE anything is drawn: the padded crop would
            # otherwise contain the box outline and the "label 0.87" caption,
            # whose digits could be read as part of the label number.
            if frame_idx % ocr_every == 0:
                for det in label_detections:
                    x1, y1, x2, y2 = det["box"]

                    if (x2 - x1) < args.min_label_width or (y2 - y1) < args.min_label_height:
                        continue

                    slot_key = quantized_slot_key(x1, y1, x2, y2, slot_size)
                    slot_info = slot_memory.get(slot_key)

                    # Per-slot cooldown: skip zones read too recently.
                    if (
                        slot_info is not None
                        and frame_idx - slot_info["last_ocr_frame"] < args.slot_cooldown
                    ):
                        continue

                    rx1, ry1, rx2, ry2 = expand_box(
                        x1, y1, x2, y2,
                        orig_w, orig_h,
                        pad_ratio=0.10,
                    )
                    roi = frame[ry1:ry2, rx1:rx2]
                    if roi.size == 0:
                        continue

                    digits, processed, raw_text = ocr_digits_only(roi)

                    if args.print_all:
                        print(
                            f"[frame {frame_idx}] OCR grezzo='{raw_text.strip()}' -> digits='{digits}'"
                        )

                    if len(digits) >= 2:
                        remembered_text = digits

                        last_print_frame = printed_texts.get(digits, -999999)
                        if frame_idx - last_print_frame > 30:
                            print(f"[frame {frame_idx}] Etichetta letta: {digits}")
                            printed_texts[digits] = frame_idx

                        if args.show_roi:
                            cv2.imshow("ROI OCR", processed)
                    else:
                        # Unusable reading: keep the previous text (if any)
                        # but still restart the cooldown for this slot.
                        remembered_text = slot_info["text"] if slot_info else ""

                    slot_memory[slot_key] = {
                        "text": remembered_text,
                        "last_ocr_frame": frame_idx,
                        "box": (x1, y1, x2, y2),
                    }

            # Draw every detection.
            for det in scaled_detections:
                label_lower = det["label"].strip().lower()
                if label_lower == ocr_class_lower:
                    color = (0, 255, 255)
                elif label_lower == "gaylord":
                    color = (0, 255, 0)
                else:
                    color = (255, 255, 255)

                draw_detection(frame, det, color=color)

            # Draw the remembered OCR text below each label box.
            for det in label_detections:
                x1, y1, x2, y2 = det["box"]
                slot_info = slot_memory.get(
                    quantized_slot_key(x1, y1, x2, y2, slot_size)
                )

                if slot_info and slot_info.get("text"):
                    cv2.putText(
                        frame,
                        f"NUM: {slot_info['text']}",
                        (x1, min(orig_h - 5, y2 + 22)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7,
                        (0, 255, 255),
                        2,
                        cv2.LINE_AA,
                    )

            cv2.imshow(win_name, frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        # Release the camera/file and close the windows even when the loop
        # is left through an exception (e.g. Ctrl+C or an OCR failure).
        cap.release()
        cv2.destroyAllWindows()
|
|
|
|
|
|
# Start the pipeline only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|