Files
flywms/ware_detect.py
administrator 747298ac7a Initial import
2026-05-14 09:36:44 +02:00

267 lines
8.3 KiB
Python

from imutils.object_detection import non_max_suppression
import time
import cv2
import argparse
import numpy as np
import sys
# import the necessary packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os
def imcrop(img, x1, y1, x2, y2):
    """Return the region of ``img`` spanning columns x1..x2 and rows y1..y2.

    If any edge of the box lies outside the image, the image is first handed
    to ``pad_img_to_fit_bbox`` and the image and coordinates it returns are
    used for the slice. ``img`` is indexed with three axes.
    """
    rows, cols = img.shape[:2]
    fits_inside = x1 >= 0 and y1 >= 0 and x2 <= cols and y2 <= rows
    if not fits_inside:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
    return img[y1:y2, x1:x2, :]
def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    """Zero-pad ``img`` so that the box (x1, y1)-(x2, y2) lies fully inside it.

    Args:
        img: 3-axis array (rows, columns, one trailing axis that is not padded).
        x1, x2: left and right column bounds of the box; may be out of range.
        y1, y2: top and bottom row bounds of the box; may be out of range.

    Returns:
        ``(padded_img, x1, x2, y1, y2)`` where the coordinates are expressed
        in the padded image, i.e. shifted by the padding added on the
        top/left side.
    """
    # Work out every pad amount up front. The shift applied to the far edge
    # (x2 / y2) must be the same one applied to the near edge (x1 / y1);
    # deriving it again after x1 / y1 were already moved would always give 0.
    pad_top = max(0, -y1)
    pad_left = max(0, -x1)
    pad_bottom = max(0, y2 - img.shape[0])
    pad_right = max(0, x2 - img.shape[1])

    img = np.pad(
        img,
        ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
        mode="constant",
    )
    return img, x1 + pad_left, x2 + pad_left, y1 + pad_top, y2 + pad_top
# EAST networks already read from disk, keyed by model path.
_EAST_NETS = {}


def _load_east_net(model_path="./frozen_east_text_detection.pb"):
    """Return the EAST text detector, reading the model file only once."""
    if model_path not in _EAST_NETS:
        _EAST_NETS[model_path] = cv2.dnn.readNet(model_path)
    return _EAST_NETS[model_path]


def _decode_east_boxes(scores, geometry, min_score=0.5):
    """Turn the two EAST output volumes into candidate boxes and scores.

    Returns ``(rects, confidences)`` where each rect is
    ``(startX, startY, endX, endY)`` in the coordinates of the resized
    network input.
    """
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    for y in range(numRows):
        # scores (probabilities) and the geometry used to derive the
        # bounding box around text, for this row of the output map
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
        for x in range(numCols):
            # ignore predictions without sufficient probability
            if scoresData[x] < min_score:
                continue
            # the resulting feature maps are 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # rotation angle of the prediction
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # width and height of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # starting and ending (x, y)-coordinates of the bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
    return rects, confidences


def grab_text(roi):
    """Detect text regions in ``roi`` with EAST and OCR each one.

    The image is resized to 320x320 for detection; the resulting boxes are
    scaled back to the size of ``roi``, cropped with ``imcrop``, converted to
    grayscale, binarised with Otsu thresholding and passed to Tesseract.
    Each crop is shown in the window named by the module-level ``winName``
    and each recognised string is printed.

    Args:
        roi: image array to search for text (passed to ``cv2.resize`` and
            ``imcrop``).

    Returns:
        List of the strings returned by Tesseract, one per processed box.
    """
    # Nothing to detect in a missing or zero-area image; cv2.resize would
    # raise on it.
    if roi is None or roi.size == 0:
        return []

    (H, W) = roi.shape[:2]
    # fixed network input size, and the ratios needed to map boxes back
    (newW, newH) = (320, 320)
    rW = W / float(newW)
    rH = H / float(newH)
    image = cv2.resize(roi, (newW, newH))

    # the two EAST output layers: the first holds the output probabilities,
    # the second is used to derive the bounding box coordinates of text
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]
    # Reuse the cached network: this function runs for every crop, and
    # re-reading the model file each time is pure overhead.
    net = _load_east_net()

    blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH),
                                 (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    rects, confidences = _decode_east_boxes(scores, geometry)
    # suppress weak, overlapping bounding boxes
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    texts = []
    for (startX, startY, endX, endY) in boxes:
        # scale the box back to the size of the input image
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)
        print("ok rectangle")
        # Crop from the untouched input so that no drawn box outline ends up
        # inside the pixels handed to the OCR engine.
        inner_roi = imcrop(roi, startX, startY, endX, endY)
        if inner_roi.size == 0:
            # degenerate box: nothing to show or recognise
            continue
        cv2.imshow(winName, inner_roi)
        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(gray)
        cv2.imshow(winName, gray)
        print ("TEXT: " + text)
        texts.append(text)
    return texts
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the (optional) video file")
args = vars(ap.parse_args())

# Minimum confidence threshold. Raising it reduces false positives but also
# lowers the detection rate.
min_confidence = 0.14
model = 'yolov2.weights'
config = 'yolov2.cfg'

# Load names of classes, one per line
classes = None
with open('labels.txt', 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')
print(classes)

# Load weights and construct graph
net = cv2.dnn.readNetFromDarknet(config, model)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

winName = 'Running YOLO warehouse'
cv2.namedWindow(winName, cv2.WINDOW_NORMAL)

# The video argument is optional: without it args["video"] is None, which
# VideoCapture cannot open, so fall back to the default camera (index 0).
camera = cv2.VideoCapture(args["video"] if args.get("video") else 0)

# counter used in the progress message below
j = 1
# index of the first class score within a prediction row
probability_index = 5
try:
    # keep looping
    while True:
        # grab the current frame
        (grabbed, frame) = camera.read()
        # A failed read returns no frame (end of the video file, or a camera
        # that stopped delivering); frame.shape would fail on it, so stop.
        if not grabbed or frame is None:
            break
        height, width = frame.shape[:2]

        # Create a 4D blob from a frame. mean and swapRB are spelled out:
        # the fourth positional parameter of blobFromImage is the mean, so a
        # bare positional True does not enable the channel swap.
        blob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (416, 416), (0, 0, 0),
                                     swapRB=True, crop=False)
        net.setInput(blob)
        # Run the preprocessed input blob through the network
        predictions = net.forward()

        for prediction in predictions:
            prob_arr = prediction[probability_index:]
            class_index = prob_arr.argmax(axis=0)
            confidence = prob_arr[class_index]
            if confidence <= min_confidence:
                continue

            # the first four values are scaled by the frame size to obtain
            # the box centre and extent in pixels
            x_center = prediction[0] * width
            y_center = prediction[1] * height
            width_box = prediction[2] * width
            height_box = prediction[3] * height
            x1 = int(x_center - width_box * 0.5)
            y1 = int(y_center - height_box * 0.5)
            x2 = int(x_center + width_box * 0.5)
            y2 = int(y_center + height_box * 0.5)

            # Copy the crop: a plain slice shares memory with the frame, so
            # the box and label drawn below would otherwise show up in the
            # image that is sent to OCR.
            roi = imcrop(frame, x1, y1, x2, y2).copy()
            if roi.size == 0:
                # degenerate box: nothing to show or recognise
                continue

            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 1)
            cv2.putText(frame, classes[class_index] + " " + "{0:.1f}".format(confidence), (x1, y1),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)

            # show the cropped image, which should contain the label
            cv2.imshow(winName, roi)
            print("frame analyzed: " + str(j))
            j += 1
            grab_text(roi)
            time.sleep(.1)

        key = cv2.waitKey(1) & 0xFF
        # if the 'q' key is pressed, stop the loop
        if key == ord("q"):
            break
finally:
    # cleanup the camera and close any open windows, even after an error
    camera.release()
    cv2.destroyAllWindows()
# USAGE
# python ware_detect.py --video path/to/video.mp4
# (expects yolov2.cfg, yolov2.weights, labels.txt and
#  frozen_east_text_detection.pb in the current working directory)