Initial import
This commit is contained in:
266
ware_detect.py
Normal file
266
ware_detect.py
Normal file
@@ -0,0 +1,266 @@
|
||||
from imutils.object_detection import non_max_suppression
|
||||
import time
|
||||
import cv2
|
||||
import argparse
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
# import the necessary packages
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
import argparse
|
||||
import cv2
|
||||
import os
|
||||
|
||||
|
||||
|
||||
def imcrop(img, x1, y1, x2, y2):
    """Crop the box (x1, y1)-(x2, y2) out of ``img``.

    If the box extends past any edge of the image, the image is first
    padded via ``pad_img_to_fit_bbox`` so the returned crop always has the
    requested extent.

    Args:
        img: image array indexed as ``img[row, col, ...]``.
        x1, y1: column/row of the top-left corner (may be negative).
        x2, y2: column/row one past the bottom-right corner (may exceed
            the image size).

    Returns:
        The cropped array. When no padding is needed this is a view into
        ``img``, not a copy.
    """
    if x1 < 0 or y1 < 0 or x2 > img.shape[1] or y2 > img.shape[0]:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
    # Slice only the two spatial axes so that both colour (H, W, C) and
    # single-channel (H, W) arrays are accepted.
    return img[y1:y2, x1:x2]
|
||||
|
||||
def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    """Zero-pad ``img`` so the box (x1, y1)-(x2, y2) lies fully inside it.

    Args:
        img: array whose first two axes are rows and columns; any further
            axes (e.g. colour channels) are left unpadded.
        x1, x2: left and right (exclusive) column of the box.
        y1, y2: top and bottom (exclusive) row of the box.

    Returns:
        ``(img, x1, x2, y1, y2)``: the padded image and the box coordinates
        translated into the padded image's coordinate system.
    """
    # Work out every pad amount from the ORIGINAL coordinates first.  The
    # previous code shifted y1/x1 before using them to shift y2/x2, so the
    # far edge was never moved and the crop came out too small.
    pad_top = max(0, -y1)
    pad_left = max(0, -x1)
    pad_bottom = max(0, y2 - img.shape[0])
    pad_right = max(0, x2 - img.shape[1])

    pad_width = [(pad_top, pad_bottom), (pad_left, pad_right)]
    # Leave any trailing axes (channels) untouched.
    pad_width += [(0, 0)] * (img.ndim - 2)
    img = np.pad(img, pad_width, mode="constant")

    return img, x1 + pad_left, x2 + pad_left, y1 + pad_top, y2 + pad_top
|
||||
|
||||
|
||||
|
||||
# Cached EAST network.  Reading the frozen graph from disk is expensive, so
# it is loaded on first use and reused by every later grab_text() call.
_EAST_NET = None


def _get_east_net():
    """Return the EAST text-detection network, loading it on first use."""
    global _EAST_NET
    if _EAST_NET is None:
        _EAST_NET = cv2.dnn.readNet("./frozen_east_text_detection.pb")
    return _EAST_NET


def _decode_east_output(scores, geometry, min_score=0.5):
    """Turn raw EAST output volumes into axis-aligned boxes and scores.

    Args:
        scores: score volume, read as ``scores[0, 0, y, x]``.
        geometry: geometry volume, read as ``geometry[0, k, y, x]`` with
            ``k`` in 0..4 (four distances plus a rotation angle).
        min_score: predictions scoring below this are discarded.

    Returns:
        ``(rects, confidences)`` where ``rects`` is a list of
        ``(startX, startY, endX, endY)`` tuples in network-input pixels.
    """
    (num_rows, num_cols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(num_rows):
        row_scores = scores[0, 0, y]
        d0 = geometry[0, 0, y]
        d1 = geometry[0, 1, y]
        d2 = geometry[0, 2, y]
        d3 = geometry[0, 3, y]
        row_angles = geometry[0, 4, y]

        for x in range(num_cols):
            if row_scores[x] < min_score:
                continue

            # the feature maps are 4x smaller than the network input
            (offset_x, offset_y) = (x * 4.0, y * 4.0)

            angle = row_angles[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # box height/width are the sums of opposing distances
            box_h = d0[x] + d2[x]
            box_w = d1[x] + d3[x]

            end_x = int(offset_x + (cos * d1[x]) + (sin * d2[x]))
            end_y = int(offset_y - (sin * d1[x]) + (cos * d2[x]))
            start_x = int(end_x - box_w)
            start_y = int(end_y - box_h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(row_scores[x])

    return rects, confidences


def grab_text(roi):
    """Detect text regions in ``roi`` with EAST and OCR each with Tesseract.

    Each detected region is cropped from ``roi``, converted to grayscale,
    Otsu-thresholded, shown in the ``winName`` window and passed to
    ``pytesseract.image_to_string``.  The recognized text is printed.

    Args:
        roi: image array to search for text (converted with
            ``cv2.COLOR_BGR2GRAY``, so a 3-channel BGR image is assumed).

    Returns:
        List of the strings recognized, one per detected region (empty when
        ``roi`` is empty or no text is found).
    """
    # cv2.resize raises on an empty image, so bail out early
    if roi is None or roi.size == 0:
        return []

    (H, W) = roi.shape[:2]

    # the network runs on a fixed 320x320 input; keep the ratios so boxes
    # can be mapped back onto the original ROI
    (newW, newH) = (320, 320)
    rW = W / float(newW)
    rH = H / float(newH)
    image = cv2.resize(roi, (newW, newH))

    # the two EAST output layers read below: the first is used as the score
    # volume, the second as the box-geometry volume
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    net = _get_east_net()
    blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH),
                                 (123.68, 116.78, 103.94),
                                 swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    rects, confidences = _decode_east_output(scores, geometry)

    # suppress weak, overlapping boxes
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    texts = []
    for (startX, startY, endX, endY) in boxes:
        # scale the box back to the coordinates of the original ROI
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        print("ok rectangle")
        # Crop from the untouched ROI: drawing the box onto the image before
        # cropping would put its edges into the OCR input.
        inner_roi = imcrop(roi, startX, startY, endX, endY)
        if inner_roi.size == 0:
            # degenerate box after scaling; cvtColor would raise
            continue

        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(gray)
        cv2.imshow(winName, gray)
        print("TEXT: " + text)
        texts.append(text)

    return texts
|
||||
|
||||
|
||||
|
||||
|
||||
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the (optional) video file")
args = vars(ap.parse_args())

# Minimum confidence threshold. Raising it reduces false positives but also
# lowers the detection rate.
min_confidence = 0.14
model = 'yolov2.weights'
config = 'yolov2.cfg'

# Load names of classes (one label per line)
with open('labels.txt', 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')
print(classes)

# Load weights and construct graph
net = cv2.dnn.readNetFromDarknet(config, model)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

winName = 'Running YOLO warehouse'
cv2.namedWindow(winName, cv2.WINDOW_NORMAL)

# --video is documented as optional; without it fall back to the default
# camera instead of handing None to VideoCapture.
# NOTE(review): camera index 0 is an assumption - confirm for deployment.
camera = cv2.VideoCapture(args["video"] if args.get("video") else 0)

# running count of detections handed to grab_text()
j = 1
try:
    while True:
        # grab the current frame
        (grabbed, frame) = camera.read()

        # no frame means end of the video (or a camera failure); frame is
        # None in that case, so stop before touching it
        if not grabbed:
            break

        height, width = frame.shape[:2]

        # Create a 4D blob from the frame.  The 4th positional parameter of
        # blobFromImage is `mean`, so swapRB must be passed by keyword.
        blob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (416, 416),
                                     (0, 0, 0), swapRB=True, crop=False)
        net.setInput(blob)
        # Run the preprocessed input blob through the network
        predictions = net.forward()
        # class scores are read from this column onward in each row
        probability_index = 5

        for detection in predictions:
            prob_arr = detection[probability_index:]
            class_index = prob_arr.argmax(axis=0)
            confidence = prob_arr[class_index]
            if confidence <= min_confidence:
                continue

            # box centre/size are multiplied by the frame size to get pixels
            x_center = detection[0] * width
            y_center = detection[1] * height
            width_box = detection[2] * width
            height_box = detection[3] * height

            x1 = int(x_center - width_box * 0.5)
            y1 = int(y_center - height_box * 0.5)
            x2 = int(x_center + width_box * 0.5)
            y2 = int(y_center + height_box * 0.5)

            # Copy the crop: imcrop may return a view into `frame`, and the
            # annotations drawn below would otherwise end up in the image
            # that is passed to the OCR step.
            roi = imcrop(frame, x1, y1, x2, y2).copy()

            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 1)
            cv2.putText(frame,
                        classes[class_index] + " " + "{0:.1f}".format(confidence),
                        (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (255, 255, 255), 1, cv2.LINE_AA)

            if roi.size == 0:
                # degenerate box; imshow/resize would raise on it
                continue

            # show the cropped image that should contain the label
            cv2.imshow(winName, roi)
            print("frame analyzed: " + str(j))
            j += 1
            grab_text(roi)

        time.sleep(.1)
        key = cv2.waitKey(1) & 0xFF

        # if the 'q' key is pressed, stop the loop
        if key == ord("q"):
            break
finally:
    # cleanup the camera and close any open windows, even on error
    camera.release()
    cv2.destroyAllWindows()
|
||||
|
||||
|
||||
# USAGE
|
||||
# python ware_detect.py --video path/to/video.mp4
# (expects yolov2.cfg, yolov2.weights, labels.txt and
#  frozen_east_text_detection.pb in the working directory)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user