# File listing metadata: 267 lines, 8.3 KiB, Python
# import the necessary packages
import argparse
import functools
import os
import sys
import time

import cv2
import numpy as np
import pytesseract
from imutils.object_detection import non_max_suppression
from PIL import Image
|
|
|
|
|
|
|
|
def imcrop(img, x1, y1, x2, y2):
    """Return the region (x1, y1)-(x2, y2) of ``img``.

    ``x1``/``y1`` are the inclusive top-left corner and ``x2``/``y2`` the
    exclusive bottom-right corner. When any part of the box lies outside
    the image, the image is first enlarged via ``pad_img_to_fit_bbox`` and
    the coordinates it returns are used for the slice.

    The slice indexes three axes, so ``img`` must be a 3-D array.
    """
    rows, cols = img.shape[0], img.shape[1]
    box_fits = x1 >= 0 and y1 >= 0 and x2 <= cols and y2 <= rows
    if not box_fits:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
    return img[y1:y2, x1:x2, :]
|
|
|
|
def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    """Zero-pad ``img`` so the box (x1, y1)-(x2, y2) lies fully inside it.

    Args:
        img: array whose first two axes are rows (y) and columns (x); any
            further axes are left unpadded.
        x1, x2: left (inclusive) and right (exclusive) column of the box.
        y1, y2: top (inclusive) and bottom (exclusive) row of the box.

    Returns:
        ``(padded_img, x1, x2, y1, y2)`` with the box coordinates translated
        into the padded image's coordinate system. Note the x-before-y order
        of the returned coordinates.
    """
    # Amount of padding needed on each side (0 when the box already fits).
    top = max(0, -y1)
    left = max(0, -x1)
    bottom = max(0, y2 - img.shape[0])
    right = max(0, x2 - img.shape[1])

    pad_width = [(top, bottom), (left, right)] + [(0, 0)] * (img.ndim - 2)
    img = np.pad(img, pad_width, mode="constant")

    # Padding on the top/left moves the origin, so BOTH ends of the box must
    # be shifted by the same amount; otherwise the box shrinks.
    return img, x1 + left, x2 + left, y1 + top, y2 + top
|
|
|
|
|
|
|
|
# Path of the pre-trained EAST text detector used by grab_text().
_EAST_MODEL_PATH = "./frozen_east_text_detection.pb"

# The two EAST output layers we are interested in: the first yields the
# output probabilities, the second is used to derive the bounding box
# coordinates of text.
_EAST_LAYER_NAMES = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3",
]


@functools.lru_cache(maxsize=1)
def _load_east_net(model_path):
    """Read the EAST network from ``model_path`` once and reuse it afterwards."""
    return cv2.dnn.readNet(model_path)


def _decode_east_predictions(scores, geometry, min_score=0.5):
    """Turn EAST score/geometry volumes into candidate boxes and confidences.

    Returns ``(rects, confidences)`` where each rect is
    ``(startX, startY, endX, endY)`` in the coordinates of the network input.
    Cells scoring below ``min_score`` are skipped.
    """
    (num_rows, num_cols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(num_rows):
        # scores (probabilities) followed by the geometrical data used to
        # derive potential bounding boxes for this row
        scores_data = scores[0, 0, y]
        x_data0 = geometry[0, 0, y]
        x_data1 = geometry[0, 1, y]
        x_data2 = geometry[0, 2, y]
        x_data3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        for x in range(num_cols):
            if scores_data[x] < min_score:
                continue

            # the feature maps are 4x smaller than the input image
            (offset_x, offset_y) = (x * 4.0, y * 4.0)

            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # height and width of the bounding box
            h = x_data0[x] + x_data2[x]
            w = x_data1[x] + x_data3[x]

            end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
            end_y = int(offset_y - (sin * x_data1[x]) + (cos * x_data2[x]))
            start_x = int(end_x - w)
            start_y = int(end_y - h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(scores_data[x])

    return rects, confidences


def grab_text(roi):
    """Detect text regions in ``roi`` with EAST and OCR each one.

    For every detected region the crop and its binarized version are shown
    in the window named by the module-level ``winName`` and the recognized
    string is printed. Relies on the module-level ``imcrop`` helper.

    Args:
        roi: image array (BGR, as passed to ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``). ``None`` or an empty array is ignored.

    Returns:
        None.
    """
    # A degenerate detection box yields an empty crop; cv2.resize would
    # raise on it, so there is nothing to do.
    if roi is None or roi.size == 0:
        return

    (orig_h, orig_w) = roi.shape[:2]

    # network input size and the ratios needed to map boxes back to ``roi``
    (new_w, new_h) = (320, 320)
    ratio_w = orig_w / float(new_w)
    ratio_h = orig_h / float(new_h)

    resized = cv2.resize(roi, (new_w, new_h))

    # construct a blob from the image and run a forward pass of the model
    # to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(resized, 1.0, (new_w, new_h),
                                 (123.68, 116.78, 103.94),
                                 swapRB=True, crop=False)
    net = _load_east_net(_EAST_MODEL_PATH)
    net.setInput(blob)
    (scores, geometry) = net.forward(_EAST_LAYER_NAMES)

    rects, confidences = _decode_east_predictions(scores, geometry)

    # apply non-maxima suppression to suppress weak, overlapping boxes
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    for (start_x, start_y, end_x, end_y) in boxes:
        # scale the box back to the coordinates of ``roi``
        start_x = int(start_x * ratio_w)
        start_y = int(start_y * ratio_h)
        end_x = int(end_x * ratio_w)
        end_y = int(end_y * ratio_h)

        print("ok rectangle")

        # Crop from the untouched input so that no overlay pixels end up
        # in the image handed to the OCR step.
        inner_roi = imcrop(roi, start_x, start_y, end_x, end_y)
        if inner_roi.size == 0:
            continue
        cv2.imshow(winName, inner_roi)

        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(gray)
        cv2.imshow(winName, gray)
        print("TEXT: " + text)
|
|
|
|
|
|
|
|
|
|
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the (optional) video file")
args = vars(ap.parse_args())

# Minimum confidence threshold. Increasing this reduces false positives but
# also reduces the detection rate.
min_confidence = 0.14
model = 'yolov2.weights'
config = 'yolov2.cfg'

# Load names of classes (one label per line)
classes = None
with open('labels.txt', 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')
print(classes)

# Load weights and construct graph
net = cv2.dnn.readNetFromDarknet(config, model)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

winName = 'Running YOLO warehouse'
cv2.namedWindow(winName, cv2.WINDOW_NORMAL)

# The video file is optional: without it, read from the first capture device.
camera = cv2.VideoCapture(args["video"] if args.get("video") else 0)

# running counter printed for every accepted detection
j = 1
try:
    # keep looping
    while True:
        # grab the current frame
        (grabbed, frame) = camera.read()

        # No frame means the end of the video (or a failed capture); in that
        # case ``frame`` is None and cannot be processed.
        if not grabbed:
            break

        height, width = frame.shape[:2]

        # Annotations go on a separate copy: imcrop() may return a view into
        # ``frame``, so drawing on ``frame`` would leak into the crops that
        # are passed on for text recognition.
        annotated = frame.copy()

        # Create a 4D blob from a frame. ``mean`` and ``swapRB`` are spelled
        # out because the fourth positional argument of blobFromImage is the
        # mean, not the swapRB flag.
        blob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (416, 416),
                                     (0, 0, 0), swapRB=True, crop=False)
        net.setInput(blob)
        # Run the preprocessed input blob through the network
        predictions = net.forward()
        # class scores start at this index of every prediction row
        probability_index = 5

        for detection in predictions:
            prob_arr = detection[probability_index:]
            class_index = prob_arr.argmax(axis=0)
            confidence = prob_arr[class_index]
            if confidence <= min_confidence:
                continue

            # the first four values are scaled by the frame size to get the
            # box centre and box size in pixels
            x_center = detection[0] * width
            y_center = detection[1] * height
            width_box = detection[2] * width
            height_box = detection[3] * height

            x1 = int(x_center - width_box * 0.5)
            y1 = int(y_center - height_box * 0.5)
            x2 = int(x_center + width_box * 0.5)
            y2 = int(y_center + height_box * 0.5)

            roi = imcrop(frame, x1, y1, x2, y2)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (255, 255, 255), 1)
            cv2.putText(annotated,
                        classes[class_index] + " " + "{0:.1f}".format(confidence),
                        (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (255, 255, 255), 1, cv2.LINE_AA)

            # show the cropped image that should contain the label
            cv2.imshow(winName, roi)
            print("frame analyzed: " + str(j))
            j += 1
            grab_text(roi)

        time.sleep(.1)
        key = cv2.waitKey(1) & 0xFF

        # if the 'q' key is pressed, stop the loop
        if key == ord("q"):
            break
finally:
    # cleanup the camera and close any open windows, even after an error
    camera.release()
    cv2.destroyAllWindows()
|
|
|
|
|
|
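# USAGE
# python <this_script>.py --video path/to/video.mp4
#
# The parser defines only -v/--video. The model and label files are read from
# fixed paths in the working directory: yolov2.weights, yolov2.cfg, labels.txt
# and frozen_east_text_detection.pb.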
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|