"""Detect labels in a video stream with YOLOv2 and read their text.

Each frame is passed through a YOLOv2 network (Darknet weights). Every
detection above ``min_confidence`` is cropped out of the frame and handed to
``grab_text``, which locates text regions with the EAST detector and runs
Tesseract OCR on each of them.

The following files are read from the current working directory:
``yolov2.weights``, ``yolov2.cfg``, ``labels.txt`` and
``frozen_east_text_detection.pb``.

USAGE
    python <this_script> --video path/to/video.mp4
    python <this_script>            # no --video: use the default camera
"""
import argparse
import os
import sys
import time

import cv2
import numpy as np
import pytesseract
from imutils.object_detection import non_max_suppression
from PIL import Image

# Minimum confidence threshold. Increasing this reduces false positives but
# also reduces the detection rate.
min_confidence = 0.14
model = 'yolov2.weights'
config = 'yolov2.cfg'

# Name of the single OpenCV window shared by the main loop and grab_text().
winName = 'Running YOLO warehouse'

# EAST text detector settings.
EAST_MODEL_PATH = "./frozen_east_text_detection.pb"
# The first layer yields the text probabilities, the second one the geometry
# used to derive the bounding boxes.
EAST_LAYER_NAMES = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3",
]
EAST_INPUT_SIZE = (320, 320)  # (width, height) fed to the network
EAST_MIN_SCORE = 0.5

# Lazily loaded EAST network, shared by every grab_text() call.
_east_net = None


def imcrop(img, x1, y1, x2, y2):
    """Return ``img[y1:y2, x1:x2]``, zero-padding where the box leaves the image.

    Args:
        img: 3-dimensional image array indexed as (row, column, channel).
        x1, y1: top-left corner of the box (may be negative).
        x2, y2: bottom-right corner of the box, exclusive (may exceed the
            image size).

    Returns:
        The cropped array. When the box lies inside the image this is a view
        of ``img``; otherwise it is a slice of a padded copy.
    """
    if x1 < 0 or y1 < 0 or x2 > img.shape[1] or y2 > img.shape[0]:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
    return img[y1:y2, x1:x2, :]


def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    """Zero-pad ``img`` so that the box fits and translate the box accordingly.

    Args:
        img: 3-dimensional image array indexed as (row, column, channel).
        x1, x2: left and right (exclusive) box columns.
        y1, y2: top and bottom (exclusive) box rows.

    Returns:
        ``(padded_img, x1, x2, y1, y2)`` with the coordinates expressed in the
        padded image.
    """
    pad_top = max(0, -y1)
    pad_bottom = max(0, y2 - img.shape[0])
    pad_left = max(0, -x1)
    pad_right = max(0, x2 - img.shape[1])
    img = np.pad(
        img,
        ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
        mode="constant",
    )
    # Both corners move by the same top/left padding. The offsets are
    # computed before any coordinate is modified; deriving the shift of
    # y2/x2 from an already-shifted y1/x1 would always give zero.
    return img, x1 + pad_left, x2 + pad_left, y1 + pad_top, y2 + pad_top


def _get_east_net():
    """Return the EAST network, reading it from disk on first use only."""
    global _east_net
    if _east_net is None:
        _east_net = cv2.dnn.readNet(EAST_MODEL_PATH)
    return _east_net


def _decode_east_predictions(scores, geometry, min_score=EAST_MIN_SCORE):
    """Turn raw EAST outputs into candidate boxes and their scores.

    Args:
        scores: output of the probability layer; indexed as
            ``scores[0, 0, row, col]``.
        geometry: output of the geometry layer; indexed as
            ``geometry[0, channel, row, col]`` with channels 0-3 holding the
            box distances and channel 4 the rotation angle.
        min_score: cells scoring below this value are ignored.

    Returns:
        ``(rects, confidences)`` where ``rects`` is a list of
        ``(startX, startY, endX, endY)`` tuples in network-input pixels.
    """
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(numRows):
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        for x in range(numCols):
            if scoresData[x] < min_score:
                continue

            # The feature maps are 4x smaller than the input image.
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # Height and width of the box from the geometry volume.
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    return rects, confidences


def grab_text(roi):
    """Find text regions in ``roi`` with EAST and OCR each one with Tesseract.

    Every region is shown in the ``winName`` window (first in colour, then
    binarised) and its recognised text is printed.

    Args:
        roi: BGR image (3-dimensional array) to search for text.

    Returns:
        List of the recognised strings, one per text region; empty when
        ``roi`` is empty or no text region is found.
    """
    texts = []
    if roi is None or roi.size == 0:
        # cv2.resize raises on an empty image; nothing to read anyway.
        return texts

    (origH, origW) = roi.shape[:2]

    # Ratios used to map boxes from the network input back onto the ROI.
    (newW, newH) = EAST_INPUT_SIZE
    rW = origW / float(newW)
    rH = origH / float(newH)

    image = cv2.resize(roi, (newW, newH))
    blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH),
                                 (123.68, 116.78, 103.94),
                                 swapRB=True, crop=False)
    net = _get_east_net()
    net.setInput(blob)
    (scores, geometry) = net.forward(EAST_LAYER_NAMES)

    rects, confidences = _decode_east_predictions(scores, geometry)

    # Suppress weak, overlapping bounding boxes.
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    for (startX, startY, endX, endY) in boxes:
        # Scale the box back to the size of the ROI.
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        if endX <= startX or endY <= startY:
            # Degenerate box: the crop would be empty and OpenCV would raise.
            continue

        print("ok rectangle")
        # Crop from the untouched ROI: drawing the box before cropping would
        # leave its border inside the image handed to Tesseract.
        inner_roi = imcrop(roi, startX, startY, endX, endY)
        cv2.imshow(winName, inner_roi)

        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(gray)
        cv2.imshow(winName, gray)
        print("TEXT: " + text)
        texts.append(text)

    return texts


def main():
    """Run YOLOv2 on every frame and OCR each detected region."""
    ap = argparse.ArgumentParser()
    ap.add_argument("-v", "--video",
                    help="path to the (optional) video file")
    args = vars(ap.parse_args())

    # Load names of classes
    with open('labels.txt', 'rt') as f:
        classes = f.read().rstrip('\n').split('\n')
    print(classes)

    # Load weights and construct graph
    net = cv2.dnn.readNetFromDarknet(config, model)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cv2.namedWindow(winName, cv2.WINDOW_NORMAL)

    # --video is optional: fall back to the default camera when it is absent.
    source = args["video"] if args.get("video") else 0
    camera = cv2.VideoCapture(source)
    if not camera.isOpened():
        raise SystemExit("could not open video source: {}".format(source))

    # Index of the first class probability in a prediction row; the
    # preceding entries hold the box and the objectness score.
    probability_index = 5
    j = 1

    try:
        while True:
            (grabbed, frame) = camera.read()

            # No frame means end of the video file or a failed camera read;
            # in both cases there is nothing left to process.
            if not grabbed:
                break

            height, width = frame.shape[:2]

            # Create a 4D blob from the frame. The mean must be given
            # explicitly so that swapRB is not taken for it.
            blob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (416, 416),
                                         (0, 0, 0), swapRB=True, crop=False)
            net.setInput(blob)

            # Run the preprocessed input blob through the network
            predictions = net.forward()

            for prediction in predictions:
                prob_arr = prediction[probability_index:]
                class_index = prob_arr.argmax(axis=0)
                confidence = prob_arr[class_index]
                if confidence <= min_confidence:
                    continue

                # Box centre and size are relative to the frame size.
                x_center = prediction[0] * width
                y_center = prediction[1] * height
                width_box = prediction[2] * width
                height_box = prediction[3] * height

                x1 = int(x_center - width_box * 0.5)
                y1 = int(y_center - height_box * 0.5)
                x2 = int(x_center + width_box * 0.5)
                y2 = int(y_center + height_box * 0.5)

                if x2 <= x1 or y2 <= y1:
                    # Empty box: nothing to show or to read.
                    continue

                # Copy the crop: imcrop may return a view of the frame, and
                # the annotations drawn below would otherwise bleed into the
                # image passed to the OCR step.
                roi = imcrop(frame, x1, y1, x2, y2).copy()

                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 1)
                cv2.putText(frame,
                            classes[class_index] + " "
                            + "{0:.1f}".format(confidence),
                            (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1,
                            (255, 255, 255), 1, cv2.LINE_AA)

                # Show the cropped image that should contain the label.
                cv2.imshow(winName, roi)
                print("frame analyzed: " + str(j))
                j += 1
                grab_text(roi)

            time.sleep(.1)
            key = cv2.waitKey(1) & 0xFF

            # if the 'q' key is pressed, stop the loop
            if key == ord("q"):
                break
    finally:
        # Release the camera and close any open windows, even on error.
        camera.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()