# flywms/ware_detect.py

from imutils.object_detection import non_max_suppression
import time
import cv2
import argparse
import numpy as np
import sys


# import the necessary packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os


def imcrop(img, x1, y1, x2, y2):
    """Crop the box (x1, y1)-(x2, y2) out of a 3-D image array.

    ``x`` indexes columns (axis 1) and ``y`` indexes rows (axis 0); the end
    coordinates are exclusive, as in ordinary slicing.

    When the box lies entirely inside ``img`` the result is a plain slice of
    ``img``. Otherwise ``pad_img_to_fit_bbox`` is asked for a padded image
    and adjusted coordinates, and the slice is taken from that instead.
    """
    inside = x1 >= 0 and y1 >= 0 and x2 <= img.shape[1] and y2 <= img.shape[0]
    if not inside:
        # The box sticks out of the image: let the helper pad the image and
        # re-express the box in the padded image's coordinates.
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
    return img[y1:y2, x1:x2, :]

def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    """Zero-pad a 3-D image so the box (x1, y1)-(x2, y2) fits inside it.

    Note the argument order: both x coordinates come first, then both y
    coordinates. ``x`` indexes columns (axis 1), ``y`` indexes rows (axis 0).

    Args:
        img: image array with three axes (rows, columns, channels).
        x1, x2: start / exclusive end column of the box; ``x1`` may be
            negative and ``x2`` may exceed the image width.
        y1, y2: start / exclusive end row of the box; ``y1`` may be negative
            and ``y2`` may exceed the image height.

    Returns:
        ``(img, x1, x2, y1, y2)``: the padded image and the same box expressed
        in the padded image's coordinates.
    """
    # Work out all four pad widths up front from the *original* coordinates.
    # Shifting x1/y1 first and then re-deriving the shift from the already
    # shifted value would yield 0 and leave x2/y2 unshifted.
    pad_top = max(0, -y1)
    pad_bottom = max(0, y2 - img.shape[0])
    pad_left = max(0, -x1)
    pad_right = max(0, x2 - img.shape[1])

    # The channel axis is never padded.
    img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), mode="constant")

    # Padding on the top/left moves the origin, so both ends of the box move
    # by the same amount.
    return img, x1 + pad_left, x2 + pad_left, y1 + pad_top, y2 + pad_top


# EAST networks that have already been loaded, keyed by model path, so the
# weights are read from disk once rather than on every grab_text() call.
_EAST_NETS = {}

# Output layers requested from the EAST detector: the first yields the text
# probabilities, the second is used to derive the bounding box coordinates.
_EAST_LAYER_NAMES = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"]


def _get_east_net(model_path):
    """Return the network stored at ``model_path``, loading it on first use."""
    east_net = _EAST_NETS.get(model_path)
    if east_net is None:
        east_net = cv2.dnn.readNet(model_path)
        _EAST_NETS[model_path] = east_net
    return east_net


def _decode_east(scores, geometry, min_score):
    """Turn the raw EAST outputs into candidate boxes and their scores.

    Returns ``(rects, confidences)``: ``rects`` is a list of
    ``(startX, startY, endX, endY)`` tuples in the coordinates of the resized
    network input, ``confidences`` the matching probabilities.
    """
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(0, numRows):
        # probabilities for this row, followed by the geometry channels used
        # to derive the boxes and the rotation angle
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        for x in range(0, numCols):
            # ignore cells whose score is below the threshold
            if scoresData[x] < min_score:
                continue

            # the feature maps are 4x smaller than the network input
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # height and width of the box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # end corner first, then step back by (w, h) for the start corner
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    return rects, confidences


def grab_text(roi, east_model="./frozen_east_text_detection.pb", min_score=0.5):
    """Find text regions in ``roi`` with EAST and OCR each one with Tesseract.

    For every text box that survives non-maxima suppression, the matching
    crop of ``roi`` is converted to grayscale, Otsu-thresholded, shown in the
    ``winName`` window (a module-level name) and passed to
    ``pytesseract.image_to_string``; the recognised text is printed.

    Args:
        roi: image array to search; it is cropped with ``imcrop`` and
            converted with ``COLOR_BGR2GRAY``, so a 3-channel BGR image is
            assumed.
        east_model: path of the EAST model file. Each distinct path is loaded
            once and then reused across calls.
        min_score: minimum text probability for a candidate to be kept.

    Returns:
        None. Results are reported via ``print`` and ``cv2.imshow`` only.
    """
    # Nothing to analyse (e.g. a degenerate detection box upstream);
    # cv2.resize would raise on an empty image.
    if roi is None or roi.size == 0:
        return

    # The detector runs on a fixed 320x320 input; keep the ratios needed to
    # map its boxes back onto the original crop.
    (origH, origW) = roi.shape[:2]
    (newW, newH) = (320, 320)
    rW = origW / float(newW)
    rH = origH / float(newH)
    resized = cv2.resize(roi, (newW, newH))

    # construct a blob from the image and perform a forward pass of the
    # model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(resized, 1.0, (newW, newH), (123.68, 116.78, 103.94), swapRB=True, crop=False)
    east_net = _get_east_net(east_model)
    east_net.setInput(blob)
    (scores, geometry) = east_net.forward(_EAST_LAYER_NAMES)

    rects, confidences = _decode_east(scores, geometry, min_score)
    if not rects:
        return

    # suppress weak, overlapping bounding boxes
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    for (startX, startY, endX, endY) in boxes:
        # scale the box back to the coordinates of the original crop
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # Crop from the untouched input. Drawing the box outline onto the
        # buffer that is cropped would put the outline into the OCR input.
        inner_roi = imcrop(roi, startX, startY, endX, endY)
        if inner_roi.size == 0:
            # zero-area box after scaling: cvtColor would raise on it
            continue

        print("ok rectangle")
        cv2.imshow(winName, inner_roi)
        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(gray)
        cv2.imshow(winName, gray)
        print("TEXT: " + text)


# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the (optional) video file")
args = vars(ap.parse_args())


# Minimum class score for a prediction to be kept. Raising it reduces false
# positives but also lowers the detection rate.
min_confidence = 0.14
model = 'yolov2.weights'
config = 'yolov2.cfg'

# Load names of classes (one per line)
classes = None
with open('labels.txt', 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')
print(classes)

# Load weights and construct graph
net = cv2.dnn.readNetFromDarknet(config, model)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

winName = 'Running YOLO warehouse'

cv2.namedWindow(winName, cv2.WINDOW_NORMAL)

# --video is optional: without it, read from the default camera (index 0)
# instead of handing None to VideoCapture.
camera = cv2.VideoCapture(args["video"] if args.get("video") else 0)

# running count of analysed detections (used in the progress message)
j = 1
try:
    # keep looping
    while True:
        # grab the current frame
        (grabbed, frame) = camera.read()

        # No frame means the video ended or the camera stopped delivering;
        # either way `frame` is unusable, so stop.
        if not grabbed:
            break

        # Get width and height
        height, width, ch = frame.shape

        # Boxes and labels are drawn onto `frame` below. Crops handed to the
        # OCR step come from this untouched copy so the drawings never end
        # up inside them.
        clean_frame = frame.copy()

        # Create a 4D blob from a frame. swapRB must be passed by keyword:
        # the fourth positional parameter of blobFromImage is `mean`.
        blob = cv2.dnn.blobFromImage(frame, 1.0/255, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        # Run the preprocessed input blob through the network
        predictions = net.forward()
        # per-class scores start at this column of each prediction row
        probability_index = 5

        for prediction in predictions:
            prob_arr = prediction[probability_index:]
            class_index = prob_arr.argmax(axis=0)
            confidence = prob_arr[class_index]
            if confidence <= min_confidence:
                continue

            # box centre and size are scaled by the frame size
            x_center = prediction[0] * width
            y_center = prediction[1] * height
            width_box = prediction[2] * width
            height_box = prediction[3] * height

            x1 = int(x_center - width_box * 0.5)
            y1 = int(y_center - height_box * 0.5)
            x2 = int(x_center + width_box * 0.5)
            y2 = int(y_center + height_box * 0.5)

            roi = imcrop(clean_frame, x1, y1, x2, y2)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 1)
            cv2.putText(frame, classes[class_index] + " " + "{0:.1f}".format(confidence), (x1, y1),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)

            # show the cropped image that should contain the label
            cv2.imshow(winName, roi)
            print("frame analyzed: " + str(j))
            j += 1
            grab_text(roi)

        time.sleep(.1)
        key = cv2.waitKey(1) & 0xFF

        # if the 'q' key is pressed, stop the loop
        if key == ord("q"):
            break
finally:
    # cleanup the camera and close any open windows, even if a frame failed
    camera.release()
    cv2.destroyAllWindows()


# USAGE
# python ware_detect.py --video path/to/video.mp4
# Expects yolov2.cfg, yolov2.weights, labels.txt and
# frozen_east_text_detection.pb in the current working directory.