Initial import

2026-05-14 09:36:44 +02:00
commit 747298ac7a
1212 changed files with 56349 additions and 0 deletions
--- a/ware_detect.py
+++ b/ware_detect.py
@@ -0,0 +1,266 @@
+from imutils.object_detection import non_max_suppression
+import time
+import cv2
+import argparse
+import numpy as np
+import sys
+
+
+
+# import the necessary packages
+from PIL import Image
+import pytesseract
+import argparse
+import cv2
+import os
+
+
+
+def imcrop(img, x1, y1, x2, y2):
+
+    if x1 < 0 or y1 < 0 or x2 > img.shape[1] or y2 > img.shape[0]:
+        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)
+    return img[y1:y2, x1:x2, :]
+
+def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
+        img = np.pad(img, ((np.abs(np.minimum(0, y1)), np.maximum(y2 - img.shape[0], 0)),
+                           (np.abs(np.minimum(0, x1)), np.maximum(x2 - img.shape[1], 0)), (0, 0)), mode="constant")
+        y1 += np.abs(np.minimum(0, y1))
+        y2 += np.abs(np.minimum(0, y1))
+        x1 += np.abs(np.minimum(0, x1))
+        x2 += np.abs(np.minimum(0, x1))
+        return img, x1, x2, y1, y2
+
+
+
+def grab_text(roi):
+    # load the input image and grab the image dimensions
+    image = roi
+
+    orig = image.copy()
+    (H, W) = image.shape[:2]
+
+    # set the new width and height and then determine the ratio in change
+    # for both the width and height
+    (newW, newH) = (320, 320)
+    rW = W / float(newW)
+    rH = H / float(newH)
+
+    # resize the image and grab the new image dimensions
+    image = cv2.resize(image, (newW, newH))
+    (H, W) = image.shape[:2]
+
+    # define the two output layer names for the EAST detector model that
+    # we are interested -- the first is the output probabilities and the
+    # second can be used to derive the bounding box coordinates of text
+    layerNames = [
+        "feature_fusion/Conv_7/Sigmoid",
+        "feature_fusion/concat_3"]
+
+    net = cv2.dnn.readNet("./frozen_east_text_detection.pb")
+
+    # construct a blob from the image and then perform a forward pass of
+    # the model to obtain the two output layer sets
+    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),(123.68, 116.78, 103.94), swapRB=True, crop=False)
+    #start = time.time()
+    net.setInput(blob)
+    (scores, geometry) = net.forward(layerNames)
+    #end = time.time()
+
+    # show timing information on text prediction
+    #print("[INFO] text detection took {:.6f} seconds".format(end - start))
+
+    # grab the number of rows and columns from the scores volume, then
+    # initialize our set of bounding box rectangles and corresponding
+    # confidence scores
+    (numRows, numCols) = scores.shape[2:4]
+    rects = []
+    confidences = []
+
+    # loop over the number of rows
+    for y in range(0, numRows):
+        # extract the scores (probabilities), followed by the geometrical
+        # data used to derive potential bounding box coordinates that
+        # surround text
+        scoresData = scores[0, 0, y]
+        xData0 = geometry[0, 0, y]
+        xData1 = geometry[0, 1, y]
+        xData2 = geometry[0, 2, y]
+        xData3 = geometry[0, 3, y]
+        anglesData = geometry[0, 4, y]
+
+        # loop over the number of columns
+        for x in range(0, numCols):
+            # if our score does not have sufficient probability, ignore it
+            if scoresData[x] < 0.5:
+                continue
+
+            # compute the offset factor as our resulting feature maps will
+            # be 4x smaller than the input image
+            (offsetX, offsetY) = (x * 4.0, y * 4.0)
+
+            # extract the rotation angle for the prediction and then
+            # compute the sin and cosine
+            angle = anglesData[x]
+            cos = np.cos(angle)
+            sin = np.sin(angle)
+
+            # use the geometry volume to derive the width and height of
+            # the bounding box
+            h = xData0[x] + xData2[x]
+            w = xData1[x] + xData3[x]
+
+            # compute both the starting and ending (x, y)-coordinates for
+            # the text prediction bounding box
+            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
+            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
+            startX = int(endX - w)
+            startY = int(endY - h)
+
+            # add the bounding box coordinates and probability score to
+            # our respective lists
+            rects.append((startX, startY, endX, endY))
+            confidences.append(scoresData[x])
+
+    # apply non-maxima suppression to suppress weak, overlapping bounding
+    # boxes
+    boxes = non_max_suppression(np.array(rects), probs=confidences)
+
+    # loop over the bounding boxes
+    for (startX, startY, endX, endY) in boxes:
+        # scale the bounding box coordinates based on the respective
+        # ratios
+        startX = int(startX * rW)
+        startY = int(startY * rH)
+        endX = int(endX * rW)
+        endY = int(endY * rH)
+
+        # draw the bounding box on the image
+        print("ok rectangle")
+        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
+        inner_roi = imcrop(orig, startX, startY, endX, endY)
+        cv2.imshow(winName, inner_roi)
+        gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
+        gray = cv2.threshold(gray, 0, 255,cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
+        text = pytesseract.image_to_string(gray)
+        cv2.imshow(winName, gray)
+        print ("TEXT: " + text)
+    # show the output image
+    #if startX and startY and endX and  endY:
+    #    inner_roi = imcrop(orig, startX, startY, endX, endY)
+    #print("startX, startY, endX, endY:" + str(startX) +"_"+ str(startY) +"_" +str(endX) +"_"+ str(endY))
+
+    #time.sleep(.3)
+    #cv2.waitKey(0)
+
+
+
+
+# construct the argument parse and parse the arguments
+ap = argparse.ArgumentParser()
+ap.add_argument("-v", "--video",help="path to the (optional) video file")
+args = vars(ap.parse_args())
+
+
+
+
+
+
+
+# Minimum confidence threshold. Increasing this will improve false positives but will also reduce detection rate.
+min_confidence = 0.14
+model = 'yolov2.weights'
+config = 'yolov2.cfg'
+
+# Load names of classes
+classes = None
+with open('labels.txt', 'rt') as f:
+    classes = f.read().rstrip('\n').split('\n')
+print(classes)
+
+# Load weights and construct graph
+net = cv2.dnn.readNetFromDarknet(config, model)
+net.setPreferableBackend(cv2.dnn.DNN_BACKEND_DEFAULT)
+net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
+
+winName = 'Running YOLO warehouse'
+
+cv2.namedWindow(winName, cv2.WINDOW_NORMAL)
+
+
+camera = cv2.VideoCapture(args["video"])
+
+# keep looping
+j=1
+while True:
+    # grab the current frame
+    (grabbed, frame) = camera.read()
+    # Get width and height
+
+
+    # if we are viewing a video and we did not grab a frame,
+    # then we have reached the end of the video
+    if args.get("video") and not grabbed:
+        break
+
+    height, width, ch = frame.shape
+
+    # Create a 4D blob from a frame.
+    blob = cv2.dnn.blobFromImage(frame, 1.0/255, (416, 416), True, crop=False)
+    net.setInput(blob)
+    # Run the preprocessed input blog through the network
+    predictions = net.forward()
+    probability_index = 5
+
+    for i in range(predictions.shape[0]):
+        prob_arr = predictions[i][probability_index:]
+        class_index = prob_arr.argmax(axis=0)
+        confidence = prob_arr[class_index]
+        if confidence > min_confidence:
+            x_center = predictions[i][0] * width
+            y_center = predictions[i][1] * height
+            width_box = predictions[i][2] * width
+            height_box = predictions[i][3] * height
+
+            x1 = int(x_center - width_box * 0.5)
+            y1 = int(y_center - height_box * 0.5)
+            x2 = int(x_center + width_box * 0.5)
+            y2 = int(y_center + height_box * 0.5)
+            #roi = frame[y1:y2, x1:x2]
+            roi = imcrop(frame, x1, y1, x2, y2)
+            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 1)
+            cv2.putText(frame, classes[class_index] + " " + "{0:.1f}".format(confidence), (x1, y1),
+                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)
+            #cv2.imwrite("out_"+args.input, frame)
+            #mostro l'immagine cropped che dovrebbe contenere l'etichetta
+            cv2.imshow(winName, roi)
+            print("frame analyzed: " + str(j))
+            j += 1
+            grab_text(roi)
+
+
+    time.sleep(.1)
+    key = cv2.waitKey(1) & 0xFF
+
+    # if the 'q' key is pressed, stop the loop
+    if key == ord("q"):
+	    break
+
+# cleanup the camera and close any open windows
+camera.release()
+cv2.destroyAllWindows()
+
+
+# USAGE
+# python ware_detect.py --video path/to/video.mp4
+
+
+
+
+
+
+
+
+
+
+