"""Detect layout elements in a document image with a Detectron2 model.

This code uses a Detectron2 model to detect layout elements like text blocks
or images in a document image. It draws boxes around detected areas and prints
how many were found.
"""
import cv2
import matplotlib.pyplot as plt
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
# Load image. cv2.imread returns None (it does not raise) when the file is
# missing or cannot be decoded, so fail early with a clear message.
image_path = 'sample_document.jpg'
image_bgr = cv2.imread(image_path)
if image_bgr is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')

# Setup Detectron2 config for layout detection
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# The number of classes must match the checkpoint loaded below.
# NOTE(review): 5 presumably corresponds to the PubLayNet categories - confirm.
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
# Detections scoring below this threshold are discarded.
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
# NOTE(review): confirm that this checkpoint URL resolves and that the weights
# were trained with the architecture selected by the config file above.
cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detectron2/PubLayNet/mask_rcnn_R_50_FPN_3x/164590034/model_final_ba5f84.pkl"
predictor = DefaultPredictor(cfg)

# Run prediction. DefaultPredictor takes the image in BGR channel order (the
# order cv2.imread produces) and handles cfg.INPUT.FORMAT itself, so the image
# is passed as loaded; converting it to RGB first would swap the red and blue
# channels seen by the model.
outputs = predictor(image_bgr)

# Extract boxes and classes as NumPy arrays on the CPU
instances = outputs['instances']
boxes = instances.pred_boxes.tensor.cpu().numpy()
classes = instances.pred_classes.cpu().numpy()

# Show results: draw each box and its class index onto the BGR image
for box, cls in zip(boxes, classes):
    # Plain Python ints are the safest coordinate type for cv2 drawing calls.
    x1, y1, x2, y2 = box.astype(int).tolist()
    cv2.rectangle(image_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)
    # Keep the label baseline inside the canvas for boxes touching the top edge.
    label_y = max(y1 - 10, 15)
    cv2.putText(image_bgr, str(cls), (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

# matplotlib expects RGB, so convert only for display
plt.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()
print(f'Found {len(boxes)} layout elements.')