| | import cv2 |
| | import numpy as np |
| | import supervision as sv |
| |
|
| | import torch |
| | import torchvision |
| | from torchvision.transforms import ToTensor |
| |
|
| | from groundingdino.util.inference import Model |
| |
|
| | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
| |
|
| | |
| | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" |
| | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" |
| |
|
| | |
| | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) |
| |
|
| | |
| | EFFICIENT_SAM_CHECHPOINT_PATH = "./EfficientSAM/efficientsam_s_gpu.jit" |
| | efficientsam = torch.jit.load(EFFICIENT_SAM_CHECHPOINT_PATH) |
| |
|
| |
|
| | |
| | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" |
| | CLASSES = ["bench"] |
| | BOX_THRESHOLD = 0.25 |
| | TEXT_THRESHOLD = 0.25 |
| | NMS_THRESHOLD = 0.8 |
| |
|
| |
|
| | |
| | image = cv2.imread(SOURCE_IMAGE_PATH) |
| |
|
| | |
| | detections = grounding_dino_model.predict_with_classes( |
| | image=image, |
| | classes=CLASSES, |
| | box_threshold=BOX_THRESHOLD, |
| | text_threshold=BOX_THRESHOLD |
| | ) |
| |
|
| | |
| | box_annotator = sv.BoxAnnotator() |
| | labels = [ |
| | f"{CLASSES[class_id]} {confidence:0.2f}" |
| | for _, _, confidence, class_id, _ |
| | in detections] |
| | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) |
| |
|
| | |
| | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) |
| |
|
| |
|
| | |
| | print(f"Before NMS: {len(detections.xyxy)} boxes") |
| | nms_idx = torchvision.ops.nms( |
| | torch.from_numpy(detections.xyxy), |
| | torch.from_numpy(detections.confidence), |
| | NMS_THRESHOLD |
| | ).numpy().tolist() |
| |
|
| | detections.xyxy = detections.xyxy[nms_idx] |
| | detections.confidence = detections.confidence[nms_idx] |
| | detections.class_id = detections.class_id[nms_idx] |
| |
|
| | print(f"After NMS: {len(detections.xyxy)} boxes") |
| |
|
| |
|
| | def efficient_sam_box_prompt_segment(image, pts_sampled, model): |
| | bbox = torch.reshape(torch.tensor(pts_sampled), [1, 1, 2, 2]) |
| | bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2]) |
| | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) |
| | img_tensor = ToTensor()(image) |
| |
|
| | predicted_logits, predicted_iou = model( |
| | img_tensor[None, ...].cuda(), |
| | bbox.cuda(), |
| | bbox_labels.cuda(), |
| | ) |
| | predicted_logits = predicted_logits.cpu() |
| | all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy() |
| | predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy() |
| |
|
| | max_predicted_iou = -1 |
| | selected_mask_using_predicted_iou = None |
| | for m in range(all_masks.shape[0]): |
| | curr_predicted_iou = predicted_iou[m] |
| | if ( |
| | curr_predicted_iou > max_predicted_iou |
| | or selected_mask_using_predicted_iou is None |
| | ): |
| | max_predicted_iou = curr_predicted_iou |
| | selected_mask_using_predicted_iou = all_masks[m] |
| | return selected_mask_using_predicted_iou |
| |
|
| |
|
| | |
| | result_masks = [] |
| | for box in detections.xyxy: |
| | mask = efficient_sam_box_prompt_segment(image, box, efficientsam) |
| | result_masks.append(mask) |
| |
|
| | detections.mask = np.array(result_masks) |
| |
|
| | |
| | box_annotator = sv.BoxAnnotator() |
| | mask_annotator = sv.MaskAnnotator() |
| | labels = [ |
| | f"{CLASSES[class_id]} {confidence:0.2f}" |
| | for _, _, confidence, class_id, _ |
| | in detections] |
| | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) |
| | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) |
| |
|
| | |
| | cv2.imwrite("EfficientSAM/gronded_efficient_sam_anontated_image.jpg", annotated_image) |
| |
|