from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
import time
from PIL import Image
from torchvision.transforms import ToTensor

# from transformers import SamModel, SamProcessor
from efficient_sam.build_efficient_sam import build_efficient_sam_vits
from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor
from efficientvit.sam_model_zoo import create_sam_model

MARKDOWN = """
# EfficientViT-SAM vs EfficientSAM vs SAM
Paper sources: [EfficientViT-SAM](https://arxiv.org/abs/2402.05008), [EfficientSAM](https://arxiv.org/abs/2312.00863), and [SAM](https://arxiv.org/abs/2304.02643) \n
GitHub source code: [Link](https://github.com/pg56714/Segment-Anything-Arena) \n
The SAM model takes about a minute to run to completion, which slows down the other models, so only EfficientViT-SAM and EfficientSAM are shown by default. The source code for all three models is included, but the SAM path is commented out.
"""

BOX_EXAMPLES = [
    ["https://media.roboflow.com/efficient-sam/corgi.jpg", 801, 510, 1782, 993],
]

PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
MASK_COLOR = sv.Color.from_hex("#FF0000")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE).eval()
# SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
EFFICIENT_SAM_MODEL = build_efficient_sam_vits().to(DEVICE).eval()
MASK_ANNOTATOR = sv.MaskAnnotator(color=MASK_COLOR, color_lookup=sv.ColorLookup.INDEX)
EFFICIENTVITSAM = EfficientViTSamPredictor(
    create_sam_model(name="xl1", weight_url="./weights/xl1.pt").to(DEVICE).eval()
)


def annotate_image_with_box_prompt_result(
    image: np.ndarray,
    detections: sv.Detections,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int,
) -> np.ndarray:
    # Overlay the predicted mask and the box prompt on the image (RGB in, RGB out).
    h, w, _ = image.shape
    bgr_image = image[:, :, ::-1]
    annotated_bgr_image = MASK_ANNOTATOR.annotate(
        scene=bgr_image.copy(), detections=detections
    )
    annotated_bgr_image = sv.draw_rectangle(
        scene=annotated_bgr_image,
        rect=sv.Rect(
            x=x_min,
            y=y_min,
            width=int(x_max - x_min),
            height=int(y_max - y_min),
        ),
        color=PROMPT_COLOR,
        thickness=sv.calculate_optimal_line_thickness(resolution_wh=(w, h)),
    )
    return annotated_bgr_image[:, :, ::-1]


def efficientvit_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    t1 = time.time()
    box = np.array([[x_min, y_min, x_max, y_max]])
    EFFICIENTVITSAM.set_image(image)
    # predict returns (masks, iou_predictions, low_res_masks); keep only the masks.
    mask = EFFICIENTVITSAM.predict(box=box, multimask_output=False)
    mask = mask[0]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    result = annotate_image_with_box_prompt_result(
        image=image,
        detections=detections,
        x_max=x_max,
        x_min=x_min,
        y_max=y_max,
        y_min=y_min,
    )
    t2 = time.time()
    print(f"timecost: {t2 - t1}")
    return result


def inference_with_box(
    image: np.ndarray,
    box: np.ndarray,
    model: torch.nn.Module,
    device: torch.device,
) -> np.ndarray:
    # EfficientSAM encodes a box prompt as two labeled points:
    # label 2 marks the top-left corner and label 3 the bottom-right corner.
    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
    img_tensor = ToTensor()(image)

    predicted_logits, predicted_iou = model(
        img_tensor[None, ...].to(device),
        bbox.to(device),
        bbox_labels.to(device),
    )
    predicted_logits = predicted_logits.cpu()
    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()

    # Keep the candidate mask with the highest predicted IoU.
    max_predicted_iou = -1
    selected_mask_using_predicted_iou = None
    for m in range(all_masks.shape[0]):
        curr_predicted_iou = predicted_iou[m]
        if (
            curr_predicted_iou > max_predicted_iou
            or selected_mask_using_predicted_iou is None
        ):
            max_predicted_iou = curr_predicted_iou
            selected_mask_using_predicted_iou = all_masks[m]
    return selected_mask_using_predicted_iou
def efficient_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    t1 = time.time()
    box = np.array([[x_min, y_min], [x_max, y_max]])
    mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    result = annotate_image_with_box_prompt_result(
        image=image,
        detections=detections,
        x_max=x_max,
        x_min=x_min,
        y_max=y_max,
        y_min=y_min,
    )
    t2 = time.time()
    print(f"timecost: {t2 - t1}")
    return result


# def sam_box_inference(
#     image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
# ) -> np.ndarray:
#     t1 = time.time()
#     input_boxes = [[[x_min, y_min, x_max, y_max]]]
#     inputs = SAM_PROCESSOR(
#         Image.fromarray(image), input_boxes=[input_boxes], return_tensors="pt"
#     ).to(DEVICE)
#     with torch.no_grad():
#         outputs = SAM_MODEL(**inputs)
#     mask = SAM_PROCESSOR.image_processor.post_process_masks(
#         outputs.pred_masks.cpu(),
#         inputs["original_sizes"].cpu(),
#         inputs["reshaped_input_sizes"].cpu(),
#     )[0][0][0].numpy()
#     mask = mask[np.newaxis, ...]
#     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
#     result = annotate_image_with_box_prompt_result(
#         image=image,
#         detections=detections,
#         x_max=x_max,
#         x_min=x_min,
#         y_max=y_max,
#         y_min=y_min,
#     )
#     t2 = time.time()
#     print(f"timecost: {t2 - t1}")
#     return result


def box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    return (
        efficientvit_sam_box_inference(image, x_min, y_min, x_max, y_max),
        efficient_sam_box_inference(image, x_min, y_min, x_max, y_max),
        # sam_box_inference(image, x_min, y_min, x_max, y_max),
    )


# def clear(_: np.ndarray) -> Tuple[None, None, None]:
#     return None, None, None


def clear(_: np.ndarray) -> Tuple[None, None]:
    return None, None


box_input_image = gr.Image()
x_min_number = gr.Number(label="x_min")
y_min_number = gr.Number(label="y_min")
x_max_number = gr.Number(label="x_max")
y_max_number = gr.Number(label="y_max")
box_inputs = [box_input_image, x_min_number, y_min_number, x_max_number, y_max_number]

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        box_input_image.render()
        efficientvit_sam_box_output_image = gr.Image(label="EfficientVit-SAM")
        efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
        # sam_box_output_image = gr.Image(label="SAM")
    with gr.Row():
        x_min_number.render()
        y_min_number.render()
        x_max_number.render()
        y_max_number.render()
        submit_box_inference_button = gr.Button(
            value="Submit", scale=1, variant="primary"
        )
    gr.Examples(
        # fn=box_inference,
        examples=BOX_EXAMPLES,
        inputs=box_inputs,
        outputs=[
            efficientvit_sam_box_output_image,
            efficient_sam_box_output_image,
            # sam_box_output_image,
        ],
    )
    submit_box_inference_button.click(
        efficientvit_sam_box_inference,
        inputs=box_inputs,
        outputs=efficientvit_sam_box_output_image,
    )
    submit_box_inference_button.click(
        efficient_sam_box_inference,
        inputs=box_inputs,
        outputs=efficient_sam_box_output_image,
    )
    # submit_box_inference_button.click(
    #     sam_box_inference, inputs=box_inputs, outputs=sam_box_output_image
    # )
    box_input_image.change(
        clear,
        inputs=box_input_image,
        outputs=[
            efficientvit_sam_box_output_image,
            efficient_sam_box_output_image,
            # sam_box_output_image,
        ],
    )

demo.launch(debug=False, show_error=True)
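
# A minimal sketch of calling the box-prompt pipeline headlessly, without the
# Gradio UI. The "corgi.jpg" path and the cv2 dependency are assumptions for
# illustration (e.g. a local copy of the Roboflow example image above), not
# part of this demo:
#
# import cv2
#
# image_bgr = cv2.imread("corgi.jpg")
# image_rgb = image_bgr[:, :, ::-1]  # the inference functions expect RGB
# efficientvit_result, efficient_sam_result = box_inference(
#     image_rgb, 801, 510, 1782, 993
# )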