add yolo v10 and modify pipeline
@@ -4,4 +4,4 @@ from .model import RTDETR
 from .predict import RTDETRPredictor
 from .val import RTDETRValidator
 
-__all__ = 'RTDETRPredictor', 'RTDETRValidator', 'RTDETR'
+__all__ = "RTDETRPredictor", "RTDETRValidator", "RTDETR"
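The `__all__` edit above is part of the commit's repo-wide switch from single to double quotes, but `__all__` also controls what a star-import exposes. A minimal sketch, assuming the `ultralytics.models.rtdetr` package shown above is installed:

```python
# Only the names listed in __all__ are pulled in by a star-import.
from ultralytics.models.rtdetr import *  # RTDETR, RTDETRPredictor, RTDETRValidator

model = RTDETR("rtdetr-l.pt")  # weights file assumed to be available locally
```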
[10 binary files changed in this commit; contents not shown in the diff view]
@@ -1,7 +1,12 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 """
-RT-DETR model interface
+Interface for Baidu's RT-DETR, a Vision Transformer-based real-time object detector. RT-DETR offers real-time
+performance and high accuracy, excelling in accelerated backends like CUDA with TensorRT. It features an efficient
+hybrid encoder and IoU-aware query selection for enhanced detection accuracy.
+
+For more information on RT-DETR, visit: https://arxiv.org/pdf/2304.08069.pdf
 """
+
 from ultralytics.engine.model import Model
 from ultralytics.nn.tasks import RTDETRDetectionModel
 
@@ -12,19 +17,38 @@ from .val import RTDETRValidator
 
 class RTDETR(Model):
     """
-    RTDETR model interface.
+    Interface for Baidu's RT-DETR model. This Vision Transformer-based object detector provides real-time performance
+    with high accuracy. It supports efficient hybrid encoding, IoU-aware query selection, and adaptable inference speed.
+
+    Attributes:
+        model (str): Path to the pre-trained model. Defaults to 'rtdetr-l.pt'.
     """
 
-    def __init__(self, model='rtdetr-l.pt') -> None:
-        if model and model.split('.')[-1] not in ('pt', 'yaml', 'yml'):
-            raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
-        super().__init__(model=model, task='detect')
+    def __init__(self, model="rtdetr-l.pt") -> None:
+        """
+        Initializes the RT-DETR model with the given pre-trained model file. Supports .pt and .yaml formats.
+
+        Args:
+            model (str): Path to the pre-trained model. Defaults to 'rtdetr-l.pt'.
+
+        Raises:
+            NotImplementedError: If the model file extension is not 'pt', 'yaml', or 'yml'.
+        """
+        super().__init__(model=model, task="detect")
 
     @property
-    def task_map(self):
+    def task_map(self) -> dict:
+        """
+        Returns a task map for RT-DETR, associating tasks with corresponding Ultralytics classes.
+
+        Returns:
+            dict: A dictionary mapping task names to Ultralytics task classes for the RT-DETR model.
+        """
         return {
-            'detect': {
-                'predictor': RTDETRPredictor,
-                'validator': RTDETRValidator,
-                'trainer': RTDETRTrainer,
-                'model': RTDETRDetectionModel}}
+            "detect": {
+                "predictor": RTDETRPredictor,
+                "validator": RTDETRValidator,
+                "trainer": RTDETRTrainer,
+                "model": RTDETRDetectionModel,
+            }
+        }
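A short usage sketch of the refactored interface, assuming the `rtdetr-l.pt` weights and a local test image (`bus.jpg` here is hypothetical) are available; `task_map` is what routes each task to its predictor, validator, trainer, and model classes:

```python
from ultralytics import RTDETR

# RTDETR accepts *.pt weights or a *.yaml/*.yml model definition.
model = RTDETR("rtdetr-l.pt")
print(model.task_map["detect"]["predictor"].__name__)  # RTDETRPredictor

results = model("bus.jpg")  # inference is dispatched through the task map
```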
@@ -10,7 +10,11 @@ from ultralytics.utils import ops
 
 class RTDETRPredictor(BasePredictor):
     """
-    A class extending the BasePredictor class for prediction based on an RT-DETR detection model.
+    RT-DETR (Real-Time Detection Transformer) Predictor extending the BasePredictor class for making predictions using
+    Baidu's RT-DETR model.
+
+    This class leverages the power of Vision Transformers to provide real-time object detection while maintaining
+    high accuracy. It supports key features like efficient hybrid encoding and IoU-aware query selection.
 
     Example:
         ```python
@@ -21,10 +25,30 @@ class RTDETRPredictor(BasePredictor):
         predictor = RTDETRPredictor(overrides=args)
         predictor.predict_cli()
         ```
 
+    Attributes:
+        imgsz (int): Image size for inference (must be square and scale-filled).
+        args (dict): Argument overrides for the predictor.
     """
 
     def postprocess(self, preds, img, orig_imgs):
-        """Postprocess predictions and returns a list of Results objects."""
+        """
+        Postprocess the raw predictions from the model to generate bounding boxes and confidence scores.
+
+        The method filters detections based on confidence and class if specified in `self.args`.
+
+        Args:
+            preds (list): List of [predictions, extra] from the model.
+            img (torch.Tensor): Processed input images.
+            orig_imgs (list or torch.Tensor): Original, unprocessed images.
+
+        Returns:
+            (list[Results]): A list of Results objects containing the post-processed bounding boxes, confidence scores,
+                and class labels.
+        """
+        if not isinstance(preds, (list, tuple)):  # list for PyTorch inference but list[0] Tensor for export inference
+            preds = [preds, None]
+
         nd = preds[0].shape[-1]
         bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
 
@@ -48,15 +72,15 @@ class RTDETRPredictor(BasePredictor):
         return results
 
     def pre_transform(self, im):
-        """Pre-transform input image before inference.
+        """
+        Pre-transforms the input images before feeding them into the model for inference. The input images are
+        letterboxed to ensure a square aspect ratio and scale-filled. The size must be square(640) and scaleFilled.
 
         Args:
-            im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
-
-        Notes: The size must be square(640) and scaleFilled.
+            im (list[np.ndarray] |torch.Tensor): Input images of shape (N,3,h,w) for tensor, [(h,w,3) x N] for list.
 
         Returns:
-            (list): A list of transformed imgs.
+            (list): List of pre-transformed images ready for model inference.
         """
         letterbox = LetterBox(self.imgsz, auto=False, scaleFill=True)
         return [letterbox(image=x) for x in im]
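To make the decode step in `postprocess` concrete, here is a standalone sketch with dummy tensors; the 300 queries, the `(4 + nc)` channel split, and the normalized `xywh` layout follow the code above, while the hard-coded 0.25 threshold stands in for `self.args.conf`:

```python
import torch

nc = 80  # number of classes (assumption for the dummy tensor)
preds = torch.rand(1, 300, 4 + nc)  # stand-in for preds[0]: (batch, queries, 4 + nc)

# Split each query into a normalized xywh box and per-class scores.
bboxes, scores = preds.split((4, nc), dim=-1)
score, cls = scores[0].max(-1)  # best class and its confidence per query
keep = score > 0.25  # role played by self.args.conf in the real predictor
print(bboxes[0][keep].shape, cls[keep].shape)  # e.g. torch.Size([k, 4]) torch.Size([k])
```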
@@ -7,16 +7,17 @@ import torch
 from ultralytics.models.yolo.detect import DetectionTrainer
 from ultralytics.nn.tasks import RTDETRDetectionModel
 from ultralytics.utils import RANK, colorstr
 
 from .val import RTDETRDataset, RTDETRValidator
 
 
 class RTDETRTrainer(DetectionTrainer):
     """
-    A class extending the DetectionTrainer class for training based on an RT-DETR detection model.
+    Trainer class for the RT-DETR model developed by Baidu for real-time object detection. Extends the DetectionTrainer
+    class for YOLO to adapt to the specific features and architecture of RT-DETR. This model leverages Vision
+    Transformers and has capabilities like IoU-aware query selection and adaptable inference speed.
 
     Notes:
-        - F.grid_sample used in rt-detr does not support the `deterministic=True` argument.
+        - F.grid_sample used in RT-DETR does not support the `deterministic=True` argument.
         - AMP training can lead to NaN outputs and may produce errors during bipartite graph matching.
 
     Example:
@@ -30,43 +31,71 @@ class RTDETRTrainer(DetectionTrainer):
     """
 
     def get_model(self, cfg=None, weights=None, verbose=True):
-        """Return a YOLO detection model."""
-        model = RTDETRDetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
+        """
+        Initialize and return an RT-DETR model for object detection tasks.
+
+        Args:
+            cfg (dict, optional): Model configuration. Defaults to None.
+            weights (str, optional): Path to pre-trained model weights. Defaults to None.
+            verbose (bool): Verbose logging if True. Defaults to True.
+
+        Returns:
+            (RTDETRDetectionModel): Initialized model.
+        """
+        model = RTDETRDetectionModel(cfg, nc=self.data["nc"], verbose=verbose and RANK == -1)
         if weights:
             model.load(weights)
         return model
 
-    def build_dataset(self, img_path, mode='val', batch=None):
-        """Build RTDETR Dataset
+    def build_dataset(self, img_path, mode="val", batch=None):
+        """
+        Build and return an RT-DETR dataset for training or validation.
 
         Args:
            img_path (str): Path to the folder containing images.
-            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
-            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
+            mode (str): Dataset mode, either 'train' or 'val'.
+            batch (int, optional): Batch size for rectangle training. Defaults to None.
+
+        Returns:
+            (RTDETRDataset): Dataset object for the specific mode.
         """
         return RTDETRDataset(
             img_path=img_path,
             imgsz=self.args.imgsz,
             batch_size=batch,
-            augment=mode == 'train',  # no augmentation
+            augment=mode == "train",
             hyp=self.args,
-            rect=False,  # no rect
+            rect=False,
             cache=self.args.cache or None,
-            prefix=colorstr(f'{mode}: '),
-            data=self.data)
+            prefix=colorstr(f"{mode}: "),
+            data=self.data,
+        )
 
     def get_validator(self):
-        """Returns a DetectionValidator for RTDETR model validation."""
-        self.loss_names = 'giou_loss', 'cls_loss', 'l1_loss'
+        """
+        Returns a DetectionValidator suitable for RT-DETR model validation.
+
+        Returns:
+            (RTDETRValidator): Validator object for model validation.
+        """
+        self.loss_names = "giou_loss", "cls_loss", "l1_loss"
        return RTDETRValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
 
     def preprocess_batch(self, batch):
-        """Preprocesses a batch of images by scaling and converting to float."""
+        """
+        Preprocess a batch of images. Scales and converts the images to float format.
+
+        Args:
+            batch (dict): Dictionary containing a batch of images, bboxes, and labels.
+
+        Returns:
+            (dict): Preprocessed batch.
+        """
         batch = super().preprocess_batch(batch)
-        bs = len(batch['img'])
-        batch_idx = batch['batch_idx']
+        bs = len(batch["img"])
+        batch_idx = batch["batch_idx"]
         gt_bbox, gt_class = [], []
         for i in range(bs):
-            gt_bbox.append(batch['bboxes'][batch_idx == i].to(batch_idx.device))
-            gt_class.append(batch['cls'][batch_idx == i].to(device=batch_idx.device, dtype=torch.long))
+            gt_bbox.append(batch["bboxes"][batch_idx == i].to(batch_idx.device))
+            gt_class.append(batch["cls"][batch_idx == i].to(device=batch_idx.device, dtype=torch.long))
         return batch
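For orientation, a training sketch in the spirit of the class docstring's own example; `coco8.yaml` is the small demo dataset bundled with Ultralytics, and a single epoch is only a smoke test:

```python
from ultralytics.models.rtdetr.train import RTDETRTrainer

args = dict(model="rtdetr-l.yaml", data="coco8.yaml", imgsz=640, epochs=1)
trainer = RTDETRTrainer(overrides=args)
trainer.train()  # builds the RTDETRDataset and RTDETRValidator shown in this diff
```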
@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-from pathlib import Path
-
 import torch
 
 from ultralytics.data import YOLODataset
@@ -9,16 +7,22 @@ from ultralytics.data.augment import Compose, Format, v8_transforms
 from ultralytics.models.yolo.detect import DetectionValidator
 from ultralytics.utils import colorstr, ops
 
-__all__ = 'RTDETRValidator',  # tuple or list
+__all__ = ("RTDETRValidator",)  # tuple or list
 
 
 # TODO: Temporarily RT-DETR does not need padding.
 class RTDETRDataset(YOLODataset):
+    """
+    Real-Time DEtection and TRacking (RT-DETR) dataset class extending the base YOLODataset class.
+
+    This specialized dataset class is designed for use with the RT-DETR object detection model and is optimized for
+    real-time detection and tracking tasks.
+    """
 
     def __init__(self, *args, data=None, **kwargs):
-        super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
+        """Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
+        super().__init__(*args, data=data, **kwargs)
 
-    # NOTE: add stretch version load_image for rtdetr mosaic
+    # NOTE: add stretch version load_image for RTDETR mosaic
     def load_image(self, i, rect_mode=False):
         """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
         return super().load_image(i=i, rect_mode=rect_mode)
@@ -33,19 +37,26 @@ class RTDETRDataset(YOLODataset):
             # transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)])
             transforms = Compose([])
         transforms.append(
-            Format(bbox_format='xywh',
-                   normalize=True,
-                   return_mask=self.use_segments,
-                   return_keypoint=self.use_keypoints,
-                   batch_idx=True,
-                   mask_ratio=hyp.mask_ratio,
-                   mask_overlap=hyp.overlap_mask))
+            Format(
+                bbox_format="xywh",
+                normalize=True,
+                return_mask=self.use_segments,
+                return_keypoint=self.use_keypoints,
+                batch_idx=True,
+                mask_ratio=hyp.mask_ratio,
+                mask_overlap=hyp.overlap_mask,
+            )
+        )
         return transforms
 
 
 class RTDETRValidator(DetectionValidator):
     """
-    A class extending the DetectionValidator class for validation based on an RT-DETR detection model.
+    RTDETRValidator extends the DetectionValidator class to provide validation capabilities specifically tailored for
+    the RT-DETR (Real-Time DETR) object detection model.
+
+    The class allows building of an RTDETR-specific dataset for validation, applies Non-maximum suppression for
+    post-processing, and updates evaluation metrics accordingly.
 
     Example:
         ```python
@@ -55,9 +66,12 @@ class RTDETRValidator(DetectionValidator):
         validator = RTDETRValidator(args=args)
         validator()
         ```
+
+    Note:
+        For further details on the attributes and methods, refer to the parent DetectionValidator class.
     """
 
-    def build_dataset(self, img_path, mode='val', batch=None):
+    def build_dataset(self, img_path, mode="val", batch=None):
         """
         Build an RTDETR Dataset.
 
@@ -74,11 +88,15 @@ class RTDETRValidator(DetectionValidator):
             hyp=self.args,
             rect=False,  # no rect
             cache=self.args.cache or None,
-            prefix=colorstr(f'{mode}: '),
-            data=self.data)
+            prefix=colorstr(f"{mode}: "),
+            data=self.data,
+        )
 
     def postprocess(self, preds):
         """Apply Non-maximum suppression to prediction outputs."""
+        if not isinstance(preds, (list, tuple)):  # list for PyTorch inference but list[0] Tensor for export inference
+            preds = [preds, None]
+
         bs, _, nd = preds[0].shape
         bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
         bboxes *= self.args.imgsz
@@ -86,56 +104,32 @@ class RTDETRValidator(DetectionValidator):
         for i, bbox in enumerate(bboxes):  # (300, 4)
             bbox = ops.xywh2xyxy(bbox)
             score, cls = scores[i].max(-1)  # (300, )
-            # Do not need threshold for evaluation as only got 300 boxes here.
+            # Do not need threshold for evaluation as only got 300 boxes here
             # idx = score > self.args.conf
             pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1)  # filter
-            # sort by confidence to correctly get internal metrics.
+            # Sort by confidence to correctly get internal metrics
             pred = pred[score.argsort(descending=True)]
             outputs[i] = pred  # [idx]
 
         return outputs
 
-    def update_metrics(self, preds, batch):
-        """Metrics."""
-        for si, pred in enumerate(preds):
-            idx = batch['batch_idx'] == si
-            cls = batch['cls'][idx]
-            bbox = batch['bboxes'][idx]
-            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
-            shape = batch['ori_shape'][si]
-            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
-            self.seen += 1
-
-            if npr == 0:
-                if nl:
-                    self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
-                    if self.args.plots:
-                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
-                continue
-
-            # Predictions
-            if self.args.single_cls:
-                pred[:, 5] = 0
-            predn = pred.clone()
-            predn[..., [0, 2]] *= shape[1] / self.args.imgsz  # native-space pred
-            predn[..., [1, 3]] *= shape[0] / self.args.imgsz  # native-space pred
-
-            # Evaluate
-            if nl:
-                tbox = ops.xywh2xyxy(bbox)  # target boxes
-                tbox[..., [0, 2]] *= shape[1]  # native-space pred
-                tbox[..., [1, 3]] *= shape[0]  # native-space pred
-                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                # NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32 type.
-                correct_bboxes = self._process_batch(predn.float(), labelsn)
-                # TODO: maybe remove these `self.` arguments as they already are member variable
-                if self.args.plots:
-                    self.confusion_matrix.process_batch(predn, labelsn)
-            self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)
-
-            # Save
-            if self.args.save_json:
-                self.pred_to_json(predn, batch['im_file'][si])
-            if self.args.save_txt:
-                file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
-                self.save_one_txt(predn, self.args.save_conf, shape, file)
+    def _prepare_batch(self, si, batch):
+        """Prepares a batch for training or inference by applying transformations."""
+        idx = batch["batch_idx"] == si
+        cls = batch["cls"][idx].squeeze(-1)
+        bbox = batch["bboxes"][idx]
+        ori_shape = batch["ori_shape"][si]
+        imgsz = batch["img"].shape[2:]
+        ratio_pad = batch["ratio_pad"][si]
+        if len(cls):
+            bbox = ops.xywh2xyxy(bbox)  # target boxes
+            bbox[..., [0, 2]] *= ori_shape[1]  # native-space pred
+            bbox[..., [1, 3]] *= ori_shape[0]  # native-space pred
+        return dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
+
+    def _prepare_pred(self, pred, pbatch):
+        """Prepares and returns a batch with transformed bounding boxes and class labels."""
+        predn = pred.clone()
+        predn[..., [0, 2]] *= pbatch["ori_shape"][1] / self.args.imgsz  # native-space pred
+        predn[..., [1, 3]] *= pbatch["ori_shape"][0] / self.args.imgsz  # native-space pred
+        return predn.float()
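A matching validation sketch, mirroring the Example in the class docstring above; it assumes `rtdetr-l.pt` and a dataset YAML are available locally. The refactor replaces the monolithic `update_metrics` with `_prepare_batch`/`_prepare_pred` hooks, which rescale boxes by `ori_shape / imgsz` because RT-DETR evaluates on square, scale-filled inputs:

```python
from ultralytics.models.rtdetr import RTDETRValidator

args = dict(model="rtdetr-l.pt", data="coco8.yaml")
validator = RTDETRValidator(args=args)
validator()  # runs postprocess, then _prepare_batch/_prepare_pred per image
```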