add yolo v10 and modify pipeline

2025-03-28 13:19:54 +08:00
parent 183299c06b
commit 798c596acc
471 changed files with 19109 additions and 7342 deletions
--- a/ultralytics/data/__init__.py
+++ b/ultralytics/data/__init__.py
@@ -4,5 +4,12 @@ from .base import BaseDataset
 from .build import build_dataloader, build_yolo_dataset, load_inference_source
 from .dataset import ClassificationDataset, SemanticDataset, YOLODataset

-__all__ = ('BaseDataset', 'ClassificationDataset', 'SemanticDataset', 'YOLODataset', 'build_yolo_dataset',
-           'build_dataloader', 'load_inference_source')
+__all__ = (
+    "BaseDataset",
+    "ClassificationDataset",
+    "SemanticDataset",
+    "YOLODataset",
+    "build_yolo_dataset",
+    "build_dataloader",
+    "load_inference_source",
+)
--- a/ultralytics/data/__pycache__/__init__.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/__init__.cpython-312.pyc
--- a/ultralytics/data/__pycache__/__init__.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/__init__.cpython-39.pyc
--- a/ultralytics/data/__pycache__/augment.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/augment.cpython-312.pyc
--- a/ultralytics/data/__pycache__/augment.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/augment.cpython-39.pyc
--- a/ultralytics/data/__pycache__/base.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/base.cpython-312.pyc
--- a/ultralytics/data/__pycache__/base.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/base.cpython-39.pyc
--- a/ultralytics/data/__pycache__/build.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/build.cpython-312.pyc
--- a/ultralytics/data/__pycache__/build.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/build.cpython-39.pyc
--- a/ultralytics/data/__pycache__/converter.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/converter.cpython-312.pyc
--- a/ultralytics/data/__pycache__/converter.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/converter.cpython-39.pyc
--- a/ultralytics/data/__pycache__/dataset.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/dataset.cpython-312.pyc
--- a/ultralytics/data/__pycache__/dataset.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/dataset.cpython-39.pyc
--- a/ultralytics/data/__pycache__/loaders.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/loaders.cpython-312.pyc
--- a/ultralytics/data/__pycache__/loaders.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/loaders.cpython-39.pyc
--- a/ultralytics/data/__pycache__/utils.cpython-312.pyc
+++ b/ultralytics/data/__pycache__/utils.cpython-312.pyc
--- a/ultralytics/data/__pycache__/utils.cpython-39.pyc
+++ b/ultralytics/data/__pycache__/utils.cpython-39.pyc
--- a/ultralytics/data/annotator.py
+++ b/ultralytics/data/annotator.py
@@ -5,7 +5,7 @@ from pathlib import Path
 from ultralytics import SAM, YOLO


-def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='', output_dir=None):
+def auto_annotate(data, det_model="yolov8x.pt", sam_model="sam_b.pt", device="", output_dir=None):
    """
    Automatically annotates images using a YOLO object detection model and a SAM segmentation model.

@@ -29,7 +29,7 @@ def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='',

    data = Path(data)
    if not output_dir:
-        output_dir = data.parent / f'{data.stem}_auto_annotate_labels'
+        output_dir = data.parent / f"{data.stem}_auto_annotate_labels"
    Path(output_dir).mkdir(exist_ok=True, parents=True)

    det_results = det_model(data, stream=True, device=device)
@@ -41,10 +41,10 @@ def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='',
            sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
            segments = sam_results[0].masks.xyn  # noqa

-            with open(f'{str(Path(output_dir) / Path(result.path).stem)}.txt', 'w') as f:
+            with open(f"{Path(output_dir) / Path(result.path).stem}.txt", "w") as f:
                for i in range(len(segments)):
                    s = segments[i]
                    if len(s) == 0:
                        continue
                    segment = map(str, segments[i].reshape(-1).tolist())
-                    f.write(f'{class_ids[i]} ' + ' '.join(segment) + '\n')
+                    f.write(f"{class_ids[i]} " + " ".join(segment) + "\n")
--- a/ultralytics/data/augment.py
+++ b/ultralytics/data/augment.py
--- a/ultralytics/data/base.py
+++ b/ultralytics/data/base.py
@@ -15,7 +15,6 @@ import psutil
 from torch.utils.data import Dataset

 from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
-
 from .utils import HELP_URL, IMG_FORMATS


@@ -47,20 +46,23 @@ class BaseDataset(Dataset):
        transforms (callable): Image transformation function.
    """

-    def __init__(self,
-                 img_path,
-                 imgsz=640,
-                 cache=False,
-                 augment=True,
-                 hyp=DEFAULT_CFG,
-                 prefix='',
-                 rect=False,
-                 batch_size=16,
-                 stride=32,
-                 pad=0.5,
-                 single_cls=False,
-                 classes=None,
-                 fraction=1.0):
+    def __init__(
+        self,
+        img_path,
+        imgsz=640,
+        cache=False,
+        augment=True,
+        hyp=DEFAULT_CFG,
+        prefix="",
+        rect=False,
+        batch_size=16,
+        stride=32,
+        pad=0.5,
+        single_cls=False,
+        classes=None,
+        fraction=1.0,
+    ):
+        """Initialize BaseDataset with given configuration and options."""
        super().__init__()
        self.img_path = img_path
        self.imgsz = imgsz
@@ -84,11 +86,11 @@ class BaseDataset(Dataset):
        self.buffer = []  # buffer size = batch size
        self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0

-        # Cache stuff
-        if cache == 'ram' and not self.check_cache_ram():
+        # Cache images
+        if cache == "ram" and not self.check_cache_ram():
            cache = False
        self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
-        self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
+        self.npy_files = [Path(f).with_suffix(".npy") for f in self.im_files]
        if cache:
            self.cache_images(cache)

@@ -102,54 +104,62 @@ class BaseDataset(Dataset):
            for p in img_path if isinstance(img_path, list) else [img_path]:
                p = Path(p)  # os-agnostic
                if p.is_dir():  # dir
-                    f += glob.glob(str(p / '**' / '*.*'), recursive=True)
+                    f += glob.glob(str(p / "**" / "*.*"), recursive=True)
                    # F = list(p.rglob('*.*'))  # pathlib
                elif p.is_file():  # file
                    with open(p) as t:
                        t = t.read().strip().splitlines()
                        parent = str(p.parent) + os.sep
-                        f += [x.replace('./', parent) if x.startswith('./') else x for x in t]  # local to global path
+                        f += [x.replace("./", parent) if x.startswith("./") else x for x in t]  # local to global path
                        # F += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
                else:
-                    raise FileNotFoundError(f'{self.prefix}{p} does not exist')
-            im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
+                    raise FileNotFoundError(f"{self.prefix}{p} does not exist")
+            im_files = sorted(x.replace("/", os.sep) for x in f if x.split(".")[-1].lower() in IMG_FORMATS)
            # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
-            assert im_files, f'{self.prefix}No images found in {img_path}'
+            assert im_files, f"{self.prefix}No images found in {img_path}"
        except Exception as e:
-            raise FileNotFoundError(f'{self.prefix}Error loading data from {img_path}\n{HELP_URL}') from e
+            raise FileNotFoundError(f"{self.prefix}Error loading data from {img_path}\n{HELP_URL}") from e
        if self.fraction < 1:
-            im_files = im_files[:round(len(im_files) * self.fraction)]
+            # im_files = im_files[: round(len(im_files) * self.fraction)]
+            num_elements_to_select = round(len(im_files) * self.fraction)
+            im_files = random.sample(im_files, num_elements_to_select)
        return im_files

    def update_labels(self, include_class: Optional[list]):
-        """include_class, filter labels to include only these classes (optional)."""
+        """Update labels to include only these classes (optional)."""
        include_class_array = np.array(include_class).reshape(1, -1)
        for i in range(len(self.labels)):
            if include_class is not None:
-                cls = self.labels[i]['cls']
-                bboxes = self.labels[i]['bboxes']
-                segments = self.labels[i]['segments']
-                keypoints = self.labels[i]['keypoints']
+                cls = self.labels[i]["cls"]
+                bboxes = self.labels[i]["bboxes"]
+                segments = self.labels[i]["segments"]
+                keypoints = self.labels[i]["keypoints"]
                j = (cls == include_class_array).any(1)
-                self.labels[i]['cls'] = cls[j]
-                self.labels[i]['bboxes'] = bboxes[j]
+                self.labels[i]["cls"] = cls[j]
+                self.labels[i]["bboxes"] = bboxes[j]
                if segments:
-                    self.labels[i]['segments'] = [segments[si] for si, idx in enumerate(j) if idx]
+                    self.labels[i]["segments"] = [segments[si] for si, idx in enumerate(j) if idx]
                if keypoints is not None:
-                    self.labels[i]['keypoints'] = keypoints[j]
+                    self.labels[i]["keypoints"] = keypoints[j]
            if self.single_cls:
-                self.labels[i]['cls'][:, 0] = 0
+                self.labels[i]["cls"][:, 0] = 0

    def load_image(self, i, rect_mode=True):
        """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
        if im is None:  # not cached in RAM
            if fn.exists():  # load npy
-                im = np.load(fn)
+                try:
+                    im = np.load(fn)
+                except Exception as e:
+                    LOGGER.warning(f"{self.prefix}WARNING ⚠️ Removing corrupt *.npy image file {fn} due to: {e}")
+                    Path(fn).unlink(missing_ok=True)
+                    im = cv2.imread(f)  # BGR
            else:  # read image
                im = cv2.imread(f)  # BGR
-                if im is None:
-                    raise FileNotFoundError(f'Image Not Found {f}')
+            if im is None:
+                raise FileNotFoundError(f"Image Not Found {f}")
+
            h0, w0 = im.shape[:2]  # orig hw
            if rect_mode:  # resize long side to imgsz while maintaining aspect ratio
                r = self.imgsz / max(h0, w0)  # ratio
@@ -174,17 +184,17 @@ class BaseDataset(Dataset):
    def cache_images(self, cache):
        """Cache images to memory or disk."""
        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
-        fcn = self.cache_images_to_disk if cache == 'disk' else self.load_image
+        fcn = self.cache_images_to_disk if cache == "disk" else self.load_image
        with ThreadPool(NUM_THREADS) as pool:
            results = pool.imap(fcn, range(self.ni))
            pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
            for i, x in pbar:
-                if cache == 'disk':
+                if cache == "disk":
                    b += self.npy_files[i].stat().st_size
                else:  # 'ram'
                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
                    b += self.ims[i].nbytes
-                pbar.desc = f'{self.prefix}Caching images ({b / gb:.1f}GB {cache})'
+                pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {cache})"
            pbar.close()

    def cache_images_to_disk(self, i):
@@ -200,15 +210,17 @@ class BaseDataset(Dataset):
        for _ in range(n):
            im = cv2.imread(random.choice(self.im_files))  # sample image
            ratio = self.imgsz / max(im.shape[0], im.shape[1])  # max(h, w)  # ratio
-            b += im.nbytes * ratio ** 2
+            b += im.nbytes * ratio**2
        mem_required = b * self.ni / n * (1 + safety_margin)  # GB required to cache dataset into RAM
        mem = psutil.virtual_memory()
        cache = mem_required < mem.available  # to cache or not to cache, that is the question
        if not cache:
-            LOGGER.info(f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
-                        f'with {int(safety_margin * 100)}% safety margin but only '
-                        f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
-                        f"{'caching images ✅' if cache else 'not caching images ⚠️'}")
+            LOGGER.info(
+                f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
+                f'with {int(safety_margin * 100)}% safety margin but only '
+                f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
+                f"{'caching images ✅' if cache else 'not caching images ⚠️'}"
+            )
        return cache

    def set_rectangle(self):
@@ -216,7 +228,7 @@ class BaseDataset(Dataset):
        bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int)  # batch index
        nb = bi[-1] + 1  # number of batches

-        s = np.array([x.pop('shape') for x in self.labels])  # hw
+        s = np.array([x.pop("shape") for x in self.labels])  # hw
        ar = s[:, 0] / s[:, 1]  # aspect ratio
        irect = ar.argsort()
        self.im_files = [self.im_files[i] for i in irect]
@@ -243,12 +255,14 @@ class BaseDataset(Dataset):
    def get_image_and_label(self, index):
        """Get and return label information from the dataset."""
        label = deepcopy(self.labels[index])  # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
-        label.pop('shape', None)  # shape is for rect, remove it
-        label['img'], label['ori_shape'], label['resized_shape'] = self.load_image(index)
-        label['ratio_pad'] = (label['resized_shape'][0] / label['ori_shape'][0],
-                              label['resized_shape'][1] / label['ori_shape'][1])  # for evaluation
+        label.pop("shape", None)  # shape is for rect, remove it
+        label["img"], label["ori_shape"], label["resized_shape"] = self.load_image(index)
+        label["ratio_pad"] = (
+            label["resized_shape"][0] / label["ori_shape"][0],
+            label["resized_shape"][1] / label["ori_shape"][1],
+        )  # for evaluation
        if self.rect:
-            label['rect_shape'] = self.batch_shapes[self.batch[index]]
+            label["rect_shape"] = self.batch_shapes[self.batch[index]]
        return self.update_labels_info(label)

    def __len__(self):
@@ -256,24 +270,32 @@ class BaseDataset(Dataset):
        return len(self.labels)

    def update_labels_info(self, label):
-        """custom your label format here."""
+        """Custom your label format here."""
        return label

    def build_transforms(self, hyp=None):
-        """Users can custom augmentations here
-        like:
+        """
+        Users can customize augmentations here.
+
+        Example:
+            ```python
            if self.augment:
                # Training transforms
                return Compose([])
            else:
                # Val transforms
                return Compose([])
+            ```
        """
        raise NotImplementedError

    def get_labels(self):
-        """Users can custom their own format here.
-        Make sure your output is a list with each element like below:
+        """
+        Users can customize their own format here.
+
+        Note:
+            Ensure output is a dictionary with the following keys:
+            ```python
            dict(
                im_file=im_file,
                shape=shape,  # format: (height, width)
@@ -284,5 +306,6 @@ class BaseDataset(Dataset):
                normalized=True, # or False
                bbox_format="xyxy",  # or xywh, ltwh
            )
+            ```
        """
        raise NotImplementedError
--- a/ultralytics/data/build.py
+++ b/ultralytics/data/build.py
@@ -9,23 +9,34 @@ import torch
 from PIL import Image
 from torch.utils.data import dataloader, distributed

-from ultralytics.data.loaders import (LOADERS, LoadImages, LoadPilAndNumpy, LoadScreenshots, LoadStreams, LoadTensor,
-                                      SourceTypes, autocast_list)
+from ultralytics.data.loaders import (
+    LOADERS,
+    LoadImagesAndVideos,
+    LoadPilAndNumpy,
+    LoadScreenshots,
+    LoadStreams,
+    LoadTensor,
+    SourceTypes,
+    autocast_list,
+)
 from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
 from ultralytics.utils import RANK, colorstr
 from ultralytics.utils.checks import check_file
-
 from .dataset import YOLODataset
 from .utils import PIN_MEMORY


 class InfiniteDataLoader(dataloader.DataLoader):
-    """Dataloader that reuses workers. Uses same syntax as vanilla DataLoader."""
+    """
+    Dataloader that reuses workers.
+
+    Uses same syntax as vanilla DataLoader.
+    """

    def __init__(self, *args, **kwargs):
        """Dataloader that infinitely recycles workers, inherits from DataLoader."""
        super().__init__(*args, **kwargs)
-        object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler))
+        object.__setattr__(self, "batch_sampler", _RepeatSampler(self.batch_sampler))
        self.iterator = super().__iter__()

    def __len__(self):
@@ -38,7 +49,9 @@ class InfiniteDataLoader(dataloader.DataLoader):
            yield next(self.iterator)

    def reset(self):
-        """Reset iterator.
+        """
+        Reset iterator.
+
        This is useful when we want to modify settings of dataset while training.
        """
        self.iterator = self._get_iterator()
@@ -64,49 +77,51 @@ class _RepeatSampler:

 def seed_worker(worker_id):  # noqa
    """Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
-    worker_seed = torch.initial_seed() % 2 ** 32
+    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


-def build_yolo_dataset(cfg, img_path, batch, data, mode='train', rect=False, stride=32):
-    """Build YOLO Dataset"""
+def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32):
+    """Build YOLO Dataset."""
    return YOLODataset(
        img_path=img_path,
        imgsz=cfg.imgsz,
        batch_size=batch,
-        augment=mode == 'train',  # augmentation
+        augment=mode == "train",  # augmentation
        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
        rect=cfg.rect or rect,  # rectangular batches
        cache=cfg.cache or None,
        single_cls=cfg.single_cls or False,
        stride=int(stride),
-        pad=0.0 if mode == 'train' else 0.5,
-        prefix=colorstr(f'{mode}: '),
-        use_segments=cfg.task == 'segment',
-        use_keypoints=cfg.task == 'pose',
+        pad=0.0 if mode == "train" else 0.5,
+        prefix=colorstr(f"{mode}: "),
+        task=cfg.task,
        classes=cfg.classes,
        data=data,
-        fraction=cfg.fraction if mode == 'train' else 1.0)
+        fraction=cfg.fraction if mode == "train" else 1.0,
+    )


 def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
    """Return an InfiniteDataLoader or DataLoader for training or validation set."""
    batch = min(batch, len(dataset))
    nd = torch.cuda.device_count()  # number of CUDA devices
-    nw = min([os.cpu_count() // max(nd, 1), batch if batch > 1 else 0, workers])  # number of workers
+    nw = min([os.cpu_count() // max(nd, 1), workers])  # number of workers
    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
    generator = torch.Generator()
    generator.manual_seed(6148914691236517205 + RANK)
-    return InfiniteDataLoader(dataset=dataset,
-                              batch_size=batch,
-                              shuffle=shuffle and sampler is None,
-                              num_workers=nw,
-                              sampler=sampler,
-                              pin_memory=PIN_MEMORY,
-                              collate_fn=getattr(dataset, 'collate_fn', None),
-                              worker_init_fn=seed_worker,
-                              generator=generator)
+    return InfiniteDataLoader(
+        dataset=dataset,
+        batch_size=batch,
+        shuffle=shuffle and sampler is None,
+        num_workers=nw,
+        sampler=sampler,
+        pin_memory=PIN_MEMORY,
+        collate_fn=getattr(dataset, "collate_fn", None),
+        worker_init_fn=seed_worker,
+        generator=generator,
+    )


 def check_source(source):
@@ -114,10 +129,10 @@ def check_source(source):
    webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
    if isinstance(source, (str, int, Path)):  # int for local usb camera
        source = str(source)
-        is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
-        is_url = source.lower().startswith(('https://', 'http://', 'rtsp://', 'rtmp://'))
-        webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
-        screenshot = source.lower() == 'screen'
+        is_file = Path(source).suffix[1:] in (IMG_FORMATS | VID_FORMATS)
+        is_url = source.lower().startswith(("https://", "http://", "rtsp://", "rtmp://", "tcp://"))
+        webcam = source.isnumeric() or source.endswith(".streams") or (is_url and not is_file)
+        screenshot = source.lower() == "screen"
        if is_url and is_file:
            source = check_file(source)  # download
    elif isinstance(source, LOADERS):
@@ -130,42 +145,42 @@ def check_source(source):
    elif isinstance(source, torch.Tensor):
        tensor = True
    else:
-        raise TypeError('Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict')
+        raise TypeError("Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict")

    return source, webcam, screenshot, from_img, in_memory, tensor


-def load_inference_source(source=None, imgsz=640, vid_stride=1, stream_buffer=False):
+def load_inference_source(source=None, batch=1, vid_stride=1, buffer=False):
    """
    Loads an inference source for object detection and applies necessary transformations.

    Args:
        source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
-        imgsz (int, optional): The size of the image for inference. Default is 640.
+        batch (int, optional): Batch size for dataloaders. Default is 1.
        vid_stride (int, optional): The frame interval for video sources. Default is 1.
-        stream_buffer (bool, optional): Determined whether stream frames will be buffered. Default is False.
+        buffer (bool, optional): Determined whether stream frames will be buffered. Default is False.

    Returns:
        dataset (Dataset): A dataset object for the specified input source.
    """
-    source, webcam, screenshot, from_img, in_memory, tensor = check_source(source)
-    source_type = source.source_type if in_memory else SourceTypes(webcam, screenshot, from_img, tensor)
+    source, stream, screenshot, from_img, in_memory, tensor = check_source(source)
+    source_type = source.source_type if in_memory else SourceTypes(stream, screenshot, from_img, tensor)

    # Dataloader
    if tensor:
        dataset = LoadTensor(source)
    elif in_memory:
        dataset = source
-    elif webcam:
-        dataset = LoadStreams(source, imgsz=imgsz, vid_stride=vid_stride, stream_buffer=stream_buffer)
+    elif stream:
+        dataset = LoadStreams(source, vid_stride=vid_stride, buffer=buffer)
    elif screenshot:
-        dataset = LoadScreenshots(source, imgsz=imgsz)
+        dataset = LoadScreenshots(source)
    elif from_img:
-        dataset = LoadPilAndNumpy(source, imgsz=imgsz)
+        dataset = LoadPilAndNumpy(source)
    else:
-        dataset = LoadImages(source, imgsz=imgsz, vid_stride=vid_stride)
+        dataset = LoadImagesAndVideos(source, batch=batch, vid_stride=vid_stride)

    # Attach source types to the dataset
-    setattr(dataset, 'source_type', source_type)
+    setattr(dataset, "source_type", source_type)

    return dataset
--- a/ultralytics/data/converter.py
+++ b/ultralytics/data/converter.py
@@ -1,31 +1,120 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

 import json
-import shutil
 from collections import defaultdict
 from pathlib import Path

 import cv2
 import numpy as np

-from ultralytics.utils import TQDM
+from ultralytics.utils import LOGGER, TQDM
+from ultralytics.utils.files import increment_path


 def coco91_to_coco80_class():
-    """Converts 91-index COCO class IDs to 80-index COCO class IDs.
+    """
+    Converts 91-index COCO class IDs to 80-index COCO class IDs.

    Returns:
        (list): A list of 91 class IDs where the index represents the 80-index class ID and the value is the
            corresponding 91-index class ID.
    """
    return [
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, None, 24, 25, None,
-        None, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, None, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-        51, 52, 53, 54, 55, 56, 57, 58, 59, None, 60, None, None, 61, None, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
-        None, 73, 74, 75, 76, 77, 78, 79, None]
+        0,
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        None,
+        11,
+        12,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        None,
+        24,
+        25,
+        None,
+        None,
+        26,
+        27,
+        28,
+        29,
+        30,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        None,
+        40,
+        41,
+        42,
+        43,
+        44,
+        45,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        None,
+        60,
+        None,
+        None,
+        61,
+        None,
+        62,
+        63,
+        64,
+        65,
+        66,
+        67,
+        68,
+        69,
+        70,
+        71,
+        72,
+        None,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        None,
+    ]


-def coco80_to_coco91_class():  #
+def coco80_to_coco91_class():
    """
    Converts 80-index (val2014) to 91-index (paper).
    For details see https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/.
@@ -41,16 +130,102 @@ def coco80_to_coco91_class():  #
        ```
    """
    return [
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
-        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-        64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        11,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        24,
+        25,
+        27,
+        28,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        40,
+        41,
+        42,
+        43,
+        44,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        60,
+        61,
+        62,
+        63,
+        64,
+        65,
+        67,
+        70,
+        72,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        80,
+        81,
+        82,
+        84,
+        85,
+        86,
+        87,
+        88,
+        89,
+        90,
+    ]


-def convert_coco(labels_dir='../coco/annotations/', use_segments=False, use_keypoints=False, cls91to80=True):
-    """Converts COCO dataset annotations to a format suitable for training YOLOv5 models.
+def convert_coco(
+    labels_dir="../coco/annotations/",
+    save_dir="coco_converted/",
+    use_segments=False,
+    use_keypoints=False,
+    cls91to80=True,
+):
+    """
+    Converts COCO dataset annotations to a YOLO annotation format  suitable for training YOLO models.

    Args:
        labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
+        save_dir (str, optional): Path to directory to save results to.
        use_segments (bool, optional): Whether to include segmentation masks in the output.
        use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
        cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
@ -67,78 +242,79 @@ def convert_coco(labels_dir='../coco/annotations/', use_segments=False, use_keyp
    """

    # Create dataset directory
-    save_dir = Path('yolo_labels')
-    if save_dir.exists():
-        shutil.rmtree(save_dir)  # delete dir
-    for p in save_dir / 'labels', save_dir / 'images':
+    save_dir = increment_path(save_dir)  # increment if save directory already exists
+    for p in save_dir / "labels", save_dir / "images":
        p.mkdir(parents=True, exist_ok=True)  # make dir

    # Convert classes
    coco80 = coco91_to_coco80_class()

    # Import json
-    for json_file in sorted(Path(labels_dir).resolve().glob('*.json')):
-        fn = Path(save_dir) / 'labels' / json_file.stem.replace('instances_', '')  # folder name
+    for json_file in sorted(Path(labels_dir).resolve().glob("*.json")):
+        fn = Path(save_dir) / "labels" / json_file.stem.replace("instances_", "")  # folder name
        fn.mkdir(parents=True, exist_ok=True)
        with open(json_file) as f:
            data = json.load(f)

        # Create image dict
-        images = {f'{x["id"]:d}': x for x in data['images']}
+        images = {f'{x["id"]:d}': x for x in data["images"]}
        # Create image-annotations dict
        imgToAnns = defaultdict(list)
-        for ann in data['annotations']:
-            imgToAnns[ann['image_id']].append(ann)
+        for ann in data["annotations"]:
+            imgToAnns[ann["image_id"]].append(ann)

        # Write labels file
-        for img_id, anns in TQDM(imgToAnns.items(), desc=f'Annotations {json_file}'):
-            img = images[f'{img_id:d}']
-            h, w, f = img['height'], img['width'], img['file_name']
+        for img_id, anns in TQDM(imgToAnns.items(), desc=f"Annotations {json_file}"):
+            img = images[f"{img_id:d}"]
+            h, w, f = img["height"], img["width"], img["file_name"]

            bboxes = []
            segments = []
            keypoints = []
            for ann in anns:
-                if ann['iscrowd']:
+                if ann["iscrowd"]:
                    continue
                # The COCO box format is [top left x, top left y, width, height]
-                box = np.array(ann['bbox'], dtype=np.float64)
+                box = np.array(ann["bbox"], dtype=np.float64)
                box[:2] += box[2:] / 2  # xy top-left corner to center
                box[[0, 2]] /= w  # normalize x
                box[[1, 3]] /= h  # normalize y
                 if box[2] <= 0 or box[3] <= 0:  # if w <= 0 or h <= 0
                    continue

-                cls = coco80[ann['category_id'] - 1] if cls91to80 else ann['category_id'] - 1  # class
+                cls = coco80[ann["category_id"] - 1] if cls91to80 else ann["category_id"] - 1  # class
                box = [cls] + box.tolist()
                if box not in bboxes:
                    bboxes.append(box)
-                if use_segments and ann.get('segmentation') is not None:
-                    if len(ann['segmentation']) == 0:
-                        segments.append([])
-                        continue
-                    elif len(ann['segmentation']) > 1:
-                        s = merge_multi_segment(ann['segmentation'])
-                        s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
-                    else:
-                        s = [j for i in ann['segmentation'] for j in i]  # all segments concatenated
-                        s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
-                    s = [cls] + s
-                    if s not in segments:
+                    if use_segments and ann.get("segmentation") is not None:
+                        if len(ann["segmentation"]) == 0:
+                            segments.append([])
+                            continue
+                        elif len(ann["segmentation"]) > 1:
+                            s = merge_multi_segment(ann["segmentation"])
+                            s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
+                        else:
+                            s = [j for i in ann["segmentation"] for j in i]  # all segments concatenated
+                            s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
+                        s = [cls] + s
                        segments.append(s)
-                if use_keypoints and ann.get('keypoints') is not None:
-                    keypoints.append(box + (np.array(ann['keypoints']).reshape(-1, 3) /
-                                            np.array([w, h, 1])).reshape(-1).tolist())
+                    if use_keypoints and ann.get("keypoints") is not None:
+                        keypoints.append(
+                            box + (np.array(ann["keypoints"]).reshape(-1, 3) / np.array([w, h, 1])).reshape(-1).tolist()
+                        )

            # Write
-            with open((fn / f).with_suffix('.txt'), 'a') as file:
+            with open((fn / f).with_suffix(".txt"), "a") as file:
                for i in range(len(bboxes)):
                    if use_keypoints:
-                        line = *(keypoints[i]),  # cls, box, keypoints
+                        line = (*(keypoints[i]),)  # cls, box, keypoints
                    else:
-                        line = *(segments[i]
-                                 if use_segments and len(segments[i]) > 0 else bboxes[i]),  # cls, box or segments
-                    file.write(('%g ' * len(line)).rstrip() % line + '\n')
+                        line = (
+                            *(segments[i] if use_segments and len(segments[i]) > 0 else bboxes[i]),
+                        )  # cls, box or segments
+                    file.write(("%g " * len(line)).rstrip() % line + "\n")
+
+    LOGGER.info(f"COCO data converted successfully.\nResults saved to {save_dir.resolve()}")


 def convert_dota_to_yolo_obb(dota_root_path: str):
@ -160,48 +336,52 @@ def convert_dota_to_yolo_obb(dota_root_path: str):

    Notes:
        The directory structure assumed for the DOTA dataset:
-            - DOTA
-                - images
-                    - train
-                    - val
-                - labels
-                    - train_original
-                    - val_original

-        After the function execution, the new labels will be saved in:
            - DOTA
-                - labels
-                    - train
-                    - val
+                ├─ images
+                │   ├─ train
+                │   └─ val
+                └─ labels
+                    ├─ train_original
+                    └─ val_original
+
+        After execution, the function will organize the labels into:
+
+            - DOTA
+                └─ labels
+                    ├─ train
+                    └─ val
    """
    dota_root_path = Path(dota_root_path)

    # Class names to indices mapping
    class_mapping = {
-        'plane': 0,
-        'ship': 1,
-        'storage-tank': 2,
-        'baseball-diamond': 3,
-        'tennis-court': 4,
-        'basketball-court': 5,
-        'ground-track-field': 6,
-        'harbor': 7,
-        'bridge': 8,
-        'large-vehicle': 9,
-        'small-vehicle': 10,
-        'helicopter': 11,
-        'roundabout': 12,
-        'soccer ball-field': 13,
-        'swimming-pool': 14,
-        'container-crane': 15,
-        'airport': 16,
-        'helipad': 17}
+        "plane": 0,
+        "ship": 1,
+        "storage-tank": 2,
+        "baseball-diamond": 3,
+        "tennis-court": 4,
+        "basketball-court": 5,
+        "ground-track-field": 6,
+        "harbor": 7,
+        "bridge": 8,
+        "large-vehicle": 9,
+        "small-vehicle": 10,
+        "helicopter": 11,
+        "roundabout": 12,
+        "soccer-ball-field": 13,
+        "swimming-pool": 14,
+        "container-crane": 15,
+        "airport": 16,
+        "helipad": 17,
+    }

    def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
-        orig_label_path = orig_label_dir / f'{image_name}.txt'
-        save_path = save_dir / f'{image_name}.txt'
+        """Converts a single image's DOTA annotation to YOLO OBB format and saves it to a specified directory."""
+        orig_label_path = orig_label_dir / f"{image_name}.txt"
+        save_path = save_dir / f"{image_name}.txt"

-        with orig_label_path.open('r') as f, save_path.open('w') as g:
+        with orig_label_path.open("r") as f, save_path.open("w") as g:
            lines = f.readlines()
            for line in lines:
                parts = line.strip().split()
@ -211,20 +391,21 @@ def convert_dota_to_yolo_obb(dota_root_path: str):
                class_idx = class_mapping[class_name]
                coords = [float(p) for p in parts[:8]]
                normalized_coords = [
-                    coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)]
-                formatted_coords = ['{:.6g}'.format(coord) for coord in normalized_coords]
+                    coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)
+                ]
+                formatted_coords = ["{:.6g}".format(coord) for coord in normalized_coords]
                g.write(f"{class_idx} {' '.join(formatted_coords)}\n")

-    for phase in ['train', 'val']:
-        image_dir = dota_root_path / 'images' / phase
-        orig_label_dir = dota_root_path / 'labels' / f'{phase}_original'
-        save_dir = dota_root_path / 'labels' / phase
+    for phase in ["train", "val"]:
+        image_dir = dota_root_path / "images" / phase
+        orig_label_dir = dota_root_path / "labels" / f"{phase}_original"
+        save_dir = dota_root_path / "labels" / phase

        save_dir.mkdir(parents=True, exist_ok=True)

        image_paths = list(image_dir.iterdir())
-        for image_path in TQDM(image_paths, desc=f'Processing {phase} images'):
-            if image_path.suffix != '.png':
+        for image_path in TQDM(image_paths, desc=f"Processing {phase} images"):
+            if image_path.suffix != ".png":
                continue
            image_name_without_ext = image_path.stem
            img = cv2.imread(str(image_path))
@ -237,8 +418,8 @@ def min_index(arr1, arr2):
    Find a pair of indexes with the shortest distance between two arrays of 2D points.

    Args:
-        arr1 (np.array): A NumPy array of shape (N, 2) representing N 2D points.
-        arr2 (np.array): A NumPy array of shape (M, 2) representing M 2D points.
+        arr1 (np.ndarray): A NumPy array of shape (N, 2) representing N 2D points.
+        arr2 (np.ndarray): A NumPy array of shape (M, 2) representing M 2D points.

    Returns:
        (tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
@ -263,31 +444,30 @@ def merge_multi_segment(segments):
    segments = [np.array(i).reshape(-1, 2) for i in segments]
    idx_list = [[] for _ in range(len(segments))]

-    # record the indexes with min distance between each segment
+    # Record the indexes with min distance between each segment
    for i in range(1, len(segments)):
        idx1, idx2 = min_index(segments[i - 1], segments[i])
        idx_list[i - 1].append(idx1)
        idx_list[i].append(idx2)

-    # use two round to connect all the segments
+    # Use two rounds to connect all the segments
    for k in range(2):
-        # forward connection
+        # Forward connection
        if k == 0:
            for i, idx in enumerate(idx_list):
-                # middle segments have two indexes
-                # reverse the index of middle segments
+                # Middle segments have two indexes, reverse the index of middle segments
                if len(idx) == 2 and idx[0] > idx[1]:
                    idx = idx[::-1]
                    segments[i] = segments[i][::-1, :]

                segments[i] = np.roll(segments[i], -idx[0], axis=0)
                segments[i] = np.concatenate([segments[i], segments[i][:1]])
-                # deal with the first segment and the last one
+                # Deal with the first segment and the last one
                if i in [0, len(idx_list) - 1]:
                    s.append(segments[i])
                else:
                    idx = [0, idx[1] - idx[0]]
-                    s.append(segments[i][idx[0]:idx[1] + 1])
+                    s.append(segments[i][idx[0] : idx[1] + 1])

        else:
            for i in range(len(idx_list) - 1, -1, -1):
@ -296,3 +476,67 @@ def merge_multi_segment(segments):
                    nidx = abs(idx[1] - idx[0])
                    s.append(segments[i][nidx:])
    return s
+
+
+def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
+    """
+    Converts an existing YOLO object detection dataset (bounding boxes) into YOLO segmentation labels.
+
+    If the first label of the dataset already contains segments, nothing is generated and the function returns.
+    Otherwise the boxes of every image are passed to a SAM model as prompts and the `masks.xyn` output of the first
+    result is written out as segment labels, one text file per image.
+
+    Args:
+        im_dir (str | Path): Path to image directory to convert.
+        save_dir (str | Path, optional): Path to save the generated labels. Labels are saved into `labels-segment`
+            in the same directory level of `im_dir` if save_dir is None. Default: None.
+        sam_model (str): Segmentation model used to generate the segments. Default: "sam_b.pt".
+
+    Notes:
+        The input directory structure assumed for dataset:
+
+            - im_dir
+                ├─ 001.jpg
+                ├─ ..
+                └─ NNN.jpg
+            - labels
+                ├─ 001.txt
+                ├─ ..
+                └─ NNN.txt
+
+        Label files are opened in append mode, so running the conversion twice into the same `save_dir` adds the
+        generated lines to the files that are already there.
+    """
+    from ultralytics.data import YOLODataset
+    from ultralytics.utils.ops import xywh2xyxy
+    from ultralytics.utils import LOGGER
+    from ultralytics import SAM
+    from tqdm import tqdm
+
+    # NOTE: add placeholder to pass class index check
+    dataset = YOLODataset(im_dir, data=dict(names=list(range(1000))))
+    if len(dataset.labels[0]["segments"]) > 0:  # if it's segment data
+        LOGGER.info("Segmentation labels detected, no need to generate new ones!")
+        return
+
+    LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
+    sam_model = SAM(sam_model)
+    for label in tqdm(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
+        h, w = label["shape"]
+        boxes = label["bboxes"]
+        if len(boxes) == 0:  # skip empty labels
+            continue
+        im_file = label["im_file"]
+        im = cv2.imread(im_file)
+        if im is None:  # cv2.imread returns None instead of raising when the file cannot be read
+            LOGGER.warning(f"WARNING ⚠️ Image {im_file} could not be read, skipping segment generation for it.")
+            continue
+        # Scale columns 0/2 by width and 1/3 by height (xywh boxes); this modifies the label's boxes in place
+        boxes[:, [0, 2]] *= w
+        boxes[:, [1, 3]] *= h
+        sam_results = sam_model(im, bboxes=xywh2xyxy(boxes), verbose=False, save=False)
+        label["segments"] = sam_results[0].masks.xyn
+
+    save_dir = Path(save_dir) if save_dir else Path(im_dir).parent / "labels-segment"
+    save_dir.mkdir(parents=True, exist_ok=True)
+    for label in dataset.labels:
+        texts = []
+        lb_name = Path(label["im_file"]).with_suffix(".txt").name
+        txt_file = save_dir / lb_name
+        cls = label["cls"]
+        for i, s in enumerate(label["segments"]):
+            if len(s) == 0:  # an empty segment would produce a line holding only the class index
+                continue
+            line = (int(cls[i]), *s.reshape(-1))
+            texts.append(("%g " * len(line)).rstrip() % line)
+        if texts:
+            with open(txt_file, "a") as f:
+                f.writelines(text + "\n" for text in texts)
+    LOGGER.info(f"Generated segment labels saved in {save_dir}")
--- a/ultralytics/data/dataset.py
+++ b/ultralytics/data/dataset.py
@ -8,15 +8,16 @@ import cv2
 import numpy as np
 import torch
 import torchvision
+from PIL import Image

 from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr, is_dir_writeable
-
-from .augment import Compose, Format, Instances, LetterBox, classify_albumentations, classify_transforms, v8_transforms
+from ultralytics.utils.ops import resample_segments
+from .augment import Compose, Format, Instances, LetterBox, classify_augmentations, classify_transforms, v8_transforms
 from .base import BaseDataset
 from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image, verify_image_label

 # Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
-DATASET_CACHE_VERSION = '1.0.3'
+DATASET_CACHE_VERSION = "1.0.3"


 class YOLODataset(BaseDataset):
@ -25,40 +26,54 @@ class YOLODataset(BaseDataset):

    Args:
        data (dict, optional): A dataset YAML dictionary. Defaults to None.
-        use_segments (bool, optional): If True, segmentation masks are used as labels. Defaults to False.
-        use_keypoints (bool, optional): If True, keypoints are used as labels. Defaults to False.
+        task (str): Explicit argument selecting the current task. Defaults to 'detect'.

    Returns:
        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
    """

-    def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs):
-        self.use_segments = use_segments
-        self.use_keypoints = use_keypoints
+    def __init__(self, *args, data=None, task="detect", **kwargs):
+        """Initializes the YOLODataset with optional configurations for segments and keypoints."""
+        self.use_segments = task == "segment"
+        self.use_keypoints = task == "pose"
+        self.use_obb = task == "obb"
        self.data = data
-        assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.'
+        assert not (self.use_segments and self.use_keypoints), "Can not use both segments and keypoints."
        super().__init__(*args, **kwargs)

-    def cache_labels(self, path=Path('./labels.cache')):
-        """Cache dataset labels, check images and read shapes.
+    def cache_labels(self, path=Path("./labels.cache")):
+        """
+        Cache dataset labels, check images and read shapes.
+
        Args:
-            path (Path): path where to save the cache file (default: Path('./labels.cache')).
+            path (Path): Path where to save the cache file. Default is Path('./labels.cache').
+
        Returns:
            (dict): labels.
        """
-        x = {'labels': []}
+        x = {"labels": []}
        nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
-        desc = f'{self.prefix}Scanning {path.parent / path.stem}...'
+        desc = f"{self.prefix}Scanning {path.parent / path.stem}..."
        total = len(self.im_files)
-        nkpt, ndim = self.data.get('kpt_shape', (0, 0))
+        nkpt, ndim = self.data.get("kpt_shape", (0, 0))
        if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
-            raise ValueError("'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
-                             "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'")
+            raise ValueError(
+                "'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
+                "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'"
+            )
        with ThreadPool(NUM_THREADS) as pool:
-            results = pool.imap(func=verify_image_label,
-                                iterable=zip(self.im_files, self.label_files, repeat(self.prefix),
-                                             repeat(self.use_keypoints), repeat(len(self.data['names'])), repeat(nkpt),
-                                             repeat(ndim)))
+            results = pool.imap(
+                func=verify_image_label,
+                iterable=zip(
+                    self.im_files,
+                    self.label_files,
+                    repeat(self.prefix),
+                    repeat(self.use_keypoints),
+                    repeat(len(self.data["names"])),
+                    repeat(nkpt),
+                    repeat(ndim),
+                ),
+            )
            pbar = TQDM(results, desc=desc, total=total)
            for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
                nm += nm_f
@ -66,7 +81,7 @@ class YOLODataset(BaseDataset):
                ne += ne_f
                nc += nc_f
                if im_file:
-                    x['labels'].append(
+                    x["labels"].append(
                        dict(
                            im_file=im_file,
                            shape=shape,
@ -75,60 +90,63 @@ class YOLODataset(BaseDataset):
                            segments=segments,
                            keypoints=keypoint,
                            normalized=True,
-                            bbox_format='xywh'))
+                            bbox_format="xywh",
+                        )
+                    )
                if msg:
                    msgs.append(msg)
-                pbar.desc = f'{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt'
+                pbar.desc = f"{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt"
            pbar.close()

        if msgs:
-            LOGGER.info('\n'.join(msgs))
+            LOGGER.info("\n".join(msgs))
        if nf == 0:
-            LOGGER.warning(f'{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}')
-        x['hash'] = get_hash(self.label_files + self.im_files)
-        x['results'] = nf, nm, ne, nc, len(self.im_files)
-        x['msgs'] = msgs  # warnings
+            LOGGER.warning(f"{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}")
+        x["hash"] = get_hash(self.label_files + self.im_files)
+        x["results"] = nf, nm, ne, nc, len(self.im_files)
+        x["msgs"] = msgs  # warnings
        save_dataset_cache_file(self.prefix, path, x)
        return x

    def get_labels(self):
        """Returns dictionary of labels for YOLO training."""
        self.label_files = img2label_paths(self.im_files)
-        cache_path = Path(self.label_files[0]).parent.with_suffix('.cache')
+        cache_path = Path(self.label_files[0]).parent.with_suffix(".cache")
        try:
            cache, exists = load_dataset_cache_file(cache_path), True  # attempt to load a *.cache file
-            assert cache['version'] == DATASET_CACHE_VERSION  # matches current version
-            assert cache['hash'] == get_hash(self.label_files + self.im_files)  # identical hash
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash(self.label_files + self.im_files)  # identical hash
        except (FileNotFoundError, AssertionError, AttributeError):
            cache, exists = self.cache_labels(cache_path), False  # run cache ops

        # Display cache
-        nf, nm, ne, nc, n = cache.pop('results')  # found, missing, empty, corrupt, total
+        nf, nm, ne, nc, n = cache.pop("results")  # found, missing, empty, corrupt, total
        if exists and LOCAL_RANK in (-1, 0):
-            d = f'Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt'
+            d = f"Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt"
            TQDM(None, desc=self.prefix + d, total=n, initial=n)  # display results
-            if cache['msgs']:
-                LOGGER.info('\n'.join(cache['msgs']))  # display warnings
+            if cache["msgs"]:
+                LOGGER.info("\n".join(cache["msgs"]))  # display warnings

        # Read cache
-        [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
-        labels = cache['labels']
+        [cache.pop(k) for k in ("hash", "version", "msgs")]  # remove items
+        labels = cache["labels"]
        if not labels:
-            LOGGER.warning(f'WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}')
-        self.im_files = [lb['im_file'] for lb in labels]  # update im_files
+            LOGGER.warning(f"WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}")
+        self.im_files = [lb["im_file"] for lb in labels]  # update im_files

        # Check if the dataset is all boxes or all segments
-        lengths = ((len(lb['cls']), len(lb['bboxes']), len(lb['segments'])) for lb in labels)
+        lengths = ((len(lb["cls"]), len(lb["bboxes"]), len(lb["segments"])) for lb in labels)
        len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
        if len_segments and len_boxes != len_segments:
            LOGGER.warning(
-                f'WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, '
-                f'len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. '
-                'To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset.')
+                f"WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, "
+                f"len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. "
+                "To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset."
+            )
            for lb in labels:
-                lb['segments'] = []
+                lb["segments"] = []
        if len_cls == 0:
-            LOGGER.warning(f'WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}')
+            LOGGER.warning(f"WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}")
        return labels

    def build_transforms(self, hyp=None):
@ -140,13 +158,18 @@ class YOLODataset(BaseDataset):
        else:
            transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
        transforms.append(
-            Format(bbox_format='xywh',
-                   normalize=True,
-                   return_mask=self.use_segments,
-                   return_keypoint=self.use_keypoints,
-                   batch_idx=True,
-                   mask_ratio=hyp.mask_ratio,
-                   mask_overlap=hyp.overlap_mask))
+            Format(
+                bbox_format="xywh",
+                normalize=True,
+                return_mask=self.use_segments,
+                return_keypoint=self.use_keypoints,
+                return_obb=self.use_obb,
+                batch_idx=True,
+                mask_ratio=hyp.mask_ratio,
+                mask_overlap=hyp.overlap_mask,
+                bgr=hyp.bgr if self.augment else 0.0,  # only affect training.
+            )
+        )
        return transforms

    def close_mosaic(self, hyp):
@ -157,15 +180,28 @@ class YOLODataset(BaseDataset):
        self.transforms = self.build_transforms(hyp)

    def update_labels_info(self, label):
-        """custom your label format here."""
-        # NOTE: cls is not with bboxes now, classification and semantic segmentation need an independent cls label
-        # we can make it also support classification and semantic segmentation by add or remove some dict keys there.
-        bboxes = label.pop('bboxes')
-        segments = label.pop('segments')
-        keypoints = label.pop('keypoints', None)
-        bbox_format = label.pop('bbox_format')
-        normalized = label.pop('normalized')
-        label['instances'] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
+        """
+        Customize your label format here.
+
+        Note:
+            cls is not with bboxes now, classification and semantic segmentation need an independent cls label
+            Can also support classification and semantic segmentation by adding or removing dict keys there.
+        """
+        bboxes = label.pop("bboxes")
+        segments = label.pop("segments", [])
+        keypoints = label.pop("keypoints", None)
+        bbox_format = label.pop("bbox_format")
+        normalized = label.pop("normalized")
+
+        # NOTE: oriented boxes are resampled to 100 points instead of the 1000 used otherwise
+        segment_resamples = 100 if self.use_obb else 1000
+        if len(segments) > 0:
+            # list[np.array(1000, 2)] * num_samples
+            # (N, 1000, 2)
+            segments = np.stack(resample_segments(segments, n=segment_resamples), axis=0)
+        else:
+            segments = np.zeros((0, segment_resamples, 2), dtype=np.float32)
+        label["instances"] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
        return label

    @staticmethod
@ -176,65 +212,75 @@ class YOLODataset(BaseDataset):
        values = list(zip(*[list(b.values()) for b in batch]))
        for i, k in enumerate(keys):
            value = values[i]
-            if k == 'img':
+            if k == "img":
                value = torch.stack(value, 0)
-            if k in ['masks', 'keypoints', 'bboxes', 'cls']:
+            if k in ["masks", "keypoints", "bboxes", "cls", "segments", "obb"]:
                value = torch.cat(value, 0)
            new_batch[k] = value
-        new_batch['batch_idx'] = list(new_batch['batch_idx'])
-        for i in range(len(new_batch['batch_idx'])):
-            new_batch['batch_idx'][i] += i  # add target image index for build_targets()
-        new_batch['batch_idx'] = torch.cat(new_batch['batch_idx'], 0)
+        new_batch["batch_idx"] = list(new_batch["batch_idx"])
+        for i in range(len(new_batch["batch_idx"])):
+            new_batch["batch_idx"][i] += i  # add target image index for build_targets()
+        new_batch["batch_idx"] = torch.cat(new_batch["batch_idx"], 0)
        return new_batch


 # Classification dataloaders -------------------------------------------------------------------------------------------
 class ClassificationDataset(torchvision.datasets.ImageFolder):
    """
-    YOLO Classification Dataset.
+    Extends torchvision ImageFolder to support YOLO classification tasks, offering functionalities like image
+    augmentation, caching, and verification. It's designed to efficiently handle large datasets for training deep
+    learning models, with optional image transformations and caching mechanisms to speed up training.

-    Args:
-        root (str): Dataset path.
+    This class allows for augmentations using both torchvision and Albumentations libraries, and supports caching images
+    in RAM or on disk to reduce IO overhead during training. Additionally, it implements a robust verification process
+    to ensure data integrity and consistency.

    Attributes:
-        cache_ram (bool): True if images should be cached in RAM, False otherwise.
-        cache_disk (bool): True if images should be cached on disk, False otherwise.
-        samples (list): List of samples containing file, index, npy, and im.
-        torch_transforms (callable): torchvision transforms applied to the dataset.
-        album_transforms (callable, optional): Albumentations transforms applied to the dataset if augment is True.
+        cache_ram (bool): Indicates if caching in RAM is enabled.
+        cache_disk (bool): Indicates if caching on disk is enabled.
+        samples (list): A list of tuples, each containing the path to an image, its class index, path to its .npy cache
+                        file (if caching on disk), and optionally the loaded image array (if caching in RAM).
+        torch_transforms (callable): PyTorch transforms to be applied to the images.
    """

-    def __init__(self, root, args, augment=False, cache=False, prefix=''):
+    def __init__(self, root, args, augment=False, prefix=""):
        """
        Initialize YOLO object with root, image size, augmentations, and cache settings.

        Args:
-            root (str): Dataset path.
-            args (Namespace): Argument parser containing dataset related settings.
-            augment (bool, optional): True if dataset should be augmented, False otherwise. Defaults to False.
-            cache (bool | str | optional): Cache setting, can be True, False, 'ram' or 'disk'. Defaults to False.
+            root (str): Path to the dataset directory where images are stored in a class-specific folder structure.
+            args (Namespace): Configuration containing dataset-related settings such as image size, augmentation
+                parameters, and cache settings. It includes attributes like `imgsz` (image size), `fraction` (fraction
+                of data to use), `scale`, `fliplr`, `flipud`, `cache` (disk or RAM caching for faster training),
+                `auto_augment`, `hsv_h`, `hsv_s`, `hsv_v`, and `crop_fraction`.
+            augment (bool, optional): Whether to apply augmentations to the dataset. Default is False.
+            prefix (str, optional): Prefix for logging and cache filenames, aiding in dataset identification and
+                debugging. Default is an empty string.
        """
        super().__init__(root=root)
        if augment and args.fraction < 1.0:  # reduce training fraction
-            self.samples = self.samples[:round(len(self.samples) * args.fraction)]
-        self.prefix = colorstr(f'{prefix}: ') if prefix else ''
-        self.cache_ram = cache is True or cache == 'ram'
-        self.cache_disk = cache == 'disk'
+            self.samples = self.samples[: round(len(self.samples) * args.fraction)]
+        self.prefix = colorstr(f"{prefix}: ") if prefix else ""
+        self.cache_ram = args.cache is True or args.cache == "ram"  # cache images into RAM
+        self.cache_disk = args.cache == "disk"  # cache images on hard drive as uncompressed *.npy files
        self.samples = self.verify_images()  # filter out bad images
-        self.samples = [list(x) + [Path(x[0]).with_suffix('.npy'), None] for x in self.samples]  # file, index, npy, im
-        self.torch_transforms = classify_transforms(args.imgsz)
-        self.album_transforms = classify_albumentations(
-            augment=augment,
-            size=args.imgsz,
-            scale=(1.0 - args.scale, 1.0),  # (0.08, 1.0)
-            hflip=args.fliplr,
-            vflip=args.flipud,
-            hsv_h=args.hsv_h,  # HSV-Hue augmentation (fraction)
-            hsv_s=args.hsv_s,  # HSV-Saturation augmentation (fraction)
-            hsv_v=args.hsv_v,  # HSV-Value augmentation (fraction)
-            mean=(0.0, 0.0, 0.0),  # IMAGENET_MEAN
-            std=(1.0, 1.0, 1.0),  # IMAGENET_STD
-            auto_aug=False) if augment else None
+        self.samples = [list(x) + [Path(x[0]).with_suffix(".npy"), None] for x in self.samples]  # file, index, npy, im
+        scale = (1.0 - args.scale, 1.0)  # (0.08, 1.0)
+        self.torch_transforms = (
+            classify_augmentations(
+                size=args.imgsz,
+                scale=scale,
+                hflip=args.fliplr,
+                vflip=args.flipud,
+                erasing=args.erasing,
+                auto_augment=args.auto_augment,
+                hsv_h=args.hsv_h,
+                hsv_s=args.hsv_s,
+                hsv_v=args.hsv_v,
+            )
+            if augment
+            else classify_transforms(size=args.imgsz, crop_fraction=args.crop_fraction)
+        )

    def __getitem__(self, i):
        """Returns subset of data and targets corresponding to given indices."""
@ -247,30 +293,30 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
            im = np.load(fn)
        else:  # read image
            im = cv2.imread(f)  # BGR
-        if self.album_transforms:
-            sample = self.album_transforms(image=cv2.cvtColor(im, cv2.COLOR_BGR2RGB))['image']
-        else:
-            sample = self.torch_transforms(im)
-        return {'img': sample, 'cls': j}
+        # Convert NumPy array to PIL image
+        im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
+        sample = self.torch_transforms(im)
+        return {"img": sample, "cls": j}

    def __len__(self) -> int:
+        """Return the total number of samples in the dataset."""
        return len(self.samples)

    def verify_images(self):
        """Verify all images in dataset."""
-        desc = f'{self.prefix}Scanning {self.root}...'
-        path = Path(self.root).with_suffix('.cache')  # *.cache file path
+        desc = f"{self.prefix}Scanning {self.root}..."
+        path = Path(self.root).with_suffix(".cache")  # *.cache file path

        with contextlib.suppress(FileNotFoundError, AssertionError, AttributeError):
            cache = load_dataset_cache_file(path)  # attempt to load a *.cache file
-            assert cache['version'] == DATASET_CACHE_VERSION  # matches current version
-            assert cache['hash'] == get_hash([x[0] for x in self.samples])  # identical hash
-            nf, nc, n, samples = cache.pop('results')  # found, missing, empty, corrupt, total
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash([x[0] for x in self.samples])  # identical hash
+            nf, nc, n, samples = cache.pop("results")  # found, missing, empty, corrupt, total
            if LOCAL_RANK in (-1, 0):
-                d = f'{desc} {nf} images, {nc} corrupt'
+                d = f"{desc} {nf} images, {nc} corrupt"
                TQDM(None, desc=d, total=n, initial=n)
-                if cache['msgs']:
-                    LOGGER.info('\n'.join(cache['msgs']))  # display warnings
+                if cache["msgs"]:
+                    LOGGER.info("\n".join(cache["msgs"]))  # display warnings
            return samples

        # Run scan if *.cache retrieval failed
@ -285,13 +331,13 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
                    msgs.append(msg)
                nf += nf_f
                nc += nc_f
-                pbar.desc = f'{desc} {nf} images, {nc} corrupt'
+                pbar.desc = f"{desc} {nf} images, {nc} corrupt"
            pbar.close()
        if msgs:
-            LOGGER.info('\n'.join(msgs))
-        x['hash'] = get_hash([x[0] for x in self.samples])
-        x['results'] = nf, nc, len(samples), samples
-        x['msgs'] = msgs  # warnings
+            LOGGER.info("\n".join(msgs))
+        x["hash"] = get_hash([x[0] for x in self.samples])
+        x["results"] = nf, nc, len(samples), samples
+        x["msgs"] = msgs  # warnings
        save_dataset_cache_file(self.prefix, path, x)
        return samples

@ -299,6 +345,7 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
 def load_dataset_cache_file(path):
    """Load an Ultralytics *.cache dictionary from path."""
    import gc
+
    gc.disable()  # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
    cache = np.load(str(path), allow_pickle=True).item()  # load dict
    gc.enable()
@ -307,19 +354,29 @@ def load_dataset_cache_file(path):

 def save_dataset_cache_file(prefix, path, x):
    """Save an Ultralytics dataset *.cache dictionary x to path."""
-    x['version'] = DATASET_CACHE_VERSION  # add cache version
+    x["version"] = DATASET_CACHE_VERSION  # add cache version
    if is_dir_writeable(path.parent):
        if path.exists():
            path.unlink()  # remove *.cache file if exists
        np.save(str(path), x)  # save cache for next time
-        path.with_suffix('.cache.npy').rename(path)  # remove .npy suffix
-        LOGGER.info(f'{prefix}New cache created: {path}')
+        path.with_suffix(".cache.npy").rename(path)  # remove .npy suffix
+        LOGGER.info(f"{prefix}New cache created: {path}")
    else:
-        LOGGER.warning(f'{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.')
+        LOGGER.warning(f"{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.")


 # TODO: support semantic segmentation
 class SemanticDataset(BaseDataset):
+    """
+    Semantic Segmentation Dataset.
+
+    This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
+    from the BaseDataset class.
+
+    Note:
+        This class is currently a placeholder and needs to be populated with methods and attributes for supporting
+        semantic segmentation tasks.
+    """

    def __init__(self):
        """Initialize a SemanticDataset object."""
--- a/ultralytics/data/explorer/init.py
+++ b/ultralytics/data/explorer/init.py
@ -0,0 +1,5 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from .utils import plot_query_result
+
+__all__ = ["plot_query_result"]
--- a/ultralytics/data/explorer/pycache/init.cpython-312.pyc
+++ b/ultralytics/data/explorer/pycache/init.cpython-312.pyc
--- a/ultralytics/data/explorer/pycache/init.cpython-39.pyc
+++ b/ultralytics/data/explorer/pycache/init.cpython-39.pyc
--- a/ultralytics/data/explorer/pycache/explorer.cpython-312.pyc
+++ b/ultralytics/data/explorer/pycache/explorer.cpython-312.pyc
--- a/ultralytics/data/explorer/pycache/explorer.cpython-39.pyc
+++ b/ultralytics/data/explorer/pycache/explorer.cpython-39.pyc
--- a/ultralytics/data/explorer/pycache/utils.cpython-312.pyc
+++ b/ultralytics/data/explorer/pycache/utils.cpython-312.pyc
--- a/ultralytics/data/explorer/pycache/utils.cpython-39.pyc
+++ b/ultralytics/data/explorer/pycache/utils.cpython-39.pyc
--- a/ultralytics/data/explorer/explorer.py
+++ b/ultralytics/data/explorer/explorer.py
@ -0,0 +1,472 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from io import BytesIO
+from pathlib import Path
+from typing import Any, List, Tuple, Union
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from matplotlib import pyplot as plt
+from pandas import DataFrame
+from tqdm import tqdm
+
+from ultralytics.data.augment import Format
+from ultralytics.data.dataset import YOLODataset
+from ultralytics.data.utils import check_det_dataset
+from ultralytics.models.yolo.model import YOLO
+from ultralytics.utils import LOGGER, IterableSimpleNamespace, checks, USER_CONFIG_DIR
+from .utils import get_sim_index_schema, get_table_schema, plot_query_result, prompt_sql_query, sanitize_batch
+
+
+class ExplorerDataset(YOLODataset):
+    """YOLODataset subclass used by Explorer that returns images without any resize operation."""
+
+    def __init__(self, *args, data: dict = None, **kwargs) -> None:
+        """Forward all positional and keyword arguments unchanged to the YOLODataset constructor."""
+        super().__init__(*args, data=data, **kwargs)
+
+    def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
+        """Return the image at dataset index 'i' at its stored size, together with its original and current (h, w)."""
+        cached, img_path, npy_path = self.ims[i], self.im_files[i], self.npy_files[i]
+        if cached is not None:  # already held in RAM
+            return self.ims[i], self.im_hw0[i], self.im_hw[i]
+
+        if npy_path.exists():  # prefer the *.npy copy when one exists
+            image = np.load(npy_path)
+        else:  # fall back to decoding the image file
+            image = cv2.imread(img_path)  # BGR
+            if image is None:
+                raise FileNotFoundError(f"Image Not Found {img_path}")
+        height, width = image.shape[:2]  # orig hw
+        return image, (height, width), image.shape[:2]
+
+    def build_transforms(self, hyp: IterableSimpleNamespace = None):
+        """Build the Format transform (xyxy boxes, no normalization) applied to samples; no resizing is added."""
+        format_options = dict(
+            bbox_format="xyxy",
+            normalize=False,
+            return_mask=self.use_segments,
+            return_keypoint=self.use_keypoints,
+            batch_idx=True,
+            mask_ratio=hyp.mask_ratio,
+            mask_overlap=hyp.overlap_mask,
+        )
+        return Format(**format_options)
+
+
+class Explorer:
+    """Builds a LanceDB table of image embeddings for a dataset and runs similarity and SQL queries against it."""
+
+    def __init__(
+        self,
+        data: Union[str, Path] = "coco128.yaml",
+        model: str = "yolov8n.pt",
+        uri: Union[str, Path] = USER_CONFIG_DIR / "explorer",
+    ) -> None:
+        """
+        Connect to the LanceDB database and load the embedding model.
+
+        Args:
+            data (str | Path): Dataset YAML used to build the embeddings table. Defaults to 'coco128.yaml'.
+            model (str): Model name or weights file passed to YOLO. Defaults to 'yolov8n.pt'.
+            uri (str | Path): Location handed to `lancedb.connect`. Defaults to USER_CONFIG_DIR / 'explorer'.
+        """
+        # Note duckdb==0.10.0 bug https://github.com/ultralytics/ultralytics/pull/8181
+        checks.check_requirements(["lancedb>=0.4.3", "duckdb<=0.9.2"])
+        import lancedb
+
+        self.connection = lancedb.connect(uri)
+        self.table_name = Path(data).name.lower() + "_" + model.lower()
+        self.sim_idx_base_name = (
+            f"{self.table_name}_sim_idx".lower()
+        )  # Use this name and append thres and top_k to reuse the table
+        self.model = YOLO(model)
+        self.data = data  # None
+        self.choice_set = None
+
+        self.table = None
+        self.sim_index = None  # set by similarity_index(); defined here so the attribute always exists
+        self.progress = 0  # fraction in [0, 1] of the embeddings table built so far
+
+    def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
+        """
+        Create LanceDB table containing the embeddings of the images in the dataset. The table will be reused if it
+        already exists. Pass force=True to overwrite the existing table.
+
+        `self.progress` is set to 1 whenever this method finishes with a usable table (reused or newly built), so
+        callers polling it from another thread can detect completion.
+
+        Args:
+            force (bool): Whether to overwrite the existing table or not. Defaults to False.
+            split (str): Split of the dataset to use. Defaults to 'train'.
+
+        Raises:
+            ValueError: If no dataset was provided or `split` is not a key of the dataset info.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            ```
+        """
+        if self.table is not None and not force:
+            LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
+            self.progress = 1  # table is ready; without this a progress poller would wait forever
+            return
+        if self.table_name in self.connection.table_names() and not force:
+            LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
+            self.table = self.connection.open_table(self.table_name)
+            self.progress = 1
+            return
+        if self.data is None:
+            raise ValueError("Data must be provided to create embeddings table")
+
+        data_info = check_det_dataset(self.data)
+        if split not in data_info:
+            raise ValueError(
+                f"Split {split} is not found in the dataset. Available keys in the dataset are {list(data_info.keys())}"
+            )
+
+        choice_set = data_info[split]
+        choice_set = choice_set if isinstance(choice_set, list) else [choice_set]
+        self.choice_set = choice_set
+        dataset = ExplorerDataset(img_path=choice_set, data=data_info, augment=False, cache=False, task=self.model.task)
+
+        # Create the table schema; the first sample is embedded only to discover the vector length
+        batch = dataset[0]
+        vector_size = self.model.embed(batch["im_file"], verbose=False)[0].shape[0]
+        table = self.connection.create_table(self.table_name, schema=get_table_schema(vector_size), mode="overwrite")
+        table.add(
+            self._yield_batches(
+                dataset,
+                data_info,
+                self.model,
+                exclude_keys=["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"],
+            )
+        )
+
+        self.table = table
+        self.progress = 1  # mark completion explicitly rather than relying on the last yielded batch
+
+    def _yield_batches(self, dataset: ExplorerDataset, data_info: dict, model: YOLO, exclude_keys: List[str]):
+        """Yield one single-row list per dataset sample, with excluded keys dropped and an embedding vector added."""
+        total = len(dataset)
+        for index in tqdm(range(total)):
+            self.progress = float(index + 1) / total  # fraction of samples handled so far
+            sample = dataset[index]
+            for key in exclude_keys:
+                sample.pop(key, None)
+            sample = sanitize_batch(sample, data_info)
+            embedding = model.embed(sample["im_file"], verbose=False)[0]
+            sample["vector"] = embedding.detach().tolist()
+            yield [sample]
+
+    def query(
+        self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
+    ) -> Any:  # pyarrow.Table
+        """
+        Search the embeddings table for the images closest to the given image(s).
+
+        A single string is wrapped into a list. When several images are given, their embeddings are averaged and
+        the mean vector is used for the search.
+
+        Args:
+            imgs (str or list): Path to the image or a list of paths to the images.
+            limit (int): Number of results to return.
+
+        Returns:
+            (pyarrow.Table): An arrow table containing the results. Supports converting to:
+                - pandas dataframe: `result.to_pandas()`
+                - dict of lists: `result.to_pydict()`
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
+            ```
+        """
+        if self.table is None:
+            raise ValueError("Table is not created. Please create the table first.")
+        if isinstance(imgs, str):
+            imgs = [imgs]
+        assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
+        embeds = self.model.embed(imgs)
+        if len(embeds) > 1:
+            # Several query images: search with the mean embedding
+            vector = torch.mean(torch.stack(embeds), 0).cpu().numpy()
+        else:
+            vector = embeds[0].cpu().numpy()
+        search = self.table.search(vector)
+        return search.limit(limit).to_arrow()
+
+    def sql_query(
+        self, query: str, return_type: str = "pandas"
+    ) -> Union[DataFrame, Any, None]:  # pandas.dataframe or pyarrow.Table
+        """
+        Run a SQL-Like query on the table through DuckDB.
+
+        Args:
+            query (str): SQL query to run. Must start with (upper-case) `SELECT` or `WHERE`; a bare `WHERE` clause is
+                prefixed with `SELECT * FROM 'table'`.
+            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.
+
+        Returns:
+            (pandas.DataFrame | pyarrow.Table): Query results, as a dataframe when return_type is 'pandas' and as an
+                arrow table when it is 'arrow'.
+
+        Raises:
+            ValueError: If the table has not been created or the query does not start with SELECT or WHERE.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
+            result = exp.sql_query(query)
+            ```
+        """
+        assert return_type in {
+            "pandas",
+            "arrow",
+        }, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
+        import duckdb
+
+        if self.table is None:
+            raise ValueError("Table is not created. Please create the table first.")
+
+        # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
+        # The local name `table` is what the query's 'table' refers to, so it must not be renamed or removed.
+        table = self.table.to_arrow()  # noqa NOTE: Don't comment this. This line is used by DuckDB
+        if not query.startswith("SELECT") and not query.startswith("WHERE"):
+            raise ValueError(
+                f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
+            )
+        if query.startswith("WHERE"):
+            query = f"SELECT * FROM 'table' {query}"
+        LOGGER.info(f"Running query: {query}")
+
+        rs = duckdb.sql(query)
+        if return_type == "arrow":
+            return rs.arrow()
+        elif return_type == "pandas":
+            return rs.df()
+
+    def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
+        """
+        Run a SQL-Like query on the table and plot the matching rows.
+
+        Args:
+            query (str): SQL query to run.
+            labels (bool): Whether to plot the labels or not.
+
+        Returns:
+            (PIL.Image): Image containing the plot, or None when the query returns no rows.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
+            result = exp.plot_sql_query(query)
+            ```
+        """
+        matches = self.sql_query(query, return_type="arrow")
+        if len(matches) == 0:
+            LOGGER.info("No results found.")
+            return None
+        canvas = plot_query_result(matches, plot_labels=labels)
+        return Image.fromarray(canvas)
+
+    def get_similar(
+        self,
+        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
+        idx: Union[int, List[int]] = None,
+        limit: int = 25,
+        return_type: str = "pandas",
+    ) -> Union[DataFrame, Any]:  # pandas.dataframe or pyarrow.Table
+        """
+        Find the table rows most similar to the given image(s) or table index(es).
+
+        Exactly one of `img` and `idx` must be provided.
+
+        Args:
+            img (str or list): Path to the image or a list of paths to the images.
+            idx (int or list): Index of the image in the table or a list of indexes.
+            limit (int): Number of results to return. Defaults to 25.
+            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.
+
+        Returns:
+            (pandas.DataFrame | pyarrow.Table): The results, in the form selected by `return_type`.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
+            ```
+        """
+        valid_types = {"pandas", "arrow"}
+        assert return_type in valid_types, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
+        sources = self._check_imgs_or_idxs(img, idx)
+        matches = self.query(sources, limit=limit)
+
+        if return_type == "arrow":
+            return matches
+        if return_type == "pandas":
+            return matches.to_pandas()
+
+    def plot_similar(
+        self,
+        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
+        idx: Union[int, List[int]] = None,
+        limit: int = 25,
+        labels: bool = True,
+    ) -> Image.Image:
+        """
+        Plot the images most similar to the given image(s) or table index(es).
+
+        Args:
+            img (str or list): Path to the image or a list of paths to the images.
+            idx (int or list): Index of the image in the table or a list of indexes.
+            limit (int): Number of results to return. Defaults to 25.
+            labels (bool): Whether to plot the labels or not.
+
+        Returns:
+            (PIL.Image): Image containing the plot, or None when nothing similar is found.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
+            ```
+        """
+        matches = self.get_similar(img, idx, limit, return_type="arrow")
+        if len(matches) == 0:
+            LOGGER.info("No results found.")
+            return None
+        canvas = plot_query_result(matches, plot_labels=labels)
+        return Image.fromarray(canvas)
+
+    def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> DataFrame:
+        """
+        Calculate the similarity index of all the images in the table. Here, the index will contain the data points that
+        are max_dist or closer to the image in the embedding space at a given index.
+
+        The result is stored in a LanceDB table whose name includes max_dist and top_k, and that table is reused on
+        later calls unless force=True.
+
+        Args:
+            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
+            top_k (float): Fraction (0.0 to 1.0) of the closest data points to consider when counting. Used to apply
+                           limit when running vector search. A falsy value (None or 0) searches the whole table.
+                           Defaults: None.
+            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.
+
+        Returns:
+            (pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image, with
+                                columns `idx`, `im_file`, `count` and `sim_im_files`.
+
+        Raises:
+            ValueError: If the table has not been created, top_k is outside [0.0, 1.0], or max_dist is negative.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            sim_idx = exp.similarity_index()
+            ```
+        """
+        if self.table is None:
+            raise ValueError("Table is not created. Please create the table first.")
+        sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
+        if sim_idx_table_name in self.connection.table_names() and not force:
+            LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
+            return self.connection.open_table(sim_idx_table_name).to_pandas()
+
+        if top_k and not (1.0 >= top_k >= 0.0):
+            raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
+        if max_dist < 0.0:
+            raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")
+
+        # Convert the fraction into an absolute row limit (at least 1) for the vector search
+        top_k = int(top_k * len(self.table)) if top_k else len(self.table)
+        top_k = max(top_k, 1)
+        features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
+        im_files = features["im_file"]
+        embeddings = features["vector"]
+
+        sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")
+
+        def _yield_sim_idx():
+            """Yield one single-row list per image with the files found within max_dist of it and their count."""
+            for i in tqdm(range(len(embeddings))):
+                sim_idx = self.table.search(embeddings[i]).limit(top_k).to_pandas().query(f"_distance <= {max_dist}")
+                yield [
+                    {
+                        "idx": i,
+                        "im_file": im_files[i],
+                        "count": len(sim_idx),
+                        "sim_im_files": sim_idx["im_file"].tolist(),
+                    }
+                ]
+
+        sim_table.add(_yield_sim_idx())
+        self.sim_index = sim_table
+        return sim_table.to_pandas()
+
+    def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image.Image:
+        """
+        Plot the similarity index of all the images in the table. Here, the index will contain the data points that are
+        max_dist or closer to the image in the embedding space at a given index.
+
+        The bar chart is drawn on its own figure, which is closed before returning, so repeated calls neither
+        accumulate bars on a shared figure nor leave figures open.
+
+        Args:
+            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
+            top_k (float): Fraction of closest data points to consider when counting. Used to apply limit when
+                running vector search. Defaults to None.
+            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.
+
+        Returns:
+            (PIL.Image): Image containing the plot.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+
+            similarity_idx_plot = exp.plot_similarity_index()
+            similarity_idx_plot.show() # view image preview
+            similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
+            ```
+        """
+        sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
+        sim_count = np.array(sim_idx["count"].tolist())
+        indices = np.arange(len(sim_count))
+
+        # Draw on a dedicated figure instead of the implicit global one
+        buffer = BytesIO()
+        fig, ax = plt.subplots()
+        try:
+            ax.bar(indices, sim_count)
+            ax.set_xlabel("data idx")
+            ax.set_ylabel("Count")
+            ax.set_title("Similarity Count")
+            fig.savefig(buffer, format="png")
+        finally:
+            plt.close(fig)  # always release the figure, even if rendering fails
+        buffer.seek(0)
+
+        # Use Pillow to open the image from the buffer
+        return Image.fromarray(np.array(Image.open(buffer)))
+
+    def _check_imgs_or_idxs(
+        self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None], idx: Union[None, int, List[int]]
+    ) -> List[np.ndarray]:
+        """Return the query images as a list, resolving table indexes to image files; exactly one input is allowed."""
+        has_img = img is not None
+        has_idx = idx is not None
+        if not has_img and not has_idx:
+            raise ValueError("Either img or idx must be provided.")
+        if has_img and has_idx:
+            raise ValueError("Only one of img or idx must be provided.")
+        if has_idx:
+            indices = idx if isinstance(idx, list) else [idx]
+            rows = self.table.to_lance().take(indices, columns=["im_file"])
+            img = rows.to_pydict()["im_file"]
+
+        if isinstance(img, list):
+            return img
+        return [img]
+
+    def ask_ai(self, query):
+        """
+        Turn a natural-language question into a SQL query and run it on the table.
+
+        Args:
+            query (str): Question to ask.
+
+        Returns:
+            (pandas.DataFrame): A dataframe containing filtered results to the SQL query, or None when running the
+                generated query fails.
+
+        Example:
+            ```python
+            exp = Explorer()
+            exp.create_embeddings_table()
+            answer = exp.ask_ai('Show images with 1 person and 2 dogs')
+            ```
+        """
+        generated_sql = prompt_sql_query(query)
+        try:
+            return self.sql_query(generated_sql)
+        except Exception as e:
+            LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
+            LOGGER.error(e)
+            return None
+
+    def visualize(self, result):
+        """
+        Visualize the results of a query. Not implemented yet (TODO); currently does nothing and returns None.
+
+        Args:
+            result (pyarrow.Table): Table containing the results of a query.
+        """
+        pass
+
+    def generate_report(self, result):
+        """
+        Generate a report of the dataset. Not implemented yet (TODO); currently does nothing and returns None.
+
+        Args:
+            result: Currently unused.
+        """
+        pass
--- a/ultralytics/data/explorer/gui/init.py
+++ b/ultralytics/data/explorer/gui/init.py
@ -0,0 +1 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
--- a/ultralytics/data/explorer/gui/dash.py
+++ b/ultralytics/data/explorer/gui/dash.py
@ -0,0 +1,268 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import time
+from threading import Thread
+
+import pandas as pd
+
+from ultralytics import Explorer
+from ultralytics.utils import ROOT, SETTINGS
+from ultralytics.utils.checks import check_requirements
+
+check_requirements(("streamlit>=1.29.0", "streamlit-select>=0.3"))
+
+import streamlit as st
+from streamlit_select import image_select
+
+
+def _get_explorer():
+    """
+    Create an Explorer for the selected dataset and model and build its embeddings table with a progress bar.
+
+    The table is built in a background thread while this function polls `exp.progress`. The Explorer is stored in
+    `st.session_state["explorer"]` only if the table was actually created; otherwise an error is shown.
+    """
+    exp = Explorer(data=st.session_state.get("dataset"), model=st.session_state.get("model"))
+    thread = Thread(
+        target=exp.create_embeddings_table, kwargs={"force": st.session_state.get("force_recreate_embeddings")}
+    )
+    thread.start()
+    progress_bar = st.progress(0, text="Creating embeddings table...")
+    # Also stop polling when the worker has exited: if it raised, progress never reaches 1
+    while thread.is_alive() and exp.progress < 1:
+        time.sleep(0.1)
+        progress_bar.progress(exp.progress, text=f"Progress: {exp.progress * 100}%")
+    thread.join()
+    progress_bar.empty()
+    if exp.table is None:  # worker failed before assigning the table
+        st.error("Failed to create the embeddings table. Check the console output for details.")
+        return
+    st.session_state["explorer"] = exp
+
+
+def init_explorer_form():
+    """Render the start form for choosing a dataset and model; submitting it calls `_get_explorer`."""
+    datasets = ROOT / "cfg" / "datasets"
+    ds = [d.name for d in datasets.glob("*.yaml")]  # dataset YAML file names offered in the dropdown
+    models = [
+        "yolov8n.pt",
+        "yolov8s.pt",
+        "yolov8m.pt",
+        "yolov8l.pt",
+        "yolov8x.pt",
+        "yolov8n-seg.pt",
+        "yolov8s-seg.pt",
+        "yolov8m-seg.pt",
+        "yolov8l-seg.pt",
+        "yolov8x-seg.pt",
+        "yolov8n-pose.pt",
+        "yolov8s-pose.pt",
+        "yolov8m-pose.pt",
+        "yolov8l-pose.pt",
+        "yolov8x-pose.pt",
+    ]
+    with st.form(key="explorer_init_form"):
+        col1, col2 = st.columns(2)
+        with col1:
+            # NOTE: ds.index raises ValueError if coco128.yaml is not among the dataset configs
+            st.selectbox("Select dataset", ds, key="dataset", index=ds.index("coco128.yaml"))
+        with col2:
+            st.selectbox("Select model", models, key="model")
+        st.checkbox("Force recreate embeddings", key="force_recreate_embeddings")
+
+        st.form_submit_button("Explore", on_click=_get_explorer)
+
+
+def query_form():
+    """Render the SQL query form: a text input stored under key 'query' and a button that calls `run_sql_query`."""
+    with st.form("query_form"):
+        col1, col2 = st.columns([0.8, 0.2])
+        with col1:
+            st.text_input(
+                "Query",
+                "WHERE labels LIKE '%person%' AND labels LIKE '%dog%'",
+                label_visibility="collapsed",
+                key="query",
+            )
+        with col2:
+            st.form_submit_button("Query", on_click=run_sql_query)
+
+
+def ai_query_form():
+    """Render the AI query form: a text input stored under key 'ai_query' and a button that calls `run_ai_query`."""
+    with st.form("ai_query_form"):
+        col1, col2 = st.columns([0.8, 0.2])
+        with col1:
+            st.text_input("Query", "Show images with 1 person and 1 dog", label_visibility="collapsed", key="ai_query")
+        with col2:
+            st.form_submit_button("Ask AI", on_click=run_ai_query)
+
+
+def find_similar_imgs(imgs):
+    """Run a similarity search for the given images and store the matching paths and result table in session state."""
+    explorer = st.session_state["explorer"]
+    limit = st.session_state.get("limit")
+    matches = explorer.get_similar(img=imgs, limit=limit, return_type="arrow")
+    st.session_state["imgs"] = matches.to_pydict()["im_file"]
+    st.session_state["res"] = matches
+
+
+def similarity_form(selected_imgs):
+    """
+    Render the similarity-search form: a result limit input and a Search button that calls `find_similar_imgs`.
+
+    Args:
+        selected_imgs: Images currently selected in the grid; the button is disabled when this is empty.
+    """
+    st.write("Similarity Search")
+    with st.form("similarity_form"):
+        subcol1, subcol2 = st.columns([1, 1])
+        with subcol1:
+            st.number_input(
+                "limit", min_value=None, max_value=None, value=25, label_visibility="collapsed", key="limit"
+            )
+
+        with subcol2:
+            disabled = not len(selected_imgs)  # nothing selected, so searching is not possible
+            st.write("Selected: ", len(selected_imgs))
+            st.form_submit_button(
+                "Search",
+                disabled=disabled,
+                on_click=find_similar_imgs,
+                args=(selected_imgs,),
+            )
+        if disabled:
+            st.error("Select at least one image to search.")
+
+
+# def persist_reset_form():
+#    with st.form("persist_reset"):
+#        col1, col2 = st.columns([1, 1])
+#        with col1:
+#            st.form_submit_button("Reset", on_click=reset)
+#
+#        with col2:
+#            st.form_submit_button("Persist", on_click=update_state, args=("PERSISTING", True))
+
+
+def run_sql_query():
+    """Run the SQL query held in session state and store the matching image paths and result table there."""
+    st.session_state["error"] = None
+    query = st.session_state.get("query")
+    if not query.strip():  # blank query: nothing to run
+        return
+    explorer = st.session_state["explorer"]
+    result = explorer.sql_query(query, return_type="arrow")
+    st.session_state["imgs"] = result.to_pydict()["im_file"]
+    st.session_state["res"] = result
+
+
+def run_ai_query():
+    """Send the natural-language query in session state to `Explorer.ask_ai` and store the results or an error."""
+    if not SETTINGS["openai_api_key"]:
+        st.session_state["error"] = (
+            'OpenAI API key not found in settings. Please run yolo settings openai_api_key="..."'
+        )
+        return
+    st.session_state["error"] = None
+    query = st.session_state.get("ai_query")
+    if not query.strip():  # blank query: nothing to ask
+        return
+    explorer = st.session_state["explorer"]
+    answer = explorer.ask_ai(query)
+    no_results = not isinstance(answer, pd.DataFrame) or answer.empty
+    if no_results:
+        st.session_state["error"] = "No results found using AI generated query. Try another query or rerun it."
+        return
+    st.session_state["imgs"] = answer["im_file"].to_list()
+    st.session_state["res"] = answer
+
+
+def reset_explorer():
+    """Clear the explorer, image list and error entries in session state so the start form is shown again."""
+    for key in ("explorer", "imgs", "error"):
+        st.session_state[key] = None
+
+
+def utralytics_explorer_docs_callback():
+    """Render a bordered box with the Ultralytics logo, a short description and a link to the Explorer API docs."""
+    with st.container(border=True):
+        st.image(
+            "https://raw.githubusercontent.com/ultralytics/assets/main/logo/Ultralytics_Logotype_Original.svg",
+            width=100,
+        )
+        st.markdown(
+            "<p>This demo is built using Ultralytics Explorer API. Visit <a href='https://docs.ultralytics.com/datasets/explorer/'>API docs</a> to try examples & learn more</p>",
+            unsafe_allow_html=True,
+            help=None,
+        )
+        # Button label previously misspelled the product name
+        st.link_button("Ultralytics Explorer API", "https://docs.ultralytics.com/datasets/explorer/")
+
+
+def layout():
+    """
+    Render the Explorer demo page.
+
+    Shows the dataset/model start form until an Explorer exists in session state. Afterwards it renders the image
+    grid with paging controls, the SQL and AI query forms, the similarity-search form and the docs box.
+    """
+    st.set_page_config(layout="wide", initial_sidebar_state="collapsed")
+    st.markdown("<h1 style='text-align: center;'>Ultralytics Explorer Demo</h1>", unsafe_allow_html=True)
+
+    if st.session_state.get("explorer") is None:
+        init_explorer_form()
+        return
+
+    st.button(":arrow_backward: Select Dataset", on_click=reset_explorer)
+    exp = st.session_state.get("explorer")
+    col1, col2 = st.columns([0.75, 0.25], gap="small")
+    imgs = []
+    if st.session_state.get("error"):
+        st.error(st.session_state["error"])
+    elif st.session_state.get("imgs"):
+        imgs = st.session_state.get("imgs")
+    else:
+        # No query result yet: show the whole table
+        imgs = exp.table.to_lance().to_table(columns=["im_file"]).to_pydict()["im_file"]
+        st.session_state["res"] = exp.table.to_arrow()
+    total_imgs, selected_imgs = len(imgs), []
+    with col1:
+        subcol1, subcol2, subcol3, subcol4, subcol5 = st.columns(5)
+        with subcol1:
+            st.write("Max Images Displayed:")
+        with subcol2:
+            num = st.number_input(
+                "Max Images Displayed",
+                min_value=0,
+                max_value=total_imgs,
+                value=min(500, total_imgs),
+                key="num_imgs_displayed",
+                label_visibility="collapsed",
+            )
+        with subcol3:
+            st.write("Start Index:")
+        with subcol4:
+            start_idx = st.number_input(
+                "Start Index",
+                min_value=0,
+                max_value=total_imgs,
+                value=0,
+                key="start_index",
+                label_visibility="collapsed",
+            )
+        with subcol5:
+            reset = st.button("Reset", use_container_width=False, key="reset")
+            if reset:
+                st.session_state["imgs"] = None
+                st.rerun()  # st.experimental_rerun is deprecated; st.rerun exists in the required streamlit>=1.29
+
+        query_form()
+        ai_query_form()
+        if total_imgs:
+            labels, boxes, masks, kpts, classes = None, None, None, None, None
+            task = exp.model.task
+            window = slice(start_idx, start_idx + num)  # rows currently shown in the grid
+            if st.session_state.get("display_labels"):
+                res = st.session_state.get("res")
+                # AI queries store a pandas DataFrame, other paths an arrow table; convert to columns only once
+                columns = res.to_dict(orient="list") if isinstance(res, pd.DataFrame) else res.to_pydict()
+                labels = columns["labels"][window]
+                boxes = columns["bboxes"][window]
+                masks = columns["masks"][window]
+                kpts = columns["keypoints"][window]
+                classes = columns["cls"][window]
+            imgs_displayed = imgs[window]
+            selected_imgs = image_select(
+                f"Total samples: {total_imgs}",
+                images=imgs_displayed,
+                use_container_width=False,
+                # indices=[i for i in range(num)] if select_all else None,
+                labels=labels,
+                classes=classes,
+                bboxes=boxes,
+                masks=masks if task == "segment" else None,
+                kpts=kpts if task == "pose" else None,
+            )
+
+    with col2:
+        similarity_form(selected_imgs)
+        st.checkbox("Labels", value=False, key="display_labels")  # value is read back through session state
+        utralytics_explorer_docs_callback()
+
+
+# Render the page only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    layout()
--- a/ultralytics/data/explorer/utils.py
+++ b/ultralytics/data/explorer/utils.py
@ -0,0 +1,166 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import getpass
+from typing import List
+
+import cv2
+import numpy as np
+import pandas as pd
+
+from ultralytics.data.augment import LetterBox
+from ultralytics.utils import LOGGER as logger
+from ultralytics.utils import SETTINGS
+from ultralytics.utils.checks import check_requirements
+from ultralytics.utils.ops import xyxy2xywh
+from ultralytics.utils.plotting import plot_images
+
+
+def get_table_schema(vector_size):
+    """
+    Build the LanceModel schema class holding an image path, its labels and a fixed-size vector.
+
+    Args:
+        vector_size: Size handed to `Vector(...)` for the `vector` column.
+
+    Returns:
+        The `Schema` class itself (a `LanceModel` subclass), not an instance of it.
+    """
+    from lancedb.pydantic import LanceModel, Vector
+
+    class Schema(LanceModel):
+        im_file: str  # image file path (plot_query_result reads these values with cv2.imread)
+        labels: List[str]  # class names; presumably filled from sanitize_batch output -- confirm
+        cls: List[int]  # integer class ids matching `labels`
+        bboxes: List[List[float]]  # one list of floats per box
+        masks: List[List[List[int]]]  # sanitize_batch stores [[[]]] when a batch has no masks
+        keypoints: List[List[List[float]]]  # sanitize_batch stores [[[]]] when a batch has no keypoints
+        vector: Vector(vector_size)  # presumably the image embedding -- confirm against the caller
+
+    return Schema
+
+
+def get_sim_index_schema():
+    """
+    Build the LanceModel schema class for the similarity-index table.
+
+    Takes no arguments: unlike `get_table_schema`, this schema has no vector column.
+
+    Returns:
+        The `Schema` class itself (a `LanceModel` subclass), not an instance of it.
+    """
+    from lancedb.pydantic import LanceModel
+
+    class Schema(LanceModel):
+        idx: int  # presumably the row index of the reference image -- confirm against the caller
+        im_file: str  # image file path
+        count: int  # presumably the number of similar images found -- confirm against the caller
+        sim_im_files: List[str]  # file paths of the similar images, going by the field name
+
+    return Schema
+
+
+def sanitize_batch(batch, dataset_info):
+    """
+    Convert a batch's tensors to plain Python lists, in place, with boxes ordered by class id.
+
+    Args:
+        batch (dict): Must hold "cls" and "bboxes" tensors; "masks" and "keypoints" are optional.
+        dataset_info (dict): Its "names" entry maps a class id to the class name.
+
+    Returns:
+        (dict): The same `batch` object, with "cls", "bboxes", "labels", "masks" and "keypoints" as lists.
+    """
+    class_ids = batch["cls"].flatten().int().tolist()
+    batch["cls"] = class_ids
+
+    # Pair each box with its class id and order the pairs by class id (the sort is stable)
+    pairs = list(zip(batch["bboxes"].tolist(), class_ids))
+    pairs.sort(key=lambda pair: pair[1])
+    ordered_boxes, ordered_ids = [], []
+    for box, class_id in pairs:
+        ordered_boxes.append(box)
+        ordered_ids.append(class_id)
+    batch["bboxes"] = ordered_boxes
+    batch["cls"] = ordered_ids
+    batch["labels"] = [dataset_info["names"][class_id] for class_id in ordered_ids]
+
+    # Optional annotations fall back to a nested empty-list placeholder
+    for key in ("masks", "keypoints"):
+        batch[key] = batch[key].tolist() if key in batch else [[[]]]
+    return batch
+
+
+def plot_query_result(similar_set, plot_labels=True):
+    """
+    Plot images from the similar set.
+
+    Args:
+        similar_set: A pandas DataFrame, or an object exposing `to_pydict()` such as a pyarrow table, holding the
+            similar data points. The "im_file" column gives the image paths; "bboxes", "masks", "keypoints" and
+            "cls" give the labels.
+        plot_labels (bool): Whether to plot labels (boxes, masks and keypoints) or not.
+
+    Returns:
+        The value returned by `plot_images` for the assembled batch.
+
+    Raises:
+        FileNotFoundError: If an image listed in "im_file" cannot be read.
+    """
+    similar_set = (
+        similar_set.to_dict(orient="list") if isinstance(similar_set, pd.DataFrame) else similar_set.to_pydict()
+    )
+    empty = [[[]]]  # placeholder that sanitize_batch stores when a batch has no masks / keypoints
+    images = similar_set.get("im_file", [])
+    bboxes = similar_set.get("bboxes") or []
+    # Only the first row is inspected: if it holds the placeholder, the whole column is treated as absent.
+    # A missing or empty column is treated the same way instead of failing on `None[0]` / `[][0]`.
+    masks = similar_set.get("masks") or []
+    masks = masks if masks and masks[0] != empty else []
+    kpts = similar_set.get("keypoints") or []
+    kpts = kpts if kpts and kpts[0] != empty else []
+    cls = similar_set.get("cls", [])
+
+    plot_size = 640
+    imgs, batch_idx, plot_boxes, plot_masks, plot_kpts = [], [], [], [], []
+    for i, imf in enumerate(images):
+        im = cv2.imread(imf)
+        if im is None:  # cv2.imread returns None rather than raising when the file cannot be read
+            raise FileNotFoundError(f"Image Not Found {imf}")
+        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+        h, w = im.shape[:2]
+        r = min(plot_size / h, plot_size / w)  # scale that fits the longer side into plot_size
+        imgs.append(LetterBox(plot_size, center=False)(image=im).transpose(2, 0, 1))  # HWC -> CHW
+        n_boxes = len(bboxes[i]) if i < len(bboxes) else 0
+        if plot_labels:
+            if n_boxes > 0:
+                box = np.array(bboxes[i], dtype=np.float32)
+                box[:, [0, 2]] *= r
+                box[:, [1, 3]] *= r
+                plot_boxes.append(box)
+            if len(masks) > i and len(masks[i]) > 0:
+                mask = np.array(masks[i], dtype=np.uint8)[0]
+                plot_masks.append(LetterBox(plot_size, center=False)(image=mask))
+            if len(kpts) > i and kpts[i] is not None:
+                kpt = np.array(kpts[i], dtype=np.float32)
+                kpt[:, :, :2] *= r
+                plot_kpts.append(kpt)
+        # One entry per box of image i, kept even when labels are not drawn so it stays aligned with `cls`
+        batch_idx.append(np.ones(n_boxes) * i)
+    imgs = np.stack(imgs, axis=0)
+    masks = np.stack(plot_masks, axis=0) if plot_masks else np.zeros(0, dtype=np.uint8)
+    kpts = np.concatenate(plot_kpts, axis=0) if plot_kpts else np.zeros((0, 51), dtype=np.float32)
+    boxes = xyxy2xywh(np.concatenate(plot_boxes, axis=0)) if plot_boxes else np.zeros(0, dtype=np.float32)
+    batch_idx = np.concatenate(batch_idx, axis=0)
+    cls = np.concatenate([np.array(c, dtype=np.int32) for c in cls], axis=0)
+
+    return plot_images(
+        imgs, batch_idx, cls, bboxes=boxes, masks=masks, kpts=kpts, max_subplots=len(images), save=False, threaded=False
+    )
+
+
+def prompt_sql_query(query):
+    """
+    Ask an OpenAI chat model to turn a natural-language request into a SQL query.
+
+    Args:
+        query (str): The user's request, sent as the user message.
+
+    Returns:
+        The message content of the first choice in the chat completion response.
+    """
+    check_requirements("openai>=1.6.1")
+    from openai import OpenAI
+
+    # With no key stored in SETTINGS, ask for one interactively and store it there
+    if not SETTINGS["openai_api_key"]:
+        logger.warning("OpenAI API key not found in settings. Please enter your API key below.")
+        SETTINGS.update({"openai_api_key": getpass.getpass("OpenAI API key: ")})
+    client = OpenAI(api_key=SETTINGS["openai_api_key"])
+
+    # System message: describes the table schema and pins the output to a single SELECT statement
+    system_prompt = """
+                You are a helpful data scientist proficient in SQL. You need to output exactly one SQL query based on
+                the following schema and a user request. You only need to output the format with fixed selection
+                statement that selects everything from "'table'", like `SELECT * from 'table'`
+
+                Schema:
+                im_file: string not null
+                labels: list<item: string> not null
+                child 0, item: string
+                cls: list<item: int64> not null
+                child 0, item: int64
+                bboxes: list<item: list<item: double>> not null
+                child 0, item: list<item: double>
+                    child 0, item: double
+                masks: list<item: list<item: list<item: int64>>> not null
+                child 0, item: list<item: list<item: int64>>
+                    child 0, item: list<item: int64>
+                        child 0, item: int64
+                keypoints: list<item: list<item: list<item: double>>> not null
+                child 0, item: list<item: list<item: double>>
+                    child 0, item: list<item: double>
+                        child 0, item: double
+                vector: fixed_size_list<item: float>[256] not null
+                child 0, item: float
+
+                Some details about the schema:
+                - the "labels" column contains the string values like 'person' and 'dog' for the respective objects
+                    in each image
+                - the "cls" column contains the integer values on these classes that map them the labels
+
+                Example of a correct query:
+                request - Get all data points that contain 2 or more people and at least one dog
+                correct query-
+                SELECT * FROM 'table' WHERE  ARRAY_LENGTH(cls) >= 2  AND ARRAY_LENGTH(FILTER(labels, x -> x = 'person')) >= 2  AND ARRAY_LENGTH(FILTER(labels, x -> x = 'dog')) >= 1;
+             """
+    chat = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": f"{query}"},
+    ]
+
+    completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=chat)
+    return completion.choices[0].message.content
--- a/ultralytics/data/loaders.py
+++ b/ultralytics/data/loaders.py
@ -22,76 +22,114 @@ from ultralytics.utils.checks import check_requirements

@dataclass
 class SourceTypes:
-    webcam: bool = False
+    """Class to represent various types of input sources for predictions."""
+
+    stream: bool = False
    screenshot: bool = False
    from_img: bool = False
    tensor: bool = False


 class LoadStreams:
-    """YOLOv8 streamloader, i.e. `yolo predict source='rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP streams`."""
+    """
+    Stream Loader for various types of video streams, Supports RTSP, RTMP, HTTP, and TCP streams.

-    def __init__(self, sources='file.streams', imgsz=640, vid_stride=1, stream_buffer=False):
+    Attributes:
+        sources (str): The source input paths or URLs for the video streams.
+        vid_stride (int): Video frame-rate stride, defaults to 1.
+        buffer (bool): Whether to buffer input streams, defaults to False.
+        running (bool): Flag to indicate if the streaming thread is running.
+        mode (str): Set to 'stream' indicating real-time capture.
+        imgs (list): List of image frames for each stream.
+        fps (list): List of FPS for each stream.
+        frames (list): List of total frames for each stream.
+        threads (list): List of threads for each stream.
+        shape (list): List of shapes for each stream.
+        caps (list): List of cv2.VideoCapture objects for each stream.
+        bs (int): Batch size for processing.
+
+    Methods:
+        __init__: Initialize the stream loader.
+        update: Read stream frames in daemon thread.
+        close: Close stream loader and release resources.
+        __iter__: Returns an iterator object for the class.
+        __next__: Returns source paths, transformed, and original images for processing.
+        __len__: Return the length of the sources object.
+
+    Example:
+         ```bash
+         yolo predict source='rtsp://example.com/media.mp4'
+         ```
+    """
+
+    def __init__(self, sources="file.streams", vid_stride=1, buffer=False):
        """Initialize instance variables and check for consistent input stream shapes."""
        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
-        self.stream_buffer = stream_buffer  # buffer input streams
+        self.buffer = buffer  # buffer input streams
        self.running = True  # running flag for Thread
-        self.mode = 'stream'
-        self.imgsz = imgsz
+        self.mode = "stream"
        self.vid_stride = vid_stride  # video frame-rate stride
+
        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
        n = len(sources)
-        self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
-        self.imgs, self.fps, self.frames, self.threads, self.shape = [[]] * n, [0] * n, [0] * n, [None] * n, [None] * n
+        self.bs = n
+        self.fps = [0] * n  # frames per second
+        self.frames = [0] * n
+        self.threads = [None] * n
        self.caps = [None] * n  # video capture objects
+        self.imgs = [[] for _ in range(n)]  # images
+        self.shape = [[] for _ in range(n)]  # image shapes
+        self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
        for i, s in enumerate(sources):  # index, source
            # Start thread to read frames from video stream
-            st = f'{i + 1}/{n}: {s}... '
-            if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'):  # if source is YouTube video
-                # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc'
+            st = f"{i + 1}/{n}: {s}... "
+            if urlparse(s).hostname in ("www.youtube.com", "youtube.com", "youtu.be"):  # if source is YouTube video
+                # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
                s = get_best_youtube_url(s)
            s = eval(s) if s.isnumeric() else s  # i.e. s = '0' local webcam
            if s == 0 and (is_colab() or is_kaggle()):
-                raise NotImplementedError("'source=0' webcam not supported in Colab and Kaggle notebooks. "
-                                          "Try running 'source=0' in a local environment.")
+                raise NotImplementedError(
+                    "'source=0' webcam not supported in Colab and Kaggle notebooks. "
+                    "Try running 'source=0' in a local environment."
+                )
            self.caps[i] = cv2.VideoCapture(s)  # store video capture object
            if not self.caps[i].isOpened():
-                raise ConnectionError(f'{st}Failed to open {s}')
+                raise ConnectionError(f"{st}Failed to open {s}")
            w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = self.caps[i].get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
            self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
-                'inf')  # infinite stream fallback
+                "inf"
+            )  # infinite stream fallback
            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback

            success, im = self.caps[i].read()  # guarantee first frame
            if not success or im is None:
-                raise ConnectionError(f'{st}Failed to read images from {s}')
+                raise ConnectionError(f"{st}Failed to read images from {s}")
            self.imgs[i].append(im)
            self.shape[i] = im.shape
            self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
-            LOGGER.info(f'{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)')
+            LOGGER.info(f"{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)")
            self.threads[i].start()
-        LOGGER.info('')  # newline
-
-        # Check for common shapes
-        self.bs = self.__len__()
+        LOGGER.info("")  # newline

    def update(self, i, cap, stream):
        """Read stream `i` frames in daemon thread."""
        n, f = 0, self.frames[i]  # frame number, frame array
        while self.running and cap.isOpened() and n < (f - 1):
-            # Only read a new frame if the buffer is empty
-            if not self.imgs[i] or not self.stream_buffer:
+            if len(self.imgs[i]) < 30:  # keep a <=30-image buffer
                n += 1
                cap.grab()  # .read() = .grab() followed by .retrieve()
                if n % self.vid_stride == 0:
                    success, im = cap.retrieve()
                    if not success:
                        im = np.zeros(self.shape[i], dtype=np.uint8)
-                        LOGGER.warning('WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.')
+                        LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
                        cap.open(stream)  # re-open stream if signal was lost
-                    self.imgs[i].append(im)  # add image to buffer
+                    if self.buffer:
+                        self.imgs[i].append(im)
+                    else:
+                        self.imgs[i] = [im]
            else:
                time.sleep(0.01)  # wait until the buffer is empty

@ -105,7 +143,7 @@ class LoadStreams:
            try:
                cap.release()  # release video capture
            except Exception as e:
-                LOGGER.warning(f'WARNING ⚠️ Could not release VideoCapture object: {e}')
+                LOGGER.warning(f"WARNING ⚠️ Could not release VideoCapture object: {e}")
        cv2.destroyAllWindows()

    def __iter__(self):
@ -117,36 +155,62 @@ class LoadStreams:
        """Returns source paths, transformed and original images for processing."""
        self.count += 1

-        # Wait until a frame is available in each buffer
-        while not all(self.imgs):
-            if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'):  # q to quit
-                self.close()
-                raise StopIteration
-            time.sleep(1 / min(self.fps))
+        images = []
+        for i, x in enumerate(self.imgs):
+            # Wait until a frame is available in each buffer
+            while not x:
+                if not self.threads[i].is_alive() or cv2.waitKey(1) == ord("q"):  # q to quit
+                    self.close()
+                    raise StopIteration
+                time.sleep(1 / min(self.fps))
+                x = self.imgs[i]
+                if not x:
+                    LOGGER.warning(f"WARNING ⚠️ Waiting for stream {i}")

-        # Get and remove the next frame from imgs buffer
-        if self.stream_buffer:
-            images = [x.pop(0) for x in self.imgs]
-        else:
-            # Get the latest frame, and clear the rest from the imgs buffer
-            images = []
-            for x in self.imgs:
-                images.append(x.pop(-1) if x else None)
+            # Get and remove the first frame from imgs buffer
+            if self.buffer:
+                images.append(x.pop(0))
+
+            # Get the last frame, and clear the rest from the imgs buffer
+            else:
+                images.append(x.pop(-1) if x else np.zeros(self.shape[i], dtype=np.uint8))
                x.clear()

-        return self.sources, images, None, ''
+        return self.sources, images, [""] * self.bs

    def __len__(self):
        """Return the length of the sources object."""
-        return len(self.sources)  # 1E12 frames = 32 streams at 30 FPS for 30 years
+        return self.bs  # 1E12 frames = 32 streams at 30 FPS for 30 years


 class LoadScreenshots:
-    """YOLOv8 screenshot dataloader, i.e. `yolo predict source=screen`."""
+    """
+    YOLOv8 screenshot dataloader.

-    def __init__(self, source, imgsz=640):
-        """source = [screen_number left top width height] (pixels)."""
-        check_requirements('mss')
+    This class manages the loading of screenshot images for processing with YOLOv8.
+    Suitable for use with `yolo predict source=screen`.
+
+    Attributes:
+        source (str): The source input indicating which screen to capture.
+        screen (int): The screen number to capture.
+        left (int): The left coordinate for screen capture area.
+        top (int): The top coordinate for screen capture area.
+        width (int): The width of the screen capture area.
+        height (int): The height of the screen capture area.
+        mode (str): Set to 'stream' indicating real-time capture.
+        frame (int): Counter for captured frames.
+        sct (mss.mss): Screen capture object from `mss` library.
+        bs (int): Batch size, set to 1.
+        monitor (dict): Monitor configuration details.
+
+    Methods:
+        __iter__: Returns an iterator object.
+        __next__: Captures the next screenshot and returns it.
+    """
+
+    def __init__(self, source):
+        """Source = [screen_number left top width height] (pixels)."""
+        check_requirements("mss")
        import mss  # noqa

        source, *params = source.split()
@ -157,19 +221,19 @@ class LoadScreenshots:
            left, top, width, height = (int(x) for x in params)
        elif len(params) == 5:
            self.screen, left, top, width, height = (int(x) for x in params)
-        self.imgsz = imgsz
-        self.mode = 'stream'
+        self.mode = "stream"
        self.frame = 0
        self.sct = mss.mss()
        self.bs = 1
+        self.fps = 30

        # Parse monitor shape
        monitor = self.sct.monitors[self.screen]
-        self.top = monitor['top'] if top is None else (monitor['top'] + top)
-        self.left = monitor['left'] if left is None else (monitor['left'] + left)
-        self.width = width or monitor['width']
-        self.height = height or monitor['height']
-        self.monitor = {'left': self.left, 'top': self.top, 'width': self.width, 'height': self.height}
+        self.top = monitor["top"] if top is None else (monitor["top"] + top)
+        self.left = monitor["left"] if left is None else (monitor["left"] + left)
+        self.width = width or monitor["width"]
+        self.height = height or monitor["height"]
+        self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}

    def __iter__(self):
        """Returns an iterator of the object."""
@ -178,53 +242,75 @@ class LoadScreenshots:
    def __next__(self):
        """mss screen capture: get raw pixels from the screen as np array."""
        im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3]  # BGRA to BGR
-        s = f'screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: '
+        s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "

        self.frame += 1
-        return [str(self.screen)], [im0], None, s  # screen, img, vid_cap, string
+        return [str(self.screen)], [im0], [s]  # screen, img, string


-class LoadImages:
-    """YOLOv8 image/video dataloader, i.e. `yolo predict source=image.jpg/vid.mp4`."""
+class LoadImagesAndVideos:
+    """
+    YOLOv8 image/video dataloader.

-    def __init__(self, path, imgsz=640, vid_stride=1):
+    This class manages the loading and pre-processing of image and video data for YOLOv8. It supports loading from
+    various formats, including single image files, video files, and lists of image and video paths.
+
+    Attributes:
+        files (list): List of image and video file paths.
+        nf (int): Total number of files (images and videos).
+        video_flag (list): Flags indicating whether a file is a video (True) or an image (False).
+        mode (str): Current mode, 'image' or 'video'.
+        vid_stride (int): Stride for video frame-rate, defaults to 1.
+        bs (int): Batch size, set to 1 for this class.
+        cap (cv2.VideoCapture): Video capture object for OpenCV.
+        frame (int): Frame counter for video.
+        frames (int): Total number of frames in the video.
+        count (int): Counter for iteration, initialized at 0 during `__iter__()`.
+
+    Methods:
+        _new_video(path): Create a new cv2.VideoCapture object for a given video path.
+    """
+
+    def __init__(self, path, batch=1, vid_stride=1):
        """Initialize the Dataloader and raise FileNotFoundError if file not found."""
        parent = None
-        if isinstance(path, str) and Path(path).suffix == '.txt':  # *.txt file with img/vid/dir on each line
+        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
            parent = Path(path).parent
            path = Path(path).read_text().splitlines()  # list of sources
        files = []
        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
            a = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
-            if '*' in a:
+            if "*" in a:
                files.extend(sorted(glob.glob(a, recursive=True)))  # glob
            elif os.path.isdir(a):
-                files.extend(sorted(glob.glob(os.path.join(a, '*.*'))))  # dir
+                files.extend(sorted(glob.glob(os.path.join(a, "*.*"))))  # dir
            elif os.path.isfile(a):
                files.append(a)  # files (absolute or relative to CWD)
            elif parent and (parent / p).is_file():
                files.append(str((parent / p).absolute()))  # files (relative to *.txt file parent)
            else:
-                raise FileNotFoundError(f'{p} does not exist')
+                raise FileNotFoundError(f"{p} does not exist")

-        images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
-        videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
+        images = [x for x in files if x.split(".")[-1].lower() in IMG_FORMATS]
+        videos = [x for x in files if x.split(".")[-1].lower() in VID_FORMATS]
        ni, nv = len(images), len(videos)

-        self.imgsz = imgsz
        self.files = images + videos
        self.nf = ni + nv  # number of files
+        self.ni = ni  # number of images
        self.video_flag = [False] * ni + [True] * nv
-        self.mode = 'image'
+        self.mode = "image"
        self.vid_stride = vid_stride  # video frame-rate stride
-        self.bs = 1
+        self.bs = batch
        if any(videos):
            self._new_video(videos[0])  # new video
        else:
            self.cap = None
        if self.nf == 0:
-            raise FileNotFoundError(f'No images or videos found in {p}. '
-                                    f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}')
+            raise FileNotFoundError(
+                f"No images or videos found in {p}. "
+                f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
+            )

    def __iter__(self):
        """Returns an iterator object for VideoStream or ImageFolder."""
@ -232,71 +318,105 @@ class LoadImages:
        return self

    def __next__(self):
-        """Return next image, path and metadata from dataset."""
-        if self.count == self.nf:
-            raise StopIteration
-        path = self.files[self.count]
-
-        if self.video_flag[self.count]:
-            # Read video
-            self.mode = 'video'
-            for _ in range(self.vid_stride):
-                self.cap.grab()
-            success, im0 = self.cap.retrieve()
-            while not success:
-                self.count += 1
-                self.cap.release()
-                if self.count == self.nf:  # last video
+        """Returns the next batch of images or video frames along with their paths and metadata."""
+        paths, imgs, info = [], [], []
+        while len(imgs) < self.bs:
+            if self.count >= self.nf:  # end of file list
+                if len(imgs) > 0:
+                    return paths, imgs, info  # return last partial batch
+                else:
                    raise StopIteration
-                path = self.files[self.count]
-                self._new_video(path)
-                success, im0 = self.cap.read()

-            self.frame += 1
-            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
-            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '
+            path = self.files[self.count]
+            if self.video_flag[self.count]:
+                self.mode = "video"
+                if not self.cap or not self.cap.isOpened():
+                    self._new_video(path)

-        else:
-            # Read image
-            self.count += 1
-            im0 = cv2.imread(path)  # BGR
-            if im0 is None:
-                raise FileNotFoundError(f'Image Not Found {path}')
-            s = f'image {self.count}/{self.nf} {path}: '
+                for _ in range(self.vid_stride):
+                    success = self.cap.grab()
+                    if not success:
+                        break  # end of video or failure

-        return [path], [im0], self.cap, s
+                if success:
+                    success, im0 = self.cap.retrieve()
+                    if success:
+                        self.frame += 1
+                        paths.append(path)
+                        imgs.append(im0)
+                        info.append(f"video {self.count + 1}/{self.nf} (frame {self.frame}/{self.frames}) {path}: ")
+                        if self.frame == self.frames:  # end of video
+                            self.count += 1
+                            self.cap.release()
+                else:
+                    # Move to the next file if the current video ended or failed to open
+                    self.count += 1
+                    if self.cap:
+                        self.cap.release()
+                    if self.count < self.nf:
+                        self._new_video(self.files[self.count])
+            else:
+                self.mode = "image"
+                im0 = cv2.imread(path)  # BGR
+                if im0 is None:
+                    raise FileNotFoundError(f"Image Not Found {path}")
+                paths.append(path)
+                imgs.append(im0)
+                info.append(f"image {self.count + 1}/{self.nf} {path}: ")
+                self.count += 1  # move to the next file
+                if self.count >= self.ni:  # end of image list
+                    break
+
+        return paths, imgs, info

    def _new_video(self, path):
-        """Create a new video capture object."""
+        """Creates a new video capture object for the given path."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
+        self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
+        if not self.cap.isOpened():
+            raise FileNotFoundError(f"Failed to open video {path}")
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)

    def __len__(self):
-        """Returns the number of files in the object."""
-        return self.nf  # number of files
+        """Returns the number of batches in the object."""
+        # NOTE(review): this counts files, while __next__ appends one entry per video frame -- confirm for videos
+        return math.ceil(self.nf / self.bs)  # files / batch size, rounded up


 class LoadPilAndNumpy:
+    """
+    Load images from PIL and Numpy arrays for batch processing.

-    def __init__(self, im0, imgsz=640):
+    This class is designed to manage loading and pre-processing of image data from both PIL and Numpy formats.
+    It performs basic validation and format conversion to ensure that the images are in the required format for
+    downstream processing.
+
+    Attributes:
+        paths (list): List of image paths or autogenerated filenames.
+        im0 (list): List of images stored as Numpy arrays.
+        mode (str): Type of data being processed, defaults to 'image'.
+        bs (int): Batch size, equivalent to the length of `im0`.
+
+    Methods:
+        _single_check(im): Validate and format a single image to a Numpy array.
+    """
+
+    def __init__(self, im0):
        """Initialize PIL and Numpy Dataloader."""
        if not isinstance(im0, list):
            im0 = [im0]
-        self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
+        self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
        self.im0 = [self._single_check(im) for im in im0]
-        self.imgsz = imgsz
-        self.mode = 'image'
-        # Generate fake paths
+        self.mode = "image"
        self.bs = len(self.im0)

    @staticmethod
    def _single_check(im):
        """Validate and format an image to numpy array."""
-        assert isinstance(im, (Image.Image, np.ndarray)), f'Expected PIL/np.ndarray image type, but got {type(im)}'
+        assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
        if isinstance(im, Image.Image):
-            if im.mode != 'RGB':
-                im = im.convert('RGB')
+            if im.mode != "RGB":
+                im = im.convert("RGB")
            im = np.asarray(im)[:, :, ::-1]
            im = np.ascontiguousarray(im)  # contiguous
        return im
@ -310,7 +430,7 @@ class LoadPilAndNumpy:
        if self.count == 1:  # loop only once as it's batch inference
            raise StopIteration
        self.count += 1
-        return self.paths, self.im0, None, ''
+        return self.paths, self.im0, [""] * self.bs

    def __iter__(self):
        """Enables iteration for class LoadPilAndNumpy."""
@ -319,18 +439,36 @@ class LoadPilAndNumpy:


 class LoadTensor:
+    """
+    Load images from torch.Tensor data.
+
+    This class manages the loading and pre-processing of image data from PyTorch tensors for further processing.
+
+    Attributes:
+        im0 (torch.Tensor): The input tensor containing the image(s).
+        bs (int): Batch size, inferred from the shape of `im0`.
+        mode (str): Current mode, set to 'image'.
+        paths (list): List of image paths or filenames.
+        count (int): Counter for iteration, initialized at 0 during `__iter__()`.
+
+    Methods:
+        _single_check(im, stride): Validate and possibly modify the input tensor.
+    """

    def __init__(self, im0) -> None:
+        """
+        Initialize the tensor dataloader.
+
+        Args:
+            im0 (torch.Tensor): Input image tensor; `_single_check` validates it and returns the tensor stored
+                as `self.im0`.
+        """
        self.im0 = self._single_check(im0)
        self.bs = self.im0.shape[0]
-        self.mode = 'image'
-        self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
+        self.mode = "image"
+        # Enumerate the checked tensor, not the raw argument: `_single_check` can unsqueeze the input to a batch
+        # of one, and iterating the raw tensor would then yield one path per slice of its first dimension
+        # instead of one path per image, leaving `len(self.paths) != self.bs`.
+        self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(self.im0)]

    @staticmethod
    def _single_check(im, stride=32):
        """Validate and format an image to torch.Tensor."""
-        s = f'WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) ' \
-            f'divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible.'
+        s = (
+            f"WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) "
+            f"divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible."
+        )
        if len(im.shape) != 4:
            if len(im.shape) != 3:
                raise ValueError(s)
@ -338,9 +476,11 @@ class LoadTensor:
            im = im.unsqueeze(0)
        if im.shape[2] % stride or im.shape[3] % stride:
            raise ValueError(s)
-        if im.max() > 1.0:
-            LOGGER.warning(f'WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. '
-                           f'Dividing input by 255.')
+        if im.max() > 1.0 + torch.finfo(im.dtype).eps:  # torch.float32 eps is 1.2e-07
+            LOGGER.warning(
+                f"WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. "
+                f"Dividing input by 255."
+            )
            im = im.float() / 255.0

        return im
@ -355,7 +495,7 @@ class LoadTensor:
        if self.count == 1:
            raise StopIteration
        self.count += 1
-        return self.paths, self.im0, None, ''
+        return self.paths, self.im0, [""] * self.bs

    def __len__(self):
        """Returns the batch size."""
@ -363,26 +503,23 @@ class LoadTensor:


 def autocast_list(source):
-    """
-    Merges a list of source of different types into a list of numpy arrays or PIL images
-    """
+    """Merges a list of source of different types into a list of numpy arrays or PIL images."""
    files = []
    for im in source:
        if isinstance(im, (str, Path)):  # filename or uri
-            files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im))
+            files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im))
        elif isinstance(im, (Image.Image, np.ndarray)):  # PIL or np Image
            files.append(im)
        else:
-            raise TypeError(f'type {type(im).__name__} is not a supported Ultralytics prediction source type. \n'
-                            f'See https://docs.ultralytics.com/modes/predict for supported source types.')
+            raise TypeError(
+                f"type {type(im).__name__} is not a supported Ultralytics prediction source type. \n"
+                f"See https://docs.ultralytics.com/modes/predict for supported source types."
+            )

    return files


-LOADERS = LoadStreams, LoadPilAndNumpy, LoadImages, LoadScreenshots  # tuple
-
-
-def get_best_youtube_url(url, use_pafy=False):
+def get_best_youtube_url(url, use_pafy=True):
    """
    Retrieves the URL of the best quality MP4 video stream from a given YouTube video.

@ -397,16 +534,22 @@ def get_best_youtube_url(url, use_pafy=False):
        (str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
    """
    if use_pafy:
-        check_requirements(('pafy', 'youtube_dl==2020.12.2'))
+        check_requirements(("pafy", "youtube_dl==2020.12.2"))
        import pafy  # noqa
-        return pafy.new(url).getbestvideo(preftype='mp4').url
+
+        return pafy.new(url).getbestvideo(preftype="mp4").url
    else:
-        check_requirements('yt-dlp')
+        check_requirements("yt-dlp")
        import yt_dlp
-        with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
+
+        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info_dict = ydl.extract_info(url, download=False)  # extract info
-        for f in reversed(info_dict.get('formats', [])):  # reversed because best is usually last
+        for f in reversed(info_dict.get("formats", [])):  # reversed because best is usually last
            # Find a format with video codec, no audio, *.mp4 extension at least 1920x1080 size
-            good_size = (f.get('width') or 0) >= 1920 or (f.get('height') or 0) >= 1080
-            if good_size and f['vcodec'] != 'none' and f['acodec'] == 'none' and f['ext'] == 'mp4':
-                return f.get('url')
+            good_size = (f.get("width") or 0) >= 1920 or (f.get("height") or 0) >= 1080
+            if good_size and f["vcodec"] != "none" and f["acodec"] == "none" and f["ext"] == "mp4":
+                return f.get("url")
+
+
+# Define constants
+LOADERS = (LoadStreams, LoadPilAndNumpy, LoadImagesAndVideos, LoadScreenshots)
--- a/ultralytics/data/scripts/get_coco.sh
+++ b/ultralytics/data/scripts/get_coco.sh
@ -1,6 +1,6 @@
 #!/bin/bash
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-# Download COCO 2017 dataset http://cocodataset.org
+# Download COCO 2017 dataset https://cocodataset.org
 # Example usage: bash data/scripts/get_coco.sh
 # parent
 # ├── ultralytics
--- a/ultralytics/data/split_dota.py
+++ b/ultralytics/data/split_dota.py
@ -0,0 +1,288 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import itertools
+from glob import glob
+from math import ceil
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+from ultralytics.data.utils import exif_size, img2label_paths
+from ultralytics.utils.checks import check_requirements
+
+check_requirements("shapely")
+from shapely.geometry import Polygon
+
+
+def bbox_iof(polygon1, bbox2, eps=1e-6):
+    """
+    Calculate IoF values: polygon/box intersection area divided by the polygon's own area.
+
+    Args:
+        polygon1 (np.ndarray): Polygon coordinates, 8 values per polygon; reshaped internally to (n, 4, 2).
+        bbox2 (np.ndarray): Axis-aligned boxes, 4 values each, read as (left, top, right, bottom).
+        eps (float): Lower bound applied to the polygon areas so the division never hits zero.
+
+    Returns:
+        (np.ndarray): IoF values indexed as [polygon, box].
+    """
+    quads = polygon1.reshape(-1, 4, 2)
+    # Axis-aligned hull of every polygon, used only as a cheap pre-filter
+    hulls = np.concatenate([np.min(quads, axis=-2), np.max(quads, axis=-2)], axis=-1)
+
+    top_left = np.maximum(hulls[:, None, :2], bbox2[..., :2])
+    bottom_right = np.minimum(hulls[:, None, 2:], bbox2[..., 2:])
+    extent = np.clip(bottom_right - top_left, 0, np.inf)
+    h_overlaps = extent[..., 0] * extent[..., 1]
+
+    # Express every box as a 4-corner polygon so shapely can intersect it with the quads
+    l, t, r, b = (bbox2[..., k] for k in range(4))
+    rects = np.stack([l, t, r, t, r, b, l, b], axis=-1).reshape(-1, 4, 2)
+
+    quad_shapes = [Polygon(q) for q in quads]
+    rect_shapes = [Polygon(q) for q in rects]
+
+    # Exact intersection is computed only for pairs whose axis-aligned hulls overlap
+    overlaps = np.zeros(h_overlaps.shape)
+    for pair in zip(*np.nonzero(h_overlaps)):
+        quad_shape, rect_shape = quad_shapes[pair[0]], rect_shapes[pair[-1]]
+        overlaps[pair] = quad_shape.intersection(rect_shape).area
+
+    quad_areas = np.array([s.area for s in quad_shapes], dtype=np.float32)[..., None]
+    quad_areas = np.clip(quad_areas, eps, np.inf)
+
+    outputs = overlaps / quad_areas
+    return outputs[..., None] if outputs.ndim == 1 else outputs
+
+
+def load_yolo_dota(data_root, split="train"):
+    """
+    Load the image sizes and labels of one DOTA split.
+
+    Args:
+        data_root (str): Data root.
+        split (str): The split data set, could be train or val.
+
+    Returns:
+        (list[dict]): One dict per file found in the image directory, with keys `ori_size` ((h, w) after EXIF
+            correction), `label` (np.float32 array holding one row per non-empty line of the label file) and
+            `filepath` (path of the image).
+
+    Raises:
+        AssertionError: If `split` is not 'train' or 'val', or the image directory does not exist.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    assert split in ["train", "val"]
+    im_dir = Path(data_root) / "images" / split
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(im_dir / "*"))
+    lb_files = img2label_paths(im_files)
+    annos = []
+    for im_file, lb_file in zip(im_files, lb_files):
+        # Only the header is needed for the size; close the handle right away instead of leaking one per image
+        with Image.open(im_file) as im:
+            w, h = exif_size(im)
+        with open(lb_file) as f:
+            lb = [x.split() for x in f.read().strip().splitlines() if x]
+        lb = np.array(lb, dtype=np.float32)
+        annos.append(dict(ori_size=(h, w), label=lb, filepath=im_file))
+    return annos
+
+
+def get_windows(im_size, crop_sizes=[1024], gaps=[200], im_rate_thr=0.6, eps=0.01):
+    """
+    Compute the sliding-window coordinates used to tile an image.
+
+    Args:
+        im_size (tuple): Original image size, (h, w).
+        crop_sizes (List(int)): Side length of the square windows, one entry per scale.
+        gaps (List(int)): Overlap between neighbouring windows, paired with `crop_sizes`; the stride of a scale
+            is `crop_size - gap`.
+        im_rate_thr (float): Minimum fraction of a window's area that must lie inside the image.
+        eps (float): Tolerance used to select the best-covered windows when none exceeds `im_rate_thr`.
+
+    Returns:
+        (np.ndarray): Kept windows, one row per window as (x_start, y_start, x_stop, y_stop).
+    """
+
+    def _axis_starts(length, crop_size, step):
+        """Return the window start offsets along one axis, snapping the last window back inside the image."""
+        count = 1 if length <= crop_size else ceil((length - crop_size) / step + 1)
+        starts = [step * k for k in range(count)]
+        if len(starts) > 1 and starts[-1] + crop_size > length:
+            starts[-1] = length - crop_size
+        return starts
+
+    h, w = im_size
+    per_scale = []
+    for crop_size, gap in zip(crop_sizes, gaps):
+        assert crop_size > gap, f"invalid crop_size gap pair [{crop_size} {gap}]"
+        step = crop_size - gap
+        xs = _axis_starts(w, crop_size, step)
+        ys = _axis_starts(h, crop_size, step)
+        start = np.array(list(itertools.product(xs, ys)), dtype=np.int64)
+        per_scale.append(np.concatenate([start, start + crop_size], axis=1))
+    windows = np.concatenate(per_scale, axis=0)
+
+    # Portion of each window that actually lies inside the image
+    inside = windows.copy()
+    inside[:, 0::2] = np.clip(inside[:, 0::2], 0, w)
+    inside[:, 1::2] = np.clip(inside[:, 1::2], 0, h)
+    im_areas = (inside[:, 2] - inside[:, 0]) * (inside[:, 3] - inside[:, 1])
+    win_areas = (windows[:, 2] - windows[:, 0]) * (windows[:, 3] - windows[:, 1])
+    im_rates = im_areas / win_areas
+
+    keep = im_rates > im_rate_thr
+    if not keep.any():
+        # Nothing passed the threshold: promote the windows within eps of the best coverage
+        im_rates[abs(im_rates - im_rates.max()) < eps] = 1
+        keep = im_rates > im_rate_thr
+    return windows[keep]
+
+
+def get_window_obj(anno, windows, iof_thr=0.7):
+    """
+    Select, for every window, the labels whose IoF with that window reaches `iof_thr`.
+
+    Args:
+        anno (dict): Annotation dict; `ori_size` ((h, w)) and `label` are read.
+        windows (np.ndarray): Window coordinates, one row per window.
+        iof_thr (float): Minimum IoF for a label to be assigned to a window.
+
+    Returns:
+        (list[np.ndarray]): One label array per window. Coordinates are scaled to pixels of the original image
+            and are not shifted to the window origin.
+    """
+    h, w = anno["ori_size"]
+    label = anno["label"]
+    if not len(label):
+        return [np.zeros((0, 9), dtype=np.float32) for _ in range(len(windows))]  # window_anns
+
+    # Scale a copy: scaling in place would leave anno["label"] in pixels and double-scale it on a second call
+    label = label.copy()
+    label[:, 1::2] *= w  # odd columns are scaled by the image width
+    label[:, 2::2] *= h  # even columns (from 2) are scaled by the image height
+    iofs = bbox_iof(label[:, 1:], windows)
+    # Unnormalized and misaligned coordinates
+    return [label[iofs[:, i] >= iof_thr] for i in range(len(windows))]  # window_anns
+
+
+def crop_and_save(anno, windows, window_objs, im_dir, lb_dir):
+    """
+    Write one image patch per window and, for windows that contain objects, a matching label file.
+
+    Args:
+        anno (dict): Annotation dict, including `filepath`, `label`, `ori_size` as its keys.
+        windows (list): A list of windows coordinates.
+        window_objs (list): A list of labels inside each window; non-empty arrays are rewritten in place to
+            patch-relative, normalized coordinates.
+        im_dir (str): The output directory path of images.
+        lb_dir (str): The output directory path of labels.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    source_path = anno["filepath"]
+    im = cv2.imread(source_path)
+    name = Path(source_path).stem
+    image_root, label_root = Path(im_dir), Path(lb_dir)
+
+    for idx, window in enumerate(windows):
+        x_start, y_start, x_stop, y_stop = window.tolist()
+        new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+        patch_im = im[y_start:y_stop, x_start:x_stop]
+        ph, pw = patch_im.shape[:2]
+
+        # The patch is written even when the window holds no objects
+        cv2.imwrite(str(image_root / f"{new_name}.jpg"), patch_im)
+
+        label = window_objs[idx]
+        if not len(label):
+            continue
+
+        # Shift to the patch origin, then normalize by the actual patch size
+        label[:, 1::2] -= x_start
+        label[:, 1::2] /= pw
+        label[:, 2::2] -= y_start
+        label[:, 2::2] /= ph
+
+        with open(label_root / f"{new_name}.txt", "w") as f:
+            for row in label:
+                coords = " ".join(f"{coord:.6g}" for coord in row[1:])
+                f.write(f"{int(row[0])} {coords}\n")
+
+def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=[1024], gaps=[200]):
+    """
+    Tile every image of one split into windows and save the patches with their labels.
+
+    Args:
+        data_root (str): Root directory of the source dataset.
+        save_dir (str): Root directory the patches and labels are written to.
+        split (str): Name of the split to process.
+        crop_sizes (List(int)): Window sizes forwarded to `get_windows`.
+        gaps (List(int)): Window overlaps forwarded to `get_windows`.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - split
+                - labels
+                    - split
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - split
+                - labels
+                    - split
+    """
+    out_root = Path(save_dir)
+    im_dir = out_root / "images" / split
+    im_dir.mkdir(parents=True, exist_ok=True)
+    lb_dir = out_root / "labels" / split
+    lb_dir.mkdir(parents=True, exist_ok=True)
+    im_out, lb_out = str(im_dir), str(lb_dir)
+
+    annos = load_yolo_dota(data_root, split=split)
+    for anno in tqdm(annos, total=len(annos), desc=split):
+        windows = get_windows(anno["ori_size"], crop_sizes, gaps)
+        crop_and_save(anno, windows, get_window_obj(anno, windows), im_out, lb_out)
+
+
+def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+    """
+    Split the train and val sets of DOTA into windowed patches.
+
+    Args:
+        data_root (str): Root directory of the source dataset.
+        save_dir (str): Root directory the patches and labels are written to.
+        crop_size (int): Base window size.
+        gap (int): Base overlap between windows.
+        rates (List(float)): Scale rates; each one divides `crop_size` and `gap` to produce one window scale.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    # One (crop_size, gap) pair per rate, truncated to int
+    scaled = [(int(crop_size / r), int(gap / r)) for r in rates]
+    crop_sizes = [size for size, _ in scaled]
+    gaps = [overlap for _, overlap in scaled]
+    for split in ("train", "val"):
+        split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)
+
+
+def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+    """
+    Split test set of DOTA, labels are not included within this set.
+
+    Args:
+        data_root (str): Root directory of the source dataset.
+        save_dir (str): Root directory the patches are written to.
+        crop_size (int): Base window size.
+        gap (int): Base overlap between windows.
+        rates (List(float)): Scale rates; each one divides `crop_size` and `gap` to produce one window scale.
+
+    Raises:
+        AssertionError: If the test image directory does not exist.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - test
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - test
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    save_dir = Path(save_dir) / "images" / "test"
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    im_dir = Path(data_root) / "images" / "test"
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(im_dir / "*"))
+    for im_file in tqdm(im_files, total=len(im_files), desc="test"):
+        # Only the header is needed for the size; close the handle right away instead of leaking one per image
+        with Image.open(im_file) as pil_im:
+            w, h = exif_size(pil_im)
+        windows = get_windows((h, w), crop_sizes=crop_sizes, gaps=gaps)
+        im = cv2.imread(im_file)
+        name = Path(im_file).stem
+        for window in windows:
+            x_start, y_start, x_stop, y_stop = window.tolist()
+            new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+            patch_im = im[y_start:y_stop, x_start:x_stop]
+            cv2.imwrite(str(save_dir / f"{new_name}.jpg"), patch_im)
+
+
+if __name__ == "__main__":
+    # Script entry point: tile the train/val splits (images and labels), then the label-free test split.
+    # Both calls read from ./DOTAv2 and write to ./DOTAv2-split, relative to the working directory.
+    split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split")
+    split_test(data_root="DOTAv2", save_dir="DOTAv2-split")
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@ -17,36 +17,47 @@ import numpy as np
 from PIL import Image, ImageOps

 from ultralytics.nn.autobackend import check_class_names
-from ultralytics.utils import (DATASETS_DIR, LOGGER, NUM_THREADS, ROOT, SETTINGS_YAML, TQDM, clean_url, colorstr,
-                               emojis, yaml_load)
+from ultralytics.utils import (
+    DATASETS_DIR,
+    LOGGER,
+    NUM_THREADS,
+    ROOT,
+    SETTINGS_YAML,
+    TQDM,
+    clean_url,
+    colorstr,
+    emojis,
+    yaml_load,
+    yaml_save,
+)
 from ultralytics.utils.checks import check_file, check_font, is_ascii
 from ultralytics.utils.downloads import download, safe_download, unzip_file
 from ultralytics.utils.ops import segments2boxes

-HELP_URL = 'See https://docs.ultralytics.com/datasets/detect for dataset formatting guidance.'
-IMG_FORMATS = 'bmp', 'dng', 'jpeg', 'jpg', 'mpo', 'png', 'tif', 'tiff', 'webp', 'pfm'  # image suffixes
-VID_FORMATS = 'asf', 'avi', 'gif', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'ts', 'wmv', 'webm'  # video suffixes
-PIN_MEMORY = str(os.getenv('PIN_MEMORY', True)).lower() == 'true'  # global pin_memory for dataloaders
+HELP_URL = "See https://docs.ultralytics.com/datasets/detect for dataset formatting guidance."
+IMG_FORMATS = {"bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"}  # image suffixes
+VID_FORMATS = {"asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv", "webm"}  # video suffixes
+PIN_MEMORY = str(os.getenv("PIN_MEMORY", True)).lower() == "true"  # global pin_memory for dataloaders


 def img2label_paths(img_paths):
    """Define label paths as a function of image paths."""
-    sa, sb = f'{os.sep}images{os.sep}', f'{os.sep}labels{os.sep}'  # /images/, /labels/ substrings
-    return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths]
+    sa, sb = f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}"  # /images/, /labels/ substrings
+    return [sb.join(x.rsplit(sa, 1)).rsplit(".", 1)[0] + ".txt" for x in img_paths]


 def get_hash(paths):
    """Returns a single hash value of a list of paths (files or dirs)."""
    size = sum(os.path.getsize(p) for p in paths if os.path.exists(p))  # sizes
    h = hashlib.sha256(str(size).encode())  # hash sizes
-    h.update(''.join(paths).encode())  # hash paths
+    h.update("".join(paths).encode())  # hash paths
    return h.hexdigest()  # return hash


 def exif_size(img: Image.Image):
    """Returns exif-corrected PIL size."""
    s = img.size  # (width, height)
-    if img.format == 'JPEG':  # only support JPEG images
+    if img.format == "JPEG":  # only support JPEG images
        with contextlib.suppress(Exception):
            exif = img.getexif()
            if exif:
@ -60,24 +71,24 @@ def verify_image(args):
    """Verify one image."""
    (im_file, cls), prefix = args
    # Number (found, corrupt), message
-    nf, nc, msg = 0, 0, ''
+    nf, nc, msg = 0, 0, ""
    try:
        im = Image.open(im_file)
        im.verify()  # PIL verify
        shape = exif_size(im)  # image size
        shape = (shape[1], shape[0])  # hw
-        assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
-        assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
-        if im.format.lower() in ('jpg', 'jpeg'):
-            with open(im_file, 'rb') as f:
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
+        if im.format.lower() in ("jpg", "jpeg"):
+            with open(im_file, "rb") as f:
                f.seek(-2, 2)
-                if f.read() != b'\xff\xd9':  # corrupt JPEG
-                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
-                    msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
        nf = 1
    except Exception as e:
        nc = 1
-        msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
    return (im_file, cls), nf, nc, msg


@ -85,21 +96,21 @@ def verify_image_label(args):
    """Verify one image-label pair."""
    im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
    # Number (missing, found, empty, corrupt), message, segments, keypoints
-    nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, '', [], None
+    nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, "", [], None
    try:
        # Verify images
        im = Image.open(im_file)
        im.verify()  # PIL verify
        shape = exif_size(im)  # image size
        shape = (shape[1], shape[0])  # hw
-        assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
-        assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
-        if im.format.lower() in ('jpg', 'jpeg'):
-            with open(im_file, 'rb') as f:
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
+        if im.format.lower() in ("jpg", "jpeg"):
+            with open(im_file, "rb") as f:
                f.seek(-2, 2)
-                if f.read() != b'\xff\xd9':  # corrupt JPEG
-                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
-                    msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"

        # Verify labels
        if os.path.isfile(lb_file):
@ -114,32 +125,32 @@ def verify_image_label(args):
            nl = len(lb)
            if nl:
                if keypoint:
-                    assert lb.shape[1] == (5 + nkpt * ndim), f'labels require {(5 + nkpt * ndim)} columns each'
-                    assert (lb[:, 5::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
-                    assert (lb[:, 6::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
+                    assert lb.shape[1] == (5 + nkpt * ndim), f"labels require {(5 + nkpt * ndim)} columns each"
+                    points = lb[:, 5:].reshape(-1, ndim)[:, :2]
                else:
-                    assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
-                    assert (lb[:, 1:] <= 1).all(), \
-                        f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
-                    assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
+                    assert lb.shape[1] == 5, f"labels require 5 columns, {lb.shape[1]} columns detected"
+                    points = lb[:, 1:]
+                assert points.max() <= 1, f"non-normalized or out of bounds coordinates {points[points > 1]}"
+                assert lb.min() >= 0, f"negative label values {lb[lb < 0]}"
+
                # All labels
-                max_cls = int(lb[:, 0].max())  # max label count
-                assert max_cls <= num_cls, \
-                    f'Label class {max_cls} exceeds dataset class count {num_cls}. ' \
-                    f'Possible class labels are 0-{num_cls - 1}'
+                max_cls = lb[:, 0].max()  # max label count
+                assert max_cls <= num_cls, (
+                    f"Label class {int(max_cls)} exceeds dataset class count {num_cls}. "
+                    f"Possible class labels are 0-{num_cls - 1}"
+                )
                _, i = np.unique(lb, axis=0, return_index=True)
                if len(i) < nl:  # duplicate row check
                    lb = lb[i]  # remove duplicates
                    if segments:
                        segments = [segments[x] for x in i]
-                    msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed"
            else:
                ne = 1  # label empty
-                lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros(
-                    (0, 5), dtype=np.float32)
+                lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
        else:
            nm = 1  # label missing
-            lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
+            # Gate on the `keypoint` flag: `keypoints` is still None here, which always produced 5 columns
+            lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
        if keypoint:
            keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
            if ndim == 2:
@ -149,42 +160,56 @@ def verify_image_label(args):
        return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
    except Exception as e:
        nc = 1
-        msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
        return [None, None, None, None, None, nm, nf, ne, nc, msg]


 def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
    """
+    Convert a list of polygons to a binary mask of the specified image size.
+
    Args:
-        imgsz (tuple): The image size.
-        polygons (list[np.ndarray]): [N, M], N is the number of polygons, M is the number of points(Be divided by 2).
-        color (int): color
-        downsample_ratio (int): downsample ratio
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
+                                     N is the number of polygons, and M is the number of points such that M % 2 = 0.
+        color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
+        downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A binary mask of the specified image size with the polygons filled in.
    """
    mask = np.zeros(imgsz, dtype=np.uint8)
    polygons = np.asarray(polygons, dtype=np.int32)
    polygons = polygons.reshape((polygons.shape[0], -1, 2))
    cv2.fillPoly(mask, polygons, color=color)
    nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
-    # NOTE: fillPoly first then resize is trying to keep the same way of loss calculation when mask-ratio=1.
+    # Note: fillPoly first then resize is trying to keep the same loss calculation method when mask-ratio=1
    return cv2.resize(mask, (nw, nh))


 def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
    """
+    Convert a list of polygons to a set of binary masks of the specified image size.
+
    Args:
-        imgsz (tuple): The image size.
-        polygons (list[np.ndarray]): each polygon is [N, M], N is number of polygons, M is number of points (M % 2 = 0)
-        color (int): color
-        downsample_ratio (int): downsample ratio
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
+                                     N is the number of polygons, and M is the number of points such that M % 2 = 0.
+        color (int): The color value to fill in the polygons on the masks.
+        downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
    """
    return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])


 def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
    """Return a (640, 640) overlap mask."""
-    masks = np.zeros((imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
-                     dtype=np.int32 if len(segments) > 255 else np.uint8)
+    masks = np.zeros(
+        (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
+        dtype=np.int32 if len(segments) > 255 else np.uint8,
+    )
    areas = []
    ms = []
    for si in range(len(segments)):
@ -206,7 +231,7 @@ def find_dataset_yaml(path: Path) -> Path:
    Find and return the YAML file associated with a Detect, Segment or Pose dataset.

    This function searches for a YAML file at the root level of the provided directory first, and if not found, it
-    performs a recursive search. It prefers YAML files that have the samestem as the provided path. An AssertionError
+    performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
    is raised if no YAML file is found or if multiple YAML files are found.

    Args:
@ -215,7 +240,7 @@ def find_dataset_yaml(path: Path) -> Path:
    Returns:
        (Path): The path of the found YAML file.
    """
-    files = list(path.glob('*.yaml')) or list(path.rglob('*.yaml'))  # try root level first and then recursive
+    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))  # try root level first and then recursive
    assert files, f"No YAML file found in '{path.resolve()}'"
    if len(files) > 1:
        files = [f for f in files if f.stem == path.stem]  # prefer *.yaml files that match
@ -239,57 +264,57 @@ def check_det_dataset(dataset, autodownload=True):
        (dict): Parsed dataset information and paths.
    """

-    data = check_file(dataset)
+    file = check_file(dataset)

    # Download (optional)
-    extract_dir = ''
-    if isinstance(data, (str, Path)) and (zipfile.is_zipfile(data) or is_tarfile(data)):
-        new_dir = safe_download(data, dir=DATASETS_DIR, unzip=True, delete=False)
-        data = find_dataset_yaml(DATASETS_DIR / new_dir)
-        extract_dir, autodownload = data.parent, False
+    extract_dir = ""
+    if zipfile.is_zipfile(file) or is_tarfile(file):
+        new_dir = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
+        file = find_dataset_yaml(DATASETS_DIR / new_dir)
+        extract_dir, autodownload = file.parent, False

-    # Read YAML (optional)
-    if isinstance(data, (str, Path)):
-        data = yaml_load(data, append_filename=True)  # dictionary
+    # Read YAML
+    data = yaml_load(file, append_filename=True)  # dictionary

    # Checks
-    for k in 'train', 'val':
+    for k in "train", "val":
        if k not in data:
-            if k == 'val' and 'validation' in data:
-                LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
-                data['val'] = data.pop('validation')  # replace 'validation' key with 'val' key
-            else:
+            if k != "val" or "validation" not in data:
                raise SyntaxError(
-                    emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs."))
-    if 'names' not in data and 'nc' not in data:
+                    emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs.")
+                )
+            LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
+            data["val"] = data.pop("validation")  # replace 'validation' key with 'val' key
+    if "names" not in data and "nc" not in data:
        raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
-    if 'names' in data and 'nc' in data and len(data['names']) != data['nc']:
+    if "names" in data and "nc" in data and len(data["names"]) != data["nc"]:
        raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
-    if 'names' not in data:
-        data['names'] = [f'class_{i}' for i in range(data['nc'])]
+    if "names" not in data:
+        data["names"] = [f"class_{i}" for i in range(data["nc"])]
    else:
-        data['nc'] = len(data['names'])
+        data["nc"] = len(data["names"])

-    data['names'] = check_class_names(data['names'])
+    data["names"] = check_class_names(data["names"])

    # Resolve paths
-    path = Path(extract_dir or data.get('path') or Path(data.get('yaml_file', '')).parent)  # dataset root
-
+    path = Path(extract_dir or data.get("path") or Path(data.get("yaml_file", "")).parent)  # dataset root
    if not path.is_absolute():
        path = (DATASETS_DIR / path).resolve()
-    data['path'] = path  # download scripts
-    for k in 'train', 'val', 'test':
+
+    # Set paths
+    data["path"] = path  # download scripts
+    for k in "train", "val", "test":
        if data.get(k):  # prepend path
            if isinstance(data[k], str):
                x = (path / data[k]).resolve()
-                if not x.exists() and data[k].startswith('../'):
+                if not x.exists() and data[k].startswith("../"):
                    x = (path / data[k][3:]).resolve()
                data[k] = str(x)
            else:
                data[k] = [str((path / x).resolve()) for x in data[k]]

    # Parse YAML
-    train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
+    val, s = (data.get(x) for x in ("val", "download"))
    if val:
        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
        if not all(x.exists() for x in val):
@@ -302,22 +327,22 @@ def check_det_dataset(dataset, autodownload=True):
                raise FileNotFoundError(m)
            t = time.time()
            r = None  # success
-            if s.startswith('http') and s.endswith('.zip'):  # URL
+            if s.startswith("http") and s.endswith(".zip"):  # URL
                safe_download(url=s, dir=DATASETS_DIR, delete=True)
-            elif s.startswith('bash '):  # bash script
-                LOGGER.info(f'Running {s} ...')
+            elif s.startswith("bash "):  # bash script
+                LOGGER.info(f"Running {s} ...")
                r = os.system(s)
            else:  # python script
-                exec(s, {'yaml': data})
-            dt = f'({round(time.time() - t, 1)}s)'
-            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f'failure {dt} ❌'
-            LOGGER.info(f'Dataset download {s}\n')
-    check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf')  # download fonts
+                exec(s, {"yaml": data})
+            dt = f"({round(time.time() - t, 1)}s)"
+            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt} ❌"
+            LOGGER.info(f"Dataset download {s}\n")
+    check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf")  # download fonts

    return data  # dictionary


-def check_cls_dataset(dataset, split=''):
+def check_cls_dataset(dataset, split=""):
    """
    Checks a classification dataset such as Imagenet.

@@ -338,54 +363,62 @@ def check_cls_dataset(dataset, split=''):
    """

    # Download (optional if dataset=https://file.zip is passed directly)
-    if str(dataset).startswith(('http:/', 'https:/')):
+    if str(dataset).startswith(("http:/", "https:/")):
        dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
+    elif Path(dataset).suffix in (".zip", ".tar", ".gz"):
+        file = check_file(dataset)
+        dataset = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)

    dataset = Path(dataset)
    data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
    if not data_dir.is_dir():
-        LOGGER.warning(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
+        LOGGER.warning(f"\nDataset not found ⚠️, missing path {data_dir}, attempting download...")
        t = time.time()
-        if str(dataset) == 'imagenet':
+        if str(dataset) == "imagenet":
            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
        else:
-            url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
+            url = f"https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip"
            download(url, dir=data_dir.parent)
        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
        LOGGER.info(s)
-    train_set = data_dir / 'train'
-    val_set = data_dir / 'val' if (data_dir / 'val').exists() else data_dir / 'validation' if \
-        (data_dir / 'validation').exists() else None  # data/test or data/val
-    test_set = data_dir / 'test' if (data_dir / 'test').exists() else None  # data/val or data/test
-    if split == 'val' and not val_set:
+    train_set = data_dir / "train"
+    val_set = (
+        data_dir / "val"
+        if (data_dir / "val").exists()
+        else data_dir / "validation"
+        if (data_dir / "validation").exists()
+        else None
+    )  # data/test or data/val
+    test_set = data_dir / "test" if (data_dir / "test").exists() else None  # data/val or data/test
+    if split == "val" and not val_set:
        LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
-    elif split == 'test' and not test_set:
+    elif split == "test" and not test_set:
        LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")

-    nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()])  # number of classes
-    names = [x.name for x in (data_dir / 'train').iterdir() if x.is_dir()]  # class names list
+    nc = len([x for x in (data_dir / "train").glob("*") if x.is_dir()])  # number of classes
+    names = [x.name for x in (data_dir / "train").iterdir() if x.is_dir()]  # class names list
    names = dict(enumerate(sorted(names)))

    # Print to console
-    for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
+    for k, v in {"train": train_set, "val": val_set, "test": test_set}.items():
        prefix = f'{colorstr(f"{k}:")} {v}...'
        if v is None:
            LOGGER.info(prefix)
        else:
-            files = [path for path in v.rglob('*.*') if path.suffix[1:].lower() in IMG_FORMATS]
+            files = [path for path in v.rglob("*.*") if path.suffix[1:].lower() in IMG_FORMATS]
            nf = len(files)  # number of files
            nd = len({file.parent for file in files})  # number of directories
            if nf == 0:
-                if k == 'train':
+                if k == "train":
                    raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
                else:
-                    LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found')
+                    LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found")
            elif nd != nc:
-                LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}')
+                LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}")
            else:
-                LOGGER.info(f'{prefix} found {nf} images in {nd} classes ✅ ')
+                LOGGER.info(f"{prefix} found {nf} images in {nd} classes ✅ ")

-    return {'train': train_set, 'val': val_set, 'test': test_set, 'nc': nc, 'names': names}
+    return {"train": train_set, "val": val_set, "test": test_set, "nc": nc, "names": names}


 class HUBDatasetStats:
@@ -393,7 +426,7 @@ class HUBDatasetStats:
    A class for generating HUB dataset JSON and `-hub` dataset directory.

    Args:
-        path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco128.yaml'.
+        path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco8.yaml'.
        task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
        autodownload (bool): Attempt to download dataset if not found locally. Default is False.

@@ -413,39 +446,42 @@ class HUBDatasetStats:
        ```
    """

-    def __init__(self, path='coco128.yaml', task='detect', autodownload=False):
+    def __init__(self, path="coco8.yaml", task="detect", autodownload=False):
        """Initialize class."""
        path = Path(path).resolve()
-        LOGGER.info(f'Starting HUB dataset checks for {path}....')
+        LOGGER.info(f"Starting HUB dataset checks for {path}....")

        self.task = task  # detect, segment, pose, classify
-        if self.task == 'classify':
+        if self.task == "classify":
            unzip_dir = unzip_file(path)
            data = check_cls_dataset(unzip_dir)
-            data['path'] = unzip_dir
+            data["path"] = unzip_dir
        else:  # detect, segment, pose
-            zipped, data_dir, yaml_path = self._unzip(Path(path))
+            _, data_dir, yaml_path = self._unzip(Path(path))
            try:
-                # data = yaml_load(check_yaml(yaml_path))  # data dict
-                data = check_det_dataset(yaml_path, autodownload)  # data dict
-                if zipped:
-                    data['path'] = data_dir
+                # Load YAML with checks
+                data = yaml_load(yaml_path)
+                data["path"] = ""  # strip path since YAML should be in dataset root for all HUB datasets
+                yaml_save(yaml_path, data)
+                data = check_det_dataset(yaml_path, autodownload)  # dict
+                data["path"] = data_dir  # YAML path should be set to '' (relative) or parent (absolute)
            except Exception as e:
-                raise Exception('error/HUB/dataset_stats/init') from e
+                raise Exception("error/HUB/dataset_stats/init") from e

        self.hub_dir = Path(f'{data["path"]}-hub')
-        self.im_dir = self.hub_dir / 'images'
-        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
-        self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())}  # statistics dictionary
+        self.im_dir = self.hub_dir / "images"
+        self.stats = {"nc": len(data["names"]), "names": list(data["names"].values())}  # statistics dictionary
        self.data = data

-    def _unzip(self, path):
+    @staticmethod
+    def _unzip(path):
        """Unzip data.zip."""
-        if not str(path).endswith('.zip'):  # path is data.yaml
+        if not str(path).endswith(".zip"):  # path is data.yaml
            return False, None, path
        unzip_dir = unzip_file(path, path=path.parent)
-        assert unzip_dir.is_dir(), f'Error unzipping {path}, {unzip_dir} not found. ' \
-                                   f'path/to/abc.zip MUST unzip to path/to/abc/'
+        assert unzip_dir.is_dir(), (
+            f"Error unzipping {path}, {unzip_dir} not found. " f"path/to/abc.zip MUST unzip to path/to/abc/"
+        )
        return True, str(unzip_dir), find_dataset_yaml(unzip_dir)  # zipped, data_dir, yaml_path

    def _hub_ops(self, f):
@@ -457,31 +493,31 @@ class HUBDatasetStats:

        def _round(labels):
            """Update labels to integer class and 4 decimal place floats."""
-            if self.task == 'detect':
-                coordinates = labels['bboxes']
-            elif self.task == 'segment':
-                coordinates = [x.flatten() for x in labels['segments']]
-            elif self.task == 'pose':
-                n = labels['keypoints'].shape[0]
-                coordinates = np.concatenate((labels['bboxes'], labels['keypoints'].reshape(n, -1)), 1)
+            if self.task == "detect":
+                coordinates = labels["bboxes"]
+            elif self.task == "segment":
+                coordinates = [x.flatten() for x in labels["segments"]]
+            elif self.task == "pose":
+                n = labels["keypoints"].shape[0]
+                coordinates = np.concatenate((labels["bboxes"], labels["keypoints"].reshape(n, -1)), 1)
            else:
-                raise ValueError('Undefined dataset task.')
-            zipped = zip(labels['cls'], coordinates)
+                raise ValueError("Undefined dataset task.")
+            zipped = zip(labels["cls"], coordinates)
            return [[int(c[0]), *(round(float(x), 4) for x in points)] for c, points in zipped]

-        for split in 'train', 'val', 'test':
+        for split in "train", "val", "test":
            self.stats[split] = None  # predefine
            path = self.data.get(split)

            # Check split
            if path is None:  # no split
                continue
-            files = [f for f in Path(path).rglob('*.*') if f.suffix[1:].lower() in IMG_FORMATS]  # image files in split
+            files = [f for f in Path(path).rglob("*.*") if f.suffix[1:].lower() in IMG_FORMATS]  # image files in split
            if not files:  # no images
                continue

            # Get dataset statistics
-            if self.task == 'classify':
+            if self.task == "classify":
                from torchvision.datasets import ImageFolder

                dataset = ImageFolder(self.data[split])
@@ -491,41 +527,36 @@ class HUBDatasetStats:
                    x[im[1]] += 1

                self.stats[split] = {
-                    'instance_stats': {
-                        'total': len(dataset),
-                        'per_class': x.tolist()},
-                    'image_stats': {
-                        'total': len(dataset),
-                        'unlabelled': 0,
-                        'per_class': x.tolist()},
-                    'labels': [{
-                        Path(k).name: v} for k, v in dataset.imgs]}
+                    "instance_stats": {"total": len(dataset), "per_class": x.tolist()},
+                    "image_stats": {"total": len(dataset), "unlabelled": 0, "per_class": x.tolist()},
+                    "labels": [{Path(k).name: v} for k, v in dataset.imgs],
+                }
            else:
                from ultralytics.data import YOLODataset

-                dataset = YOLODataset(img_path=self.data[split],
-                                      data=self.data,
-                                      use_segments=self.task == 'segment',
-                                      use_keypoints=self.task == 'pose')
-                x = np.array([
-                    np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
-                    for label in TQDM(dataset.labels, total=len(dataset), desc='Statistics')])  # shape(128x80)
+                dataset = YOLODataset(img_path=self.data[split], data=self.data, task=self.task)
+                x = np.array(
+                    [
+                        np.bincount(label["cls"].astype(int).flatten(), minlength=self.data["nc"])
+                        for label in TQDM(dataset.labels, total=len(dataset), desc="Statistics")
+                    ]
+                )  # shape(128x80)
                self.stats[split] = {
-                    'instance_stats': {
-                        'total': int(x.sum()),
-                        'per_class': x.sum(0).tolist()},
-                    'image_stats': {
-                        'total': len(dataset),
-                        'unlabelled': int(np.all(x == 0, 1).sum()),
-                        'per_class': (x > 0).sum(0).tolist()},
-                    'labels': [{
-                        Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)]}
+                    "instance_stats": {"total": int(x.sum()), "per_class": x.sum(0).tolist()},
+                    "image_stats": {
+                        "total": len(dataset),
+                        "unlabelled": int(np.all(x == 0, 1).sum()),
+                        "per_class": (x > 0).sum(0).tolist(),
+                    },
+                    "labels": [{Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)],
+                }

        # Save, print and return
        if save:
-            stats_path = self.hub_dir / 'stats.json'
-            LOGGER.info(f'Saving {stats_path.resolve()}...')
-            with open(stats_path, 'w') as f:
+            self.hub_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/
+            stats_path = self.hub_dir / "stats.json"
+            LOGGER.info(f"Saving {stats_path.resolve()}...")
+            with open(stats_path, "w") as f:
                json.dump(self.stats, f)  # save stats.json
        if verbose:
            LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
@@ -535,22 +566,23 @@ class HUBDatasetStats:
        """Compress images for Ultralytics HUB."""
        from ultralytics.data import YOLODataset  # ClassificationDataset

-        for split in 'train', 'val', 'test':
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/images/
+        for split in "train", "val", "test":
            if self.data.get(split) is None:
                continue
            dataset = YOLODataset(img_path=self.data[split], data=self.data)
            with ThreadPool(NUM_THREADS) as pool:
-                for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f'{split} images'):
+                for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f"{split} images"):
                    pass
-        LOGGER.info(f'Done. All images saved to {self.im_dir}')
+        LOGGER.info(f"Done. All images saved to {self.im_dir}")
        return self.im_dir


 def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
    """
-    Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the
-    Python Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will
-    not be resized.
+    Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the Python
+    Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will not be
+    resized.

    Args:
        f (str): The path to the input image file.
@@ -573,9 +605,9 @@ def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
        r = max_dim / max(im.height, im.width)  # ratio
        if r < 1.0:  # image too large
            im = im.resize((int(im.width * r), int(im.height * r)))
-        im.save(f_new or f, 'JPEG', quality=quality, optimize=True)  # save
+        im.save(f_new or f, "JPEG", quality=quality, optimize=True)  # save
    except Exception as e:  # use OpenCV
-        LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
+        LOGGER.info(f"WARNING ⚠️ HUB ops PIL failure {f}: {e}")
        im = cv2.imread(f)
        im_height, im_width = im.shape[:2]
        r = max_dim / max(im_height, im_width)  # ratio
@@ -584,7 +616,7 @@ def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
        cv2.imwrite(str(f_new or f), im)


-def autosplit(path=DATASETS_DIR / 'coco8/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
+def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annotated_only=False):
    """
    Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.

@@ -602,18 +634,18 @@ def autosplit(path=DATASETS_DIR / 'coco8/images', weights=(0.9, 0.1, 0.0), annot
    """

    path = Path(path)  # images dir
-    files = sorted(x for x in path.rglob('*.*') if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
+    files = sorted(x for x in path.rglob("*.*") if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
    n = len(files)  # number of files
    random.seed(0)  # for reproducibility
    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split

-    txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt']  # 3 txt files
+    txt = ["autosplit_train.txt", "autosplit_val.txt", "autosplit_test.txt"]  # 3 txt files
    for x in txt:
        if (path.parent / x).exists():
            (path.parent / x).unlink()  # remove existing

-    LOGGER.info(f'Autosplitting images from {path}' + ', using *.txt labeled images only' * annotated_only)
+    LOGGER.info(f"Autosplitting images from {path}" + ", using *.txt labeled images only" * annotated_only)
    for i, img in TQDM(zip(indices, files), total=n):
        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
-            with open(path.parent / txt[i], 'a') as f:
-                f.write(f'./{img.relative_to(path.parent).as_posix()}' + '\n')  # add image to txt file
+            with open(path.parent / txt[i], "a") as f:
+                f.write(f"./{img.relative_to(path.parent).as_posix()}" + "\n")  # add image to txt file