Why Random Baseline?

Given a standard dataset and a fixed set of models, it is easy to compare the models' performance against each other. But what if we are working on a new model whose performance is still far from that of the best models and, as a first step, we simply want to check whether the model is learning anything at all? In such cases, it is useful to compare the model's performance with a random baseline.

Proposed Idea

To formalize the problem, suppose that, for an arbitrary image, the model predicts \(k\) bounding boxes with sizes \((h_1, w_1), (h_2, w_2), \ldots, (h_k, w_k)\).
A simple random baseline would be to generate \(k\) random bounding boxes for that image with sizes \((h_1, w_1), (h_2, w_2), \ldots, (h_k, w_k)\). In other words, we can simply move the predicted bounding boxes to random locations ensuring that the bounding boxes are within the image.

Imports

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
from tqdm.notebook import tqdm
import supervision as sv
from roboflow import Roboflow
from dotenv import load_dotenv
from ultralytics import YOLO
from copy import deepcopy
from PIL import Image
from IPython.display import clear_output

load_dotenv()

True

Dataset

# Local directory the dataset is downloaded into; later cells build paths from it.
data_location = "/tmp/poker-cards-fmjio"

# The API key is read from the ROBOFLOW_API_KEY environment variable.
rf = Roboflow(api_key=os.getenv("ROBOFLOW_API_KEY"))
project = rf.workspace("roboflow-jvuqo").project("poker-cards-fmjio")
version = project.version(4)
# Download version 4 of the project in "yolov8" format into `data_location`.
dataset = version.download("yolov8", location=data_location)

loading Roboflow workspace...
loading Roboflow project...

Train Model

# Start from the "yolo11m" weights and fine-tune on the downloaded dataset.
model = YOLO("yolo11m")

# A single epoch is enough here: the goal is only to check whether the model
# learns anything at all, not to reach a competitive score.
# Training runs are written under `data_location` (previously the same path was
# repeated as a string literal, which could silently diverge from the dataset
# location). NOTE(review): exist_ok=True presumably reuses an existing run
# directory instead of creating a new one — confirm against the Ultralytics docs.
model.train(data=f"{data_location}/data.yaml", epochs=1, project=data_location, exist_ok=True)
# Drop the verbose training log from the notebook cell output.
clear_output()

Evaluate Model

# Load the test split from its images directory, labels directory and data.yaml.
test_dataset = sv.DetectionDataset.from_yolo(f"{data_location}/test/images", f"{data_location}/test/labels", f"{data_location}/data.yaml")
# Notebook display: number of entries in the test split.
len(test_dataset)

# Run the trained model on every test image and keep ground truth and
# predictions in two index-aligned lists (entry i of each belongs to image i).
annotations_list, detections_list = [], []
for _, img, annotations in tqdm(test_dataset):
    # `predict` returns a list of results; a single image yields one entry.
    results = model.predict(source=img, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(results)
    annotations_list.append(annotations)
    detections_list.append(detections)

# Score the model's detections (predictions) against the ground-truth
# annotations (targets) over the whole test split.
mAP = sv.metrics.MeanAveragePrecision().update(detections_list, annotations_list).compute()
# Notebook display: the `map50` value of the computed result.
mAP.map50

0.1417744455736285

Random Baseline

As per our proposed idea, we simply need to move the existing bounding boxes to random locations while keeping their sizes constant, subject to the following constraint:

The bounding box should be within the image boundaries.

# Random baseline: relocate every predicted box (size, class and confidence
# unchanged) and score the relocated boxes against the ground truth.
# Coordinates are kept inside (min_size, max_size) on both axes.
# NOTE(review): this assumes the detections live in a square frame of side
# `imgsz`; confirm the test images actually have that size.
min_size = 0
max_size = model.args['imgsz']

mAPs = []
for random_seed in tqdm(range(100)):
    # One reproducible trial per seed.
    np.random.seed(random_seed)
    random_detections_list = []
    for detections in detections_list:
        random_detections = deepcopy(detections)
        # One uniform sample in [0, 1) per box.
        shift = np.random.rand(len(detections))
        # Smallest / largest offset that keeps all four coordinates of a box
        # strictly inside the frame (1e-6 keeps them off the border).
        lower_limit = min_size - detections.xyxy.min(axis=1) + 1e-6
        upper_limit = max_size - detections.xyxy.max(axis=1) - 1e-6
        transformed_shift = lower_limit + shift * (upper_limit - lower_limit)
        # The same offset is broadcast to x1, y1, x2 and y2, so each box keeps
        # its size and slides along the image diagonal.
        random_detections.xyxy = random_detections.xyxy + transformed_shift.reshape(-1, 1)
        random_detections_list.append(random_detections)

    mAP = sv.metrics.MeanAveragePrecision().update(random_detections_list, annotations_list).compute()
    mAPs.append(mAP.map50)

# Report the mean over all seeds (not just the last trial) together with the
# spread, so the two numbers describe the same set of runs.
print(f"mAP50: {np.mean(mAPs):.2f} +/- {np.std(mAPs):.2f}")

mAP50: 0.04 +/- 0.01

We can also randomize the confidence values of the relocated boxes, so that the baseline does not reuse the model's own confidence ranking.

# Random baseline with random confidences: relocate every predicted box (size
# and class unchanged) and replace its confidence with a uniform random value,
# then score the result against the ground truth.
# Coordinates are kept inside (min_size, max_size) on both axes.
# NOTE(review): this assumes the detections live in a square frame of side
# `imgsz`; confirm the test images actually have that size.
min_size = 0
max_size = model.args['imgsz']

mAPs = []
for random_seed in tqdm(range(100)):
    # One reproducible trial per seed.
    np.random.seed(random_seed)
    random_detections_list = []
    for detections in detections_list:
        random_detections = deepcopy(detections)
        # One uniform sample in [0, 1) per box.
        shift = np.random.rand(len(detections))
        # Smallest / largest offset that keeps all four coordinates of a box
        # strictly inside the frame (1e-6 keeps them off the border).
        lower_limit = min_size - detections.xyxy.min(axis=1) + 1e-6
        upper_limit = max_size - detections.xyxy.max(axis=1) - 1e-6
        transformed_shift = lower_limit + shift * (upper_limit - lower_limit)
        # The same offset is broadcast to x1, y1, x2 and y2, so each box keeps
        # its size and slides along the image diagonal.
        random_detections.xyxy = random_detections.xyxy + transformed_shift.reshape(-1, 1)
        # Drawn after the shift so the random stream order per seed is stable.
        random_detections.confidence = np.random.rand(len(detections))
        random_detections_list.append(random_detections)

    mAP = sv.metrics.MeanAveragePrecision().update(random_detections_list, annotations_list).compute()
    mAPs.append(mAP.map50)

# Report the mean over all seeds (not just the last trial) together with the
# spread, so the two numbers describe the same set of runs.
print(f"mAP50: {np.mean(mAPs):.2f} +/- {np.std(mAPs):.2f}")

mAP50: 0.03 +/- 0.01

So, in this case, after a single epoch of training, our model (mAP50 ≈ 0.14) already performs clearly better than the random baseline (mAP50 ≈ 0.03–0.04).