eval.py

"""
Copyright (c) 2022 Magdalena Fuentes, Bea Steers, Luca Bondi(Robert Bosch GmbH), Julia Wilkins
All rights reserved.

This source code is licensed under the BSD-3-Clause license found in the
LICENSE file in the root directory of this source tree.
"""
import json
import os
from typing import List, Sequence, Iterable, Literal

import h5py
import glob
import scipy
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
import skimage.transform
from moviepy.editor import VideoFileClip
from data.BatchRawDataset import is_number

import project_params

# Default audio sample rate; used as the default `sr` argument of `load_sample`.
SR = 48000
# NOTE(review): not referenced by any function visible in this module;
# presumably a maximum identifier length - confirm against callers.
MAX_LEN_ID = 11
# Default target shape used by `video_to_size` when no shape is given.
IMAGE_SHAPE = (224, 224)

"""
A series of helper functions for evaluating these models, mainly 
IOU/GIOU scoring and video/audio pre-processing methods.
"""


# Video helper functions
def video_to_size(cm: np.ndarray, shape=IMAGE_SHAPE) -> np.ndarray:
    """
    Resize an array (video frame or mask) to the requested shape.

    Arguments:
        cm: array to be resized.
        shape: output shape handed to `skimage.transform.resize`.

    Returns:
        np.ndarray: resized copy of `cm`. Values keep their original range
            (`preserve_range=True`) and no anti-aliasing is applied.
    """
    # order=0 selects nearest-neighbour interpolation, so no new
    # intermediate values are introduced when resizing masks.
    resize_options = dict(order=0, preserve_range=True, anti_aliasing=False)
    resized = skimage.transform.resize(cm, shape, **resize_options)
    return resized


def video2audio_fname(fname: str, ext: str = None) -> str:
    """
    Gets the audio filename version of a video filename.
    Ex: convert: tau2021aus/video/street_traffic-barcelona-161-4901.mp4
        to: tau2021aus/notebooks/audio/street_traffic-barcelona-161-4901.wav

    The last two path components (video folder and file name) are dropped
    and replaced by `notebooks/audio/<stem>`, where `<stem>` is the file
    name up to its first dot.

    Arguments:
        fname: presumably a video filename; must contain at least two '/'.
        ext: audio filename extension, with or without the leading dot
            (i.e. 'wav' or '.wav'). If empty/None, the first file found on
            disk matching `<stem>.*` is returned.

    Returns:
        str: audio filename with the specified extension.

    Raises:
        ValueError: if `fname` has fewer than two '/' separators.
        AssertionError: if `ext` is not given and no matching file exists.
    """
    fdir, _, fbase = fname.rsplit('/', 2)
    fstem = fbase.split('.')[0]
    fname = os.path.join(fdir, 'notebooks/audio', fstem)
    if not ext:
        pattern = fname + '.*'
        fs = glob.glob(pattern)
        assert fs, f'No files matching {pattern}'
        return fs[0]
    # Accept both 'wav' and '.wav'; previously 'wav' produced '<stem>wav'.
    if not ext.startswith('.'):
        ext = '.' + ext
    return fname + ext


def load_sample(vid_fname: str, aud_fname: str, hop_size: float = 0.1,
                sr: int = SR, **kw):
    """
    Load an audio and video clip.

    The audio is centre-padded and cut into windows of `sr` samples with a
    hop of `int(sr * hop_size)` samples. One video frame is sampled per
    audio window, at evenly spaced times between 0 and the audio duration.

    Arguments:
        vid_fname: video filename.
        aud_fname: audio filename.
        hop_size: hop between consecutive audio windows, as a fraction of
            `sr` samples.
        sr: sample rate the audio is loaded at; also the window length in
            samples.
        **kw: forwarded to `video_to_size` (e.g. `shape`).

    Returns:
        np.ndarray, np.ndarray: video and audio arrays. Video frames are
            rescaled with `x / 255 * 2 - 1`.
    """

    win_size = sr
    y, _ = librosa.load(aud_fname, sr=sr)
    # `size` is passed by keyword: recent librosa releases made it
    # keyword-only, and older releases accept the keyword as well.
    padded = librosa.util.pad_center(y, size=int(len(y) + win_size - 1))
    aud = librosa.util.frame(
        padded, frame_length=win_size, hop_length=int(sr * hop_size)).T[:, None]

    clip = VideoFileClip(vid_fname)
    try:
        vid = np.array([
            video_to_size(clip.get_frame(t), **kw)
            for t in np.linspace(0, len(y) / sr, len(aud))])
    finally:
        # Release the reader held by the clip even if a frame fails to load.
        clip.close()
    vid = vid / 255 * 2 - 1
    return vid, aud


def load_av_data_from_files(video_fnames: List[str], audio_fnames: List[str], **kw):
    """
    Lazily load audio/video pairs, one pair per filename pair.

    Arguments:
        video_fnames: list of video filenames.
        audio_fnames: list of audio filenames, paired by position with
            `video_fnames` (extra entries in the longer list are ignored).
        **kw: forwarded to `load_sample`.

    Yields:
        np.ndarray, np.ndarray: loaded video and audio for each
            filename pair.
    """

    paired_fnames = zip(video_fnames, audio_fnames)
    yield from (load_sample(video_path, audio_path, **kw)
                for video_path, audio_path in paired_fnames)


# Loader helper functions
def load_predictions(predictions_path: str):
    """
    Read model predictions from an HDF5 file.

    Arguments:
        predictions_path: path to predictions (h5 file) with one group per
            file uid, each holding a 'time' and a 'pred' dataset.

    Returns:
        dict: nested dictionary of the structure:
            {'100': {'time': np.ndarray,
                     'pred': np.ndarray,
                     'model': predictions_path}, ..}
    """

    with h5py.File(predictions_path, 'r') as h5_file:
        # Materialise the datasets as arrays while the file is still open.
        return {
            uid: {
                'time': np.array(h5_file[uid]['time']),
                'pred': np.array(h5_file[uid]['pred']),
                'model': predictions_path,
            }
            for uid in h5_file.keys()
        }


def load_index(index_filepath: str):
    """
    Read the dataset index from a JSON file and report its length.

    Arguments:
        index_filepath: path to index JSON file.

    Returns:
        dict: dict containing index data.
    """
    index_root = ''  # PASS IN YOUR INDEX FILEPATH HERE
    json_path = Path(os.path.join(index_root, index_filepath))
    with json_path.open() as index_file:
        loaded_index = json.load(index_file)
    print("Length of index: ", len(loaded_index))
    return loaded_index


def create_mask(coords: Sequence, existing_mask: np.ndarray = None, frame_size=(720, 1280)):
    """
    Generate a mask given [x1,y1,x2,y2] coordinates.

    Arguments:
        coords: [x1,y1,x2,y2] coordinates of top-left and bottom right corner of box
            (list, tuple or np.ndarray). Both corners are inclusive.
        existing_mask: if we have an existing mask we will update with the new coordinates we
            can pass this here, otherwise a mask of zeroes will be initialized.
            NOTE: the existing mask is modified in place and returned.
        frame_size: size of overall frame. a new mask created will be of this shape.

    Returns:
        np.ndarray: binary np.ndarray mask. If `coords` is None, empty or
            all zeros, a fresh all-zero mask of `frame_size` is returned
            (the existing mask, if any, is not returned in that case).

    Raises:
        ValueError: if `existing_mask` does not have shape `frame_size`.
    """

    if existing_mask is not None:
        if existing_mask.shape != frame_size:
            raise ValueError(
                f"Existing mask is of shape {existing_mask.shape}, but target frame size is {frame_size} These must match.")
        mask = existing_mask
    else:
        mask = np.zeros(frame_size)

    # `not coords` is ambiguous for numpy arrays with more than one element
    # (raises ValueError), so emptiness is tested through the length instead.
    if coords is None or len(coords) == 0 or not np.any(coords):
        return np.zeros(frame_size)

    x1, y1, x2, y2 = coords[0], coords[1], coords[2], coords[3]
    # +1 so that the bottom/right borders of the box are part of the mask.
    mask[y1:y2 + 1, x1:x2 + 1] = 1
    return mask


# IOU Scores
def iou_score(pred_mask: np.ndarray, gt_bbox_coords: Iterable[np.ndarray] = None, gt_box_mask: np.ndarray = None,
              th: float = 0.5, frame_size=(720, 1280), target_size=None):
    """
    Computes Intersection over Union (IoU) between a thresholded prediction
    and the ground truth of one frame, for one *class* at a time.

    Ground truth boxes are expected in the original frame size, in the format
    [x1, y1, x2, y2] where (x1,y1) is the upper left corner and (x2,y2) the
    lower right corner.

    pred_mask:
        likelihood mask covering the whole frame (predictions for one frame).
        If it has more than two dimensions, index 0 of the last axis is used.
    gt_bbox_coords:
        list of ground truth boxes [x1, y1, x2, y2]; only used when
        `gt_box_mask` is not given
    gt_box_mask:
        ground truth box mask (if passed directly instead of creating);
        it is used as given, without resizing
    th:
        threshold tau used to binarize the prediction likelihoods
    frame_size:
        size of the input frames (should be the same for pred/gt)
    target_size:
        size the masks are resized to; defaults to `frame_size`

    Returns:
        (iou, gt_box_mask, pred_mask):
            iou: IoU score computed (float); 1 when the union is empty
            gt_box_mask: ground truth box mask used for scoring
            pred_mask: binarized prediction mask resized to the target size

    Raises:
        ValueError: if neither `gt_bbox_coords` nor `gt_box_mask` is given.
    """

    mask_provided = gt_box_mask is not None
    if gt_bbox_coords is None and not mask_provided:
        raise ValueError("Need to pass either box or box mask")

    # Drop the trailing axis by keeping its first entry
    if pred_mask.ndim > 2:
        pred_mask = pred_mask[..., 0]

    # Likelihoods -> binary (0/1) mask
    pred_mask = (pred_mask > th) ** 1

    out_size = frame_size if target_size is None else target_size

    if not mask_provided:
        print('creating box mask')
        boxes_mask = np.zeros(frame_size)

        # Paint every non-empty box onto one shared mask
        for box in gt_bbox_coords:
            is_empty_box = box is None or sum(box) == 0
            if not is_empty_box:
                boxes_mask = create_mask(coords=box, existing_mask=boxes_mask, frame_size=frame_size)

        gt_box_mask = video_to_size(boxes_mask, out_size)

    # Bring the prediction to the scoring resolution
    pred_mask = video_to_size(pred_mask, out_size)

    # Intersection: cells active in both masks
    intersection = np.sum(pred_mask * gt_box_mask)

    # Union: cells active only in the prediction, plus every ground truth cell
    pred_only = pred_mask - gt_box_mask > 0
    union = np.sum(pred_only + gt_box_mask)

    # Empty frames (no ground truth and nothing predicted) score 1
    if union:
        iou = intersection / union
    else:
        iou = 1
    return iou, gt_box_mask, pred_mask


def iou_frame(annot, pred, frame_id: int, **kw):
    """
    Score one frame of a video with IoU.

    Arguments:
        annot: video annotations for one video; rows carry `frame_id`,
            `x`, `y`, `w` and `h`
        pred: predictions handed to `iou_score` as its prediction mask
        frame_id: single frame ID, used to select the annotation rows of
            the frame being evaluated
        **kw: forwarded to `iou_score`

    Returns:
        See return for `iou_score`. Returns this for the given `frame_id`.
    """

    frame_rows = annot[annot.frame_id == frame_id]
    # One annotation row -> one ground truth box, converted from
    # (x, y, w, h) to integer [x1, y1, x2, y2].
    boxes = [
        np.array([row.x, row.y, row.x + row.w, row.y + row.h]).astype(int)
        for _, row in frame_rows.iterrows()
    ]
    return iou_score(pred, boxes, **kw)


def iou_frame_1D(annot, pred, frame_id, **kw):
    """
    Score one frame of a video with IoU, using boxes whose vertical extent
    is fixed to rows 0..1. We use this as we evaluate the vertical regions
    of frames.

    Arguments:
        annot: video annotations for one video; rows carry `frame_id`,
            `x` and `w`
        pred: predictions handed to `iou_score` as its prediction mask
        frame_id: single frame ID, used to select the annotation rows of
            the frame being evaluated
        **kw: forwarded to `iou_score`
    Returns:
        See return for `iou_score`. Returns this for the given `frame_id`.
    """

    frame_rows = annot[annot.frame_id == frame_id]
    # Only the horizontal extent of each annotation is kept.
    boxes = [
        np.array([row.x, 0, row.x + row.w, 1]).astype(int)
        for _, row in frame_rows.iterrows()
    ]
    return iou_score(pred, boxes, **kw)


def convert_azimuth_to_indexes(time: np.ndarray, azimuth: np.ndarray,
                               event_tax: np.ndarray = None, fov: int = project_params.fov,
                               num_regions: int = project_params.num_regions) -> np.ndarray:
    """
    Given azimuth values (-fov/2, fov/2), convert them to gt mask indices.

    Args:
        time: the timestamps for each azimuth value.
        azimuth: the azimuth values.
        event_tax: upsampled timestamps that we want to interpolate to.
                    If None, the azimuth values are converted as they are.
        fov: field of view, in the same unit as `azimuth`; -fov/2 maps to
                    index 0 and +fov/2 to index num_regions - 1.
        num_regions: number of regions the field of view is divided into.

    Returns:
        np.ndarray: ground truth mask indices with values in range
                    [0, num_regions - 1].
    """

    _time, _azimuth = np.asarray(time), np.asarray(azimuth)
    if event_tax is not None:
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.interpolate` has been loaded.
        from scipy.interpolate import interp1d
        _azimuth = interp1d(
            _time, _azimuth, fill_value='extrapolate')(event_tax)
    # Turn azimuth into region index. Azimuth is natively in -180 and 180.
    # Transform into the given fov
    idxs = np.round(
        (_azimuth + fov / 2) / fov * (num_regions - 1)
    ).astype(int)
    # Clip as datasets could have angles outside the fov
    return np.clip(idxs, 0, num_regions - 1)

def compute_file_gt(index, uid, labels_period: float = project_params.labels_period,
                           classes: list = ['bus', 'car', 'motorbike', 'truck'],
                           num_regions: int = project_params.num_regions,
                           filter_confirmed: bool = project_params.filter_confirmed,
                           class_distinction: bool = project_params.class_distinction,
                           audio_filtered_labels: bool = project_params.audio_filtered_labels,
                           point_sources: bool = project_params.point_sources):
    """
    Compute the ground truth mask for a given file.

    NOTE this should be the same as in training, but just put the project
    params as globals up top instead of params in this function

    Args:
        index: mapping from file uid to a file dict. The file dict is read
            for 'duration' and 'events'; each event is read for 'label',
            'time', and optionally 'source', 'amount_confirmed' and
            'confirmed', plus the azimuth series described below.
        uid: File uid, i.e. key of index
        labels_period: spacing of the output timestamps, which are
            `np.arange(0, duration, labels_period)`.
        classes: labels to keep; events with any other label are ignored.
            The default list is never modified by this function.
        num_regions: size of the last axis of the output.
        filter_confirmed: either a number or a bool. If `is_number` accepts
            it, events whose 'amount_confirmed' (default 1) is <= this value
            are skipped; otherwise, if truthy, events whose 'confirmed'
            (default True) is falsy are skipped.
        class_distinction: if True each class gets its own channel, otherwise
            all events are written to channel 0.
        audio_filtered_labels: if True the '_filtered' variants of the event
            'time'/'azimuth*' series are used.
        point_sources: if True each event activates the single region given
            by 'azimuth'; otherwise all regions between 'azimuth_left' and
            'azimuth_right' (inclusive).

    Returns: np.float32 array of shape (num_out_frames, num_classes,
        num_regions) with values in {0, 1}. The class axis always has
        `len(classes)` entries, even when `class_distinction` is False.
    """
    # Retrieve file information
    file_dict = index[uid]
    file_dur = file_dict['duration']
    # Timestamps of the output frames
    file_out_tax = np.arange(0, file_dur, labels_period)

    num_out_win = len(file_out_tax)
    file_gt = np.zeros((num_out_win, len(classes),
                        num_regions), np.float32)
    for event in file_dict['events']:

        # Ignore event if class is not relevant
        if event['label'] not in classes:
            continue

        if event.get('source') == 'audio':
            # Events whose source is 'audio' are never part of the mask
            continue
        # If filter_confirmed is passed as a number, that's a threshold for
        # what proportion of audio confirmations we want to require
        # If it's a bool, proceed with the rest of the logic
        # NOTE(review): this relies on `is_number` (imported from
        # data.BatchRawDataset) rejecting bools - confirm.
        if is_number(filter_confirmed):
            # If this event's confirmation amount is less than or equal to
            # the required threshold, skip it
            if event.get('amount_confirmed', 1) <= filter_confirmed:
                continue
        # If filter_confirmed == True and this event as a whole is not
        # confirmed, skip
        elif filter_confirmed and not event.get('confirmed', True):
            continue
        # Otherwise proceed (i.e. "keep this event")

        # Output frames that fall within the event's time span (inclusive).
        # NOTE(review): the span always comes from the unsuffixed 'time'
        # series, even when the '_filtered' series are used below.
        event_mask = (file_out_tax >= event['time'][0]) & \
                     (file_out_tax <= event['time'][-1])
        if np.sum(event_mask) == 0:
            # Ignore event if no output frame falls inside the event
            continue

        # Index of the first output frame covered by the event, and the
        # timestamps of all covered frames
        event_win_idx0 = np.flatnonzero(event_mask)[0]
        event_tax = file_out_tax[event_mask]

        if class_distinction:
            label_idx = classes.index(event['label'])
        else:
            label_idx = 0

        # we have different versions of the azimuth values, saved with
        # different suffixes
        # audio_filtered_labels means that we want azimuths to be filtered
        # to only those that coincide with an audio label
        key_sfx = '_filtered' if audio_filtered_labels else ''

        # skip events with fewer than two time samples
        if not len(event['time' + key_sfx]) >= 2:
            continue

        # bounding boxes take up a single region at the center of the box
        if point_sources:
            az_idx_left = az_idx_right = convert_azimuth_to_indexes(
                event['time' + key_sfx], event['azimuth' + key_sfx], event_tax)
        # bounding boxes take up all regions from left to right of its box
        else:
            az_idx_left = convert_azimuth_to_indexes(
                event['time' + key_sfx], event['azimuth_left' + key_sfx],
                event_tax)
            az_idx_right = convert_azimuth_to_indexes(
                event['time' + key_sfx], event['azimuth_right' + key_sfx],
                event_tax)

        # fill the ground truth mask: one row per covered output frame,
        # starting at the first covered frame; the right index is inclusive
        for win_idx, (i_az_l, i_az_r) in enumerate(zip(az_idx_left,
                                                       az_idx_right)):

            file_gt[event_win_idx0 + win_idx, label_idx, i_az_l:i_az_r + 1] += 1

    # As multiple events of the same class can activate the same azimuth index,
    # here we clip
    file_gt = np.clip(file_gt, 0, 1)

    return file_gt


def score_file(gth: np.ndarray,
               pred: np.ndarray,
               uid: str,
               tau: float,
               class_list: List[str],
               model_type: Literal['point_sources', 'box_sources'],
               frame_type: str = 'all',
               labels_period: float = 0.5,
               file_duration: float = 10,
               verbose: bool = False,
               name: str = None) -> pd.DataFrame:
    """
    Build a one-row dataframe with the mean per-class IOU of a file.

    Arguments:
        gth: Ground truth np.ndarray already filtered, indexed as
            [frame, class, region].
        pred: Predictions np.ndarray of shape (frames, classes, regions).
        uid: File UID (found both in the index and prediction file)
        tau: Threshold used to binarize predictions.
        class_list: List of classes for the model; one column per class.
        model_type: 'point_sources' or 'box_sources', for labeling the dataframe
        frame_type: Type of frames to evaluate for this file.
            One of:
                -'all' (includes active and inactive frames)
                -'active' (ground truth has something)
                -'inactive' (ground truth is empty)
        labels_period: time between two consecutive annotations [s]
        file_duration: file duration [s]
        verbose: If true, print more details about the frame-by-frame scoring.
        name: name of the model (optional); adds a 'name' column when given

    Returns:
        pd.DataFrame: one row with the mean IOU per class followed by the
            'score' ('iou'), 'tau', 'frame_type', 'uid' and 'model_type'
            columns. Returns None when `pred` has fewer frames than
            `file_duration / labels_period`.

    Raises:
        ValueError: if `frame_type` is not one of the accepted values
            (detected while scoring the first frame).
    """

    fps = 1 / labels_period

    # Frames, classes, regions
    n_frames, _, n_regions = pred.shape

    # Predictions too short for this file: nothing to score
    if n_frames < file_duration / labels_period:
        return

    # One list of frame scores per class
    class_score = dict.fromkeys(class_list)

    for class_index, class_name in enumerate(class_list):
        if verbose:
            print("CURRENT CLASS: ", class_name)

        kept_scores = []

        # Walk through the frames of the file
        for frame_idx, t in enumerate(np.arange(0, file_duration, 1 / fps)):
            # Ground truth regions (1D) and binarized prediction of this frame
            box = gth[frame_idx, class_index, :]
            mask_per_class = (pred[frame_idx, class_index, None] > tau) ** 1
            if verbose:
                print("TIME: ", t)
                print("GT BOX MASK: ", box)
                print("PRED BOX MASK: ", mask_per_class)
                print("PRED BOX MASK NO THRESHOLD: ", pred[frame_idx, class_index, None])

            # One score per frame per class
            iou_res = iou_score(pred_mask=mask_per_class,
                                gt_bbox_coords=[],
                                gt_box_mask=box,
                                th=tau,
                                frame_size=(1, n_regions))
            frame_iou = iou_res[0]

            if verbose:
                print("IOU Score: ", frame_iou)

            # Decide whether this frame counts for the requested frame type
            if frame_type == 'all':
                keep_frame = True
            elif frame_type in ('active', 'inactive'):
                gt_is_empty = np.all((iou_res[1] == 0))
                keep_frame = gt_is_empty if frame_type == 'inactive' else not gt_is_empty
            else:
                raise ValueError('frame_type should be "active", "inactive", or "all"')

            if keep_frame:
                kept_scores.append(frame_iou)

        class_score[class_name] = kept_scores

    # Mean over the kept frames of every class, followed by the metadata
    row = [np.mean(scores) for scores in class_score.values()]
    row += ['iou', tau, frame_type, uid, model_type]
    cols = class_list + ['score', 'tau', 'frame_type', 'uid', 'model_type']

    if name:
        cols = cols + ['name']
        row.append(name)

    return pd.DataFrame([row], columns=cols)

def iou_video(annot, corrs, **kw):
    """
    Score every frame of a video with IoU.

    annot:
        video annotations for one video
    corrs:
        iterable of per-frame predictions; the first item is scored as
        frame ID 1, the second as frame ID 2, and so on
    **kw:
        forwarded to `iou_frame`

    Returns:
        list with one `iou_frame` result per item of `corrs`
        (see `iou_frame` and `iou_score`).
    """
    frame_results = []
    # Frame IDs are 1-based
    for frame_id, pred in enumerate(corrs, start=1):
        frame_results.append(iou_frame(annot, pred, frame_id, **kw))
    return frame_results


def eval_video(vfname: str, annotations, corrs, **kw):
    """
    Score every frame of the video called `vfname` with IoU.

    vfname:
        video filename
    annotations:
        annotations table with a `filename` column; only the rows of
        `vfname` are used
    corrs:
        per-frame predictions, passed on to `iou_video`
    **kw:
        forwarded to `iou_video`

    Returns:
        See return for `iou_video`. Gets final evaluation for a video.

    Raises:
        ValueError: if `vfname` does not appear in `annotations.filename`.
    """

    known_videos = annotations.filename.unique()
    if vfname not in known_videos:
        raise ValueError(f'{vfname} is not a valid video name in the dataset')

    video_rows = annotations[annotations.filename == vfname]
    return iou_video(video_rows, corrs, **kw)


## GIOU Code
def get_enclosing_mask(mask1: np.ndarray, mask2: np.ndarray):
    """
    Generate coordinates [x1,y1,x2,y2] of the smallest axis-aligned
    rectangle that encloses the given two masks.

    The extent is computed independently along each axis over all cells
    equal to 1, so masks that are not a single rectangle are handled too.
    Axis 0 of a mask is y and axis 1 is x (as in `create_mask`). 1D masks
    are treated as a single row.

    mask1:
        binary np.ndarray (1D or 2D+) representing a mask
    mask2:
        binary np.ndarray (1D or 2D+) representing a mask

    Returns:
        coords: [x1,y1,x2,y2] list of (inclusive) integer coordinates of
            the smallest enclosing rectangle.

    Raises:
        ValueError: if neither mask has a cell equal to 1.
    """
    row_hits, col_hits = [], []
    for mask in (mask1, mask2):
        active = np.nonzero(np.atleast_2d(mask) == 1)
        row_hits.append(active[0])
        col_hits.append(active[1])

    rows = np.concatenate(row_hits)
    cols = np.concatenate(col_hits)
    if rows.size == 0:
        raise ValueError("Cannot enclose two empty masks")

    # Per-axis min/max; a lexicographic min/max over (row, col) pairs would
    # only look at the columns of the first and last active row.
    return [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]


def giou_score(pred_mask: np.ndarray, gt_mask: np.ndarray, th: float = 0.5,
               frame_size=(720, 1280), ):
    """
    Computes Generalized Intersection over Union (GIoU) score for a given predicted
    mask and ground truth mask. **NOTE** that this metric is currently only designed to
    work when there is only one bounding box in a frame.

    With A the binarized prediction, B the ground truth and C the smallest
    rectangle enclosing both, the score is IoU(A, B) - |C \\ (A U B)| / |C|.

    pred_mask:
        prediction mask for one frame, (not binary - it will be in likelihood)
    gt_mask:
        binary ground truth mask, same shape as `pred_mask`
    th:
        threshold tau that determines how to binarize the predicted mask (which has likelihood vals)
    frame_size:
        size of the input frames (should be the same for pred/gt); the
        enclosing mask is created with this shape

    Returns:
        (giou, gt_mask, pred_mask, c_mask):
            giou: GIoU score computed. 1 if prediction and ground truth are
                both empty, 0 if exactly one of them is empty.
            gt_mask: ground truth mask, as passed in
            pred_mask: binarized prediction mask
            c_mask: mask of smallest rectangle enclosing gt_mask and pred_mask
                (all zeros when either mask is empty).
    """

    # Binarize first, so that emptiness is judged on what is actually
    # predicted at threshold `th` and not on the raw likelihoods.
    pred_mask = (np.asarray(pred_mask) > th) ** 1

    has_pred = np.any(pred_mask)
    has_gt = np.any(gt_mask)

    # If both the prediction and the ground truth are empty (all zeros)
    if not has_pred and not has_gt:
        return 1, gt_mask, pred_mask, np.zeros(frame_size)

    # One is empty the other is not
    if not has_pred or not has_gt:
        return 0, gt_mask, pred_mask, np.zeros(frame_size)

    union_mask = (pred_mask - gt_mask > 0) + gt_mask
    overlap = np.sum(pred_mask * gt_mask)
    union = np.sum(union_mask)

    # Smallest enclosing rectangle, as inclusive [x1, y1, x2, y2]
    x1, y1, x2, y2 = get_enclosing_mask(gt_mask, pred_mask)
    # Built directly rather than via `create_mask`, which returns an empty
    # mask for all-zero coordinates (a valid box at pixel (0, 0)).
    c_mask = np.zeros(frame_size)
    c_mask[y1:y2 + 1, x1:x2 + 1] = 1

    # Share of the enclosing rectangle not covered by the union
    c_diff = np.sum(c_mask - union_mask)
    res = np.abs(c_diff) / np.abs(np.sum(c_mask))

    iou = overlap / union
    giou = iou - res

    return giou, gt_mask, pred_mask, c_mask


def giou_frame(annot: pd.DataFrame, pred: np.ndarray, frame_id: int, **kw):
    """
    Computes the GIoU score for a single frame.
    **Note** that this is only designed for single bounding boxes per frames:
    only the first annotation of the frame is used.

    annot:
        video annotations for one video
    pred:
        prediction (likelihood mask) for the frame
    frame_id:
        single frame ID, this allows us to filter down to evaluate only
        one frame at a time
    **kw:
        forwarded to `giou_score`; `frame_size`, if given, is also used to
        build the ground truth mask

    Returns:
        See return for `giou_score`. Returns this for the given `frame_id`.
        A frame without annotations is scored against an empty ground truth.
    """

    # Same default as `giou_score`, so both masks share one shape.
    frame_size = kw.get('frame_size', (720, 1280))

    frame_rows = annot[annot.frame_id == frame_id]
    if len(frame_rows) == 0:
        gt_coords = [0, 0, 0, 0]
    else:
        _annot = frame_rows.iloc[0]
        # (x, y, w, h) -> integer [x1, y1, x2, y2]
        gt_coords = [int(v) for v in (_annot.x, _annot.y,
                                      _annot.x + _annot.w, _annot.y + _annot.h)]

    # `giou_score` compares masks, so the box must be rasterised first.
    gt_mask = create_mask(gt_coords, frame_size=frame_size)
    return giou_score(pred, gt_mask, **kw)


def giou_frame_1D(annot: pd.DataFrame, pred: np.ndarray, frame_id: int, **kw):
    """
    Computes the GIoU score for a single frame, but for boxes whose vertical
    extent is fixed to rows 0..1. We use this as we evaluate the vertical
    regions of frames.
     **Note** that this is only designed for single bounding boxes per frames:
     only the first annotation of the frame is used.

    Arguments:
        annot: video annotations for one video
        pred:  prediction (likelihood mask) for the frame
        frame_id: single frame ID, this allows us to filter down to evaluate only
            one frame at a time
        **kw: forwarded to `giou_score`; `frame_size`, if given, is also
            used to build the ground truth mask

    Returns:
        See return for `giou_score`. Returns this for the given `frame_id`.
        A frame without annotations is scored against an empty ground truth.
    """
    # Same default as `giou_score`, so both masks share one shape.
    frame_size = kw.get('frame_size', (720, 1280))

    frame_rows = annot[annot.frame_id == frame_id]
    if len(frame_rows) == 0:
        gt_coords = [0, 0, 0, 0]
    else:
        _annot = frame_rows.iloc[0]
        # Only the horizontal extent of the annotation is kept.
        gt_coords = [int(v) for v in (_annot.x, 0, _annot.x + _annot.w, 1)]

    # `giou_score` compares masks, so the box must be rasterised first.
    gt_mask = create_mask(gt_coords, frame_size=frame_size)
    return giou_score(pred, gt_mask, **kw)


def giou_video(annot: pd.DataFrame, corrs, **kw):
    """
    Score every frame of a video with GIoU.

    Arguments:
        annot: video annotations for one video
        corrs: iterable of per-frame predictions; the first item is scored
            as frame ID 1, the second as frame ID 2, and so on
        **kw: forwarded to `giou_frame`

    Returns:
        list with one `giou_frame` result per item of `corrs`
        (see `giou_frame` and `giou_score`).
    """
    frame_results = []
    # Frame IDs are 1-based
    for frame_id, pred in enumerate(corrs, start=1):
        frame_results.append(giou_frame(annot, pred, frame_id, **kw))
    return frame_results


def giou_eval_video(vfname: str, annotations: pd.DataFrame, corrs, **kw):
    """
    Score every frame of the video called `vfname` with GIoU.

    Arguments:
        vfname: video filename
        annotations: annotations table with a `filename` column; only the
            rows of `vfname` are used
        corrs: per-frame predictions, passed on to `giou_video`
        **kw: forwarded to `giou_video`

    Returns:
        See return for `giou_video`. Gets final evaluation for a video.

    Raises:
        ValueError: if `vfname` does not appear in `annotations.filename`.
    """

    known_videos = annotations.filename.unique()
    if vfname not in known_videos:
        raise ValueError(f'{vfname} is not a valid video name in the dataset')

    video_rows = annotations[annotations.filename == vfname]
    return giou_video(video_rows, corrs, **kw)


def get_x1y1x2y2(mask: np.ndarray):
    """
    Given a binary mask, calculate the bounding box of its cells equal to 1
    in a format convenient for plotting.

    Axis 0 of the mask is y and axis 1 is x (as in `create_mask`). 1D masks
    are treated as a single row.

    mask:
        binary np.ndarray (1D or 2D+) representing a mask

    Returns:
        (min_x, min_y): corner of the box with the smallest x and y
        width: x-dimension width of box (max_x - min_x)
        height: y-dimension height of box (max_y - min_y)

    Raises:
        ValueError: if the mask has no cell equal to 1.
    """

    active = np.nonzero(np.atleast_2d(mask) == 1)
    rows, cols = active[0], active[1]
    if rows.size == 0:
        raise ValueError("Mask has no cell equal to 1")

    # Columns are x and rows are y.
    min_x, max_x = int(cols.min()), int(cols.max())
    min_y, max_y = int(rows.min()), int(rows.max())

    width = max_x - min_x
    height = max_y - min_y
    return (min_x, min_y), width, height


def get_random_prediction(n_frames, n_classes, n_regions, point_sources=True, fps=2):
    """
    Build a random prediction to be used as a baseline.

    For every frame, between 0 and `max_rand_reg` (2 for pointwise, 5 for
    boxwise) random (class, region) cells are set to 1; draws may repeat.

    Args:
        n_frames: number of frames the prediction should have.
        n_classes: number of classes the prediction should have.
        n_regions: number of regions (divisions of fov).
        point_sources: if True the random predictions are created mimicking a
                        pointwise model, if False they mimic a boxwise model.
        fps: frames per second, used to derive the prediction times.

    Returns:
        dict:
            pred: (n_frames, n_classes, n_regions) with the random predictions
            model: name of the model
            time: times of random predictions
    """

    max_rand_reg = 2 if point_sources else 5
    predictions = np.zeros((n_frames, n_classes, n_regions))

    for frame_idx in range(n_frames):
        n_active = np.random.randint(0, max_rand_reg + 1)
        for _ in range(n_active):
            class_idx = np.random.randint(0, n_classes)
            region_idx = np.random.randint(0, n_regions)
            predictions[frame_idx, class_idx, region_idx] = 1

    frame_times = np.arange(n_frames) / fps
    return {'pred': predictions,
            'model': f'random_{"pointwise" if point_sources else "boxwise"}',
            'time': frame_times}