Merge pull request #6 from Rouast-Labs/long-video-support
Long video support
prouast authored Jul 20, 2024
2 parents 1a9923d + 781b4a4 commit 92b6728
Showing 9 changed files with 147 additions and 77 deletions.
3 changes: 2 additions & 1 deletion examples/test.py
@@ -2,6 +2,7 @@
 sys.path.append('../vitallens-python')
 import argparse
 import matplotlib.pyplot as plt
+import os
 import pandas as pd
 from prpy.ffmpeg.probe import probe_video
 from prpy.ffmpeg.readwrite import read_video_from_path
@@ -20,7 +21,7 @@
 
 def run(args=None):
   # Get ground truth vitals
-  vitals = pd.read_csv(args.vitals_path)
+  vitals = pd.read_csv(args.vitals_path) if os.path.exists(args.vitals_path) else []
   ppg_gt = vitals['ppg'] if 'ppg' in vitals else None
   resp_gt = vitals['resp'] if 'resp' in vitals else None
   # Get video
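
Note on the examples/test.py change: ground-truth vitals are now optional, and both the missing-file and missing-column cases fall through to None via the 'ppg' in vitals check. A quick illustration of that behaviour (not part of the commit):

    import pandas as pd

    vitals = []                                  # vitals_path does not exist
    print('ppg' in vitals)                       # False -> ppg_gt stays None
    vitals = pd.DataFrame({'resp': [0.1, 0.2]})  # file exists but has no 'ppg' column
    print('ppg' in vitals)                       # False ('in' checks the column names)
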
9 changes: 8 additions & 1 deletion tests/conftest.py
@@ -45,13 +45,20 @@ def test_video_fps():
   fps, *_ = probe_video(TEST_VIDEO_PATH)
   return fps
 
+@pytest.fixture(scope='session')
+def test_video_shape():
+  _, n, w, h, _, _, _ = probe_video(TEST_VIDEO_PATH)
+  return (n, h, w, 3)
+
 @pytest.fixture(scope='session')
 def test_video_faces(request):
   det = FaceDetector(
     max_faces=1, fs=1.0, iou_threshold=0.45, score_threshold=0.9)
   test_video_ndarray = request.getfixturevalue('test_video_ndarray')
   test_video_fps = request.getfixturevalue('test_video_fps')
-  boxes, _ = det(test_video_ndarray, fps=test_video_fps)
+  boxes, _ = det(test_video_ndarray,
+                 inputs_shape=test_video_ndarray.shape,
+                 fps=test_video_fps)
   boxes = (boxes * [test_video_ndarray.shape[2], test_video_ndarray.shape[1], test_video_ndarray.shape[2], test_video_ndarray.shape[1]]).astype(int)
   return boxes[:,0].astype(np.int64)
 
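
The new test_video_shape fixture turns probe_video's frame count, width and height into a NumPy-style (n_frames, height, width, 3) tuple, which is the format the detector's new inputs_shape argument expects. A minimal sketch of a test consuming it (names mirror the fixtures above; this is illustrative, not part of the commit):

    # FaceDetector imported as in the repo's tests
    def test_detector_on_file(test_video_path, test_video_shape, test_video_fps):
      det = FaceDetector(max_faces=1, fs=1.0, iou_threshold=0.45, score_threshold=0.9)
      boxes, info = det(inputs=test_video_path,
                        inputs_shape=test_video_shape,  # (n_frames, h, w, 3)
                        fps=test_video_fps)
      assert boxes.shape[0] == test_video_shape[0]      # one box row per frame
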
53 changes: 30 additions & 23 deletions tests/test_ssd.py
@@ -19,6 +19,7 @@
 # SOFTWARE.
 
 import numpy as np
+from prpy.ffmpeg.probe import probe_video
 import pytest
 
 import sys
@@ -79,13 +80,13 @@ def test_enforce_temporal_consistency():
      [[.5, .5, .75, .75 ], [.125, .5, .375, .75]],
      [[.125, .625, .375, .875], [.625, .5, .875, .75]]])
   info = np.array(
-    [[[0, 1, 1, 1, .99], [0, 1, 1, 1, .99]],
-     [[1, 1, 0, 0, .2 ], [1, 1, 1, 1, .99]],
-     [[2, 1, 1, 1, .99], [2, 1, 1, 1, .99]],
-     [[3, 1, 1, 1, .99], [3, 1, 1, 1, .99]],
-     [[4, 1, 1, 1, .99], [4, 1, 1, 1, .99]]])
+    [[[0, 1, 1, .99], [0, 1, 1, .99]],
+     [[1, 1, 0, .2 ], [1, 1, 1, .99]],
+     [[2, 1, 1, .99], [2, 1, 1, .99]],
+     [[3, 1, 1, .99], [3, 1, 1, .99]],
+     [[4, 1, 1, .99], [4, 1, 1, .99]]])
   boxes_out, info_out = enforce_temporal_consistency(
-    boxes=boxes, info=info, inputs_shape=(5, 8, 8, 3))
+    boxes=boxes, info=info, n_frames=5)
   np.testing.assert_equal(
     boxes_out,
     np.array(
@@ -97,11 +98,11 @@
   np.testing.assert_equal(
     info_out,
     np.array(
-      [[[0, 1, 1, 1, .99], [0, 1, 1, 1, .99]],
-       [[1, 1, 1, 1, .99], [1, 1, 0, 0, .2 ]],
-       [[2, 1, 1, 1, .99], [2, 1, 1, 1, .99]],
-       [[3, 1, 1, 1, .99], [3, 1, 1, 1, .99]],
-       [[4, 1, 1, 1, .99], [4, 1, 1, 1, .99]]]))
+      [[[0, 1, 1, .99], [0, 1, 1, .99]],
+       [[1, 1, 1, .99], [1, 1, 0, .2 ]],
+       [[2, 1, 1, .99], [2, 1, 1, .99]],
+       [[3, 1, 1, .99], [3, 1, 1, .99]],
+       [[4, 1, 1, .99], [4, 1, 1, .99]]]))
 
 def test_interpolate_unscanned_frames():
   # Example with 2 moving faces, 3 time steps, no detection for face 1 in time step 2, faces swapped in time step 4
@@ -110,11 +111,11 @@
      [[.25, .5, .5, .75], [.125, .25, .375, .5 ]],
      [[.375, .5, .625, .75], [.125, .375, .375, .625]]])
   info = np.array(
-    [[[0, 1, 1, 1, .99], [0, 1, 1, 1, .99]],
-     [[1, 1, 1, 1, .99], [1, 1, 0, 0, .2 ]],
-     [[2, 1, 1, 1, .99], [2, 1, 1, 1, .99]]])
+    [[[0, 1, 1, .99], [0, 1, 1, .99]],
+     [[2, 1, 1, .99], [2, 1, 0, .2 ]],
+     [[4, 1, 1, .99], [4, 1, 1, .99]]])
   boxes_out, info_out = interpolate_unscanned_frames(
-    boxes=boxes, info=info, scan_every=2, inputs_shape=(5, 8, 8, 3))
+    boxes=boxes, info=info, n_frames=5)
   np.testing.assert_equal(
     boxes_out,
     np.array(
@@ -126,25 +127,31 @@
   np.testing.assert_equal(
     info_out,
     np.array(
-      [[[0, 1, 1, 1, .99], [0, 1, 1, 1, .99]],
-       [[1, 0, 0, 1, 0 ], [1, 0, 0, 1, 0 ]], # Imperfection of the implementation
-       [[2, 1, 1, 1, .99], [2, 1, 0, 0, .2 ]],
-       [[3, 0, 0, 1, 0 ], [3, 0, 0, 0, 0 ]],
-       [[4, 1, 1, 1, .99], [4, 1, 1, 1, .99]]]))
+      [[[0, 1, 1, .99], [0, 1, 1, .99]],
+       [[1, 0, 0, 0 ], [1, 0, 0, 0 ]], # Imperfection of the implementation
+       [[2, 1, 1, .99], [2, 1, 0, .2 ]],
+       [[3, 0, 0, 0 ], [3, 0, 0, 0 ]],
+       [[4, 1, 1, .99], [4, 1, 1, .99]]]))
 
 @pytest.mark.parametrize("file", [True, False])
 def test_FaceDetector(request, file):
   det = FaceDetector(
     max_faces=2, fs=1.0, iou_threshold=0.45, score_threshold=0.9)
   if file:
     test_video_path = request.getfixturevalue('test_video_path')
-    boxes, info = det(test_video_path)
+    test_video_shape = request.getfixturevalue('test_video_shape')
+    test_video_fps = request.getfixturevalue('test_video_fps')
+    boxes, info = det(inputs=test_video_path,
+                      inputs_shape=test_video_shape,
+                      fps=test_video_fps)
   else:
     test_video_ndarray = request.getfixturevalue('test_video_ndarray')
     test_video_fps = request.getfixturevalue('test_video_fps')
-    boxes, info = det(test_video_ndarray, fps=test_video_fps)
+    boxes, info = det(inputs=test_video_ndarray,
+                      inputs_shape=test_video_ndarray.shape,
+                      fps=test_video_fps)
   assert boxes.shape == (360, 1, 4)
-  assert info.shape == (360, 1, 5)
+  assert info.shape == (360, 1, 4)
   np.testing.assert_allclose(boxes[0,0],
     [0.32223, 0.118318, 0.572684, 0.696835],
     atol=0.01)
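
A side note on the updated interpolate_unscanned_frames test: the info rows now carry what look like the original frame indices in column 0 (0, 2, 4 for a clip scanned every second frame) and hold 4 values per detection instead of 5, which is also why the detector's shape assertion changes from (360, 1, 5) to (360, 1, 4). The frame filling the test exercises can be pictured with plain np.interp (an illustration of the idea only, using example values, not the library code):

    import numpy as np

    # A face was detected at scanned frames 0, 2 and 4; frames 1 and 3 are filled in linearly.
    scanned_idxs = np.array([0, 2, 4])
    scanned_x0 = np.array([.125, .25, .375])   # example x0 values at the scanned frames
    filled_x0 = np.interp(np.arange(5), scanned_idxs, scanned_x0)
    print(filled_x0)                           # [0.125 0.1875 0.25 0.3125 0.375]
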
5 changes: 3 additions & 2 deletions tests/test_utils.py
@@ -90,12 +90,12 @@ def test_probe_video_inputs_wrong_type():
 def test_parse_video_inputs(request, file, roi, target_size, target_fps):
   if file:
     test_video_path = request.getfixturevalue('test_video_path')
-    parsed, fps_in, video_shape_in, ds_factor = parse_video_inputs(
+    parsed, fps_in, video_shape_in, ds_factor, idxs = parse_video_inputs(
       test_video_path, roi=roi, target_size=target_size, target_fps=target_fps)
   else:
     test_video_ndarray = request.getfixturevalue('test_video_ndarray')
     test_video_fps = request.getfixturevalue('test_video_fps')
-    parsed, fps_in, video_shape_in, ds_factor = parse_video_inputs(
+    parsed, fps_in, video_shape_in, ds_factor, idxs = parse_video_inputs(
       test_video_ndarray, fps=test_video_fps, roi=roi, target_size=target_size,
       target_fps=target_fps)
   assert parsed.shape == (360 if target_fps is None else 360 // 2,
@@ -105,6 +105,7 @@ def test_parse_video_inputs(request, file, roi, target_size, target_fps):
   assert fps_in == 30
   assert video_shape_in == (360, 480, 768, 3)
   assert ds_factor == 1 if target_fps is None else 2
+  assert idxs == list(range(360)) if target_fps is None else list(range(0, 360, 2))
 
 def test_parse_video_inputs_no_file():
   with pytest.raises(Exception):
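
The extra idxs return value of parse_video_inputs reports which source-frame indices survive the temporal downsampling, which the new assertion pins down. A small sketch of the relationship the test expects (illustrative only; the library computes this internally):

    n_frames, fps, target_fps = 360, 30, 15
    ds_factor = round(fps / target_fps)        # 2
    idxs = list(range(0, n_frames, ds_factor))  # [0, 2, 4, ..., 358]
    print(len(idxs))                            # 180, matching parsed.shape[0] == 360 // 2
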
10 changes: 8 additions & 2 deletions vitallens/client.py
@@ -45,7 +45,7 @@
       method: Method = Method.VITALLENS,
       api_key: str = None,
       detect_faces: bool = True,
-      fdet_max_faces: int = 2,
+      fdet_max_faces: int = 1,
       fdet_fs: float = 1.0,
       fdet_score_threshold: float = 0.9,
       fdet_iou_threshold: float = 0.3
@@ -65,6 +65,7 @@
     self.api_key = api_key
     # Load the config and model
     self.config = load_config(method.name.lower() + ".yaml")
+    self.method = method
     if self.config['model'] == 'g':
       self.rppg = GRPPGMethod(self.config)
     elif self.config['model'] == 'chrom':
@@ -151,10 +152,15 @@
     """
     # Probe inputs
     inputs_shape, fps = probe_video_inputs(video=video, fps=fps)
+    # TODO: Optimize performance of simple rPPG methods for long videos
+    # Warning if using long video
+    target_fps = override_fps_target if override_fps_target is not None else self.rppg.fps_target
+    if self.method != Method.VITALLENS and inputs_shape[0]/fps*target_fps > 3600:
+      logging.warn("Inference for long videos has yet to be optimized for POS / G / CHROM. This may run out of memory and crash.")
     _, height, width, _ = inputs_shape
     if self.detect_faces:
       # Detect faces
-      faces_rel, _ = self.face_detector(inputs=video, fps=fps)
+      faces_rel, _ = self.face_detector(inputs=video, inputs_shape=inputs_shape, fps=fps)
       # If no faces detected: return empty list
       if len(faces_rel) == 0:
         logging.warn("No faces to analyze")
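
The long-video check above compares the number of frames a simple method would actually process after resampling, inputs_shape[0] / fps * target_fps, against a 3600-frame cutoff. A rough worked example (clip length and target rate are made up for illustration):

    n_frames, fps = 18000, 30               # a 10-minute clip recorded at 30 fps
    target_fps = 10                          # assumed fps_target of a simple method
    processed = n_frames / fps * target_fps  # 6000.0 frames after downsampling
    print(processed > 3600)                  # True -> POS / G / CHROM logs the warning
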
2 changes: 1 addition & 1 deletion vitallens/methods/simple_rppg_method.py
@@ -76,7 +76,7 @@
     u_roi = merge_faces(faces)
     faces = faces - [u_roi[0], u_roi[1], u_roi[0], u_roi[1]]
     # Parse the inputs
-    frames_ds, fps, inputs_shape, ds_factor = parse_video_inputs(
+    frames_ds, fps, inputs_shape, ds_factor, _ = parse_video_inputs(
       video=frames, fps=fps, target_size=None, roi=u_roi,
       target_fps=override_fps_target if override_fps_target is not None else self.fps_target)
     assert inputs_shape[0] == faces.shape[0], "Need same number of frames as face detections"
9 changes: 6 additions & 3 deletions vitallens/methods/vitallens.py
@@ -79,7 +79,8 @@
         (faces[:,3] - faces[:,1]) * 0.5 < np.maximum(0, faces[:,1] - roi[1]) + np.maximum(0, faces[:,3] - roi[3]))):
       logging.warn("Large face movement detected")
     # Parse the inputs
-    frames_ds, fps, inputs_shape, ds_factor = parse_video_inputs(
+    logging.debug("Preparing video for inference...")
+    frames_ds, fps, inputs_shape, ds_factor, _ = parse_video_inputs(
       video=frames, fps=fps, target_size=self.input_size, roi=roi,
       target_fps=override_fps_target if override_fps_target is not None else self.fps_target,
       library='prpy', scale_algorithm='bilinear')
@@ -95,6 +96,7 @@
       split_len = math.ceil((ds_len + (n_splits-1) * API_OVERLAP) / n_splits)
       start_idxs = [i for i in range(0, ds_len - n_splits * API_OVERLAP, split_len - API_OVERLAP)]
       end_idxs = [min(i + split_len, ds_len) for i in start_idxs]
+      logging.info("Running inference for {} frames using {} requests...".format(ds_len, n_splits))
       # Process the splits in parallel
       with concurrent.futures.ThreadPoolExecutor() as executor:
         results = list(executor.map(lambda i: self.process_api(frames_ds[start_idxs[i]:end_idxs[i]]), range(n_splits)))
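
For long clips the VitalLens method fans the downsampled frames out over several overlapping API requests and stitches the results back together. A worked sketch of the index arithmetic above, with made-up values for ds_len, n_splits and API_OVERLAP (the real constants are defined elsewhere in the package, and n_splits is computed outside this hunk):

    import math

    ds_len, n_splits, API_OVERLAP = 2000, 3, 30   # illustrative values only
    split_len = math.ceil((ds_len + (n_splits - 1) * API_OVERLAP) / n_splits)   # 687
    start_idxs = list(range(0, ds_len - n_splits * API_OVERLAP, split_len - API_OVERLAP))
    end_idxs = [min(i + split_len, ds_len) for i in start_idxs]
    print(list(zip(start_idxs, end_idxs)))        # [(0, 687), (657, 1344), (1314, 2000)]
    # Consecutive requests overlap by exactly API_OVERLAP = 30 frames.
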
@@ -211,8 +213,9 @@ def postprocess(self, sig, fps, type='ppg', filter=True):
       Lambda = detrend_lambda_for_rr_response(fps)
     else:
       raise ValueError("Type {} not implemented!".format(type))
-    # Detrend
-    sig = detrend(sig, Lambda)
+    if sig.shape[-1] < 4 * API_MAX_FRAMES:
+      # Detrend only for shorter videos for performance reasons
+      sig = detrend(sig, Lambda)
     # Moving average
     sig = moving_average(sig, size)
     # Standardize
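
On the postprocess change: detrending now only runs for signals shorter than 4 * API_MAX_FRAMES, presumably because its cost grows quickly with signal length. A rough sense of the scale involved (the value of API_MAX_FRAMES is assumed here, not taken from the diff):

    API_MAX_FRAMES = 900                 # assumed per-request frame cap, for illustration
    cutoff = 4 * API_MAX_FRAMES          # 3600 samples, i.e. about 2 minutes at 30 fps
    print(7200 < cutoff)                 # False -> a 4-minute signal at 30 fps skips detrend
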
