ankane · brettshollenberger · Oct 15, 2024 · Oct 15, 2024 · ankane · Oct 14, 2024
diff --git a/lib/xgboost.rb b/lib/xgboost.rb
@@ -6,6 +6,8 @@
 require_relative "xgboost/booster"
 require_relative "xgboost/dmatrix"
 require_relative "xgboost/version"
+require_relative "xgboost/training_callback"
+require_relative "xgboost/callback_container"
 
 # scikit-learn API
 require_relative "xgboost/model"
@@ -44,8 +46,11 @@ class << self
   autoload :FFI, "xgboost/ffi"
 
   class << self
-    def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
+    def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true, callbacks: [])
       booster = Booster.new(params: params)
+      cb_container = CallbackContainer.new(callbacks)
+      booster = cb_container.before_training(model: booster)
+
       num_feature = dtrain.num_col
       booster.set_param("num_feature", num_feature)
       booster.feature_names = dtrain.feature_names
@@ -59,6 +64,7 @@ def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds
       end
 
       num_boost_round.times do |iteration|
+        break unless cb_container.before_iteration(model: booster, epoch: iteration)
         booster.update(dtrain, iteration)
 
         if evals.any?
@@ -80,11 +86,14 @@ def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds
             best_message = message
           elsif early_stopping_rounds && iteration - best_iter >= early_stopping_rounds
             booster.best_iteration = best_iter
+            booster.best_score = best_score
             puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
             break
           end
+          break unless cb_container.after_iteration(model: booster, epoch: iteration, res: res)
         end
       end
+      booster = cb_container.after_training(model: booster)
 
       booster
     end

diff --git a/lib/xgboost/booster.rb b/lib/xgboost/booster.rb
@@ -1,8 +1,9 @@
 module XGBoost
   class Booster
-    attr_accessor :best_iteration, :feature_names, :feature_types
+    attr_accessor :best_iteration, :feature_names, :feature_types, :best_score, :params
 
     def initialize(params: nil, model_file: nil)
+      @params = params
       @handle = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterCreate(nil, 0, @handle)
       ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))

diff --git a/lib/xgboost/callback_container.rb b/lib/xgboost/callback_container.rb
@@ -0,0 +1,78 @@
+module XGBoost
+  # Fans the training hooks of XGBoost.train out to a list of callbacks and
+  # accumulates the per-iteration evaluation results in #history.
+  class CallbackContainer
+    attr_reader :callbacks, :history
+
+    # callbacks - a TrainingCallback, an array of them, or nil for none.
+    #
+    # Raises ArgumentError if any entry is not a TrainingCallback.
+    def initialize(callbacks)
+      # Array() maps nil to [] and wraps a lone callback; an array passes through as-is.
+      @callbacks = Array(callbacks)
+      @history = {}
+
+      @callbacks.each do |callback|
+        raise ArgumentError, 'callback must subclass XGBoost::TrainingCallback.' unless callback.is_a?(TrainingCallback)
+      end
+    end
+
+    # Passes the model through every callback's before_training hook, in order.
+    #
+    # Returns the model handed back by the last callback (or the given model
+    # when there are no callbacks). Raises ArgumentError if a hook returns
+    # anything other than an XGBoost::Booster.
+    def before_training(model: nil)
+      run_training_hook(:before_training, model)
+    end
+
+    # Same contract as #before_training, for the after_training hook.
+    def after_training(model: nil)
+      run_training_hook(:after_training, model)
+    end
+
+    # Returns false as soon as one callback's before_iteration hook returns a
+    # falsy value (later callbacks are not called); true otherwise, including
+    # when there are no callbacks.
+    def before_iteration(model: nil, epoch: nil)
+      callbacks.all? { |callback| callback.before_iteration(model: model, epoch: epoch) }
+    end
+
+    # Records res in #history, then polls the after_iteration hooks with the
+    # same return convention as #before_iteration.
+    #
+    # res - name/value pairs for this iteration; each name is split at the
+    #       first '-' into a data name and a metric name. nil records nothing.
+    def after_iteration(model: nil, epoch: nil, res: nil)
+      update_history(res)
+
+      callbacks.all? { |callback| callback.after_iteration(model: model, epoch: epoch, history: history) }
+    end
+
+    private
+
+    # Threads model through the named hook of each callback, validating every return value.
+    def run_training_hook(hook, model)
+      callbacks.reduce(model) do |current, callback|
+        result = callback.public_send(hook, model: current)
+        unless result.is_a?(XGBoost::Booster)
+          raise ArgumentError, "Callback #{callback.class}##{hook} must return an instance of XGBoost::Booster"
+        end
+
+        result
+      end
+    end
+
+    # Appends each value to history[data_name][metric_name], creating the
+    # nested containers on first use.
+    def update_history(res)
+      return if res.nil? # after_iteration defaults res to nil
+
+      res.each do |name, value|
+        data_name, metric_name = name.split('-', 2)
+        metrics = (history[data_name] ||= {})
+        (metrics[metric_name] ||= []) << value
+      end
+    end
+  end
+end
diff --git a/lib/xgboost/training_callback.rb b/lib/xgboost/training_callback.rb
@@ -0,0 +1,27 @@
+module XGBoost
+  # Base class for callbacks passed to XGBoost.train. Subclasses override
+  # only the hooks they need; the defaults leave training untouched.
+  class TrainingCallback
+    def before_training(model: nil)
+      # Run before training starts. Must return the model.
+      model
+    end
+
+    def after_training(model: nil)
+      # Run after training is finished. Must return the model.
+      model
+    end
+
+    def before_iteration(model: nil, epoch: nil)
+      # Run before each iteration. Return a falsy value to stop training;
+      # the default returns true so training continues.
+      true
+    end
+
+    def after_iteration(model: nil, epoch: nil, history: nil)
+      # Run after each iteration. Return a falsy value to stop training;
+      # the default returns true so training continues.
+      true
+    end
+  end
+end
diff --git a/test/callbacks_test.rb b/test/callbacks_test.rb
@@ -0,0 +1,145 @@
+require_relative 'test_helper'
+
+class CallbacksTest < Minitest::Test
+  # Counts hook invocations and records the epochs and history it receives.
+  # Both iteration hooks return true, so this callback never stops training.
+  class MockCallback < XGBoost::TrainingCallback
+    attr_reader :before_training_count, :after_training_count, :before_iteration_count, :after_iteration_count,
+                :before_training_args, :after_training_args, :before_iteration_args, :after_iteration_args, :history
+
+    def initialize
+      @before_training_count = 0
+      @after_training_count = 0
+      @before_iteration_count = 0
+      @after_iteration_count = 0
+      @before_training_args = []
+      @after_training_args = []
+      @before_iteration_args = []
+      @after_iteration_args = []
+      @history = {}
+    end
+
+    def before_training(model: nil)
+      @before_training_count += 1
+      model
+    end
+
+    def after_training(model: nil)
+      @after_training_count += 1
+      model
+    end
+
+    def before_iteration(model: nil, epoch: nil)
+      @before_iteration_count += 1
+      @before_iteration_args << { epoch: epoch }
+      true
+    end
+
+    def after_iteration(model: nil, epoch: nil, history: nil)
+      @after_iteration_count += 1
+      @history = history
+      true
+    end
+  end
+
+  def test_callback_raises_when_not_training_callback
+    # assert_raises accepts exception classes plus an optional failure message,
+    # not a pattern, so the error text is asserted on the returned exception.
+    error = assert_raises(ArgumentError) { train_with('not a callback') }
+
+    assert_match(/callback must subclass XGBoost::TrainingCallback/, error.message)
+  end
+
+  def test_callback
+    callback = MockCallback.new
+    num_boost_round = 10
+
+    train_with(callback, num_boost_round: num_boost_round)
+
+    assert_hook_counts(callback, before_iteration: num_boost_round, after_iteration: num_boost_round)
+    assert_rmse_history(callback, num_boost_round)
+    assert_equal((0...num_boost_round).to_a, recorded_epochs(callback))
+  end
+
+  def test_callback_breaks_on_before_iteration
+    callback = MockCallback.new
+    def callback.before_iteration(model: nil, epoch: nil)
+      @before_iteration_count += 1
+      @before_iteration_args << { epoch: epoch }
+      # If any callback returns false, break
+      epoch.even?
+    end
+
+    train_with(callback)
+
+    # Epoch 0 runs in full; epoch 1 is refused before the booster is updated.
+    assert_hook_counts(callback, before_iteration: 2, after_iteration: 1)
+    assert_rmse_history(callback, 1)
+    assert_equal((0...2).to_a, recorded_epochs(callback))
+  end
+
+  def test_callback_breaks_on_after_iteration
+    callback = MockCallback.new
+    def callback.after_iteration(model: nil, epoch: nil, history: nil)
+      @after_iteration_count += 1
+      @history = history
+      epoch < 7
+    end
+
+    train_with(callback)
+
+    # Epochs 0..7 all run; the hook returns false after epoch 7.
+    assert_hook_counts(callback, before_iteration: 8, after_iteration: 8)
+    assert_rmse_history(callback, 8)
+    assert_equal((0...8).to_a, recorded_epochs(callback))
+  end
+
+  def test_updates_model_before_training
+    callback = MockCallback.new
+    def callback.before_training(model: nil)
+      model['device'] = 'cuda:0'
+      model
+    end
+
+    model = train_with(callback)
+
+    # assert_equal takes the expected value first
+    assert_equal 'cuda:0', model['device']
+  end
+
+  private
+
+  # Trains the regression fixture for num_boost_round rounds with a single
+  # callback, evaluating on both the train and test sets.
+  def train_with(callback, num_boost_round: 10)
+    XGBoost.train(
+      regression_params,
+      regression_train,
+      num_boost_round: num_boost_round,
+      callbacks: [callback],
+      evals: [[regression_train, 'train'], [regression_test, 'eval']]
+    )
+  end
+
+  # Asserts one call per training hook and the given iteration-hook counts.
+  def assert_hook_counts(callback, before_iteration:, after_iteration:)
+    assert_equal 1, callback.before_training_count
+    assert_equal 1, callback.after_training_count
+    assert_equal before_iteration, callback.before_iteration_count
+    assert_equal after_iteration, callback.after_iteration_count
+  end
+
+  # Asserts both RMSE histories have expected_size values within 1.0 of zero.
+  def assert_rmse_history(callback, expected_size)
+    %w[train eval].each do |data_name|
+      rmse = callback.history[data_name]['rmse']
+      assert_equal expected_size, rmse.size
+      rmse.each { |value| assert_in_delta 0.00, value, 1.0 }
+    end
+  end
+
+  # Epochs passed to before_iteration, in call order.
+  def recorded_epochs(callback)
+    callback.before_iteration_args.map { |e| e[:epoch] }
+  end
+end