Add a backend optimizer for adafactor.
PiperOrigin-RevId: 581476942
tensorflower-gardener committed Nov 11, 2023
1 parent 55eb1d6 commit 76027f9
Showing 3 changed files with 40 additions and 0 deletions.
5 changes: 5 additions & 0 deletions official/modeling/optimization/configs/optimization_config.py
@@ -42,6 +42,8 @@ class OptimizerConfig(oneof.OneOfConfig):
lars: lars optimizer.
adagrad: adagrad optimizer.
slide: slide optimizer.
adafactor: adafactor optimizer.
adafactor_keras: adafactor optimizer from Keras (tf_keras.optimizers.Adafactor).
"""
type: Optional[str] = None
sgd: opt_cfg.SGDConfig = dataclasses.field(default_factory=opt_cfg.SGDConfig)
@@ -80,6 +82,9 @@ class OptimizerConfig(oneof.OneOfConfig):
adafactor: opt_cfg.AdafactorConfig = dataclasses.field(
default_factory=opt_cfg.AdafactorConfig
)
adafactor_keras: opt_cfg.AdafactorKerasConfig = dataclasses.field(
default_factory=opt_cfg.AdafactorKerasConfig
)


@dataclasses.dataclass
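As a usage sketch, the new option can be selected through the one-of config like the existing optimizer types. The snippet below is illustrative only, not part of the commit: the import paths follow the file layout shown in this diff, and OneOfConfig.get() returning the selected sub-config is an assumption about the Model Garden's oneof helper.

# Illustrative sketch (not part of the commit): selecting the new
# adafactor_keras backend via the one-of OptimizerConfig.
from official.modeling.optimization.configs import optimization_config
from official.modeling.optimization.configs import optimizer_config

opt_config = optimization_config.OptimizerConfig(
    type='adafactor_keras',
    adafactor_keras=optimizer_config.AdafactorKerasConfig(
        learning_rate=1e-3,
        relative_step=True,
    ),
)

# Assumption: OneOfConfig.get() returns the sub-config selected by `type`.
print(opt_config.get())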
34 changes: 34 additions & 0 deletions official/modeling/optimization/configs/optimizer_config.py
@@ -338,3 +338,37 @@ class AdafactorConfig(BaseOptimizerConfig):
epsilon2: float = 1e-3
weight_decay: Optional[float] = None
include_in_weight_decay: Optional[str] = None


@dataclasses.dataclass
class AdafactorKerasConfig(BaseOptimizerConfig):
"""Configuration for AdafactorKeras optimizer.
The attributes of this class match the arguments of the Adafactor
implementation provided by Keras.
Attributes:
learning_rate: Initial value for the learning rate: either a floating
point value, or a
`tf_keras.optimizers.schedules.LearningRateSchedule` instance.
Defaults to 0.001.
beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
epsilon_1: float, defaults to 1e-30. A small offset to keep denominator
away from 0.
epsilon_2: float, defaults to 1e-3. A small offset to keep the learning
rate from becoming too small over time.
clip_threshold: float, defaults to 1.0. Clipping threshold. This is a
part of Adafactor algorithm, independent from `clipnorm`, `clipvalue`
and `global_clipnorm`.
relative_step: bool, defaults to True. If `learning_rate` is a constant
and `relative_step=True`, learning rate will be adjusted based on
current iterations. This is a default learning rate decay in
Adafactor.
"""
name: str = "Adafactor"
learning_rate: float = 0.001
beta_2_decay: float = -0.8
epsilon_1: float = 1e-30
epsilon_2: float = 1e-3
clip_threshold: float = 1.0
relative_step: bool = True
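The fields above mirror the constructor arguments of the Keras Adafactor optimizer one-to-one. Below is a minimal sketch of the equivalent direct construction, assuming the tf_keras shim package used by the factory file that follows (plain tf.keras should accept the same arguments).

# Sketch: AdafactorKerasConfig defaults passed straight to the Keras optimizer.
import tf_keras

optimizer = tf_keras.optimizers.Adafactor(
    learning_rate=0.001,
    beta_2_decay=-0.8,
    epsilon_1=1e-30,
    epsilon_2=1e-3,
    clip_threshold=1.0,
    relative_step=True,
    name='Adafactor',
)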
1 change: 1 addition & 0 deletions official/modeling/optimization/optimizer_factory.py
@@ -37,6 +37,7 @@
'lars': lars.LARS,
'slide': slide_optimizer.SLIDE,
'adafactor': adafactor_optimizer.Adafactor,
'adafactor_keras': tf_keras.optimizers.Adafactor,
}

LEGACY_OPTIMIZERS_CLS = {
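With the registry entry above, the string 'adafactor_keras' resolves to tf_keras.optimizers.Adafactor like any other key. A simplified sketch of that dispatch is shown below; the real OptimizerFactory also wires in learning-rate schedules, gradient clipping, and EMA, so build_optimizer_sketch here is a hypothetical helper, not the actual factory API.

# Simplified, hypothetical dispatch mirroring the OPTIMIZERS_CLS lookup above.
import tf_keras

OPTIMIZERS_CLS = {
    'adafactor_keras': tf_keras.optimizers.Adafactor,
}


def build_optimizer_sketch(opt_type, **kwargs):
  """Instantiates the optimizer class registered under `opt_type`."""
  if opt_type not in OPTIMIZERS_CLS:
    raise ValueError(f'Unknown optimizer type: {opt_type}')
  return OPTIMIZERS_CLS[opt_type](**kwargs)


optimizer = build_optimizer_sketch('adafactor_keras', learning_rate=1e-3)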
