Rejection Sampling on GSM8k #78

Merged
merged 104 commits on Nov 20, 2024
0044902
run 70b
AlexPiche Oct 31, 2024
09f7fb2
clean up debug code
AlexPiche Oct 31, 2024
0537e02
conf_dir as str
AlexPiche Oct 31, 2024
4b9407a
chunked prefill vllm
AlexPiche Oct 31, 2024
04279b0
iteration as str
AlexPiche Oct 31, 2024
537f9a4
no testing when test_every_n_iterations is -1
AlexPiche Oct 31, 2024
819a161
no testing with -1
AlexPiche Oct 31, 2024
b772d97
print finetuning output
AlexPiche Oct 31, 2024
4317fcb
rm seq length tokens
AlexPiche Oct 31, 2024
d781b74
fp8 quant
AlexPiche Oct 31, 2024
8657767
use deepspeed
AlexPiche Oct 31, 2024
d050cf4
better vllm logging
AlexPiche Oct 31, 2024
6b0bc26
deepspeed training
AlexPiche Oct 31, 2024
2a3f0ee
update accelerate version
AlexPiche Oct 31, 2024
1503884
negative rewards for too many steps
AlexPiche Nov 1, 2024
5600f85
discounting
AlexPiche Nov 1, 2024
743a3e6
try to get lora working
AlexPiche Nov 2, 2024
5e11b32
step norm
AlexPiche Nov 2, 2024
b42a785
penalize 20 steps tape
AlexPiche Nov 2, 2024
e8872c3
step discount
AlexPiche Nov 2, 2024
7be219b
discount and max steps
AlexPiche Nov 3, 2024
f46c0ea
fix discount typo
AlexPiche Nov 4, 2024
5d87a2a
merge 8b branch
AlexPiche Nov 4, 2024
4e79c5f
use adafactor with accelerate
AlexPiche Nov 5, 2024
1dfe78d
implicit kl
AlexPiche Nov 5, 2024
7a2961c
implicit kl
AlexPiche Nov 5, 2024
c737e72
better variable names
AlexPiche Nov 5, 2024
d9857c5
update docstring
AlexPiche Nov 5, 2024
4273428
update docstring
AlexPiche Nov 5, 2024
efc38f5
get log probs worker
AlexPiche Nov 5, 2024
2df67bd
WIP commit for browsing math tapes
rizar Nov 5, 2024
a2427fe
samples -> problems
rizar Nov 5, 2024
e805a33
dataset stats logging
AlexPiche Nov 5, 2024
575df66
refactoring of the rl code
AlexPiche Nov 5, 2024
2e332a7
conf/accelerate
AlexPiche Nov 5, 2024
540e078
clean up conf
AlexPiche Nov 5, 2024
2d6ff26
clean up conf
AlexPiche Nov 5, 2024
e240688
better default hps
AlexPiche Nov 5, 2024
db225e0
clean up
AlexPiche Nov 5, 2024
fc204eb
bigger batch size
AlexPiche Nov 5, 2024
7679984
small refactor
AlexPiche Nov 5, 2024
193e94b
dataset stats on main process only
AlexPiche Nov 5, 2024
4bdf653
fix dataset stats when training is resumed
AlexPiche Nov 6, 2024
5ac8f14
try RS
AlexPiche Nov 7, 2024
ebab454
typo in config
AlexPiche Nov 7, 2024
d09d7ff
rs
AlexPiche Nov 7, 2024
1696a51
read 100 llm calls at a time
rizar Nov 8, 2024
43b7636
Merge branch 'llama70b_gsm8k' into browse_math_tapes
rizar Nov 8, 2024
127c7f3
changeable port for tape_browser
rizar Nov 8, 2024
631ebb0
tape browser launch script and json gathering script
rizar Nov 8, 2024
961a429
better way to save tapes
rizar Nov 8, 2024
98656ea
cute little renaming
rizar Nov 8, 2024
3af4b2b
Merge pull request #92 from ServiceNow/browse_math_tapes
AlexPiche Nov 12, 2024
056592c
simple rl gsm8k
AlexPiche Nov 12, 2024
8605fe5
produce more tokens
AlexPiche Nov 13, 2024
7ecc197
increase max tokens
AlexPiche Nov 13, 2024
196c9b4
clean up agent architecture
AlexPiche Nov 13, 2024
bb96ae8
simpler agent
AlexPiche Nov 14, 2024
7499c8f
rm loop
AlexPiche Nov 14, 2024
ac02145
clean string
AlexPiche Nov 14, 2024
c241214
hack simple agent
AlexPiche Nov 14, 2024
000ade0
improve simple agent
AlexPiche Nov 14, 2024
9ce7a5b
clean up string
AlexPiche Nov 14, 2024
64ac725
2000 tokens
AlexPiche Nov 14, 2024
3379173
better logging
AlexPiche Nov 14, 2024
294d2bf
fix browser to read new tapes
AlexPiche Nov 14, 2024
54dc1b7
assert its the right completion
AlexPiche Nov 15, 2024
73ad382
rm assert
AlexPiche Nov 15, 2024
f8e24d8
relu weights
AlexPiche Nov 15, 2024
a30dbc6
klcoef for reinforce
AlexPiche Nov 15, 2024
bbabd3e
revert to adam
AlexPiche Nov 16, 2024
4cfa51e
typo
AlexPiche Nov 16, 2024
53ac342
rm bos token
AlexPiche Nov 16, 2024
7e903c0
clean up nodes
AlexPiche Nov 17, 2024
a795ac1
clipping loss to 0
AlexPiche Nov 18, 2024
4854cdc
better handling of bos token
AlexPiche Nov 18, 2024
e0f8203
Merge pull request #101 from ServiceNow/simple_rl_gsm8k
AlexPiche Nov 18, 2024
f7b66bf
rm BOS from tests
AlexPiche Nov 18, 2024
94f7c21
Merge remote-tracking branch 'origin/simple_rl_gsm8k' into llama70b_g…
AlexPiche Nov 18, 2024
8608726
fix typo
AlexPiche Nov 18, 2024
e97ed41
clean up example
AlexPiche Nov 18, 2024
8e19489
Merge remote-tracking branch 'origin/main' into llama70b_gsm8k
AlexPiche Nov 18, 2024
3b09425
Merge remote-tracking branch 'origin/fix_test' into llama70b_gsm8k
AlexPiche Nov 18, 2024
1303602
rm run training in process
AlexPiche Nov 18, 2024
73ee354
rm if rl start from base model
AlexPiche Nov 18, 2024
35cb007
clean up
AlexPiche Nov 19, 2024
03174da
typo
AlexPiche Nov 19, 2024
6ffc6b0
clean up
AlexPiche Nov 19, 2024
397e0b8
better docs
AlexPiche Nov 19, 2024
8b60ba6
clean up
AlexPiche Nov 19, 2024
0200482
Update tapeagents/observe.py
AlexPiche Nov 19, 2024
4c4e71a
dima changes
AlexPiche Nov 19, 2024
5618143
clean up
AlexPiche Nov 19, 2024
6268098
update readme
AlexPiche Nov 19, 2024
9dd4cf7
reverse change
AlexPiche Nov 19, 2024
a59aaad
improve doc
AlexPiche Nov 19, 2024
251ce78
improve doc
AlexPiche Nov 19, 2024
cbae73f
rm debug code
AlexPiche Nov 20, 2024
94109b0
fix logging of max min dataset len
AlexPiche Nov 20, 2024
6ef2ea3
fix min seq length logging
AlexPiche Nov 20, 2024
2d698cd
fix naming of variables
AlexPiche Nov 20, 2024
18b1b02
typo
AlexPiche Nov 20, 2024
1291be2
fix naming
AlexPiche Nov 20, 2024
90b080f
hf dataset
AlexPiche Nov 20, 2024
11 changes: 6 additions & 5 deletions conf/finetune/rl_llama31_8b.yaml
@@ -17,14 +17,14 @@ wandb_resume: always
# Whether to use only the basename or the full path as the run name
wandb_use_basename: false
config_name: meta-llama/Meta-Llama-3.1-8B-Instruct
learning_rate: 0.000005
train_batch_size: 1
gradient_accumulation_passes: 1024
learning_rate: 0.0000025
train_batch_size: 4
gradient_accumulation_passes: 256
seq_length: 4096
load_as_bf16: True
max_train_steps: 100000
save_checkpoint_steps: ???
optim: adamw_torch
optim: adafactor # FIXME: adamw runs OOM with accelerate
objective: rl
log_each_n_steps: 1
resume_dataloader: false
Expand All @@ -33,6 +33,7 @@ use_safetensors: true
weight_decay: 0.1
gradient_clipping_threshold: 1
rl:
  kl_coef: 0.05
  kl_coef: 0.0
  reward_minus_kl_coef: 0.0
  use_advantages: true
  algo: reinforce
34 changes: 34 additions & 0 deletions conf/finetune/rs_llama31_8b.yaml
@@ -0,0 +1,34 @@
defaults:
  - base
  - _self_

# Use W&B experiment logging
use_wandb: True
# W&B id; if given, will resume this run
wandb_id: null
# W&B name; if not given will use run dir
wandb_name: null
# W&B entity name
wandb_entity_name: null
# W&B project name
wandb_project_name: tapeagents
# W&B resume policy
wandb_resume: always
# Whether to use only the basename or the full path as the run name
wandb_use_basename: false
config_name: meta-llama/Meta-Llama-3.1-8B-Instruct
learning_rate: 0.0000025
train_batch_size: 4
gradient_accumulation_passes: 256
seq_length: 4096
load_as_bf16: True
max_train_steps: 100000
save_checkpoint_steps: ???
optim: adafactor # FIXME: adamw runs OOM with accelerate
objective: nll
log_each_n_steps: 1
resume_dataloader: false
cuda_empty_cache: true
use_safetensors: true
weight_decay: 0.1
gradient_clipping_threshold: 1
4 changes: 3 additions & 1 deletion conf/rl_debug.yaml
@@ -1,8 +1,10 @@
defaults:
  - rl_gsm8k
  - _self_

max_agent_forks: 16
attempts: 1

test_every_n_iterations: -1
finetune:
  save_checkpoint_steps: 2
  gradient_accumulation_passes: 16
15 changes: 9 additions & 6 deletions conf/rl_gsm8k.yaml
@@ -3,20 +3,22 @@ defaults:
- _self_

n_workers: 32
max_loops: 10
get_log_probs_workers: 1
max_loops: 1
test_every_n_iterations: 5
model_path: meta-llama/Meta-Llama-3.1-8B-Instruct
max_agent_forks: 1024
attempts: 64
force_restart: false
max_iterations: 100
max_iterations: 1000
use_rejection_sampling: false
llm:
  parameters:
    max_tokens: 1024
    max_tokens: 2000
    temperature: 0.7
test_llm:
  parameters:
    max_tokens: 1024
  parameters:
    max_tokens: ${...llm.parameters.max_tokens}
    temperature: 0.
finetune:
@@ -27,16 +29,17 @@ finetune:
  # One step is one weight update. See the finetuning configuration
  # for the info on how many sequences are used for each weight update.
  save_checkpoint_steps: 10
  seq_length: 2000

vllm_config:
  vllm_kwargs:
    --download-dir: /mnt/llmd/base_models/
    --max-model-len: 8000
    --gpu-memory-utilization: 0.9
    # vLLM log-probs OOM: https://github.com/vllm-project/vllm/issues/5907
    --enable-chunked-prefill: ""

output_dir: outputs/rl_gsm8k
accelerate_cfg_path: conf/accelerate/accelerate_base.yaml

hydra:
  run:
6 changes: 6 additions & 0 deletions conf/rs_gsm8k.yaml
@@ -0,0 +1,6 @@
defaults:
  - rl_gsm8k
  - override finetune: rs_llama31_8b
  - _self_

use_rejection_sampling: true
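The defaults list above composes `rs_gsm8k` on top of `rl_gsm8k`, with the `finetune` group swapped for `rs_llama31_8b`. Conceptually this is a deep merge in which the later config wins; the sketch below is a minimal stand-in for the Hydra/OmegaConf machinery (the `deep_merge` helper and the trimmed-down config dicts are illustrative, not the real loader):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Recursively merge override into base (override wins), mimicking
    how a Hydra defaults list layers rs_gsm8k.yaml on top of rl_gsm8k.yaml."""
    out = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = deep_merge(out[key], value)
        else:
            out[key] = value
    return out


# Trimmed-down stand-ins for the two configs in this PR.
rl_gsm8k = {
    "use_rejection_sampling": False,
    "finetune": {"objective": "rl", "optim": "adafactor"},
}
rs_overrides = {
    "use_rejection_sampling": True,
    "finetune": {"objective": "nll"},  # rs_llama31_8b switches to plain NLL
}

rs_gsm8k = deep_merge(rl_gsm8k, rs_overrides)
# Keys not overridden (e.g. optim) are inherited from the base config.
```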
11 changes: 8 additions & 3 deletions examples/gsm8k_tuning/README.md
@@ -5,16 +5,21 @@ This example demonstrates how to improve the math skills of a small LLama model.
We built a basic math agent that uses the LLama 3.1 70B model, equipped with reasoning and a calculator tool, to solve math problems: [math_agent.py](math_agent.py).

Steps to distill the math agent:
- [run it as a teacher](produce_teacher_tapes.py), collect the tapes of the successful solutions, and produce training data for the Math Agent from them. How to run: `python -m examples.gsm8k_tuning.produce_teacher_tapes`
- [run it as a teacher](produce_teacher_tapes.py), collect the tapes of the successful solutions, and produce training data for the Math Agent from them. How to run: `python -m examples.gsm8k_tuning.produce_teacher_tapes` or download the data generated by `meta-llama/Llama-3.1-70B-Instruct`
```python
df = pd.read_json("hf://datasets/ServiceNow/llama31_70b_gsm8k_3k/training_samples_3k.jsonl", lines=True)
df.to_json("training_samples_3k.jsonl", orient="records", lines=True)
```
- [fine-tune smaller LLama 3.1 8B model](finetune_student.py) on the training data to get a tuned Math Agent. How to run: `python -m examples.gsm8k_tuning.finetune_student`
- [merge the lora weights](../../tapeagents/finetune/lora.py) to be able to serve the model with vLLM. How to run: `python -m tapeagents.finetune.lora PATH/TO/WEIGHTS`
- [evaluate the tuned Math Agent](evaluate_student.py) on a subset of the GSM8K test set, comparing the accuracy of the teacher agent, the student agent before tuning, and the student agent after tuning. How to run: `python -m examples.gsm8k_tuning.evaluate_student`

<img width="526" alt="image" src="https://github.com/user-attachments/assets/a7aa2908-2a86-4b85-92d2-8c133e9ac0ff">
<img width="526" alt="image" src="https://github.com/user-attachments/assets/55d099ab-ff5c-480b-b5b3-504b4206e677">

| Model | Test accuracy |
| ----- | ------------- |
| 8B student before tuning | 0.662 |
| 8B student after tuning | 0.775 |
| 8B student after tuning | 0.785 |
| 70B teacher | 0.931 |

RL tuning on both successful and unsuccessful solutions is coming soon. Stay tuned!
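The rejection-sampling recipe this PR enables with `use_rejection_sampling: true` can be sketched as follows; `sample_fn` and `is_correct_fn` are toy stand-ins for the vLLM rollout and the GSM8K answer check, and the helper name is hypothetical:

```python
def rejection_sample(problems, sample_fn, is_correct_fn, attempts=64):
    """Collect (problem, solution) pairs, keeping only solutions whose
    final answer checks out; the kept pairs become the fine-tuning data
    (trained with a plain NLL objective rather than an RL objective)."""
    dataset = []
    for problem in problems:
        for _ in range(attempts):
            solution = sample_fn(problem)
            if is_correct_fn(problem, solution):
                dataset.append((problem, solution))
    return dataset


# Toy demo: each "problem" carries its ground-truth answer, and the
# "model" just replays a fixed sequence of guesses (2 attempts each).
problems = [("2+2", 4), ("3*3", 9)]
samples = iter([3, 4, 9, 9])
data = rejection_sample(
    problems,
    sample_fn=lambda p: next(samples),
    is_correct_fn=lambda p, s: s == p[1],
    attempts=2,
)
# Only the 3 correct guesses survive; the wrong guess (3) is rejected.
```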
19 changes: 19 additions & 0 deletions examples/rl_gsm8k/browse.py
@@ -0,0 +1,19 @@
import os
import sys
from pathlib import Path

from examples.rl_gsm8k.cot_math_agent import MathTape
from tapeagents.renderers.camera_ready_renderer import CameraReadyRenderer
from tapeagents.tape_browser import TapeBrowser

# Comment this block out if loading the prompts and completions takes too long for you.
tape_dir = Path(sys.argv[1])
# Walk up from tape_dir to find the experiment directory that contains llm_calls.sqlite.
exp_dir = tape_dir
while not os.path.exists(exp_dir / "llm_calls.sqlite") and exp_dir != exp_dir.parent:
    exp_dir = exp_dir.parent
os.environ["TAPEAGENTS_SQLITE_DB"] = os.path.join(exp_dir, "llm_calls.sqlite")

browser = TapeBrowser(MathTape, sys.argv[1], CameraReadyRenderer(), file_extension=".json")
browser.launch(port=7680 if len(sys.argv) < 3 else int(sys.argv[2]))
109 changes: 109 additions & 0 deletions examples/rl_gsm8k/cot_math_agent.py
@@ -0,0 +1,109 @@
import logging
from typing import Annotated, Generator, Literal, TypeAlias, Union

from pydantic import Field

from examples.gsm8k_tuning.math_agent import extract_result_value  # noqa
from tapeagents.agent import Agent
from tapeagents.core import (
    Action,
    LLMOutputParsingFailureAction,
    Observation,
    Step,
    Tape,
    Thought,
)
from tapeagents.environment import Environment
from tapeagents.llms import LLM
from tapeagents.nodes import MonoNode

logger = logging.getLogger(__name__)

COT_GUIDANCE = "Think step by step. When you know the answer to the question, provide it in the following format: The answer is: <number>"


class Task(Observation):
    kind: Literal["task"] = "task"
    task: str

    def llm_view(self, indent: int | None = 2) -> str:
        return f"{self.task} {COT_GUIDANCE}"


class ReasoningThoughtwithValue(Thought):
    """
    Thought produced by the agent during the reasoning process.
    """

    kind: Literal["reasoning_thought_with_value"] = "reasoning_thought_with_value"
    reasoning: str = Field(description="chain of thoughts")
    value: float = Field(description="numeric answer extracted from the reasoning")


MathAgentStep: TypeAlias = Annotated[
    ReasoningThoughtwithValue,
    Field(discriminator="kind"),
]

MathTape = Tape[
    None,
    Union[
        Task,
        ReasoningThoughtwithValue,
        LLMOutputParsingFailureAction,
    ],
]


class ReasoningNode(MonoNode):
    def parse_completion(self, completion: str, prompt_id: str) -> Generator[Step, None, None]:
        if "The answer is" not in completion:
            yield LLMOutputParsingFailureAction(
                error=f"Failed to parse agent output: {completion}", llm_output=completion
            )
            return
        try:
            value = completion.split("The answer is")[-1]
            # strip formatting characters such as "$1,234" or "42%"
            for char in (",", " ", ":", "$", "%", "€"):
                value = value.replace(char, "")
            step = ReasoningThoughtwithValue(reasoning=completion, value=float(value.strip()))
        except Exception as e:
            logger.info(f"Failed to parse agent output: {completion}\n\nError: {e}")
            yield LLMOutputParsingFailureAction(
                error=f"Failed to parse agent output: {completion}\n\nError: {e}", llm_output=completion
            )
            return
        yield step


#### Agent and Environment ####
class CoTMathAgent(Agent):
    @classmethod
    def create(cls, llm: LLM):
        return super().create(
            llm,
            nodes=[
                ReasoningNode(
                    name="cot",
                    agent_step_cls=MathAgentStep,
                ),
            ],
            max_iterations=1,
        )


class MathEnvironment(Environment):
    def __init__(self) -> None:
        super().__init__()

    def react(self, tape: MathTape) -> MathTape:
        # The math environment is passive: no action requires a tool call,
        # so the tape is returned unchanged.
        return tape
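The value-extraction logic inside `parse_completion` can be tried on its own; `extract_answer_value` below is a hypothetical standalone restatement of the same stripping-and-cast steps:

```python
def extract_answer_value(completion: str) -> float:
    """Parse the numeric answer from a completion ending in
    'The answer is: <number>', stripping common formatting characters."""
    value = completion.split("The answer is")[-1]
    # remove thousands separators, currency symbols, etc.
    for char in (",", " ", ":", "$", "%", "€"):
        value = value.replace(char, "")
    return float(value.strip())
```

For example, a completion ending in `The answer is: $1,234` yields `1234.0` after the `$` and `,` are stripped.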
25 changes: 25 additions & 0 deletions examples/rl_gsm8k/gather_jsons.py
@@ -0,0 +1,25 @@
"""Read the JSON tape files under a folder and merge them into a single tapes.json."""

import json
import os
import sys


def gather_jsons(folder: str) -> None:
    all_jsons = []
    for root, _, files in os.walk(folder):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    all_jsons.append(json.load(f))

    # Note: the merged file is written under the input folder, so a second
    # run would pick it up as an input as well.
    dst_dir = os.path.join(folder, "all")
    os.makedirs(dst_dir, exist_ok=True)
    with open(os.path.join(dst_dir, "tapes.json"), "w") as f:
        json.dump(all_jsons, f, indent=4)


if __name__ == "__main__":
    gather_jsons(sys.argv[1])
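A quick self-contained check of the gathering behavior (the function is repeated here so the snippet runs on its own, against a temporary directory):

```python
import json
import os
import tempfile


def gather_jsons(folder: str) -> None:
    # same logic as the script above
    all_jsons = []
    for root, _, files in os.walk(folder):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    all_jsons.append(json.load(f))
    dst_dir = os.path.join(folder, "all")
    os.makedirs(dst_dir, exist_ok=True)
    with open(os.path.join(dst_dir, "tapes.json"), "w") as f:
        json.dump(all_jsons, f, indent=4)


with tempfile.TemporaryDirectory() as d:
    # write two individual tape files, then merge them
    for i in range(2):
        with open(os.path.join(d, f"tape_{i}.json"), "w") as f:
            json.dump({"id": i}, f)
    gather_jsons(d)
    with open(os.path.join(d, "all", "tapes.json")) as f:
        merged = json.load(f)
```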