From c98ea02777d52d9da0b006a4e1599ba2f246f503 Mon Sep 17 00:00:00 2001
From: AMindToThink <61801493+AMindToThink@users.noreply.github.com>
Date: Tue, 15 Apr 2025 12:58:07 -0500
Subject: [PATCH 01/11] value_model can't be None, so it shouldn't be Optional or have default None

---
 trl/trainer/ppo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 54b62394a8..885757767b 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -107,7 +107,7 @@ def __init__(
         ref_model: Optional[nn.Module],
         reward_model: nn.Module,
         train_dataset: Dataset,
-        value_model: Optional[nn.Module] = None,
+        value_model: nn.Module,
         data_collator: Optional[DataCollatorWithPadding] = None,
         eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         # less commonly used

From 0f59fa52099185e0fa9fc07b22e87e8cc7a59486 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Tue, 15 Apr 2025 18:32:22 +0000
Subject: [PATCH 02/11] made changes, now to test saving value

---
 trl/trainer/ppo_config.py  |  6 ++++++
 trl/trainer/ppo_trainer.py | 25 +++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py
index 8f23084ce8..a24e48f2f9 100644
--- a/trl/trainer/ppo_config.py
+++ b/trl/trainer/ppo_config.py
@@ -58,6 +58,8 @@ class PPOConfig(OnPolicyConfig):
             Discount factor.
         lam (`float`, *optional*, defaults to `0.95`):
             Lambda value for GAE.
+        save_value_model (`bool`, *optional*, defaults to `False`):
+            Whether the value model (also known as the critic model) should be saved when the policy model is saved.
         ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
             This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed.
            However, disabling this option allows training models that exceed the VRAM
@@ -121,6 +123,10 @@ class PPOConfig(OnPolicyConfig):
         default=0.95,
         metadata={"help": "Lambda value for GAE."},
     )
+    save_value_model: bool = field(
+        default=False,
+        metadata={"help": "Whether the value model (also known as the critic model) should be saved when the policy model is saved."}
+    )
     ds3_gather_for_generation: bool = field(
         default=True,
         metadata={
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 54b62394a8..2a3dcf14bc 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -328,19 +328,40 @@ def null_ref_context(self):
         self.model.policy.set_adapter(self.model_adapter_name or "default")
 
     def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
+        # Handle the None case here so that we can have subfolders for policy and value
+        if output_dir is None:
+            output_dir = self.args.output_dir
+        if output_dir is None:
+            raise ValueError("No output directory specified for saving the model")
+
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
 
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
             self.deepspeed = self.model
-
-        super().save_model(output_dir, _internal_call)
+        policy_output_dir = output_dir if not self.args.save_value_model else output_dir + "/policy_model"
+        super().save_model(policy_output_dir, _internal_call)
 
         self.model = backup_model
 
         if self.is_deepspeed_enabled:
             self.deepspeed = backup_deepspeed
+
+        if self.args.save_value_model:
+            backup_model = self.model
+            self.model = self.model.value_model
+
+            if self.is_deepspeed_enabled:
+                backup_deepspeed = self.deepspeed
+                self.deepspeed = self.model
+            value_output_dir = output_dir if not self.args.save_value_model else output_dir + "/value_model"
+            super().save_model(value_output_dir, _internal_call)
+
+            self.model = backup_model
+
+            if self.is_deepspeed_enabled:
+                self.deepspeed = backup_deepspeed
 
     def train(self):
         args = self.args
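
For orientation between patches: a minimal sketch of how the flag added in PATCH 02 would be switched on. This assumes otherwise standard PPOConfig usage; the output_dir value is an illustrative placeholder, not taken from this series.

from trl import PPOConfig

training_args = PPOConfig(
    output_dir="ppo_run",  # placeholder path
    # With this patch, save_value_model=True writes the policy to
    # ppo_run/policy_model and the value model to ppo_run/value_model.
    save_value_model=True,
)
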
From 4f54087b8fa6b0a2261c3dd9fa6896c914b959ec Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Tue, 15 Apr 2025 19:59:59 +0000
Subject: [PATCH 03/11] testing

---
 trl/trainer/ppo_trainer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 02c8a5662a..844bc3fac6 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -328,6 +328,7 @@ def null_ref_context(self):
         self.model.policy.set_adapter(self.model_adapter_name or "default")
 
     def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
+        import pdb;pdb.set_trace()
         # Handle the None case here so that we can have subfolders for policy and value
         if output_dir is None:
             output_dir = self.args.output_dir
@@ -340,7 +341,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
             self.deepspeed = self.model
-        policy_output_dir = output_dir if not self.args.save_value_model else output_dir + "/policy_model"
+        policy_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "policy_model")
         super().save_model(policy_output_dir, _internal_call)
 
         self.model = backup_model
@@ -355,7 +356,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             if self.is_deepspeed_enabled:
                 backup_deepspeed = self.deepspeed
                 self.deepspeed = self.model
-            value_output_dir = output_dir if not self.args.save_value_model else output_dir + "/value_model"
+            value_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "value_model")
             super().save_model(value_output_dir, _internal_call)
 
             self.model = backup_model

From aa2be6fe04f81f92b4ab123855c813b46761aeeb Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Tue, 15 Apr 2025 20:04:34 +0000
Subject: [PATCH 04/11] get rid of output_dir None check

---
 trl/trainer/ppo_trainer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 844bc3fac6..0ed75b6cf0 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -328,12 +328,12 @@ def null_ref_context(self):
         self.model.policy.set_adapter(self.model_adapter_name or "default")
 
     def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
-        import pdb;pdb.set_trace()
+        # import pdb;pdb.set_trace()
         # Handle the None case here so that we can have subfolders for policy and value
-        if output_dir is None:
-            output_dir = self.args.output_dir
-        if output_dir is None:
-            raise ValueError("No output directory specified for saving the model")
+        # if output_dir is None:
+        #     output_dir = self.args.output_dir
+        # if output_dir is None:
+        #     raise ValueError("No output directory specified for saving the model")
 
         backup_model = self.model
         self.model = self.model.policy  # save only the policy

From 36c4c82e713e336d3c2e98ac5ee23adfa0447c05 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Tue, 15 Apr 2025 20:15:41 +0000
Subject: [PATCH 05/11] added prints

---
 trl/trainer/ppo_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 0ed75b6cf0..afa70d4a97 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -337,7 +337,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
 
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
-
+        print("Accessed policy, continuing")
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
             self.deepspeed = self.model
@@ -350,6 +350,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             self.deepspeed = backup_deepspeed
 
         if self.args.save_value_model:
+            print("SAVING VALUE MODEL", end="")
             backup_model = self.model
             self.model = self.model.value_model
 
@@ -357,8 +358,9 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
                 backup_deepspeed = self.deepspeed
                 self.deepspeed = self.model
             value_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "value_model")
+            print(" to " + value_output_dir)
             super().save_model(value_output_dir, _internal_call)
-
+            print("resetting model to the combined")
             self.model = backup_model
 
             if self.is_deepspeed_enabled:

From 373515786f3261ace915489abeac4af5fade30e1 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Wed, 16 Apr 2025 16:46:55 +0000
Subject: [PATCH 06/11] fix policy not found error by making sure policy exists

---
 trl/trainer/ppo_trainer.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index afa70d4a97..2fc61283df 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -334,7 +334,14 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
         #     output_dir = self.args.output_dir
         # if output_dir is None:
         #     raise ValueError("No output directory specified for saving the model")
-
+        times_called = getattr(self, "_save_model_count", 0)
+        times_called += 1
+        setattr(self, '_save_model_count', times_called)
+        print(f"save_model has been called {times_called} time")
+        print(f"{output_dir=}")
+        print(f"{type(self.model)=}")
+        if not hasattr(self.model, 'policy'):
+            return
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
         print("Accessed policy, continuing")
@@ -364,7 +371,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             self.model = backup_model
 
             if self.is_deepspeed_enabled:
-                self.deepspeed = backup_deepspeed 
+                self.deepspeed = backup_deepspeed
 
     def train(self):
         args = self.args

From 624cd345f286e5eadd9796bfaaa18c6e911b6859 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Wed, 16 Apr 2025 17:06:22 +0000
Subject: [PATCH 07/11] It works: use AutoModel with the subfolder argument to load the models after putting them on huggingface!

---
 trl/trainer/ppo_trainer.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 2fc61283df..44cc6252d0 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -328,23 +328,16 @@ def null_ref_context(self):
         self.model.policy.set_adapter(self.model_adapter_name or "default")
 
     def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
-        # import pdb;pdb.set_trace()
         # Handle the None case here so that we can have subfolders for policy and value
-        # if output_dir is None:
-        #     output_dir = self.args.output_dir
-        # if output_dir is None:
-        #     raise ValueError("No output directory specified for saving the model")
-        times_called = getattr(self, "_save_model_count", 0)
-        times_called += 1
-        setattr(self, '_save_model_count', times_called)
-        print(f"save_model has been called {times_called} time")
-        print(f"{output_dir=}")
-        print(f"{type(self.model)=}")
+        if output_dir is None:
+            output_dir = self.args.output_dir
+        if output_dir is None:
+            raise ValueError("No output directory specified for saving the model")
+        # I am unsure whether this early return is legal. Line 4814 in Trainer.py says that save_model has to be executed on all processes for TPU training. Previously, save_model would be called while self.model was set to self.model.policy, resulting in errors. Including this line gets rid of those errors and the model still gets uploaded.
         if not hasattr(self.model, 'policy'):
             return
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
-        print("Accessed policy, continuing")
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
             self.deepspeed = self.model
@@ -367,7 +360,6 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             value_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "value_model")
             print(" to " + value_output_dir)
             super().save_model(value_output_dir, _internal_call)
-            print("resetting model to the combined")
             self.model = backup_model
 
             if self.is_deepspeed_enabled:

From 47a0da7715ae678e95baa43c351bd122c1f9acd6 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Wed, 16 Apr 2025 17:08:11 +0000
Subject: [PATCH 08/11] Delete debugging prints

---
 trl/trainer/ppo_trainer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 44cc6252d0..8e7afdcbc8 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -350,7 +350,6 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             self.deepspeed = backup_deepspeed
 
         if self.args.save_value_model:
-            print("SAVING VALUE MODEL", end="")
             backup_model = self.model
             self.model = self.model.value_model
 
@@ -358,7 +357,6 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
                 backup_deepspeed = self.deepspeed
                 self.deepspeed = self.model
             value_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "value_model")
-            print(" to " + value_output_dir)
             super().save_model(value_output_dir, _internal_call)
 
             self.model = backup_model

From 81cb59270d2a81156b34020e79c493dece3644b1 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Wed, 16 Apr 2025 17:33:19 +0000
Subject: [PATCH 09/11] More description on save_value_model

---
 trl/trainer/ppo_config.py  | 4 ++--
 trl/trainer/ppo_trainer.py | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py
index a24e48f2f9..0a10598305 100644
--- a/trl/trainer/ppo_config.py
+++ b/trl/trainer/ppo_config.py
@@ -59,7 +59,7 @@ class PPOConfig(OnPolicyConfig):
         lam (`float`, *optional*, defaults to `0.95`):
             Lambda value for GAE.
         save_value_model (`bool`, *optional*, defaults to `False`):
-            Whether the value model (also known as the critic model) should be saved when the policy model is saved.
+            Whether the value model (also known as the critic model) should be saved when the policy model is saved. If `False`, the folder will contain the files for the policy only. If `True`, the folder will contain sub-folders for the policy and value model. You can import them by specifying the subfolder using a keyword argument: `from_pretrained(repo_id, subfolder=subfolder)`
         ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
             This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed.
             However, disabling this option allows training models that exceed the VRAM
@@ -125,7 +125,7 @@ class PPOConfig(OnPolicyConfig):
     )
     save_value_model: bool = field(
         default=False,
-        metadata={"help": "Whether the value model (also known as the critic model) should be saved when the policy model is saved."}
+        metadata={"help": "Whether the value model (also known as the critic model) should be saved when the policy model is saved. If `False`, the folder will contain the files for the policy only. If `True`, the folder will contain sub-folders for the policy and value model. You can import them by specifying the subfolder using a keyword argument: `from_pretrained(repo_id, subfolder=subfolder)`"}
     )
     ds3_gather_for_generation: bool = field(
         default=True,
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 8e7afdcbc8..952c284744 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -107,7 +107,7 @@ def __init__(
         ref_model: Optional[nn.Module],
         reward_model: nn.Module,
         train_dataset: Dataset,
-        value_model: nn.Module,
+        value_model: Optional[nn.Module] = None,
         data_collator: Optional[DataCollatorWithPadding] = None,
         eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         # less commonly used
@@ -333,14 +333,16 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             output_dir = self.args.output_dir
         if output_dir is None:
             raise ValueError("No output directory specified for saving the model")
-        # I am unsure whether this early return is legal. Line 4814 in Trainer.py says that save_model has to be executed on all processes for TPU training. Previously, save_model would be called while self.model was set to self.model.policy, resulting in errors. Including this line gets rid of those errors and the model still gets uploaded.
+        # I am unsure whether this early return is legal. Line 4814 in Trainer.py says that save_model has to be executed on all processes for TPU training. Previously, save_model would be called in parallel while one process had already set self.model to self.model.policy, resulting in errors. Including this line gets rid of those errors and the model still gets uploaded.
         if not hasattr(self.model, 'policy'):
             return
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
+
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
             self.deepspeed = self.model
+
         policy_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "policy_model")
         super().save_model(policy_output_dir, _internal_call)
@@ -356,7 +358,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             if self.is_deepspeed_enabled:
                 backup_deepspeed = self.deepspeed
                 self.deepspeed = self.model
-            value_output_dir = output_dir if not self.args.save_value_model else os.path.join(output_dir, "value_model")
+            value_output_dir = os.path.join(output_dir, "value_model")
             super().save_model(value_output_dir, _internal_call)
 
             self.model = backup_model

From ec23826fdad14983114b4dbeb2f4ce0c7029a2e4 Mon Sep 17 00:00:00 2001
From: AMindToThink
Date: Wed, 16 Apr 2025 17:37:14 +0000
Subject: [PATCH 10/11] run precommit

---
 examples/scripts/sft_video_llm.py | 2 +-
 trl/trainer/ppo_config.py         | 4 +++-
 trl/trainer/ppo_trainer.py        | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/scripts/sft_video_llm.py b/examples/scripts/sft_video_llm.py
index ca46ae64b4..cbf172279c 100644
--- a/examples/scripts/sft_video_llm.py
+++ b/examples/scripts/sft_video_llm.py
@@ -50,12 +50,12 @@
 
 import requests
 import torch
-import wandb
 from datasets import load_dataset
 from peft import LoraConfig
 from qwen_vl_utils import process_vision_info
 from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig, Qwen2VLProcessor
 
+import wandb
 from trl import ModelConfig, ScriptArguments, SFTConfig, SFTTrainer, TrlParser, get_kbit_device_map
diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py
index 0a10598305..c00f1c1d02 100644
--- a/trl/trainer/ppo_config.py
+++ b/trl/trainer/ppo_config.py
@@ -125,7 +125,9 @@ class PPOConfig(OnPolicyConfig):
     )
     save_value_model: bool = field(
         default=False,
-        metadata={"help": "Whether the value model (also known as the critic model) should be saved when the policy model is saved. If `False`, the folder will contain the files for the policy only. If `True`, the folder will contain sub-folders for the policy and value model. You can import them by specifying the subfolder using a keyword argument: `from_pretrained(repo_id, subfolder=subfolder)`"}
+        metadata={
+            "help": "Whether the value model (also known as the critic model) should be saved when the policy model is saved. If `False`, the folder will contain the files for the policy only. If `True`, the folder will contain sub-folders for the policy and value model. You can import them by specifying the subfolder using a keyword argument: `from_pretrained(repo_id, subfolder=subfolder)`"
+        },
     )
     ds3_gather_for_generation: bool = field(
         default=True,
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 952c284744..39e063edc7 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -334,7 +334,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
         if output_dir is None:
             raise ValueError("No output directory specified for saving the model")
         # I am unsure whether this early return is legal. Line 4814 in Trainer.py says that save_model has to be executed on all processes for TPU training. Previously, save_model would be called in parallel while one process had already set self.model to self.model.policy, resulting in errors. Including this line gets rid of those errors and the model still gets uploaded.
-        if not hasattr(self.model, 'policy'):
+        if not hasattr(self.model, "policy"):
             return
         backup_model = self.model
         self.model = self.model.policy  # save only the policy
@@ -350,7 +350,7 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
 
         if self.is_deepspeed_enabled:
             self.deepspeed = backup_deepspeed
-
+
         if self.args.save_value_model:
             backup_model = self.model
             self.model = self.model.value_model

From a288b00e4de8bca0c4a67969229e022d1aac7078 Mon Sep 17 00:00:00 2001
From: AMindToThink <61801493+AMindToThink@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:40:28 -0500
Subject: [PATCH 11/11] Put wandb back

---
 examples/scripts/sft_video_llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/scripts/sft_video_llm.py b/examples/scripts/sft_video_llm.py
index cbf172279c..ca46ae64b4 100644
--- a/examples/scripts/sft_video_llm.py
+++ b/examples/scripts/sft_video_llm.py
@@ -50,12 +50,12 @@
 
 import requests
 import torch
+import wandb
 from datasets import load_dataset
 from peft import LoraConfig
 from qwen_vl_utils import process_vision_info
 from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig, Qwen2VLProcessor
 
-import wandb
 from trl import ModelConfig, ScriptArguments, SFTConfig, SFTTrainer, TrlParser, get_kbit_device_map
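
As the PATCH 07 subject and the expanded save_value_model help text indicate, the two sub-folders can be loaded back through the subfolder keyword argument of from_pretrained. A minimal sketch, assuming the checkpoint was pushed to the Hub; the repo id is a placeholder, and the concrete Auto classes are assumptions based on how TRL's PPO examples usually build the policy and value models (the patches themselves only mention AutoModel with subfolder).

from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification

repo_id = "your-username/your-ppo-run"  # placeholder repo id

# The policy is a causal LM, saved under the policy_model sub-folder.
policy = AutoModelForCausalLM.from_pretrained(repo_id, subfolder="policy_model")

# The value model (critic) is a single-label sequence-classification head,
# saved under the value_model sub-folder.
value_model = AutoModelForSequenceClassification.from_pretrained(
    repo_id, subfolder="value_model", num_labels=1
)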