Update replay_main.py

replay_main.py (changed, +22 -22)
@@ -9,7 +9,7 @@ from src.trainer.reinforce import REINFORCETrainer, REINFORCEConfig
 from copy import deepcopy
 from tqdm import tqdm
 from src.utils import print_text
-from src.utils import Config
+from src.utils import Config, count_repeated_ngrams
 from matplotlib import pyplot as plt
 import time

@@ -112,13 +112,19 @@ def main(args):
     while not reach_terminal:
         # try:
         print("searching for next step")
-
+        if current_step >= 30:
+            break
         # response_tensors, skip, current_step_response_tensors = respond_to_batch(old_model, step_tag_id_policy, tokenizer.eos_token_id, current_sol_prefix_tensor.repeat(args.k, 1), txt_len=1100, top_p=1.0, sampling_steps=args.num_truncated_rollouts, current_step=current_step, tokenizer=tokenizer)

         with torch.no_grad():
-            response_tensors = reinforce_trainer.generate([current_sol_prefix_tensor[0] for i in range(args.k)], return_prompt=False, max_new_tokens=
+            response_tensors = reinforce_trainer.generate([current_sol_prefix_tensor[0] for i in range(args.k)], return_prompt=False, max_new_tokens=1200-len(current_sol_prefix_tensor[0]), batch_size=6, do_sample=True, temperature=temp)

         proposed_solutions = [tokenizer.decode(torch.cat([current_sol_prefix_tensor[0], response_tensor], dim=-1), skip_special_tokens=True) for response_tensor in response_tensors]
+        for id, solution in enumerate(proposed_solutions):
+            cnt = count_repeated_ngrams(solution, tokenizer, 3)
+            if cnt >= 10:
+                proposed_solutions
+
         all_proposed_solutions.extend(proposed_solutions)

         current_step_indices = []
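Note: the repetition filter above calls count_repeated_ngrams from src.utils, which is not part of this diff, and the branch under "if cnt >= 10:" is cut off in the diff view. A minimal sketch of what such a counter could look like, assuming a tokenizer with an .encode() method and counting occurrences beyond the first of each token n-gram (the names and the exact counting rule here are illustrative assumptions, not the repo's implementation):

    from collections import Counter

    def count_repeated_ngrams_sketch(text, tokenizer, n):
        # Count n-gram occurrences beyond the first over the token ids of `text`.
        token_ids = tokenizer.encode(text)  # assumes a HF-style .encode()
        ngrams = [tuple(token_ids[i:i + n]) for i in range(len(token_ids) - n + 1)]
        return sum(c - 1 for c in Counter(ngrams).values() if c > 1)

    class _WhitespaceTok:
        # toy tokenizer so the sketch runs standalone
        def encode(self, text):
            return text.split()

    print(count_repeated_ngrams_sketch("a b c a b c a b c", _WhitespaceTok(), 3))  # 4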
@@ -143,21 +149,9 @@ def main(args):
         all_proposed_rewards.extend(rewards)

         # choosing the base next step based cumulative lookahead rewards
-        step_rewards = [reward[current_step:].mean() for reward in rewards]
+        step_rewards = [(reward[current_step:].mean() + reward[current_step]) if (len(reward) > current_step) else reward.mean() for reward in rewards]
         most_probable_base_next_step_tensor = current_step_response_tensors[step_rewards.index(max(step_rewards))]

-        # # force resampling if 1/2 the sampled lookaheads are suck
-        # is_rw_nan = 0
-        # for i, reward in enumerate(step_rewards):
-        #     next_tokens = tokenizer.decode(current_step_response_tensors[i])
-        #     if (next_tokens != tokenizer.eos_token) and :
-        #         is_rw_nan += 1
-
-        # if is_rw_nan >= 1:
-        #     print("We are resampling")
-        #     temp += 0.3
-        #     rp = 1.0
-        #     continue
         print("Most probable base step: ")
         print(tokenizer.decode(most_probable_base_next_step_tensor))
         print(step_rewards)
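The new step_rewards expression scores each rollout by the mean of its lookahead rewards plus the immediate reward at current_step, and falls back to the plain mean when the reward vector is too short to index at current_step. A toy illustration of that selection rule (the reward values below are made up):

    import torch

    def score_rollout(reward, current_step):
        # Mirrors the updated list comprehension: lookahead mean plus the
        # immediate step reward, with a fallback for short reward vectors.
        if len(reward) > current_step:
            return reward[current_step:].mean() + reward[current_step]
        return reward.mean()

    rewards = [torch.tensor([0.2, 0.9, 0.7]), torch.tensor([0.4])]  # made-up per-step rewards
    step_rewards = [score_rollout(r, current_step=1) for r in rewards]
    best = step_rewards.index(max(step_rewards))  # 0: the first rollout wins (0.8 + 0.9 vs 0.4)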
@@ -169,12 +163,17 @@ def main(args):
             step_rewards[id] = step_rewards[id_max]

         # pg update
-
-
-
+        try:
+            if not any([torch.isnan(reward).item() for reward in step_rewards]):
+                print([torch.isnan(reward) for reward in step_rewards])
+                print(any([torch.isnan(reward).item() for reward in step_rewards]))
+                stats = reinforce_trainer.step([current_sol_prefix_tensor[0] for i in range(args.k)], [response_tensors[i] for i in range(args.k)], rewards, logged=False)
+                print(stats)
+        except:
+            continue
         # pg evaluation
         with torch.no_grad():
-            search_response_tensors = reinforce_trainer.generate([current_sol_prefix_tensor[0] for i in range(args.k)], return_prompt=False, max_new_tokens=
+            search_response_tensors = reinforce_trainer.generate([current_sol_prefix_tensor[0] for i in range(args.k)], return_prompt=False, max_new_tokens=1200-len(current_sol_prefix_tensor[0]), batch_size=6, do_sample=True, temperature=temp)
         search_proposed_solutions = [tokenizer.decode(torch.cat([current_sol_prefix_tensor[0], response_tensor], dim=-1), skip_special_tokens=True) for response_tensor in search_response_tensors]
         all_proposed_solutions.extend(search_proposed_solutions)

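The pg update is now wrapped so that reinforce_trainer.step(...) only runs when none of the step rewards contain NaN, and any exception makes the loop continue instead of crashing the run. The guard itself reduces to a check like this (the reward values are made up):

    import torch

    step_rewards = [torch.tensor(0.8), torch.tensor(float("nan"))]  # made-up values
    if not any(torch.isnan(r).item() for r in step_rewards):
        pass  # safe to run the policy-gradient update here
    else:
        print("NaN reward detected, skipping the update")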
@@ -200,8 +199,9 @@ def main(args):
         all_proposed_rewards.extend(search_rewards)

         # choosing the pg search next step based cumulative lookahead rewards
-        search_step_rewards = [reward[current_step:].mean() for reward in search_rewards]
+        search_step_rewards = [(reward[current_step:].mean() + reward[current_step]) if (len(reward) > current_step) else reward.mean() for reward in search_rewards]
         most_probable_search_next_step_tensor = current_search_step_response_tensors[search_step_rewards.index(max(search_step_rewards))]
+
         print("Most probable search step: ")
         print(tokenizer.decode(most_probable_search_next_step_tensor))
         print(search_step_rewards)
@@ -242,7 +242,7 @@ def main(args):
         solution = all_proposed_solutions[all_proposed_rewards.index(max(all_proposed_rewards))]

         _, prediction = math_evaluation(solution, example["answer"])
-        is_passed, main_prediction = math_evaluation(
+        is_passed, main_prediction = math_evaluation(solution, example["answer"])
         accuracy += is_passed

         print("Final solution: ", solution)