Update: model

Browse files

Files changed (8) hide show

config.json +1 -1
{global_step23940 → global_step165430}/mp_rank_00_model_states.pt +2 -2
{global_step23940 → global_step165430}/zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
{global_step23940 → global_step165430}/zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
{global_step23940 → global_step165430}/zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
{global_step23940 → global_step165430}/zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
latest +1 -1
zero_to_fp32.py +54 -37

config.json CHANGED Viewed

@@ -24,7 +24,7 @@
   "position_buckets": 256,
   "relative_attention": true,
   "share_att_key": true,
-  "transformers_version": "4.37.2",
   "type_vocab_size": 0,
   "vocab_size": 64100
 }

   "position_buckets": 256,
   "relative_attention": true,
   "share_att_key": true,
+  "transformers_version": "4.38.2",
   "type_vocab_size": 0,
   "vocab_size": 64100
 }

{global_step23940 → global_step165430}/mp_rank_00_model_states.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ad98480a5aab0db71e19ce24c4d3be4333da5cb1955de706eb15ed4018d8113
-size 1077570732

 version https://git-lfs.github.com/spec/v1
+oid sha256:5b04e95e0d0b4bb47fea41cd9de0af4ddfe4a9e3350ec64d4c1f8b77de9f9541
+size 1077570796

{global_step23940 → global_step165430}/zero_pp_rank_0_mp_rank_00_optim_states.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0139708571b55035fa4fe9a3f7be61ebaed589a1eb65bf985b3125b2e357d6ed
 size 808085192

 version https://git-lfs.github.com/spec/v1
+oid sha256:4fe7887eab40a10ededf92b4403561338b9b7d9e0269623913f2dcbada17cc00
 size 808085192

{global_step23940 → global_step165430}/zero_pp_rank_1_mp_rank_00_optim_states.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7ce8e14c3ef21ed302708f1f5b59b4f9332a2b92de0e8a06387fe6d7a7ef4ca
 size 808095752

 version https://git-lfs.github.com/spec/v1
+oid sha256:9046b582e4329869f31dc4562371217f8ee62dd267816ac7ba57990bb808be1e
 size 808095752

{global_step23940 → global_step165430}/zero_pp_rank_2_mp_rank_00_optim_states.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb94c45e8587eb0fc9d0e120933c4f24d7efe1d058c6cee66969aadcd02266d8
 size 808085064

 version https://git-lfs.github.com/spec/v1
+oid sha256:091d4c4fd84894549e6e66fadb7699e8c700d6e532ebca6feba502eda8d5d047
 size 808085064

{global_step23940 → global_step165430}/zero_pp_rank_3_mp_rank_00_optim_states.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4cf626224645155eb8a8e53724c0d07abeb725703096326e9b357ac1518d8080
 size 808095496

 version https://git-lfs.github.com/spec/v1
+oid sha256:1470545a59094d545cbc35cc96811ce3765d7a602fc6f93739adf6295b514415
 size 808095496

latest CHANGED Viewed

@@ -1 +1 @@
-global_step24282

+global_step165430

zero_to_fp32.py CHANGED Viewed

@@ -24,9 +24,18 @@ from dataclasses import dataclass
 # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
 # DeepSpeed data structures it has to be available in the current python environment.
 from deepspeed.utils import logger
-from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
-                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
-                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
 @dataclass
@@ -42,7 +51,7 @@ class zero_model_state:
 debug = 0
 # load to cpu
-device = torch.device('cpu')
 def atoi(text):
@@ -50,12 +59,12 @@ def atoi(text):
 def natural_keys(text):
-    '''
     alist.sort(key=natural_keys) sorts in human order
     http://nedbatchelder.com/blog/200712/human_sorting.html
     (See Toothy's implementation in the comments)
-    '''
-    return [atoi(c) for c in re.split(r'(\d+)', text)]
 def get_model_state_file(checkpoint_dir, zero_stage):
@@ -127,12 +136,14 @@ def parse_model_states(files):
         frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
-        z_model_state = zero_model_state(buffers=buffers,
-                                         param_shapes=param_shapes,
-                                         shared_params=shared_params,
-                                         ds_version=ds_version,
-                                         frozen_param_shapes=frozen_param_shapes,
-                                         frozen_param_fragments=frozen_param_fragments)
         zero_model_states.append(z_model_state)
     return zero_model_states
@@ -208,7 +219,7 @@ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
     model_files = get_model_state_files(ds_checkpoint_dir)
     zero_model_states = parse_model_states(model_files)
-    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
     if zero_stage <= 2:
         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
@@ -225,13 +236,13 @@ def _zero2_merge_frozen_params(state_dict, zero_model_states):
     if debug:
         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
-        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
         wanted_params = len(frozen_param_shapes)
         wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
         avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
-        print(f'Frozen params: Have {avail_numel} numels to process.')
-        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
     total_params = 0
     total_numel = 0
@@ -273,7 +284,8 @@ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
         full_single_fp32_vector = torch.cat(merged_partitions, 0)
         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
     avail_numel = sum(
-        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
     if debug:
         wanted_params = sum([len(shapes) for shapes in param_shapes])
@@ -292,7 +304,7 @@ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
         avail_numel = full_single_fp32_vector.numel()
         for name, shape in shapes.items():
-            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
             total_numel += unpartitioned_numel
             total_params += 1
@@ -361,14 +373,14 @@ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
     if debug:
         for i in range(world_size):
             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
-            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
         frozen_param_shapes = zero_model_states[0].frozen_param_shapes
         wanted_params = len(frozen_param_shapes)
         wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
         avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
-        print(f'Frozen params: Have {avail_numel} numels to process.')
-        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
     total_params = 0
     total_numel = 0
@@ -430,9 +442,11 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
             )
         # XXX: memory usage doubles here
-        state_dict[name] = torch.cat(
-            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
-            0).narrow(0, 0, unpartitioned_numel).view(shape)
         offset += partitioned_numel
     offset *= world_size
@@ -499,9 +513,9 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
     """
     if tag is None:
-        latest_path = os.path.join(checkpoint_dir, 'latest')
         if os.path.isfile(latest_path):
-            with open(latest_path, 'r') as fd:
                 tag = fd.read().strip()
         else:
             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
@@ -572,19 +586,22 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoint_dir",
-                        type=str,
-                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
     parser.add_argument(
         "output_file",
         type=str,
-        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
-    parser.add_argument("-t",
-                        "--tag",
-                        type=str,
-                        default=None,
-                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
-    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
     args = parser.parse_args()
     debug = args.debug

 # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
 # DeepSpeed data structures it has to be available in the current python environment.
 from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (
+    DS_VERSION,
+    OPTIMIZER_STATE_DICT,
+    SINGLE_PARTITION_OF_FP32_GROUPS,
+    FP32_FLAT_GROUPS,
+    ZERO_STAGE,
+    PARTITION_COUNT,
+    PARAM_SHAPES,
+    BUFFER_NAMES,
+    FROZEN_PARAM_SHAPES,
+    FROZEN_PARAM_FRAGMENTS,
+)
 @dataclass
 debug = 0
 # load to cpu
+device = torch.device("cpu")
 def atoi(text):
 def natural_keys(text):
+    """
     alist.sort(key=natural_keys) sorts in human order
     http://nedbatchelder.com/blog/200712/human_sorting.html
     (See Toothy's implementation in the comments)
+    """
+    return [atoi(c) for c in re.split(r"(\d+)", text)]
 def get_model_state_file(checkpoint_dir, zero_stage):
         frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(
+            buffers=buffers,
+            param_shapes=param_shapes,
+            shared_params=shared_params,
+            ds_version=ds_version,
+            frozen_param_shapes=frozen_param_shapes,
+            frozen_param_fragments=frozen_param_fragments,
+        )
         zero_model_states.append(z_model_state)
     return zero_model_states
     model_files = get_model_state_files(ds_checkpoint_dir)
     zero_model_states = parse_model_states(model_files)
+    print(f"Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}")
     if zero_stage <= 2:
         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
     if debug:
         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f"rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}")
         wanted_params = len(frozen_param_shapes)
         wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
         avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f"Frozen params: Have {avail_numel} numels to process.")
+        print(f"Frozen params: Need {wanted_numel} numels in {wanted_params} params")
     total_params = 0
     total_numel = 0
         full_single_fp32_vector = torch.cat(merged_partitions, 0)
         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
     avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]
+    )
     if debug:
         wanted_params = sum([len(shapes) for shapes in param_shapes])
         avail_numel = full_single_fp32_vector.numel()
         for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, "numel") else math.prod(shape)
             total_numel += unpartitioned_numel
             total_params += 1
     if debug:
         for i in range(world_size):
             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f"rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}")
         frozen_param_shapes = zero_model_states[0].frozen_param_shapes
         wanted_params = len(frozen_param_shapes)
         wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
         avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f"Frozen params: Have {avail_numel} numels to process.")
+        print(f"Frozen params: Need {wanted_numel} numels in {wanted_params} params")
     total_params = 0
     total_numel = 0
             )
         # XXX: memory usage doubles here
+        state_dict[name] = (
+            torch.cat(tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), 0)
+            .narrow(0, 0, unpartitioned_numel)
+            .view(shape)
+        )
         offset += partitioned_numel
     offset *= world_size
     """
     if tag is None:
+        latest_path = os.path.join(checkpoint_dir, "latest")
         if os.path.isfile(latest_path):
+            with open(latest_path, "r") as fd:
                 tag = fd.read().strip()
         else:
             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/checkpoint-12"
+    )
     parser.add_argument(
         "output_file",
         type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
+    )
+    parser.add_argument(
+        "-t",
+        "--tag",
+        type=str,
+        default=None,
+        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
+    )
+    parser.add_argument("-d", "--debug", action="store_true", help="enable debug")
     args = parser.parse_args()
     debug = args.debug