Training in progress, step 4
Browse files- logs/amlt_code_runner.txt +21 -21
- model-00001-of-00003.safetensors +1 -1
- model-00002-of-00003.safetensors +1 -1
- model-00003-of-00003.safetensors +1 -1
- training_args.bin +1 -1
logs/amlt_code_runner.txt
CHANGED
|
@@ -1,22 +1,22 @@
|
|
| 1 |
-
2025-
|
| 2 |
-
2025-
|
| 3 |
-
2025-
|
| 4 |
-
2025-
|
| 5 |
-
2025-
|
| 6 |
-
2025-
|
| 7 |
-
2025-
|
| 8 |
-
2025-
|
| 9 |
-
2025-
|
| 10 |
-
2025-
|
| 11 |
-
2025-
|
| 12 |
-
2025-
|
| 13 |
-
2025-
|
| 14 |
-
2025-
|
| 15 |
-
2025-
|
| 16 |
|
| 17 |
-
2025-
|
| 18 |
-
2025-
|
| 19 |
-
2025-
|
| 20 |
-
2025-
|
| 21 |
-
2025-
|
| 22 |
-
2025-
|
|
|
|
| 1 |
+
2025-03-01 05:51:32,812:amlt-code-runner:INFO - SINGULARITY_LOCATION: westus2
|
| 2 |
+
2025-03-01 05:51:32,812:amlt-code-runner:INFO - AISC_INSTANCE_TYPE: Singularity.NC96ad_A100_v4
|
| 3 |
+
2025-03-01 05:51:34,063:amlt-code-runner:INFO - Not removing AzureML's cd commands from /etc/profile due to an error: [Errno 13] Permission denied: '/etc/profile'
|
| 4 |
+
2025-03-01 05:51:34,063:amlt-code-runner:WARNING - Environment variable 'NCCL_SOCKET_IFNAME' already set to 'eth0', not changing to '^docker0,lo'
|
| 5 |
+
2025-03-01 05:51:34,063:amlt-code-runner:INFO - RANK = 0
|
| 6 |
+
2025-03-01 05:51:34,063:amlt-code-runner:INFO - LOCAL_RANK = None
|
| 7 |
+
2025-03-01 05:51:34,064:amlt-code-runner:INFO - WORLD_SIZE = 1
|
| 8 |
+
2025-03-01 05:51:34,064:amlt-code-runner:INFO - MASTER_ADDR = node-0
|
| 9 |
+
2025-03-01 05:51:34,064:amlt-code-runner:INFO - MASTER_PORT = 9500
|
| 10 |
+
2025-03-01 05:51:34,065:amlt-code-runner:WARNING - Installing amlt runtime dependencies: ['wrapt', 'azure-identity', 'python-dateutil', 'pytz'] into /tmp/amlt-user-base
|
| 11 |
+
2025-03-01 05:51:35,296:amlt-code-runner:INFO - Setting WANDB_RUN_ID to 'kind_onion_8sfvnlnwfk_9'
|
| 12 |
+
2025-03-01 05:51:35,296:amlt-code-runner:INFO - Expanding HyperDrive arguments into /tmp/amlt_run_hd.sh
|
| 13 |
+
2025-03-01 05:51:35,592:amlt-code-runner:INFO - Parsing tracking uri /mlflow/v1.0/subscriptions/2aac527a-de5a-4fe3-95e9-5c8b9d48ed62/resourceGroups/cyrilzhang/providers/Microsoft.MachineLearningServices/workspaces/cyrilzhangws
|
| 14 |
+
2025-03-01 05:51:35,592:amlt-code-runner:INFO - Tracking uri /mlflow/v1.0/subscriptions/2aac527a-de5a-4fe3-95e9-5c8b9d48ed62/resourceGroups/cyrilzhang/providers/Microsoft.MachineLearningServices/workspaces/cyrilzhangws has sub id 2aac527a-de5a-4fe3-95e9-5c8b9d48ed62, resource group cyrilzhang, and workspace cyrilzhangws
|
| 15 |
+
2025-03-01 05:51:35,592:aml_token_auth:WARNING - The AzureMLTokenAuthentication created will not be updated due to missing params. The token expires on 2025-03-20 19:29:14.
|
| 16 |
|
| 17 |
+
2025-03-01 05:51:35,594:urllib3.connectionpool:DEBUG - Starting new HTTPS connection (1): eastus.api.azureml.ms:443
|
| 18 |
+
2025-03-01 05:51:35,959:urllib3.connectionpool:DEBUG - https://eastus.api.azureml.ms:443 "POST /mlflow/v1.0/subscriptions/2aac527a-de5a-4fe3-95e9-5c8b9d48ed62/resourceGroups/cyrilzhang/providers/Microsoft.MachineLearningServices/workspaces/cyrilzhangws/api/2.0/mlflow/runs/set-tag HTTP/11" 403 0
|
| 19 |
+
2025-03-01 05:51:35,960:amlt-code-runner:WARNING - Failed to rename job according to the amulet job name template. Run 'amlt list' client side to set the display name according to the amulet job template name. The error we encountered was: Failed to update display name:
|
| 20 |
+
2025-03-01 05:51:35,982:amlt-code-runner:INFO - Executing ./amlt_setup.sh, /tmp/amlt_run_hd.sh
|
| 21 |
+
2025-03-01 05:51:36,053:background_dirsync:INFO - Starting directory syncer from '/scratch/amlt_code/outputs' to '/mnt/output/projects/lmpref/amlt-results/kind_onion_8sfvnlnwfk_9', every 30.000000s
|
| 22 |
+
2025-03-01 05:51:36,056:background_dirsync:INFO - Starting directory syncer from '/scratch/azureml/cr/j/b2202e217c194ab682d6c0de1367ef62/exe/wd/logs' to '/scratch/amlt_code/outputs/logs', every 30.000000s
|
model-00001-of-00003.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4943162336
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d46fa09e544111a970d75a4b61d649ed92c4fbdd2021b19394a28b3a064446da
|
| 3 |
size 4943162336
|
model-00002-of-00003.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999819336
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9b8f8018f208f2bbb5f2fd725561caf208dacabd15febd219bfae8af6f949d0
|
| 3 |
size 4999819336
|
model-00003-of-00003.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4540516344
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:309866788c8b07654fa0c6316f2a63ac75ca4ce6ecefc11776d8def4d66a6b52
|
| 3 |
size 4540516344
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 7736
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79c197abab6cbb7c63234c86334d214f471bce9fcc36bdff48efe33d229a737a
|
| 3 |
size 7736
|