mirror of https://github.com/hpcaitech/ColossalAI
879 lines
83 KiB
Plaintext
879 lines
83 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/lcsjy/.conda/envs/autoparallel/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
" from .autonotebook import tqdm as notebook_tqdm\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[11/10/22 18:04:14] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m[11/10/22 18:04:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m1\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m1\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m2\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m2\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m3\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m3\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m4\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m4\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m5\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m5\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m6\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m6\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">7</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m7\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">7</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m7\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8</span> to store for rank: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n",
|
|
"\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m8\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - torch.distributed.distributed_c10d - INFO: Rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: Completed store-based \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> barrier for key:store_based_barrier_key:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8</span> with <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> nodes. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n",
|
|
"\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m8\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/context/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">parallel_context.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">521</span> set_device \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m521\u001b[0m set_device \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: process rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> is bound to device <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: process rank \u001b[1;36m0\u001b[0m is bound to device \u001b[1;36m0\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/context/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">parallel_context.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">557</span> set_seed \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m557\u001b[0m set_seed \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: initialized seed on rank <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, numpy: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>, python \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> random: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>, ParallelMode.DATA: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>, ParallelMode.TENSOR: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>,the default parallel \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> seed is ParallelMode.DATA. \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: initialized seed on rank \u001b[1;36m0\u001b[0m, numpy: \u001b[1;36m1024\u001b[0m, python \n",
|
|
"\u001b[2;36m \u001b[0m random: \u001b[1;36m1024\u001b[0m, ParallelMode.DATA: \u001b[1;36m1024\u001b[0m, ParallelMode.TENSOR: \u001b[1;36m1024\u001b[0m,the default parallel \n",
|
|
"\u001b[2;36m \u001b[0m seed is ParallelMode.DATA. \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">initialize.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">117</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> launch \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \u001b[35m/home/lcsjy/ColossalAI/colossalai/\u001b[0m\u001b[95minitialize.py\u001b[0m:\u001b[1;36m117\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m launch \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> size: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>, pipeline parallel size: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>, tensor parallel size: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n",
|
|
"\u001b[2;36m \u001b[0m size: \u001b[1;36m1\u001b[0m, pipeline parallel size: \u001b[1;36m1\u001b[0m, tensor parallel size: \u001b[1;36m1\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"import time\n",
|
|
"import torchvision.models as tm\n",
|
|
"import torch\n",
|
|
"import colossalai\n",
|
|
"from colossalai.fx import symbolic_trace, metainfo_trace\n",
|
|
"from colossalai.auto_parallel.checkpoint import CheckpointSolverRotor\n",
|
|
"from functools import partial\n",
|
|
"from colossalai.utils import free_port\n",
|
|
"\n",
|
|
"from bench_utils import bench, bench_rotor\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"colossalai.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### ResNet152 with batch size = 512 fails without activation checkpointing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(78990.4404296875, inf)"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def data_gen(batch_size, shape, device='cuda'):\n",
|
|
" data = torch.empty(batch_size, *shape, device=device)\n",
|
|
" label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n",
|
|
" return {'x': data}, label\n",
|
|
"\n",
|
|
"model = tm.resnet152()\n",
|
|
"gm = symbolic_trace(model)\n",
|
|
"gm = metainfo_trace(gm, torch.empty(512, 3, 224, 224, device='meta'))\n",
|
|
"bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=512, shape=(3, 224, 224)), num_steps=5)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### ResNet152 with batch size = 2048 succeeds with the rotor checkpoint solver"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(74495.8486328125, 5634.262561798096)"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def data_gen(batch_size, shape, device='cuda'):\n",
|
|
" data = torch.empty(batch_size, *shape, device=device)\n",
|
|
" label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n",
|
|
" return {'x': data}, label\n",
|
|
"\n",
|
|
"model = tm.resnet152()\n",
|
|
"gm = symbolic_trace(model)\n",
|
|
"gm = metainfo_trace(gm, torch.empty(2048, 3, 224, 224, device='meta'))\n",
|
|
"solver = CheckpointSolverRotor(gm.graph, free_memory=torch.cuda.mem_get_info(device=0)[0] * 0.95)\n",
|
|
"gm.graph = solver.solve()\n",
|
|
"bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=2048, shape=(3, 224, 224)), num_steps=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Benchmarking on ResNet18"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[11/10/22 18:04:20] </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m[11/10/22 18:04:20]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[11/10/22 18:04:21] </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m[11/10/22 18:04:21]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[11/10/22 18:04:22] </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m[11/10/22 18:04:22]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[11/10/22 18:04:23] </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ckpt_solver_rotor.py</span>:<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82</span> \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> solve \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m[11/10/22 18:04:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n",
|
|
"\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n",
|
|
"\u001b[2;36m \u001b[0m solve \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #800000; text-decoration-color: #800000\">WARNING </span> colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> chain from index <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> to <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span> with memory <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">500</span> \n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n",
|
|
"\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"def data_gen(batch_size, shape, device='cuda'):\n",
|
|
" data = torch.empty(batch_size, *shape, device=device)\n",
|
|
" label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n",
|
|
" return (data, ), label\n",
|
|
"\n",
|
|
"model = tm.resnet18()\n",
|
|
"gm = symbolic_trace(model)\n",
|
|
"gm = metainfo_trace(gm, torch.empty(128, 3, 224, 224, device='meta'))\n",
|
|
"peak_hist, step_hist = bench_rotor(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=128, shape=(3, 224, 224)), num_steps=5, sample_points=20, free_memory=2700 * 1024**2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[<matplotlib.lines.Line2D at 0x7f8f259d0af0>]"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 800x800 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"plt.figure(figsize=(8, 8))\n",
|
|
"plt.plot(peak_hist, step_hist)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[540.0,\n",
|
|
" 653.6842105263158,\n",
|
|
" 767.3684210526316,\n",
|
|
" 881.0526315789474,\n",
|
|
" 994.7368421052631,\n",
|
|
" 1108.421052631579,\n",
|
|
" 1222.1052631578948,\n",
|
|
" 1335.7894736842104,\n",
|
|
" 1449.4736842105262,\n",
|
|
" 1563.157894736842,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625,\n",
|
|
" 26711.86572265625]"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"peak_hist"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3.10.6 ('autoparallel': conda)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
},
|
|
"orig_nbformat": 4,
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "cc0ad6865167fb9a52c12f0fd0c8203c9a7690797bfee612a871d56b9d2024ce"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|