From 238da6d4c9256e11553fd89470ad0b9f30740c22 Mon Sep 17 00:00:00 2001 From: shihlCST <1665105642@qq.com> Date: Thu, 6 Jun 2024 14:48:54 +0800 Subject: [PATCH] add_statistic_category --- .../troubleshooter/migrator/dump/alexnet.py | 2 +- .../migrator/dump/demo/test_ms_dump.py | 82 +++++------ .../migrator/dump/demo/test_torch_dump.py | 62 ++++---- .../migrator/dump/torch_swin_tiny.py | 2 +- .../st/troubleshooter/migrator/dump/utils.py | 33 ++++- .../migrator/test_api_dump_communication.py | 6 +- .../migrator/test_bfloat16_ms.py | 6 +- .../troubleshooter/migrator/test_l2norm_ms.py | 2 +- tests/st/troubleshooter/migrator/test_mint.py | 6 +- troubleshooter/docs/api/migrator/api_dump.md | 24 ++-- .../migrator/api_dump/ad_dump/hooks.py | 2 +- .../migrator/api_dump/api_dump_compare.py | 11 +- .../migrator/api_dump/ms_dump/hooks.py | 134 +++++++++++------- .../migrator/api_dump/pt_dump/dump/dump.py | 118 +++++++++------ .../migrator/api_dump/pt_dump/dump/utils.py | 10 +- .../migrator/api_dump/universal_interface.py | 12 +- 16 files changed, 304 insertions(+), 208 deletions(-) diff --git a/tests/st/troubleshooter/migrator/dump/alexnet.py b/tests/st/troubleshooter/migrator/dump/alexnet.py index 7a9a010..70d7e0a 100644 --- a/tests/st/troubleshooter/migrator/dump/alexnet.py +++ b/tests/st/troubleshooter/migrator/dump/alexnet.py @@ -105,7 +105,7 @@ class AlexNet(nn.Cell): if __name__ == "__main__": net = AlexNet() api_dump_init(net) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) grad_net = ms.grad(net, None, net.trainable_params()) output = grad_net(ms.Tensor(np.random.random([1, 227, 227, 3]).astype(np.float32))) print(output) diff --git a/tests/st/troubleshooter/migrator/dump/demo/test_ms_dump.py b/tests/st/troubleshooter/migrator/dump/demo/test_ms_dump.py index a1065c1..a0dae18 100644 --- a/tests/st/troubleshooter/migrator/dump/demo/test_ms_dump.py +++ b/tests/st/troubleshooter/migrator/dump/demo/test_ms_dump.py @@ -8,7 +8,7 @@ import mindspore as ms import numpy as np import pytest from mindspore import Tensor, nn, ops -from tests.st.troubleshooter.migrator.dump.utils import get_pkl_npy_stack_list, get_md5_list +from tests.st.troubleshooter.migrator.dump.utils import get_csv_npy_stack_list, get_md5_list from troubleshooter.migrator import api_dump_init, api_dump_start, api_dump_stop @@ -58,7 +58,7 @@ class BaseTrainOneStep: grad_fn = ms.value_and_grad( forward_fn, None, self.optimizer.parameters) for s in range(self.step): - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) loss, grads = grad_fn(self.data, self.label) self.optimizer(grads) print("step:", s, " loss:", loss) @@ -68,7 +68,7 @@ def train_ms_one_step_all(data_path, dump_path, info_path=None, retain_backward= **api_dump_start_args): class Net(BaseNet): def construct(self, x): - api_dump_start(**api_dump_start_args) + api_dump_start(**api_dump_start_args,statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = ops.clip(x, Tensor(0.2, ms.float32), Tensor(0.5, ms.float32)) x = self.bn(x) @@ -92,10 +92,10 @@ def test_api_dump_ms_all(): dump_path = Path(tempfile.mkdtemp(prefix="ms_all")) try: train_ms_one_step_all(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 21 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 21 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 7 finally: shutil.rmtree(data_path) @@ -123,7 +123,7 @@ def train_ms_one_step_all_overflow(data_path, dump_path, info_path=None, retain_ **api_dump_start_args): class Net(BaseNet): def construct(self, x): - api_dump_start(overflow_check=True, **api_dump_start_args) + api_dump_start(overflow_check=True, **api_dump_start_args,statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = ops.clip(x, Tensor(0.2, ms.float32), Tensor(0.5, ms.float32)) x = self.bn(x) @@ -147,10 +147,10 @@ def test_api_dump_ms_all_overflow(): dump_path = Path(tempfile.mkdtemp(prefix="ms_all")) try: train_ms_one_step_all_overflow(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 6 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 6 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 2 finally: shutil.rmtree(data_path) @@ -167,13 +167,13 @@ def test_api_dump_ms_all_with_scalar(): dump_path = Path(tempfile.mkdtemp(prefix="ms_all_with_scalar")) try: train_ms_one_step_all(data_path, dump_path, filter_data=False) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 25 - assert 'Functional_clip_0_forward_input.1' in pkl_list - assert 'Functional_clip_0_forward_input.2' in pkl_list - assert 'Tensor_reshape_0_forward_input.1' in pkl_list - assert 'Tensor_reshape_0_forward_input.2' in pkl_list + assert len(csv_list) == 25 + assert 'Functional_clip_0_forward_input.1' in csv_list + assert 'Functional_clip_0_forward_input.2' in csv_list + assert 'Tensor_reshape_0_forward_input.1' in csv_list + assert 'Tensor_reshape_0_forward_input.2' in csv_list finally: shutil.rmtree(data_path) shutil.rmtree(dump_path) @@ -189,7 +189,7 @@ def test_api_dump_ms_all_with_full_stack(): dump_path = Path(tempfile.mkdtemp(prefix="ms_all_with_full_stack")) try: train_ms_one_step_all(data_path, dump_path, filter_stack=False) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') assert len(stack_list) == 7 finally: @@ -202,14 +202,14 @@ def train_ms_one_step_part(data_path, dump_path, info_path=None, retain_backwad= def construct(self, x): api_dump_stop() x = self.conv(x) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = ops.clip(x, Tensor(0.2, ms.float32), Tensor(0.5, ms.float32)) api_dump_stop() x = self.bn(x) x = self.relu(x) x = x.reshape(1, -1) x = self.linear(x) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.relu(x) api_dump_stop() return x @@ -228,10 +228,10 @@ def test_api_dump_ms_part(): dump_path = Path(tempfile.mkdtemp(prefix="ms_part")) try: train_ms_one_step_part(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 6 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 6 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 2 finally: shutil.rmtree(data_path) @@ -241,7 +241,7 @@ def test_api_dump_ms_part(): def train_ms_one_step_api_list(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def construct(self, x): - api_dump_start(mode='api_list', scope=['relu', 'conv2d']) + api_dump_start(mode = 'api_list', scope = ['relu', 'conv2d'], statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = ops.clip(x, Tensor(0.2, ms.float32), Tensor(0.5, ms.float32)) x = self.bn(x) @@ -265,10 +265,10 @@ def test_api_dump_ms_api_list(): dump_path = Path(tempfile.mkdtemp(prefix="ms_part")) try: train_ms_one_step_api_list(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 9 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 9 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 3 finally: shutil.rmtree(data_path) @@ -278,8 +278,8 @@ def test_api_dump_ms_api_list(): def train_ms_one_step_list(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def construct(self, x): - api_dump_start(mode='list', scope=[ - 'NN_BatchNorm2d_0', 'NN_ReLU_0']) + api_dump_start(mode = 'list', scope = [ + 'NN_BatchNorm2d_0', 'NN_ReLU_0'],statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = ops.clip(x, 0.2, 0.5) x = self.bn(x) @@ -303,10 +303,10 @@ def test_api_dump_ms_list(): dump_path = Path(tempfile.mkdtemp(prefix="ms_list")) try: train_ms_one_step_list(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 6 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 6 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 2 finally: shutil.rmtree(data_path) @@ -316,8 +316,8 @@ def test_api_dump_ms_list(): def train_ms_one_step_range(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def construct(self, x): - api_dump_start(mode='range', scope=[ - 'Functional_clip_0', 'Tensor_reshape_0']) + api_dump_start(mode = 'range', scope = [ + 'Functional_clip_0', 'Tensor_reshape_0'],statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = ops.clip(x, 0.2, 0.5) x = self.bn(x) @@ -341,10 +341,10 @@ def test_api_dump_ms_range(): dump_path = Path(tempfile.mkdtemp(prefix="ms_range")) try: train_ms_one_step_range(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 12 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 12 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 4 finally: shutil.rmtree(data_path) @@ -360,7 +360,7 @@ def test_api_dump_ms_with_not_float_output(): x = Tensor(np.random.randn(8, 5).astype(np.float32)) dump_path = Path(tempfile.mkdtemp(prefix="with_not_float_output")) ts.migrator.api_dump_init(ms.nn.Cell(), dump_path, retain_backward=True) - ts.migrator.api_dump_start() + ts.migrator.api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) out = x.max(axis=1, return_indices=True) ts.migrator.api_dump_stop() shutil.rmtree(dump_path) @@ -371,7 +371,7 @@ def train_ms_one_step_jit(data_path, dump_path, info_path=None, retain_backward= **api_dump_start_args): class Net(BaseNet): def construct(self, x): - api_dump_start(**api_dump_start_args) + api_dump_start(**api_dump_start_args , statistic_category=['max','min','avg','md5','l2norm']) x = self.conv(x) x = ops.clip(x, Tensor(0.2, ms.float32), Tensor(0.5, ms.float32)) x = self.bn(x) @@ -405,10 +405,10 @@ def test_api_dump_ms_jit(): dump_path = Path(tempfile.mkdtemp(prefix="ms_jit")) try: train_ms_one_step_jit(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') - assert len(pkl_list) == 21 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 21 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 7 finally: shutil.rmtree(data_path) diff --git a/tests/st/troubleshooter/migrator/dump/demo/test_torch_dump.py b/tests/st/troubleshooter/migrator/dump/demo/test_torch_dump.py index 501a9fe..d24da70 100644 --- a/tests/st/troubleshooter/migrator/dump/demo/test_torch_dump.py +++ b/tests/st/troubleshooter/migrator/dump/demo/test_torch_dump.py @@ -10,7 +10,7 @@ import torch.optim as optim import troubleshooter as ts from troubleshooter.migrator import api_dump_init, api_dump_start, api_dump_stop -from tests.st.troubleshooter.migrator.dump.utils import get_pkl_npy_stack_list +from tests.st.troubleshooter.migrator.dump.utils import get_csv_npy_stack_list ts.fix_random() @@ -40,7 +40,7 @@ class BaseTrainOneStep: def __call__(self): for s in range(self.step): - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) out = self.net(self.data) loss = self.criterion(out, self.label) loss.backward() @@ -62,7 +62,7 @@ def train_pt_one_step_all(data_path, dump_path, info_path=None, retain_backward= **api_dump_start_args): class Net(BaseNet): def forward(self, x): - api_dump_start(**api_dump_start_args) + api_dump_start(**api_dump_start_args, statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = torch.clip(x, 0.2, 0.5) x = self.bn(x) @@ -86,10 +86,10 @@ def test_api_dump_torch_all(): dump_path = Path(tempfile.mkdtemp(prefix="torch_all")) try: train_pt_one_step_all(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 25 # 21 apis + 4 layers - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 25 # 21 apis + 4 layers + assert set(csv_list) == set(npy_list) assert len(stack_list) == 11 # 7 apis + 4 layers finally: shutil.rmtree(data_path) @@ -106,13 +106,13 @@ def test_api_dump_pt_all_with_scalar(): dump_path = Path(tempfile.mkdtemp(prefix="pt_all_with_scalar")) try: train_pt_one_step_all(data_path, dump_path, filter_data=False) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 29 # 25 apis + 4 layers - assert 'Torch_clip_0_forward_input.1' in pkl_list - assert 'Torch_clip_0_forward_input.2' in pkl_list - assert 'Tensor_reshape_0_forward_input.1' in pkl_list - assert 'Tensor_reshape_0_forward_input.2' in pkl_list + assert len(csv_list) == 29 # 25 apis + 4 layers + assert 'Torch_clip_0_forward_input.1' in csv_list + assert 'Torch_clip_0_forward_input.2' in csv_list + assert 'Tensor_reshape_0_forward_input.1' in csv_list + assert 'Tensor_reshape_0_forward_input.2' in csv_list finally: shutil.rmtree(data_path) shutil.rmtree(dump_path) @@ -128,7 +128,7 @@ def test_api_dump_pt_all_with_full_stack(): dump_path = Path(tempfile.mkdtemp(prefix="pt_all_with_full_stack")) try: train_pt_one_step_all(data_path, dump_path, filter_stack=False) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') assert len(stack_list) == 11 # 7 apis + 4 layers finally: @@ -141,14 +141,14 @@ def train_pt_one_step_part(data_path, dump_path, info_path=None, retain_backwad= def forward(self, x): api_dump_stop() x = self.conv(x) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = torch.clip(x, 0.2, 0.5) api_dump_stop() x = self.bn(x) x = self.relu(x) x = x.reshape(1, -1) x = self.linear(x) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.relu(x) api_dump_stop() return x @@ -167,10 +167,10 @@ def test_api_dump_torch_part(): dump_path = Path(tempfile.mkdtemp(prefix="torch_part")) try: train_pt_one_step_part(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 7 # 6 apis + 1 layer - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 7 # 6 apis + 1 layer + assert set(csv_list) == set(npy_list) assert len(stack_list) == 3 # 2 apis + 1 layer finally: shutil.rmtree(data_path) @@ -180,7 +180,7 @@ def test_api_dump_torch_part(): def train_pt_one_step_api_list(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def forward(self, x): - api_dump_start(mode='api_list', scope=['relu', 'conv2d']) + api_dump_start(mode = 'api_list', scope = ['relu', 'conv2d'], statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = torch.clip(x, 0.2, 0.5) x = self.bn(x) @@ -204,11 +204,11 @@ def test_api_dump_torch_api_list(): dump_path = Path(tempfile.mkdtemp(prefix="torch_api_list")) try: train_pt_one_step_api_list(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 10 # 9 apis + 1 layer - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 10 # 9 apis + 1 layer + assert set(csv_list) == set(npy_list) assert len(stack_list) == 4 # 3 apis + 1 layer finally: shutil.rmtree(data_path) @@ -218,8 +218,8 @@ def test_api_dump_torch_api_list(): def train_pt_one_step_list(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def forward(self, x): - api_dump_start(mode='list', scope=[ - 'NN_BatchNorm2d_0', 'NN_ReLU_0']) + api_dump_start(mode = 'list', scope = [ + 'NN_BatchNorm2d_0', 'NN_ReLU_0'], statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = torch.clip(x, 0.2, 0.5) x = self.bn(x) @@ -243,10 +243,10 @@ def test_api_dump_torch_list(): dump_path = Path(tempfile.mkdtemp(prefix="torch_list")) try: train_pt_one_step_list(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 6 - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 6 + assert set(csv_list) == set(npy_list) assert len(stack_list) == 2 finally: shutil.rmtree(data_path) @@ -256,7 +256,7 @@ def test_api_dump_torch_list(): def train_pt_one_step_range(data_path, dump_path, info_path=None, retain_backwad=True): class Net(BaseNet): def forward(self, x): - api_dump_start(mode='range', scope=['Torch_clip_0', 'Tensor_reshape_0']) + api_dump_start(mode = 'range', scope = ['Torch_clip_0', 'Tensor_reshape_0'], statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = self.conv(x) x = torch.clip(x, 0.2, 0.5) x = self.bn(x) @@ -280,11 +280,11 @@ def test_api_dump_torch_range(): dump_path = Path(tempfile.mkdtemp(prefix="torch_range")) try: train_pt_one_step_range(data_path, dump_path) - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'torch') - assert len(pkl_list) == 14 # 12 apis + 2 layers - assert set(pkl_list) == set(npy_list) + assert len(csv_list) == 14 # 12 apis + 2 layers + assert set(csv_list) == set(npy_list) assert len(stack_list) == 6 # 4 apis + 2 layers finally: shutil.rmtree(data_path) diff --git a/tests/st/troubleshooter/migrator/dump/torch_swin_tiny.py b/tests/st/troubleshooter/migrator/dump/torch_swin_tiny.py index fa3aace..371dc66 100644 --- a/tests/st/troubleshooter/migrator/dump/torch_swin_tiny.py +++ b/tests/st/troubleshooter/migrator/dump/torch_swin_tiny.py @@ -681,7 +681,7 @@ if __name__ == '__main__': net = swin_tiny_patch4_window7_224(5) ts.migrator.api_dump_init(net, 'torch_dump_new', retain_backward=True) data = np.random.random((1, 3, 224, 224)).astype(np.float32) - ts.migrator.api_dump_start() + ts.migrator.api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) out = net(torch.tensor(data)) criterion = nn.MSELoss() # 均方损失函数 target_tensor = torch.randn(1, 5) diff --git a/tests/st/troubleshooter/migrator/dump/utils.py b/tests/st/troubleshooter/migrator/dump/utils.py index f4563b1..9691897 100644 --- a/tests/st/troubleshooter/migrator/dump/utils.py +++ b/tests/st/troubleshooter/migrator/dump/utils.py @@ -12,6 +12,15 @@ def load_pkl(path): ret.append(json.loads(line)) return ret +def load_csv(path): + ret = [] + with open(path, "r") as f: + while True: + line = f.readline() + if not line.strip(): + break + ret.append(json.loads(line)) + return ret def get_pkl_list(path): pkl = load_pkl(path) @@ -21,6 +30,14 @@ def get_pkl_list(path): dump_list.append(name) return dump_list +def get_csv_list(path): + csv = load_csv(path) + dump_list = [] + for line in csv: + name = line[0] + dump_list.append(name) + return dump_list + def get_stack_list(path): with open(path, 'r') as f: @@ -46,14 +63,22 @@ def get_pkl_npy_stack_list(path, framework): path/'rank0'/f'{framework}_api_dump_stack.json') return pkl_list, npy_list, stack_list +def get_csv_npy_stack_list(path, framework): + assert framework in { + 'torch', 'mindspore'}, "framework must in 'torch' or 'mindspore'" + csv_list = get_pkl_list(path/'rank0'/f'{framework}_api_dump_info.csv') + npy_list = get_npy_list(path/'rank0'/f'{framework}_api_dump') + stack_list = get_stack_list( + path/'rank0'/f'{framework}_api_dump_stack.json') + return csv_list, npy_list, stack_list def get_md5_list(path, framework): assert framework in { 'torch', 'mindspore'}, "framework must in 'torch' or 'mindspore'" - pkl = load_pkl(path/'rank0'/f'{framework}_api_dump_info.pkl') + csv = load_pkl(path/'rank0'/f'{framework}_api_dump_info.csv') md5_position = 6 md5_list = [] - for line in pkl: + for line in csv: md5 = line[md5_position] md5_list.append(md5) return md5_list @@ -62,10 +87,10 @@ def get_md5_list(path, framework): def get_l2norm_list(path, framework): assert framework in { 'torch', 'mindspore'}, "framework must in 'torch' or 'mindspore'" - pkl = load_pkl(path/'rank0'/f'{framework}_api_dump_info.pkl') + csv = load_pkl(path/'rank0'/f'{framework}_api_dump_info.csv') l2norm_position = 7 l2norm_list = [] - for line in pkl: + for line in csv: l2norm = line[l2norm_position] l2norm_list.append(l2norm) return l2norm_list \ No newline at end of file diff --git a/tests/st/troubleshooter/migrator/test_api_dump_communication.py b/tests/st/troubleshooter/migrator/test_api_dump_communication.py index 543c5f5..55d9455 100644 --- a/tests/st/troubleshooter/migrator/test_api_dump_communication.py +++ b/tests/st/troubleshooter/migrator/test_api_dump_communication.py @@ -23,7 +23,7 @@ import shutil from mindspore import dtype as mstype from pathlib import Path from troubleshooter.migrator import api_dump_init, api_dump_start, api_dump_stop -from tests.st.troubleshooter.migrator.dump.utils import get_pkl_npy_stack_list +from tests.st.troubleshooter.migrator.dump.utils import get_csv_npy_stack_list try: from mindspore.communication import comm_func comm_func_label = True @@ -53,12 +53,12 @@ def test_api_dump_communicate(): dump_path = Path(tempfile.mkdtemp(prefix="ms_api_dump_communication")) try: api_dump_init(net, dump_path, retain_backward=True) - api_dump_start() + api_dump_start(statistic_category = ['min', 'avg', 'l2norm']) input = ms.Tensor(np.ones([3, 4]).astype(np.float32)) expect_output = [[2, 2, 2, 2],[2, 2, 2, 2],[2, 2, 2, 2]] output = ops.grad(all_reduce_dump)(input) api_dump_stop() - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list(dump_path, 'mindspore') + csv_list, npy_list, stack_list = get_csv_npy_stack_list(dump_path, 'mindspore') assert 'Functional_all_reduce_0_backward_input' in npy_list assert 'Functional_all_reduce_0_forward_input.0' in npy_list assert 'Functional_all_reduce_0_forward_output' in npy_list diff --git a/tests/st/troubleshooter/migrator/test_bfloat16_ms.py b/tests/st/troubleshooter/migrator/test_bfloat16_ms.py index 738a643..61eba3d 100644 --- a/tests/st/troubleshooter/migrator/test_bfloat16_ms.py +++ b/tests/st/troubleshooter/migrator/test_bfloat16_ms.py @@ -21,7 +21,7 @@ import shutil import mindspore as ms from mindspore import Tensor, context, ops, nn from troubleshooter.migrator import api_dump_init, api_dump_start, api_dump_stop -from tests.st.troubleshooter.migrator.dump.utils import get_pkl_npy_stack_list +from tests.st.troubleshooter.migrator.dump.utils import get_csv_npy_stack_list class Net(nn.Cell): @@ -59,13 +59,13 @@ def test_conv2d_bfloat16(): try: net = Net() api_dump_init(net, dump_path, retain_backward=True) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) x = Tensor(np.ones([10, 32, 32, 32]), ms.bfloat16) weight = Tensor(np.ones([32, 32, 3, 3]), ms.bfloat16) grads = conv2d_backward_func(x, weight) dx, dw = grads api_dump_stop() - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list( + csv_list, npy_list, stack_list = get_csv_npy_stack_list( dump_path, 'mindspore') assert 'Functional_conv2d_0_forward_input.0' in npy_list assert 'Functional_conv2d_0_forward_input.1' in npy_list diff --git a/tests/st/troubleshooter/migrator/test_l2norm_ms.py b/tests/st/troubleshooter/migrator/test_l2norm_ms.py index 2be6e79..d6e7eea 100644 --- a/tests/st/troubleshooter/migrator/test_l2norm_ms.py +++ b/tests/st/troubleshooter/migrator/test_l2norm_ms.py @@ -35,7 +35,7 @@ def test_l2norm_ms(): net = Net() api_dump_init(net, dump_path, retain_backward=True) - api_dump_start() + api_dump_start(statistic_category = ['max', 'min', 'avg', 'md5', 'l2norm']) data = ms.Tensor(np.ones([2,10]), ms.float16) out = net(data) api_dump_stop() diff --git a/tests/st/troubleshooter/migrator/test_mint.py b/tests/st/troubleshooter/migrator/test_mint.py index 8bd998d..f4212b6 100644 --- a/tests/st/troubleshooter/migrator/test_mint.py +++ b/tests/st/troubleshooter/migrator/test_mint.py @@ -21,7 +21,7 @@ import mindspore as ms import mindspore.nn as nn from mindspore import Tensor, ops from troubleshooter.migrator import api_dump_init, api_dump_start, api_dump_stop -from tests.st.troubleshooter.migrator.dump.utils import get_pkl_npy_stack_list +from tests.st.troubleshooter.migrator.dump.utils import get_csv_npy_stack_list class MaxNet(nn.Cell): @@ -65,7 +65,7 @@ def test_mint(): dump_path = Path(tempfile.mkdtemp(prefix="ms_api_dump_mint")) try: api_dump_init(net2, dump_path, retain_backward=True) - api_dump_start() + api_dump_start(statistic_category=['max', 'min', 'avg', 'md5', 'l2norm']) input0 = Tensor(np.array([[0.0, 0.3, 0.4, 0.5, 0.1], [3.2, 0.4, 0.1, 2.9, 4.0]]), ms.float32) input1 = Tensor(np.array([[180, 234, 154], [244, 48, 247]]), ms.float32) @@ -79,7 +79,7 @@ def test_mint(): grad = avg_pool2d_backward_func(image, 2, 2, 0, True, False) api_dump_stop() - pkl_list, npy_list, stack_list = get_pkl_npy_stack_list(dump_path, 'mindspore') + csv_list, npy_list, stack_list = get_csv_npy_stack_list(dump_path, 'mindspore') assert 'Functional_max_0_forward_input.0' in npy_list assert 'Functional_max_0_forward_output.0' in npy_list assert 'NN_Linear_0_forward_input.0' in npy_list diff --git a/troubleshooter/docs/api/migrator/api_dump.md b/troubleshooter/docs/api/migrator/api_dump.md index c4580cb..aeae35f 100644 --- a/troubleshooter/docs/api/migrator/api_dump.md +++ b/troubleshooter/docs/api/migrator/api_dump.md @@ -27,7 +27,7 @@ MindSpore 生成的目录结构。 output_path # 输出目录 └── rank0   ├── mindspore_api_dump # npy数据目录 -   ├── mindspore_api_dump_info.pkl # dump的info信息 +   ├── mindspore_api_dump_info.csv # dump的info信息   └── mindspore_api_dump_stack.json # dump的堆栈信息 ``` @@ -37,7 +37,7 @@ Torch 生成的目录结构。 output_path # 输出目录 └── rank0   ├── torch_api_dump # npy数据目录 -   ├── torch_api_dump_info.pkl # dump的info信息 +   ├── torch_api_dump_info.csv # dump的info信息   ├── torch_api_dump_stack.json # dump的堆栈信息   └── pt_net.pth # 存储的网络 state_dict 中的内容(仅在compare_statedict 为 `True`时保存) ``` @@ -48,7 +48,7 @@ MindTorch 生成的目录结构。 output_path # 输出目录 └── rank0   ├── mindtorch_api_dump # npy数据目录 -   ├── mindtorch_api_dump_info.pkl # dump的info信息 +   ├── mindtorch_api_dump_info.csv # dump的info信息   ├── mindtorch_api_dump_stack.json # dump的堆栈信息   └── ad_net.pth # 存储的网络 state_dict 中的内容(仅在compare_statedict 为 `True`时保存) ``` @@ -73,7 +73,7 @@ output_path # 输出目录 其中LAYER表示该npy文件用于存储模块名称;LAYERNAME表示模块的名称。 -- `api_dump_info.pkl`文件为网络在dump时按照API的执行顺序保存的信息,文件项格式如下: +- `api_dump_info.csv`文件为网络在dump时按照API的执行顺序保存的信息,文件项格式如下: ``` [数据名称,保留字段,保留字段,数据类型,数据shape,[最大值,最小值,均值], md5值, l2norm值] ``` @@ -91,18 +91,18 @@ output_path # 输出目录 - mode(str, 可选):dump 模式,目前支持 `'all'`、`'list'`、`'range'`、`'api_list'`,默认值 `'all'`。`'all'` 模式会 dump 全部 API 的数据;`'list'`、`'range'`、`'api_list'` 模式通过配合 `scope` 参数可以实现 dump 特定 API、范围、名称等功能。 -- scope(list, 可选):dump 范围。根据 `mode` 配置的模式选择 dump 的 API 范围。API 范围中的名称可以通过输出目录下的 `api_dump_info.pkl` 文件获取)。 +- scope(list, 可选):dump 范围。根据 `mode` 配置的模式选择 dump 的 API 范围。API 范围中的名称可以通过输出目录下的 `api_dump_info.csv` 文件获取)。 - `mode` 为 `'list'` 时,`scope` 为 dump 特定的文件列表,例如 `['Functional_softmax_1', 'NN_Dense_1', 'Tensor___matmul___1']`,只会 dump 列表中的三个 API; - `mode` 为 `'range'` 时,`scope` 为 dump 的区间范围,例如 `['NN_Dense_1', 'Tensor___matmul___1']`,会 dump 从`'NN_Dense_1'`直到`'Tensor__matmul___1'`的所有 API; - `mode` 为 `'api_list'` 时,`scope` 为 dump 特定的 API 列表,例如 `['relu', 'softmax', 'layernorm']`,会 dump 名称中含有 `relu`、`softmax`、`layernorm` 关键字的所有 API,不区分 `Tensor`、`Functional` 等方法类型。 -- dump_type(`str`, 可选):dump 保存的数据类型,目前支持 `'all'`、`'statistics'`、`'npy'`、`'stack'`, 默认值为 `'all'`。以下模式均会保存数据的堆栈信息(`api_dump_stack.json`)与执行顺序(`api_dump_info.pkl`)。 +- dump_type(`str`, 可选):dump 保存的数据类型,目前支持 `'all'`、`'statistics'`、`'npy'`、`'stack'`, 默认值为 `'all'`。以下模式均会保存数据的堆栈信息(`api_dump_stack.json`)与执行顺序(`api_dump_info.csv`)。 - - 为 `'all'`时会保存数据的统计信息(`api_dump_info.pkl`文件中数据的最大/最小/均值信息)和 npy 文件,速度最慢,存储空间占用大; + - 为 `'all'`时会保存数据的统计信息(`api_dump_info.csv`文件中数据的最大/最小/均值信息)和 npy 文件,速度最慢,存储空间占用大; - 为 `'npy'`时,**不会保存数据的统计信息**,会保存数据的npy文件,速度较`'all'`模式快,存储空间占用大; - 为 `'statistics'` 时,**不会保存npy文件**,会保存数据的统计信息,存储空间占用小,结合`api_dump_compare`,可以根据统计信息初步定位精度问题; - - 为 `'stack'`时,只会保存数据的堆栈信息(`api_dump_stack.json`)与执行顺序(`api_dump_info.pkl`),运行速度最快,存储空间占用最小,常用于快速验证`api_dump_compare`中API映射结果。 + - 为 `'stack'`时,只会保存数据的堆栈信息(`api_dump_stack.json`)与执行顺序(`api_dump_info.csv`),运行速度最快,存储空间占用最小,常用于快速验证`api_dump_compare`中API映射结果。 - filter_data(`bool`, 可选):是否开启 dump 数据过滤,默认值为 `True`。为 `True` 时,非浮点类型的 Tensor 和标量将会被过滤,不会被保存。 @@ -110,6 +110,14 @@ output_path # 输出目录 - overflow_check(`bool`, 可选):是否开启溢出检测及dump,默认值为 `False`。为 `True`时,开启溢出检测并dump溢出数据。 +- statistic_category(`list`,可选): 是否开启用户自定义dump输出结果,默认值为[`'max'`, `'min'`,`'l2norm'`]时,只会dump出`api_dump_info.csv`文件中数据的最大值和最小值和l2norm信息,减少dump没必要的数据的时间开支;目前支持的模式有`max`, `min`, `avg`, `md5`,`l2norm`五种模式, + + - `max`: 统计最大值 + - `min`: 统计最小值 + - `avg`: 统计平均值 + - `md5`: 统计md5信息 + - `l2norm`: 统计l2norm信息 + ## troubleshooter.migrator.api_dump_stop > troubleshooter.migrator.api_dump_stop() diff --git a/troubleshooter/troubleshooter/migrator/api_dump/ad_dump/hooks.py b/troubleshooter/troubleshooter/migrator/api_dump/ad_dump/hooks.py index d7934e1..426aa81 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/ad_dump/hooks.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/ad_dump/hooks.py @@ -7,7 +7,7 @@ from mindtorch.torch.tensor import cast_to_adapter_tensor NNCount = defaultdict(int) def make_adapter_dump_dirs(rank): - dump_file_name, dump_path = "mindtorch_api_dump_info.pkl", "mindtorch_api_dump" + dump_file_name, dump_path = "mindtorch_api_dump_info.csv", "mindtorch_api_dump" dump_stack_file = "mindtorch_api_dump_stack.json" dump_root_dir = DumpUtil.dump_ori_dir if DumpUtil.dump_ori_dir else "./" Path(dump_root_dir).mkdir(mode=0o700, parents=True, exist_ok=True) diff --git a/troubleshooter/troubleshooter/migrator/api_dump/api_dump_compare.py b/troubleshooter/troubleshooter/migrator/api_dump/api_dump_compare.py index f85b6a7..2125e17 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/api_dump_compare.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/api_dump_compare.py @@ -72,9 +72,7 @@ def _get_npy_shape_map(pkl_path): def _read_line(line): prefix, dump_step, _, data_type, data_shape, data_summary, md5_nume, l2norm = line return {prefix: data_shape} - ret = {} - pkl = load_pkl(pkl_path) for l in pkl: shape = _read_line(l) @@ -303,15 +301,15 @@ def get_npy_map_list( def get_dump_path(root_path): root_path = Path(root_path) - ms_pkl_path = root_path.joinpath("rank0", "mindspore_api_dump_info.pkl") + ms_pkl_path = root_path.joinpath("rank0", "mindspore_api_dump_info.csv") ms_npy_path = root_path.joinpath("rank0", "mindspore_api_dump") ms_npy_path_not_empty = ms_npy_path.exists() and list(ms_npy_path.iterdir()) - pt_pkl_path = root_path.joinpath("rank0", "torch_api_dump_info.pkl") + pt_pkl_path = root_path.joinpath("rank0", "torch_api_dump_info.csv") pt_npy_path = root_path.joinpath("rank0", "torch_api_dump") pt_npy_path_not_empty = pt_npy_path.exists() and list(pt_npy_path.iterdir()) - ad_pkl_path = root_path.joinpath('rank0', 'mindtorch_api_dump_info.pkl') + ad_pkl_path = root_path.joinpath('rank0', 'mindtorch_api_dump_info.csv') ad_npy_path = root_path.joinpath('rank0', 'mindtorch_api_dump') ad_npy_path_not_empty = ad_npy_path.exists() and list(ad_npy_path.iterdir()) @@ -483,9 +481,7 @@ def compare_mindtorch_summary(origin_pkl_path, target_pkl_path, name_map_list, f def _read_line(line): prefix, dump_step, _, data_type, data_shape, data_summary, md5_nume, l2norm = line return {prefix: (data_type, data_shape, data_summary)} - ret = {} - pkl = load_pkl(pkl_path) for l in pkl: summary = _read_line(l) @@ -532,7 +528,6 @@ def compare_summary(origin_pkl_path, target_pkl_path, name_map_list, **print_kwa def _read_line(line): prefix, dump_step, _, data_type, data_shape, data_summary, md5_nume, l2norm = line return {prefix: (data_shape, data_summary)} - ret = {} pkl = load_pkl(pkl_path) diff --git a/troubleshooter/troubleshooter/migrator/api_dump/ms_dump/hooks.py b/troubleshooter/troubleshooter/migrator/api_dump/ms_dump/hooks.py index f7a02ec..5fbf43d 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/ms_dump/hooks.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/ms_dump/hooks.py @@ -54,6 +54,8 @@ class DumpUtil(object): dump_filter_stack = True dump_count = 0 dump_overflow = False + statistic_category = None + @staticmethod def set_ori_dir(path): @@ -66,7 +68,7 @@ class DumpUtil(object): DumpUtil.dump_stack_file = dump_stack_file DumpUtil.dump_init_enable = True @staticmethod - def set_dump_switch(switch, mode, scope, api_list, filter_switch, dump_mode, dump_type, filter_stack, overflow): + def set_dump_switch(switch, mode, scope, api_list, filter_switch, dump_mode, dump_type, filter_stack, overflow,statistic_category): DumpUtil.dump_switch = switch DumpUtil.dump_switch_mode = mode DumpUtil.dump_switch_scope = scope @@ -76,6 +78,7 @@ class DumpUtil(object): DumpUtil.dump_type = dump_type DumpUtil.dump_filter_stack = filter_stack DumpUtil.dump_overflow = overflow + DumpUtil.statistic_category = statistic_category if mode == Const.ACL: DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in scope] @@ -164,60 +167,88 @@ class DataInfo(object): self.l2norm = l2norm -def get_not_float_tensor_info(data, compute_summary): +def get_not_float_tensor_info(data, compute_summary, statistic_category): saved_tensor = data.asnumpy() + tensor_max, tensor_min, tensor_mean = math.nan, math.nan, math.nan if compute_summary: if saved_tensor.size == 0 or saved_tensor.dtype == np.bool_: - tensor_max = [] - tensor_min = [] - tensor_mean = [] + pass elif len(saved_tensor.shape) == 0: - tensor_max = saved_tensor.astype(np.float32).tolist() - tensor_min = saved_tensor.astype(np.float32).tolist() - tensor_mean = saved_tensor.astype(np.float32).tolist() + if 'max' in statistic_category: + tensor_max = saved_tensor.astype(np.float32).tolist() + if 'min' in statistic_category: + tensor_min = saved_tensor.astype(np.float32).tolist() + if 'avg' in statistic_category: + tensor_mean = saved_tensor.astype(np.float32).tolist() else: - tensor_max = saved_tensor.max().astype(np.float32).tolist() - tensor_min = saved_tensor.min().astype(np.float32).tolist() - tensor_mean = saved_tensor.astype(np.float32).mean().tolist() + if 'max' in statistic_category: + tensor_max = saved_tensor.max().astype(np.float32).tolist() + if 'min' in statistic_category: + tensor_min = saved_tensor.min().astype(np.float32).tolist() + if 'avg' in statistic_category: + tensor_mean = saved_tensor.astype(np.float32).mean().tolist() else: - tensor_max = math.nan - tensor_min = math.nan - tensor_mean = math.nan + pass summary_data = [tensor_max, tensor_min, tensor_mean] - md5_nume = hashlib.md5(saved_tensor).hexdigest() - l2norm = np.linalg.norm(saved_tensor).item() - return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) - - -def get_scalar_data_info(data, compute_summary): + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume,[]) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], l2norm) + else: + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], []) + +def get_scalar_data_info(data, compute_summary, statistic_category): if compute_summary: summary_data = [data, data, data] else: summary_data = [math.nan] * 3 - md5_nume = hashlib.md5(str(data).encode()).hexdigest() - l2norm = np.linalg.norm(data).item() - return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, l2norm) - + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(str(data).encode()).hexdigest() + l2norm = np.linalg.norm(data).item() + return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(str(data).encode()).hexdigest() + return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, []) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(data).item() + return DataInfo(data, data, summary_data, str(type(data)), [], [], l2norm) + else: + return DataInfo(data, data, summary_data, str(type(data)), [], [], []) -def get_float_tensor_info(data, compute_summary): +def get_float_tensor_info(data, compute_summary,statistic_category): dtype = str(data.dtype) + tensor_max, tensor_min, tensor_mean = math.nan, math.nan, math.nan if data.dtype == mstype.bfloat16: data = ops.Cast()(data, dtype=mstype.float32) - saved_tensor = data.asnumpy() if compute_summary: - tensor_max = saved_tensor.max().astype(np.float32).tolist() - tensor_min = saved_tensor.min().astype(np.float32).tolist() - tensor_mean = saved_tensor.mean().astype(np.float32).tolist() + if 'max' in statistic_category: + tensor_max = saved_tensor.max().astype(np.float32).tolist() + if 'min' in statistic_category: + tensor_min = saved_tensor.min().astype(np.float32).tolist() + if 'avg' in statistic_category: + tensor_mean = saved_tensor.mean().astype(np.float32).tolist() else: - tensor_max = math.nan - tensor_min = math.nan - tensor_mean = math.nan + pass summary_data = [tensor_max, tensor_min, tensor_mean] - md5_nume = hashlib.md5(saved_tensor).hexdigest() - l2norm = np.linalg.norm(saved_tensor).item() - return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) - + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, []) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], l2norm) + else: + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], []) def set_dump_path(fpath=None): if fpath is None: @@ -231,7 +262,7 @@ def set_dump_path(fpath=None): def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.ON, dump_mode=Const.ALL, dump_type=Const.ALL, - filter_stack=True, overflow=False): + filter_stack=True, overflow=False,statistic_category=None): if scope is None: scope = [] if api_list is None: @@ -239,7 +270,7 @@ def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, DumpUtil.set_dump_switch(switch, mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode, dump_type=dump_type, - filter_stack=filter_stack, overflow=overflow) + filter_stack=filter_stack, overflow=overflow,statistic_category=statistic_category) if switch == "ON": logger.user_attention(f"API dump has started. Dump data will be saved to {DumpUtil.dump_ori_dir}. ") @@ -273,13 +304,14 @@ def dump_data(dump_file_name, dump_step, prefix, data_info, dump_type): f.write('\n') -def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): +def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type, statistic_category): compute_summary = True if dump_type in ['all', 'statistics'] else False dump_npy = True if dump_type in ['all', 'npy'] else False + if isinstance(x, (tuple, list)) and x: res = [] for i, item in enumerate(x): - output_hook_tensor = dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name, dump_type) + output_hook_tensor = dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name, dump_type,statistic_category) res.append(output_hook_tensor) return res if universal_interface.g_retain_backward else None elif isinstance(x, ms.Tensor): @@ -288,7 +320,7 @@ def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): grad = grad[0] nonlocal dump_file_name, dump_step, prefix, dump_npy, compute_summary prefix = prefix.replace('_forward_output', '_backward_input') - data_info_ = get_info(grad, compute_summary) + data_info_ = get_info(grad, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info_, dump_npy) dump_flag = True @@ -300,7 +332,7 @@ def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): data_info_func = get_float_tensor_info if dump_flag: - data_info = data_info_func(x, compute_summary) + data_info = data_info_func(x, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info, dump_npy) if universal_interface.g_retain_backward and "_output" in prefix: def backward_hook_func(grad): @@ -310,7 +342,7 @@ def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): return x if universal_interface.g_retain_backward else None elif DumpUtil.dump_filter_switch == Const.OFF: if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): - data_info = get_scalar_data_info(x, compute_summary) + data_info = get_scalar_data_info(x, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info, dump_npy) return x if universal_interface.g_retain_backward else None @@ -389,14 +421,14 @@ def ad_dump_acc_cmp(name, in_feat, out_feat, dump_step): if name[:6] == "LAYER_": from mindtorch.torch.tensor import Tensor # backward hook will not be executed if we move this func to another file, bug? - return dump_api_tensor(dump_step, Tensor([0]), name_template, None, dump_file_name, DumpUtil.dump_type) - return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type) + return dump_api_tensor(dump_step, Tensor([0]), name_template, None, dump_file_name, DumpUtil.dump_type,DumpUtil.statistic_category) + return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type,DumpUtil.statistic_category) else: msg = f"Current mode '{DumpUtil.dump_switch_mode}' is not supported. Please use the field in {Const.DUMP_MODE}" raise ValueError(msg) def make_dump_dirs(rank): - dump_file_name, dump_path = "mindspore_api_dump_info.pkl", "mindspore_api_dump" + dump_file_name, dump_path = "mindspore_api_dump_info.csv", "mindspore_api_dump" dump_stack_file = "mindspore_api_dump_stack.json" dump_root_dir = DumpUtil.dump_ori_dir if DumpUtil.dump_ori_dir else "./" Path(dump_root_dir).mkdir(mode=0o700, parents=True, exist_ok=True) @@ -491,10 +523,10 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step): if isinstance(out_feat, ms.Tensor): if not check_overflow(out_feat): dump_stack_info(name_template, dump_stack_file, DumpUtil.dump_filter_stack) - return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type) + return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type, DumpUtil.statistic_category) else: dump_stack_info(name_template, dump_stack_file, DumpUtil.dump_filter_stack) - return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type) + return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type, DumpUtil.statistic_category) else: msg = f"Current mode '{DumpUtil.dump_switch_mode}' is not supported. Please use the field in {Const.DUMP_MODE}" raise ValueError(msg) @@ -549,10 +581,10 @@ def all_finite(inputs): def check_overflow(out_feat): return all_finite((out_feat,)) -def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file, dump_type): +def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file, dump_type, statistic_category): if in_feat is not None: - dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file, dump_type) - return dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file, dump_type) + dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file, dump_type,statistic_category) + return dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file, dump_type,statistic_category) def acc_cmp_dump(name, **kwargs): dump_step = kwargs.get('dump_step', 1) diff --git a/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/dump.py b/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/dump.py index 2ba6e95..8b8b1f3 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/dump.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/dump.py @@ -64,56 +64,85 @@ class DataInfo(object): self.l2norm = l2norm -def get_not_float_tensor_info(data, compute_summary): +def get_not_float_tensor_info(data, compute_summary, statistic_category): saved_tensor = data.contiguous().cpu().detach().numpy() + tensor_max, tensor_min, tensor_mean = math.nan, math.nan, math.nan if compute_summary: if data.numel() == 0 or data.dtype == torch.bool: - tensor_max = math.nan - tensor_min = math.nan - tensor_mean = math.nan + pass elif len(data.shape) == 0: - tensor_max = data.cpu().detach().float().numpy().tolist() - tensor_min = data.cpu().detach().float().numpy().tolist() - tensor_mean = data.cpu().detach().float().numpy().tolist() + if 'max' in statistic_category: + tensor_max = data.cpu().detach().float().numpy().tolist() + if 'min' in statistic_category: + tensor_min = data.cpu().detach().float().numpy().tolist() + if 'avg' in statistic_category: + tensor_mean = data.cpu().detach().float().numpy().tolist() else: - tensor_max = TorchFunc['max'](data).cpu().detach().float().numpy().tolist() - tensor_min = TorchFunc['min'](data).cpu().detach().float().numpy().tolist() - tensor_mean = TorchFunc['mean'](data.float()).cpu().detach().float().numpy().tolist() + if 'max' in statistic_category: + tensor_max = TorchFunc['max'](data).cpu().detach().float().numpy().tolist() + if 'min' in statistic_category: + tensor_min = TorchFunc['min'](data).cpu().detach().float().numpy().tolist() + if 'avg' in statistic_category: + tensor_mean = TorchFunc['mean'](data.float()).cpu().detach().float().numpy().tolist() else: - tensor_max = math.nan - tensor_min = math.nan - tensor_mean = math.nan + pass summary_data = [tensor_max, tensor_min, tensor_mean] - md5_nume = hashlib.md5(saved_tensor).hexdigest() - l2norm = np.linalg.norm(saved_tensor).item() - return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) - - -def get_scalar_data_info(data, compute_summary): + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, []) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], l2norm) + else: + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], []) + +def get_scalar_data_info(data, compute_summary, statistic_category): if compute_summary: summary_data = [data, data, data] else: summary_data = [math.nan] * 3 - md5_nume = hashlib.md5(str(data).encode()).hexdigest() - l2norm = np.linalg.norm(data).item() - return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, l2norm) - + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(str(data).encode()).hexdigest() + l2norm = np.linalg.norm(data).item() + return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(str(data).encode()).hexdigest() + return DataInfo(data, data, summary_data, str(type(data)), [], md5_nume, []) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(data).item() + return DataInfo(data, data, summary_data, str(type(data)), [], [], l2norm) + else: + return DataInfo(data, data, summary_data, str(type(data)), [], [], []) -def get_float_tensor_info(data, compute_summary): +def get_float_tensor_info(data, compute_summary, statistic_category): saved_tensor = data.contiguous().cpu().detach().numpy() + tensor_max, tensor_min, tensor_mean = math.nan, math.nan, math.nan if compute_summary: - tensor_max = TorchFunc['max'](data).cpu().detach().float().numpy().tolist() - tensor_min = TorchFunc['min'](data).cpu().detach().float().numpy().tolist() - tensor_mean = TorchFunc['mean'](data).cpu().detach().float().numpy().tolist() + if 'max' in statistic_category: + tensor_max = TorchFunc['max'](data).cpu().detach().float().numpy().tolist() + if 'min' in statistic_category: + tensor_min = TorchFunc['min'](data).cpu().detach().float().numpy().tolist() + if 'avg' in statistic_category: + tensor_mean = TorchFunc['mean'](data).cpu().detach().float().numpy().tolist() else: - tensor_max = math.nan - tensor_min = math.nan - tensor_mean = math.nan + pass summary_data = [tensor_max, tensor_min, tensor_mean] - md5_nume = hashlib.md5(saved_tensor).hexdigest() - l2norm = np.linalg.norm(saved_tensor).item() - return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) - + if 'md5' in statistic_category and 'l2norm' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, l2norm) + elif 'md5' in statistic_category: + md5_nume = hashlib.md5(saved_tensor).hexdigest() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), md5_nume, []) + elif 'l2norm' in statistic_category: + l2norm = np.linalg.norm(saved_tensor).item() + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], l2norm) + else: + return DataInfo(data, saved_tensor, summary_data, str(data.dtype), tuple(data.shape), [], []) def json_dump_condition(prefix): cur_threading_id = threading.current_thread().ident @@ -123,12 +152,13 @@ def json_dump_condition(prefix): return (Const.BACKWARD in prefix and backward_threading_id == cur_threading_id) or 'forward' in prefix -def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): +def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type, statistic_category): compute_summary = True if dump_type in ['all', 'statistics'] else False dump_npy = True if dump_type in ['all', 'npy'] else False + if isinstance(x, (tuple, list)) and x: for i, item in enumerate(x): - dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name, dump_type) + dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name, dump_type, statistic_category) return elif isinstance(x, torch.Tensor): def backward_hook(grad, get_info): @@ -136,7 +166,7 @@ def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): return nonlocal dump_file_name, dump_step, prefix, dump_npy, compute_summary prefix = prefix.replace('_forward_output', '_backward_input') - data_info_ = get_info(grad, compute_summary) + data_info_ = get_info(grad, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info_, dump_npy) dump_flag = True @@ -148,14 +178,14 @@ def dump_tensor(x, prefix, dump_step, dump_file_name, dump_type): data_info_func = get_float_tensor_info if dump_flag: - data_info = data_info_func(x, compute_summary) + data_info = data_info_func(x, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info, dump_npy) if universal_interface.g_retain_backward and x.requires_grad is True and "_output" in prefix: x.register_hook(partial(backward_hook, get_info=get_float_tensor_info)) elif DumpUtil.dump_filter_switch == Const.OFF: if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): - data_info = get_scalar_data_info(x, compute_summary) + data_info = get_scalar_data_info(x, compute_summary, statistic_category) dump_data(dump_file_name, dump_step, prefix, data_info, dump_npy) @@ -244,10 +274,10 @@ def dump_stack_info(name_template, dump_file, filter_stack): f.write(json_str) -def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file, dump_type): +def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file, dump_type, statistic_category): if in_feat is not None: - dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file, dump_type) - dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file, dump_type) + dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file, dump_type, statistic_category) + dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file, dump_type, statistic_category) def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): @@ -265,8 +295,8 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): if DumpUtil.check_switch_scope(name.rstrip('_forward')): dump_stack_info(name_template, dump_stack_file, DumpUtil.dump_filter_stack) if name[:6] == "LAYER_": - return dump_api_tensor(dump_step, torch.Tensor([0]), name_template, None, dump_file_name, DumpUtil.dump_type) - return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type) + return dump_api_tensor(dump_step, torch.Tensor([0]), name_template, None, dump_file_name, DumpUtil.dump_type, DumpUtil.statistic_category) + return dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file_name, DumpUtil.dump_type, DumpUtil.statistic_category) else: msg = f"Current mode '{DumpUtil.dump_switch_mode}' is not supported. Please use the field in {Const.DUMP_MODE}" raise ValueError(msg) diff --git a/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/utils.py b/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/utils.py index a03030c..3af7e46 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/utils.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/pt_dump/dump/utils.py @@ -30,6 +30,7 @@ class DumpUtil(object): dump_stack_dic = {} dump_filter_stack = True dump_count = 0 + statistic_category = None @staticmethod def set_ori_dir(path): @@ -48,7 +49,7 @@ class DumpUtil(object): @staticmethod def set_dump_switch(switch, mode, scope, api_list, - filter_switch, dump_mode, dump_type, filter_stack, overflow): + filter_switch, dump_mode, dump_type, filter_stack, overflow,statistic_category): DumpUtil.dump_switch = switch DumpUtil.dump_switch_mode = mode DumpUtil.dump_switch_scope = scope @@ -58,6 +59,7 @@ class DumpUtil(object): DumpUtil.dump_type = dump_type DumpUtil.dump_filter_stack = filter_stack DumpUtil.dump_overflow = overflow + DumpUtil.statistic_category = statistic_category if mode == Const.ACL: DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in scope] @@ -157,7 +159,7 @@ def generate_dump_path_str(): def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.ON, dump_mode=Const.ALL, dump_type=Const.ALL, - filter_stack=True, overflow=False): + filter_stack=True, overflow=False,statistic_category=None): if scope is None: scope = [] if api_list is None: @@ -165,7 +167,7 @@ def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, DumpUtil.set_dump_switch(switch, mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode, dump_type=dump_type, - filter_stack=filter_stack, overflow=overflow) + filter_stack=filter_stack, overflow=overflow,statistic_category=statistic_category) if switch == "ON": dump_path_str = generate_dump_path_str() @@ -204,7 +206,7 @@ def make_dump_data_dir(dump_file_name): def make_dump_dirs(rank): - dump_file_name, dump_path = "torch_api_dump_info.pkl", "torch_api_dump" + dump_file_name, dump_path = "torch_api_dump_info.csv", "torch_api_dump" dump_stack_file = "torch_api_dump_stack.json" dump_root_dir = DumpUtil.dump_ori_dir if DumpUtil.dump_ori_dir else "./" Path(dump_root_dir).mkdir(mode=0o700, parents=True, exist_ok=True) diff --git a/troubleshooter/troubleshooter/migrator/api_dump/universal_interface.py b/troubleshooter/troubleshooter/migrator/api_dump/universal_interface.py index 0b58941..e435aa0 100644 --- a/troubleshooter/troubleshooter/migrator/api_dump/universal_interface.py +++ b/troubleshooter/troubleshooter/migrator/api_dump/universal_interface.py @@ -127,24 +127,28 @@ def api_dump_init(net, output_path=os.path.join(os.getcwd(), "ts_api_dump"), *, f"mindspore.nn.Cell, torch.nn.Module or mindtorch.torch.nn.Module, but got {type(net)}.") -def api_dump_start(mode='all', scope=None, dump_type="all", filter_data=True, filter_stack=True, overflow_check=False): +def api_dump_start(mode = 'all', scope = None, dump_type = "all", filter_data = True, filter_stack = True, overflow_check = False, statistic_category = ['max', 'min', 'l2norm']): check_mode_and_scope(mode, scope) if scope is None: scope = [] support_dump_type = {'all', 'statistics', 'stack', 'npy'} + support_statistic_category = {'min', 'avg', 'max', 'md5','l2norm'} enum_check(dump_type, 'support_dump_type', support_dump_type) + if statistic_category is not None: + for param in statistic_category: + enum_check(param, 'statistic_category', support_statistic_category) type_check(filter_data, 'filter_data', bool) filter_switch = 'ON' if filter_data else 'OFF' if API_DUMP_FRAMEWORK_TYPE == "torch": pt_set_dump_switch("ON", mode, scope=scope, api_list=scope, filter_switch=filter_switch, - dump_type=dump_type, filter_stack=filter_stack) + dump_type=dump_type, filter_stack=filter_stack, statistic_category=statistic_category) elif API_DUMP_FRAMEWORK_TYPE == "mindspore": ms_set_dump_switch("ON", mode, scope=scope, api_list=scope, filter_switch=filter_switch, - dump_type=dump_type, filter_stack=filter_stack, overflow=overflow_check) + dump_type=dump_type, filter_stack=filter_stack, overflow=overflow_check, statistic_category=statistic_category) elif API_DUMP_FRAMEWORK_TYPE == "mindtorch": mindtorch.module_hooker.torch_enable() ad_set_dump_switch("ON", mode, scope=scope, api_list=scope, filter_switch=filter_switch, - dump_type=dump_type, filter_stack=filter_stack) + dump_type=dump_type, filter_stack=filter_stack, statistic_category=statistic_category) mindtorch.module_hooker.torch_pop() else: raise RuntimeError("You must call 'troubleshooter.api_dump.init' before calling" -- Gitee