From 4020a3ab792ac0a74cf953b6ba3bea901b59d4e6 Mon Sep 17 00:00:00 2001 From: tiger <18297133@qq.com> Date: Tue, 25 Apr 2023 10:52:22 +0800 Subject: [PATCH 1/2] Weight description modification --- troubleshooter/README.md | 2 +- .../troubleshooter/migrator/diff_handler.py | 23 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/troubleshooter/README.md b/troubleshooter/README.md index 6fb07c3..a14a4ba 100644 --- a/troubleshooter/README.md +++ b/troubleshooter/README.md @@ -387,7 +387,7 @@ x = self.sqrt(y) 出现 nan, 给出“User Warning 'nan' is detected”报错。 print(output) ### 应用场景4:跟踪API报错的详细执行过程 -当API报错时,我们仅能看到有限的堆栈信息,有时需要了解API的调用流程和参数传递&变化过程,以定位报错的原因。此场景下,可以应用tracking功能进行错误跟踪, +当API报错时,我们仅能看到有限的堆栈信息,有时需要了解API的调用流程和参数传递&变化过程,以定位报错的原因,此场景下,可以应用tracking功能进行错误跟踪。 #### 如何使用: ##### 方法1:使用ts.tracking跟踪异常产生过程 diff --git a/troubleshooter/troubleshooter/migrator/diff_handler.py b/troubleshooter/troubleshooter/migrator/diff_handler.py index 122471c..2312115 100644 --- a/troubleshooter/troubleshooter/migrator/diff_handler.py +++ b/troubleshooter/troubleshooter/migrator/diff_handler.py @@ -99,7 +99,8 @@ class DifferenceFinder: none_flag = False if not (orig_name_list and target_name_list): - logger.user_error("The comparison file is not found in the directory. Please check whether the directory is correct") + logger.user_error("The comparison file is not found in the directory. " + "Please check whether the directory is correct") exit(1) for name in orig_name_list: @@ -170,7 +171,8 @@ class DifferenceFinder: diff_detail = ("Shape is inconsistent", orig_value.shape, target_value.shape) result_list.append((orig_name, target_name, result, diff_detail)) - logger.user_attention("The compare directory information:\n The orig dir: %s \n The target dir: %s", self.orig_dir, self.target_dir) + logger.user_attention("The compare directory information:\n The orig dir: %s \n The target dir: %s", + self.orig_dir, self.target_dir) print_diff_result(result_list) class WeightMigrator: @@ -194,13 +196,19 @@ class WeightMigrator: elif isinstance(pt_object, torch.nn.Module): pt_para_dict = pt_object.state_dict() else: - raise ValueError("The file cannot be parsed properly. For customized parameter saved files, " - "please load and parse them yourself, and set the 'pth_para_dict' parameter directly") + raise ValueError("PTH file parsing failed, possible reasons: " + "1) If using a custom method to save parameter files, please load and set " + "the 'pth_para_dict' parameter yourself to use the conversion tool." + "2) If the input is an optimizer parameter, this tool does not support " + "the conversion of optimizer parameters.") values = list(pt_para_dict.values()) if values and not isinstance(values[0], torch.Tensor): - raise ValueError("The file cannot be parsed properly. For customized parameter saved files, " - "please load and parse them yourself, and set the 'pth_para_dict' parameter directly") + raise ValueError("PTH file parsing failed, possible reasons: " + "1) If using a custom method to save parameter files, please load and set " + "the 'pth_para_dict' parameter yourself to use the conversion tool." + "2) If the input is an optimizer parameter, this tool does not support " + "the conversion of optimizer parameters.") return pt_para_dict def _get_object(self, name): @@ -337,7 +345,8 @@ class WeightMigrator: ms_para_after_conv = ckpt_after_conv_dict.get(ms_para_name) if ms_para_after_conv is not None: - name_map_list.append((ms_para_name, ms_para_name, (ms_para.shape == ms_para_after_conv.shape), ms_para.shape, ms_para_after_conv.shape)) + name_map_list.append((ms_para_name, ms_para_name, (ms_para.shape == ms_para_after_conv.shape), + ms_para.shape, ms_para_after_conv.shape)) ckpt_after_conv_dict.pop(ms_para_name) else: name_map_list.append((ms_para_name, None, None, ms_para.shape, None)) -- Gitee From 4f9906a15319b8f992884c6175fb179cf6c23fea Mon Sep 17 00:00:00 2001 From: tiger <18297133@qq.com> Date: Thu, 27 Apr 2023 15:45:58 +0800 Subject: [PATCH 2/2] add migrator test case --- .../migrator/weight_migrator_demo_4_1.py | 71 ----- .../migrator/weight_migrator_demo_4_2.py | 70 ----- .../migrator/weight_migrator_demo_5.py | 12 +- .../tests/diff_handler/test_migrator.py | 267 ++++++++++++++++++ .../troubleshooter/migrator/diff_handler.py | 2 +- 5 files changed, 276 insertions(+), 146 deletions(-) delete mode 100644 troubleshooter/examples/migrator/weight_migrator_demo_4_1.py delete mode 100644 troubleshooter/examples/migrator/weight_migrator_demo_4_2.py create mode 100644 troubleshooter/tests/diff_handler/test_migrator.py diff --git a/troubleshooter/examples/migrator/weight_migrator_demo_4_1.py b/troubleshooter/examples/migrator/weight_migrator_demo_4_1.py deleted file mode 100644 index 854ebae..0000000 --- a/troubleshooter/examples/migrator/weight_migrator_demo_4_1.py +++ /dev/null @@ -1,71 +0,0 @@ -"""PyTorch training""" -import torch -import torch.nn as nn -import troubleshooter as ts -from collections import OrderedDict -import mindspore -#@场景:验证模型场景下提取权重参数 -class MyNet3(nn.Module): - def __init__(self, in_features, out_classes): - super(MyNet3, self).__init__() - - self.features = nn.Sequential( - OrderedDict([ - ('Linear_mm', nn.Linear(in_features, 64)), - ('bn_mm', nn.BatchNorm1d(64)), - ('relu_mm', nn.ReLU()), - ('Linear_mm', nn.Linear(64, out_classes)) - ]) - ) - - def forward(self, x): - x = self.features(x) - return x - -if __name__ == '__main__': - - - #@验证:函数式转换 - #torch_net=MyModule1() - - #@验证:ParameterList权重参数 - #torch_net=MyModule2() - - #@验证:ModuleList+Sequential - #torch_net=MyModule3() - - # @验证:ModuleList - #torch_net = MyNet(in_channels=10,out_channels=2,hidden_size=20) - - # @验证:Sequential - #torch_net = MyNet1(in_channels=3,out_channels=10,hidden_size=128) - - # @验证:Sequential+OrderedDict封装 - #torch_net = MyNet2(in_channels=3,out_channels=10,hidden_size=128) - - - # @验证:Sequential + BatchNorm1d - torch_net = MyNet3(in_features=10,out_classes=2) - - # @验证模型保存的权重迁移工具 - torch.save(torch_net, "torch_net.pth") - pth_path = "./torch_net.pth" - - model = torch.load(pth_path) - - # @验证模型场景下提取权重参数 - #pd = model.state_dict() - #for name, param in pd.items(): - # print(name,":",param.size()) - - wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='./convert_resnet.ckpt') - #w_maps = wm.get_weight_map(full_name_map=True, print_map=True) - # test_map = {'bn1.bias': 'bn1.beta',} - - wm.convert(weight_name_prefix="miao") - - #打印ckpt - #pth = "./convert_resnet.ckpt" - #param_dict = mindspore.load_checkpoint(pth) - #for key, value in param_dict.items(): - # print(key, ':', value) \ No newline at end of file diff --git a/troubleshooter/examples/migrator/weight_migrator_demo_4_2.py b/troubleshooter/examples/migrator/weight_migrator_demo_4_2.py deleted file mode 100644 index 8c8ead1..0000000 --- a/troubleshooter/examples/migrator/weight_migrator_demo_4_2.py +++ /dev/null @@ -1,70 +0,0 @@ -"""PyTorch training""" -import torch -import torch.nn as nn -import troubleshooter as ts -from collections import OrderedDict -import mindspore -#@场景:验证直接传入参数字典的场景 -class MyNet3(nn.Module): - def __init__(self, in_features, out_classes): - super(MyNet3, self).__init__() - - self.features = nn.Sequential( - OrderedDict([ - ('Linear_mm', nn.Linear(in_features, 64)), - ('bn_mm', nn.BatchNorm1d(64)), - ('relu_mm', nn.ReLU()), - ('Linear_mm', nn.Linear(64, out_classes)) - ]) - ) - - def forward(self, x): - x = self.features(x) - return x - -if __name__ == '__main__': - - - #@验证:函数式转换 - #torch_net=MyModule1() - - #@验证:ParameterList权重参数 - #torch_net=MyModule2() - - #@验证:ModuleList+Sequential - #torch_net=MyModule3() - - # @验证:ModuleList - #torch_net = MyNet(in_channels=10,out_channels=2,hidden_size=20) - - # @验证:Sequential - #torch_net = MyNet1(in_channels=3,out_channels=10,hidden_size=128) - - # @验证:Sequential+OrderedDict封装 - #torch_net = MyNet2(in_channels=3,out_channels=10,hidden_size=128) - - - # @验证:Sequential + BatchNorm1d - torch_net = MyNet3(in_features=10,out_classes=2) - - # @验证模型保存的权重迁移工具 - torch.save(torch_net, "torch_net.pth") - pth_path = "./torch_net.pth" - - model = torch.load(pth_path) - - # @验证模型场景下提取权重参数 - pd = model.state_dict() - for name, param in pd.items(): - print(name,":",param.size()) - wm = ts.weight_migrator(pt_model=torch_net, pth_para_dict=pd, ckpt_save_path='./convert_resnet.ckpt') - #w_maps = wm.get_weight_map(full_name_map=True, print_map=True) - # test_map = {'bn1.bias': 'bn1.beta',} - - wm.convert(weight_name_prefix="miao") - - #打印ckpt - #pth = "./convert_resnet.ckpt" - #param_dict = mindspore.load_checkpoint(pth) - #for key, value in param_dict.items(): - # print(key, ':', value) \ No newline at end of file diff --git a/troubleshooter/examples/migrator/weight_migrator_demo_5.py b/troubleshooter/examples/migrator/weight_migrator_demo_5.py index c1c5fc1..7ecf8e2 100644 --- a/troubleshooter/examples/migrator/weight_migrator_demo_5.py +++ b/troubleshooter/examples/migrator/weight_migrator_demo_5.py @@ -57,9 +57,13 @@ class Net(nn.Cell): #@场景: CellList -class MyNet(nn.Cell): +# 0.weight : Parameter (name=0.weight, shape=(20, 10), dtype=Float32, requires_grad=True) +# 0.bias : Parameter (name=0.bias, shape=(20,), dtype=Float32, requires_grad=True) +# 1.weight : Parameter (name=1.weight, shape=(2, 20), dtype=Float32, requires_grad=True) +# 1.bias : Parameter (name=1.bias, shape=(2,), dtype=Float32, requires_grad=True) +class MyNet_CellList(nn.Cell): def __init__(self, in_channels, out_channels, hidden_size): - super(MyNet, self).__init__() + super(MyNet_CellList, self).__init__() self.fc_layers = nn.CellList() self.fc_layers.append(nn.Dense(in_channels, hidden_size)) self.fc_layers.append(nn.Dense(hidden_size, out_channels)) @@ -195,7 +199,7 @@ class MyNet(nn.Cell): #net = Net() #@验证2:CellList -#net = MyNet(in_channels=10,out_channels=2, hidden_size=20) +net = MyNet_CellList(in_channels=10,out_channels=2, hidden_size=20) #@验证3:SequentialCell #net = MyNet(in_channels=3, out_channels=10, hidden_size=128) @@ -204,7 +208,7 @@ class MyNet(nn.Cell): #net = MyNet(in_features=10, out_classes=2) #@验证5 SequentialCell+BatchNorm1d+OrderedDict -net = MyNet(in_features=10, out_classes=2) +#net = MyNet(in_features=10, out_classes=2) pth = "./test.ckpt" mindspore.save_checkpoint(net, pth) diff --git a/troubleshooter/tests/diff_handler/test_migrator.py b/troubleshooter/tests/diff_handler/test_migrator.py new file mode 100644 index 0000000..9ed848f --- /dev/null +++ b/troubleshooter/tests/diff_handler/test_migrator.py @@ -0,0 +1,267 @@ +import mindspore +import torch +import torch.nn as nn +import troubleshooter as ts +import torch.optim as optim +from collections import OrderedDict + + +class MyModule(nn.Module): + def __init__(self, in_features, out_classes): + super(MyModule, self).__init__() + + self.features = nn.Sequential( + OrderedDict([ + ('Linear_mm', nn.Linear(in_features, 64)), + ('bn_mm', nn.BatchNorm1d(64)), + ('relu_mm', nn.ReLU()), + ('Linear_mm', nn.Linear(64, out_classes)) + ]) + ) + + def forward(self, x): + x = self.features(x) + return x + + +def test_ordereddict_sequential_case(capsys): + torch_net = MyModule(in_features=10,out_classes=2) + torch.save(torch_net.state_dict(), "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + key_result = 'features.bn_mm.weight | features.bn_mm.gamma' + assert result.count('True') == 4 and result.count(key_result) == 1 + + +def test_save_model_pth_case(capsys): + torch_net = MyModule(in_features=10,out_classes=2) + #save model + torch.save(torch_net, "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + key_result = 'features.bn_mm.weight | features.bn_mm.gamma' + assert result.count('True') == 4 and result.count(key_result) == 1 + + +def test_torch_modulelist_and_loadckpt_case(capsys): + class MyNet_CellList(mindspore.nn.Cell): + def __init__(self, in_channels, out_channels, hidden_size): + super(MyNet_CellList, self).__init__() + self.fc_layers = mindspore.nn.CellList() + self.fc_layers.append(mindspore.nn.Dense(in_channels, hidden_size)) + self.fc_layers.append(mindspore.nn.Dense(hidden_size, out_channels)) + self.relu = mindspore.nn.ReLU() + + def construct(self, x): + for i in range(len(self.fc_layers)): + x = self.fc_layers[i](x) + x = self.relu(x) + + return x + class MyNet(nn.Module): + def __init__(self, in_channels, out_channels, hidden_size): + super(MyNet, self).__init__() + self.fc_layers = nn.ModuleList() + self.fc_layers.append(nn.Linear(in_channels, hidden_size)) + self.fc_layers.append(nn.Linear(hidden_size, out_channels)) + self.relu = nn.ReLU() + + def forward(self, x): + for i in range(len(self.fc_layers)): + x = self.fc_layers[i](x) + x = self.relu(x) + + return x + + torch_net=MyNet(in_channels=10,out_channels=2,hidden_size=20) + ms_net = MyNet_CellList(in_channels=10, out_channels=2, hidden_size=20) + torch.save(torch_net.state_dict(), "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert(print_conv_info=False) + param_dict = mindspore.load_checkpoint("/tmp/convert_resnet.ckpt") + res = mindspore.load_param_into_net(ms_net, param_dict) + ms_param_dict = ms_net.parameters_dict() + assert len(ms_param_dict) == 4 + + +def test_modulelist_sequential_case(capsys): + class MyModule(nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.features = nn.ModuleList([ + nn.Conv2d(1, 32, kernel_size=3), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(32, 64, kernel_size=3), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 128, kernel_size=3), + nn.ReLU() + ]) + + self.classifier = nn.Sequential( + nn.Linear(128 * 10 * 10, 512), + nn.ReLU(), + nn.Linear(512, 10) + ) + + def forward(self, x): + for layer in self.features: + x = layer(x) + x = x.view(-1, 128 * 10 * 10) + x = self.classifier(x) + return x + + torch_net=MyModule() + torch.save(torch_net.state_dict(), "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + key_result = 'features.0.weight | features.0.weight' + assert result.count('False') == 20 and result.count(key_result) == 1 + + +def test_weight_name_prefix_case(capsys): + torch_net = MyModule(in_features=10,out_classes=2) + torch.save(torch_net.state_dict(), "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert(weight_name_prefix="pre_test") + result = capsys.readouterr().out + key_result = 'pre_test.features.Linear_mm.weight' + assert result.count('pre_test') == 7 and result.count(key_result) == 1 + + +def test_save_model_pth_and_input_dict_case(capsys): + torch_net = MyModule(in_features=10,out_classes=2) + #save model + torch.save(torch_net, "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + model = torch.load(pth_path) + # @验证模型场景下提取权重参数 + pd = model.state_dict() + wm = ts.weight_migrator(pt_model=torch_net, pth_para_dict=pd, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + key_result = 'features.bn_mm.weight | features.bn_mm.gamma' + assert result.count('True') == 4 and result.count(key_result) == 1 + + +def test_save_optimizer_case(capsys): + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + x = self.fc(x) + return x + + model = Net() + + # 定义优化器 + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # 训练模型 + inputs = torch.randn(5, 10) + labels = torch.randn(5, 2) + for epoch in range(10): + optimizer.zero_grad() + outputs = model(inputs) + loss = nn.MSELoss()(outputs, labels) + loss.backward() + optimizer.step() + + # 存储优化器参数 + torch.save(optimizer.state_dict(), 'optimizer.pth') + + # 加载优化器参数 + new_optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + opt_para = torch.load('optimizer.pth') + new_optimizer.load_state_dict(opt_para) + pth_path = './optimizer.pth' + try: + wm = ts.weight_migrator(pt_model=model, pth_file_path=pth_path, ckpt_save_path='./convert_resnet.ckpt') + wm.convert() + except ValueError as e: + error_str = str(e) + assert error_str.count('PTH file parsing failed, possible reasons:') == 1 + + +def test_save_model_pth_and_input_dict_case(capsys): + torch_net = MyModule(in_features=10, out_classes=2) + #save model + torch.save(torch_net, "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + model = torch.load(pth_path) + # @验证模型场景下提取权重参数 + pd = model.state_dict() + wm = ts.weight_migrator(pt_model=torch_net, pth_para_dict=pd, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + key_result = 'features.bn_mm.weight | features.bn_mm.gamma' + assert result.count('True') == 4 and result.count(key_result) == 1 + + +def test_save_model_pth_and_input_dict_case(capsys): + def custorm_weight_name(weight_name_map): + prefix = '.custorm.' + custorm_name_map = {} + for key, value in weight_name_map.items(): + index = value.find(".") + value = value[0:index] + prefix + value[index + 1:] + #print(key, ":", value) + custorm_name_map[key] = str(value) + return custorm_name_map + + torch_net = MyModule(in_features=10, out_classes=2) + #save model + torch.save(torch_net, "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + # model = torch.load(pth_path) + # @验证模型场景下提取权重参数 + # pd = model.state_dict() + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + name_map, value_map = wm.get_weight_map(full_name_map=True) + w_map = custorm_weight_name(name_map) + # 将定制好的map传入转换接口 + # weight_map:传入定制后的map,以定制后的map进行权重转换 + wm.convert(weight_name_map=w_map) + result = capsys.readouterr().out + key_result = 'features.bn_mm.weight | features.custorm.bn_mm.gamma' + assert result.count('.custorm.') == 7 and result.count(key_result) == 1 + + +def test_conv1d_value_case(capsys): + class MSNet(mindspore.nn.Cell): + def __init__(self): + super(MSNet, self).__init__() + self.conv1d = mindspore.nn.Conv1d(256, 256, kernel_size=1, has_bias=True) + + def construct(self, A): + return self.conv1d(A) + + class torchNet(torch.nn.Module): + def __init__(self): + super(torchNet, self).__init__() + self.conv1d = torch.nn.Conv1d(256, 256, kernel_size=1) + + def forward(self, A): + return self.conv1d(A) + torch_net = torchNet() + ms_net = MSNet() + #save model + torch.save(torch_net.state_dict(), "/tmp/torch_net.pth") + pth_path = "/tmp/torch_net.pth" + wm = ts.weight_migrator(pt_model=torch_net, pth_file_path=pth_path, ckpt_save_path='/tmp/convert_resnet.ckpt') + wm.convert() + result = capsys.readouterr().out + param_dict = mindspore.load_checkpoint("/tmp/convert_resnet.ckpt") + res = mindspore.load_param_into_net(ms_net, param_dict) + ms_param_dict = ms_net.parameters_dict() + assert len(ms_param_dict) == 2 \ No newline at end of file diff --git a/troubleshooter/troubleshooter/migrator/diff_handler.py b/troubleshooter/troubleshooter/migrator/diff_handler.py index 2312115..a0864f3 100644 --- a/troubleshooter/troubleshooter/migrator/diff_handler.py +++ b/troubleshooter/troubleshooter/migrator/diff_handler.py @@ -245,7 +245,7 @@ class WeightMigrator: if prefix: custorm_name_map = {} for key, value in weight_name_map.items(): - print(key, ":", prefix + '.' + value) + # print(key, ":", prefix + '.' + value) custorm_name_map[key] = str(prefix) + '.' + str(value) return custorm_name_map else: -- Gitee