diff --git a/collector/npu_collector.go b/collector/npu_collector.go
index c0259fec65b3baa1739064c1d8ede1c19f774bd3..749d79fc57f428b6a816deb9b66439fc2a9beadf 100644
--- a/collector/npu_collector.go
+++ b/collector/npu_collector.go
@@ -1262,13 +1262,13 @@ func updateNPUNetworkInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip
 			float64(getHealthCode(chip.NetHealthStatus)), collectCardLabelValue(chip, namespaceValue, podNameValue,
 				containerName)...))
 	}
-
 	updateStatInfoOfMac(ch, npu, chip, cNameArray)
 	updateStatInfoOfRoCE(ch, npu, chip, cNameArray)
 	updateOpticalInfo(ch, npu, chip, cNameArray)
 	updateBandwidthInfo(ch, npu, chip, cNameArray)
 	updateLinkSpeedInfo(ch, npu, chip, devInfo)
 	updateLinkStatInfo(ch, npu, chip, devInfo)
+	updateLinkStatusInfo(ch, npu, chip, devInfo)
 }
 
 func updateLinkSpeedInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWeiAIChip,
@@ -1313,6 +1313,29 @@ func updateLinkStatInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *H
 	}
 }
 
+// updateLinkStatusInfo sends the npuChipInfoDescLinkStatus gauge built from a copy of chip.NetInfo.LinkStatusInfo
+func updateLinkStatusInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWeiAIChip,
+	devInfo container.DevicesInfo) {
+	if chip.NetInfo == nil {
+		hwlog.RunLog.Error("NetInfo is nil in function updateLinkStatusInfo")
+		return
+	}
+	// use deep copy to prevent the pointer structure from being assigned nil by other goroutine
+	linkStatusInfo := common.DeepCopyLinkStatusInfo(chip.NetInfo.LinkStatusInfo)
+	if !validate(ch, npu, chip, linkStatusInfo) {
+		hwlog.RunLog.Warn("Invalid param in function updateLinkStatusInfo")
+		return
+	}
+	cNameArray := getContainerNameArray(devInfo)
+	containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(cNameArray)
+	linkStatusCode := float64(hccn.GetLinkStatusCode(linkStatusInfo.LinkState))
+	if validateNum(linkStatusCode) {
+		ch <- prometheus.NewMetricWithTimestamp(npu.Timestamp, prometheus.MustNewConstMetric(npuChipInfoDescLinkStatus,
+			prometheus.GaugeValue, linkStatusCode,
+			collectCardLabelValue(chip, namespaceValue, podNameValue, containerName)...))
+	}
+}
+
 func updateContainerInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWeiAIChip,
 	devInfo container.DevicesInfo) {
 	// use deep copy to prevent the pointer structure from being assigned nil by other goroutine
@@ -1439,11 +1462,6 @@ func updateNPUCommonInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *
 func updateChipBaseInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWeiAIChip,
 	devInfo container.DevicesInfo) {
 	containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(getContainerNameArray(devInfo))
-	if validateNum(float64(hccn.GetLinkStatusCode(chip.LinkStatus))) {
-		ch <- prometheus.NewMetricWithTimestamp(npu.Timestamp, prometheus.MustNewConstMetric(npuChipInfoDescLinkStatus,
-			prometheus.GaugeValue, float64(hccn.GetLinkStatusCode(chip.LinkStatus)),
-			collectCardLabelValue(chip, namespaceValue, podNameValue, containerName)...))
-	}
 	if validateNum(float64(chip.Utilization)) {
 		ch <- prometheus.NewMetricWithTimestamp(npu.Timestamp, prometheus.MustNewConstMetric(npuChipInfoDescUtil,
 			prometheus.GaugeValue, float64(chip.Utilization), collectCardLabelValue(chip, namespaceValue, podNameValue,
@@ -1544,7 +1562,10 @@ func updateSioInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWei
 
 func updateHccsInfo(ch chan<- prometheus.Metric, npu *HuaWeiNPUCard, chip *HuaWeiAIChip,
 	devInfo container.DevicesInfo) {
-
+	if chip.ChipIfo == nil || chip.BoardInfo == nil {
+		hwlog.RunLog.Warn("Invalid param in function updateHccsInfo")
+		return
+	}
 	devType := common.GetDevType(chip.ChipIfo.Name, chip.BoardInfo.BoardId)
 	if devType != common.Ascend910B && devType != common.Ascend910A3 {
 		return
@@ -1766,7 +1787,6 @@ func packChipInfoPart2(logicID int32, dmgr devmanager.DeviceInterface, hwChip *H
 	setNetHealthStatus(logicID, dmgr, hwChip)
 	setProcessInfo(logicID, dmgr, hwChip)
 	setPCIeBusInfo(logicID, dmgr, hwChip)
-	setLinkStatus(logicID, dmgr, hwChip)
 	hwChip.ErrorCode = errCode
 	hwChip.Utilization = int(util)
 	hwChip.VDieID = vdieID
@@ -1822,24 +1842,6 @@ func setPCIeBusInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaW
 	hwChip.PCIeBusInfo = pcieInfo
 }
 
-func setLinkStatus(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaWeiAIChip) {
-	hwChip.LinkStatus = Abnormal
-	if !dmgr.IsTrainingCard() {
-		return
-	}
-
-	phyID, err := dmgr.GetPhysicIDFromLogicID(logicID)
-	if err != nil {
-		hwlog.RunLog.Error("set link status failed")
-		return
-	}
-	if linkStatus, err := hccn.GetNPULinkStatus(phyID); err != nil {
-		hwChip.LinkStatus = Abnormal
-	} else {
-		hwChip.LinkStatus = linkStatus
-	}
-}
-
 func getMainOptInfo(opticalInfo map[string]string) *common.OpticalInfo {
 	mainOpticalInfo := common.OpticalInfo{}
 	mainOpticalInfo.OpticalTxPower0 = hccn.GetFloatDataFromStr(opticalInfo[txPower0], txPower0)
@@ -1894,6 +1896,15 @@ func getMainStatInfo(statInfo map[string]int) *common.StatInfo {
 
 func networkPackInfo(phyID int32) common.NpuNetInfo {
 	newNetInfo := common.NpuNetInfo{}
+
+	// query the link state once and fall back to Abnormal when the query fails
+	newNetInfo.LinkStatusInfo = &common.LinkStatusInfo{}
+	if linkState, err := hccn.GetNPULinkStatus(phyID); err == nil {
+		newNetInfo.LinkStatusInfo.LinkState = linkState
+	} else {
+		newNetInfo.LinkStatusInfo.LinkState = Abnormal
+	}
+
 	if tx, rx, err := hccn.GetNPUInterfaceTraffic(phyID); err == nil {
 		newNetInfo.BandwidthInfo = &common.BandwidthInfo{}
 		newNetInfo.BandwidthInfo.RxValue = rx
diff --git a/devmanager/common/types.go b/devmanager/common/types.go
index b59d904f27c21d88b5154420703a26c66256a059..802d09d7d6372b72c47034394717fcefcb2e4963 100644
--- a/devmanager/common/types.go
+++ b/devmanager/common/types.go
@@ -261,6 +261,8 @@ type NpuNetInfo struct {
 	StatInfo *StatInfo
 	// Network port real-time bandwidth
 	BandwidthInfo *BandwidthInfo
+	// LinkStatusInfo refers to the link state
+	LinkStatusInfo *LinkStatusInfo
 }
 
 // BandwidthInfo contains network port real-time bandwidth
@@ -339,6 +341,12 @@ type LinkStatInfo struct {
 	LinkUPNum float64
 }
 
+// LinkStatusInfo refers to the link state
+type LinkStatusInfo struct {
+	// The state of link
+	LinkState string
+}
+
 // LinkSpeedInfo the transfer rate of network port
 type LinkSpeedInfo struct {
 	// The rate of network port
diff --git a/devmanager/common/utils.go b/devmanager/common/utils.go
index 90118197bb3815c0d61fc632e373c966b8658f70..a4316e45b29b3bd67cd1e05acc87f63e9056ba4f 100644
--- a/devmanager/common/utils.go
+++ b/devmanager/common/utils.go
@@ -344,6 +344,17 @@ func DeepCopyLinkStatInfo(linkStatInfo *LinkStatInfo) *LinkStatInfo {
 	}
 }
 
+// DeepCopyLinkStatusInfo copy LinkStatusInfo deeply
+func DeepCopyLinkStatusInfo(linkStatusInfo *LinkStatusInfo) *LinkStatusInfo {
+	if linkStatusInfo == nil {
+		return nil
+	}
+
+	return &LinkStatusInfo{
+		LinkState: linkStatusInfo.LinkState,
+	}
+}
+
 // DeepCopyBandwidthInfo copy BandwidthInfo deeply
 func DeepCopyBandwidthInfo(bandwidthInfo *BandwidthInfo) *BandwidthInfo {
 	if bandwidthInfo == nil {