Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions plugins/ptp_operator/metrics/logparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ func extractRegularMetrics(processName, output string) (interfaceName string, pt
// If there is no delay from master this mean we are out of sync
clockState = ptp.HOLDOVER
log.Warningf("no delay from master process %s out of sync", processName)
// DEBUG: Log full offset line that triggered HOLDOVER via missing delay
log.Infof("DEBUG_HOLDOVER: extractRegularMetrics set HOLDOVER due to missing delay - process=%s, iface=%s, offset=%.0f, fields=%d",
processName, interfaceName, ptpOffset, len(fields))
}
return
}
Expand Down
8 changes: 8 additions & 0 deletions plugins/ptp_operator/metrics/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ func (p *PTPEventManager) GenPTPEvent(ptpProfileName string, oStats *stats.Stats
}
case ptp.HOLDOVER:
oStats.SetLastSyncState(clockState)
// DEBUG: Log GenPTPEvent HOLDOVER transition
log.Infof("DEBUG_HOLDOVER: GenPTPEvent HOLDOVER - profile=%s, resource=%s, lastState=%s, offset=%d, eventType=%s",
ptpProfileName, eventResourceName, lastClockState, ptpOffset, eventType)
if lastClockState != ptp.HOLDOVER { //send event only once
log.Infof(" publishing event for (profile %s) %s with last state %s and current clock state %s and offset %d )",
ptpProfileName, eventResourceName, lastClockState, clockState, ptpOffset)
Expand All @@ -465,6 +468,11 @@ func (p *PTPEventManager) GenPTPEvent(ptpProfileName string, oStats *stats.Stats
return
case ptp.FREERUN:
oStats.SetLastSyncState(clockState)
// DEBUG: Log GenPTPEvent FREERUN transition, especially when coming from HOLDOVER
if lastClockState == ptp.HOLDOVER {
log.Infof("DEBUG_HOLDOVER: GenPTPEvent FREERUN from HOLDOVER - profile=%s, resource=%s, offset=%d (event suppressed, waiting for holdover goroutine)",
ptpProfileName, eventResourceName, ptpOffset)
}
if lastClockState != ptp.HOLDOVER {
if lastClockState != ptp.FREERUN { // don't send event if last event was freerun
log.Infof("publishing event for (profile %s) %s with last state %s and current clock state %s and offset %d for ( Max/Min Threshold %d/%d )",
Expand Down
5 changes: 5 additions & 0 deletions plugins/ptp_operator/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ func (p *PTPEventManager) ExtractMetrics(msg string) {
return
} else if processName != syncE4lProcessName &&
!p.validLogToProcess(profileName, processName, len(ptp4lCfg.Interfaces)) {
// DEBUG: Log when port events are dropped due to validation
if strings.Contains(output, " port ") {
log.Infof("DEBUG_HOLDOVER: Port event DROPPED by validLogToProcess - profile=%s, process=%s, ifaceCount=%d, config=%s, output=%s",
profileName, processName, len(ptp4lCfg.Interfaces), configName, output)
}
log.Infof("%s skipped parsing %s output %s\n", processName, ptp4lCfg.Name, output)
return
}
Expand Down
91 changes: 85 additions & 6 deletions plugins/ptp_operator/metrics/ptp4lParse.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
var gmStatusIdentifier = "T-GM-STATUS"
var bcStatusIdentifier = "T-BC-STATUS"

// clockClassFreeRunning is the default clock class indicating the clock is free-running
// (not locked to any reference). Per IEEE 1588, class 248 = default, 255 = slave-only.
const clockClassFreeRunning float64 = 248

// ParsePTP4l ... parse ptp4l for various events
func (p *PTPEventManager) ParsePTP4l(processName, configName, profileName, output string, fields []string,
ptpInterface ptp4lconf.PTPInterface, ptp4lCfg *ptp4lconf.PTP4lConfig, ptpStats stats.PTPStats) {
Expand All @@ -45,19 +49,53 @@
}
masterResource := fmt.Sprintf("%s/%s", alias, MasterClockType)

previousClockClass := ptpStats[master].ClockClass()
ptpStats[master].SetClockClass(int64(clockClass))
ClockClassMetrics.With(prometheus.Labels{
"process": processName, "config": configName, "node": ptpNodeName}).Set(clockClass)
p.PublishClockClassEvent(clockClass, masterResource, ptp.PtpClockClassChange)

// Safety net for regular OC/BC only (not T-BC or T-GM which have their own state
// management via T-BC-STATUS and T-GM-STATUS logs): if clock class degrades to
// free-running (248+) and the sync state is stuck at HOLDOVER (e.g. because the
// port event was missed or the holdover goroutine was never spawned), force the
// transition to FREERUN.
// ProfileType != NONE excludes T-BC; HasProcessEnabled("GM") excludes T-GM
// (whose ProfileType is also NONE but has the GM process active).
if ptp4lCfg.ProfileType == ptp4lconf.NONE &&
!ptpStats[master].HasProcessEnabled(gmProcessName) &&
clockClass >= clockClassFreeRunning && previousClockClass < int64(clockClassFreeRunning) {
currentSyncState := ptpStats[master].LastSyncState()
if currentSyncState == ptp.HOLDOVER || currentSyncState == ptp.LOCKED {
log.Infof("clock class changed from %d to %.0f (free-running), current sync state is %s, forcing FREERUN for %s",
previousClockClass, clockClass, currentSyncState, masterResource)
ptpStats[master].SetLastSyncState(ptp.FREERUN)
UpdateSyncStateMetrics(ptp4lProcessName, alias, ptp.FREERUN)
p.PublishEvent(ptp.FREERUN, FreeRunOffsetValue, masterResource, ptp.PtpStateChange)
// Also update CLOCK_REALTIME if phc2sys stats exist
if _, ok := ptpStats[ClockRealTime]; ok {
ptpStats[ClockRealTime].SetLastSyncState(ptp.FREERUN)
UpdateSyncStateMetrics(phc2sysProcessName, ClockRealTime, ptp.FREERUN)
p.PublishEvent(ptp.FREERUN, FreeRunOffsetValue, ClockRealTime, ptp.OsClockSyncStateChange)
}
}
}
}
} else if strings.Contains(output, " port ") && processName == ptp4lProcessName { // ignore anything reported by other process
log.Infof("output: %s", output)
portID, role, syncState := extractPTP4lEventState(output)
if portID == 0 || role == types.UNKNOWN {
// DEBUG: Log ignored port events (like Announce timeout)
log.Infof("DEBUG_HOLDOVER: Port event ignored (portID=%d, role=UNKNOWN) - output: %s", portID, output)
return
}
// DEBUG: Log parsed port event
log.Infof("DEBUG_HOLDOVER: Port event parsed - portID=%d, role=%d, syncState=%s, config=%s, profile=%s",
portID, role, syncState, configName, ptp4lCfg.Profile)

if ptpInterface, err = ptp4lCfg.ByPortID(portID); err != nil {
log.Error(err)
log.Errorf("DEBUG_HOLDOVER: ByPortID(%d) FAILED for config=%s, profile=%s, numInterfaces=%d - %v",
portID, configName, ptp4lCfg.Profile, len(ptp4lCfg.Interfaces), err)
log.Errorf("possible error due to file watcher not updated")
return
}
Expand All @@ -71,7 +109,7 @@
ptpStats.CheckSource(master, configName, masterOffsetSource)
ptpStats[master].SetRole(role)
}

log.Infof("DEBUG_HOLDOVER: lastRole=%d, newRole=%d, iface=%s, masterOffsetSource=%s", lastRole, role, ptpIFace, masterOffsetSource)
if lastRole != role {
/*
If role changes from FAULTY to SLAVE/PASSIVE/MASTER , then cancel HOLDOVER timer
Expand Down Expand Up @@ -100,6 +138,7 @@
UpdateInterfaceRoleMetrics(processName, ptpIFace, role)
}
if ptp4lCfg.ProfileType == ptp4lconf.TBC {
log.Infof("TBC: ptp4lCfg.ProfileType: %s", ptp4lCfg.ProfileType)
// For TBC: set ptp4l clock_state to FREERUN when port goes FAULTY
if role == types.FAULTY && lastRole == types.SLAVE {
if _, ok := ptpStats[master]; ok {
Expand All @@ -111,20 +150,26 @@
}
}
}
log.Infof("DEBUG_HOLDOVER: Returning early - profileType is TBC for config=%s", configName)
return // no need to go to holdover state for TBC
}
if lastRole != types.SLAVE { //tsphc doesn't have slave port and doesnt have fault state yet
log.Infof("DEBUG_HOLDOVER: Returning early - lastRole=%d is not SLAVE for iface=%s, config=%s", lastRole, ptpIFace, configName)
return // no need to go to holdover state if the Fault was not in master(slave) port
}

if _, ok := ptpStats[master]; !ok { //
log.Errorf("DEBUG_HOLDOVER: Returning early - no ptpStats[master] for portID=%d, role=%d, config=%s", portID, role, configName)
log.Errorf("no offset stats found for master for portid %d with role %s (the port started in fault state)", portID, role)
return
}
// Enter the HOLDOVER state: If current sycState is HOLDOVER(Role is at FAULTY) ,then spawn a go routine to hold the state until
// holdover timeout, always put only master offset from ptp4l to HOLDOVER,when this goes to FREERUN
// make any slave interface master offset to FREERUN
// Only if master (slave port ) offset was reported by ptp4l
// DEBUG: Log the holdover entry condition check
log.Infof("DEBUG_HOLDOVER: Port event holdover check - syncState=%s, LastSyncState=%s, condition(syncState!=LastSyncState)=%v, role=%d, lastRole=%d",
syncState, ptpStats[master].LastSyncState(), syncState != ptpStats[master].LastSyncState(), role, lastRole)
if syncState != "" && syncState != ptpStats[master].LastSyncState() && syncState == ptp.HOLDOVER {
// Put master in HOLDOVER state
ptpStats[master].SetRole(types.FAULTY) // update slave port as faulty
Expand All @@ -135,12 +180,18 @@
ptpStats[master].SetLastSyncState(syncState)
p.PublishEvent(syncState, ptpStats[master].LastOffset(), masterResource, ptp.PtpStateChange)
UpdateSyncStateMetrics(ptpStats[master].ProcessName(), alias, syncState)
log.Infof("DEBUG_HOLDOVER: PublishEvent syncState: %s, lastOffset: %d, masterResource: %s", syncState, ptpStats[master].LastOffset(), masterResource)
if ptpOpts, ok := p.PtpConfigMapUpdates.PtpProcessOpts[profileName]; ok && ptpOpts != nil {

Check failure on line 184 in plugins/ptp_operator/metrics/ptp4lParse.go

View workflow job for this annotation

GitHub Actions / Linting

unnecessary leading newline (whitespace)

p.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, profileName)
threshold := p.PtpThreshold(profileName, true)
log.Infof("DEBUG_HOLDOVER: threshhold: %+v, ptpOpts: %+v, ok %v", threshold, ptpOpts, ok)

Check failure on line 188 in plugins/ptp_operator/metrics/ptp4lParse.go

View workflow job for this annotation

GitHub Actions / Linting

`threshhold` is a misspelling of `threshold` (misspell)
if p.mock {
log.Infof("mock holdover is set to %s", ptpStats[MasterClockType].Alias())
} else {
// DEBUG: Log holdover goroutine spawn
log.Infof("DEBUG_HOLDOVER: Spawning holdover goroutine for config=%s, profile=%s, iface=%s, timeout=%d",
configName, profileName, ptpStats[MasterClockType].Alias(), threshold.HoldOverTimeout)
go handleHoldOverState(p, ptpOpts, configName, profileName, threshold.HoldOverTimeout, ptpStats[MasterClockType].Alias(), threshold.Close)
}
}
Expand All @@ -159,30 +210,58 @@
fmt.Println("Recovered in f", r)
}
}()
// DEBUG: Log goroutine start
log.Infof("DEBUG_HOLDOVER: Holdover goroutine started for config=%s, profile=%s, iface=%s, timeout=%d",
configName, ptpProfileName, ptpIFace, holdoverTimeout)
select {
case <-c:
log.Infof("call received to close holderover timeout")
log.Infof("DEBUG_HOLDOVER: Holdover cancelled via channel for config=%s, profile=%s, iface=%s",
configName, ptpProfileName, ptpIFace)
return
case <-time.After(time.Duration(holdoverTimeout) * time.Second):
log.Infof("holdover time expired for interface %s", ptpIFace)
log.Infof("DEBUG_HOLDOVER: Holdover timeout expired for config=%s, profile=%s, iface=%s",
configName, ptpProfileName, ptpIFace)
ptpStats := ptpManager.GetStats(types.ConfigName(configName))
// DEBUG: Log what we found
if ptpStats == nil {
log.Errorf("DEBUG_HOLDOVER: GetStats returned nil for config=%s", configName)
return
}
log.Infof("DEBUG_HOLDOVER: Stats found for config=%s, checking master entry", configName)
if mStats, found := ptpStats[master]; found {
if mStats.LastSyncState() == ptp.HOLDOVER { // if it was still in holdover while timing out then switch to FREERUN
currentState := mStats.LastSyncState()
log.Infof("DEBUG_HOLDOVER: Master stats found - alias=%s, process=%s, currentState=%s",
mStats.Alias(), mStats.ProcessName(), currentState)
if currentState == ptp.HOLDOVER { // if it was still in holdover while timing out then switch to FREERUN
log.Infof("HOLDOVER timeout after %d secs,setting clock state to FREERUN from HOLDOVER state for %s",
holdoverTimeout, master)
masterResource := fmt.Sprintf("%s/%s", mStats.Alias(), MasterClockType)
ptpStats[MasterClockType].SetLastSyncState(ptp.FREERUN)
ptpManager.PublishEvent(ptp.FREERUN, ptpStats[MasterClockType].LastOffset(), masterResource, ptp.PtpStateChange)
log.Infof("DEBUG_HOLDOVER: About to update SyncState metric - process=%s, iface=%s, state=FREERUN",
mStats.ProcessName(), mStats.Alias())
UpdateSyncStateMetrics(mStats.ProcessName(), mStats.Alias(), ptp.FREERUN)
// don't check of os clock sync state if phc2 not enabled
ptpManager.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, ptpProfileName)
} else {
log.Infof("DEBUG_HOLDOVER: Master state is %s (not HOLDOVER), no action needed", currentState)
}
} else {
log.Errorf("failed to switch from holdover, could not find ptpStats for interface %s", ptpIFace)
log.Errorf("DEBUG_HOLDOVER: failed to switch from holdover, could not find ptpStats[master] for config=%s, iface=%s. Available keys: %v",
configName, ptpIFace, getStatsKeys(ptpStats))
}
}
}

// DEBUG helper function
func getStatsKeys(ptpStats stats.PTPStats) []string {
keys := make([]string, 0, len(ptpStats))
for k := range ptpStats {
keys = append(keys, string(k))
}
return keys
}

func (p *PTPEventManager) maybePublishOSClockSyncStateChangeEvent(
ptpOpts *ptpConfig.PtpProcessOpts, configName, ptpProfileName string) {
if ptpOpts == nil {
Expand Down
6 changes: 6 additions & 0 deletions plugins/ptp_operator/metrics/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ func UpdatePTPOffsetMetrics(metricsType, process, eventResourceName string, offs

// DeletedPTPMetrics ... update metrics for deleted ptp config
func DeletedPTPMetrics(clockType, processName, eventResourceName string) {
// DEBUG: Log metric deletion with full labels
log.Infof("DEBUG_SYNCSTATE: Deleting SyncState metric - process=%s, node=%s, iface=%s, clockType=%s",
processName, ptpNodeName, eventResourceName, clockType)
PtpOffset.Delete(prometheus.Labels{"from": clockType,
"process": processName, "node": ptpNodeName, "iface": eventResourceName})
PtpMaxOffset.Delete(prometheus.Labels{"from": clockType,
Expand Down Expand Up @@ -230,6 +233,9 @@ func UpdateSyncStateMetrics(process, iface string, state ptp.SyncState) {
log.Errorf("wrong metrics are processed, ignoring interface %s with process %s", iface, process)
return
}
// DEBUG: Log all SyncState metric updates with full labels
log.Infof("DEBUG_SYNCSTATE: Updating SyncState metric - process=%s, node=%s, iface=%s, state=%s, value=%.0f",
process, ptpNodeName, iface, state, clockState)
SyncState.With(prometheus.Labels{
"process": process, "node": ptpNodeName, "iface": iface}).Set(clockState)
}
Expand Down
49 changes: 35 additions & 14 deletions plugins/ptp_operator/ptp_operator_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
ptpConfigDir = "/var/run/"
phc2sysProcessName = "phc2sys"
ptp4lProcessName = "ptp4l"
pmcProcessName = "pmc"
ts2PhcProcessName = "ts2phc"
chronydProcessName = "chronyd"
gnssProcessName = "gnss"
Expand Down Expand Up @@ -395,6 +396,13 @@ func processPtp4lConfigFileUpdates() {
if eventManager.Ptp4lConfigInterfaces[ptpConfigFileName] != nil &&
HasEqualInterface(newInterfaces, eventManager.Ptp4lConfigInterfaces[ptpConfigFileName].Interfaces) {
log.Infof("skipped update,interface not changed in ptl4lconfig")
// DEBUG: Log current interface roles before re-adding config
for _, iface := range eventManager.Ptp4lConfigInterfaces[ptpConfigFileName].Interfaces {
if iface != nil {
log.Infof("DEBUG_HOLDOVER: Skipped update - preserving iface=%s, portID=%d, role=%d for config=%s",
iface.Name, iface.PortID, iface.Role, ptpConfigFileName)
}
}
// add to eventManager
eventManager.AddPTPConfig(ptpConfigFileName, ptp4lConfig)
continue
Expand Down Expand Up @@ -469,6 +477,9 @@ func processPtp4lConfigFileUpdates() {
Role: role,
}
ptpInterfaces = append(ptpInterfaces, ptpIFace)
// DEBUG: Log initial role assignment from file watcher
log.Infof("DEBUG_HOLDOVER: FileWatcher built iface=%s, portID=%d, initialRole=%d for config=%s",
*ptpInterface, index+1, role, ptpConfigFileName)

// Update interface role metrics
ptpMetrics.UpdateInterfaceRoleMetrics(ptp4lProcessName, *ptpInterface, role)
Expand All @@ -484,6 +495,8 @@ func processPtp4lConfigFileUpdates() {
// if profile is not set then set it to default profile
ptp4lConfig.ProfileType = eventManager.GetProfileType(ptp4lConfig.Profile)
// add to eventManager
log.Infof("DEBUG_HOLDOVER: Adding ptp4l config %s with profile %s and %d interfaces",
ptpConfigFileName, *ptpConfigEvent.Profile, len(ptpInterfaces))
eventManager.AddPTPConfig(ptpConfigFileName, ptp4lConfig)
// clean up process metrics
for cName, opts := range eventManager.PtpConfigMapUpdates.PtpProcessOpts {
Expand Down Expand Up @@ -566,6 +579,8 @@ func processPtp4lConfigFileUpdates() {
}
if t, ok2 := eventManager.PtpConfigMapUpdates.EventThreshold[ptpConfig.Profile]; ok2 {
// Make sure that the function does close the channel
log.Infof("DEBUG_HOLDOVER: Closing holdover channel for profile=%s, config=%s",
ptpConfig.Profile, ptpConfigFileName)
t.SafeClose()
}
ptpMetrics.DeleteThresholdMetrics(ptpConfig.Profile)
Expand All @@ -585,27 +600,33 @@ func processPtp4lConfigFileUpdates() {
} else {
ptpMetrics.DeletedPTPMetrics(s.OffsetSource(), ts2PhcProcessName, s.Alias())
for _, p := range ptpStats {
if p.PtpDependentEventState() != nil {
if p.HasProcessEnabled(gnssProcessName) {
if ptpMetrics.NmeaStatus != nil {
ptpMetrics.NmeaStatus.Delete(prometheus.Labels{
"process": ts2PhcProcessName, "node": eventManager.NodeName(), "iface": p.Alias()})
}
masterResource := fmt.Sprintf("%s/%s", p.Alias(), MasterClockType)
eventManager.PublishEvent(ptp.FREERUN, ptpMetrics.FreeRunOffsetValue, masterResource, ptp.GnssStateChange)
}
}
if ptpMetrics.ClockClassMetrics != nil {
ptpMetrics.ClockClassMetrics.Delete(prometheus.Labels{
"process": ptp4lProcessName, "config": string(ptpConfigFileName), "node": eventManager.NodeName()})
}
ptpMetrics.DeletedPTPMetrics(MasterClockType, ts2PhcProcessName, p.Alias())
p.DeleteAllMetrics([]*prometheus.GaugeVec{ptpMetrics.PtpOffset, ptpMetrics.SyncState})
}
}
for _, p := range ptpStats {
if p.PtpDependentEventState() != nil {
if p.HasProcessEnabled(gnssProcessName) {
if ptpMetrics.NmeaStatus != nil {
ptpMetrics.NmeaStatus.Delete(prometheus.Labels{
"process": ts2PhcProcessName, "node": eventManager.NodeName(), "iface": p.Alias()})
}
masterResource := fmt.Sprintf("%s/%s", p.Alias(), MasterClockType)
eventManager.PublishEvent(ptp.FREERUN, ptpMetrics.FreeRunOffsetValue, masterResource, ptp.GnssStateChange)
}
}
if ptpMetrics.ClockClassMetrics != nil {
ptpMetrics.ClockClassMetrics.Delete(prometheus.Labels{
"process": ptp4lProcessName, "config": string(ptpConfigFileName), "node": eventManager.NodeName()})
ptpMetrics.ClockClassMetrics.Delete(prometheus.Labels{
"process": pmcProcessName, "config": string(ptpConfigFileName), "node": eventManager.NodeName()})
}
}

masterResource := fmt.Sprintf("%s/%s", s.Alias(), MasterClockType)
eventManager.PublishEvent(ptp.FREERUN, ptpMetrics.FreeRunOffsetValue, masterResource, ptp.PtpStateChange)
}
log.Infof("DEBUG_HOLDOVER: Deleting stats and config for %s", ptpConfigFileName)
eventManager.DeleteStatsConfig(ptpConfigFileName)
eventManager.DeletePTPConfig(ptpConfigFileName)
// clean up process metrics
Expand Down
Loading
Loading