diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..916c5a78 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +scratch/ +.venv +venv diff --git a/.pymarkdown.yml b/.pymarkdown.yml new file mode 100644 index 00000000..757a8671 --- /dev/null +++ b/.pymarkdown.yml @@ -0,0 +1,92 @@ +system: + exclude_path: scratch/ +extensions: + markdown-tables: + enabled: true + markdown-strikethrough: + enabled: true + markdown-task-list-items: + enabled: true +plugins: + # Inconsistent Unordered List Start style + # md004: + ul-style: + enabled: true + style: consistent + + # Inconsistent indentation for list items at the same level + # md005: + list-indent: + enabled: true + + # Unordered list indentation + # md007: + ul-indent: + enabled: true + + # Multiple consecutive blank lines + # md012: + no-multiple-blanks: + enabled: false + + # Line length + # md013: + line-length: + enabled: false + line_length: 100 + code_blocks: false + + # Dollar signs used before commands without showing output + # md014: + commands-show-output: + enabled: false + + # Headings should be surrounded by blank lines + # md022: + blanks-around-headings: + enabled: false + + # Multiple headings cannot contain the same content + # md024: + no-duplicate-heading: + enabled: false + + # Ordered list item prefix + # md029: + ol-prefix: + enabled: false + + # Fenced code blocks should be surrounded by blank lines + # md031: + blanks-around-fences: + enabled: false + + # List blocks should be surrounded by blank lines + # md032: + blanks-around-lists: + enabled: false + + # Inline HTML + # md033: + no-inline-html: + enabled: false + + # Bare URL used + # md034: + no-bare-urls: + enabled: false + + # Emphasis possibly used instead of a heading element + # md036: + no-emphasis-as-heading: + enabled: false + + # First line in file should be a top-level heading + # md041: + first-line-heading: + enabled: false + + # Each file should end with a single newline character + # md047: + 
single-trailing-newline: + enabled: false diff --git a/.pyspelling.yml b/.pyspelling.yml new file mode 100644 index 00000000..37843828 --- /dev/null +++ b/.pyspelling.yml @@ -0,0 +1,20 @@ +matrix: + - name: Markdown + aspell: + lang: en + # ignore-case: true + dictionary: + encoding: utf-8 + wordlists: + - .wordlist-md + output: scratch/dictionary.dic + pipeline: + - pyspelling.filters.markdown: + - pyspelling.filters.html: + comments: false + ignores: + - code + - pre + sources: + - '!**/INFO.md|!**/TODO.md|!venv/**|!scratch/**|**/*.md' + default_encoding: utf-8 diff --git a/.wordlist-md b/.wordlist-md new file mode 100644 index 00000000..624193fd --- /dev/null +++ b/.wordlist-md @@ -0,0 +1,904 @@ +AccountNumber +ack +ACLs +acm +ACM +ACMMetricsCollectorFederationError +ACMMetricsCollectorForwardRemoteWriteError +ACMRemoteWriteError +ACMThanosCompactHalted +ACMUWLMetricsCollectorFederationError +ACMUWLMetricsCollectorForwardRemoteWriteError +activeMetadataServers +addon +Addon +addon's +Addons +ae +alertmanager +Alertmanager +ALERTMANAGER +AlertmanagerClusterFailedToSendAlerts +AlertmanagerConfig +AlertmanagerFailedReload +AlertmanagerFailedToSendAlerts +alertname +allocatable +api +apirequestcounts +APIs +apiserver +apiservers +apiVersion +AppArmor +ARGS +AuditLogError +autoscaler +autoscaling +awk +aws +az +backend +Backend +backfill +backfills +backingstore +backingstores +backplane +Backplane +balancers +bb +bootable +bootOrder +brq +BugZilla +busybox +cae +cali +cassandra +CatalogSourcesUnhealthy +CatalogSourcesUnHealthy +ccc +CCS +cd +CDI +CDIDataImportCronOutdated +CDIDataVolumeUnusualRestartCount +CDIDefaultStorageClassDegraded +CDIMultipleDefaultVirtStorageClasses +CDINoDefaultStorageClass +CDINotReady +CDIOperatorDown +CDI's +CDIStorageProfilesIncomplete +ceph +Ceph +CEPH +CephBlockPool +cephcluster +CephClusterCriticallyFull +CephClusterErrorState +CephClusterNearFull +CephClusterReadOnly +CephClusterWarningState +CephDataRecoveryTakingTooLong 
+cephFilesystems +cephfs +CephFS +CephFSStaleSubvolume +CephMdsMissingReplicas +CephMgrIsAbsent +CephMgrIsMissingReplicas +CephMonHighNumberOfLeaderChanges +CephMonLowNumber +CephMonQuorumAtRisk +CephMonQuorumLost +CephMonVersionMismatch +CephNodeDown +CephOSDCriticallyFull +CephOSDDiskNotResponding +CephOSDDiskUnavailable +CephOSDDown +CephOSDFlapping +CephOSDNearFull +CephOSDSlowOps +CephOSDVersionMismatch +CephPGRepairTakingTooLong +CephPoolQuotaBytesCriticallyExhausted +CephPoolQuotaBytesNearExhaustion +CephXattrSetLatency +certfile +cgroups +chmod +chronyd +ci +CIDR +claimRef +CLF +CLF's +cli +Cli +CLI +CLO +ClusterLogForwarder +ClusterLogForwarderOutputErrorRate +ClusterMonitoringOperatorDeprecatedConfig +ClusterObjectStoreState +ClusterOperatorDegraded +ClusterOperatorDown +ClusterRole +ClusterRoleBinding +ClusterVersionOperatorDown +CMO +CNAO +CnaoDown +CnaoNmstateMigration +CNO +CollectorHigh +commandline +compactions +comparator +config +Config +configmap +configMap +ConfigMap +configmaps +ConfigMaps +configs +connectedClient +containerd +containerDisk +CONTROLLERPOD +CoreDNS +COREDNS +CoreDNSErrorsHigh +cpu +CPUs +CPUS +CrashLoopBackOff +crc +CRC +CRD +CRDs +CreateContainerError +CRI +crictl +crio +crl +cron +CRs +crt +csi +CSI +csvs +CVO +cvzf +DAEMONPOD +daemonset +Daemonset +DaemonSet +daemonsets +DataImportCron +DataImportCronTemplate +DataImportCronTemplates +datapath +datapoints +datastore +datavolume +DataVolume +DataVolumes +dataVolumeTemplates +dcb +dce +ddc +DDoS +decrypting +defrag +Defrag +defragment +defragmentation +DeprecatedMachineType +descheduler +Descheduler +DeschedulerPSIDisabled +dev +devEnableEvictionsInBackground +DevKubeVirtRelieveAndMigrate +df +diag +DIC +DICTs +dir +disableAlerts +dns +DNS +DNSErrors +dnses +DNSNxDomain +dockerd +DoS +dropdown +dstat +dumpxml +DuplicateWaspAgentDSDetected +dv +eaa +eb +eBPF +ee +eg +EgressFirewall +EgressIP +EgressService +elasticsearch +eno +env +EOF +ErrImagePull +etcd +Etcd +ETCD 
+etcdDatabaseQuotaLowSpace +etcdGRPCRequestsSlow +etcdHighFsyncDurations +etcdHighNumberOfFailedGRPCRequests +etcdInsufficientMembers +etcdMembersDown +etcdNoLeader +etcdSignerCAExpirationCritical +eth +ethernet +ethtool +exfiltrated +exfiltration +ExternalEgressHighTrend +ExternalIngressHighTrend +ExtremelyHighIndividualControlPlaneCPU +ExtremelyHighIndividualControlPlaneMemory +failover +FedRAMP +fi +filesystem +finalizers +fio +fisk +FlowCollector +flowlogs +fluentd +fmt +ForbiddenResponseRate +fs +fsync +GarbageCollectorSyncFailed +GbE +ge +Gi +gluster +gmail +Grafana +graphviz +GRO +gRPC +GuestFilesystemAlmostOutOfSpace +GuestVCPUQueueHighCritical +GuestVCPUQueueHighWarning +gz +gzip +gzipped +HAControlPlaneDown +hadoop +HAProxy +hardcoded +HCO +HCOGoldenImageWithNoArchitectureAnnotation +HCOGoldenImageWithNoSupportedArchitecture +HCOInstallationIncomplete +HCOMisconfiguredDescheduler +HCOMultiArchGoldenImagesDisabled +HCOOperatorConditionsUnhealthy +HDDs +HighCPUWorkload +HighlyAvailableWorkloadIncorrectlySpread +HighNodeCPUFrequency +HighOverallControlPlaneMemory +HighRBDCloneSnapshotCount +HorizontalScaling +hostpath +hostPrefix +HPA +HPP +HPPNotReady +HPPOperatorDown +HPPSharingPoolPathWithOS +html +http +https +HTTPS +hyperconverged +HyperConverged +ICMP +IfNotPresent +imageName +ImagePullBackOff +imagePullSecrets +ImageRegistryStorageFull +ImageRegistryStorageReadOnly +ImageStream +ImageStreamImportFailed +ImageStreams +ImageStreamTags +ImportSuccess +ing +IngressHTTPLatencyTrend +inode +inodes +InstallPlanFailed +InstallPlanMissing +InstallPlanPending +integrations +involvedObject +io +IOPS +IOs +iostat +IOV +ip +IPAM +IPsec +IPsecErrors +ipv +IPv +istio +Istio +Jira +journalctl +json +JSON +jsonpath +jx +kafka +KCS +KEDA +kfree +kms +KMS +KMSServerConnectionAlert +Knowledgebase +krbd +kube +Kube +KubeAggregatedAPIErrors +KubeAPIDown +KubeAPIErrorBudgetBurn +KubeControllerManager +KubeControllerManagerDown +kubectl +KubeDeploymentReplicasMismatch 
+KubeJobFailed +kubelet +Kubelet +KubeletConfig +KubeletDown +KubeletHealthState +kubelets +Kubelets +KubemacpoolDown +KubeMacPoolDuplicateMacsFound +KubeNodeNotReady +KubePersistentVolumeFillingUp +KubePersistentVolumeInodesFillingUp +KubePodNotReady +kubernetes +Kubernetes +KubeSchedulerDown +kubevirt +KubeVirt +KubeVirtComponentExceedsRequestedCPU +KubeVirtComponentExceedsRequestedMemory +KubeVirtCRModified +KubeVirtDeprecatedAPIRequested +KubeVirtNoAvailableNodesToRunVMs +KubeVirtRelieveAndMigrate +KubevirtVmHighMemoryUsage +KubeVirtVMIExcessiveMigrations +KVM +LACP +lastTimestamp +LatencyHighTrend +le +li +libvirt +lifecycle +linter +liveness +LoadBalancer +localdomain +localhost +logLevel +Lokistack +LongLifecycle +lookups +LowKVMNodesCount +LowReadyVirtControllersCount +LowReadyVirtOperatorsCount +LowVirtAPICount +LowVirtControllersCount +LowVirtOperatorCount +lsblk +LSO +lsof +MachineConfig +MachineConfigControllerPausedPoolKubeletCA +MachineConfigControllerPoolAlert +MachineConfigPool +MachineConfigPool's +managedResources +mapOptions +matcher +matchLabels +MCC +MCCDrainError +MCD +MCDPivotError +MCDRebootError +MCO +MCP +MCs +md +mds +MDS +MDSCacheUsageHigh +MDSCpuUsageHigh +MDSCpuUsageHighNeedsHorizontalScaling +MDSCpuUsageHighNeedsVerticalScaling +MDSs +mdstat +MemberList +metricsServer +microservice +microservices +migratable +minio +mirrorpod +misconfiguration +misconfigurations +misconfigured +Misconfigured +mitigations +mkdir +mon +mons +MONs +msg +mTLS +mtr +mtu +MTU +MTUs +multicluster +MultiClusterObservability +MYPOD +NAMESERVER +nameservers +namespace +Namespace +NAMESPACE +namespaces +namespacestore +nbdb +Netcat +netmask +NetObservLokiError +NetObservNoFlows +NetpolDenied +netstat +NetworkAddonsConfigNotReady +NetworkPolicies +NetworkPolicy +NFS +NFSv +NIC +NICs +NMState +nocrl +NodeClockNotSynchronizing +NodeFileDescriptorLimit +NodeFilesystemAlmostOutOfFiles +NodeFilesystemAlmostOutOfSpace +NodeFilesystemFilesFillingUp 
+NodeFilesystemSpaceFillingUp +NodeNetworkInterfaceDown +NodePort +NodeRAIDDegraded +nodeSelector +nodes's +NodeWithoutOVNKubeNodePodRunning +nodown +noin +NoLeadingVirtOperator +NooBaa +NooBaaSystemCapacityWarning +noout +NoOvnClusterManagerLeader +NoOvnMasterLeader +NoReadyVirtController +NoReadyVirtOperator +NorthboundStale +northd +NoRunningOvnControlPlane +NoRunningOvnMaster +NotReady +nr +NTP +NVMe +NX +OBC +ObcQuotaBytesAlert +ObcQuotaBytesExhaustedAlert +ObcQuotaObjectsAlert +ObcQuotaObjectsExhaustedAlert +ObjectBucketClaim +observability +Observability +Observability's +observatorium +oc +ocm +OCM +OCP +OCPBUGS +OCPV +ocs +OCS +odf +Odf +ODF +ODFCorePodRestarted +ODFDiskUtilizationHigh +OdfMirrorDaemonStatus +ODFNodeLatencyHighOnNONOSDNodes +ODFNodeLatencyHighOnOSDNodes +ODFNodeMTULessThan +ODFNodeNICBandwidthSaturation +ODFOperatorNotUpgradeable +ODFPersistentVolumeMirrorStatus +OdfPoolMirroringImageHealth +ODFRBDClientBlocked +ODF's +olm +OLM +onboarded +Onboarding +OOM +OOMKilled +opensearch +openshift +Openshift +OpenShift +OPENSHIFT +openshiftapps +OpenShift's +openssl +OperatorCondition +OperatorConditionsUnhealthy +operatorframework +OperatorHub +OrphanedVirtualMachineInstances +osd +OSD +OSDCpuLoadHigh +osdmap +osds +OSDs +OsImage +ostree +ostreed +otop +OutdatedVirtualMachineInstanceWorkloads +overcommit +ovn +OVN +ovnkube +OVNkube +OVNKubernetesControllerDisconnectedSouthboundDatabase +OVNKubernetesNorthdInactive +OVS +PacketDropsByDevice +PacketDropsByKernel +Pagerduty +PagerDuty +PBD +PDB +pe +perf +performant +PersistentVolume +PersistentVolumeClaims +PersistentVolumeFillingUp +PersistentVolumes +PersistentVolumeUsageCritical +PersistentVolumeUsageNearFull +PGRepair +pgs +pkcs +PodDisruptionBudget +PodDisruptionBudgetAtLimit +PodDisruptionBudgetLimit +podman +PodMonitor +PodMonitors +POSIX +pre +Pre +preloaded +PrivateLink +proc +profileCustomizations +prometheus +PrometheusDuplicateTimestamps +PrometheusKubernetesListWatchFailures 
+PrometheusOperatorRejectedResources +PrometheusPossibleNarrowSelectors +PrometheusRemoteStorageFailures +PrometheusRule +PrometheusRuleFailures +PrometheusScrapeBodySizeLimitHit +PrometheusTargetSyncFailure +promql +PromQL +provisioner +provisioners +ps +pullspec +punctuations +pv +PV +pvc +pvcs +PVCs +PVs +PVS +PWD +QEMU +QoS +querier +Querier +radosnamespace +RBAC +rbd +RBD +reachability +readonly +Readonly +ReadWriteMany +rebalance +rebalancing +redhat +ReplicaSet +retransmissions +retransmits +retriable +reweight +RFE +RGW +RHEL +RHOCP +RHOSDFP +RHSTOR +rmi +roundtrip +RPC +rsh +RTT +runbook +Runbook +runbooks +Runbooks +runnable +runStrategy +rxbounce +sbdb +sc +scalers +SCC +SCCs +schedulability +schedulable +schemas +SELinux +SERVFAIL +ServiceMonitor +ServiceMonitors +setxattr +sifg +SingleStackIPv +sj +sk +skb +SLI +SLIs +SLO +SLOs +smtp +snapshotting +Snapshotting +snet +SOPs +sourthbound +SouthboundStale +spec'd +src +SRE +SSD +SSDs +SSL +SSP +SSPCommonTemplatesModificationReverted +SSPDown +SSPFailingToReconcile +SSPHighRateRejectedVms +SSPOperatorDown +SSPTemplateValidatorDown +stateful +StorageAutoScaler +StorageAutoScalerCRIsInvalid +StorageAutoScalingCapacityReached +StorageAutoScalingFailed +storageclass +StorageClass +StorageClasses +storageClassName +storageclient +StorageClientHeartbeatMissed +StorageClientIncompatibleOperatorVersion +storagecluster +storageCluster +storageconsumer +StorageConsumer +storageconsumers +StorageQuotaUtilizationThresholdReached +storeAPI +StoreAPIs +STS +subcommand +subcommands +subdirectory +subfolder +subnet +Subnet +SubnetAllocationThresholdExceeded +subnets +suboptimal +Suboptimal +subresource +subtree +subvolume +subvolumegroup +subvolumes +Subvolumes +sudo +symlinks +SYS +sysctl +systemctl +systemd +SystemMemoryExceedsReservation +TargetDown +TCP +tcpdump +TelemeterClientFailures +th +thanos +Thanos +ThanosRuleQueueIsDroppingAlerts +ThanosRuleRuleEvaluationLatencyHigh +tls +TLS +todo +tolerations +toofull +topk 
+tracepoint +traceroute +TSO +TTL +txn +UI +uid +UID +un +uncordon +undercloud +Unencrypted +unfound +uniq +unmapping +unmount +unmounting +unparsable +unpause +Unpause +Unpausing +unschedulable +UnsupportedHCOModification +upgradeable +Upgradeable +url +urlencode +usr +util +uwl +UWL +vA +Validator +vB +vCPU +vCPUs +Velero +verifyEndpoint +VerticalScaling +veth +virsh +virt +VirtAPIDown +VirtApiRESTErrorsBurst +VirtApiRESTErrorsHigh +VirtControllerDown +VirtControllerRESTErrorsBurst +VirtControllerRESTErrorsHigh +VirtHandlerDaemonSetRolloutFailing +VirtHandlerRESTErrorsBurst +VirtHandlerRESTErrorsHigh +VirtOperatorDown +VirtOperatorRESTErrorsBurst +VirtOperatorRESTErrorsHigh +VirtualMachine +VirtualMachineCRCErrors +VirtualMachineInstance +VirtualMachines +VirtualMachineStuckInUnhealthyState +VirtualMachineStuckOnNode +visibily +VLAN +VLANs +vm +VM +VMCannotBeEvicted +vmi +VMI +VMIs +VMReplicaSet +vms +VM's +VMs +VMStorageClassWarning +volumeattachment +volumeAttributes +volumesnapshot +VolumeSnapshots +VPC +wal +WAL +wc +webhook +webhooks +WIP +xattr +xattrs +xxErrors +yaml +YAML +Dockerfile +burstable +nearfull +supportability +antpu +Errorf +parallely +noup +blocklist +blocklisted +blocklisting +abff +aaf \ No newline at end of file diff --git a/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md b/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md index efbb0570..b163e34b 100644 --- a/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md +++ b/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md @@ -87,33 +87,33 @@ sum by (node) ( These are the conditions that could trigger the alert: -- there is a new workload that is generating more calls to the apiserver +* there is a new workload that is generating more calls to the apiserver and causing high CPU usage. In this case, increase the CPU and memory on your control plane nodes. 
-- the alert is triggered based on the node metrics, so it could be that a +* the alert is triggered based on the node metrics, so it could be that a component on the node is causing the high CPU usage. -- apiserver/etcd is processing more requests due to client retries that is +* apiserver/etcd is processing more requests due to client retries that is being caused by an underlying condition. -- uneven distribution of requests to the apiserver instance(s) due to http2 +* uneven distribution of requests to the apiserver instance(s) due to http2 (it multiplexes requests over a single TCP connection). The load balancers are not at application layer, and so does not understand http2. ## Mitigation -- if a workload is generating load to the apiserver that is causing high CPU +* if a workload is generating load to the apiserver that is causing high CPU usage, then increase the CPU and memory on your control plane nodes. -- If the sustained high CPU usage is due to a cluster degradation: +* If the sustained high CPU usage is due to a cluster degradation: - - find out the root cause of the degradation, and then + * find out the root cause of the degradation, and then determine the next steps accordingly. If this needs to be reported, then capture the following dataset, and file a new issue in BugZilla with links to the captured dataset: -- must-gather -- audit logs -- dump of prometheus data +* must-gather +* audit logs +* dump of prometheus data How to gather the audit logs of the cluster: diff --git a/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md b/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md index b4233068..a306494e 100644 --- a/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md +++ b/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md @@ -18,31 +18,31 @@ the configured log store. 1. 
Examine `ClusterLogForwarderOutputErrorRate` alert details in the OpenShift console: - - Note the error being generated by the collector - - (i.e. "error trying to connect: dns error: failed to lookup address - information: Name or service not known") - - Note the key labels: - - `app_kubernetes_io_instance`: The name of the collector. - - `namespace`: Namespace of the collector. - - `component_id`: The ID of the specific sink in the log collector - that is failing - (i.e `output_lokistack_otlp_application`, `output_my_splunk`). + - Note the error being generated by the collector + - (i.e. "error trying to connect: dns error: failed to lookup address + information: Name or service not known") + - Note the key labels: + - `app_kubernetes_io_instance`: The name of the collector. + - `namespace`: Namespace of the collector. + - `component_id`: The ID of the specific sink in the log collector + that is failing + (i.e `output_lokistack_otlp_application`, `output_my_splunk`). 2. Review the `ClusterLogForwarder` (CLF) configuration: 1. Identify the output definition corresponding to the `component_id` obtained from the alert details. - - See naming schemas for + - See naming schemas for [non-Lokistack](./ClusterLogForwarderConfigNamingSchema.md#non-lokistack-outputs) and [Lokistack](./ClusterLogForwarderConfigNamingSchema.md#lokistack-outputs) outputs. 3. Verify the log receiver: - - Verify the service is available. - - Verify the authorization credentials. + - Verify the service is available. + - Verify the authorization credentials. 4. 
Verify the `ClusterLogForwarder` - - Verify the URL of the output is correct for the service - - Verify the TLS configuration is correct for the service by inspecting the + - Verify the URL of the output is correct for the service + - Verify the TLS configuration is correct for the service by inspecting the spec'd secrets and configmaps - - Verify the authorization credentials are correct for the service by inspecting + - Verify the authorization credentials are correct for the service by inspecting the spec'd secrets ## Mitigation diff --git a/alerts/cluster-logging-operator/CollectorHigh403ForbiddenResponseRate.md b/alerts/cluster-logging-operator/CollectorHigh403ForbiddenResponseRate.md index 1f7afe3e..bc765160 100644 --- a/alerts/cluster-logging-operator/CollectorHigh403ForbiddenResponseRate.md +++ b/alerts/cluster-logging-operator/CollectorHigh403ForbiddenResponseRate.md @@ -19,29 +19,29 @@ resulting in data loss at the configured log store. 1. Examine `CollectorHigh403ForbiddenResponseRate` alert details in the OpenShift console: - - Note the key labels: - - `app_kubernetes_io_instance`: The name of the collector. - - `namespace`: Namespace of the collector. - - `component_id`: The ID of the specific sink in `Vector` that's failing + - Note the key labels: + - `app_kubernetes_io_instance`: The name of the collector. + - `namespace`: Namespace of the collector. + - `component_id`: The ID of the specific sink in `Vector` that's failing (i.e `output_lokistack_otlp_application`, `output_my_splunk`). 2. Review the `ClusterLogForwarder` (CLF) configuration: 1. Identify the output definition corresponding to the `component_id` obtained from the alert details. - - See naming schemas for + - See naming schemas for [non-Lokistack](./ClusterLogForwarderConfigNamingSchema.md#non-lokistack-outputs) and [Lokistack](./ClusterLogForwarderConfigNamingSchema.md#lokistack-outputs) outputs. 2. Examine the secret or authentication fields defined for the affected output. 3. 
Validate authentication credentials in the identified secrets: - - Identify the Kubernetes `Secret` object referenced by the affected CLF - output. - - Verify the following: + - Identify the Kubernetes `Secret` object referenced by the affected CLF + output. + - Verify the following: 1. The `Secret` object exists in the specified namespace. 2. The expected keys (e.g., token, password, etc.) are present and contain valid, non-empty values. 4. Verify log store authorization with the provided credentials. - - Confirm that the credentials provided in the identified Secret possess the + - Confirm that the credentials provided in the identified Secret possess the necessary permissions (i.e write) to the target log store. ### Diagnosis for Red Hat Managed Lokistack diff --git a/alerts/cluster-monitoring-operator/ClusterMonitoringOperatorDeprecatedConfig.md b/alerts/cluster-monitoring-operator/ClusterMonitoringOperatorDeprecatedConfig.md index 38942677..d89c03a9 100644 --- a/alerts/cluster-monitoring-operator/ClusterMonitoringOperatorDeprecatedConfig.md +++ b/alerts/cluster-monitoring-operator/ClusterMonitoringOperatorDeprecatedConfig.md @@ -28,12 +28,12 @@ the deprecated config as shown in the following example: ## Mitigation -* For the `k8sPrometheusAdapter.dedicatedServiceMonitors` +- For the `k8sPrometheusAdapter.dedicatedServiceMonitors` field, you can remove the block. For more information, see `Monitoring deprecated and removed features` under [Deprecated and removed features](https://docs.openshift.com/container-platform/4.16/release_notes/ocp-4-16-release-notes.html#ocp-4-16-deprecated-removed-features_release-notes). 
-* For the other `k8sPrometheusAdapter` fields, see `Monitoring deprecated and +- For the other `k8sPrometheusAdapter` fields, see `Monitoring deprecated and removed features` under [Deprecated and removed features](https://docs.openshift.com/container-platform/4.16/release_notes/ocp-4-16-release-notes.html#ocp-4-16-deprecated-removed-features_release-notes). You might need to migrate some of the fields under [metricsServer](https://docs.openshift.com/container-platform/latest/observability/monitoring/config-map-reference-for-the-cluster-monitoring-operator.html#metricsserverconfig). diff --git a/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md b/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md index 712d6479..7d82eba1 100644 --- a/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md +++ b/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md @@ -106,19 +106,19 @@ The mitigation depends on which resources are being rejected and why. ### ServiceMonitor and PodMonitor -- Invalid relabeling configuration (for example, a malformed regular expression). - - Fix the relabeling configuration syntax. -- Invalid TLS configuration. - - Fix the TLS configuration. -- A scrape interval less than the scrape timeout. - - Change the scrape timeout or the scrape interval value. -- Invalid secret or configmap key reference. - - Verify that the secret/configmap object exists and that they key is present +* Invalid relabeling configuration (for example, a malformed regular expression). + * Fix the relabeling configuration syntax. +* Invalid TLS configuration. + * Fix the TLS configuration. +* A scrape interval less than the scrape timeout. + * Change the scrape timeout or the scrape interval value. +* Invalid secret or configmap key reference. + * Verify that the secret/configmap object exists and that they key is present in the secret/configmap. 
-- Violation of file system access rules, which can occur when a `ServiceMonitor` +* Violation of file system access rules, which can occur when a `ServiceMonitor` or `PodMonitor` object references a file to use as a bearer token or references a TLS file. These configurations are not allowed in user-defined monitoring. - - you must create a secret that contains the credential data in the + * you must create a secret that contains the credential data in the same namespace as the `ServiceMonitor` or `PodMonitor` object and use a secret key reference in the `ServiceMonitor` or `PodMonitor` configuration. @@ -127,35 +127,35 @@ When the alert is triggered by an resource managed by a 3rd-party operator, it might not be possible to fix the root cause. The resolution will depend on the status of the operator: -- The operator is a certified Red Hat operator. - - If the operator is installed in the `openshift-operators` namespace, it +* The operator is a certified Red Hat operator. + * If the operator is installed in the `openshift-operators` namespace, it should be removed and installed in a different namespace because `openshift-operators` might contain community operators which don't have the same level of support. - - If the operator is deployed in another namespace than `openshift-operators` + * If the operator is deployed in another namespace than `openshift-operators` and its documentation requires adding the `openshift.io/cluster-monitoring: "true"` label to this namespace during the installation, ensure that the label exists. - - Otherwise you can exclude the resource from user-defined monitoring by adding + * Otherwise you can exclude the resource from user-defined monitoring by adding the `openshift.io/user-monitoring:"false"` label to the resource's namespace or the resource itself (the latter requires at least OCP 4.16). -- The operator is a community operator. 
- - You can exclude the resource from user-defined monitoring by adding the +* The operator is a community operator. + * You can exclude the resource from user-defined monitoring by adding the `openshift.io/user-monitoring:"false"` label to the resource's namespace or the resource itself (the latter requires at least OCP 4.16). ### AlertmanagerConfig -- Invalid secret or configmap key reference. - - Verify that the secret/configmap object exists and that they key is present +* Invalid secret or configmap key reference. + * Verify that the secret/configmap object exists and that they key is present in the secret/configmap. -- Invalid receiver or route settings (for example, a missing URL in a Slack action). - - Fix the improper syntax. -- Configuration option which is not yet available in the Alertmanager version. - - Update the resource to not use this option. -- Unsupported match rules in inhibition rules. - - Fix the match rule syntax. +* Invalid receiver or route settings (for example, a missing URL in a Slack action). + * Fix the improper syntax. +* Configuration option which is not yet available in the Alertmanager version. + * Update the resource to not use this option. +* Unsupported match rules in inhibition rules. + * Fix the match rule syntax. The admission webhook should be able to catch most of these errors. In this case, the admission webhook might be offline. Please check the @@ -175,4 +175,4 @@ webhook might be offline. 
Please check the ## Additional resources -- ["PrometheusOperatorRejectedResources" alert firing continuously in a Red Hat OpenShift Service in RHOCP 4](https://access.redhat.com/solutions/6992399) +* ["PrometheusOperatorRejectedResources" alert firing continuously in a Red Hat OpenShift Service in RHOCP 4](https://access.redhat.com/solutions/6992399) diff --git a/alerts/openshift-container-storage-operator/HighRBDCloneSnapshotCount.md b/alerts/openshift-container-storage-operator/HighRBDCloneSnapshotCount.md index 8cd4afde..90565f3e 100644 --- a/alerts/openshift-container-storage-operator/HighRBDCloneSnapshotCount.md +++ b/alerts/openshift-container-storage-operator/HighRBDCloneSnapshotCount.md @@ -39,12 +39,12 @@ rbd children --pool [--namespace ] rbd snap ls --pool [--namespace ] ``` 6. Correlate with Kubernetes resources: - * Find associated PVCs: oc get pv -o jsonpath='{.spec.claimRef.name}' - --field-selector=spec.csi.volumeAttributes.imageName= - * Check VolumeSnapshots: oc get volumesnapshot -A | grep + * Find associated PVCs: oc get pv -o jsonpath='{.spec.claimRef.name}' + --field-selector=spec.csi.volumeAttributes.imageName= + * Check VolumeSnapshots: oc get volumesnapshot -A | grep 7. Determine the source: - * Is a backup tool (e.g., Velero) creating frequent snapshots? - * Is a CI/CD pipeline cloning the same image repeatedly? + * Is a backup tool (e.g., Velero) creating frequent snapshots? + * Is a CI/CD pipeline cloning the same image repeatedly? ## Mitigation @@ -55,8 +55,8 @@ oc delete pvc -n This triggers the CSI driver to clean up the underlying RBD clone. 2. Implement lifecycle policies : - * Configure snapshot retention in backup tools (e.g., Velero TTL). - * Limit the number of concurrent clones in automation workflows. + * Configure snapshot retention in backup tools (e.g., Velero TTL). + * Limit the number of concurrent clones in automation workflows. 3. 
Monitor growth rate to catch issues early: ```bash # triggers when an RBD image is, on average , gaining more than 10 new clones diff --git a/alerts/openshift-container-storage-operator/ODFNodeNICBandwidthSaturation.md b/alerts/openshift-container-storage-operator/ODFNodeNICBandwidthSaturation.md index a69bb566..faa3bd18 100644 --- a/alerts/openshift-container-storage-operator/ODFNodeNICBandwidthSaturation.md +++ b/alerts/openshift-container-storage-operator/ODFNodeNICBandwidthSaturation.md @@ -30,12 +30,12 @@ rate(node_network_transmit_bytes_total{...}) * 8 ## Mitigation 1. Short term: Throttle non-essential traffic on the node. - * Taint the OSD node to prevent scheduling of non-storage workloads. - * Drain existing non-essential pods from the node. + * Taint the OSD node to prevent scheduling of non-storage workloads. + * Drain existing non-essential pods from the node. 2. Long term: - * Upgrade to higher-speed NICs (e.g., 25GbE → 100GbE). - * Use multiple bonded interfaces with LACP. - * Separate storage and client traffic using VLANs or dedicated NICs. + * Upgrade to higher-speed NICs (e.g., 25GbE → 100GbE). + * Use multiple bonded interfaces with LACP. + * Separate storage and client traffic using VLANs or dedicated NICs. 3. Tune Ceph osd_max_backfills, osd_recovery_max_active to reduce recovery bandwidth. 4. Enable NIC offload features (TSO, GRO) if disabled. 
diff --git a/alerts/openshift-virtualization-operator/VirtualMachineStuckInUnhealthyState.md b/alerts/openshift-virtualization-operator/VirtualMachineStuckInUnhealthyState.md
index cb0fce62..1a618170 100644
--- a/alerts/openshift-virtualization-operator/VirtualMachineStuckInUnhealthyState.md
+++ b/alerts/openshift-virtualization-operator/VirtualMachineStuckInUnhealthyState.md
@@ -261,8 +261,8 @@ misconfigurations:
 
 - Use local image registries where possible to reduce latency
 - Configure DataVolume import methods appropriately:
-  * **Pod import method**: Images pulled to temporary pods (default)
-  * **Node import method**: Images pulled directly to nodes
+  - **Pod import method**: Images pulled to temporary pods (default)
+  - **Node import method**: Images pulled directly to nodes
    (requires pre-pulling)
 - Pre-pull critical containerDisk images to nodes only if using node import
   method
diff --git a/hack/Dockerfile.lint b/hack/Dockerfile.lint
new file mode 100644
index 00000000..ea8bc817
--- /dev/null
+++ b/hack/Dockerfile.lint
@@ -0,0 +1,7 @@
+FROM quay.io/fedora/fedora:latest
+
+RUN dnf upgrade -y && \
+    dnf -y install aspell aspell-en which python3 python3-pip && \
+    dnf clean all
+
+WORKDIR /src
diff --git a/hack/README.md b/hack/README.md
new file mode 100644
index 00000000..f1b3f3f6
--- /dev/null
+++ b/hack/README.md
@@ -0,0 +1,30 @@
+# Linting Markdown
+
+Running a linter container
+
+```sh
+# build container
+podman build -t linter -f hack/Dockerfile.lint
+
+# run lint container
+podman run -it --rm -v ${PWD}:/src:z linter
+```
+
+Running `lint.sh`
+
+```sh
+hack/lint.sh
+
+# lint spelling
+hack/lint_spelling.sh
+
+# lint markdown
+hack/lint_markdown.sh
+```
+
+Fix markdown errors
+
+```sh
+. venv/bin/activate
+pymarkdown fix --recurse .
+```
diff --git a/hack/lint.sh b/hack/lint.sh
new file mode 100755
index 00000000..020e5c88
--- /dev/null
+++ b/hack/lint.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# shellcheck disable=SC2015,SC1091
+set -e
+
+usage(){
+    echo "
+    usage: $0
+    "
+}
+
+# print a failure message; returns 1 so callers (and 'set -e') treat it as an error
+failed(){
+    echo "
+    failed: ${1:-lint}
+    "
+    return 1
+}
+
+# create the python venv, upgrade pip, then re-run the venv check to install requirements
+py_setup_venv(){
+    python3 -m venv venv
+    source venv/bin/activate
+    pip install -q -U pip
+
+    py_check_venv || usage
+}
+
+py_check_venv(){
+    # activate python venv
+    [ -d venv ] && source venv/bin/activate || py_setup_venv
+    # install linter requirements when present; the 'if' guard keeps a missing
+    # requirements.txt from tripping 'set -e' at the top-level call below
+    if [ -e "$(dirname "$0")/requirements.txt" ]; then pip install -q -r "$(dirname "$0")/requirements.txt"; fi
+}
+
+py_bin_checks(){
+    which python || exit 0
+    which pip || exit 0
+}
+
+lint_spelling(){
+    which aspell || return
+    which pyspelling || return
+    [ -e .pyspelling.yml ] || return
+    [ -e .wordlist-md ] || return
+
+    pyspelling -c .pyspelling.yml
+}
+
+lint_markdown(){
+    # 'failed' (the helper above) -- 'fail' was undefined and crashed the script
+    which pymarkdown || failed "no pymarkdown"
+    pymarkdown scan --recurse .
+}
+
+lint_init(){
+    mkdir -p scratch
+}
+
+fix_markdown(){
+    which pymarkdown || failed "no pymarkdown"
+    pymarkdown fix --recurse .
+}
+
+lint(){
+    lint_spelling
+    lint_markdown
+}
+
+py_check_venv
+py_bin_checks
+
+lint_init
+
+# dispatch on the invoked name: lint_markdown.sh and lint_spelling.sh are
+# symlinks to this file, so each entry point runs its matching function
+FUNCTION=$(basename -s .sh "${0}")
+"${FUNCTION}" 0 || failed "${FUNCTION}"
diff --git a/hack/lint_markdown.sh b/hack/lint_markdown.sh
new file mode 120000
index 00000000..6976ebdf
--- /dev/null
+++ b/hack/lint_markdown.sh
@@ -0,0 +1 @@
+lint.sh
\ No newline at end of file
diff --git a/hack/lint_spelling.sh b/hack/lint_spelling.sh
new file mode 120000
index 00000000..6976ebdf
--- /dev/null
+++ b/hack/lint_spelling.sh
@@ -0,0 +1 @@
+lint.sh
\ No newline at end of file
diff --git a/hack/requirements.txt b/hack/requirements.txt
new file mode 100644
index 00000000..ce093042
--- /dev/null
+++ b/hack/requirements.txt
@@ -0,0 +1,2 @@
+pyspelling
+pymarkdownlnt