ceremonyclient/node/store/disk_monitor.go
Cassandra Heart dbd95bd9e9
v2.1.0 (#439)
* v2.1.0 [omit consensus and adjacent] - this commit will be amended with the full release after the file copy is complete

* 2.1.0 main node rollup
2025-09-30 02:48:15 -05:00

260 lines
6.5 KiB
Go

package store
import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"syscall"
"time"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"source.quilibrium.com/quilibrium/monorepo/config"
)
// DiskMonitor watches a partition for disk usage and alerts based on configured
// thresholds
//
// Usage:
//
// errCh := make(chan error, 1)
// // Create monitor with DB config and logger
// monitor := store.NewDiskMonitor(dbPath, dbConfig, logger, errCh)
//
// // Optionally customize check interval
// monitor = monitor.WithCheckInterval(30 * time.Second)
//
// // Start monitoring with context
// ctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// monitor.Start(ctx)
//
// // Handle termination signals
//
// go func() {
// select {
// case err := <-errCh:
// log.Error("Disk monitor triggered shutdown", zap.Error(err))
// cancel() // Cancel the context to stop monitor
// // Initiate graceful shutdown
// }
// }()
const diskMonitorNamespace = "disk_monitor"
var (
// Disk usage percentage metric
diskUsagePercentage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: diskMonitorNamespace,
Name: "usage_percentage",
Help: "Current disk usage percentage for the monitored path",
},
[]string{"core_id", "path"},
)
// Disk space metrics in bytes
diskTotalSpace = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: diskMonitorNamespace,
Name: "total_bytes",
Help: "Total disk space in bytes for the monitored path",
},
[]string{"core_id", "path"},
)
diskUsedSpace = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: diskMonitorNamespace,
Name: "used_bytes",
Help: "Used disk space in bytes for the monitored path",
},
[]string{"core_id", "path"},
)
diskFreeSpace = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: diskMonitorNamespace,
Name: "free_bytes",
Help: "Free disk space in bytes for the monitored path",
},
[]string{"core_id", "path"},
)
)
func init() {
// Register the metrics with Prometheus
prometheus.MustRegister(diskUsagePercentage)
prometheus.MustRegister(diskTotalSpace)
prometheus.MustRegister(diskUsedSpace)
prometheus.MustRegister(diskFreeSpace)
}
type DiskMonitor struct {
path string
coreId uint
noticePercentage int
warnPercentage int
terminatePercentage int
log *zap.Logger
errCh chan error
checkInterval time.Duration
}
// NewDiskMonitor creates a new disk monitor for the given path using thresholds
// from config
func NewDiskMonitor(
coreId uint,
cfg config.DBConfig,
log *zap.Logger,
errCh chan error,
) *DiskMonitor {
var path string
if coreId == 0 {
path = cfg.Path
} else {
if len(cfg.WorkerPaths) != 0 {
path = cfg.WorkerPaths[coreId-1]
} else {
path = fmt.Sprintf(cfg.WorkerPathPrefix, coreId)
}
}
return &DiskMonitor{
path: path,
coreId: coreId,
noticePercentage: cfg.NoticePercentage,
warnPercentage: cfg.WarnPercentage,
terminatePercentage: cfg.TerminatePercentage,
log: log,
errCh: errCh,
checkInterval: time.Minute,
}
}
// WithCheckInterval sets a custom interval for checking disk usage
func (d *DiskMonitor) WithCheckInterval(interval time.Duration) *DiskMonitor {
d.checkInterval = interval
return d
}
// getDiskStats calculates disk usage statistics for the partition containing path
// Returns usage percentage, total space, used space, free space, and error
func (d *DiskMonitor) getDiskStats() (int, uint64, uint64, uint64, error) {
absPath, err := filepath.Abs(d.path)
if err != nil {
return 0, 0, 0, 0, errors.Wrap(err, "get disk stats")
}
if _, err := os.Stat(absPath); os.IsNotExist(err) {
return 0, 0, 0, 0, errors.Wrap(
fmt.Errorf("path does not exist: %s", absPath),
"get disk stats",
)
}
var stat syscall.Statfs_t
if err := syscall.Statfs(absPath, &stat); err != nil {
return 0, 0, 0, 0, errors.Wrap(
fmt.Errorf("failed to get disk stats: %w", err),
"get disk stats",
)
}
totalSpace := stat.Blocks * uint64(stat.Bsize)
freeSpace := stat.Bfree * uint64(stat.Bsize)
usedSpace := totalSpace - freeSpace
// Avoid division by zero
var usagePercentage int
if totalSpace > 0 {
usagePercentage = int((usedSpace * 100) / totalSpace)
}
return usagePercentage, totalSpace, usedSpace, freeSpace, nil
}
// Start begins monitoring disk usage in a separate goroutine
func (d *DiskMonitor) Start(ctx context.Context) {
go func() {
ticker := time.NewTicker(d.checkInterval)
defer ticker.Stop()
d.checkDiskUsage()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
d.checkDiskUsage()
}
}
}()
}
func (d *DiskMonitor) checkDiskUsage() {
usagePercentage, totalSpace, usedSpace, freeSpace, err := d.getDiskStats()
if err != nil {
d.log.Error(
"Failed to check disk usage",
zap.Error(err),
zap.String("path", d.path),
)
return
}
// Update Prometheus metrics
coreIdStr := strconv.FormatUint(uint64(d.coreId), 10)
diskUsagePercentage.WithLabelValues(coreIdStr, d.path).Set(
float64(usagePercentage),
)
diskTotalSpace.WithLabelValues(coreIdStr, d.path).Set(float64(totalSpace))
diskUsedSpace.WithLabelValues(coreIdStr, d.path).Set(float64(usedSpace))
diskFreeSpace.WithLabelValues(coreIdStr, d.path).Set(float64(freeSpace))
// Handle the different threshold levels
switch {
case usagePercentage >= d.terminatePercentage:
d.log.Error(
"disk usage critical",
zap.String("path", d.path),
zap.Int("usage_percentage", usagePercentage),
zap.Int("threshold", d.terminatePercentage),
zap.Uint64("free_bytes", freeSpace),
zap.Uint64("total_bytes", totalSpace),
)
if d.errCh != nil {
d.errCh <- errors.Wrap(
fmt.Errorf(
"disk usage for %s reached critical threshold: %d%%",
d.path,
usagePercentage,
),
"check disk usage",
)
}
case usagePercentage >= d.warnPercentage:
d.log.Warn(
"disk usage high",
zap.String("path", d.path),
zap.Int("usage_percentage", usagePercentage),
zap.Int("threshold", d.warnPercentage),
zap.Uint64("free_bytes", freeSpace),
zap.Uint64("total_bytes", totalSpace),
)
case usagePercentage >= d.noticePercentage:
d.log.Info(
"disk usage notice",
zap.String("path", d.path),
zap.Int("usage_percentage", usagePercentage),
zap.Int("threshold", d.noticePercentage),
zap.Uint64("free_bytes", freeSpace),
zap.Uint64("total_bytes", totalSpace),
)
}
}