mirror of
https://github.com/QuilibriumNetwork/ceremonyclient.git
synced 2026-02-21 10:27:26 +08:00
* v2.1.0 [omit consensus and adjacent] - this commit will be amended with the full release after the file copy is complete * 2.1.0 main node rollup
260 lines
6.5 KiB
Go
260 lines
6.5 KiB
Go
package store
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"go.uber.org/zap"
|
|
"source.quilibrium.com/quilibrium/monorepo/config"
|
|
)
|
|
|
|
// DiskMonitor watches a partition for disk usage and alerts based on configured
|
|
// thresholds
|
|
//
|
|
// Usage:
|
|
//
|
|
// errCh := make(chan error, 1)
|
|
// // Create monitor with DB config and logger
|
|
// monitor := store.NewDiskMonitor(dbPath, dbConfig, logger, errCh)
|
|
//
|
|
// // Optionally customize check interval
|
|
// monitor = monitor.WithCheckInterval(30 * time.Second)
|
|
//
|
|
// // Start monitoring with context
|
|
// ctx, cancel := context.WithCancel(context.Background())
|
|
// defer cancel()
|
|
// monitor.Start(ctx)
|
|
//
|
|
// // Handle termination signals
|
|
//
|
|
// go func() {
|
|
// select {
|
|
// case err := <-errCh:
|
|
// log.Error("Disk monitor triggered shutdown", zap.Error(err))
|
|
// cancel() // Cancel the context to stop monitor
|
|
// // Initiate graceful shutdown
|
|
// }
|
|
// }()
|
|
const diskMonitorNamespace = "disk_monitor"
|
|
|
|
var (
|
|
// Disk usage percentage metric
|
|
diskUsagePercentage = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: diskMonitorNamespace,
|
|
Name: "usage_percentage",
|
|
Help: "Current disk usage percentage for the monitored path",
|
|
},
|
|
[]string{"core_id", "path"},
|
|
)
|
|
|
|
// Disk space metrics in bytes
|
|
diskTotalSpace = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: diskMonitorNamespace,
|
|
Name: "total_bytes",
|
|
Help: "Total disk space in bytes for the monitored path",
|
|
},
|
|
[]string{"core_id", "path"},
|
|
)
|
|
|
|
diskUsedSpace = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: diskMonitorNamespace,
|
|
Name: "used_bytes",
|
|
Help: "Used disk space in bytes for the monitored path",
|
|
},
|
|
[]string{"core_id", "path"},
|
|
)
|
|
|
|
diskFreeSpace = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: diskMonitorNamespace,
|
|
Name: "free_bytes",
|
|
Help: "Free disk space in bytes for the monitored path",
|
|
},
|
|
[]string{"core_id", "path"},
|
|
)
|
|
)
|
|
|
|
func init() {
|
|
// Register the metrics with Prometheus
|
|
prometheus.MustRegister(diskUsagePercentage)
|
|
prometheus.MustRegister(diskTotalSpace)
|
|
prometheus.MustRegister(diskUsedSpace)
|
|
prometheus.MustRegister(diskFreeSpace)
|
|
}
|
|
|
|
type DiskMonitor struct {
|
|
path string
|
|
coreId uint
|
|
noticePercentage int
|
|
warnPercentage int
|
|
terminatePercentage int
|
|
log *zap.Logger
|
|
errCh chan error
|
|
checkInterval time.Duration
|
|
}
|
|
|
|
// NewDiskMonitor creates a new disk monitor for the given path using thresholds
|
|
// from config
|
|
func NewDiskMonitor(
|
|
coreId uint,
|
|
cfg config.DBConfig,
|
|
log *zap.Logger,
|
|
errCh chan error,
|
|
) *DiskMonitor {
|
|
var path string
|
|
if coreId == 0 {
|
|
path = cfg.Path
|
|
} else {
|
|
if len(cfg.WorkerPaths) != 0 {
|
|
path = cfg.WorkerPaths[coreId-1]
|
|
} else {
|
|
path = fmt.Sprintf(cfg.WorkerPathPrefix, coreId)
|
|
}
|
|
}
|
|
|
|
return &DiskMonitor{
|
|
path: path,
|
|
coreId: coreId,
|
|
noticePercentage: cfg.NoticePercentage,
|
|
warnPercentage: cfg.WarnPercentage,
|
|
terminatePercentage: cfg.TerminatePercentage,
|
|
log: log,
|
|
errCh: errCh,
|
|
checkInterval: time.Minute,
|
|
}
|
|
}
|
|
|
|
// WithCheckInterval sets a custom interval for checking disk usage
|
|
func (d *DiskMonitor) WithCheckInterval(interval time.Duration) *DiskMonitor {
|
|
d.checkInterval = interval
|
|
return d
|
|
}
|
|
|
|
// getDiskStats calculates disk usage statistics for the partition containing path
|
|
// Returns usage percentage, total space, used space, free space, and error
|
|
func (d *DiskMonitor) getDiskStats() (int, uint64, uint64, uint64, error) {
|
|
absPath, err := filepath.Abs(d.path)
|
|
if err != nil {
|
|
return 0, 0, 0, 0, errors.Wrap(err, "get disk stats")
|
|
}
|
|
|
|
if _, err := os.Stat(absPath); os.IsNotExist(err) {
|
|
return 0, 0, 0, 0, errors.Wrap(
|
|
fmt.Errorf("path does not exist: %s", absPath),
|
|
"get disk stats",
|
|
)
|
|
}
|
|
|
|
var stat syscall.Statfs_t
|
|
if err := syscall.Statfs(absPath, &stat); err != nil {
|
|
return 0, 0, 0, 0, errors.Wrap(
|
|
fmt.Errorf("failed to get disk stats: %w", err),
|
|
"get disk stats",
|
|
)
|
|
}
|
|
|
|
totalSpace := stat.Blocks * uint64(stat.Bsize)
|
|
freeSpace := stat.Bfree * uint64(stat.Bsize)
|
|
usedSpace := totalSpace - freeSpace
|
|
|
|
// Avoid division by zero
|
|
var usagePercentage int
|
|
if totalSpace > 0 {
|
|
usagePercentage = int((usedSpace * 100) / totalSpace)
|
|
}
|
|
|
|
return usagePercentage, totalSpace, usedSpace, freeSpace, nil
|
|
}
|
|
|
|
// Start begins monitoring disk usage in a separate goroutine
|
|
func (d *DiskMonitor) Start(ctx context.Context) {
|
|
go func() {
|
|
ticker := time.NewTicker(d.checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
d.checkDiskUsage()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
d.checkDiskUsage()
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (d *DiskMonitor) checkDiskUsage() {
|
|
usagePercentage, totalSpace, usedSpace, freeSpace, err := d.getDiskStats()
|
|
if err != nil {
|
|
d.log.Error(
|
|
"Failed to check disk usage",
|
|
zap.Error(err),
|
|
zap.String("path", d.path),
|
|
)
|
|
return
|
|
}
|
|
|
|
// Update Prometheus metrics
|
|
coreIdStr := strconv.FormatUint(uint64(d.coreId), 10)
|
|
diskUsagePercentage.WithLabelValues(coreIdStr, d.path).Set(
|
|
float64(usagePercentage),
|
|
)
|
|
diskTotalSpace.WithLabelValues(coreIdStr, d.path).Set(float64(totalSpace))
|
|
diskUsedSpace.WithLabelValues(coreIdStr, d.path).Set(float64(usedSpace))
|
|
diskFreeSpace.WithLabelValues(coreIdStr, d.path).Set(float64(freeSpace))
|
|
|
|
// Handle the different threshold levels
|
|
switch {
|
|
case usagePercentage >= d.terminatePercentage:
|
|
d.log.Error(
|
|
"disk usage critical",
|
|
zap.String("path", d.path),
|
|
zap.Int("usage_percentage", usagePercentage),
|
|
zap.Int("threshold", d.terminatePercentage),
|
|
zap.Uint64("free_bytes", freeSpace),
|
|
zap.Uint64("total_bytes", totalSpace),
|
|
)
|
|
|
|
if d.errCh != nil {
|
|
d.errCh <- errors.Wrap(
|
|
fmt.Errorf(
|
|
"disk usage for %s reached critical threshold: %d%%",
|
|
d.path,
|
|
usagePercentage,
|
|
),
|
|
"check disk usage",
|
|
)
|
|
}
|
|
case usagePercentage >= d.warnPercentage:
|
|
d.log.Warn(
|
|
"disk usage high",
|
|
zap.String("path", d.path),
|
|
zap.Int("usage_percentage", usagePercentage),
|
|
zap.Int("threshold", d.warnPercentage),
|
|
zap.Uint64("free_bytes", freeSpace),
|
|
zap.Uint64("total_bytes", totalSpace),
|
|
)
|
|
case usagePercentage >= d.noticePercentage:
|
|
d.log.Info(
|
|
"disk usage notice",
|
|
zap.String("path", d.path),
|
|
zap.Int("usage_percentage", usagePercentage),
|
|
zap.Int("threshold", d.noticePercentage),
|
|
zap.Uint64("free_bytes", freeSpace),
|
|
zap.Uint64("total_bytes", totalSpace),
|
|
)
|
|
}
|
|
}
|