Files
EonaCat.LogStack/EonaCat.LogStack.Status/Services/MonitoringService.cs
2026-04-06 08:15:54 +02:00

538 lines
20 KiB
C#

using System.Net;
using System.Net.Http;
using System.Net.NetworkInformation;
using System.Net.Security;
using System.Net.Sockets;
using System.Security.Cryptography.X509Certificates;
using System.Diagnostics;
using System.Text;
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using EonaCat.LogStack.Status.Data;
using EonaCat.LogStack.Status.Models;
using Monitor = EonaCat.LogStack.Status.Models.Monitor;
namespace EonaCat.LogStack.Status.Services;
// This file is part of the EonaCat project(s) which is released under the Apache License.
// See the LICENSE file or go to https://EonaCat.com/License for full license details.
public class MonitoringService
{
// Factory from which each operation opens its own DatabaseContext.
private readonly IDbContextFactory<DatabaseContext> _dbFactory;

// Logger used for webhook delivery warnings.
private readonly ILogger<MonitoringService> _log;

/// <summary>
/// Creates the service with the context factory and logger it depends on.
/// </summary>
/// <param name="dbFactory">Factory used to create database contexts.</param>
/// <param name="log">Logger for this service.</param>
public MonitoringService(IDbContextFactory<DatabaseContext> dbFactory, ILogger<MonitoringService> log)
    => (_dbFactory, _log) = (dbFactory, log);
// check ─
/// <summary>
/// Runs the probe that matches the monitor's type, applies the failure-threshold
/// grace logic, persists the check together with the monitor's updated state and
/// finally evaluates the alert rules.
/// </summary>
/// <param name="monitor">Monitor to probe; its failure counter and last-* fields are mutated.</param>
/// <returns>The check record that was stored.</returns>
public async Task<MonitorCheck> CheckMonitorAsync(Monitor monitor)
{
    var timer = Stopwatch.StartNew();
    MonitorStatus outcome;
    string? detail = null;
    try
    {
        (outcome, detail) = monitor.Type switch
        {
            // Remote applications are probed exactly like a plain TCP endpoint.
            MonitorType.TCP or MonitorType.AppRemote => await CheckTcpAsync(monitor.Host, monitor.Port ?? 80, monitor.TimeoutMs),
            MonitorType.UDP => await CheckUdpAsync(monitor.Host, monitor.Port ?? 53, monitor.TimeoutMs),
            MonitorType.Ping => await CheckPingAsync(monitor.Host, monitor.TimeoutMs),
            MonitorType.AppLocal => CheckLocalProcess(monitor.ProcessName ?? monitor.Name),
            MonitorType.HTTP => await CheckHttpAsync(monitor.Url ?? $"http://{monitor.Host}", monitor.TimeoutMs, monitor.ExpectedKeyword, monitor.ExpectedStatusCode),
            MonitorType.HTTPS => await CheckHttpAsync(monitor.Url ?? $"https://{monitor.Host}", monitor.TimeoutMs, monitor.ExpectedKeyword, monitor.ExpectedStatusCode),
            _ => (MonitorStatus.Unknown, "Unknown monitor type")
        };
    }
    catch (Exception probeError)
    {
        // Any exception escaping a probe counts as the monitor being down.
        outcome = MonitorStatus.Down;
        detail = probeError.Message;
    }
    timer.Stop();

    // Failure threshold: a failing probe only changes the reported status once
    // enough consecutive failures have accumulated.
    if (outcome != MonitorStatus.Down && outcome != MonitorStatus.Warning)
    {
        monitor.ConsecutiveFailures = 0;
    }
    else
    {
        monitor.ConsecutiveFailures++;
        if (monitor.ConsecutiveFailures < monitor.FailureThreshold)
        {
            // Still inside the grace window: keep reporting the previous status.
            outcome = monitor.LastStatus;
            detail = $"[Grace: {monitor.ConsecutiveFailures}/{monitor.FailureThreshold}] {detail}";
        }
    }

    var check = new MonitorCheck
    {
        MonitorId = monitor.Id,
        Status = outcome,
        ResponseMs = timer.Elapsed.TotalMilliseconds,
        Message = detail,
        CheckedAt = DateTime.UtcNow
    };

    await using var db = await _dbFactory.CreateDbContextAsync();

    // Remember the status from before this check; the alert rules compare against it.
    var statusBefore = monitor.LastStatus;
    db.MonitorChecks.Add(check);

    monitor.LastChecked = DateTime.UtcNow;
    monitor.LastStatus = outcome;
    monitor.LastResponseMs = check.ResponseMs;
    db.Monitors.Update(monitor);
    await db.SaveChangesAsync();

    await EvaluateAlertRulesAsync(monitor, check, statusBefore, db);
    return check;
}
/// <summary>
/// Tries to open a TCP connection to <paramref name="host"/>:<paramref name="port"/>
/// within <paramref name="timeoutMs"/> milliseconds.
/// </summary>
/// <param name="host">Host name or address to connect to.</param>
/// <param name="port">TCP port to connect to.</param>
/// <param name="timeoutMs">Time allowed for the connection attempt, in milliseconds.</param>
/// <returns>Up when the connection succeeds; Down on timeout or on any connection error.</returns>
private async Task<(MonitorStatus, string?)> CheckTcpAsync(string host, int port, int timeoutMs)
{
    using var client = new TcpClient();
    // Dispose the token source so its timer is released after every check.
    using var cts = new CancellationTokenSource(timeoutMs);
    try
    {
        await client.ConnectAsync(host, port, cts.Token);
        return (MonitorStatus.Up, $"Connected to {host}:{port}");
    }
    catch (OperationCanceledException)
    {
        // The token fired before the connection completed.
        return (MonitorStatus.Down, $"Timeout connecting to {host}:{port}");
    }
    catch (Exception ex)
    {
        return (MonitorStatus.Down, ex.Message);
    }
}
/// <summary>
/// Sends a single zero byte to <paramref name="host"/>:<paramref name="port"/> over UDP.
/// </summary>
/// <param name="host">Host name or address to send to.</param>
/// <param name="port">UDP port to send to.</param>
/// <param name="timeoutMs">Not used by this probe.</param>
/// <returns>Up when the datagram could be sent; Warning when connecting or sending throws.</returns>
private async Task<(MonitorStatus, string?)> CheckUdpAsync(string host, int port, int timeoutMs)
{
    // NOTE(review): no reply is awaited, so "Up" only means the send itself did not fail.
    try
    {
        using var socket = new UdpClient();
        socket.Connect(host, port);
        byte[] probe = { 0x00 };
        await socket.SendAsync(probe, probe.Length);
    }
    catch (Exception ex)
    {
        return (MonitorStatus.Warning, $"UDP check: {ex.Message}");
    }

    return (MonitorStatus.Up, $"UDP {host}:{port} reachable");
}
/// <summary>
/// Pings <paramref name="host"/> once and maps the reply to a monitor status.
/// </summary>
/// <param name="host">Host name or address to ping.</param>
/// <param name="timeoutMs">Time to wait for the reply, in milliseconds.</param>
/// <returns>Up with round-trip time and TTL on success; Down for any other reply status or on error.</returns>
private async Task<(MonitorStatus, string?)> CheckPingAsync(string host, int timeoutMs)
{
    try
    {
        using var pinger = new Ping();
        var answer = await pinger.SendPingAsync(host, timeoutMs);
        return answer.Status != IPStatus.Success
            ? (MonitorStatus.Down, $"Ping {host}: {answer.Status}")
            : (MonitorStatus.Up, $"Ping {host} = {answer.RoundtripTime}ms TTL={answer.Options?.Ttl}");
    }
    catch (Exception ex)
    {
        return (MonitorStatus.Down, $"Ping error: {ex.Message}");
    }
}
/// <summary>
/// Checks whether at least one local process with the given name is running.
/// </summary>
/// <param name="processName">Process name as accepted by <see cref="Process.GetProcessesByName(string)"/>.</param>
/// <returns>Up with the PID of the first match, or Down when no process matches.</returns>
private (MonitorStatus, string?) CheckLocalProcess(string processName)
{
    var procs = Process.GetProcessesByName(processName);
    try
    {
        return procs.Length > 0
            ? (MonitorStatus.Up, $"Process '{processName}' running (PID: {procs[0].Id})")
            : (MonitorStatus.Down, $"Process '{processName}' not found");
    }
    finally
    {
        // Every returned Process is disposable; release them all, since this
        // method is called repeatedly for each monitored process.
        foreach (var proc in procs)
        {
            proc.Dispose();
        }
    }
}
/// <summary>
/// Issues a GET request to <paramref name="url"/> and derives a monitor status from
/// the optional keyword assertion, the optional status-code assertion, or otherwise
/// from the status-code range.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="timeoutMs">Request timeout in milliseconds.</param>
/// <param name="expectedKeyword">When set, the body must contain this text (case-insensitive).</param>
/// <param name="expectedStatusCode">When set, the response must have exactly this status code.</param>
/// <returns>
/// Without assertions: Up for 2xx/3xx, Warning for 4xx, Down otherwise.
/// Down is also returned on timeout, on any request error and when the keyword is missing.
/// </returns>
private async Task<(MonitorStatus, string?)> CheckHttpAsync(string url, int timeoutMs, string? expectedKeyword, int? expectedStatusCode)
{
    // NOTE(review): server certificate validation is switched off for this probe,
    // so an invalid or expired certificate does not make an HTTPS check fail.
    using var handler = new HttpClientHandler
    {
        ServerCertificateCustomValidationCallback = HttpClientHandler.DangerousAcceptAnyServerCertificateValidator
    };
    using var client = new HttpClient(handler) { Timeout = TimeSpan.FromMilliseconds(timeoutMs) };
    try
    {
        // Dispose the response so its content stream is released after each check.
        using var resp = await client.GetAsync(url);
        var code = (int)resp.StatusCode;

        // Keyword assertion
        if (!string.IsNullOrEmpty(expectedKeyword))
        {
            var body = await resp.Content.ReadAsStringAsync();
            if (!body.Contains(expectedKeyword, StringComparison.OrdinalIgnoreCase))
            {
                return (MonitorStatus.Down, $"HTTP {code} - keyword '{expectedKeyword}' not found");
            }
        }

        // Status code assertion
        if (expectedStatusCode.HasValue)
        {
            if (code == expectedStatusCode.Value)
            {
                return (MonitorStatus.Up, $"HTTP {code} (expected)");
            }
            // A different but still successful/redirect code is only a warning.
            return code >= 200 && code < 400
                ? (MonitorStatus.Warning, $"HTTP {code} (expected {expectedStatusCode})")
                : (MonitorStatus.Down, $"HTTP {code} (expected {expectedStatusCode})");
        }

        if (code >= 200 && code < 400)
        {
            return (MonitorStatus.Up, $"HTTP {code}");
        }
        if (code >= 400 && code < 500)
        {
            return (MonitorStatus.Warning, $"HTTP {code}");
        }
        return (MonitorStatus.Down, $"HTTP {code}");
    }
    catch (TaskCanceledException)
    {
        // HttpClient reports an elapsed Timeout as a cancelled task.
        return (MonitorStatus.Down, "Timeout");
    }
    catch (Exception ex)
    {
        return (MonitorStatus.Down, ex.Message);
    }
}
// Upper bound for connecting and completing the TLS handshake, so that an
// unresponsive endpoint cannot keep a certificate check running forever.
private static readonly TimeSpan CertificateCheckTimeout = TimeSpan.FromSeconds(10);

/// <summary>
/// Connects to the entry's domain and port, completes a TLS handshake and copies the
/// presented certificate's validity dates, issuer, subject and thumbprint onto the
/// entry, then saves the entry.
/// </summary>
/// <param name="cert">Certificate entry to refresh; it is mutated and persisted.</param>
/// <returns>The same <paramref name="cert"/> instance after it was saved.</returns>
/// <remarks>
/// Failures are not thrown: the error text is stored in <c>LastError</c> instead.
/// Certificate validation is deliberately bypassed so that the details of invalid or
/// expired certificates can still be read.
/// </remarks>
public async Task<CertificateEntry> CheckCertificateAsync(CertificateEntry cert)
{
    try
    {
        using var timeout = new CancellationTokenSource(CertificateCheckTimeout);
        using var client = new TcpClient();
        await client.ConnectAsync(cert.Domain, cert.Port, timeout.Token);

        using var ssl = new SslStream(client.GetStream(), false);
        var tlsOptions = new SslClientAuthenticationOptions
        {
            TargetHost = cert.Domain,
            // Accept any certificate: the goal is to inspect it, not to trust it.
            RemoteCertificateValidationCallback = (_, _, _, _) => true
        };
        await ssl.AuthenticateAsClientAsync(tlsOptions, timeout.Token);

        var remote = ssl.RemoteCertificate;
        // Dispose only a copy created here; the instance exposed by the stream is left alone.
        using var copy = remote is X509Certificate2 ? null : new X509Certificate2(remote!);
        var x509 = copy ?? (X509Certificate2)remote!;

        cert.ExpiresAt = x509.NotAfter.ToUniversalTime();
        cert.IssuedAt = x509.NotBefore.ToUniversalTime();
        cert.Issuer = x509.Issuer;
        cert.Subject = x509.Subject;
        cert.Thumbprint = x509.Thumbprint;
        cert.LastError = null;
    }
    catch (OperationCanceledException)
    {
        // The timeout token fired during connect or handshake.
        cert.LastError = $"Timeout connecting to {cert.Domain}:{cert.Port}";
    }
    catch (Exception ex)
    {
        cert.LastError = ex.Message;
    }

    cert.LastChecked = DateTime.UtcNow;
    await using var db = await _dbFactory.CreateDbContextAsync();
    db.Certificates.Update(cert);
    await db.SaveChangesAsync();
    return cert;
}
/// <summary>
/// Builds the dashboard summary: monitor counts per status, certificate expiry
/// counts, log totals and incident totals.
/// </summary>
/// <param name="isAdmin">When false, only monitors flagged as public are counted.</param>
/// <returns>A populated <see cref="DashboardStats"/>.</returns>
public async Task<DashboardStats> GetStatsAsync(bool isAdmin)
{
    await using var db = await _dbFactory.CreateDbContextAsync();
    var visibleMonitors = await db.Monitors
        .Where(m => m.IsActive && (isAdmin || m.IsPublic))
        .ToListAsync();
    var certificates = await db.Certificates.ToListAsync();
    var now = DateTime.UtcNow;

    // Tally the monitor statuses in a single pass.
    int up = 0, down = 0, warn = 0, unknown = 0;
    foreach (var m in visibleMonitors)
    {
        var state = m.LastStatus;
        if (state == MonitorStatus.Up)
        {
            up++;
        }
        else if (state == MonitorStatus.Down)
        {
            down++;
        }
        else if (state == MonitorStatus.Warning || state == MonitorStatus.Degraded)
        {
            warn++;
        }
        else if (state == MonitorStatus.Unknown)
        {
            unknown++;
        }
    }

    // Certificates without a known expiry date are counted in neither bucket.
    int expiringSoon = 0, expired = 0;
    foreach (var c in certificates)
    {
        if (!c.ExpiresAt.HasValue)
        {
            continue;
        }
        var expiry = c.ExpiresAt.Value;
        if (expiry <= now)
        {
            expired++;
        }
        else if ((expiry - now).TotalDays <= 30)
        {
            expiringSoon++;
        }
    }

    var monitorTotal = visibleMonitors.Count;
    var totalLogs = await db.Logs.LongCountAsync();
    var errorLogs = await db.Logs.LongCountAsync(l => l.Level == "error" || l.Level == "critical");
    var activeIncidents = await db.Incidents.CountAsync(i => i.Status != IncidentStatus.Resolved);
    var resolvedIncidents = await db.Incidents.CountAsync(i => i.Status == IncidentStatus.Resolved);

    return new DashboardStats
    {
        TotalMonitors = monitorTotal,
        UpCount = up,
        DownCount = down,
        WarnCount = warn,
        UnknownCount = unknown,
        CertCount = certificates.Count,
        CertExpiringSoon = expiringSoon,
        CertExpired = expired,
        TotalLogs = totalLogs,
        ErrorLogs = errorLogs,
        // Share of visible monitors whose last status is Up, as a percentage.
        OverallUptime = monitorTotal > 0 ? (double)up / monitorTotal * 100 : 0,
        ActiveIncidents = activeIncidents,
        ResolvedIncidents = resolvedIncidents
    };
}
/// <summary>
/// Returns uptime percentages (24 hours, 7 days, 30 days) and response time stats
/// for a single monitor.
/// </summary>
/// <param name="monitorId">Identifier of the monitor to report on.</param>
/// <returns>The report; totals and the average response time cover the 30-day window.</returns>
/// <exception cref="KeyNotFoundException">No monitor with this identifier exists.</exception>
public async Task<UptimeReport> GetUptimeReportAsync(int monitorId)
{
    await using var db = await _dbFactory.CreateDbContextAsync();
    var monitor = await db.Monitors.FindAsync(monitorId);
    if (monitor == null)
    {
        throw new KeyNotFoundException($"Monitor {monitorId} not found.");
    }

    var now = DateTime.UtcNow;
    var since24h = now.AddHours(-24);
    var since7d = now.AddDays(-7);
    var since30d = now.AddDays(-30);

    // The 24h and 7d windows are subsets of the 30d window, so one query is enough;
    // only the three columns that are needed are fetched.
    var checks30d = await db.MonitorChecks
        .Where(c => c.MonitorId == monitorId && c.CheckedAt >= since30d)
        .Select(c => new { c.Status, c.CheckedAt, c.ResponseMs })
        .ToListAsync();

    int total24h = 0, up24h = 0, total7d = 0, up7d = 0, up30d = 0, down30d = 0;
    foreach (var c in checks30d)
    {
        var isUp = c.Status == MonitorStatus.Up;
        if (isUp)
        {
            up30d++;
        }
        else if (c.Status == MonitorStatus.Down)
        {
            down30d++;
        }

        if (c.CheckedAt >= since7d)
        {
            total7d++;
            if (isUp)
            {
                up7d++;
            }
        }

        if (c.CheckedAt >= since24h)
        {
            total24h++;
            if (isUp)
            {
                up24h++;
            }
        }
    }

    // A window without any checks is reported as 100% uptime.
    static double CalcUptime(int up, int total) =>
        total == 0 ? 100.0 : (double)up / total * 100.0;

    return new UptimeReport
    {
        MonitorId = monitorId,
        MonitorName = monitor.Name,
        Uptime24h = CalcUptime(up24h, total24h),
        Uptime7d = CalcUptime(up7d, total7d),
        Uptime30d = CalcUptime(up30d, checks30d.Count),
        TotalChecks = checks30d.Count,
        UpChecks = up30d,
        DownChecks = down30d,
        AvgResponseMs = checks30d.Count > 0 ? checks30d.Average(c => c.ResponseMs) : 0
    };
}
/// <summary>
/// Returns log volume bucketed by hour for the last <paramref name="hours"/> hours.
/// </summary>
/// <param name="hours">Size of the look-back window in hours.</param>
/// <returns>
/// One bucket per hour that contains at least one log entry, ordered by bucket start;
/// hours without entries produce no bucket.
/// </returns>
public async Task<List<LogStatsBucket>> GetLogStatsAsync(int hours = 24)
{
    await using var db = await _dbFactory.CreateDbContextAsync();
    var from = DateTime.UtcNow.AddHours(-hours);

    // Only the timestamp and level are needed for counting, so the remaining
    // columns of each log row are not loaded.
    var rows = await db.Logs
        .Where(l => l.Timestamp >= from)
        .Select(l => new { l.Timestamp, l.Level })
        .ToListAsync();

    return rows
        .GroupBy(l => new DateTime(l.Timestamp.Year, l.Timestamp.Month, l.Timestamp.Day, l.Timestamp.Hour, 0, 0, DateTimeKind.Utc))
        .OrderBy(g => g.Key)
        .Select(g => new LogStatsBucket
        {
            BucketStart = g.Key,
            Total = g.LongCount(),
            Errors = g.LongCount(l => l.Level == "error" || l.Level == "critical"),
            Warnings = g.LongCount(l => l.Level == "warn" || l.Level == "warning")
        })
        .ToList();
}
/// <summary>
/// Evaluates every enabled alert rule that targets this monitor (or all monitors)
/// against the new check. A rule that fires and is outside its cooldown is
/// time-stamped, may open an incident and may trigger a webhook.
/// </summary>
/// <param name="monitor">Monitor that was just checked.</param>
/// <param name="check">Result of that check.</param>
/// <param name="prevStatus">Monitor status before the check, used to detect transitions.</param>
/// <param name="db">Context used to read rules and settings and to save changes.</param>
private async Task EvaluateAlertRulesAsync(Monitor monitor, MonitorCheck check, MonitorStatus prevStatus, DatabaseContext db)
{
    var rules = await db.AlertRules
        .Where(r => r.IsEnabled && (r.MonitorId == monitor.Id || r.MonitorId == null))
        .ToListAsync();
    var globalWebhook = await db.Settings.FirstOrDefaultAsync(s => s.Key == "AlertWebhookUrl");
    var webhookUrl = globalWebhook?.Value;

    // The setting is read at most once per evaluation, and only when needed.
    bool? autoCreateIncidents = null;
    // Several IsDown rules can match one check; open a single incident for it.
    var incidentCreated = false;

    foreach (var rule in rules)
    {
        bool fired = rule.Condition switch
        {
            AlertRuleCondition.IsDown => check.Status == MonitorStatus.Down && prevStatus != MonitorStatus.Down,
            AlertRuleCondition.IsUp => check.Status == MonitorStatus.Up && prevStatus == MonitorStatus.Down,
            AlertRuleCondition.ResponseAboveMs => check.ResponseMs > (rule.ThresholdValue ?? double.MaxValue),
            AlertRuleCondition.CertExpiresWithinDays => false, // evaluated by cert loop separately
            _ => false
        };
        if (!fired)
        {
            continue;
        }

        // Cooldown check
        if (rule.LastFiredAt.HasValue &&
            (DateTime.UtcNow - rule.LastFiredAt.Value).TotalMinutes < rule.CooldownMinutes)
        {
            continue;
        }

        rule.LastFiredAt = DateTime.UtcNow;
        db.AlertRules.Update(rule);

        // Auto-create incident when a monitor goes down
        if (rule.Condition == AlertRuleCondition.IsDown && !incidentCreated)
        {
            if (autoCreateIncidents is null)
            {
                var setting = await db.Settings.FirstOrDefaultAsync(s => s.Key == "AutoCreateIncidents");
                // Case-insensitive so that a stored "True" enables the feature as well.
                autoCreateIncidents = string.Equals(setting?.Value, "true", StringComparison.OrdinalIgnoreCase);
            }

            if (autoCreateIncidents.Value)
            {
                db.Incidents.Add(new Incident
                {
                    Title = $"{monitor.Name} is down",
                    Body = check.Message,
                    Severity = IncidentSeverity.Major,
                    Status = IncidentStatus.Investigating,
                    MonitorId = monitor.Id,
                    IsPublic = monitor.IsPublic
                });
                incidentCreated = true;
            }
        }

        // Fire webhook: a blank rule-level URL falls back to the global one.
        var target = string.IsNullOrWhiteSpace(rule.WebhookUrl) ? webhookUrl : rule.WebhookUrl;
        if (!string.IsNullOrWhiteSpace(target))
        {
            var condition = rule.Condition;
            // Fire-and-forget: FireWebhookAsync catches and logs its own failures.
            _ = Task.Run(() => FireWebhookAsync(target, monitor, check, condition));
        }

        await db.SaveChangesAsync();
    }
}
/// <summary>
/// Posts a JSON description of the alert to <paramref name="url"/>. Delivery problems
/// are logged as warnings and never thrown, so the method is safe to call without awaiting.
/// </summary>
/// <param name="url">Webhook endpoint to post to.</param>
/// <param name="monitor">Monitor the alert belongs to.</param>
/// <param name="check">Check that triggered the alert.</param>
/// <param name="condition">Alert rule condition that fired.</param>
private async Task FireWebhookAsync(string url, Monitor monitor, MonitorCheck check, AlertRuleCondition condition)
{
    try
    {
        using var client = new HttpClient { Timeout = TimeSpan.FromSeconds(10) };
        var payload = JsonSerializer.Serialize(new
        {
            monitorId = monitor.Id,
            monitorName = monitor.Name,
            condition = condition.ToString(),
            status = check.Status.ToString(),
            responseMs = check.ResponseMs,
            message = check.Message,
            checkedAt = check.CheckedAt.ToString("o")
        });

        using var content = new StringContent(payload, Encoding.UTF8, "application/json");
        using var response = await client.PostAsync(url, content);
        if (!response.IsSuccessStatusCode)
        {
            // The request reached the endpoint, but the endpoint rejected it.
            _log.LogWarning("Webhook delivery to {Url} returned HTTP {StatusCode}", url, (int)response.StatusCode);
        }
    }
    catch (Exception ex)
    {
        _log.LogWarning(ex, "Webhook delivery to {Url} failed: {Msg}", url, ex.Message);
    }
}
}
/// <summary>
/// Hosted background loop that, every ten seconds, starts checks for the monitors and
/// certificates that are due, and purges old log entries once per hour.
/// </summary>
public class MonitoringBackgroundService : BackgroundService
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<MonitoringBackgroundService> _log;

    // Keys of checks that are still running. A check that takes longer than one loop
    // iteration has not updated LastChecked yet and would otherwise be started again.
    private readonly System.Collections.Concurrent.ConcurrentDictionary<string, byte> _inFlight = new();

    /// <summary>Creates the service.</summary>
    /// <param name="scopeFactory">Factory for the DI scopes used by the loop and by each check.</param>
    /// <param name="log">Logger for loop and check failures.</param>
    public MonitoringBackgroundService(IServiceScopeFactory scopeFactory, ILogger<MonitoringBackgroundService> log)
    {
        _scopeFactory = scopeFactory;
        _log = log;
    }

    /// <summary>
    /// Runs the scheduling loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signals host shutdown.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Start from the current hour so the first purge runs at the next hour boundary.
        var lastPurgeHour = TruncateToHour(DateTime.UtcNow);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                using var scope = _scopeFactory.CreateScope();
                var dbFactory = scope.ServiceProvider.GetRequiredService<IDbContextFactory<DatabaseContext>>();
                await using var db = await dbFactory.CreateDbContextAsync(stoppingToken);
                var monitors = await db.Monitors.Where(m => m.IsActive).ToListAsync(stoppingToken);
                var now = DateTime.UtcNow;

                foreach (var m in monitors)
                {
                    if (m.LastChecked == null || (now - m.LastChecked.Value).TotalSeconds >= m.IntervalSeconds)
                    {
                        var captured = m;
                        StartCheck($"monitor:{captured.Id}", svc => svc.CheckMonitorAsync(captured), stoppingToken);
                    }
                }

                // Check certs every hour
                var certs = await db.Certificates.ToListAsync(stoppingToken);
                foreach (var c in certs)
                {
                    if (c.LastChecked == null || (now - c.LastChecked.Value).TotalHours >= 1)
                    {
                        var captured = c;
                        StartCheck($"cert:{captured.Domain}:{captured.Port}", svc => svc.CheckCertificateAsync(captured), stoppingToken);
                    }
                }

                // Log retention purge - run once per hour, on the first iteration of a new hour.
                var currentHour = TruncateToHour(now);
                if (currentHour > lastPurgeHour)
                {
                    // Recorded before purging so a failing purge is retried next hour,
                    // not on every loop iteration.
                    lastPurgeHour = currentHour;
                    using var purgeScope = _scopeFactory.CreateScope();
                    var ingest = purgeScope.ServiceProvider.GetRequiredService<IngestionService>();
                    var auth = purgeScope.ServiceProvider.GetRequiredService<AuthenticationService>();
                    var days = int.TryParse(await auth.GetSettingAsync("MaxLogRetentionDays", "30"), out var d) ? d : 30;
                    await ingest.PurgeOldLogsAsync(days);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Normal shutdown, not an error.
                break;
            }
            catch (Exception ex)
            {
                _log.LogError(ex, "Error in monitor loop");
            }

            await Task.Delay(10_000, stoppingToken);
        }
    }

    /// <summary>
    /// Starts <paramref name="check"/> in the background with its own DI scope, unless a
    /// check with the same <paramref name="key"/> is still running.
    /// </summary>
    /// <param name="key">Identifies the monitor or certificate being checked.</param>
    /// <param name="check">The check to run against a scoped <see cref="MonitoringService"/>.</param>
    /// <param name="stoppingToken">When already cancelled, the check is skipped.</param>
    private void StartCheck(string key, Func<MonitoringService, Task> check, CancellationToken stoppingToken)
    {
        if (!_inFlight.TryAdd(key, 0))
        {
            return;
        }

        // No token is passed to Task.Run: the delegate must always run so that the
        // finally block releases the key.
        _ = Task.Run(async () =>
        {
            try
            {
                if (stoppingToken.IsCancellationRequested)
                {
                    return;
                }
                using var checkScope = _scopeFactory.CreateScope();
                var svc = checkScope.ServiceProvider.GetRequiredService<MonitoringService>();
                await check(svc);
            }
            catch (Exception ex)
            {
                // Without this, a failure in a fire-and-forget task would go unnoticed.
                _log.LogError(ex, "Background check {Key} failed", key);
            }
            finally
            {
                _inFlight.TryRemove(key, out _);
            }
        });
    }

    /// <summary>Returns <paramref name="value"/> with minutes and smaller units cleared, as UTC.</summary>
    private static DateTime TruncateToHour(DateTime value) =>
        new(value.Year, value.Month, value.Day, value.Hour, 0, 0, DateTimeKind.Utc);
}
/// <summary>
/// Writes incoming log entries to the database and removes entries that are older
/// than the retention period.
/// </summary>
public class IngestionService
{
    private readonly IDbContextFactory<DatabaseContext> _dbFactory;

    /// <summary>Creates the service.</summary>
    /// <param name="dbFactory">Factory from which each operation opens its own context.</param>
    public IngestionService(IDbContextFactory<DatabaseContext> dbFactory)
    {
        _dbFactory = dbFactory;
    }

    /// <summary>Stores a single log entry.</summary>
    /// <param name="entry">Entry to store.</param>
    /// <exception cref="ArgumentNullException"><paramref name="entry"/> is null.</exception>
    public async Task IngestAsync(LogEntry entry)
    {
        // Fail before a database context is opened.
        ArgumentNullException.ThrowIfNull(entry);

        await using var db = await _dbFactory.CreateDbContextAsync();
        db.Logs.Add(entry);
        await db.SaveChangesAsync();
    }

    /// <summary>Stores several log entries with one save.</summary>
    /// <param name="entries">Entries to store; an empty sequence is a no-op.</param>
    /// <exception cref="ArgumentNullException"><paramref name="entries"/> is null.</exception>
    public async Task IngestBatchAsync(IEnumerable<LogEntry> entries)
    {
        ArgumentNullException.ThrowIfNull(entries);

        // Materialize once so a lazy sequence is enumerated a single time and an
        // empty batch does not open a database context at all.
        var batch = entries as IReadOnlyCollection<LogEntry> ?? entries.ToList();
        if (batch.Count == 0)
        {
            return;
        }

        await using var db = await _dbFactory.CreateDbContextAsync();
        db.Logs.AddRange(batch);
        await db.SaveChangesAsync();
    }

    /// <summary>Deletes log entries older than <paramref name="retentionDays"/> days.</summary>
    /// <param name="retentionDays">
    /// Number of days to keep. Zero or a negative value performs no purge, because the
    /// resulting cutoff would otherwise match every stored entry.
    /// </param>
    public async Task PurgeOldLogsAsync(int retentionDays)
    {
        if (retentionDays <= 0)
        {
            return;
        }

        await using var db = await _dbFactory.CreateDbContextAsync();
        var cutoff = DateTime.UtcNow.AddDays(-retentionDays);
        // Use ExecuteDeleteAsync for efficiency with large tables
        await db.Logs.Where(l => l.Timestamp < cutoff).ExecuteDeleteAsync();
    }
}