using System.Net;
using System.Net.Http;
using System.Net.NetworkInformation;
using System.Net.Security;
using System.Net.Sockets;
using System.Security.Cryptography.X509Certificates;
using System.Diagnostics;
using System.Text;
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using EonaCat.LogStack.Status.Data;
using EonaCat.LogStack.Status.Models;
using Monitor = EonaCat.LogStack.Status.Models.Monitor;

namespace EonaCat.LogStack.Status.Services;

// This file is part of the EonaCat project(s) which is released under the Apache License.
// See the LICENSE file or go to https://EonaCat.com/License for full license details.
/// <summary>
/// Executes health checks (TCP/UDP/ICMP/HTTP/process), records results,
/// evaluates alert rules and produces dashboard/uptime statistics.
/// </summary>
public class MonitoringService
{
    private readonly IDbContextFactory<DatabaseContext> _dbFactory;
    private readonly ILogger<MonitoringService> _log;

    // Shared client for webhook delivery. Creating a new HttpClient per call
    // exhausts sockets under load; a single static instance is the recommended pattern.
    private static readonly HttpClient _webhookClient = new() { Timeout = TimeSpan.FromSeconds(10) };

    public MonitoringService(IDbContextFactory<DatabaseContext> dbFactory, ILogger<MonitoringService> log)
    {
        _dbFactory = dbFactory;
        _log = log;
    }

    // check ─

    /// <summary>
    /// Runs one health check for <paramref name="monitor"/>, applies the
    /// consecutive-failure grace threshold, persists the check and the monitor's
    /// updated state, then evaluates alert rules against the result.
    /// </summary>
    /// <returns>The persisted <see cref="MonitorCheck"/>.</returns>
    public async Task<MonitorCheck> CheckMonitorAsync(Monitor monitor)
    {
        var sw = Stopwatch.StartNew();
        MonitorStatus status;
        string? message = null;

        try
        {
            (status, message) = monitor.Type switch
            {
                MonitorType.TCP => await CheckTcpAsync(monitor.Host, monitor.Port ?? 80, monitor.TimeoutMs),
                MonitorType.UDP => await CheckUdpAsync(monitor.Host, monitor.Port ?? 53, monitor.TimeoutMs),
                MonitorType.Ping => await CheckPingAsync(monitor.Host, monitor.TimeoutMs),
                MonitorType.AppLocal => CheckLocalProcess(monitor.ProcessName ?? monitor.Name),
                MonitorType.AppRemote => await CheckTcpAsync(monitor.Host, monitor.Port ?? 80, monitor.TimeoutMs),
                MonitorType.HTTP => await CheckHttpAsync(monitor.Url ?? $"http://{monitor.Host}", monitor.TimeoutMs, monitor.ExpectedKeyword, monitor.ExpectedStatusCode),
                MonitorType.HTTPS => await CheckHttpAsync(monitor.Url ?? $"https://{monitor.Host}", monitor.TimeoutMs, monitor.ExpectedKeyword, monitor.ExpectedStatusCode),
                _ => (MonitorStatus.Unknown, "Unknown monitor type")
            };
        }
        catch (Exception ex)
        {
            status = MonitorStatus.Down;
            message = ex.Message;
        }

        sw.Stop();

        // failure threshold ─ a monitor only flips to Down/Warning after
        // FailureThreshold consecutive bad checks; until then the previous
        // status is kept and the message is tagged with the grace counter.
        if (status == MonitorStatus.Down || status == MonitorStatus.Warning)
        {
            monitor.ConsecutiveFailures++;
            if (monitor.ConsecutiveFailures < monitor.FailureThreshold)
            {
                // Not enough consecutive failures yet - keep previous status.
                // (The original ternary "LastStatus == Unknown ? Unknown : LastStatus"
                // reduced to exactly this.)
                status = monitor.LastStatus;
                message = $"[Grace: {monitor.ConsecutiveFailures}/{monitor.FailureThreshold}] {message}";
            }
        }
        else
        {
            monitor.ConsecutiveFailures = 0;
        }

        var check = new MonitorCheck
        {
            MonitorId = monitor.Id,
            Status = status,
            ResponseMs = sw.Elapsed.TotalMilliseconds,
            Message = message,
            CheckedAt = DateTime.UtcNow
        };

        await using var db = await _dbFactory.CreateDbContextAsync();

        // Capture the status *before* this check for edge-triggered alert rules.
        var prevStatus = monitor.LastStatus;

        db.MonitorChecks.Add(check);
        monitor.LastChecked = DateTime.UtcNow;
        monitor.LastStatus = status;
        monitor.LastResponseMs = check.ResponseMs;
        db.Monitors.Update(monitor);
        await db.SaveChangesAsync();
        await EvaluateAlertRulesAsync(monitor, check, prevStatus, db);

        return check;
    }

    /// <summary>TCP connect probe: Up if a connection is established within <paramref name="timeoutMs"/>.</summary>
    private async Task<(MonitorStatus, string?)> CheckTcpAsync(string host, int port, int timeoutMs)
    {
        using var client = new TcpClient();
        // "using" added: the CTS was previously leaked (its timer kept alive until finalization).
        using var cts = new CancellationTokenSource(timeoutMs);
        try
        {
            await client.ConnectAsync(host, port, cts.Token);
            return (MonitorStatus.Up, $"Connected to {host}:{port}");
        }
        catch (OperationCanceledException)
        {
            return (MonitorStatus.Down, $"Timeout connecting to {host}:{port}");
        }
        catch (Exception ex)
        {
            return (MonitorStatus.Down, ex.Message);
        }
    }

    /// <summary>
    /// UDP send probe. UDP is connectionless, so a successful send only proves the
    /// datagram left this host (and DNS resolved) - it does NOT prove the remote
    /// service is listening; hence failures are reported as Warning, not Down.
    /// </summary>
    private async Task<(MonitorStatus, string?)> CheckUdpAsync(string host, int port, int timeoutMs)
    {
        try
        {
            using var udp = new UdpClient();
            udp.Connect(host, port);
            var data = new byte[] { 0x00 };
            // timeoutMs was previously ignored; bound the send so the check cannot hang.
            using var cts = new CancellationTokenSource(timeoutMs);
            await udp.SendAsync(data, cts.Token);
            return (MonitorStatus.Up, $"UDP {host}:{port} reachable");
        }
        catch (Exception ex)
        {
            return (MonitorStatus.Warning, $"UDP check: {ex.Message}");
        }
    }

    /// <summary>ICMP ping check.</summary>
    private async Task<(MonitorStatus, string?)> CheckPingAsync(string host, int timeoutMs)
    {
        try
        {
            using var ping = new Ping();
            var reply = await ping.SendPingAsync(host, timeoutMs);
            if (reply.Status == IPStatus.Success)
            {
                return (MonitorStatus.Up, $"Ping {host} = {reply.RoundtripTime}ms TTL={reply.Options?.Ttl}");
            }

            return (MonitorStatus.Down, $"Ping {host}: {reply.Status}");
        }
        catch (Exception ex)
        {
            return (MonitorStatus.Down, $"Ping error: {ex.Message}");
        }
    }

    /// <summary>Checks whether a process with the given name is running on this machine.</summary>
    private (MonitorStatus, string?) CheckLocalProcess(string processName)
    {
        var procs = Process.GetProcessesByName(processName);
        if (procs.Length > 0)
        {
            return (MonitorStatus.Up, $"Process '{processName}' running (PID: {procs[0].Id})");
        }

        return (MonitorStatus.Down, $"Process '{processName}' not found");
    }

    /// <summary>
    /// HTTP(S) GET probe with optional keyword and status-code assertions.
    /// NOTE(review): certificate validation is deliberately disabled here
    /// (DangerousAcceptAnyServerCertificateValidator) - expiry is tracked
    /// separately by <see cref="CheckCertificateAsync"/>, but confirm this is intended.
    /// </summary>
    private async Task<(MonitorStatus, string?)> CheckHttpAsync(string url, int timeoutMs, string? expectedKeyword, int? expectedStatusCode)
    {
        using var handler = new HttpClientHandler
        {
            ServerCertificateCustomValidationCallback = HttpClientHandler.DangerousAcceptAnyServerCertificateValidator
        };
        using var client = new HttpClient(handler) { Timeout = TimeSpan.FromMilliseconds(timeoutMs) };
        try
        {
            // "using" added: the response (and its connection/content stream) was not disposed.
            using var resp = await client.GetAsync(url);
            var code = (int)resp.StatusCode;

            // Keyword assertion
            if (!string.IsNullOrEmpty(expectedKeyword))
            {
                var body = await resp.Content.ReadAsStringAsync();
                if (!body.Contains(expectedKeyword, StringComparison.OrdinalIgnoreCase))
                {
                    return (MonitorStatus.Down, $"HTTP {code} - keyword '{expectedKeyword}' not found");
                }
            }

            // Status code assertion: exact match wins; otherwise degrade based on class.
            if (expectedStatusCode.HasValue)
            {
                if (code == expectedStatusCode.Value)
                {
                    return (MonitorStatus.Up, $"HTTP {code} (expected)");
                }

                return code >= 200 && code < 400
                    ? (MonitorStatus.Warning, $"HTTP {code} (expected {expectedStatusCode})")
                    : (MonitorStatus.Down, $"HTTP {code} (expected {expectedStatusCode})");
            }

            // No explicit expectation: 2xx/3xx = Up, 4xx = Warning, 5xx = Down.
            if (code >= 200 && code < 400)
            {
                return (MonitorStatus.Up, $"HTTP {code}");
            }

            if (code >= 400 && code < 500)
            {
                return (MonitorStatus.Warning, $"HTTP {code}");
            }

            return (MonitorStatus.Down, $"HTTP {code}");
        }
        catch (TaskCanceledException)
        {
            // HttpClient surfaces its Timeout as TaskCanceledException.
            return (MonitorStatus.Down, "Timeout");
        }
        catch (Exception ex)
        {
            return (MonitorStatus.Down, ex.Message);
        }
    }

    /// <summary>
    /// Connects to <c>cert.Domain:cert.Port</c> via TLS, captures the presented
    /// certificate's validity window/issuer/subject/thumbprint, and persists the entry.
    /// Errors are recorded in <c>cert.LastError</c> instead of thrown.
    /// </summary>
    public async Task<CertificateEntry> CheckCertificateAsync(CertificateEntry cert)
    {
        try
        {
            using var client = new TcpClient();
            // Bound the connect: previously an unresponsive host could hang the cert loop forever.
            using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
            await client.ConnectAsync(cert.Domain, cert.Port, cts.Token);
            // Accept any certificate here - we want to inspect it even if it is invalid/expired.
            using var ssl = new SslStream(client.GetStream(), false, (_, c, _, _) => true);
            await ssl.AuthenticateAsClientAsync(cert.Domain);

            var x509 = ssl.RemoteCertificate as X509Certificate2
                       ?? new X509Certificate2(ssl.RemoteCertificate!);

            cert.ExpiresAt = x509.NotAfter.ToUniversalTime();
            cert.IssuedAt = x509.NotBefore.ToUniversalTime();
            cert.Issuer = x509.Issuer;
            cert.Subject = x509.Subject;
            cert.Thumbprint = x509.Thumbprint;
            cert.LastError = null;
        }
        catch (Exception ex)
        {
            cert.LastError = ex.Message;
        }

        cert.LastChecked = DateTime.UtcNow;

        await using var db = await _dbFactory.CreateDbContextAsync();
        db.Certificates.Update(cert);
        await db.SaveChangesAsync();
        return cert;
    }

    /// <summary>
    /// Aggregates dashboard counters. Non-admins only see active public monitors.
    /// </summary>
    public async Task<DashboardStats> GetStatsAsync(bool isAdmin)
    {
        await using var db = await _dbFactory.CreateDbContextAsync();
        var monitors = await db.Monitors.Where(m => m.IsActive && (isAdmin || m.IsPublic)).ToListAsync();
        var certs = await db.Certificates.ToListAsync();
        var now = DateTime.UtcNow;

        return new DashboardStats
        {
            TotalMonitors = monitors.Count,
            UpCount = monitors.Count(m => m.LastStatus == MonitorStatus.Up),
            DownCount = monitors.Count(m => m.LastStatus == MonitorStatus.Down),
            WarnCount = monitors.Count(m => m.LastStatus == MonitorStatus.Warning || m.LastStatus == MonitorStatus.Degraded),
            UnknownCount = monitors.Count(m => m.LastStatus == MonitorStatus.Unknown),
            CertCount = certs.Count,
            CertExpiringSoon = certs.Count(c => c.ExpiresAt.HasValue && c.ExpiresAt.Value > now && (c.ExpiresAt.Value - now).TotalDays <= 30),
            CertExpired = certs.Count(c => c.ExpiresAt.HasValue && c.ExpiresAt.Value <= now),
            TotalLogs = await db.Logs.LongCountAsync(),
            ErrorLogs = await db.Logs.LongCountAsync(l => l.Level == "error" || l.Level == "critical"),
            OverallUptime = monitors.Count > 0 ? (double)monitors.Count(m => m.LastStatus == MonitorStatus.Up) / monitors.Count * 100 : 0,
            ActiveIncidents = await db.Incidents.CountAsync(i => i.Status != IncidentStatus.Resolved),
            ResolvedIncidents = await db.Incidents.CountAsync(i => i.Status == IncidentStatus.Resolved)
        };
    }

    /// <summary>
    /// Returns uptime percentages and response time stats for a single monitor.
    /// </summary>
    /// <exception cref="KeyNotFoundException">The monitor does not exist.</exception>
    public async Task<UptimeReport> GetUptimeReportAsync(int monitorId)
    {
        await using var db = await _dbFactory.CreateDbContextAsync();
        var monitor = await db.Monitors.FindAsync(monitorId);
        if (monitor == null)
        {
            throw new KeyNotFoundException($"Monitor {monitorId} not found.");
        }

        var now = DateTime.UtcNow;

        // One query instead of three: the 24h and 7d windows are strict subsets
        // of the 30d window, so filter them in memory.
        var checks30d = await db.MonitorChecks
            .Where(c => c.MonitorId == monitorId && c.CheckedAt >= now.AddDays(-30))
            .ToListAsync();

        var checks7d = checks30d.Where(c => c.CheckedAt >= now.AddDays(-7)).ToList();
        var checks24h = checks30d.Where(c => c.CheckedAt >= now.AddHours(-24)).ToList();

        // No data in a window counts as 100% (nothing observed down).
        static double CalcUptime(List<MonitorCheck> list) =>
            list.Count == 0 ? 100.0 : (double)list.Count(c => c.Status == MonitorStatus.Up) / list.Count * 100.0;

        return new UptimeReport
        {
            MonitorId = monitorId,
            MonitorName = monitor.Name,
            Uptime24h = CalcUptime(checks24h),
            Uptime7d = CalcUptime(checks7d),
            Uptime30d = CalcUptime(checks30d),
            TotalChecks = checks30d.Count,
            UpChecks = checks30d.Count(c => c.Status == MonitorStatus.Up),
            DownChecks = checks30d.Count(c => c.Status == MonitorStatus.Down),
            AvgResponseMs = checks30d.Count > 0 ? checks30d.Average(c => c.ResponseMs) : 0
        };
    }

    /// <summary>
    /// Returns log volume bucketed by hour for the last <paramref name="hours"/> hours.
    /// </summary>
    public async Task<List<LogStatsBucket>> GetLogStatsAsync(int hours = 24)
    {
        await using var db = await _dbFactory.CreateDbContextAsync();
        var from = DateTime.UtcNow.AddHours(-hours);
        var logs = await db.Logs.Where(l => l.Timestamp >= from).ToListAsync();

        return logs
            // Truncate each timestamp to the top of its hour to form the bucket key.
            .GroupBy(l => new DateTime(l.Timestamp.Year, l.Timestamp.Month, l.Timestamp.Day, l.Timestamp.Hour, 0, 0, DateTimeKind.Utc))
            .OrderBy(g => g.Key)
            .Select(g => new LogStatsBucket
            {
                BucketStart = g.Key,
                Total = g.LongCount(),
                Errors = g.LongCount(l => l.Level == "error" || l.Level == "critical"),
                Warnings = g.LongCount(l => l.Level == "warn" || l.Level == "warning")
            })
            .ToList();
    }

    /// <summary>
    /// Evaluates enabled alert rules (monitor-specific and global) against a fresh
    /// check result: fires webhooks, optionally auto-creates incidents, and stamps
    /// each fired rule's cooldown. Changes are saved once at the end.
    /// </summary>
    private async Task EvaluateAlertRulesAsync(Monitor monitor, MonitorCheck check, MonitorStatus prevStatus, DatabaseContext db)
    {
        var rules = await db.AlertRules
            .Where(r => r.IsEnabled && (r.MonitorId == monitor.Id || r.MonitorId == null))
            .ToListAsync();

        var globalWebhook = await db.Settings.FirstOrDefaultAsync(s => s.Key == "AlertWebhookUrl");
        var webhookUrl = globalWebhook?.Value;

        // Hoisted out of the loop: this setting was previously re-queried per fired rule.
        var autoIncidents = await db.Settings.FirstOrDefaultAsync(s => s.Key == "AutoCreateIncidents");
        var autoCreateIncidents = autoIncidents?.Value == "true";

        var anyFired = false;

        foreach (var rule in rules)
        {
            // IsDown/IsUp are edge-triggered (fire only on a status transition);
            // ResponseAboveMs is level-triggered but rate-limited by the cooldown below.
            bool fired = rule.Condition switch
            {
                AlertRuleCondition.IsDown => check.Status == MonitorStatus.Down && prevStatus != MonitorStatus.Down,
                AlertRuleCondition.IsUp => check.Status == MonitorStatus.Up && prevStatus == MonitorStatus.Down,
                AlertRuleCondition.ResponseAboveMs => check.ResponseMs > (rule.ThresholdValue ?? double.MaxValue),
                AlertRuleCondition.CertExpiresWithinDays => false, // evaluated by cert loop separately
                _ => false
            };

            if (!fired)
            {
                continue;
            }

            // Cooldown check
            if (rule.LastFiredAt.HasValue &&
                (DateTime.UtcNow - rule.LastFiredAt.Value).TotalMinutes < rule.CooldownMinutes)
            {
                continue;
            }

            rule.LastFiredAt = DateTime.UtcNow;
            db.AlertRules.Update(rule);
            anyFired = true;

            // Auto-create incident when a monitor goes down
            if (autoCreateIncidents && rule.Condition == AlertRuleCondition.IsDown)
            {
                var incident = new Incident
                {
                    Title = $"{monitor.Name} is down",
                    Body = check.Message,
                    Severity = IncidentSeverity.Major,
                    Status = IncidentStatus.Investigating,
                    MonitorId = monitor.Id,
                    IsPublic = monitor.IsPublic
                };
                db.Incidents.Add(incident);
            }

            // Fire webhook (fire-and-forget; failures are logged, not propagated).
            var target = rule.WebhookUrl ?? webhookUrl;
            if (!string.IsNullOrEmpty(target))
            {
                _ = Task.Run(() => FireWebhookAsync(target, monitor, check, rule.Condition));
            }
        }

        // Single save instead of one round-trip per fired rule.
        if (anyFired)
        {
            await db.SaveChangesAsync();
        }
    }

    /// <summary>
    /// POSTs a JSON alert payload to <paramref name="url"/> using the shared client.
    /// Delivery failures are logged as warnings and never thrown.
    /// </summary>
    private async Task FireWebhookAsync(string url, Monitor monitor, MonitorCheck check, AlertRuleCondition condition)
    {
        try
        {
            var payload = JsonSerializer.Serialize(new
            {
                monitorId = monitor.Id,
                monitorName = monitor.Name,
                condition = condition.ToString(),
                status = check.Status.ToString(),
                responseMs = check.ResponseMs,
                message = check.Message,
                checkedAt = check.CheckedAt.ToString("o")
            });
            await _webhookClient.PostAsync(url, new StringContent(payload, Encoding.UTF8, "application/json"));
        }
        catch (Exception ex)
        {
            _log.LogWarning("Webhook delivery to {Url} failed: {Msg}", url, ex.Message);
        }
    }
}
/// <summary>
/// Background loop that schedules monitor checks, hourly certificate checks,
/// and the hourly log-retention purge. Each due check runs on its own task with
/// its own DI scope so a slow probe never blocks the scheduler.
/// </summary>
public class MonitoringBackgroundService : BackgroundService
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<MonitoringBackgroundService> _log;

    // Timestamp of the last retention purge. The previous trigger
    // (now.Minute == 0 with a 10 s loop delay) executed the purge up to
    // six times at the top of every hour.
    private DateTime _lastPurge = DateTime.UtcNow;

    public MonitoringBackgroundService(IServiceScopeFactory scopeFactory, ILogger<MonitoringBackgroundService> log)
    {
        _scopeFactory = scopeFactory;
        _log = log;
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                using var scope = _scopeFactory.CreateScope();
                var dbFactory = scope.ServiceProvider.GetRequiredService<IDbContextFactory<DatabaseContext>>();

                await using var db = await dbFactory.CreateDbContextAsync(stoppingToken);
                var monitors = await db.Monitors.Where(m => m.IsActive).ToListAsync(stoppingToken);
                var now = DateTime.UtcNow;

                // Dispatch checks for monitors whose interval has elapsed (or that
                // have never been checked). Fire-and-forget on purpose: the scheduler
                // must keep ticking even if a probe is slow.
                foreach (var m in monitors)
                {
                    if (m.LastChecked == null || (now - m.LastChecked.Value).TotalSeconds >= m.IntervalSeconds)
                    {
                        var captured = m;
                        _ = Task.Run(async () =>
                        {
                            using var checkScope = _scopeFactory.CreateScope();
                            var svc = checkScope.ServiceProvider.GetRequiredService<MonitoringService>();
                            await svc.CheckMonitorAsync(captured);
                        }, stoppingToken);
                    }
                }

                // Check certs every hour
                var certs = await db.Certificates.ToListAsync(stoppingToken);
                foreach (var c in certs)
                {
                    if (c.LastChecked == null || (now - c.LastChecked.Value).TotalHours >= 1)
                    {
                        var captured = c;
                        _ = Task.Run(async () =>
                        {
                            using var certScope = _scopeFactory.CreateScope();
                            var svc = certScope.ServiceProvider.GetRequiredService<MonitoringService>();
                            await svc.CheckCertificateAsync(captured);
                        }, stoppingToken);
                    }
                }

                // Log retention purge - run once per hour, tracked by timestamp so
                // it cannot fire repeatedly within the same minute.
                if ((now - _lastPurge).TotalHours >= 1)
                {
                    _lastPurge = now;
                    using var purgeScope = _scopeFactory.CreateScope();
                    var ingest = purgeScope.ServiceProvider.GetRequiredService<IngestionService>();
                    var auth = purgeScope.ServiceProvider.GetRequiredService<AuthenticationService>();
                    var days = int.TryParse(await auth.GetSettingAsync("MaxLogRetentionDays", "30"), out var d) ? d : 30;
                    await ingest.PurgeOldLogsAsync(days);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Graceful shutdown - don't log cancellation as an error.
                break;
            }
            catch (Exception ex)
            {
                _log.LogError(ex, "Error in monitor loop");
            }

            await Task.Delay(10_000, stoppingToken);
        }
    }
}
/// <summary>
/// Persists incoming log entries (singly or in batches) and enforces the
/// configured retention window.
/// </summary>
public class IngestionService
{
    private readonly IDbContextFactory<DatabaseContext> _contextFactory;

    public IngestionService(IDbContextFactory<DatabaseContext> dbFactory)
        => _contextFactory = dbFactory;

    /// <summary>Stores a single log entry.</summary>
    public async Task IngestAsync(LogEntry entry)
    {
        await using var context = await _contextFactory.CreateDbContextAsync();
        context.Logs.Add(entry);
        await context.SaveChangesAsync();
    }

    /// <summary>Stores a batch of log entries in one round-trip.</summary>
    public async Task IngestBatchAsync(IEnumerable<LogEntry> entries)
    {
        await using var context = await _contextFactory.CreateDbContextAsync();
        context.Logs.AddRange(entries);
        await context.SaveChangesAsync();
    }

    /// <summary>Deletes all log entries older than <paramref name="retentionDays"/> days.</summary>
    public async Task PurgeOldLogsAsync(int retentionDays)
    {
        var threshold = DateTime.UtcNow.AddDays(-retentionDays);
        await using var context = await _contextFactory.CreateDbContextAsync();
        // Server-side bulk delete: avoids materializing rows into the change tracker.
        await context.Logs.Where(entry => entry.Timestamp < threshold).ExecuteDeleteAsync();
    }
}
|