nordabiz/scripts/uptimerobot_sync.py
Maciej Pienczyn 9540f7f2e0
Some checks are pending
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: add uptime monitoring dashboard with UptimeRobot integration
External monitoring via UptimeRobot (free tier) with internal health
logger to differentiate ISP outages from server issues. Includes:
- 4 new DB models (UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog)
- Migration 082 with tables, indexes, and permissions
- Internal health logger script (cron */5 min)
- UptimeRobot sync script (cron hourly) with automatic cause correlation
- Admin dashboard /admin/uptime with uptime %, response time charts,
  incident log with editable notes/causes, pattern analysis, monthly report
- SLA comparison table (99.9%/99.5%/99%)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 07:53:05 +01:00

282 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""
UptimeRobot Sync
================
Cron job (0 * * * *) - synchronizes data from the UptimeRobot API every hour.
Fetches response times and up/down logs, and correlates incidents with
internal_health_logs to tell ISP outages apart from server failures.
Usage:
0 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) UPTIMEROBOT_API_KEY=$(grep UPTIMEROBOT_API_KEY .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/uptimerobot_sync.py
Required env:
UPTIMEROBOT_API_KEY - API key from UptimeRobot (Main API Key)
"""
import os
import sys
import json
import urllib.request
import urllib.error
from datetime import datetime, timedelta

# Make the project root importable when this runs as a standalone cron script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from database import (
    SessionLocal, UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog
)

API_KEY = os.environ.get('UPTIMEROBOT_API_KEY', '')  # empty when unset; main() aborts with exit 1
API_BASE = 'https://api.uptimerobot.com/v2'
RETENTION_DAYS = 90  # uptime_checks older than this are purged by cleanup_old_checks()

# UptimeRobot monitor status codes (UptimeRobot API v2).
# NOTE(review): currently unused in this script — presumably kept for the
# dashboard / future use; confirm before removing.
UR_STATUS = {
    0: 'paused',
    1: 'not_checked',
    2: 'up',
    8: 'seems_down',
    9: 'down',
}
def api_request(endpoint, extra_params=None):
    """POST a request to the UptimeRobot API v2.

    Args:
        endpoint: API method name appended to API_BASE (e.g. 'getMonitors').
        extra_params: optional dict merged over the base params.

    Returns:
        Parsed JSON response dict, or None on any network/HTTP/decode
        failure (logged to stderr so cron output shows the error).
    """
    params = {
        'api_key': API_KEY,
        'format': 'json',
    }
    if extra_params:
        params.update(extra_params)
    # UptimeRobot v2 accepts a JSON body when Content-Type is application/json.
    data = json.dumps(params).encode('utf-8')
    req = urllib.request.Request(
        f'{API_BASE}/{endpoint}',
        data=data,
        headers={'Content-Type': 'application/json'},
        method='POST'
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError,
            UnicodeDecodeError) as e:
        # Narrowed from a bare `except Exception`: transient network/API
        # failures still degrade gracefully to None, but programming errors
        # are no longer silently swallowed.
        print(f"API error ({endpoint}): {e}", file=sys.stderr)
        return None
def sync_monitors(db):
    """Synchronize the monitor list from UptimeRobot.

    Fetches all monitors (with 1 latest response-time sample and up to 50
    log entries each), upserts UptimeMonitor rows, then delegates sample
    and incident persistence to sync_response_times() / sync_logs().
    Commits the session once at the end.

    Args:
        db: SQLAlchemy session.

    Returns:
        List of synced UptimeMonitor ORM objects (empty on API failure).
    """
    result = api_request('getMonitors', {
        'response_times': 1,
        'response_times_limit': 1,
        'logs': 1,
        'logs_limit': 50,
        'all_time_uptime_ratio': 1,
        'custom_uptime_ratios': '1-7-30-90',
    })
    if not result or result.get('stat') != 'ok':
        print(f"Błąd API getMonitors: {result}", file=sys.stderr)
        return []
    monitors = result.get('monitors', [])
    synced = []
    for m in monitors:
        ur_id = m['id']
        # Upsert keyed on the UptimeRobot monitor id.
        monitor = db.query(UptimeMonitor).filter_by(uptimerobot_id=ur_id).first()
        if not monitor:
            monitor = UptimeMonitor(
                uptimerobot_id=ur_id,
                name=m.get('friendly_name', ''),
                url=m.get('url', ''),
                check_interval_sec=m.get('interval', 300),
                created_at=datetime.now()
            )
            db.add(monitor)
            # Flush so monitor.id is assigned before child rows reference it.
            db.flush()
            print(f"Nowy monitor: {monitor.name} ({monitor.url})")
        else:
            # Refresh mutable fields; fall back to the stored value if the
            # API response omits one.
            monitor.name = m.get('friendly_name', monitor.name)
            monitor.url = m.get('url', monitor.url)
        # Persist response-time samples as uptime_checks rows.
        sync_response_times(db, monitor, m.get('response_times', []))
        # Convert up/down log events into incidents + checks.
        sync_logs(db, monitor, m.get('logs', []))
        synced.append(monitor)
    db.commit()
    return synced
def sync_response_times(db, monitor, response_times):
    """Store UptimeRobot response-time samples as uptime_checks rows.

    Args:
        db: SQLAlchemy session (rows are added, commit happens in caller).
        monitor: UptimeMonitor ORM object the samples belong to.
        response_times: list of {'datetime': epoch_sec, 'value': ms} dicts.
    """
    if not response_times:
        return
    timestamps = [datetime.fromtimestamp(rt['datetime']) for rt in response_times]
    # Fetch every already-stored timestamp in the batch window with ONE
    # query instead of one existence query per sample (the original did
    # N+1 round trips for samples the hourly cron re-reads every run).
    existing = {
        row[0] for row in db.query(UptimeCheck.checked_at).filter(
            UptimeCheck.monitor_id == monitor.id,
            UptimeCheck.checked_at >= min(timestamps),
            UptimeCheck.checked_at <= max(timestamps),
        ).all()
    }
    for rt, ts in zip(response_times, timestamps):
        if ts in existing:
            continue  # already persisted by a previous sync
        db.add(UptimeCheck(
            monitor_id=monitor.id,
            checked_at=ts,
            status='up',  # a response-time sample implies the check succeeded
            response_time_ms=rt.get('value', 0),
        ))
        # Guard against duplicate timestamps within the same batch.
        existing.add(ts)
def sync_logs(db, monitor, logs):
    """Turn UptimeRobot up/down log entries into incidents and checks.

    DOWN entries (type 1) create or close out UptimeIncident rows and add
    a 'down' UptimeCheck; UP entries (type 2) add an 'up' UptimeCheck.

    Fix vs. original: log-derived checks are now deduplicated per
    (monitor, timestamp, status). The hourly cron re-reads the same
    50-entry log window on every run, so without this guard each run
    inserted duplicate 'down'/'up' rows (the response-time path already
    deduplicated; this path did not).

    Args:
        db: SQLAlchemy session (rows are added, commit happens in caller).
        monitor: UptimeMonitor ORM object the log entries belong to.
        logs: list of UptimeRobot log dicts with 'type', 'datetime', 'duration'.
    """
    if not logs:
        return

    def _add_check(status, ts):
        # Insert a check row only once for a given timestamp/status.
        exists = db.query(UptimeCheck).filter_by(
            monitor_id=monitor.id,
            checked_at=ts,
            status=status,
        ).first()
        if not exists:
            db.add(UptimeCheck(
                monitor_id=monitor.id,
                checked_at=ts,
                status=status,
                response_time_ms=None,
            ))

    for log in logs:
        log_type = log.get('type', 0)
        ts = datetime.fromtimestamp(log['datetime'])
        duration = log.get('duration', 0)
        if log_type == 1:  # DOWN event
            existing = db.query(UptimeIncident).filter(
                UptimeIncident.monitor_id == monitor.id,
                UptimeIncident.started_at == ts
            ).first()
            if existing:
                # Known incident: close it out once UptimeRobot reports a
                # final duration, and (re)correlate the cause.
                if duration > 0 and not existing.ended_at:
                    existing.ended_at = ts + timedelta(seconds=duration)
                    existing.duration_seconds = duration
                    existing.auto_resolved = True
                    existing.cause = correlate_cause(db, ts, duration)
                continue
            ended_at = ts + timedelta(seconds=duration) if duration > 0 else None
            incident = UptimeIncident(
                monitor_id=monitor.id,
                started_at=ts,
                ended_at=ended_at,
                duration_seconds=duration if duration > 0 else None,
                # An ongoing outage (duration 0) cannot be correlated yet.
                cause=correlate_cause(db, ts, duration) if duration > 0 else 'unknown',
                auto_resolved=duration > 0
            )
            db.add(incident)
            _add_check('down', ts)
        elif log_type == 2:  # UP (recovery) event
            _add_check('up', ts)
def correlate_cause(db, incident_start, duration_seconds):
    """Classify an incident by cross-checking internal health logs.

    Decision rules:
    - unknown/zero duration                             -> 'unknown'
    - no internal logs overlap the incident window      -> 'infra'
      (the whole box was down, health logger included)
    - every overlapping log has app_ok=True             -> 'isp'
      (the app ran fine locally, so the outage was connectivity)
    - any overlapping log has app_ok=False              -> 'server'

    Args:
        db: SQLAlchemy session.
        incident_start: datetime when the incident began.
        duration_seconds: incident length; None/0/negative = unresolved.

    Returns:
        One of 'isp', 'server', 'infra', 'unknown'.
    """
    if not duration_seconds or duration_seconds <= 0:
        return 'unknown'
    window_end = incident_start + timedelta(seconds=duration_seconds)
    # Widen the window by 5 minutes on each side so the */5-min health
    # cron has a chance to land a sample inside it.
    pad = timedelta(minutes=5)
    samples = db.query(InternalHealthLog).filter(
        InternalHealthLog.checked_at >= incident_start - pad,
        InternalHealthLog.checked_at <= window_end + pad
    ).all()
    if not samples:
        return 'infra'
    if all(entry.app_ok for entry in samples):
        return 'isp'
    # At least one failed sample: full or partial app outage either way.
    return 'server'
def cleanup_old_checks(db):
    """Purge uptime_checks rows older than RETENTION_DAYS (90 days).

    Issues a bulk DELETE via the query API and reports the row count;
    the commit is left to the caller.
    """
    threshold = datetime.now() - timedelta(days=RETENTION_DAYS)
    removed = (
        db.query(UptimeCheck)
        .filter(UptimeCheck.checked_at < threshold)
        .delete()
    )
    if removed:
        print(f"Usunięto {removed} starych uptime checks (>{RETENTION_DAYS} dni)")
def main():
    """Cron entry point: sync UptimeRobot data, then daily housekeeping.

    Exits with status 1 when the API key is missing or the sync raises;
    the DB session is rolled back on error and always closed.
    """
    if not API_KEY:
        print("BŁĄD: Brak UPTIMEROBOT_API_KEY w zmiennych środowiskowych", file=sys.stderr)
        print("Ustaw klucz API w .env: UPTIMEROBOT_API_KEY=ur...", file=sys.stderr)
        sys.exit(1)
    db = SessionLocal()
    try:
        print(f"[{datetime.now()}] Synchronizacja UptimeRobot...")
        monitors = sync_monitors(db)
        print(f"Zsynchronizowano {len(monitors)} monitorów")
        # Run retention cleanup only on the 04:00 invocation; cron fires at
        # minute 0, so `minute < 5` matches that run once per day.
        now = datetime.now()
        if now.hour == 4 and now.minute < 5:
            cleanup_old_checks(db)
            # Commit the bulk delete (sync_monitors commits its own work).
            db.commit()
        print(f"[{datetime.now()}] Synchronizacja zakończona")
    except Exception as e:
        # Top-level boundary: log, roll back, signal failure to cron.
        print(f"ERROR: {e}", file=sys.stderr)
        db.rollback()
        sys.exit(1)
    finally:
        db.close()
if __name__ == '__main__':  # allow importing this module without side effects
    main()