Some checks are pending
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
External monitoring via UptimeRobot (free tier) with internal health logger to differentiate ISP outages from server issues. Includes: - 4 new DB models (UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog) - Migration 082 with tables, indexes, and permissions - Internal health logger script (cron */5 min) - UptimeRobot sync script (cron hourly) with automatic cause correlation - Admin dashboard /admin/uptime with uptime %, response time charts, incident log with editable notes/causes, pattern analysis, monthly report - SLA comparison table (99.9%/99.5%/99%) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
282 lines
8.3 KiB
Python
282 lines
8.3 KiB
Python
#!/usr/bin/env python3
"""
UptimeRobot Sync
================

Cron job (0 * * * *) - synchronizes data from the UptimeRobot API every hour.
Fetches response times and up/down logs, and correlates them with
internal_health_logs to tell ISP outages apart from server problems.

Usage:

0 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) UPTIMEROBOT_API_KEY=$(grep UPTIMEROBOT_API_KEY .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/uptimerobot_sync.py

Required env:

UPTIMEROBOT_API_KEY - API key from UptimeRobot (Main API Key)
"""
|
|
|
|
import os
import sys
import json
import urllib.request
import urllib.error
from datetime import datetime, timedelta

# Make the project root importable when this file runs as a script from scripts/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from database import (
    SessionLocal, UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog
)

# Main API key, injected via the cron command line (see module docstring).
API_KEY = os.environ.get('UPTIMEROBOT_API_KEY', '')
API_BASE = 'https://api.uptimerobot.com/v2'
# uptime_checks rows older than this many days are purged by cleanup_old_checks().
RETENTION_DAYS = 90

# UptimeRobot status codes
# NOTE(review): this mapping is not referenced anywhere in this file —
# possibly used by importers elsewhere, or dead code; confirm before removing.
UR_STATUS = {
    0: 'paused',
    1: 'not_checked',
    2: 'up',
    8: 'seems_down',
    9: 'down',
}
|
|
|
|
|
|
def api_request(endpoint, extra_params=None):
    """POST a request to the UptimeRobot API v2.

    Args:
        endpoint: API method name, e.g. 'getMonitors'.
        extra_params: optional dict merged into the request payload.

    Returns:
        Parsed JSON response as a dict, or None on any network/HTTP/decode
        error. Errors are logged to stderr; callers must handle None.
    """
    params = {
        'api_key': API_KEY,
        'format': 'json',
    }
    if extra_params:
        params.update(extra_params)

    # NOTE(review): the payload is sent as a JSON body; official UptimeRobot
    # examples use application/x-www-form-urlencoded. This apparently works
    # in production — confirm against the v2 docs if requests start failing.
    data = json.dumps(params).encode('utf-8')
    req = urllib.request.Request(
        f'{API_BASE}/{endpoint}',
        data=data,
        headers={'Content-Type': 'application/json'},
        method='POST'
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except (urllib.error.URLError, OSError, ValueError) as e:
        # URLError covers HTTPError and DNS/connection failures; OSError
        # covers timeouts/resets; ValueError covers malformed JSON bodies.
        # Narrowed from a blanket `except Exception` so programming errors
        # (e.g. TypeError) are no longer silently swallowed.
        print(f"API error ({endpoint}): {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def sync_monitors(db):
    """Synchronize the monitor list from UptimeRobot.

    Upserts one UptimeMonitor row per remote monitor, then pulls its
    response-time samples and up/down logs. Commits at the end.

    Returns:
        List of synced UptimeMonitor rows; empty list on API failure.
    """
    payload = {
        'response_times': 1,
        'response_times_limit': 1,
        'logs': 1,
        'logs_limit': 50,
        'all_time_uptime_ratio': 1,
        'custom_uptime_ratios': '1-7-30-90',
    }
    result = api_request('getMonitors', payload)

    if not result or result.get('stat') != 'ok':
        print(f"Błąd API getMonitors: {result}", file=sys.stderr)
        return []

    synced = []
    for entry in result.get('monitors', []):
        remote_id = entry['id']

        # Upsert: create on first sight, otherwise refresh name/url.
        row = db.query(UptimeMonitor).filter_by(uptimerobot_id=remote_id).first()
        if row is None:
            row = UptimeMonitor(
                uptimerobot_id=remote_id,
                name=entry.get('friendly_name', ''),
                url=entry.get('url', ''),
                check_interval_sec=entry.get('interval', 300),
                created_at=datetime.now(),
            )
            db.add(row)
            db.flush()  # assign row.id before the child-table syncs below
            print(f"Nowy monitor: {row.name} ({row.url})")
        else:
            row.name = entry.get('friendly_name', row.name)
            row.url = entry.get('url', row.url)

        # Persist response-time samples as uptime_checks rows.
        sync_response_times(db, row, entry.get('response_times', []))

        # Turn up/down log events into incidents and status checks.
        sync_logs(db, row, entry.get('logs', []))

        synced.append(row)

    db.commit()
    return synced
|
|
|
|
|
|
def sync_response_times(db, monitor, response_times):
    """Persist UptimeRobot response-time samples as uptime_checks rows.

    Skips (monitor_id, checked_at) pairs already present so the hourly
    cron can safely reprocess overlapping windows. Does not commit;
    the caller commits.

    Args:
        db: SQLAlchemy session.
        monitor: UptimeMonitor row the samples belong to.
        response_times: list of {'datetime': epoch_sec, 'value': ms}
            dicts from the getMonitors response.
    """
    if not response_times:
        return

    # NOTE(review): fromtimestamp() yields naive local time — assumed
    # consistent with the rest of the schema; confirm if server TZ changes.
    timestamps = [datetime.fromtimestamp(rt['datetime']) for rt in response_times]

    # One IN-query for all existing rows instead of one query per sample
    # (the previous per-row .first() lookups were O(n) round-trips).
    existing = {
        row.checked_at
        for row in db.query(UptimeCheck.checked_at).filter(
            UptimeCheck.monitor_id == monitor.id,
            UptimeCheck.checked_at.in_(timestamps),
        )
    }

    for rt, ts in zip(response_times, timestamps):
        if ts in existing:
            continue
        existing.add(ts)  # also dedupe within this batch
        db.add(UptimeCheck(
            monitor_id=monitor.id,
            checked_at=ts,
            status='up',  # a response-time sample implies the check was up
            response_time_ms=rt.get('value', 0),
        ))
|
|
|
|
|
|
def sync_logs(db, monitor, logs):
    """Convert UptimeRobot up/down log events into incidents and checks.

    DOWN events (type 1) create an UptimeIncident — or close an open one
    once a final duration is known — plus a 'down' UptimeCheck. UP events
    (type 2) record an 'up' recovery UptimeCheck. All inserts are
    deduplicated so the hourly cron can safely reprocess the same
    50-entry log window. Does not commit; the caller commits.
    """
    if not logs:
        return

    def _check_exists(ts):
        # Guard against re-inserting the same check on every hourly run.
        return db.query(UptimeCheck).filter_by(
            monitor_id=monitor.id,
            checked_at=ts,
        ).first() is not None

    for log in logs:
        log_type = log.get('type', 0)
        ts = datetime.fromtimestamp(log['datetime'])
        duration = log.get('duration', 0)

        if log_type == 1:  # DOWN
            # Does this incident already exist from a previous run?
            existing = db.query(UptimeIncident).filter(
                UptimeIncident.monitor_id == monitor.id,
                UptimeIncident.started_at == ts
            ).first()

            if existing:
                # Close it out once UptimeRobot reports a final duration.
                if duration > 0 and not existing.ended_at:
                    existing.ended_at = ts + timedelta(seconds=duration)
                    existing.duration_seconds = duration
                    existing.auto_resolved = True
                    # Correlate the cause against internal health logs.
                    existing.cause = correlate_cause(db, ts, duration)
                continue

            ended_at = ts + timedelta(seconds=duration) if duration > 0 else None

            incident = UptimeIncident(
                monitor_id=monitor.id,
                started_at=ts,
                ended_at=ended_at,
                duration_seconds=duration if duration > 0 else None,
                cause=correlate_cause(db, ts, duration) if duration > 0 else 'unknown',
                auto_resolved=duration > 0
            )
            db.add(incident)

            # Record the outage start as a 'down' check (deduplicated).
            if not _check_exists(ts):
                db.add(UptimeCheck(
                    monitor_id=monitor.id,
                    checked_at=ts,
                    status='down',
                    response_time_ms=None,
                ))

        elif log_type == 2:  # UP (recovery)
            # BUG FIX: this row was previously inserted unconditionally, so
            # every hourly run duplicated recovery checks for events still
            # inside the 50-entry log window.
            if not _check_exists(ts):
                db.add(UptimeCheck(
                    monitor_id=monitor.id,
                    checked_at=ts,
                    status='up',
                    response_time_ms=None,
                ))
|
|
|
|
|
|
def correlate_cause(db, incident_start, duration_seconds):
    """Classify an incident's cause from internal health logs.

    Inspects internal_health_logs inside the incident window (widened by
    a 5-minute margin on each side):

      - every sample has app_ok=True  -> 'isp'    (server fine, link down)
      - some/all samples app_ok=False -> 'server' (app itself was down)
      - no samples at all             -> 'infra'  (whole host unreachable)
      - missing/zero duration         -> 'unknown'
    """
    if not duration_seconds or duration_seconds <= 0:
        return 'unknown'

    window_end = incident_start + timedelta(seconds=duration_seconds)
    margin = timedelta(minutes=5)

    # Health samples overlapping the incident window (with margin).
    samples = db.query(InternalHealthLog).filter(
        InternalHealthLog.checked_at >= incident_start - margin,
        InternalHealthLog.checked_at <= window_end + margin
    ).all()

    if not samples:
        # No samples at all = the whole host was gone (power, FortiGate, ...).
        return 'infra'

    healthy = sum(1 for s in samples if s.app_ok)

    if healthy == len(samples):
        # App ran normally the whole time -> the internet link (ISP) failed.
        return 'isp'
    # App was down for some or all samples -> server-side problem
    # (a mixed window is treated as 'server' too: partial failure).
    return 'server'
|
|
|
|
|
|
def cleanup_old_checks(db):
    """Bulk-delete uptime_checks rows older than RETENTION_DAYS days."""
    threshold = datetime.now() - timedelta(days=RETENTION_DAYS)
    removed = (
        db.query(UptimeCheck)
        .filter(UptimeCheck.checked_at < threshold)
        .delete()
    )
    if removed:
        print(f"Usunięto {removed} starych uptime checks (>{RETENTION_DAYS} dni)")
|
|
|
|
|
|
def main():
    """Entry point for the hourly cron: sync, then daily cleanup at 04:00.

    Exits with status 1 when the API key is missing or the sync fails.
    """
    if not API_KEY:
        for msg in (
            "BŁĄD: Brak UPTIMEROBOT_API_KEY w zmiennych środowiskowych",
            "Ustaw klucz API w .env: UPTIMEROBOT_API_KEY=ur...",
        ):
            print(msg, file=sys.stderr)
        sys.exit(1)

    db = SessionLocal()
    try:
        print(f"[{datetime.now()}] Synchronizacja UptimeRobot...")

        monitors = sync_monitors(db)
        print(f"Zsynchronizowano {len(monitors)} monitorów")

        # Purge old checks once a day: cron fires at minute 0, so the
        # `minute < 5` guard matches only the 04:00 run.
        now = datetime.now()
        if now.hour == 4 and now.minute < 5:
            cleanup_old_checks(db)

        db.commit()
        print(f"[{datetime.now()}] Synchronizacja zakończona")

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        db.rollback()
        sys.exit(1)
    finally:
        db.close()


if __name__ == '__main__':
    main()
|