stamina: loop continuously over general exceptions, with ~regular alerts
author: Frédéric Péters <fpeters@0d.be>
Sat, 18 Jul 2020 08:58:06 +0000 (10:58 +0200)
committer: Frédéric Péters <fpeters@0d.be>
Sat, 18 Jul 2020 08:58:06 +0000 (10:58 +0200)
nonstop/management/commands/stamina.py

index dd8e5e4..b604126 100644 (file)
@@ -5,6 +5,7 @@ import logging
 import random
 import signal
 import sys
+import time
 
 import requests
 
@@ -26,10 +27,50 @@ class Command(BaseCommand):
     quit = False
 
     def handle(self, verbosity, **kwargs):
-        try:
-            asyncio.run(self.main(), debug=settings.DEBUG)
-        except KeyboardInterrupt:
-            pass
+        alert_index = 0  # position in the exception_alert_thresholds() sequence
+        latest_alert_timestamp = 0  # time.time() of the last logged alert (0: none yet, or reset)
+        latest_exception_timestamp = 0  # time.time() of the last caught exception (0: none yet)
+
+        def exception_alert_thresholds():  # seconds: 0, 3, 15, 75, 375, 1875, then 3600, 7200, ...
+            yield 0  # the first exception of a series is always reported
+            duration = 3
+            while duration < 3600:  # grow by a factor of 5 while below one hour
+                yield duration
+                duration *= 5
+            duration = 3600
+            while True:  # then grow by one hour each time, without end
+                yield duration
+                duration += 3600
+
+        while True:  # run main() again after each general exception
+            try:
+                asyncio.run(self.main(), debug=settings.DEBUG)
+            except KeyboardInterrupt:
+                break  # interrupted: leave the loop without retrying
+            except Exception:
+                timestamp = time.time()
+                if (timestamp - latest_exception_timestamp) > 300:
+                    # the previous exception was more than 300 seconds ago: assume
+                    # things went smoothly in between and restart the alert sequence
+                    alert_index = 0
+                    latest_alert_timestamp = 0
+                    latest_exception_timestamp = 0  # reassigned to timestamp before the retry
+
+                alert_threshold = 0
+                for i, threshold in enumerate(exception_alert_thresholds()):  # find the alert_index-th value
+                    if i == alert_index:
+                        alert_threshold = threshold
+                        break
+
+                if (timestamp - latest_alert_timestamp) > alert_threshold:  # otherwise the exception is not logged
+                    logger.exception('General exception (alert index: %s)', alert_index)
+                    latest_alert_timestamp = timestamp
+                    alert_index += 1  # the next alert uses the next threshold in the sequence
+
+                time.sleep(2)  # retry after a bit
+                latest_exception_timestamp = timestamp
+                continue
+            break  # main() returned without raising: stop looping
 
     def get_playlist(self, zone, start_datetime, end_datetime):
         current_datetime = start_datetime