Batch periodic tests #1526

Draft · wants to merge 7 commits into base: main
1 change: 1 addition & 0 deletions docker/batch-test.env
@@ -114,5 +114,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domains (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
24 changes: 22 additions & 2 deletions docker/compose.yaml
@@ -109,7 +109,7 @@ services:
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: app
# hostname: app
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -233,6 +233,22 @@ services:
# time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
# an insufficiently long grace period causes issues on batch when tasks are killed during the hourly worker restart
stop_grace_period: 10m
# SIGTERM is default, but make it explicit
stop_signal: SIGTERM

# open question: celery appears to accept tasks during warm shutdown, see the log excerpt below

# Session terminated, killing shell...
# worker: Warm shutdown (MainProcess)
# [2024-10-29 18:45:01,807: INFO/MainProcess] Task checks.tasks.ipv6.batch_web[506e07c1-a208-4a52-a578-e95cdad9fdf0] received
# ...killed.
# celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=300 --concurrency=500 --queues nassl_worker,batch_nassl
# ENABLE_BATCH is set for this server but the database is lacking the required indexes. Consider running `manage.py api_create_db_indexes`.
# Batch enabled.

# open question: how does the healthcheck behave during warm shutdown? (see the sketch after this file's diff)



depends_on:
db-migrate:
@@ -257,7 +273,7 @@ services:
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: worker
# hostname: worker
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -340,6 +356,7 @@ services:

command: celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=600 --concurrency=$WORKER_SLOW_CONCURRENCY
--queues slow_db_worker,batch_slow
# hostname: worker-slow

beat:
image: ${DOCKER_IMAGE_APP:-${DOCKER_REGISTRY:-ghcr.io/internetstandards}/internet.nl:${RELEASE}}
@@ -652,6 +669,7 @@ services:
- CRON_DAILY_DELETE_BATCH_RESULTS
- CRON_15MIN_RUN_TESTS
- CRON_DAILY_TRUNCATE_EXPORTER_LOGS
- CRON_15MIN_RUN_TESTS_BATCH
- IPV4_IP_APP_INTERNAL
- INTERNETNL_DOMAINNAME
- INTERNETNL_CACHE_TTL
@@ -701,6 +719,8 @@ services:
- WORKER_SLOW_REPLICAS
- DOCKER_REGISTRY
- CRON_DAILY_DATABASE_CLEANUP
- WORKER_REPLICAS
- RELEASE

restart: unless-stopped
logging:
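Regarding the open warm-shutdown questions noted in the worker service above: one way to investigate is to send the TERM signal by hand and then query the worker during the grace period. A rough sketch, assuming WORKER holds a worker container id and the celery@<container id> node naming matches the restart scripts below:

# pick one worker container and trigger a warm shutdown (sketch)
WORKER=$(docker ps --filter label=com.docker.compose.service=worker --quiet | head -n1)
docker kill --signal=SIGTERM "$WORKER"
# during the grace period, ask the shutting-down worker which tasks it is still executing
docker exec "$WORKER" celery --app=internetnl inspect active --destination "celery@$WORKER" --timeout 10 || true
# and read the compose healthcheck status docker currently reports for it
docker inspect "$WORKER" | jq '.[0].State.Health.Status'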
12 changes: 10 additions & 2 deletions docker/cron-docker/periodic/15min/restart_nassl_worker
@@ -1,4 +1,12 @@
#!/bin/sh
set -e
# find nassl worker and restart the container(s)
docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | xargs --no-run-if-empty docker restart
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true

# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
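The timeout/jq one-liner that waits for the restarted container to report healthy again is fairly dense; an expanded equivalent of the same wait, using the script's $worker variable and the same .[0].State.Health.Status field of docker inspect, might read:

# wait up to 300 seconds for the worker container to become healthy again (expanded sketch)
deadline=$(( $(date +%s) + 300 ))
until docker inspect "$worker" | jq --exit-status '.[0].State.Health.Status == "healthy"' >/dev/null; do
    [ "$(date +%s)" -ge "$deadline" ] && break   # give up after the deadline, mirroring `timeout ... || true`
    sleep 1
done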
12 changes: 12 additions & 0 deletions docker/cron-docker/periodic/15min/restart_slow_worker
@@ -0,0 +1,12 @@
#!/bin/sh
set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-slow --quiet); do
# tell the celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
12 changes: 12 additions & 0 deletions docker/cron-docker/periodic/5min/restart_worker
@@ -0,0 +1,12 @@
#!/bin/sh
set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
# tell the celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
24 changes: 0 additions & 24 deletions docker/cron-docker/periodic/daily/restart_slow_worker

This file was deleted.

4 changes: 0 additions & 4 deletions docker/cron-docker/periodic/hourly/restart_worker

This file was deleted.

222 changes: 222 additions & 0 deletions docker/cron/periodic/15min/tests-batch.py
@@ -0,0 +1,222 @@
#!/usr/bin/env python3

# run tests on example domains and write metrics to prometheus textfile

# for iterative development
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests-batch.py:/tests-batch.py \
# ghcr.io/internetstandards/cron:latest /tests-batch.py --debug

import sys
import os
import time
from prometheus_client import REGISTRY, Gauge, generate_latest
import prometheus_client
import logging
import requests
import datetime

log = logging.getLogger(__name__)

DEBUG = "--debug" in sys.argv

# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests-batch.prom"


BATCH_REQUEST_TIMEOUT = 60 * 5
REQUEST_TIMEOUT = 30

REQUEST_TYPES = ["web", "mail"]

IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
# talk directly to the internal app container as the webserver might
# have access restrictions in place
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

TEST_DOMAINS = {
# domains to use in website tests
"web": [
"internet.nl",
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
# domains to use in mail tests
"mail": [
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
# these are currently really slow and will probably improve when
# we switch to sslyze, for now disable these in monitoring
# "internet.nl",
# "forumstandaardisatie.nl",
# "minez.nl",
],
}

METRIC_BATCH_RUN = Gauge("tests_batch_run_total", "Batch requests that have been run.", ["request_type"])
METRIC_BATCH_SUCCESS = Gauge("tests_batch_success_total", "Batch request runs that succeeded.", ["request_type"])
METRIC_BATCH_FAILURE = Gauge("tests_batch_failure_total", "Batch request runs that failed.", ["request_type"])
METRIC_BATCH_TIMEOUT = Gauge("tests_batch_timeout_total", "Batch requests that ran into timeout.", ["request_type"])
METRIC_BATCH_RUNTIME = Gauge(
"tests_batch_runtime_seconds", "Time the batch request ran before completion.", ["request_type"]
)
METRIC_BATCH_STAGE_RUNTIME = Gauge(
"tests_batch_stage_runtime_seconds", "Time each stage of the batch request took.", ["request_type", "stage"]
)

METRIC_BATCH_DOMAIN = Gauge("tests_batch_domain_total", "Number of domains in the batch request.", ["request_type", "domain"])

METRIC_BATCH_DOMAIN_SUCCESS = Gauge(
"tests_batch_domain_success",
"Amount of successful domain tests in batch request per domain.",
["request_type", "domain"],
)
METRIC_BATCH_DOMAIN_SCORE = Gauge(
"tests_batch_domain_score", "Per domain test scores for batch request.", ["request_type", "domain"]
)

METRIC_BATCH_DOMAIN_CATEGORIES = Gauge(
"tests_batch_domain_categories",
"Domain verdict and status per category.",
["request_type", "domain", "category", "verdict", "status"],
)

METRIC_BATCH_DOMAIN_TESTS = Gauge(
"tests_batch_domain_tests",
"Domain verdict and status per test.",
["request_type", "domain", "test", "verdict", "status"],
)


def wait_for_request_status(url: str, expected_status: list[str], timeout: int = 10, interval: int = 1, auth=None):
"""Poll url and parse JSON for request.status; return when the status matches one of the expected
statuses, or fail when the timeout expires."""

log.debug("waiting for status: %s", expected_status)

max_tries = int(timeout / interval)

tries = 0
status = "n/a"
while tries < max_tries:
status_response = requests.get(url, auth=auth, headers=HEADERS)
status_response.raise_for_status()

log.debug(status_response.text)
status_data = status_response.json()
status: str = status_data["request"]["status"]
if status in expected_status:
break
time.sleep(interval)
tries += 1
else:
raise TimeoutError(f"request status never reached '{str(expected_status)}' states, current state: '{status}'")


def run_test_batch(request_type: str, domains: list[str]):
request_data = {"type": request_type, "domains": domains, "name": f"periodic test {str(datetime.datetime.now())}"}

auth = ("periodic_tests", "periodic_tests")
api_url: str = URL_BASE + "/api/batch/v2/"

test_start = int(time.time())

# start batch request
register_response = requests.post(api_url + "requests", json=request_data, auth=auth, headers=HEADERS)
register_response.raise_for_status()
log.debug(register_response.text)

# get test_id from register data
register_data = register_response.json()
test_id: str = register_data["request"]["request_id"]

# wait for batch tests to start
wait_for_request_status(
api_url + "requests/" + test_id, ["running", "generating", "done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth
)
registering_time = int(time.time()) - test_start
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "registering").set(registering_time)

# wait for batch tests to complete and report to be generated
wait_for_request_status(
api_url + "requests/" + test_id, ["generating", "done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth
)
running_time = int(time.time()) - test_start - registering_time
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "running").set(running_time)

# wait for report generation and batch to be done
wait_for_request_status(api_url + "requests/" + test_id, ["done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth)
generating_time = int(time.time()) - test_start - registering_time - running_time
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "generating").set(generating_time)

# get batch results
results_response = requests.get(api_url + "requests/" + test_id + "/results", auth=auth, headers=HEADERS)
results_response.raise_for_status()
log.debug(results_response.text)

results_response_data = results_response.json()

METRIC_BATCH_RUNTIME.labels(request_type).set(int(time.time() - test_start))
METRIC_BATCH_SUCCESS.labels(request_type).set(1 if results_response_data["request"]["status"] == "done" else 0)

for domain, results in results_response_data["domains"].items():
METRIC_BATCH_DOMAIN.labels(request_type, domain).set(1)
METRIC_BATCH_DOMAIN_SUCCESS.labels(request_type, domain).set(1 if results["status"] == "ok" else 0)
METRIC_BATCH_DOMAIN_SCORE.labels(request_type, domain).set(results["scoring"]["percentage"])

for category, result in results["results"]["categories"].items():
METRIC_BATCH_DOMAIN_CATEGORIES.labels(
request_type, domain, category, result["verdict"], result["status"]
).inc(1)

for test, result in results["results"]["tests"].items():
METRIC_BATCH_DOMAIN_TESTS.labels(request_type, domain, test, result["verdict"], result["status"]).inc(1)


def run_batch_tests():
for request_type in REQUEST_TYPES:
domains = TEST_DOMAINS[request_type]
log.info(f"testing: {request_type} {domains}")

METRIC_BATCH_RUN.labels(request_type).set(1)
METRIC_BATCH_FAILURE.labels(request_type).set(0)
METRIC_BATCH_TIMEOUT.labels(request_type).set(0)
METRIC_BATCH_SUCCESS.labels(request_type).set(0)
try:
run_test_batch(request_type, domains)

except Exception:
log.exception("Error during test")
METRIC_BATCH_FAILURE.labels(request_type).set(1)


def main():
logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)

# disable internal metrics
REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)

# run test probes against domains and collect metrics
run_batch_tests()

# write metrics to stdout or file in prometheus textfile format
if DEBUG:
print(generate_latest(REGISTRY).decode())
else:
with open(OUTPUT_TEXTFILE, "w") as f:
f.write(generate_latest(REGISTRY).decode())


if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS_BATCH", "False") == "True":
main()
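For reference, the API round trip this script automates can be reproduced by hand against the same internal endpoint; a sketch using the periodic_tests account assumed above, with <request_id> taken from the registration response and IPV4_IP_APP_INTERNAL/INTERNETNL_DOMAINNAME coming from the deployment environment:

# register a batch web test (sketch; adjust domains, credentials and environment to the deployment)
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  -H "Content-Type: application/json" \
  -d '{"type": "web", "domains": ["internet.nl", "example.nl"], "name": "manual check"}' \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests"
# poll until request.status reaches "done", then fetch the results
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests/<request_id>"
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests/<request_id>/results"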
3 changes: 3 additions & 0 deletions docker/defaults.env
@@ -262,6 +262,9 @@ CRON_DAILY_DATABASE_CLEANUP=True
# enable running tests every 15 minutes for metrics collection
CRON_15MIN_RUN_TESTS=True

# enable running batch tests every 15 minutes for metrics collection, enable in local.env for batch deployments
CRON_15MIN_RUN_TESTS_BATCH=False

# enables internet.nl specific content (eg: contact information, faq, security.txt), only enable for internet.nl
# instances. For customization see: documentation/Customize.md
INTERNETNL_BRANDING=False
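On a batch deployment this flag would be switched on in the local override file; a minimal sketch (assuming local.env is the override file used by the deployment, as the comment above suggests):

# local.env on a batch deployment (sketch)
CRON_15MIN_RUN_TESTS_BATCH=True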
1 change: 1 addition & 0 deletions docker/develop.env
@@ -63,6 +63,7 @@ LOGGING_DRIVER=json-file
CRON_DAILY_POSTGRESQL_BACKUP=False
CRON_WEEKLY_POSTGRESQL_BACKUP=False
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=False
