Batch periodic tests #1526

Draft · wants to merge 7 commits into base: main
1 change: 1 addition & 0 deletions docker/batch-test.env
@@ -114,5 +114,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domains (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
24 changes: 22 additions & 2 deletions docker/compose.yaml
@@ -109,7 +109,7 @@ services:
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: app
# hostname: app
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -233,6 +233,22 @@ services:
# time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
# an insufficiently long grace period causes issues on batch when tasks are killed during the hourly worker restart
stop_grace_period: 10m
# SIGTERM is default, but make it explicit
stop_signal: SIGTERM

# open question: celery appears to accept tasks during warm shutdown, see the log excerpt below

# Session terminated, killing shell...
# worker: Warm shutdown (MainProcess)
# [2024-10-29 18:45:01,807: INFO/MainProcess] Task checks.tasks.ipv6.batch_web[506e07c1-a208-4a52-a578-e95cdad9fdf0] received
# ...killed.
# celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=300 --concurrency=500 --queues nassl_worker,batch_nassl
# ENABLE_BATCH is set for this server but the database is lacking the required indexes. Consider running `manage.py api_create_db_indexes`.
# Batch enabled.

# open question: how does the healthcheck behave during warm shutdown? (see the sketch after this file's diff)



depends_on:
db-migrate:
@@ -257,7 +273,7 @@ services:
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: worker
# hostname: worker
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -340,6 +356,7 @@ services:

command: celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=600 --concurrency=$WORKER_SLOW_CONCURRENCY
--queues slow_db_worker,batch_slow
# hostname: worker-slow

beat:
image: ${DOCKER_IMAGE_APP:-${DOCKER_REGISTRY:-ghcr.io/internetstandards}/internet.nl:${RELEASE}}
@@ -652,6 +669,7 @@ services:
- CRON_DAILY_DELETE_BATCH_RESULTS
- CRON_15MIN_RUN_TESTS
- CRON_DAILY_TRUNCATE_EXPORTER_LOGS
- CRON_15MIN_RUN_TESTS_BATCH
- IPV4_IP_APP_INTERNAL
- INTERNETNL_DOMAINNAME
- INTERNETNL_CACHE_TTL
@@ -701,6 +719,8 @@ services:
- WORKER_SLOW_REPLICAS
- DOCKER_REGISTRY
- CRON_DAILY_DATABASE_CLEANUP
- WORKER_REPLICAS
- RELEASE

restart: unless-stopped
logging:
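Regarding the open warm-shutdown questions noted in the worker service above: one way to investigate is to send the TERM signal by hand and then query the worker during the grace period. A rough sketch, assuming WORKER holds a worker container id and the celery@<container id> node naming matches the restart scripts below:

# pick one worker container and trigger a warm shutdown (sketch)
WORKER=$(docker ps --filter label=com.docker.compose.service=worker --quiet | head -n1)
docker kill --signal=SIGTERM "$WORKER"
# during the grace period, ask the shutting-down worker which tasks it is still executing
docker exec "$WORKER" celery --app=internetnl inspect active --destination "celery@$WORKER" --timeout 10 || true
# and read the compose healthcheck status docker currently reports for it
docker inspect "$WORKER" | jq '.[0].State.Health.Status'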
12 changes: 10 additions & 2 deletions docker/cron-docker/periodic/15min/restart_nassl_worker
@@ -1,4 +1,12 @@
#!/bin/sh
set -e
# find nassl worker and restart the container(s)
docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | xargs --no-run-if-empty docker restart
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true

# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
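The timeout/jq one-liner that waits for the restarted container to report healthy again is fairly dense; an expanded equivalent of the same wait, using the script's $worker variable and the same .[0].State.Health.Status field of docker inspect, might read:

# wait up to 300 seconds for the worker container to become healthy again (expanded sketch)
deadline=$(( $(date +%s) + 300 ))
until docker inspect "$worker" | jq --exit-status '.[0].State.Health.Status == "healthy"' >/dev/null; do
    [ "$(date +%s)" -ge "$deadline" ] && break   # give up after the deadline, mirroring `timeout ... || true`
    sleep 1
done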
12 changes: 12 additions & 0 deletions docker/cron-docker/periodic/15min/restart_slow_worker
@@ -0,0 +1,12 @@
#!/bin/sh
set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-slow --quiet); do
# tell the celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
12 changes: 12 additions & 0 deletions docker/cron-docker/periodic/5min/restart_worker
@@ -0,0 +1,12 @@
#!/bin/sh
set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
# tell the celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
24 changes: 0 additions & 24 deletions docker/cron-docker/periodic/daily/restart_slow_worker

This file was deleted.

4 changes: 0 additions & 4 deletions docker/cron-docker/periodic/hourly/restart_worker

This file was deleted.

222 changes: 222 additions & 0 deletions docker/cron/periodic/15min/tests-batch.py
@@ -0,0 +1,222 @@
#!/usr/bin/env python3

# run tests on example domains and write metrics to prometheus textfile

# for iterative development
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests-batch.py:/tests-batch.py \
# ghcr.io/internetstandards/cron:latest /tests-batch.py --debug

import sys
import os
import time
from prometheus_client import REGISTRY, Gauge, generate_latest
import prometheus_client
import logging
import requests
import datetime

log = logging.getLogger(__name__)

DEBUG = "--debug" in sys.argv

# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests-batch.prom"


BATCH_REQUEST_TIMEOUT = 60 * 5
REQUEST_TIMEOUT = 30

REQUEST_TYPES = ["web", "mail"]

IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
# talk directly to the internal app container as the webserver might
# have access restrictions in place
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

TEST_DOMAINS = {
# domains to use in website tests
"web": [
"internet.nl",
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
# domains to use in mail tests
"mail": [
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
# these are currently really slow and will probably improve when
# we switch to sslyze, for now disable these in monitoring
# "internet.nl",
# "forumstandaardisatie.nl",
# "minez.nl",
],
}

METRIC_BATCH_RUN = Gauge("tests_batch_run_total", "Batch requests that have been run.", ["request_type"])
METRIC_BATCH_SUCCESS = Gauge("tests_batch_success_total", "Batch request runs that succeeded.", ["request_type"])
METRIC_BATCH_FAILURE = Gauge("tests_batch_failure_total", "Batch request runs that failed.", ["request_type"])
METRIC_BATCH_TIMEOUT = Gauge("tests_batch_timeout_total", "Batch requests that ran into timeout.", ["request_type"])
METRIC_BATCH_RUNTIME = Gauge(
"tests_batch_runtime_seconds", "Time the batch request ran before completion.", ["request_type"]
)
METRIC_BATCH_STAGE_RUNTIME = Gauge(
"tests_batch_stage_runtime_seconds", "Time each stage of the batch request took.", ["request_type", "stage"]
)

METRIC_BATCH_DOMAIN = Gauge("tests_batch_domain_total", "Number of domains in the batch request.", ["request_type", "domain"])

METRIC_BATCH_DOMAIN_SUCCESS = Gauge(
"tests_batch_domain_success",
"Amount of successful domain tests in batch request per domain.",
["request_type", "domain"],
)
METRIC_BATCH_DOMAIN_SCORE = Gauge(
"tests_batch_domain_score", "Per domain test scores for batch request.", ["request_type", "domain"]
)

METRIC_BATCH_DOMAIN_CATEGORIES = Gauge(
"tests_batch_domain_categories",
"Domain verdict and status per category.",
["request_type", "domain", "category", "verdict", "status"],
)

METRIC_BATCH_DOMAIN_TESTS = Gauge(
"tests_batch_domain_tests",
"Domain verdict and status per test.",
["request_type", "domain", "test", "verdict", "status"],
)


def wait_for_request_status(url: str, expected_status: list[str], timeout: int = 10, interval: int = 1, auth=None):
"""Poll url and parse JSON for request.status; return when the status matches one of the expected
statuses, or fail when the timeout expires."""

log.debug("waiting for status: %s", expected_status)

max_tries = int(timeout / interval)

tries = 0
status = "n/a"
while tries < max_tries:
status_response = requests.get(url, auth=auth, headers=HEADERS)
status_response.raise_for_status()

log.debug(status_response.text)
status_data = status_response.json()
status: str = status_data["request"]["status"]
if status in expected_status:
break
time.sleep(interval)
tries += 1
else:
raise TimeoutError(f"request status never reached '{str(expected_status)}' states, current state: '{status}'")


def run_test_batch(request_type: str, domains: list[str]):
request_data = {"type": request_type, "domains": domains, "name": f"periodic test {str(datetime.datetime.now())}"}

auth = ("periodic_tests", "periodic_tests")
api_url: str = URL_BASE + "/api/batch/v2/"

test_start = int(time.time())

# start batch request
register_response = requests.post(api_url + "requests", json=request_data, auth=auth, headers=HEADERS)
register_response.raise_for_status()
log.debug(register_response.text)

# get test_id from register data
register_data = register_response.json()
test_id: str = register_data["request"]["request_id"]

# wait for batch tests to start
wait_for_request_status(
api_url + "requests/" + test_id, ["running", "generating", "done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth
)
registering_time = int(time.time()) - test_start
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "registering").set(registering_time)

# wait for batch tests to complete and report to be generated
wait_for_request_status(
api_url + "requests/" + test_id, ["generating", "done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth
)
running_time = int(time.time()) - test_start - registering_time
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "running").set(running_time)

# wait for report generation and batch to be done
wait_for_request_status(api_url + "requests/" + test_id, ["done"], timeout=BATCH_REQUEST_TIMEOUT, auth=auth)
generating_time = int(time.time()) - test_start - registering_time - running_time
METRIC_BATCH_STAGE_RUNTIME.labels(request_type, "generating").set(generating_time)

# get batch results
results_response = requests.get(api_url + "requests/" + test_id + "/results", auth=auth, headers=HEADERS)
results_response.raise_for_status()
log.debug(results_response.text)

results_response_data = results_response.json()

METRIC_BATCH_RUNTIME.labels(request_type).set(int(time.time() - test_start))
METRIC_BATCH_SUCCESS.labels(request_type).set(1 if results_response_data["request"]["status"] == "done" else 0)

for domain, results in results_response_data["domains"].items():
METRIC_BATCH_DOMAIN.labels(request_type, domain).set(1)
METRIC_BATCH_DOMAIN_SUCCESS.labels(request_type, domain).set(1 if results["status"] == "ok" else 0)
METRIC_BATCH_DOMAIN_SCORE.labels(request_type, domain).set(results["scoring"]["percentage"])

for category, result in results["results"]["categories"].items():
METRIC_BATCH_DOMAIN_CATEGORIES.labels(
request_type, domain, category, result["verdict"], result["status"]
).inc(1)

for test, result in results["results"]["tests"].items():
METRIC_BATCH_DOMAIN_TESTS.labels(request_type, domain, test, result["verdict"], result["status"]).inc(1)


def run_batch_tests():
for request_type in REQUEST_TYPES:
domains = TEST_DOMAINS[request_type]
log.info(f"testing: {request_type} {domains}")

METRIC_BATCH_RUN.labels(request_type).set(1)
METRIC_BATCH_FAILURE.labels(request_type).set(0)
METRIC_BATCH_TIMEOUT.labels(request_type).set(0)
METRIC_BATCH_SUCCESS.labels(request_type).set(0)
try:
run_test_batch(request_type, domains)

except Exception:
log.exception("Error during test")
METRIC_BATCH_FAILURE.labels(request_type).set(1)


def main():
logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)

# disable internal metrics
REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)

# run test probes against domains and collect metrics
run_batch_tests()

# write metrics to stdout or file in prometheus textfile format
if DEBUG:
print(generate_latest(REGISTRY).decode())
else:
with open(OUTPUT_TEXTFILE, "w") as f:
f.write(generate_latest(REGISTRY).decode())


if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS_BATCH", "False") == "True":
main()
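For reference, the API round trip this script automates can be reproduced by hand against the same internal endpoint; a sketch using the periodic_tests account assumed above, with <request_id> taken from the registration response and IPV4_IP_APP_INTERNAL/INTERNETNL_DOMAINNAME coming from the deployment environment:

# register a batch web test (sketch; adjust domains, credentials and environment to the deployment)
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  -H "Content-Type: application/json" \
  -d '{"type": "web", "domains": ["internet.nl", "example.nl"], "name": "manual check"}' \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests"
# poll until request.status reaches "done", then fetch the results
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests/<request_id>"
curl -s -u periodic_tests:periodic_tests -H "Host: $INTERNETNL_DOMAINNAME" \
  "http://$IPV4_IP_APP_INTERNAL:8080/api/batch/v2/requests/<request_id>/results"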
3 changes: 3 additions & 0 deletions docker/defaults.env
@@ -262,6 +262,9 @@ CRON_DAILY_DATABASE_CLEANUP=True
# enable running tests every 15 minutes for metrics collection
CRON_15MIN_RUN_TESTS=True

# enable running batch tests every 15 minutes for metrics collection, enable in local.env for batch deployments
CRON_15MIN_RUN_TESTS_BATCH=False

# enables internet.nl specific content (eg: contact information, faq, security.txt), only enable for internet.nl
# instances. For customization see: documentation/Customize.md
INTERNETNL_BRANDING=False
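On a batch deployment this flag would be switched on in the local override file; a minimal sketch (assuming local.env is the override file used by the deployment, as the comment above suggests):

# local.env on a batch deployment (sketch)
CRON_15MIN_RUN_TESTS_BATCH=True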
1 change: 1 addition & 0 deletions docker/develop.env
@@ -63,6 +63,7 @@ LOGGING_DRIVER=json-file
CRON_DAILY_POSTGRESQL_BACKUP=False
CRON_WEEKLY_POSTGRESQL_BACKUP=False
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=False
