summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVladimír Čunát <vladimir.cunat@nic.cz>2024-12-03 12:01:16 +0100
committerVladimír Čunát <vladimir.cunat@nic.cz>2024-12-03 12:01:16 +0100
commit84fa76b71e2ea15084e481cc3f8d8454ae8bdd9f (patch)
tree2ed19e473c478d5772910227dccd1cff19a3dc5c
parentMerge !1626: manager: TLS certificate files auto-reload (diff)
parentdoc: debugging with kresctl moved to dev (diff)
downloadknot-resolver-84fa76b71e2ea15084e481cc3f8d8454ae8bdd9f.tar.xz
knot-resolver-84fa76b71e2ea15084e481cc3f8d8454ae8bdd9f.zip
Merge !1450: manager: subprocess debugging via GDB
-rw-r--r--doc/dev/debugging-with-kresctl.rst109
-rw-r--r--doc/dev/index.rst7
-rwxr-xr-xpoe2
-rw-r--r--python/knot_resolver/client/commands/debug.py144
-rw-r--r--python/knot_resolver/client/commands/pids.py63
-rw-r--r--python/knot_resolver/client/main.py17
-rw-r--r--python/knot_resolver/controller/interface.py5
-rw-r--r--python/knot_resolver/controller/supervisord/__init__.py8
-rw-r--r--python/knot_resolver/manager/manager.py25
-rw-r--r--python/knot_resolver/manager/server.py42
10 files changed, 409 insertions, 13 deletions
diff --git a/doc/dev/debugging-with-kresctl.rst b/doc/dev/debugging-with-kresctl.rst
new file mode 100644
index 00000000..53fcddd1
--- /dev/null
+++ b/doc/dev/debugging-with-kresctl.rst
@@ -0,0 +1,109 @@
+.. SPDX-License-Identifier: GPL-3.0-or-later
+
+.. _debugging-with-kresctl:
+
+**********************
+Debugging with kresctl
+**********************
+
+Knot Resolver is made up of several independent components,
+so it can be difficult to debug the individual parts.
+To help with this, there is an option in the kresctl utility
+that can run a GDB-compatible debugger on a specific component of the resolver, see the ``debug`` command.
+
+.. program:: kresctl
+
+.. option:: pids
+
+ Lists the PIDs of the Manager's subprocesses, separated by newlines.
+
+ .. option:: --json
+
+ Makes the output more verbose, in JSON. In addition to the subprocesses'
+ PIDs, it also prints their types and statuses.
+
+ .. option:: [proc_type]
+
+ :default: all
+
+ Optional. The type of process to query. See :ref:`Subprocess types
+ <debugging-with-kresctl-subprocess-types>` for more info.
+
+
+.. option:: debug
+
+ Executes a GDB-compatible debugger and attaches it to the Manager's
+ subprocesses. By default, the debugger is ``gdb`` and the subprocesses are
+ only the ``kresd`` workers.
+
+ .. warning::
+
+ The ``debug`` command is a utility for Knot Resolver developers and is
+ not intended to be used by end-users. Running this command **will** make
+ your resolver unresponsive.
+
+ .. note::
+
+ Modern kernels will prevent debuggers from tracing processes that are
+ not their descendants, which is exactly the scenario that happens with
+ ``kresctl debug``. There are three ways to work around this, listed in
+ the order in which they are preferred in terms of security:
+
+ 1. Grant the debugger the ``cap_sys_ptrace`` capability
+ (**recommended**)
+
+ * For ``gdb``, this may be achieved by using the ``setcap``
+ command like so:
+
+ .. code-block:: bash
+
+ sudo setcap cap_sys_ptrace=eip /usr/bin/gdb
+
+ 2. Run the debugger as root
+
+ * You may use the ``--sudo`` option to achieve this
+
+ 3. Set ``/proc/sys/kernel/yama/ptrace_scope`` to ``0``
+
+ * This is a system-wide setting that allows any process to trace any
+ other process running under the same user. Handle with care!
+
+ .. note::
+
+ This command will only work if executed on the same machine where Knot
+ Resolver is running. Remote debugging is currently not supported.
+
+ .. option:: [proc_type]
+
+ :default: kresd
+
+ Optional. The type of process to debug. See :ref:`Subprocess types
+ <debugging-with-kresctl-subprocess-types>` for more info.
+
+ .. option:: --sudo
+
+ Run the debugger with sudo.
+
+ .. option:: --gdb <command>
+
+ Use a custom GDB executable. This may be a command on ``PATH``, or an
+ absolute path to an executable.
+
+ .. option:: --print-only
+
+ Prints the GDB command line into ``stderr`` as a Python array, does not
+ execute GDB.
+
+
+.. _debugging-with-kresctl-subprocess-types:
+
+Subprocess types
+----------------
+
+Some of ``kresctl``'s commands (like :option:`pids` and :option:`debug`) take a subprocess
+type value determining which subprocesses will be affected by them. The possible
+values are as follows:
+
+* ``kresd`` -- the worker daemons
+* ``gc`` -- the cache garbage collector
+* ``all`` -- all of the Manager's subprocesses
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
index a13e3d61..f0d55763 100644
--- a/doc/dev/index.rst
+++ b/doc/dev/index.rst
@@ -31,6 +31,13 @@ Welcome to Knot Resolver's documentation for developers and advanced users!
layered-protocols
.. toctree::
+ :caption: Debugging
+ :name: debugging-chapter
+ :maxdepth: 1
+
+ debugging-with-kresctl
+
+.. toctree::
:caption: Lua configuration
:name: configuration-lua-chapter
:maxdepth: 1
diff --git a/poe b/poe
index d1f58894..815428a3 100755
--- a/poe
+++ b/poe
@@ -1,4 +1,4 @@
#!/bin/sh
script_dir="$(dirname "$(readlink -f "$0")")"
-exec poetry --directory "$script_dir" run poe --root "$script_dir" "$@"
+exec poetry --directory "$script_dir" run -- poe --root "$script_dir" "$@"
diff --git a/python/knot_resolver/client/commands/debug.py b/python/knot_resolver/client/commands/debug.py
new file mode 100644
index 00000000..5d9a81df
--- /dev/null
+++ b/python/knot_resolver/client/commands/debug.py
@@ -0,0 +1,144 @@
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+from typing import List, Optional, Tuple, Type
+
+from knot_resolver.client.command import Command, CommandArgs, CompWords, register_command
+from knot_resolver.utils import which
+from knot_resolver.utils.requests import request
+
+PROCS_TYPE = List
+
+
+@register_command
+class DebugCommand(Command):
+ def __init__(self, namespace: argparse.Namespace) -> None:
+ self.proc_type: Optional[str] = namespace.proc_type
+ self.sudo: bool = namespace.sudo
+ self.gdb: str = namespace.gdb
+ self.print_only: bool = namespace.print_only
+ self.gdb_args: List[str] = namespace.extra
+ super().__init__(namespace)
+
+ @staticmethod
+ def register_args_subparser(
+ subparser: "argparse._SubParsersAction[argparse.ArgumentParser]",
+ ) -> Tuple[argparse.ArgumentParser, "Type[Command]"]:
+ debug = subparser.add_parser(
+ "debug",
+ help="Run GDB on the manager's subprocesses",
+ )
+ debug.add_argument(
+ "proc_type",
+ help="Optional, the type of process to debug. May be 'kresd' (default), 'gc', or 'all'.",
+ type=str,
+ nargs="?",
+ default="kresd",
+ )
+ debug.add_argument(
+ "--sudo",
+ dest="sudo",
+ help="Run GDB with sudo",
+ action="store_true",
+ default=False,
+ )
+ debug.add_argument(
+ "--gdb",
+ help="Custom GDB executable (may be a command on PATH, or an absolute path)",
+ type=str,
+ default=None,
+ )
+ debug.add_argument(
+ "--print-only",
+ help="Prints the GDB command line into stderr as a Python array, does not execute GDB",
+ action="store_true",
+ default=False,
+ )
+ return debug, DebugCommand
+
+ @staticmethod
+ def completion(args: List[str], parser: argparse.ArgumentParser) -> CompWords:
+ return {}
+
+ def run(self, args: CommandArgs) -> None: # noqa: PLR0912, PLR0915
+ if self.gdb is None:
+ try:
+ gdb_cmd = str(which.which("gdb"))
+ except RuntimeError:
+ print("Could not find 'gdb' in $PATH. Is GDB installed?", file=sys.stderr)
+ sys.exit(1)
+ elif "/" not in self.gdb:
+ try:
+ gdb_cmd = str(which.which(self.gdb))
+ except RuntimeError:
+ print(f"Could not find '{self.gdb}' in $PATH.", file=sys.stderr)
+ sys.exit(1)
+ else:
+ gdb_cmd_path = Path(self.gdb).absolute()
+ if not gdb_cmd_path.exists():
+ print(f"Could not find '{self.gdb}'.", file=sys.stderr)
+ sys.exit(1)
+ gdb_cmd = str(gdb_cmd_path)
+
+ response = request(args.socket, "GET", f"processes/{self.proc_type}")
+ if response.status != 200:
+ print(response, file=sys.stderr)
+ sys.exit(1)
+
+ procs = json.loads(response.body)
+ if not isinstance(procs, PROCS_TYPE):
+ print(
+ f"Unexpected response type '{type(procs).__name__}' from manager. Expected '{PROCS_TYPE.__name__}'",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ if len(procs) == 0:
+ print(
+ f"There are no processes of type '{self.proc_type}' available to debug",
+ file=sys.stderr,
+ )
+
+ exec_args = []
+
+ # Put `sudo --` at the beginning of the command.
+ if self.sudo:
+ try:
+ sudo_cmd = str(which.which("sudo"))
+ except RuntimeError:
+ print("Could not find 'sudo' in $PATH. Is sudo installed?", file=sys.stderr)
+ sys.exit(1)
+ exec_args.extend([sudo_cmd, "--"])
+
+ # Attach GDB to processes - the processes are attached using the `add-inferior` and `attach` GDB
+ # commands. This way, we can debug multiple processes.
+ exec_args.extend([gdb_cmd, "--"])
+ exec_args.extend(["-init-eval-command", "set detach-on-fork off"])
+ exec_args.extend(["-init-eval-command", "set schedule-multiple on"])
+ exec_args.extend(["-init-eval-command", f'attach {procs[0]["pid"]}'])
+ inferior = 2
+ for proc in procs[1:]:
+ exec_args.extend(["-init-eval-command", "add-inferior"])
+ exec_args.extend(["-init-eval-command", f"inferior {inferior}"])
+ exec_args.extend(["-init-eval-command", f'attach {proc["pid"]}'])
+ inferior += 1
+
+ num_inferiors = inferior - 1
+ if num_inferiors > 1:
+ # Now we switch back to the first process and add additional provided GDB arguments.
+ exec_args.extend(["-init-eval-command", "inferior 1"])
+ exec_args.extend(
+ [
+ "-init-eval-command",
+ "echo \\n\\nYou are now debugging multiple Knot Resolver processes. To switch between "
+ "them, use the 'inferior <n>' command, where <n> is an integer from 1 to "
+ f"{num_inferiors}.\\n\\n",
+ ]
+ )
+ exec_args.extend(self.gdb_args)
+
+ if self.print_only:
+ print(f"{exec_args}")
+ else:
+ os.execl(*exec_args)
diff --git a/python/knot_resolver/client/commands/pids.py b/python/knot_resolver/client/commands/pids.py
new file mode 100644
index 00000000..a1ab5f8c
--- /dev/null
+++ b/python/knot_resolver/client/commands/pids.py
@@ -0,0 +1,63 @@
+import argparse
+import json
+import sys
+from typing import Iterable, List, Optional, Tuple, Type
+
+from knot_resolver.client.command import Command, CommandArgs, CompWords, register_command
+from knot_resolver.utils.requests import request
+
+PROCESSES_TYPE = Iterable
+
+
+@register_command
+class PidsCommand(Command):
+ def __init__(self, namespace: argparse.Namespace) -> None:
+ self.proc_type: Optional[str] = namespace.proc_type
+ self.json: int = namespace.json
+
+ super().__init__(namespace)
+
+ @staticmethod
+ def register_args_subparser(
+ subparser: "argparse._SubParsersAction[argparse.ArgumentParser]",
+ ) -> Tuple[argparse.ArgumentParser, "Type[Command]"]:
+ pids = subparser.add_parser("pids", help="List the PIDs of the Manager's subprocesses")
+ pids.add_argument(
+ "proc_type",
+ help="Optional, the type of process to query. May be 'kresd', 'gc', or 'all' (default).",
+ nargs="?",
+ default="all",
+ )
+ pids.add_argument(
+ "--json",
+ help="Optional, makes the output more verbose, in JSON.",
+ action="store_true",
+ default=False,
+ )
+ return pids, PidsCommand
+
+ @staticmethod
+ def completion(args: List[str], parser: argparse.ArgumentParser) -> CompWords:
+ return {}
+
+ def run(self, args: CommandArgs) -> None:
+ response = request(args.socket, "GET", f"processes/{self.proc_type}")
+
+ if response.status == 200:
+ processes = json.loads(response.body)
+ if isinstance(processes, PROCESSES_TYPE):
+ if self.json:
+ print(json.dumps(processes, indent=2))
+ else:
+ for p in processes:
+ print(p["pid"])
+
+ else:
+ print(
+ f"Unexpected response type '{type(processes).__name__}' from manager. Expected '{PROCESSES_TYPE.__name__}'",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ else:
+ print(response, file=sys.stderr)
+ sys.exit(1)
diff --git a/python/knot_resolver/client/main.py b/python/knot_resolver/client/main.py
index 75cd6a77..461b7fc4 100644
--- a/python/knot_resolver/client/main.py
+++ b/python/knot_resolver/client/main.py
@@ -1,6 +1,7 @@
import argparse
import importlib
import os
+import sys
from knot_resolver.constants import VERSION
@@ -68,7 +69,21 @@ def main() -> None:
parser = create_main_argument_parser()
install_commands_parsers(parser)
- namespace = parser.parse_args()
+ # TODO: This is broken with unpatched versions of poethepoet, because they drop the `--` pseudo-argument.
+ # Patch submitted at <https://github.com/nat-n/poethepoet/pull/163>.
+ try:
+ pa_index = sys.argv.index("--", 1)
+ argv_to_parse = sys.argv[1:pa_index]
+ argv_extra = sys.argv[(pa_index + 1) :]
+ except ValueError:
+ argv_to_parse = sys.argv[1:]
+ argv_extra = []
+
+ namespace = parser.parse_args(argv_to_parse)
+ if hasattr(namespace, "extra"):
+ raise TypeError("'extra' is already an attribute - this is disallowed for commands")
+ namespace.extra = argv_extra
+
client = KresClient(namespace, parser)
client.execute()
diff --git a/python/knot_resolver/controller/interface.py b/python/knot_resolver/controller/interface.py
index 43c24257..49808d01 100644
--- a/python/knot_resolver/controller/interface.py
+++ b/python/knot_resolver/controller/interface.py
@@ -109,6 +109,7 @@ class Subprocess(ABC):
self._id = kresid
self._config = config
self._registered_worker: bool = False
+ self._pid: Optional[int] = None
self._config_file: Optional[Path] = None
if self.type is SubprocessType.KRESD:
@@ -190,6 +191,10 @@ class Subprocess(ABC):
pass
@abstractmethod
+ async def get_pid(self) -> int:  # abstract: return the PID of this subprocess; concrete subclasses implement it
+ pass
+
+ @abstractmethod
def status(self) -> SubprocessStatus:
pass
diff --git a/python/knot_resolver/controller/supervisord/__init__.py b/python/knot_resolver/controller/supervisord/__init__.py
index 347ac1e7..ddb9b29b 100644
--- a/python/knot_resolver/controller/supervisord/__init__.py
+++ b/python/knot_resolver/controller/supervisord/__init__.py
@@ -223,6 +223,14 @@ class SupervisordSubprocess(Subprocess):
fast = _create_fast_proxy(self._config)
fast.startProcess(self.name)
+ @async_in_a_thread  # presumably makes this blocking lookup awaitable, as the abstract `async def get_pid` expects - confirm
+ def get_pid(self) -> int:  # return this subprocess's PID, asking supervisord only on the first call
+ if self._pid is None:  # not looked up yet
+ supervisord = _create_supervisord_proxy(self._config)
+ info = supervisord.getProcessInfo(self.name)
+ self._pid = info["pid"]  # cached; NOTE(review): never invalidated here - confirm a restarted process gets a fresh object
+ return self._pid
+
def get_used_config(self) -> KresConfig:
return self._config
diff --git a/python/knot_resolver/manager/manager.py b/python/knot_resolver/manager/manager.py
index f9c68708..952c8b7d 100644
--- a/python/knot_resolver/manager/manager.py
+++ b/python/knot_resolver/manager/manager.py
@@ -55,6 +55,14 @@ async def _deny_max_worker_changes(config_old: KresConfig, config_new: KresConfi
return Result.ok(None)
+async def _subprocess_desc(subprocess: Subprocess) -> object:
+ return {
+ "type": subprocess.type.name,
+ "pid": await subprocess.get_pid(),
+ "status": subprocess.status().name,
+ }
+
+
class KresManager: # pylint: disable=too-many-instance-attributes
"""
Core of the whole operation. Orchestrates individual instances under some
@@ -63,7 +71,7 @@ class KresManager: # pylint: disable=too-many-instance-attributes
Instantiate with `KresManager.create()`, not with the usual constructor!
"""
- def __init__(self, shutdown_trigger: Callable[[int], None], _i_know_what_i_am_doing: bool = False):
+ def __init__(self, _i_know_what_i_am_doing: bool = False):
if not _i_know_what_i_am_doing:
logger.error(
"Trying to create an instance of KresManager using normal constructor. Please use "
@@ -80,19 +88,18 @@ class KresManager: # pylint: disable=too-many-instance-attributes
self._watchdog_task: Optional["asyncio.Task[None]"] = None
self._fix_counter: _FixCounter = _FixCounter()
self._config_store: ConfigStore
- self._shutdown_trigger: Callable[[int], None] = shutdown_trigger
+ self._shutdown_triggers: List[Callable[[int], None]] = []
@staticmethod
async def create(
subprocess_controller: SubprocessController,
config_store: ConfigStore,
- shutdown_trigger: Callable[[int], None],
) -> "KresManager":
"""
Creates new instance of KresManager.
"""
- inst = KresManager(shutdown_trigger, _i_know_what_i_am_doing=True)
+ inst = KresManager(_i_know_what_i_am_doing=True)
await inst._async_init(subprocess_controller, config_store) # noqa: SLF001
return inst
@@ -211,6 +218,9 @@ class KresManager: # pylint: disable=too-many-instance-attributes
await self._gc.stop()
self._gc = None
+ def add_shutdown_trigger(self, trigger: Callable[[int], None]) -> None:  # register a callback for manager-initiated shutdown
+ self._shutdown_triggers.append(trigger)  # each registered trigger is later called with an int (presumably an exit code)
+
async def validate_config(self, _old: KresConfig, new: KresConfig) -> Result[NoneType, str]:
async with self._manager_lock:
if _old.rate_limiting != new.rate_limiting:
@@ -233,6 +243,10 @@ class KresManager: # pylint: disable=too-many-instance-attributes
logger.debug("Canary process test passed.")
return Result.ok(None)
+ async def get_processes(self, proc_type: Optional[SubprocessType]) -> List[object]:  # describe running subprocesses
+ processes = await self._controller.get_all_running_instances()
+ return [await _subprocess_desc(pr) for pr in processes if proc_type is None or pr.type == proc_type]  # None = every type
+
async def _reload_system_state(self) -> None:
async with self._manager_lock:
self._workers = []
@@ -338,7 +352,8 @@ class KresManager: # pylint: disable=too-many-instance-attributes
logger.warning("Collecting all remaining workers...")
await self._reload_system_state()
logger.warning("Terminating...")
- self._shutdown_trigger(1)
+ for trigger in self._shutdown_triggers:
+ trigger(1)
async def _instability_handler(self) -> None:
if self._fix_counter.is_too_high():
diff --git a/python/knot_resolver/manager/server.py b/python/knot_resolver/manager/server.py
index b09ff7b9..06fab0cf 100644
--- a/python/knot_resolver/manager/server.py
+++ b/python/knot_resolver/manager/server.py
@@ -21,6 +21,7 @@ from aiohttp.web_runner import AppRunner, TCPSite, UnixSite
from knot_resolver.constants import CONFIG_FILE, USER
from knot_resolver.controller import get_best_controller_implementation
from knot_resolver.controller.exceptions import SubprocessControllerExecError
+from knot_resolver.controller.interface import SubprocessType
from knot_resolver.controller.registered_workers import command_single_registered_worker
from knot_resolver.datamodel import kres_config_json_schema
from knot_resolver.datamodel.cache_schema import CacheClearRPCSchema
@@ -87,7 +88,7 @@ class Server:
# This is top-level class containing pretty much everything. Instead of global
# variables, we use instance attributes. That's why there are so many and it's
# ok.
- def __init__(self, store: ConfigStore, config_path: Optional[Path]):
+ def __init__(self, store: ConfigStore, config_path: Optional[Path], manager: KresManager):
# config store & server dynamic reconfiguration
self.config_store = store
@@ -100,6 +101,7 @@ class Server:
self._config_path: Optional[Path] = config_path
self._exit_code: int = 0
self._shutdown_event = asyncio.Event()
+ self._manager = manager
async def _reconfigure(self, config: KresConfig) -> None:
await self._reconfigure_listen_address(config)
@@ -323,6 +325,30 @@ class Server:
await self._reload_config()
return web.Response(text="Reloading...")
+ async def _handler_processes(self, request: web.Request) -> web.Response:
+ """
+ Route handler for listing PIDs of subprocesses
+ """
+
+ proc_type: Optional[SubprocessType] = None  # None means no filtering by subprocess type
+
+ if "path" in request.match_info and len(request.match_info["path"]) > 0:  # an empty path keeps proc_type None
+ ptstr = request.match_info["path"]  # whatever follows "/processes" in the URL, leading slash included
+ if ptstr == "/kresd":
+ proc_type = SubprocessType.KRESD
+ elif ptstr == "/gc":
+ proc_type = SubprocessType.GC
+ elif ptstr == "/all":
+ proc_type = None
+ else:
+ return web.Response(text=f"Invalid process type '{ptstr}'", status=400)  # unknown suffix -> HTTP 400
+
+ return web.json_response(  # JSON body: one description per matching subprocess
+ await self._manager.get_processes(proc_type),
+ headers={"Access-Control-Allow-Origin": "*"},
+ dumps=partial(json.dumps, indent=4),
+ )
+
def _setup_routes(self) -> None:
self.app.add_routes(
[
@@ -339,6 +365,7 @@ class Server:
web.get("/metrics/json", self._handler_metrics_json),
web.get("/metrics/prometheus", self._handler_metrics_prometheus),
web.post("/cache/clear", self._handler_cache_clear),
+ web.get("/processes{path:.*}", self._handler_processes),
]
)
@@ -410,7 +437,7 @@ async def _init_config_store(config: Dict[str, Any]) -> ConfigStore:
return ConfigStore(config_validated)
-async def _init_manager(config_store: ConfigStore, server: Server) -> KresManager:
+async def _init_manager(config_store: ConfigStore) -> KresManager:
"""
Called asynchronously when the application initializes.
"""
@@ -420,7 +447,7 @@ async def _init_manager(config_store: ConfigStore, server: Server) -> KresManage
# Create KresManager. This will perform autodetection of available service managers and
# select the most appropriate to use (or use the one configured directly)
- manager = await KresManager.create(controller, config_store, server.trigger_shutdown)
+ manager = await KresManager.create(controller, config_store)
logger.info("Initial configuration applied. Process manager initialized...")
return manager
@@ -559,11 +586,14 @@ async def start_server(config: Path = CONFIG_FILE) -> int: # noqa: PLR0915
await files.init_files_watchdog(config_store)
+ # After we have loaded the configuration, we can start worrying about subprocess management.
+ manager = await _init_manager(config_store)
+
# prepare instance of the server (no side effects)
- server = Server(config_store, config)
+ server = Server(config_store, config, manager)
- # After we have loaded the configuration, we can start worring about subprocess management.
- manager = await _init_manager(config_store, server)
+ # add Server's shutdown trigger to the manager
+ manager.add_shutdown_trigger(server.trigger_shutdown)
except SubprocessControllerExecError as e:
# if we caught this exception, some component wants to perform a reexec during startup. Most likely, it would