diff options
author | Guillaume Abrioux <gabrioux@ibm.com> | 2024-01-15 13:38:39 +0100 |
---|---|---|
committer | Guillaume Abrioux <gabrioux@ibm.com> | 2024-01-25 16:07:21 +0100 |
commit | 3b8c945a6afeb7c8ab9f862468929f579c78adc3 (patch) | |
tree | c222bac1e39778b8584ecf176e5ff45f3a2d8c8a /src/pybind | |
parent | node-proxy: rename attribute and class (diff) | |
download | ceph-3b8c945a6afeb7c8ab9f862468929f579c78adc3.tar.xz ceph-3b8c945a6afeb7c8ab9f862468929f579c78adc3.zip |
node-proxy: make it a separate daemon
The current implementation requires the inclusion of all the recent
modifications in the cephadm binary, which won't be backported.
Since we need the node-proxy code backported to reef, let's move the
code and make it a separate daemon.
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
Co-authored-by: Adam King <adking@redhat.com>
Diffstat (limited to 'src/pybind')
-rw-r--r-- | src/pybind/mgr/cephadm/agent.py | 104 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/inventory.py | 23 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/module.py | 84 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/serve.py | 3 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/services/cephadmservice.py | 12 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/services/node_proxy.py | 180 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/tests/test_node_proxy.py | 16 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/utils.py | 2 | ||||
-rw-r--r-- | src/pybind/mgr/orchestrator/_interface.py | 29 | ||||
-rw-r--r-- | src/pybind/mgr/orchestrator/module.py | 45 |
10 files changed, 418 insertions, 80 deletions
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index b589886e566..68495d3bc06 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -16,14 +16,15 @@ import time from orchestrator import DaemonDescriptionStatus from orchestrator._interface import daemon_type_to_service -from ceph.utils import datetime_now +from ceph.utils import datetime_now, http_req from ceph.deployment.inventory import Devices from ceph.deployment.service_spec import ServiceSpec, PlacementSpec from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.ssl_cert_utils import SSLCerts from mgr_util import test_port_allocation, PortAlreadyInUse -from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional +from urllib.error import HTTPError, URLError +from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -138,8 +139,9 @@ class NodeProxyEndpoint: self.validate_node_proxy_data(data) - host = data["cephx"]["name"] - results['result'] = self.mgr.node_proxy_cache.oob.get(host) + # expecting name to be "node-proxy.<hostname>" + hostname = data['cephx']['name'][11:] + results['result'] = self.mgr.node_proxy_cache.oob.get(hostname, '') if not results['result']: raise cherrypy.HTTPError(400, 'The provided host has no iDrac details.') return results @@ -160,13 +162,15 @@ class NodeProxyEndpoint: raise cherrypy.HTTPError(400, 'The field \'cephx\' must be provided.') elif 'name' not in data['cephx'].keys(): cherrypy.response.status = 400 - raise cherrypy.HTTPError(400, 'The field \'host\' must be provided.') - elif 'secret' not in data['cephx'].keys(): - raise cherrypy.HTTPError(400, 'The agent keyring must be provided.') - elif not self.mgr.agent_cache.agent_keys.get(data['cephx']['name']): - raise cherrypy.HTTPError(502, f'Make sure the agent is running on {data["cephx"]["name"]}') - elif data['cephx']['secret'] != 
self.mgr.agent_cache.agent_keys[data['cephx']['name']]: - raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {data["cephx"]["name"]}.') + raise cherrypy.HTTPError(400, 'The field \'name\' must be provided.') + # expecting name to be "node-proxy.<hostname>" + hostname = data['cephx']['name'][11:] + if 'secret' not in data['cephx'].keys(): + raise cherrypy.HTTPError(400, 'The node-proxy keyring must be provided.') + elif not self.mgr.node_proxy_cache.keyrings.get(hostname, ''): + raise cherrypy.HTTPError(502, f'Make sure the node-proxy is running on {hostname}') + elif data['cephx']['secret'] != self.mgr.node_proxy_cache.keyrings[hostname]: + raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {hostname}.') except AttributeError: raise cherrypy.HTTPError(400, 'Malformed data received.') @@ -289,12 +293,19 @@ class NodeProxyEndpoint: :rtype: dict[str, Any] """ method: str = cherrypy.request.method + header: MutableMapping[str, str] = {} hostname: Optional[str] = kw.get('hostname') led_type: Optional[str] = kw.get('type') id_drive: Optional[str] = kw.get('id') - data: Optional[str] = None - # this method is restricted to 'GET' or 'PATCH' - action: str = 'get_led' if method == 'GET' else 'set_led' + payload: Optional[Dict[str, str]] = None + endpoint: List[Any] = ['led', led_type] + device: str = id_drive if id_drive else '' + + ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert() + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = True + ssl_ctx.verify_mode = ssl.CERT_REQUIRED + ssl_ctx.load_verify_locations(cadata=ssl_root_crt) if not hostname: msg: str = "listing enclosure LED status for all nodes is not implemented." 
@@ -311,16 +322,32 @@ class NodeProxyEndpoint: self.mgr.log.debug(msg) raise cherrypy.HTTPError(400, msg) + if led_type == 'drive': + endpoint.append(device) + if hostname not in self.mgr.node_proxy_cache.data.keys(): # TODO(guits): update unit test for this msg = f"'{hostname}' not found." self.mgr.log.debug(msg) raise cherrypy.HTTPError(400, msg) + addr: str = self.mgr.inventory.get_addr(hostname) + if method == 'PATCH': # TODO(guits): need to check the request is authorized # allowing a specific keyring only ? (client.admin or client.agent.. ?) - data = json.dumps(cherrypy.request.json) + data: Dict[str, Any] = cherrypy.request.json + if 'state' not in data.keys(): + msg = "'state' key not provided." + raise cherrypy.HTTPError(400, msg) + if 'keyring' not in data.keys(): + msg = "'keyring' key must be provided." + raise cherrypy.HTTPError(400, msg) + if data['keyring'] != self.mgr.node_proxy_cache.keyrings.get(hostname): + msg = 'wrong keyring provided.' + raise cherrypy.HTTPError(401, msg) + payload = {} + payload['state'] = data['state'] if led_type == 'drive': if id_drive not in self.mgr.node_proxy_cache.data[hostname]['status']['storage'].keys(): @@ -329,28 +356,23 @@ class NodeProxyEndpoint: self.mgr.log.debug(msg) raise cherrypy.HTTPError(400, msg) - payload: Dict[str, Any] = {"node_proxy_oob_cmd": - {"action": action, - "type": led_type, - "id": id_drive, - "host": hostname, - "data": data}} - try: - message_thread = AgentMessageThread( - hostname, self.mgr.agent_cache.agent_ports[hostname], payload, self.mgr) - message_thread.start() - message_thread.join() # TODO(guits): Add a timeout? 
- except KeyError: - raise cherrypy.HTTPError(502, f"{hostname}'s agent not running, please check.") - agent_response = message_thread.get_agent_response() + endpoint = f'/{"/".join(endpoint)}' + header = self.mgr.node_proxy.generate_auth_header(hostname) + try: - response_json: Dict[str, Any] = json.loads(agent_response) - except json.decoder.JSONDecodeError: - cherrypy.response.status = 503 - else: - cherrypy.response.status = response_json.get('http_code', 503) - if cherrypy.response.status != 200: - raise cherrypy.HTTPError(cherrypy.response.status, "Couldn't change the LED status.") + headers, result, status = http_req(hostname=addr, + port='8080', + headers=header, + method=method, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + response_json = json.loads(result) + except HTTPError as e: + self.mgr.log.debug(e) + except URLError: + raise cherrypy.HTTPError(502, f'Make sure the node-proxy agent is deployed and running on {hostname}') + return response_json @cherrypy.expose @@ -842,16 +864,6 @@ class CephadmAgentHelpers: host, self.mgr.agent_cache.agent_ports[host], payload, self.mgr, daemon_spec) message_thread.start() - def _shutdown_node_proxy(self) -> None: - hosts = set([h for h in self.mgr.cache.get_hosts() if - (h in self.mgr.agent_cache.agent_ports and not self.mgr.agent_cache.messaging_agent(h))]) - - for host in hosts: - payload: Dict[str, Any] = {'node_proxy_shutdown': host} - message_thread = AgentMessageThread( - host, self.mgr.agent_cache.agent_ports[host], payload, self.mgr) - message_thread.start() - def _request_ack_all_not_up_to_date(self) -> None: self.mgr.agent_helpers._request_agent_acks( set([h for h in self.mgr.cache.get_hosts() if diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 27bf55c921c..235737ef10e 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) HOST_CACHE_PREFIX = 
"host." SPEC_STORE_PREFIX = "spec." AGENT_CACHE_PREFIX = 'agent.' -NODE_PROXY_CACHE_PREFIX = 'node_proxy/data' +NODE_PROXY_CACHE_PREFIX = 'node_proxy' class HostCacheStatus(enum.Enum): @@ -1411,20 +1411,25 @@ class NodeProxyCache: self.mgr = mgr self.data: Dict[str, Any] = {} self.oob: Dict[str, Any] = {} + self.keyrings: Dict[str, str] = {} def load(self) -> None: - _oob = self.mgr.get_store('node_proxy/oob', "{}") + _oob = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/oob', '{}') self.oob = json.loads(_oob) - for k, v in self.mgr.get_store_prefix(NODE_PROXY_CACHE_PREFIX).items(): + _keyrings = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', '{}') + self.keyrings = json.loads(_keyrings) + + for k, v in self.mgr.get_store_prefix(f'{NODE_PROXY_CACHE_PREFIX}/data').items(): host = k.split('/')[-1:][0] if host not in self.mgr.inventory.keys(): # remove entry for host that no longer exists - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/{host}", None) + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", None) try: self.oob.pop(host) self.data.pop(host) + self.keyrings.pop(host) except KeyError: pass continue @@ -1434,7 +1439,15 @@ class NodeProxyCache: def save(self, host: str = '', data: Dict[str, Any] = {}) -> None: - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/{host}", json.dumps(data)) + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", json.dumps(data)) + + def update_oob(self, host: str, host_oob_info: Dict[str, str]) -> None: + self.oob[host] = host_oob_info + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/oob", json.dumps(self.oob)) + + def update_keyring(self, host: str, key: str) -> None: + self.keyrings[host] = key + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/keyrings", json.dumps(self.keyrings)) def fullreport(self, **kw: Any) -> Dict[str, Any]: """ diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index b82f861ce6a..5c5e602e944 100644 --- a/src/pybind/mgr/cephadm/module.py +++ 
b/src/pybind/mgr/cephadm/module.py @@ -11,6 +11,7 @@ from configparser import ConfigParser from contextlib import contextmanager from functools import wraps from tempfile import TemporaryDirectory, NamedTemporaryFile +from urllib.error import HTTPError from threading import Event from cephadm.service_discovery import ServiceDiscovery @@ -72,6 +73,7 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \ NodeExporterService, SNMPGatewayService, LokiService, PromtailService from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService +from .services.node_proxy import NodeProxy from .schedule import HostAssignment from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \ ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache @@ -444,6 +446,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down' ), Option( + 'hw_monitoring', + type='bool', + default=False, + desc='Deploy hw monitoring daemon on every host.' 
+ ), + Option( 'max_osd_draining_count', type='int', default=10, @@ -560,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.agent_refresh_rate = 0 self.agent_down_multiplier = 0.0 self.agent_starting_port = 0 + self.hw_monitoring = False self.service_discovery_port = 0 self.secure_monitoring_stack = False self.apply_spec_fails: List[Tuple[str, str]] = [] @@ -640,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService, IngressService, CustomContainerService, CephfsMirrorService, NvmeofService, CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService, - JaegerQueryService, JaegerAgentService, JaegerCollectorService + JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy ] # https://github.com/python/mypy/issues/8993 @@ -651,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd']) self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi']) self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof']) + self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy']) self.scheduled_async_actions: List[Callable] = [] @@ -663,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.http_server = CephadmHttpServer(self) self.http_server.start() + + self.node_proxy = NodeProxy(self) + self.agent_helpers = CephadmAgentHelpers(self) if self.use_agent: self.agent_helpers._apply_agent() @@ -829,7 +842,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, Generate a unique random service name """ suffix = daemon_type not in [ - 'mon', 'crash', 'ceph-exporter', + 'mon', 'crash', 'ceph-exporter', 'node-proxy', 'prometheus', 'node-exporter', 'grafana', 'alertmanager', 'container', 
'agent', 'snmp-gateway', 'loki', 'promtail', 'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query' @@ -1621,13 +1634,12 @@ Then run the following: spec.oob['addr'] = spec.hostname if not spec.oob.get('port'): spec.oob['port'] = '443' - data = json.loads(self.get_store('node_proxy/oob', '{}')) - data[spec.hostname] = dict() - data[spec.hostname]['addr'] = spec.oob['addr'] - data[spec.hostname]['port'] = spec.oob['port'] - data[spec.hostname]['username'] = spec.oob['username'] - data[spec.hostname]['password'] = spec.oob['password'] - self.set_store('node_proxy/oob', json.dumps(data)) + host_oob_info = dict() + host_oob_info['addr'] = spec.oob['addr'] + host_oob_info['port'] = spec.oob['port'] + host_oob_info['username'] = spec.oob['username'] + host_oob_info['password'] = spec.oob['password'] + self.node_proxy_cache.update_oob(spec.hostname, host_oob_info) # prime crush map? if spec.location: @@ -1653,6 +1665,51 @@ Then run the following: return self._add_host(spec) @handle_orch_error + def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]: + try: + result = self.node_proxy.led(light_type=light_type, + action=action, + hostname=hostname, + device=device) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}") + return result + + @handle_orch_error + def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str: + if not yes_i_really_mean_it: + raise OrchestratorError("you must pass --yes-i-really-mean-it") + + try: + self.node_proxy.shutdown(hostname, force) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on 
{hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}") + return f'Shutdown scheduled on {hostname}' + + @handle_orch_error + def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str: + if not yes_i_really_mean_it: + raise OrchestratorError("you must pass --yes-i-really-mean-it") + + try: + self.node_proxy.powercycle(hostname) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}") + return f'Powercycle scheduled on {hostname}' + + @handle_orch_error def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]: return self.node_proxy_cache.summary(hostname=hostname) @@ -2724,6 +2781,15 @@ Then run the following: pass deps = sorted([self.get_mgr_ip(), server_port, root_cert, str(self.device_enhanced_scan)]) + elif daemon_type == 'node-proxy': + root_cert = '' + server_port = '' + try: + server_port = str(self.http_server.agent.server_port) + root_cert = self.http_server.agent.ssl_certs.get_root_cert() + except Exception: + pass + deps = sorted([self.get_mgr_ip(), server_port, root_cert]) elif daemon_type == 'iscsi': if spec: iscsi_spec = cast(IscsiServiceSpec, spec) diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 262ecbd27f6..600329509a0 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -113,6 +113,9 @@ class CephadmServe: if self.mgr.agent_helpers._handle_use_agent_setting(): continue + if self.mgr.node_proxy_service.handle_hw_monitoring_setting(): + continue + if self.mgr.upgrade.continue_upgrade(): continue diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 
1681be00348..115ee8d0102 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -42,7 +42,7 @@ def get_auth_entity(daemon_type: str, daemon_id: str, host: str = "") -> AuthEnt # the CephService class refers to service types, not daemon types if daemon_type in ['rgw', 'rbd-mirror', 'cephfs-mirror', 'nfs', "iscsi", 'nvmeof', 'ingress', 'ceph-exporter']: return AuthEntity(f'client.{daemon_type}.{daemon_id}') - elif daemon_type in ['crash', 'agent']: + elif daemon_type in ['crash', 'agent', 'node-proxy']: if host == "": raise OrchestratorError( f'Host not provided to generate <{daemon_type}> auth entity name') @@ -1236,16 +1236,6 @@ class CephadmAgent(CephService): return daemon_spec - def pre_remove(self, daemon: DaemonDescription) -> None: - super().pre_remove(daemon) - - assert daemon.daemon_id is not None - daemon_id: str = daemon.daemon_id - - logger.info('Removing agent %s...' % daemon_id) - - self.mgr.agent_helpers._shutdown_node_proxy() - def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: agent = self.mgr.http_server.agent try: diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py new file mode 100644 index 00000000000..ebbbaf212c7 --- /dev/null +++ b/src/pybind/mgr/cephadm/services/node_proxy.py @@ -0,0 +1,180 @@ +import json +import ssl +import base64 + +from urllib.error import HTTPError, URLError +from typing import List, Any, Dict, Tuple, Optional, MutableMapping + +from .cephadmservice import CephadmDaemonDeploySpec, CephService +from ceph.deployment.service_spec import ServiceSpec, PlacementSpec +from ceph.utils import http_req +from orchestrator import OrchestratorError + + +class NodeProxy(CephService): + TYPE = 'node-proxy' + + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: + assert self.TYPE == daemon_spec.daemon_type + daemon_id, host = 
daemon_spec.daemon_id, daemon_spec.host + + if not self.mgr.http_server.agent: + raise OrchestratorError('Cannot deploy node-proxy before creating cephadm endpoint') + + keyring = self.get_keyring_with_caps(self.get_auth_entity(daemon_id, host=host), []) + daemon_spec.keyring = keyring + self.mgr.node_proxy_cache.update_keyring(host, keyring) + + daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) + + return daemon_spec + + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: + # node-proxy is re-using the agent endpoint and therefore + # needs similar checks to see if the endpoint is ready. + self.agent_endpoint = self.mgr.http_server.agent + try: + assert self.agent_endpoint + assert self.agent_endpoint.ssl_certs.get_root_cert() + assert self.agent_endpoint.server_port + except Exception: + raise OrchestratorError( + 'Cannot deploy node-proxy daemons until cephadm endpoint has finished generating certs') + + listener_cert, listener_key = self.agent_endpoint.ssl_certs.generate_cert(daemon_spec.host, self.mgr.inventory.get_addr(daemon_spec.host)) + cfg = { + 'target_ip': self.mgr.get_mgr_ip(), + 'target_port': self.agent_endpoint.server_port, + 'name': f'node-proxy.{daemon_spec.host}', + 'keyring': daemon_spec.keyring, + 'root_cert.pem': self.agent_endpoint.ssl_certs.get_root_cert(), + 'listener.crt': listener_cert, + 'listener.key': listener_key, + } + config = {'node-proxy.json': json.dumps(cfg)} + + return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port), + self.agent_endpoint.ssl_certs.get_root_cert()]) + + def handle_hw_monitoring_setting(self) -> bool: + # function to apply or remove node-proxy service spec depending + # on whether the hw_mointoring config option is set or not. + # It should return True when it either creates or deletes a spec + # and False otherwise. 
+ if self.mgr.hw_monitoring: + if 'node-proxy' not in self.mgr.spec_store: + spec = ServiceSpec( + service_type='node-proxy', + placement=PlacementSpec(host_pattern='*') + ) + self.mgr.spec_store.save(spec) + return True + return False + else: + if 'node-proxy' in self.mgr.spec_store: + self.mgr.spec_store.rm('node-proxy') + return True + return False + + def get_ssl_ctx(self) -> ssl.SSLContext: + ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert() + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = True + ssl_ctx.verify_mode = ssl.CERT_REQUIRED + ssl_ctx.load_verify_locations(cadata=ssl_root_crt) + return ssl_ctx + + def led(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: MutableMapping[str, str] = {} + method: str = 'PATCH' if action in ['on', 'off'] else 'GET' + payload: Optional[Dict[str, str]] = None + addr: str = self.mgr.inventory.get_addr(hostname) + endpoint: List[str] = ['led', light_type] + _device: str = device if device else '' + + if light_type == 'drive': + endpoint.append(_device) + + if method == 'PATCH': + payload = dict(state=action) + + header = self.generate_auth_header(hostname) + + endpoint = f'/{"/".join(endpoint)}' + + try: + headers, result, status = http_req(hostname=addr, + port='8080', + headers=header, + method=method, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json + + def generate_auth_header(self, hostname: str) -> Dict[str, str]: + try: + username = self.mgr.node_proxy_cache.oob[hostname]['username'] + password = self.mgr.node_proxy_cache.oob[hostname]['password'] + auth: bytes = f'{username}:{password}'.encode('utf-8') + auth_str: str = base64.b64encode(auth).decode('utf-8') + header = {'Authorization': f'Basic 
{auth_str}'} + except KeyError as e: + self.mgr.log.error(f'Check oob information is provided for {hostname}.') + raise RuntimeError(e) + return header + + def shutdown(self, hostname: str, force: Optional[bool] = False) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: Dict[str, str] = self.generate_auth_header(hostname) + addr: str = self.mgr.inventory.get_addr(hostname) + + endpoint = '/shutdown' + payload: Dict[str, Optional[bool]] = dict(force=force) + + try: + headers, result, status = http_req(hostname=addr, + port='8080', + headers=header, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json + + def powercycle(self, hostname: str) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: Dict[str, str] = self.generate_auth_header(hostname) + addr: str = self.mgr.inventory.get_addr(hostname) + + endpoint = '/powercycle' + + try: + headers, result, status = http_req(hostname=addr, + port='8080', + headers=header, + data="{}", + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json diff --git a/src/pybind/mgr/cephadm/tests/test_node_proxy.py b/src/pybind/mgr/cephadm/tests/test_node_proxy.py index b713d04cd59..0c9ee127547 100644 --- a/src/pybind/mgr/cephadm/tests/test_node_proxy.py +++ b/src/pybind/mgr/cephadm/tests/test_node_proxy.py @@ -35,8 +35,8 @@ class FakeMgr: class TestNodeProxyEndpoint(helper.CPWebCase): mgr = FakeMgr() app = NodeProxyEndpoint(mgr) - mgr.agent_cache.agent_keys = {"host01": "fake-secret01", - "host02": "fake-secret02"} + mgr.node_proxy.keyrings = {"host01": "fake-secret01", + "host02": "fake-secret02"} mgr.node_proxy.oob = {"host01": {"username": "oob-user01", "password": 
"oob-pass01"}, "host02": {"username": "oob-user02", @@ -68,38 +68,38 @@ class TestNodeProxyEndpoint(helper.CPWebCase): self.assertStatus('400 Bad Request') def test_oob_data_misses_secret_field(self): - data = '{"cephx": {"name": "host01"}}' + data = '{"cephx": {"name": "node-proxy.host01"}}' self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) self.assertStatus('400 Bad Request') def test_oob_agent_not_running(self): - data = '{"cephx": {"name": "host03", "secret": "fake-secret03"}}' + data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}' self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) self.assertStatus('502 Bad Gateway') def test_oob_wrong_keyring(self): - data = '{"cephx": {"name": "host01", "secret": "wrong-keyring"}}' + data = '{"cephx": {"name": "node-proxy.host01", "secret": "wrong-keyring"}}' self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) self.assertStatus('403 Forbidden') def test_oob_ok(self): - data = '{"cephx": {"name": "host01", "secret": "fake-secret01"}}' + data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) self.assertStatus('200 OK') def test_data_missing_patch(self): - data = '{"cephx": {"name": "host01", "secret": "fake-secret01"}}' + data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' self.getPage("/data", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) self.assertStatus('400 Bad Request') def test_data_raises_alert(self): patch = node_proxy_data.full_set_with_critical - data = {"cephx": {"name": "host01", "secret": "fake-secret01"}, "patch": 
patch} + data = {"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}, "patch": patch} data_str = json.dumps(data) self.getPage("/data", method="POST", body=data_str, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data_str)))]) diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py index 63672936c7c..3aedfbd86f0 100644 --- a/src/pybind/mgr/cephadm/utils.py +++ b/src/pybind/mgr/cephadm/utils.py @@ -31,7 +31,7 @@ RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs'] CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES # these daemon types use the ceph container image -CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs'] +CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs', 'node-proxy'] # these daemons do not use the ceph image. There are other daemons # that also don't use the ceph image, but we only care about those diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index f97b61e8f9b..362badcc35e 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -359,6 +359,33 @@ class Orchestrator(object): """ raise NotImplementedError() + def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> OrchResult[Dict[str, Any]]: + """ + Light a chassis or device ident LED. + + :param light_type: led type (chassis or device). + :param action: set or get status led. + :param hostname: the name of the host. + :param device: the device id (when light_type = 'device') + """ + raise NotImplementedError() + + def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> OrchResult[str]: + """ + Reboot a host. + + :param hostname: the name of the host being rebooted. 
+ """ + raise NotImplementedError() + + def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> OrchResult[str]: + """ + Shutdown a host. + + :param hostname: the name of the host to shutdown. + """ + raise NotImplementedError() + def hardware_status(self, hostname: Optional[str] = None, category: Optional[str] = 'summary') -> OrchResult[str]: """ Display hardware status. @@ -869,6 +896,7 @@ def daemon_type_to_service(dtype: str) -> str: 'crashcollector': 'crash', # Specific Rook Daemon 'container': 'container', 'agent': 'agent', + 'node-proxy': 'node-proxy', 'snmp-gateway': 'snmp-gateway', 'elasticsearch': 'elasticsearch', 'jaeger-agent': 'jaeger-agent', @@ -901,6 +929,7 @@ def service_to_daemon_types(stype: str) -> List[str]: 'crash': ['crash'], 'container': ['container'], 'agent': ['agent'], + 'node-proxy': ['node-proxy'], 'snmp-gateway': ['snmp-gateway'], 'elasticsearch': ['elasticsearch'], 'jaeger-agent': ['jaeger-agent'], diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 27d91d1a4d3..22e1dd2f652 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -591,6 +591,51 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return table.get_string() + class HardwareLightType(enum.Enum): + chassis = 'chassis' + device = 'drive' + + class HardwareLightAction(enum.Enum): + on = 'on' + off = 'off' + get = 'get' + + @_cli_write_command('orch hardware light') + def _hardware_light(self, + light_type: HardwareLightType, action: HardwareLightAction, + hostname: str, device: Optional[str] = None) -> HandleCommandResult: + """Enable or Disable a device or chassis LED""" + if light_type == self.HardwareLightType.device and not device: + return HandleCommandResult(stderr='you must pass a device ID.', + retval=-errno.ENOENT) + + completion = self.hardware_light(light_type.value, action.value, hostname, device) + data = 
raise_if_exception(completion) + output: str = '' + if action == self.HardwareLightAction.get: + status = 'on' if data["LocationIndicatorActive"] else 'off' + if light_type == self.HardwareLightType.device: + output = f'ident LED for {device} on {hostname} is: {status}' + else: + output = f'ident chassis LED for {hostname} is: {status}' + else: + pass + return HandleCommandResult(stdout=output) + + @_cli_write_command('orch hardware powercycle') + def _hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> HandleCommandResult: + """Reboot a host""" + completion = self.hardware_powercycle(hostname, yes_i_really_mean_it=yes_i_really_mean_it) + raise_if_exception(completion) + return HandleCommandResult(stdout=completion.result_str()) + + @_cli_write_command('orch hardware shutdown') + def _hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult: + """Shutdown a host""" + completion = self.hardware_shutdown(hostname, force, yes_i_really_mean_it=yes_i_really_mean_it) + raise_if_exception(completion) + return HandleCommandResult(stdout=completion.result_str()) + @_cli_write_command('orch host rm') def _remove_host(self, hostname: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> HandleCommandResult: """Remove a host""" |