summaryrefslogtreecommitdiffstats
path: root/src/pybind
diff options
context:
space:
mode:
authorGuillaume Abrioux <gabrioux@ibm.com>2024-01-15 13:38:39 +0100
committerGuillaume Abrioux <gabrioux@ibm.com>2024-01-25 16:07:21 +0100
commit3b8c945a6afeb7c8ab9f862468929f579c78adc3 (patch)
treec222bac1e39778b8584ecf176e5ff45f3a2d8c8a /src/pybind
parentnode-proxy: rename attribute and class (diff)
downloadceph-3b8c945a6afeb7c8ab9f862468929f579c78adc3.tar.xz
ceph-3b8c945a6afeb7c8ab9f862468929f579c78adc3.zip
node-proxy: make it a separate daemon
The current implementation requires the inclusion of all the recent modifications in the cephadm binary, which won't be backported. Since we need the node-proxy code backported to reef, let's move the code and make it a separate daemon. Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com> Co-authored-by: Adam King <adking@redhat.com>
Diffstat (limited to 'src/pybind')
-rw-r--r--src/pybind/mgr/cephadm/agent.py104
-rw-r--r--src/pybind/mgr/cephadm/inventory.py23
-rw-r--r--src/pybind/mgr/cephadm/module.py84
-rw-r--r--src/pybind/mgr/cephadm/serve.py3
-rw-r--r--src/pybind/mgr/cephadm/services/cephadmservice.py12
-rw-r--r--src/pybind/mgr/cephadm/services/node_proxy.py180
-rw-r--r--src/pybind/mgr/cephadm/tests/test_node_proxy.py16
-rw-r--r--src/pybind/mgr/cephadm/utils.py2
-rw-r--r--src/pybind/mgr/orchestrator/_interface.py29
-rw-r--r--src/pybind/mgr/orchestrator/module.py45
10 files changed, 418 insertions, 80 deletions
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index b589886e566..68495d3bc06 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -16,14 +16,15 @@ import time
from orchestrator import DaemonDescriptionStatus
from orchestrator._interface import daemon_type_to_service
-from ceph.utils import datetime_now
+from ceph.utils import datetime_now, http_req
from ceph.deployment.inventory import Devices
from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.ssl_cert_utils import SSLCerts
from mgr_util import test_port_allocation, PortAlreadyInUse
-from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional
+from urllib.error import HTTPError, URLError
+from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping
if TYPE_CHECKING:
from cephadm.module import CephadmOrchestrator
@@ -138,8 +139,9 @@ class NodeProxyEndpoint:
self.validate_node_proxy_data(data)
- host = data["cephx"]["name"]
- results['result'] = self.mgr.node_proxy_cache.oob.get(host)
+ # expecting name to be "node-proxy.<hostname>"
+ hostname = data['cephx']['name'][11:]
+ results['result'] = self.mgr.node_proxy_cache.oob.get(hostname, '')
if not results['result']:
raise cherrypy.HTTPError(400, 'The provided host has no iDrac details.')
return results
@@ -160,13 +162,15 @@ class NodeProxyEndpoint:
raise cherrypy.HTTPError(400, 'The field \'cephx\' must be provided.')
elif 'name' not in data['cephx'].keys():
cherrypy.response.status = 400
- raise cherrypy.HTTPError(400, 'The field \'host\' must be provided.')
- elif 'secret' not in data['cephx'].keys():
- raise cherrypy.HTTPError(400, 'The agent keyring must be provided.')
- elif not self.mgr.agent_cache.agent_keys.get(data['cephx']['name']):
- raise cherrypy.HTTPError(502, f'Make sure the agent is running on {data["cephx"]["name"]}')
- elif data['cephx']['secret'] != self.mgr.agent_cache.agent_keys[data['cephx']['name']]:
- raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {data["cephx"]["name"]}.')
+ raise cherrypy.HTTPError(400, 'The field \'name\' must be provided.')
+ # expecting name to be "node-proxy.<hostname>"
+ hostname = data['cephx']['name'][11:]
+ if 'secret' not in data['cephx'].keys():
+ raise cherrypy.HTTPError(400, 'The node-proxy keyring must be provided.')
+ elif not self.mgr.node_proxy_cache.keyrings.get(hostname, ''):
+ raise cherrypy.HTTPError(502, f'Make sure the node-proxy is running on {hostname}')
+ elif data['cephx']['secret'] != self.mgr.node_proxy_cache.keyrings[hostname]:
+ raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {hostname}.')
except AttributeError:
raise cherrypy.HTTPError(400, 'Malformed data received.')
@@ -289,12 +293,19 @@ class NodeProxyEndpoint:
:rtype: dict[str, Any]
"""
method: str = cherrypy.request.method
+ header: MutableMapping[str, str] = {}
hostname: Optional[str] = kw.get('hostname')
led_type: Optional[str] = kw.get('type')
id_drive: Optional[str] = kw.get('id')
- data: Optional[str] = None
- # this method is restricted to 'GET' or 'PATCH'
- action: str = 'get_led' if method == 'GET' else 'set_led'
+ payload: Optional[Dict[str, str]] = None
+ endpoint: List[Any] = ['led', led_type]
+ device: str = id_drive if id_drive else ''
+
+ ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+ ssl_ctx = ssl.create_default_context()
+ ssl_ctx.check_hostname = True
+ ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+ ssl_ctx.load_verify_locations(cadata=ssl_root_crt)
if not hostname:
msg: str = "listing enclosure LED status for all nodes is not implemented."
@@ -311,16 +322,32 @@ class NodeProxyEndpoint:
self.mgr.log.debug(msg)
raise cherrypy.HTTPError(400, msg)
+ if led_type == 'drive':
+ endpoint.append(device)
+
if hostname not in self.mgr.node_proxy_cache.data.keys():
# TODO(guits): update unit test for this
msg = f"'{hostname}' not found."
self.mgr.log.debug(msg)
raise cherrypy.HTTPError(400, msg)
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
if method == 'PATCH':
# TODO(guits): need to check the request is authorized
# allowing a specific keyring only ? (client.admin or client.agent.. ?)
- data = json.dumps(cherrypy.request.json)
+ data: Dict[str, Any] = cherrypy.request.json
+ if 'state' not in data.keys():
+ msg = "'state' key not provided."
+ raise cherrypy.HTTPError(400, msg)
+ if 'keyring' not in data.keys():
+ msg = "'keyring' key must be provided."
+ raise cherrypy.HTTPError(400, msg)
+ if data['keyring'] != self.mgr.node_proxy_cache.keyrings.get(hostname):
+ msg = 'wrong keyring provided.'
+ raise cherrypy.HTTPError(401, msg)
+ payload = {}
+ payload['state'] = data['state']
if led_type == 'drive':
if id_drive not in self.mgr.node_proxy_cache.data[hostname]['status']['storage'].keys():
@@ -329,28 +356,23 @@ class NodeProxyEndpoint:
self.mgr.log.debug(msg)
raise cherrypy.HTTPError(400, msg)
- payload: Dict[str, Any] = {"node_proxy_oob_cmd":
- {"action": action,
- "type": led_type,
- "id": id_drive,
- "host": hostname,
- "data": data}}
- try:
- message_thread = AgentMessageThread(
- hostname, self.mgr.agent_cache.agent_ports[hostname], payload, self.mgr)
- message_thread.start()
- message_thread.join() # TODO(guits): Add a timeout?
- except KeyError:
- raise cherrypy.HTTPError(502, f"{hostname}'s agent not running, please check.")
- agent_response = message_thread.get_agent_response()
+ endpoint = f'/{"/".join(endpoint)}'
+ header = self.mgr.node_proxy.generate_auth_header(hostname)
+
try:
- response_json: Dict[str, Any] = json.loads(agent_response)
- except json.decoder.JSONDecodeError:
- cherrypy.response.status = 503
- else:
- cherrypy.response.status = response_json.get('http_code', 503)
- if cherrypy.response.status != 200:
- raise cherrypy.HTTPError(cherrypy.response.status, "Couldn't change the LED status.")
+ headers, result, status = http_req(hostname=addr,
+ port='8080',
+ headers=header,
+ method=method,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ response_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.debug(e)
+ except URLError:
+ raise cherrypy.HTTPError(502, f'Make sure the node-proxy agent is deployed and running on {hostname}')
+
return response_json
@cherrypy.expose
@@ -842,16 +864,6 @@ class CephadmAgentHelpers:
host, self.mgr.agent_cache.agent_ports[host], payload, self.mgr, daemon_spec)
message_thread.start()
- def _shutdown_node_proxy(self) -> None:
- hosts = set([h for h in self.mgr.cache.get_hosts() if
- (h in self.mgr.agent_cache.agent_ports and not self.mgr.agent_cache.messaging_agent(h))])
-
- for host in hosts:
- payload: Dict[str, Any] = {'node_proxy_shutdown': host}
- message_thread = AgentMessageThread(
- host, self.mgr.agent_cache.agent_ports[host], payload, self.mgr)
- message_thread.start()
-
def _request_ack_all_not_up_to_date(self) -> None:
self.mgr.agent_helpers._request_agent_acks(
set([h for h in self.mgr.cache.get_hosts() if
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index 27bf55c921c..235737ef10e 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
HOST_CACHE_PREFIX = "host."
SPEC_STORE_PREFIX = "spec."
AGENT_CACHE_PREFIX = 'agent.'
-NODE_PROXY_CACHE_PREFIX = 'node_proxy/data'
+NODE_PROXY_CACHE_PREFIX = 'node_proxy'
class HostCacheStatus(enum.Enum):
@@ -1411,20 +1411,25 @@ class NodeProxyCache:
self.mgr = mgr
self.data: Dict[str, Any] = {}
self.oob: Dict[str, Any] = {}
+ self.keyrings: Dict[str, str] = {}
def load(self) -> None:
- _oob = self.mgr.get_store('node_proxy/oob', "{}")
+ _oob = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/oob', '{}')
self.oob = json.loads(_oob)
- for k, v in self.mgr.get_store_prefix(NODE_PROXY_CACHE_PREFIX).items():
+ _keyrings = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', '{}')
+ self.keyrings = json.loads(_keyrings)
+
+ for k, v in self.mgr.get_store_prefix(f'{NODE_PROXY_CACHE_PREFIX}/data').items():
host = k.split('/')[-1:][0]
if host not in self.mgr.inventory.keys():
# remove entry for host that no longer exists
- self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/{host}", None)
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", None)
try:
self.oob.pop(host)
self.data.pop(host)
+ self.keyrings.pop(host)
except KeyError:
pass
continue
@@ -1434,7 +1439,15 @@ class NodeProxyCache:
def save(self,
host: str = '',
data: Dict[str, Any] = {}) -> None:
- self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/{host}", json.dumps(data))
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", json.dumps(data))
+
+ def update_oob(self, host: str, host_oob_info: Dict[str, str]) -> None:
+ self.oob[host] = host_oob_info
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/oob", json.dumps(self.oob))
+
+ def update_keyring(self, host: str, key: str) -> None:
+ self.keyrings[host] = key
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/keyrings", json.dumps(self.keyrings))
def fullreport(self, **kw: Any) -> Dict[str, Any]:
"""
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index b82f861ce6a..5c5e602e944 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -11,6 +11,7 @@ from configparser import ConfigParser
from contextlib import contextmanager
from functools import wraps
from tempfile import TemporaryDirectory, NamedTemporaryFile
+from urllib.error import HTTPError
from threading import Event
from cephadm.service_discovery import ServiceDiscovery
@@ -72,6 +73,7 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError
from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \
NodeExporterService, SNMPGatewayService, LokiService, PromtailService
from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService
+from .services.node_proxy import NodeProxy
from .schedule import HostAssignment
from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \
ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache
@@ -444,6 +446,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
),
Option(
+ 'hw_monitoring',
+ type='bool',
+ default=False,
+ desc='Deploy hw monitoring daemon on every host.'
+ ),
+ Option(
'max_osd_draining_count',
type='int',
default=10,
@@ -560,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.agent_refresh_rate = 0
self.agent_down_multiplier = 0.0
self.agent_starting_port = 0
+ self.hw_monitoring = False
self.service_discovery_port = 0
self.secure_monitoring_stack = False
self.apply_spec_fails: List[Tuple[str, str]] = []
@@ -640,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService,
IngressService, CustomContainerService, CephfsMirrorService, NvmeofService,
CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService,
- JaegerQueryService, JaegerAgentService, JaegerCollectorService
+ JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy
]
# https://github.com/python/mypy/issues/8993
@@ -651,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd'])
self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
+ self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
self.scheduled_async_actions: List[Callable] = []
@@ -663,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.http_server = CephadmHttpServer(self)
self.http_server.start()
+
+ self.node_proxy = NodeProxy(self)
+
self.agent_helpers = CephadmAgentHelpers(self)
if self.use_agent:
self.agent_helpers._apply_agent()
@@ -829,7 +842,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
Generate a unique random service name
"""
suffix = daemon_type not in [
- 'mon', 'crash', 'ceph-exporter',
+ 'mon', 'crash', 'ceph-exporter', 'node-proxy',
'prometheus', 'node-exporter', 'grafana', 'alertmanager',
'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query'
@@ -1621,13 +1634,12 @@ Then run the following:
spec.oob['addr'] = spec.hostname
if not spec.oob.get('port'):
spec.oob['port'] = '443'
- data = json.loads(self.get_store('node_proxy/oob', '{}'))
- data[spec.hostname] = dict()
- data[spec.hostname]['addr'] = spec.oob['addr']
- data[spec.hostname]['port'] = spec.oob['port']
- data[spec.hostname]['username'] = spec.oob['username']
- data[spec.hostname]['password'] = spec.oob['password']
- self.set_store('node_proxy/oob', json.dumps(data))
+ host_oob_info = dict()
+ host_oob_info['addr'] = spec.oob['addr']
+ host_oob_info['port'] = spec.oob['port']
+ host_oob_info['username'] = spec.oob['username']
+ host_oob_info['password'] = spec.oob['password']
+ self.node_proxy_cache.update_oob(spec.hostname, host_oob_info)
# prime crush map?
if spec.location:
@@ -1653,6 +1665,51 @@ Then run the following:
return self._add_host(spec)
@handle_orch_error
+ def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+ try:
+ result = self.node_proxy.led(light_type=light_type,
+ action=action,
+ hostname=hostname,
+ device=device)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}")
+ return result
+
+ @handle_orch_error
+ def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.shutdown(hostname, force)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}")
+ return f'Shutdown scheduled on {hostname}'
+
+ @handle_orch_error
+ def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.powercycle(hostname)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}")
+ return f'Powercycle scheduled on {hostname}'
+
+ @handle_orch_error
def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]:
return self.node_proxy_cache.summary(hostname=hostname)
@@ -2724,6 +2781,15 @@ Then run the following:
pass
deps = sorted([self.get_mgr_ip(), server_port, root_cert,
str(self.device_enhanced_scan)])
+ elif daemon_type == 'node-proxy':
+ root_cert = ''
+ server_port = ''
+ try:
+ server_port = str(self.http_server.agent.server_port)
+ root_cert = self.http_server.agent.ssl_certs.get_root_cert()
+ except Exception:
+ pass
+ deps = sorted([self.get_mgr_ip(), server_port, root_cert])
elif daemon_type == 'iscsi':
if spec:
iscsi_spec = cast(IscsiServiceSpec, spec)
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 262ecbd27f6..600329509a0 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -113,6 +113,9 @@ class CephadmServe:
if self.mgr.agent_helpers._handle_use_agent_setting():
continue
+ if self.mgr.node_proxy_service.handle_hw_monitoring_setting():
+ continue
+
if self.mgr.upgrade.continue_upgrade():
continue
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 1681be00348..115ee8d0102 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -42,7 +42,7 @@ def get_auth_entity(daemon_type: str, daemon_id: str, host: str = "") -> AuthEnt
# the CephService class refers to service types, not daemon types
if daemon_type in ['rgw', 'rbd-mirror', 'cephfs-mirror', 'nfs', "iscsi", 'nvmeof', 'ingress', 'ceph-exporter']:
return AuthEntity(f'client.{daemon_type}.{daemon_id}')
- elif daemon_type in ['crash', 'agent']:
+ elif daemon_type in ['crash', 'agent', 'node-proxy']:
if host == "":
raise OrchestratorError(
f'Host not provided to generate <{daemon_type}> auth entity name')
@@ -1236,16 +1236,6 @@ class CephadmAgent(CephService):
return daemon_spec
- def pre_remove(self, daemon: DaemonDescription) -> None:
- super().pre_remove(daemon)
-
- assert daemon.daemon_id is not None
- daemon_id: str = daemon.daemon_id
-
- logger.info('Removing agent %s...' % daemon_id)
-
- self.mgr.agent_helpers._shutdown_node_proxy()
-
def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
agent = self.mgr.http_server.agent
try:
diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py
new file mode 100644
index 00000000000..ebbbaf212c7
--- /dev/null
+++ b/src/pybind/mgr/cephadm/services/node_proxy.py
@@ -0,0 +1,180 @@
+import json
+import ssl
+import base64
+
+from urllib.error import HTTPError, URLError
+from typing import List, Any, Dict, Tuple, Optional, MutableMapping
+
+from .cephadmservice import CephadmDaemonDeploySpec, CephService
+from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
+from ceph.utils import http_req
+from orchestrator import OrchestratorError
+
+
+class NodeProxy(CephService):
+ TYPE = 'node-proxy'
+
+ def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+ assert self.TYPE == daemon_spec.daemon_type
+ daemon_id, host = daemon_spec.daemon_id, daemon_spec.host
+
+ if not self.mgr.http_server.agent:
+ raise OrchestratorError('Cannot deploy node-proxy before creating cephadm endpoint')
+
+ keyring = self.get_keyring_with_caps(self.get_auth_entity(daemon_id, host=host), [])
+ daemon_spec.keyring = keyring
+ self.mgr.node_proxy_cache.update_keyring(host, keyring)
+
+ daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+
+ return daemon_spec
+
+ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+ # node-proxy is re-using the agent endpoint and therefore
+ # needs similar checks to see if the endpoint is ready.
+ self.agent_endpoint = self.mgr.http_server.agent
+ try:
+ assert self.agent_endpoint
+ assert self.agent_endpoint.ssl_certs.get_root_cert()
+ assert self.agent_endpoint.server_port
+ except Exception:
+ raise OrchestratorError(
+ 'Cannot deploy node-proxy daemons until cephadm endpoint has finished generating certs')
+
+ listener_cert, listener_key = self.agent_endpoint.ssl_certs.generate_cert(daemon_spec.host, self.mgr.inventory.get_addr(daemon_spec.host))
+ cfg = {
+ 'target_ip': self.mgr.get_mgr_ip(),
+ 'target_port': self.agent_endpoint.server_port,
+ 'name': f'node-proxy.{daemon_spec.host}',
+ 'keyring': daemon_spec.keyring,
+ 'root_cert.pem': self.agent_endpoint.ssl_certs.get_root_cert(),
+ 'listener.crt': listener_cert,
+ 'listener.key': listener_key,
+ }
+ config = {'node-proxy.json': json.dumps(cfg)}
+
+ return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port),
+ self.agent_endpoint.ssl_certs.get_root_cert()])
+
+ def handle_hw_monitoring_setting(self) -> bool:
+ # function to apply or remove node-proxy service spec depending
+        # on whether the hw_monitoring config option is set or not.
+ # It should return True when it either creates or deletes a spec
+ # and False otherwise.
+ if self.mgr.hw_monitoring:
+ if 'node-proxy' not in self.mgr.spec_store:
+ spec = ServiceSpec(
+ service_type='node-proxy',
+ placement=PlacementSpec(host_pattern='*')
+ )
+ self.mgr.spec_store.save(spec)
+ return True
+ return False
+ else:
+ if 'node-proxy' in self.mgr.spec_store:
+ self.mgr.spec_store.rm('node-proxy')
+ return True
+ return False
+
+ def get_ssl_ctx(self) -> ssl.SSLContext:
+ ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+ ssl_ctx = ssl.create_default_context()
+ ssl_ctx.check_hostname = True
+ ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+ ssl_ctx.load_verify_locations(cadata=ssl_root_crt)
+ return ssl_ctx
+
+ def led(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: MutableMapping[str, str] = {}
+ method: str = 'PATCH' if action in ['on', 'off'] else 'GET'
+ payload: Optional[Dict[str, str]] = None
+ addr: str = self.mgr.inventory.get_addr(hostname)
+ endpoint: List[str] = ['led', light_type]
+ _device: str = device if device else ''
+
+ if light_type == 'drive':
+ endpoint.append(_device)
+
+ if method == 'PATCH':
+ payload = dict(state=action)
+
+ header = self.generate_auth_header(hostname)
+
+ endpoint = f'/{"/".join(endpoint)}'
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='8080',
+ headers=header,
+ method=method,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
+
+ def generate_auth_header(self, hostname: str) -> Dict[str, str]:
+ try:
+ username = self.mgr.node_proxy_cache.oob[hostname]['username']
+ password = self.mgr.node_proxy_cache.oob[hostname]['password']
+ auth: bytes = f'{username}:{password}'.encode('utf-8')
+ auth_str: str = base64.b64encode(auth).decode('utf-8')
+ header = {'Authorization': f'Basic {auth_str}'}
+ except KeyError as e:
+ self.mgr.log.error(f'Check oob information is provided for {hostname}.')
+ raise RuntimeError(e)
+ return header
+
+ def shutdown(self, hostname: str, force: Optional[bool] = False) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: Dict[str, str] = self.generate_auth_header(hostname)
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
+ endpoint = '/shutdown'
+ payload: Dict[str, Optional[bool]] = dict(force=force)
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='8080',
+ headers=header,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
+
+ def powercycle(self, hostname: str) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: Dict[str, str] = self.generate_auth_header(hostname)
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
+ endpoint = '/powercycle'
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='8080',
+ headers=header,
+ data="{}",
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
diff --git a/src/pybind/mgr/cephadm/tests/test_node_proxy.py b/src/pybind/mgr/cephadm/tests/test_node_proxy.py
index b713d04cd59..0c9ee127547 100644
--- a/src/pybind/mgr/cephadm/tests/test_node_proxy.py
+++ b/src/pybind/mgr/cephadm/tests/test_node_proxy.py
@@ -35,8 +35,8 @@ class FakeMgr:
class TestNodeProxyEndpoint(helper.CPWebCase):
mgr = FakeMgr()
app = NodeProxyEndpoint(mgr)
- mgr.agent_cache.agent_keys = {"host01": "fake-secret01",
- "host02": "fake-secret02"}
+ mgr.node_proxy.keyrings = {"host01": "fake-secret01",
+ "host02": "fake-secret02"}
mgr.node_proxy.oob = {"host01": {"username": "oob-user01",
"password": "oob-pass01"},
"host02": {"username": "oob-user02",
@@ -68,38 +68,38 @@ class TestNodeProxyEndpoint(helper.CPWebCase):
self.assertStatus('400 Bad Request')
def test_oob_data_misses_secret_field(self):
- data = '{"cephx": {"name": "host01"}}'
+ data = '{"cephx": {"name": "node-proxy.host01"}}'
self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data)))])
self.assertStatus('400 Bad Request')
def test_oob_agent_not_running(self):
- data = '{"cephx": {"name": "host03", "secret": "fake-secret03"}}'
+ data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}'
self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data)))])
self.assertStatus('502 Bad Gateway')
def test_oob_wrong_keyring(self):
- data = '{"cephx": {"name": "host01", "secret": "wrong-keyring"}}'
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "wrong-keyring"}}'
self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data)))])
self.assertStatus('403 Forbidden')
def test_oob_ok(self):
- data = '{"cephx": {"name": "host01", "secret": "fake-secret01"}}'
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data)))])
self.assertStatus('200 OK')
def test_data_missing_patch(self):
- data = '{"cephx": {"name": "host01", "secret": "fake-secret01"}}'
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
self.getPage("/data", method="POST", body=data, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data)))])
self.assertStatus('400 Bad Request')
def test_data_raises_alert(self):
patch = node_proxy_data.full_set_with_critical
- data = {"cephx": {"name": "host01", "secret": "fake-secret01"}, "patch": patch}
+ data = {"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}, "patch": patch}
data_str = json.dumps(data)
self.getPage("/data", method="POST", body=data_str, headers=[('Content-Type', 'application/json'),
('Content-Length', str(len(data_str)))])
diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py
index 63672936c7c..3aedfbd86f0 100644
--- a/src/pybind/mgr/cephadm/utils.py
+++ b/src/pybind/mgr/cephadm/utils.py
@@ -31,7 +31,7 @@ RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs']
CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES
# these daemon types use the ceph container image
-CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs']
+CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs', 'node-proxy']
# these daemons do not use the ceph image. There are other daemons
# that also don't use the ceph image, but we only care about those
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index f97b61e8f9b..362badcc35e 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -359,6 +359,33 @@ class Orchestrator(object):
"""
raise NotImplementedError()
+ def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> OrchResult[Dict[str, Any]]:
+ """
+ Light a chassis or device ident LED.
+
+ :param light_type: led type (chassis or device).
+ :param action: set or get status led.
+ :param hostname: the name of the host.
+ :param device: the device id (when light_type = 'device')
+ """
+ raise NotImplementedError()
+
+ def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> OrchResult[str]:
+ """
+ Reboot a host.
+
+ :param hostname: the name of the host being rebooted.
+ """
+ raise NotImplementedError()
+
+ def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> OrchResult[str]:
+ """
+ Shutdown a host.
+
+ :param hostname: the name of the host to shutdown.
+ """
+ raise NotImplementedError()
+
def hardware_status(self, hostname: Optional[str] = None, category: Optional[str] = 'summary') -> OrchResult[str]:
"""
Display hardware status.
@@ -869,6 +896,7 @@ def daemon_type_to_service(dtype: str) -> str:
'crashcollector': 'crash', # Specific Rook Daemon
'container': 'container',
'agent': 'agent',
+ 'node-proxy': 'node-proxy',
'snmp-gateway': 'snmp-gateway',
'elasticsearch': 'elasticsearch',
'jaeger-agent': 'jaeger-agent',
@@ -901,6 +929,7 @@ def service_to_daemon_types(stype: str) -> List[str]:
'crash': ['crash'],
'container': ['container'],
'agent': ['agent'],
+ 'node-proxy': ['node-proxy'],
'snmp-gateway': ['snmp-gateway'],
'elasticsearch': ['elasticsearch'],
'jaeger-agent': ['jaeger-agent'],
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index 27d91d1a4d3..22e1dd2f652 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -591,6 +591,51 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
return table.get_string()
+ class HardwareLightType(enum.Enum):
+ chassis = 'chassis'
+ device = 'drive'
+
+ class HardwareLightAction(enum.Enum):
+ on = 'on'
+ off = 'off'
+ get = 'get'
+
+ @_cli_write_command('orch hardware light')
+ def _hardware_light(self,
+ light_type: HardwareLightType, action: HardwareLightAction,
+ hostname: str, device: Optional[str] = None) -> HandleCommandResult:
+ """Enable or Disable a device or chassis LED"""
+ if light_type == self.HardwareLightType.device and not device:
+ return HandleCommandResult(stderr='you must pass a device ID.',
+ retval=-errno.ENOENT)
+
+ completion = self.hardware_light(light_type.value, action.value, hostname, device)
+ data = raise_if_exception(completion)
+ output: str = ''
+ if action == self.HardwareLightAction.get:
+ status = 'on' if data["LocationIndicatorActive"] else 'off'
+ if light_type == self.HardwareLightType.device:
+ output = f'ident LED for {device} on {hostname} is: {status}'
+ else:
+ output = f'ident chassis LED for {hostname} is: {status}'
+ else:
+ pass
+ return HandleCommandResult(stdout=output)
+
+ @_cli_write_command('orch hardware powercycle')
+ def _hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
+ """Reboot a host"""
+ completion = self.hardware_powercycle(hostname, yes_i_really_mean_it=yes_i_really_mean_it)
+ raise_if_exception(completion)
+ return HandleCommandResult(stdout=completion.result_str())
+
+ @_cli_write_command('orch hardware shutdown')
+ def _hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
+ """Shutdown a host"""
+ completion = self.hardware_shutdown(hostname, force, yes_i_really_mean_it=yes_i_really_mean_it)
+ raise_if_exception(completion)
+ return HandleCommandResult(stdout=completion.result_str())
+
@_cli_write_command('orch host rm')
def _remove_host(self, hostname: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> HandleCommandResult:
"""Remove a host"""