summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMilind Changire <mchangir@redhat.com>2024-01-11 13:51:31 +0100
committerMilind Changire <mchangir@redhat.com>2024-11-04 14:15:10 +0100
commit4d2af0c57839418b135e2ef45778fde77e12199a (patch)
tree96fcbc8758e0ad4e93e14f90e6a351674ab87c1f /src
parentMerge pull request #59486 from ceph/ceph-ec-benchmark-increase-default-buffer... (diff)
downloadceph-4d2af0c57839418b135e2ef45778fde77e12199a.tar.xz
ceph-4d2af0c57839418b135e2ef45778fde77e12199a.zip
mgr/snap_schedule: handle volume delete
Remove references to timers and DB connections that refer to paths in the deleted volume. This is a best-effort cleanup and is not 100% safe. There can be race conditions where the DB connection is closed while the module is executing a query on the DB as the volume itself is being deleted. Hopefully not many tracebacks will be seen. Fixes: https://tracker.ceph.com/issues/63999 Signed-off-by: Milind Changire <mchangir@redhat.com>
Diffstat (limited to 'src')
-rw-r--r--src/pybind/mgr/snap_schedule/fs/schedule_client.py24
-rw-r--r--src/pybind/mgr/snap_schedule/module.py19
2 files changed, 42 insertions, 1 deletions
diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
index b58f20f1275..12e5e980737 100644
--- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py
+++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
@@ -163,6 +163,7 @@ class SnapSchedClient(CephfsClient):
self.sqlite_connections: Dict[str, DBInfo] = {}
self.active_timers: Dict[Tuple[str, str], List[Timer]] = {}
self.conn_lock: Lock = Lock() # lock to protect add/lookup db connections
+ self.timers_lock: Lock = Lock()
# restart old schedules
for fs_name in self.get_all_filesystems():
@@ -273,6 +274,27 @@ class SnapSchedClient(CephfsClient):
if self._is_allowed_repeat(r, path)][0:1]
return rows
+ def delete_references_to_unavailable_fs(self, available_fs_names: Set[str]) -> None:
+     """
+     Drop timers and DB connections of filesystems that no longer exist.
+
+     Every ``(fs, path)`` entry in ``self.active_timers`` whose ``fs`` is
+     not in ``available_fs_names`` has its timers cancelled and is removed.
+     The ``self.sqlite_connections`` entry of each such ``fs`` is then
+     removed and its DB connection closed.
+
+     :param available_fs_names: names of the filesystems that still exist
+     """
+     fs_to_remove: Set[str] = set()
+     # 'with' releases the lock even if something below raises; a leaked
+     # timers_lock would block every later refresh_snap_timers() call
+     with self.timers_lock:
+         for fs, path in list(self.active_timers.keys()):  # each key is a tuple
+             if fs not in available_fs_names:
+                 fs_to_remove.add(fs)
+                 for t in self.active_timers[(fs, path)]:
+                     t.cancel()
+                 log.debug(f'Cancelled timers for "{fs}:{path}"')
+                 del self.active_timers[(fs, path)]
+                 log.debug(f'Removed timer instance for "{fs}"')
+
+     with self.conn_lock:
+         for fs in fs_to_remove:
+             # tolerate an fs that has timers but no connection entry
+             # instead of raising KeyError
+             dbinfo = self.sqlite_connections.pop(fs, None)
+             if dbinfo is None:
+                 continue
+             log.debug(f'Removed DB connection to "{fs}"')
+             try:
+                 dbinfo.db.close()
+                 log.debug(f'Closed DB connection to "{fs}"')
+             except sqlite3.Error:
+                 # best effort: one failing close must not keep the
+                 # remaining connections from being closed, since their
+                 # timer entries are already gone and they would never
+                 # be picked up again
+                 self._log_exception('delete_references_to_unavailable_fs')
+
def refresh_snap_timers(self, fs: str, path: str, olddb: Optional[sqlite3.Connection] = None) -> None:
try:
log.debug((f'SnapDB on {fs} changed for {path}, '
@@ -286,6 +308,7 @@ class SnapSchedClient(CephfsClient):
with self.get_schedule_db(fs) as conn_mgr:
db = conn_mgr.dbinfo.db
rows = self.fetch_schedules(db, path)
+ self.timers_lock.acquire()
timers = self.active_timers.get((fs, path), [])
for timer in timers:
timer.cancel()
@@ -299,6 +322,7 @@ class SnapSchedClient(CephfsClient):
timers.append(t)
log.debug(f'Will snapshot {path} in fs {fs} in {row[1]}s')
self.active_timers[(fs, path)] = timers
+ self.timers_lock.release()
except Exception:
self._log_exception('refresh_snap_timers')
diff --git a/src/pybind/mgr/snap_schedule/module.py b/src/pybind/mgr/snap_schedule/module.py
index d8f04a62b94..adf982448b1 100644
--- a/src/pybind/mgr/snap_schedule/module.py
+++ b/src/pybind/mgr/snap_schedule/module.py
@@ -8,12 +8,14 @@ import json
import sqlite3
from typing import Any, Dict, Optional, Tuple, Union
from .fs.schedule_client import SnapSchedClient
-from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option
+from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option, NotifyType
from mgr_util import CephfsConnectionException
from threading import Event
class Module(MgrModule):
+ NOTIFY_TYPES = [NotifyType.fs_map]
+
MODULE_OPTIONS = [
Option(
'allow_m_granularity',
@@ -37,6 +39,21 @@ class Module(MgrModule):
self._initialized = Event()
self.client = SnapSchedClient(self)
+ def notify(self, notify_type: NotifyType, notify_id: str) -> None:
+     """
+     Handle an fs_map notification.
+
+     Collects the names of all filesystems listed in the current fs map
+     and asks the client to drop its references to any other filesystem.
+     Notifications of other types, and an empty fs map, are ignored.
+     """
+     if notify_type != NotifyType.fs_map:
+         return
+     fs_map = self.get('fs_map')
+     if fs_map:
+         # the notification does not say which filesystem changed, so hand
+         # over the full set of current names and let the client prune
+         current_names = {
+             entry['mdsmap']['fs_name'] for entry in fs_map['filesystems']
+         }
+         self.client.delete_references_to_unavailable_fs(current_names)
+
def _subvolume_exist(self, fs: str, subvol: Union[str, None], group: Union[str, None]) -> bool:
rc, subvolumes, err = self.remote('volumes', 'subvolume_ls', fs, group)
if rc == 0: