From 1b418f75e6346286ceed7f44ce4f81b46ff8e012 Mon Sep 17 00:00:00 2001 From: Andrea Restle-Lay Date: Wed, 18 Dec 2024 17:12:52 -0500 Subject: AAP-36604 (analytics) Thousands of zombie/orphaned Slow/Stuck DB queries in controller querying active host count (#15715) * lint * change timeout to 5 minutes * change timeout to 5 minutes --- awx/main/tasks/host_metrics.py | 49 ++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/awx/main/tasks/host_metrics.py b/awx/main/tasks/host_metrics.py index e5f1263ad1..9e163cfa73 100644 --- a/awx/main/tasks/host_metrics.py +++ b/awx/main/tasks/host_metrics.py @@ -11,6 +11,8 @@ from awx.main.dispatch.publish import task from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly from awx.main.tasks.helpers import is_run_threshold_reached from awx.conf.license import get_license +from awx.main.utils.pglock import advisory_lock + logger = logging.getLogger('awx.main.tasks.host_metrics') @@ -90,7 +92,10 @@ class HostMetricTask: class HostMetricSummaryMonthlyTask: + LOCK_KEY = 'host_metric_summary_monthly' + LOCK_SESSION_TIMEOUT = 300000 # 5 minutes. """ + Task runs every four hours, longer lock timeout avoids premature termination due to high db load or other latency. This task computes last [threshold] months of HostMetricSummaryMonthly table [threshold] is setting CLEANUP_HOST_METRICS_HARD_THRESHOLD Each record in the table represents changes in HostMetric table in one month @@ -115,29 +120,37 @@ class HostMetricSummaryMonthlyTask: self.records_to_update = [] def execute(self): - self._load_existing_summaries() - self._load_hosts_added() - self._load_hosts_deleted() - # Get first month after last hard delete - month = self._get_first_month() - license_consumed = self._get_license_consumed_before(month) + with advisory_lock( + HostMetricSummaryMonthlyTask.LOCK_KEY, lock_session_timeout_milliseconds=HostMetricSummaryMonthlyTask.LOCK_SESSION_TIMEOUT, wait=False + ) as acquired: + if not acquired: + logger.info("Another instance of host_metric_summary_monthly is already running. Exiting.") + return + + self._load_existing_summaries() + self._load_hosts_added() + self._load_hosts_deleted() + + # Get first month after last hard delete + month = self._get_first_month() + license_consumed = self._get_license_consumed_before(month) - # Fill record for each month - while month <= datetime.date.today().replace(day=1): - summary = self._find_or_create_summary(month) - # Update summary and update license_consumed by hosts added/removed this month - self._update_summary(summary, month, license_consumed) - license_consumed = summary.license_consumed + # Fill record for each month + while month <= datetime.date.today().replace(day=1): + summary = self._find_or_create_summary(month) + # Update summary and update license_consumed by hosts added/removed this month + self._update_summary(summary, month, license_consumed) + license_consumed = summary.license_consumed - month = month + relativedelta(months=1) + month = month + relativedelta(months=1) - # Create/Update stats - HostMetricSummaryMonthly.objects.bulk_create(self.records_to_create, batch_size=1000) - HostMetricSummaryMonthly.objects.bulk_update(self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted'], batch_size=1000) + # Create/Update stats + HostMetricSummaryMonthly.objects.bulk_create(self.records_to_create, batch_size=1000) + HostMetricSummaryMonthly.objects.bulk_update(self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted'], batch_size=1000) - # Set timestamp of last run - settings.HOST_METRIC_SUMMARY_TASK_LAST_TS = now() + # Set timestamp of last run + settings.HOST_METRIC_SUMMARY_TASK_LAST_TS = now() def _get_license_consumed_before(self, month): license_consumed = 0 -- cgit v1.2.3