summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c24
-rw-r--r--mm/vmscan.c94
-rw-r--r--mm/workingset.c7
3 files changed, 87 insertions, 38 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 490d5b4676c1..108d5b097db1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -569,23 +569,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
* common workload, threshold and synchronization as vmstat[] should be
* implemented.
*/
-static unsigned long
-mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
-{
- long val = 0;
- int cpu;
-
- /* Per-cpu values can be negative, use a signed accumulator */
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->count[idx], cpu);
- /*
- * Summing races with updates, so val may be negative. Avoid exposing
- * transient negative values.
- */
- if (val < 0)
- val = 0;
- return val;
-}
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
@@ -5244,6 +5227,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pgmajfault %lu\n",
events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ seq_printf(m, "workingset_refault %lu\n",
+ stat[MEMCG_WORKINGSET_REFAULT]);
+ seq_printf(m, "workingset_activate %lu\n",
+ stat[MEMCG_WORKINGSET_ACTIVATE]);
+ seq_printf(m, "workingset_nodereclaim %lu\n",
+ stat[MEMCG_WORKINGSET_NODERECLAIM]);
+
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7a30150b4dee..e8cb983a8c84 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2006,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2022,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool trace)
+ struct mem_cgroup *memcg,
+ struct scan_control *sc, bool actual_reclaim)
{
- unsigned long inactive_ratio;
- unsigned long inactive, active;
- enum lru_list inactive_lru = file * LRU_FILE;
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ enum lru_list inactive_lru = file * LRU_FILE;
+ unsigned long inactive, active;
+ unsigned long inactive_ratio;
+ unsigned long refaults;
unsigned long gb;
/*
@@ -2040,27 +2045,43 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
+ if (memcg)
+ refaults = mem_cgroup_read_stat(memcg,
+ MEMCG_WORKINGSET_ACTIVATE);
else
- inactive_ratio = 1;
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+ /*
+ * When refaults are being observed, it means a new workingset
+ * is being established. Disable active list protection to get
+ * rid of the stale workingset quickly.
+ */
+ if (file && actual_reclaim && lruvec->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
- if (trace)
- trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
- sc->reclaim_idx,
- lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
- lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
- inactive_ratio, file);
+ if (actual_reclaim)
+ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+ inactive_ratio, file);
return inactive * inactive_ratio < active;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct scan_control *sc)
+ struct lruvec *lruvec, struct mem_cgroup *memcg,
+ struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ memcg, sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2169,7 +2190,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2320,7 +2341,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
+ lruvec, memcg, sc);
}
}
@@ -2387,7 +2408,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2703,6 +2724,26 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask = orig_mask;
}
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+ do {
+ unsigned long refaults;
+ struct lruvec *lruvec;
+
+ if (memcg)
+ refaults = mem_cgroup_read_stat(memcg,
+ MEMCG_WORKINGSET_ACTIVATE);
+ else
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ lruvec->refaults = refaults;
+ } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -2723,6 +2764,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int initial_priority = sc->priority;
+ pg_data_t *last_pgdat;
+ struct zoneref *z;
+ struct zone *zone;
retry:
delayacct_freepages_start();
@@ -2749,6 +2793,15 @@ retry:
sc->may_writepage = 1;
} while (--sc->priority >= 0);
+ last_pgdat = NULL;
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+ sc->nodemask) {
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+ }
+
delayacct_freepages_end();
if (sc->nr_reclaimed)
@@ -3033,7 +3086,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3280,6 +3333,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
pgdat->kswapd_failures++;
out:
+ snapshot_refaults(NULL, pgdat);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/mm/workingset.c b/mm/workingset.c
index eda05c71fa49..51c6f61d4cea 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow)
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
- rcu_read_unlock();
/*
* The unsigned subtraction here gives an accurate distance
@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_node_state(pgdat, WORKINGSET_REFAULT);
+ mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_REFAULT);
if (refault_distance <= active_file) {
inc_node_state(pgdat, WORKINGSET_ACTIVATE);
+ mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_ACTIVATE);
+ rcu_read_unlock();
return true;
}
+ rcu_read_unlock();
return false;
}
@@ -472,6 +475,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ mem_cgroup_inc_page_stat(virt_to_page(node),
+ MEMCG_WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node,
workingset_update_node, mapping);