
Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing.

Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in folio->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]; otherwise it stores 0.

There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2].

To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual.

The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads.

There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present, and the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4]. The next patch will address the "outlying refaults".

Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy.

A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS.
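As a rough, self-contained sketch of the bookkeeping described above (the constants, helper names and layout here are illustrative assumptions for this sketch, not the kernel implementation):

/* Illustrative sketch of the gen counter scheme described above; not kernel code. */
#include <assert.h>
#include <stdbool.h>

#define MIN_NR_GENS	2UL	/* second chance needs at least two generations */
#define MAX_NR_GENS	4UL	/* assumed maximum for this sketch */

/*
 * With MAX_NR_GENS == 4, the gen counter must hold 0..MAX_NR_GENS, i.e.,
 * order_base_2(MAX_NR_GENS + 1) == 3 bits.
 */

/* truncate a monotonically increasing sequence number to an index into lrugen->lists[] */
static unsigned long gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/*
 * value kept in the gen counter in folio->flags:
 * [1, MAX_NR_GENS] while the page is on one of lrugen->lists[], 0 otherwise
 */
static unsigned long gen_counter(unsigned long seq, bool on_lru)
{
	return on_lru ? gen_from_seq(seq) + 1 : 0;
}

int main(void)
{
	unsigned long max_seq = 7, min_seq = 5;	/* example sequence numbers */

	assert(MAX_NR_GENS >= MIN_NR_GENS);
	/*
	 * the oldest and the youngest generations map to different lists as
	 * long as max_seq - min_seq < MAX_NR_GENS
	 */
	assert(gen_from_seq(max_seq) != gen_from_seq(min_seq));
	assert(gen_counter(max_seq, true) >= 1 && gen_counter(max_seq, true) <= MAX_NR_GENS);
	assert(gen_counter(max_seq, false) == 0);
	return 0;
}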
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/

Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef PAGE_FLAGS_LAYOUT_H
#define PAGE_FLAGS_LAYOUT_H

#include <linux/numa.h>
#include <generated/bounds.h>

/*
 * When a memory allocation must conform to specific limitations (such
 * as being suitable for DMA) the caller will pass in hints to the
 * allocator in the gfp_mask, in the zone modifier bits.  These bits
 * are used to select a priority ordered list of memory zones which
 * match the requested limits. See gfp_zone() in include/linux/gfp.h
 */
#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT "Too many zones configured"
#endif

#define ZONES_WIDTH ZONES_SHIFT

#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
#else
#define SECTIONS_SHIFT 0
#endif

#ifndef BUILD_VDSO32_64
/*
 * page->flags layout:
 *
 * There are five possibilities for how page->flags get laid out.  The first
 * pair is for the normal case without sparsemem. The second pair is for
 * sparsemem when there is plenty of space for node and section information.
 * The last is when there is insufficient space in page->flags and a separate
 * lookup is necessary.
 *
 * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |             ... | FLAGS |
 *      " plus space for last_cpupid: |       NODE     | ZONE | LAST_CPUPID ... | FLAGS |
 * classic sparse with space for node:| SECTION | NODE | ZONE |             ... | FLAGS |
 *      " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
 * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTIONS_WIDTH SECTIONS_SHIFT
#else
#define SECTIONS_WIDTH 0
#endif

#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
	<= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
#else
#define NODES_WIDTH 0
#endif

/*
 * Note that this #define MUST have a value so that it can be tested with
 * the IS_ENABLED() macro.
 */
#if NODES_SHIFT != 0 && NODES_WIDTH == 0
#define NODE_NOT_IN_PAGE_FLAGS 1
#endif

#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#define KASAN_TAG_WIDTH 8
#else
#define KASAN_TAG_WIDTH 0
#endif

#ifdef CONFIG_NUMA_BALANCING
#define LAST__PID_SHIFT 8
#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1)

#define LAST__CPU_SHIFT NR_CPUS_BITS
#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1)

#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
#else
#define LAST_CPUPID_SHIFT 0
#endif

#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
#endif

#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif

#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif

#define LRU_REFS_WIDTH 0

#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
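The width macros above are consumed by other headers (for instance, the *_PGOFF and *_MASK definitions the commit message refers to), which pack the fields downward from the most significant end of page->flags. The following standalone sketch illustrates that packing pattern only; the DEMO_* names and the example widths are invented for illustration and do not correspond to any particular kernel configuration.

/* Standalone sketch of how field offsets and masks are derived from widths;
 * the DEMO_* names and example widths are arbitrary and for illustration only. */
#include <stdio.h>

#define DEMO_BITS_PER_LONG	64

/* example widths (assumed, for illustration only) */
#define DEMO_SECTIONS_WIDTH	0
#define DEMO_NODES_WIDTH	6
#define DEMO_ZONES_WIDTH	3
#define DEMO_LRU_GEN_WIDTH	3	/* order_base_2(MAX_NR_GENS + 1) with MAX_NR_GENS == 4 */
#define DEMO_LRU_REFS_WIDTH	0	/* matches the placeholder added by this patch */

/* fields are packed downward from the most significant end of the flags word */
#define DEMO_SECTIONS_PGOFF	(DEMO_BITS_PER_LONG - DEMO_SECTIONS_WIDTH)
#define DEMO_NODES_PGOFF	(DEMO_SECTIONS_PGOFF - DEMO_NODES_WIDTH)
#define DEMO_ZONES_PGOFF	(DEMO_NODES_PGOFF - DEMO_ZONES_WIDTH)
#define DEMO_LRU_GEN_PGOFF	(DEMO_ZONES_PGOFF - DEMO_LRU_GEN_WIDTH)
#define DEMO_LRU_REFS_PGOFF	(DEMO_LRU_GEN_PGOFF - DEMO_LRU_REFS_WIDTH)

/* a width/offset pair yields a mask for extracting the field */
#define DEMO_MASK(width, pgoff)	(((1ULL << (width)) - 1) << (pgoff))

int main(void)
{
	unsigned long long flags = 0;

	/* store gen counter value 3 into the demo flags word */
	flags |= 3ULL << DEMO_LRU_GEN_PGOFF;

	printf("LRU_GEN field: offset %d, mask %#llx, stored value %llu\n",
	       DEMO_LRU_GEN_PGOFF,
	       DEMO_MASK(DEMO_LRU_GEN_WIDTH, DEMO_LRU_GEN_PGOFF),
	       (flags & DEMO_MASK(DEMO_LRU_GEN_WIDTH, DEMO_LRU_GEN_PGOFF)) >> DEMO_LRU_GEN_PGOFF);
	return 0;
}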