
In NUMA balancing memory tiering mode, if there are hot pages in the slow memory node and cold pages in the fast memory node, we need to promote/demote hot/cold pages between the fast and slow memory nodes. One choice is to promote/demote as fast as possible. But the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workloads because of access latency inflation and memory bandwidth contention. A way to resolve this issue is to restrict the maximum promoting/demoting throughput. It will take longer to finish the promoting/demoting, but the workload latency will be better. This is implemented in this patch as the page promotion rate limit mechanism. The number of the candidate pages to be promoted to the fast memory node via NUMA balancing is counted; if the count exceeds the limit specified by the users, the NUMA balancing promotion will be stopped until the next second. A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added for the users to specify the limit. Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: osalvador <osalvador@suse.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@surriel.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Wei Xu <weixugc@google.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Zhong Jiang <zhongjiang-ali@linux.alibaba.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
39 lines
956 B
C
39 lines
956 B
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SCHED_SYSCTL_H
|
|
#define _LINUX_SCHED_SYSCTL_H
|
|
|
|
#include <linux/types.h>
|
|
|
|
struct ctl_table;
|
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK
|
|
/* used for hung_task and block/ */
|
|
extern unsigned long sysctl_hung_task_timeout_secs;
|
|
#else
|
|
/* Avoid need for ifdefs elsewhere in the code */
|
|
enum { sysctl_hung_task_timeout_secs = 0 };
|
|
#endif
|
|
|
|
enum sched_tunable_scaling {
|
|
SCHED_TUNABLESCALING_NONE,
|
|
SCHED_TUNABLESCALING_LOG,
|
|
SCHED_TUNABLESCALING_LINEAR,
|
|
SCHED_TUNABLESCALING_END,
|
|
};
|
|
|
|
#define NUMA_BALANCING_DISABLED 0x0
|
|
#define NUMA_BALANCING_NORMAL 0x1
|
|
#define NUMA_BALANCING_MEMORY_TIERING 0x2
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
extern int sysctl_numa_balancing_mode;
|
|
extern unsigned int sysctl_numa_balancing_promote_rate_limit;
|
|
#else
|
|
#define sysctl_numa_balancing_mode 0
|
|
#endif
|
|
|
|
int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
|
|
size_t *lenp, loff_t *ppos);
|
|
|
|
#endif /* _LINUX_SCHED_SYSCTL_H */
|