From 475031b6ed43d208925c81bea612f48c3259c3c8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:31 +0200 Subject: [PATCH 01/34] arm64: head: move kimage_vaddr variable into C file This variable definition does not need to be in head.S so move it out. Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20220624150651.1358849-2-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 7 ------- arch/arm64/mm/mmu.c | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 6a98f1a38c29..1cdecce552bb 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -469,13 +469,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) ASM_BUG() SYM_FUNC_END(__primary_switched) - .pushsection ".rodata", "a" -SYM_DATA_START(kimage_vaddr) - .quad _text -SYM_DATA_END(kimage_vaddr) -EXPORT_SYMBOL(kimage_vaddr) - .popsection - /* * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 626ec32873c6..fde2b326419a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,6 +49,9 @@ u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; u64 __section(".mmuoff.data.write") vabits_actual; EXPORT_SYMBOL(vabits_actual); +u64 kimage_vaddr __ro_after_init = (u64)&_text; +EXPORT_SYMBOL(kimage_vaddr); + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); From 0d9b1ffefabee93727bae68201593fac80a79002 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:32 +0200 Subject: [PATCH 02/34] arm64: mm: make vabits_actual a build time constant if possible Currently, we only support 52-bit virtual addressing on 64k pages configurations, and in all other cases, vabits_actual is guaranteed to equal VA_BITS (== VA_BITS_MIN). So get rid of the variable entirely in that case. While at it, move the assignment out of the asm entry code - it has no need to be there. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-3-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 4 ++++ arch/arm64/kernel/head.S | 15 +-------------- arch/arm64/mm/init.c | 15 ++++++++++++++- arch/arm64/mm/mmu.c | 4 +++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0af70d9abede..c751cd9b94f8 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -174,7 +174,11 @@ #include #include +#if VA_BITS > 48 extern u64 vabits_actual; +#else +#define vabits_actual ((u64)VA_BITS) +#endif extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 1cdecce552bb..dc07858eb673 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -293,19 +293,6 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x0, idmap_pg_dir adrp x3, __idmap_text_start // __pa(__idmap_text_start) -#ifdef CONFIG_ARM64_VA_BITS_52 - mrs_s x6, SYS_ID_AA64MMFR2_EL1 - and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT) - mov x5, #52 - cbnz x6, 1f -#endif - mov x5, #VA_BITS_MIN -1: - adr_l x6, vabits_actual - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - /* * VA_BITS may be too small to allow for an ID mapping to be created * that covers system RAM if that is located sufficiently high in the @@ -713,7 +700,7 @@ SYM_FUNC_START(__enable_mmu) SYM_FUNC_END(__enable_mmu) SYM_FUNC_START(__cpu_secondary_check52bitva) -#ifdef CONFIG_ARM64_VA_BITS_52 +#if VA_BITS > 48 ldr_l x0, vabits_actual cmp x0, #52 b.ne 2f diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 339ee84e5a61..1faa6760895e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -265,7 +265,20 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); + s64 linear_region_size; + +#if VA_BITS > 48 + if (cpuid_feature_extract_unsigned_field( + read_sysreg_s(SYS_ID_AA64MMFR2_EL1), + ID_AA64MMFR2_LVA_SHIFT)) + vabits_actual = VA_BITS; + + /* make the variable visible to secondaries with the MMU off */ + dcache_clean_inval_poc((u64)&vabits_actual, + (u64)&vabits_actual + sizeof(vabits_actual)); +#endif + + linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* * Corner case: 52-bit VA capable systems running KVM in nVHE mode may diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fde2b326419a..88b4177254a0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,8 +46,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN); u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; -u64 __section(".mmuoff.data.write") vabits_actual; +#if VA_BITS > 48 +u64 vabits_actual __ro_after_init = VA_BITS_MIN; EXPORT_SYMBOL(vabits_actual); +#endif u64 kimage_vaddr __ro_after_init = (u64)&_text; EXPORT_SYMBOL(kimage_vaddr); From e8d13cced5c5038cc93de9561cf2cb4f22205061 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:33 +0200 Subject: [PATCH 03/34] arm64: head: move assignment of idmap_t0sz to C code Setting idmap_t0sz involves fiddling with the caches if done with the MMU off. Since we will be creating an initial ID map with the MMU and caches off, and the permanent ID map with the MMU and caches on, let's move this assignment of idmap_t0sz out of the startup code, and replace it with a macro that simply issues the three instructions needed to calculate the value wherever it is needed before the MMU is turned on. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-4-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 14 ++++++++++++++ arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/kernel/head.S | 13 +------------ arch/arm64/mm/mmu.c | 4 +++- arch/arm64/mm/proc.S | 2 +- 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 8c5a61aeaf8e..9468f45c07a6 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -359,6 +359,20 @@ alternative_cb_end bfi \valreg, \t1sz, #TCR_T1SZ_OFFSET, #TCR_TxSZ_WIDTH .endm +/* + * idmap_get_t0sz - get the T0SZ value needed to cover the ID map + * + * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the + * entire ID map region can be mapped. As T0SZ == (64 - #bits used), + * this number conveniently equals the number of leading zeroes in + * the physical address of _end. + */ + .macro idmap_get_t0sz, reg + adrp \reg, _end + orr \reg, \reg, #(1 << VA_BITS_MIN) - 1 + clz \reg, \reg + .endm + /* * tcr_compute_pa_size - set TCR.(I)PS to the highest supported * ID_AA64MMFR0_EL1.PARange value diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 6770667b34a3..6ac0086ebb1a 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -60,7 +60,7 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in * physical memory, in which case it will be smaller. */ -extern u64 idmap_t0sz; +extern int idmap_t0sz; extern u64 idmap_ptrs_per_pgd; /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index dc07858eb673..7f361bc72d12 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -299,22 +299,11 @@ SYM_FUNC_START_LOCAL(__create_page_tables) * physical address space. So for the ID map, use an extended virtual * range in that case, and configure an additional translation level * if needed. - * - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of __idmap_text_end. */ - adrp x5, __idmap_text_end - clz x5, x5 + idmap_get_t0sz x5 cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension - adr_l x6, idmap_t0sz - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - #if (VA_BITS < 48) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) #define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 88b4177254a0..9b4fc78f7a4d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -43,7 +43,7 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ -u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN); +int idmap_t0sz __ro_after_init; u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; #if VA_BITS > 48 @@ -771,6 +771,8 @@ void __init paging_init(void) { pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); + idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0)); + map_kernel(pgdp); map_mem(pgdp); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 50bbed947bec..e802badf9ac0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -469,7 +469,7 @@ SYM_FUNC_START(__cpu_setup) add x9, x9, #64 tcr_set_t1sz tcr, x9 #else - ldr_l x9, idmap_t0sz + idmap_get_t0sz x9 #endif tcr_set_t0sz tcr, x9 From ebd9aea1f27ef27ed8581c16a96352b3cce89f39 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:34 +0200 Subject: [PATCH 04/34] arm64: head: drop idmap_ptrs_per_pgd The assignment of idmap_ptrs_per_pgd lacks any cache invalidation, even though it is updated with the MMU and caches disabled. However, we never bother to read the value again except in the very next instruction, and so we can just drop the variable entirely. Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20220624150651.1358849-5-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 1 - arch/arm64/kernel/head.S | 7 +++---- arch/arm64/mm/mmu.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 6ac0086ebb1a..7b387c3b312a 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -61,7 +61,6 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) * physical memory, in which case it will be smaller. */ extern int idmap_t0sz; -extern u64 idmap_ptrs_per_pgd; /* * Ensure TCR.T0SZ is set to the provided value. diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7f361bc72d12..53126a35d73c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -300,6 +300,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) * range in that case, and configure an additional translation level * if needed. */ + mov x4, #PTRS_PER_PGD idmap_get_t0sz x5 cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension @@ -319,18 +320,16 @@ SYM_FUNC_START_LOCAL(__create_page_tables) #error "Mismatch between VA_BITS and page size/number of translation levels" #endif - mov x4, EXTRA_PTRS - create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 + mov x2, EXTRA_PTRS + create_table_entry x0, x3, EXTRA_SHIFT, x2, x5, x6 #else /* * If VA_BITS == 48, we don't have to configure an additional * translation level, but the top-level table has more entries. */ mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) - str_l x4, idmap_ptrs_per_pgd, x5 #endif 1: - ldr_l x4, idmap_ptrs_per_pgd adr_l x6, __idmap_text_end // __pa(__idmap_text_end) map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9b4fc78f7a4d..63732ca0ccf5 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -44,7 +44,6 @@ #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ int idmap_t0sz __ro_after_init; -u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; #if VA_BITS > 48 u64 vabits_actual __ro_after_init = VA_BITS_MIN; From 53519ddf5894476bf87b1c8694cee13524e211ac Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:35 +0200 Subject: [PATCH 05/34] arm64: head: simplify page table mapping macros (slightly) Simplify the macros in head.S that are used to set up the early page tables, by switching to immediates for the number of bits that are interpreted as the table index at each level. This makes it much easier to infer from the instruction stream what is going on, and reduces the number of instructions emitted substantially. Note that the extended ID map for cases where no additional level needs to be configured now uses a compile time size as well, which means that we interpret up to 10 bits as the table index at the root level (for 52-bit physical addressing), without taking into account whether or not this is supported on the current system. However, those bits can only be set if we are executing the image from an address that exceeds the 48-bit PA range, and are guaranteed to be cleared otherwise, and given that we are dealing with a mapping in the lower TTBR0 range of the address space, the result is therefore the same as if we'd mask off only 6 bits. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-6-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 55 ++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53126a35d73c..9fdde2f9cc0f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -179,31 +179,20 @@ SYM_CODE_END(preserve_boot_args) * vstart: virtual address of start of range * vend: virtual address of end of range - we map [vstart, vend] * shift: shift used to transform virtual address into index - * ptrs: number of entries in page table + * order: #imm 2log(number of entries in page table) * istart: index in table corresponding to vstart * iend: index in table corresponding to vend * count: On entry: how many extra entries were required in previous level, scales * our end index. * On exit: returns how many extra entries required for next page table level * - * Preserves: vstart, vend, shift, ptrs + * Preserves: vstart, vend * Returns: istart, iend, count */ - .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count - lsr \iend, \vend, \shift - mov \istart, \ptrs - sub \istart, \istart, #1 - and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) - mov \istart, \ptrs - mul \istart, \istart, \count - add \iend, \iend, \istart // iend += count * ptrs - // our entries span multiple tables - - lsr \istart, \vstart, \shift - mov \count, \ptrs - sub \count, \count, #1 - and \istart, \istart, \count - + .macro compute_indices, vstart, vend, shift, order, istart, iend, count + ubfx \istart, \vstart, \shift, \order + ubfx \iend, \vend, \shift, \order + add \iend, \iend, \count, lsl \order sub \count, \iend, \istart .endm @@ -218,38 +207,39 @@ SYM_CODE_END(preserve_boot_args) * vend: virtual address of end of range - we map [vstart, vend - 1] * flags: flags to use to map last level entries * phys: physical address corresponding to vstart - physical memory is contiguous - * pgds: the number of pgd entries + * order: #imm 2log(number of entries in PGD table) * * Temporaries: istart, iend, tmp, count, sv - these need to be different registers * Preserves: vstart, flags * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv sub \vend, \vend, #1 add \rtbl, \tbl, #PAGE_SIZE - mov \sv, \rtbl mov \count, #0 - compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count + + compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv - mov \sv, \rtbl #if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count + compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv - mov \sv, \rtbl #endif #if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv #endif - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count - bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 + populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm /* @@ -300,12 +290,12 @@ SYM_FUNC_START_LOCAL(__create_page_tables) * range in that case, and configure an additional translation level * if needed. */ - mov x4, #PTRS_PER_PGD idmap_get_t0sz x5 cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension #if (VA_BITS < 48) +#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) #define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) @@ -323,16 +313,16 @@ SYM_FUNC_START_LOCAL(__create_page_tables) mov x2, EXTRA_PTRS create_table_entry x0, x3, EXTRA_SHIFT, x2, x5, x6 #else +#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) /* * If VA_BITS == 48, we don't have to configure an additional * translation level, but the top-level table has more entries. */ - mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) #endif 1: adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 + map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14 /* * Map the kernel image (starting with PHYS_OFFSET). @@ -340,13 +330,12 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x0, init_pg_dir mov_q x5, KIMAGE_VADDR // compile time __va(_text) add x5, x5, x23 // add KASLR displacement - mov x4, PTRS_PER_PGD adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 + map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 /* * Since the page tables have been populated with non-cacheable From 50fcd39d24c24bb86d695c93ad41fccfed1fc7ac Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:36 +0200 Subject: [PATCH 06/34] arm64: head: switch to map_memory macro for the extended ID map In a future patch, we will start using an ID map that covers the entire image, rather than a single page. This means that we need to deal with the pathological case of an extended ID map where the kernel image does not fit neatly inside a single entry at the root level, which means we will need to create additional table entries and map additional pages for page tables. The existing map_memory macro already takes care of most of that, so let's just extend it to deal with this case as well. While at it, drop the conditional branch on the value of T0SZ: we don't set the variable anymore in the entry code, and so we can just let the map_memory macro deal with the case where the output address exceeds VA_BITS. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-7-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 76 +++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 9fdde2f9cc0f..eb54c0289c8a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -122,29 +122,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) -/* - * Macro to create a table entry to the next page. - * - * tbl: page table address - * virt: virtual address - * shift: #imm page table shift - * ptrs: #imm pointers per table page - * - * Preserves: virt - * Corrupts: ptrs, tmp1, tmp2 - * Returns: tbl -> next level table page address - */ - .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 - add \tmp1, \tbl, #PAGE_SIZE - phys_to_pte \tmp2, \tmp1 - orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type - lsr \tmp1, \virt, #\shift - sub \ptrs, \ptrs, #1 - and \tmp1, \tmp1, \ptrs // table index - str \tmp2, [\tbl, \tmp1, lsl #3] - add \tbl, \tbl, #PAGE_SIZE // next level table page - .endm - /* * Macro to populate page table entries, these entries can be pointers to the next level * or last level entries pointing to physical memory. @@ -209,15 +186,27 @@ SYM_CODE_END(preserve_boot_args) * phys: physical address corresponding to vstart - physical memory is contiguous * order: #imm 2log(number of entries in PGD table) * + * If extra_shift is set, an extra level will be populated if the end address does + * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. + * * Temporaries: istart, iend, tmp, count, sv - these need to be different registers * Preserves: vstart, flags * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift sub \vend, \vend, #1 add \rtbl, \tbl, #PAGE_SIZE mov \count, #0 + .ifnb \extra_shift + tst \vend, #~((1 << (\extra_shift)) - 1) + b.eq .L_\@ + compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + .endif +.L_\@: compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp @@ -284,20 +273,32 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x3, __idmap_text_start // __pa(__idmap_text_start) /* - * VA_BITS may be too small to allow for an ID mapping to be created - * that covers system RAM if that is located sufficiently high in the - * physical address space. So for the ID map, use an extended virtual - * range in that case, and configure an additional translation level - * if needed. + * The ID map carries a 1:1 mapping of the physical address range + * covered by the loaded image, which could be anywhere in DRAM. This + * means that the required size of the VA (== PA) space is decided at + * boot time, and could be more than the configured size of the VA + * space for ordinary kernel and user space mappings. + * + * There are three cases to consider here: + * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover + * the placement of the image. In this case, we configure one extra + * level of translation on the fly for the ID map only. (This case + * also covers 42-bit VA/52-bit PA on 64k pages). + * + * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can + * only happen when using 64k pages, in which case we need to extend + * the root level table rather than add a level. Note that we can + * treat this case as 'always extended' as long as we take care not + * to program an unsupported T0SZ value into the TCR register. + * + * - Combinations that would require two additional levels of + * translation are not supported, e.g., VA_BITS==36 on 16k pages, or + * VA_BITS==39/4k pages with 5-level paging, where the input address + * requires more than 47 or 48 bits, respectively. */ - idmap_get_t0sz x5 - cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? - b.ge 1f // .. then skip VA range extension - #if (VA_BITS < 48) #define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) /* * If VA_BITS < 48, we have to configure an additional table level. @@ -309,20 +310,17 @@ SYM_FUNC_START_LOCAL(__create_page_tables) #if VA_BITS != EXTRA_SHIFT #error "Mismatch between VA_BITS and page size/number of translation levels" #endif - - mov x2, EXTRA_PTRS - create_table_entry x0, x3, EXTRA_SHIFT, x2, x5, x6 #else #define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) +#define EXTRA_SHIFT /* * If VA_BITS == 48, we don't have to configure an additional * translation level, but the top-level table has more entries. */ #endif -1: adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14 + map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT /* * Map the kernel image (starting with PHYS_OFFSET). From e42ade29e3bcb32049da298d2927522c104abae9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:37 +0200 Subject: [PATCH 07/34] arm64: head: split off idmap creation code Split off the creation of the ID map page tables, so that we can avoid running it again unnecessarily when KASLR is in effect (which only randomizes the virtual placement). This will permit us to drop some explicit cache maintenance to the PoC which was necessary because the cache invalidation being performed on some global variables might otherwise clobber unrelated variables that happen to share a cacheline. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-8-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 119 ++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 58 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index eb54c0289c8a..1cbc52097bf9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -84,7 +84,7 @@ * Register Scope Purpose * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register + * x28 clear_page_tables() callee preserved temp register * x19/x20 __primary_switch() callee preserved temp registers * x24 __primary_switch() .. relocate_kernel() current RELR displacement */ @@ -94,7 +94,10 @@ SYM_CODE_START(primary_entry) adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag - bl __create_page_tables + bl clear_page_tables + bl create_idmap + bl create_kernel_mapping + /* * The following calls CPU setup code, see arch/arm64/mm/proc.S for * details. @@ -122,6 +125,35 @@ SYM_CODE_START_LOCAL(preserve_boot_args) b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) +SYM_FUNC_START_LOCAL(clear_page_tables) + mov x28, lr + + /* + * Invalidate the init page tables to avoid potential dirty cache lines + * being evicted. Other page tables are allocated in rodata as part of + * the kernel image, and thus are clean to the PoC per the boot + * protocol. + */ + adrp x0, init_pg_dir + adrp x1, init_pg_end + bl dcache_inval_poc + + /* + * Clear the init page tables. + */ + adrp x0, init_pg_dir + adrp x1, init_pg_end + sub x1, x1, x0 +1: stp xzr, xzr, [x0], #16 + stp xzr, xzr, [x0], #16 + stp xzr, xzr, [x0], #16 + stp xzr, xzr, [x0], #16 + subs x1, x1, #64 + b.ne 1b + + ret x28 +SYM_FUNC_END(clear_page_tables) + /* * Macro to populate page table entries, these entries can be pointers to the next level * or last level entries pointing to physical memory. @@ -231,44 +263,8 @@ SYM_CODE_END(preserve_boot_args) populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm -/* - * Setup the initial page tables. We only setup the barest amount which is - * required to get the kernel running. The following sections are required: - * - identity mapping to enable the MMU (low address, TTBR0) - * - first few MB of the kernel linear mapping to jump to once the MMU has - * been enabled - */ -SYM_FUNC_START_LOCAL(__create_page_tables) - mov x28, lr - /* - * Invalidate the init page tables to avoid potential dirty cache lines - * being evicted. Other page tables are allocated in rodata as part of - * the kernel image, and thus are clean to the PoC per the boot - * protocol. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - bl dcache_inval_poc - - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 -1: stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - subs x1, x1, #64 - b.ne 1b - - mov x7, SWAPPER_MM_MMUFLAGS - - /* - * Create the identity mapping. - */ +SYM_FUNC_START_LOCAL(create_idmap) adrp x0, idmap_pg_dir adrp x3, __idmap_text_start // __pa(__idmap_text_start) @@ -319,22 +315,10 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ #endif adr_l x6, __idmap_text_end // __pa(__idmap_text_end) + mov x7, SWAPPER_MM_MMUFLAGS map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - /* - * Map the kernel image (starting with PHYS_OFFSET). - */ - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR // compile time __va(_text) - add x5, x5, x23 // add KASLR displacement - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) - - map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - /* * Since the page tables have been populated with non-cacheable * accesses (MMU disabled), invalidate those tables again to @@ -344,14 +328,32 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x0, idmap_pg_dir adrp x1, idmap_pg_end - bl dcache_inval_poc + b dcache_inval_poc // tail call +SYM_FUNC_END(create_idmap) + +SYM_FUNC_START_LOCAL(create_kernel_mapping) + adrp x0, init_pg_dir + mov_q x5, KIMAGE_VADDR // compile time __va(_text) + add x5, x5, x23 // add KASLR displacement + adrp x6, _end // runtime __pa(_end) + adrp x3, _text // runtime __pa(_text) + sub x6, x6, x3 // _end - _text + add x6, x6, x5 // runtime __va(_end) + mov x7, SWAPPER_MM_MMUFLAGS + + map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 + + /* + * Since the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + dmb sy adrp x0, init_pg_dir adrp x1, init_pg_end - bl dcache_inval_poc - - ret x28 -SYM_FUNC_END(__create_page_tables) + b dcache_inval_poc // tail call +SYM_FUNC_END(create_kernel_mapping) /* * Initialize CPU registers with task-specific and cpu-specific context. @@ -836,7 +838,8 @@ SYM_FUNC_START_LOCAL(__primary_switch) pre_disable_mmu_workaround msr sctlr_el1, x20 // disable the MMU isb - bl __create_page_tables // recreate kernel mapping + bl clear_page_tables + bl create_kernel_mapping // Recreate kernel mapping tlbi vmalle1 // Remove any stale TLB entries dsb nsh From 2e945851e26836c0f2d34be3763ddf55870e49fe Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:38 +0200 Subject: [PATCH 08/34] arm64: kernel: drop unnecessary PoC cache clean+invalidate Some early boot code runs before the virtual placement of the kernel is finalized, and we used to go back to the very start and recreate the ID map along with the page tables describing the virtual kernel mapping, and this involved setting some global variables with the caches off. In order to ensure that global state created by the KASLR code is not corrupted by the cache invalidation that occurs in that case, we needed to clean those global variables to the PoC explicitly. This is no longer needed now that the ID map is created only once (and the associated global variable updates are no longer repeated). So drop the cache maintenance that is no longer necessary. Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20220624150651.1358849-9-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/kaslr.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 418b2bba1521..d5542666182f 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -72,9 +71,6 @@ u64 __init kaslr_early_init(void) * we end up running with module randomization disabled. */ module_alloc_base = (u64)_etext - MODULES_VSIZE; - dcache_clean_inval_poc((unsigned long)&module_alloc_base, - (unsigned long)&module_alloc_base + - sizeof(module_alloc_base)); /* * Try to map the FDT early. If this fails, we simply bail, @@ -174,13 +170,6 @@ u64 __init kaslr_early_init(void) module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; module_alloc_base &= PAGE_MASK; - dcache_clean_inval_poc((unsigned long)&module_alloc_base, - (unsigned long)&module_alloc_base + - sizeof(module_alloc_base)); - dcache_clean_inval_poc((unsigned long)&memstart_offset_seed, - (unsigned long)&memstart_offset_seed + - sizeof(memstart_offset_seed)); - return offset; } From 723d3a8ed1726081ca40f602073c53de28eebb93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:39 +0200 Subject: [PATCH 09/34] arm64: head: pass ID map root table address to __enable_mmu() We will be adding an initial ID map that covers the entire kernel image, so we will pass the actual ID map root table to use to __enable_mmu(), rather than hard code it. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-10-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 14 ++++++++------ arch/arm64/kernel/sleep.S | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 1cbc52097bf9..70c462bbd6bf 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -595,6 +595,7 @@ SYM_FUNC_START_LOCAL(secondary_startup) bl __cpu_secondary_check52bitva bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =__secondary_switched br x8 @@ -648,6 +649,7 @@ SYM_FUNC_END(__secondary_too_slow) * * x0 = SCTLR_EL1 value for turning on the MMU. * x1 = TTBR1_EL1 value + * x2 = ID map root table address * * Returns to the caller via x30/lr. This requires the caller to be covered * by the .idmap.text section. @@ -656,14 +658,13 @@ SYM_FUNC_END(__secondary_too_slow) * If it isn't, park the CPU */ SYM_FUNC_START(__enable_mmu) - mrs x2, ID_AA64MMFR0_EL1 - ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN + mrs x3, ID_AA64MMFR0_EL1 + ubfx x3, x3, #ID_AA64MMFR0_TGRAN_SHIFT, 4 + cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN b.lt __no_granule_support - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX + cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX b.gt __no_granule_support - update_early_cpu_boot_status 0, x2, x3 - adrp x2, idmap_pg_dir + update_early_cpu_boot_status 0, x3, x4 phys_to_ttbr x1, x1 phys_to_ttbr x2, x2 msr ttbr0_el1, x2 // load TTBR0 @@ -819,6 +820,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) #endif adrp x1, init_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu #ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELR diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 4ea9392f86e0..e36b09d942f7 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -104,6 +104,7 @@ SYM_CODE_START(cpu_resume) bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 From 1682c45b920643cbde31d8a5b7ca7c2be92d6928 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:40 +0200 Subject: [PATCH 10/34] arm64: mm: provide idmap pointer to cpu_replace_ttbr1() In preparation for changing the way we initialize the permanent ID map, update cpu_replace_ttbr1() so we can use it with the initial ID map as well. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-11-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 13 +++++++++---- arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/suspend.c | 2 +- arch/arm64/mm/kasan_init.c | 4 ++-- arch/arm64/mm/mmu.c | 2 +- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 7b387c3b312a..c7ccd82db1d2 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -105,13 +105,18 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } -static inline void cpu_install_idmap(void) +static inline void __cpu_install_idmap(pgd_t *idmap) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_idmap_tcr_t0sz(); - cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); + cpu_switch_mm(lm_alias(idmap), &init_mm); +} + +static inline void cpu_install_idmap(void) +{ + __cpu_install_idmap(idmap_pg_dir); } /* @@ -142,7 +147,7 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz) * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, * avoiding the possibility of conflicting TLB entries being allocated. */ -static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp) +static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap) { typedef void (ttbr_replace_func)(phys_addr_t); extern ttbr_replace_func idmap_cpu_replace_ttbr1; @@ -165,7 +170,7 @@ static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp) replace_phys = (void *)__pa_symbol(function_nocfi(idmap_cpu_replace_ttbr1)); - cpu_install_idmap(); + __cpu_install_idmap(idmap); replace_phys(ttbr1); cpu_uninstall_idmap(); } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d88433de81d..a97913d19709 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3218,7 +3218,7 @@ subsys_initcall_sync(init_32bit_el0_mask); static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); } /* diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 2b0887e58a7c..9135fe0f3df5 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -52,7 +52,7 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); /* * PSTATE was not saved over suspend/resume, re-enable any detected diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index c12cd700598f..e969e68de005 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -236,7 +236,7 @@ static void __init kasan_init_shadow(void) */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); dsb(ishst); - cpu_replace_ttbr1(lm_alias(tmp_pg_dir)); + cpu_replace_ttbr1(lm_alias(tmp_pg_dir), idmap_pg_dir); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -280,7 +280,7 @@ static void __init kasan_init_shadow(void) PAGE_KERNEL_RO)); memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); } static void __init kasan_init_depth(void) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 63732ca0ccf5..8cbc87dcaad3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -777,7 +777,7 @@ void __init paging_init(void) pgd_clear_fixmap(); - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); init_mm.pgd = swapper_pg_dir; memblock_phys_free(__pa_symbol(init_pg_dir), From b013c1e1c659b0742f81cc4a95fe61faf6929ae5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:41 +0200 Subject: [PATCH 11/34] arm64: head: add helper function to remap regions in early page tables The asm macros used to create the initial ID map and kernel mappings don't support randomly remapping parts of the address space after it has been populated. What we can do, however, given that all block or page mappings are created at the final level, is take a subset of the mapped range and update its attributes or output address. This will permit us to make parts of these page tables read-only, or remap a part of it to cover the device tree. So add a helper that encapsulates this. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-12-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 70c462bbd6bf..7397555f8437 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -263,6 +263,39 @@ SYM_FUNC_END(clear_page_tables) populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm +/* + * Remap a subregion created with the map_memory macro with modified attributes + * or output address. The entire remapped region must have been covered in the + * invocation of map_memory. + * + * x0: last level table address (returned in first argument to map_memory) + * x1: start VA of the existing mapping + * x2: start VA of the region to update + * x3: end VA of the region to update (exclusive) + * x4: start PA associated with the region to update + * x5: attributes to set on the updated region + * x6: order of the last level mappings + */ +SYM_FUNC_START_LOCAL(remap_region) + sub x3, x3, #1 // make end inclusive + + // Get the index offset for the start of the last level table + lsr x1, x1, x6 + bfi x1, xzr, #0, #PAGE_SHIFT - 3 + + // Derive the start and end indexes into the last level table + // associated with the provided region + lsr x2, x2, x6 + lsr x3, x3, x6 + sub x2, x2, x1 + sub x3, x3, x1 + + mov x1, #1 + lsl x6, x1, x6 // block size at this level + + populate_entries x0, x4, x2, x3, x5, x6, x7 + ret +SYM_FUNC_END(remap_region) SYM_FUNC_START_LOCAL(create_idmap) adrp x0, idmap_pg_dir From c3cee924bd855184d15bc4aa6088dcf8e2c1394c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:42 +0200 Subject: [PATCH 12/34] arm64: head: cover entire kernel image in initial ID map As a first step towards avoiding the need to create, tear down and recreate the kernel virtual mapping with MMU and caches disabled, start by expanding the ID map so it covers the page tables as well as all executable code. This will allow us to populate the page tables with the MMU and caches on, and call KASLR init code before setting up the virtual mapping. Since this ID map is only needed at boot, create it as a temporary set of page tables, and populate the permanent ID map after enabling the MMU and caches. While at it, switch to read-only attributes for the where possible, as writable permissions are only needed for the initial kernel page tables. Note that on 4k granule configurations, the permanent ID map will now be reduced to a single page rather than a 2M block mapping. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-13-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 16 +++++++---- arch/arm64/kernel/head.S | 31 +++++++++++++++------- arch/arm64/kernel/vmlinux.lds.S | 7 +++-- arch/arm64/mm/mmu.c | 35 ++++++++++++++++++++++++- arch/arm64/mm/proc.S | 8 +++--- 5 files changed, 76 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 96dc0f7da258..5395e5a04f35 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -35,10 +35,8 @@ */ #if ARM64_KERNEL_USES_PMD_MAPS #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1) -#define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1) #else #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS) -#define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT)) #endif @@ -87,7 +85,13 @@ + EARLY_PUDS((vstart), (vend)) /* each PUD needs a next level page table */ \ + EARLY_PMDS((vstart), (vend))) /* each PMD needs a next level page table */ #define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end)) -#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) + +/* the initial ID map may need two extra pages if it needs to be extended */ +#if VA_BITS < 48 +#define INIT_IDMAP_DIR_SIZE (INIT_DIR_SIZE + (2 * PAGE_SIZE)) +#else +#define INIT_IDMAP_DIR_SIZE INIT_DIR_SIZE +#endif /* Initial memory map size */ #if ARM64_KERNEL_USES_PMD_MAPS @@ -107,9 +111,11 @@ #define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) #if ARM64_KERNEL_USES_PMD_MAPS -#define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) +#define SWAPPER_RW_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) +#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PMD_SECT_RDONLY) #else -#define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) +#define SWAPPER_RW_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) +#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PTE_RDONLY) #endif /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7397555f8437..93734c91a29a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -87,6 +87,7 @@ * x28 clear_page_tables() callee preserved temp register * x19/x20 __primary_switch() callee preserved temp registers * x24 __primary_switch() .. relocate_kernel() current RELR displacement + * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) bl preserve_boot_args @@ -298,9 +299,7 @@ SYM_FUNC_START_LOCAL(remap_region) SYM_FUNC_END(remap_region) SYM_FUNC_START_LOCAL(create_idmap) - adrp x0, idmap_pg_dir - adrp x3, __idmap_text_start // __pa(__idmap_text_start) - + mov x28, lr /* * The ID map carries a 1:1 mapping of the physical address range * covered by the loaded image, which could be anywhere in DRAM. This @@ -347,11 +346,22 @@ SYM_FUNC_START_LOCAL(create_idmap) * translation level, but the top-level table has more entries. */ #endif - adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - mov x7, SWAPPER_MM_MMUFLAGS + adrp x0, init_idmap_pg_dir + adrp x3, _text + adrp x6, _end + mov x7, SWAPPER_RX_MMUFLAGS map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT + /* Remap the kernel page tables r/w in the ID map */ + adrp x1, _text + adrp x2, init_pg_dir + adrp x3, init_pg_end + bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 + mov x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region + /* * Since the page tables have been populated with non-cacheable * accesses (MMU disabled), invalidate those tables again to @@ -359,9 +369,10 @@ SYM_FUNC_START_LOCAL(create_idmap) */ dmb sy - adrp x0, idmap_pg_dir - adrp x1, idmap_pg_end - b dcache_inval_poc // tail call + adrp x0, init_idmap_pg_dir + adrp x1, init_idmap_pg_end + bl dcache_inval_poc + ret x28 SYM_FUNC_END(create_idmap) SYM_FUNC_START_LOCAL(create_kernel_mapping) @@ -372,7 +383,7 @@ SYM_FUNC_START_LOCAL(create_kernel_mapping) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - mov x7, SWAPPER_MM_MMUFLAGS + mov x7, SWAPPER_RW_MMUFLAGS map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 @@ -853,7 +864,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) #endif adrp x1, init_pg_dir - adrp x2, idmap_pg_dir + adrp x2, init_idmap_pg_dir bl __enable_mmu #ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELR diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 2d4a8f995175..533673793bc0 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -198,8 +198,7 @@ SECTIONS } idmap_pg_dir = .; - . += IDMAP_DIR_SIZE; - idmap_pg_end = .; + . += PAGE_SIZE; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; @@ -235,6 +234,10 @@ SECTIONS __inittext_end = .; __initdata_begin = .; + init_idmap_pg_dir = .; + . += INIT_IDMAP_DIR_SIZE; + init_idmap_pg_end = .; + .init.data : { INIT_DATA INIT_SETUP(16) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8cbc87dcaad3..fd558cfcf3ad 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -766,9 +766,40 @@ static void __init map_kernel(pgd_t *pgdp) kasan_copy_shadow(pgdp); } +static void __init create_idmap(void) +{ + u64 start = __pa_symbol(__idmap_text_start); + u64 size = __pa_symbol(__idmap_text_end) - start; + pgd_t *pgd = idmap_pg_dir; + u64 pgd_phys; + + /* check if we need an additional level of translation */ + if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) { + pgd_phys = early_pgtable_alloc(PAGE_SHIFT); + set_pgd(&idmap_pg_dir[start >> VA_BITS], + __pgd(pgd_phys | P4D_TYPE_TABLE)); + pgd = __va(pgd_phys); + } + __create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX, + early_pgtable_alloc, 0); + + if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + extern u32 __idmap_kpti_flag; + u64 pa = __pa_symbol(&__idmap_kpti_flag); + + /* + * The KPTI G-to-nG conversion code needs a read-write mapping + * of its synchronization flag in the ID map. + */ + __create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL, + early_pgtable_alloc, 0); + } +} + void __init paging_init(void) { pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); + extern pgd_t init_idmap_pg_dir[]; idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0)); @@ -777,13 +808,15 @@ void __init paging_init(void) pgd_clear_fixmap(); - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir); init_mm.pgd = swapper_pg_dir; memblock_phys_free(__pa_symbol(init_pg_dir), __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); memblock_allow_resize(); + + create_idmap(); } /* diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index e802badf9ac0..605c6640f94b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -222,8 +222,10 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. */ -__idmap_kpti_flag: - .long 1 + .pushsection ".data", "aw", %progbits +SYM_DATA(__idmap_kpti_flag, .long 1) + .popsection + SYM_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 num_cpus .req w1 @@ -245,7 +247,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings) mrs swapper_ttb, ttbr1_el1 restore_ttbr1 swapper_ttb - adr flag_ptr, __idmap_kpti_flag + adr_l flag_ptr, __idmap_kpti_flag cbnz cpu, __idmap_kpti_secondary From d7bea550279db28cd154fd54843ebc858ffdf0b7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:43 +0200 Subject: [PATCH 13/34] arm64: head: use relative references to the RELA and RELR tables Formerly, we had to access the RELA and RELR tables via the kernel mapping that was being relocated, and so deriving the start and end addresses using ADRP/ADD references was not possible, as the relocation code runs from the ID map. Now that we map the entire kernel image via the ID map, we can simplify this, and just load the entries via the ID map as well. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-14-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 13 ++++--------- arch/arm64/kernel/vmlinux.lds.S | 12 ++++-------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 93734c91a29a..f1497f7b4da0 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -757,13 +757,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) * Iterate over each entry in the relocation table, and apply the * relocations in place. */ - ldr w9, =__rela_offset // offset to reloc table - ldr w10, =__rela_size // size of reloc table - + adr_l x9, __rela_start + adr_l x10, __rela_end mov_q x11, KIMAGE_VADDR // default virtual offset add x11, x11, x23 // actual virtual offset - add x9, x9, x11 // __va(.rela) - add x10, x9, x10 // __va(.rela) + sizeof(.rela) 0: cmp x9, x10 b.hs 1f @@ -813,10 +810,8 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) * __relocate_kernel is called twice with non-zero displacements (i.e. * if there is both a physical misalignment and a KASLR displacement). */ - ldr w9, =__relr_offset // offset to reloc table - ldr w10, =__relr_size // size of reloc table - add x9, x9, x11 // __va(.relr) - add x10, x9, x10 // __va(.relr) + sizeof(.relr) + adr_l x9, __relr_start + adr_l x10, __relr_end sub x15, x23, x24 // delta from previous offset cbz x15, 7f // nothing to do if unchanged diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 533673793bc0..20ebdd48db14 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -256,21 +256,17 @@ SECTIONS HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { + __rela_start = .; *(.rela .rela*) + __rela_end = .; } - __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela.dyn); - -#ifdef CONFIG_RELR .relr.dyn : ALIGN(8) { + __relr_start = .; *(.relr.dyn) + __relr_end = .; } - __relr_offset = ABSOLUTE(ADDR(.relr.dyn) - KIMAGE_VADDR); - __relr_size = SIZEOF(.relr.dyn); -#endif - . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; __init_end = .; From f70b3a23324a2d31efb83cc01302acb343e4ec1b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:44 +0200 Subject: [PATCH 14/34] arm64: head: create a temporary FDT mapping in the initial ID map We need to access the DT very early to get at the command line and the KASLR seed, which currently means we rely on some hacks to call into the kernel before really calling into the kernel, which is undesirable. So instead, let's create a mapping for the FDT in the initial ID map, which is feasible now that it has been extended to cover more than a single page or block, and can be updated in place to remap other output addresses. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-15-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 6 ++++-- arch/arm64/kernel/head.S | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5395e5a04f35..02e59fa8f293 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -8,6 +8,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include #include @@ -88,10 +89,11 @@ /* the initial ID map may need two extra pages if it needs to be extended */ #if VA_BITS < 48 -#define INIT_IDMAP_DIR_SIZE (INIT_DIR_SIZE + (2 * PAGE_SIZE)) +#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + 2) * PAGE_SIZE) #else -#define INIT_IDMAP_DIR_SIZE INIT_DIR_SIZE +#define INIT_IDMAP_DIR_SIZE (INIT_IDMAP_DIR_PAGES * PAGE_SIZE) #endif +#define INIT_IDMAP_DIR_PAGES EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE) /* Initial memory map size */ #if ARM64_KERNEL_USES_PMD_MAPS diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f1497f7b4da0..8283ff848328 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -83,6 +83,7 @@ * * Register Scope Purpose * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 + * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset * x28 clear_page_tables() callee preserved temp register * x19/x20 __primary_switch() callee preserved temp registers @@ -348,7 +349,7 @@ SYM_FUNC_START_LOCAL(create_idmap) #endif adrp x0, init_idmap_pg_dir adrp x3, _text - adrp x6, _end + adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE mov x7, SWAPPER_RX_MMUFLAGS map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT @@ -362,6 +363,17 @@ SYM_FUNC_START_LOCAL(create_idmap) mov x6, #SWAPPER_BLOCK_SHIFT bl remap_region + /* Remap the FDT after the kernel image */ + adrp x1, _text + adrp x22, _end + SWAPPER_BLOCK_SIZE + bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 + bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address + add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE + bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 + mov x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region + /* * Since the page tables have been populated with non-cacheable * accesses (MMU disabled), invalidate those tables again to From a004393f45d9a55e55d76f252914bdddffdde204 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:45 +0200 Subject: [PATCH 15/34] arm64: idreg-override: use early FDT mapping in ID map Instead of calling into the kernel to map the FDT into the kernel page tables before even calling start_kernel(), let's switch to the initial, temporary mapping of the device tree that has been added to the ID map. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-16-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 1 + arch/arm64/kernel/idreg-override.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 8283ff848328..64ebff634b83 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -472,6 +472,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) #endif mov x0, x21 // pass FDT address in x0 bl early_fdt_map // Try mapping the FDT early + mov x0, x22 // pass FDT address in x0 bl init_feature_override // Parse cpu feature overrides #ifdef CONFIG_RANDOMIZE_BASE tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 8a2ceb591686..f92836e196e5 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -201,16 +201,11 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(void) +static __init const u8 *get_bootargs_cmdline(const void *fdt) { const u8 *prop; - void *fdt; int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - return NULL; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; @@ -222,9 +217,9 @@ static __init const u8 *get_bootargs_cmdline(void) return strlen(prop) ? prop : NULL; } -static __init void parse_cmdline(void) +static __init void parse_cmdline(const void *fdt) { - const u8 *prop = get_bootargs_cmdline(); + const u8 *prop = get_bootargs_cmdline(fdt); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) __parse_cmdline(CONFIG_CMDLINE, true); @@ -234,9 +229,9 @@ static __init void parse_cmdline(void) } /* Keep checkers quiet */ -void init_feature_override(void); +void init_feature_override(const void *fdt); -asmlinkage void __init init_feature_override(void) +asmlinkage void __init init_feature_override(const void *fdt) { int i; @@ -247,7 +242,7 @@ asmlinkage void __init init_feature_override(void) } } - parse_cmdline(); + parse_cmdline(fdt); for (i = 0; i < ARRAY_SIZE(regs); i++) { if (regs[i]->override) From c0be8f18a3bfcfd369eba21337e6c89a4bb8b0e8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:46 +0200 Subject: [PATCH 16/34] arm64: head: factor out TTBR1 assignment into a macro Create a macro load_ttbr1 to avoid having to repeat the same instruction sequence 3 times in a subsequent patch. No functional change intended. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-17-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++---- arch/arm64/kernel/head.S | 5 +---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9468f45c07a6..b2584709c332 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -479,6 +479,18 @@ alternative_endif _cond_extable .Licache_op\@, \fixup .endm +/* + * load_ttbr1 - install @pgtbl as a TTBR1 page table + * pgtbl preserved + * tmp1/tmp2 clobbered, either may overlap with pgtbl + */ + .macro load_ttbr1, pgtbl, tmp1, tmp2 + phys_to_ttbr \tmp1, \pgtbl + offset_ttbr1 \tmp1, \tmp2 + msr ttbr1_el1, \tmp1 + isb + .endm + /* * To prevent the possibility of old and new partial table walks being visible * in the tlb, switch the ttbr to a zero page when we invalidate the old @@ -492,10 +504,7 @@ alternative_endif isb tlbi vmalle1 dsb nsh - phys_to_ttbr \tmp, \page_table - offset_ttbr1 \tmp, \tmp2 - msr ttbr1_el1, \tmp - isb + load_ttbr1 \page_table, \tmp, \tmp2 .endm /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 64ebff634b83..d704d0bd8ffc 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -722,12 +722,9 @@ SYM_FUNC_START(__enable_mmu) cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX b.gt __no_granule_support update_early_cpu_boot_status 0, x3, x4 - phys_to_ttbr x1, x1 phys_to_ttbr x2, x2 msr ttbr0_el1, x2 // load TTBR0 - offset_ttbr1 x1, x3 - msr ttbr1_el1, x1 // load TTBR1 - isb + load_ttbr1 x1, x1, x3 set_sctlr_el1 x0 From 6495b9ba62711b581680fbdd90d0bfc48cf5c91b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:47 +0200 Subject: [PATCH 17/34] arm64: head: populate kernel page tables with MMU and caches on Now that we can access the entire kernel image via the ID map, we can execute the page table population code with the MMU and caches enabled. The only thing we need to ensure is that translations via TTBR1 remain disabled while we are updating the page tables the second time around, in case KASLR wants them to be randomized. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-18-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 62 +++++++++++----------------------------- 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index d704d0bd8ffc..583cbea865e1 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -85,8 +85,6 @@ * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset - * x28 clear_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers * x24 __primary_switch() .. relocate_kernel() current RELR displacement * x28 create_idmap() callee preserved temp register */ @@ -96,9 +94,7 @@ SYM_CODE_START(primary_entry) adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag - bl clear_page_tables bl create_idmap - bl create_kernel_mapping /* * The following calls CPU setup code, see arch/arm64/mm/proc.S for @@ -128,32 +124,14 @@ SYM_CODE_START_LOCAL(preserve_boot_args) SYM_CODE_END(preserve_boot_args) SYM_FUNC_START_LOCAL(clear_page_tables) - mov x28, lr - - /* - * Invalidate the init page tables to avoid potential dirty cache lines - * being evicted. Other page tables are allocated in rodata as part of - * the kernel image, and thus are clean to the PoC per the boot - * protocol. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - bl dcache_inval_poc - /* * Clear the init page tables. */ adrp x0, init_pg_dir adrp x1, init_pg_end - sub x1, x1, x0 -1: stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - subs x1, x1, #64 - b.ne 1b - - ret x28 + sub x2, x1, x0 + mov x1, xzr + b __pi_memset // tail call SYM_FUNC_END(clear_page_tables) /* @@ -399,16 +377,8 @@ SYM_FUNC_START_LOCAL(create_kernel_mapping) map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate those tables again to - * remove any speculatively loaded cache lines. - */ - dmb sy - - adrp x0, init_pg_dir - adrp x1, init_pg_end - b dcache_inval_poc // tail call + dsb ishst // sync with page table walker + ret SYM_FUNC_END(create_kernel_mapping) /* @@ -863,14 +833,15 @@ SYM_FUNC_END(__relocate_kernel) #endif SYM_FUNC_START_LOCAL(__primary_switch) -#ifdef CONFIG_RANDOMIZE_BASE - mov x19, x0 // preserve new SCTLR_EL1 value - mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value -#endif - - adrp x1, init_pg_dir + adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu + + bl clear_page_tables + bl create_kernel_mapping + + adrp x1, init_pg_dir + load_ttbr1 x1, x1, x2 #ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELR mov x24, #0 // no RELR displacement yet @@ -886,9 +857,8 @@ SYM_FUNC_START_LOCAL(__primary_switch) * to take into account by discarding the current kernel mapping and * creating a new one. */ - pre_disable_mmu_workaround - msr sctlr_el1, x20 // disable the MMU - isb + adrp x1, reserved_pg_dir // Disable translations via TTBR1 + load_ttbr1 x1, x1, x2 bl clear_page_tables bl create_kernel_mapping // Recreate kernel mapping @@ -896,8 +866,8 @@ SYM_FUNC_START_LOCAL(__primary_switch) dsb nsh isb - set_sctlr_el1 x19 // re-enable the MMU - + adrp x1, init_pg_dir // Re-enable translations via TTBR1 + load_ttbr1 x1, x1, x2 bl __relocate_kernel #endif #endif From 005e12676af09a308f18cb94aa593bb30dee031e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:48 +0200 Subject: [PATCH 18/34] arm64: head: record CPU boot mode after enabling the MMU In order to avoid having to touch memory with the MMU and caches disabled, and therefore having to invalidate it from the caches explicitly, just defer storing the value until after the MMU has been turned on, unless we are giving up with an error. While at it, move the associated variable definitions into C code. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-19-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 50 ++++++++++-------------------------- arch/arm64/kernel/hyp-stub.S | 4 +-- arch/arm64/mm/mmu.c | 8 ++++++ 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 583cbea865e1..8de346dd4470 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -82,6 +82,7 @@ * primary lowlevel boot path: * * Register Scope Purpose + * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset @@ -91,9 +92,9 @@ SYM_CODE_START(primary_entry) bl preserve_boot_args bl init_kernel_el // w0=cpu_boot_mode + mov x20, x0 adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 - bl set_cpu_boot_mode_flag bl create_idmap /* @@ -429,6 +430,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings + mov x0, x20 + bl set_cpu_boot_mode_flag + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -454,6 +458,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) ret // to __primary_switch() 0: #endif + mov x0, x20 bl switch_to_vhe // Prefer VHE if possible ldp x29, x30, [sp], #16 bl start_kernel @@ -553,52 +558,21 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) b.ne 1f add x1, x1, #4 1: str w0, [x1] // Save CPU boot mode - dmb sy - dc ivac, x1 // Invalidate potentially stale cache line ret SYM_FUNC_END(set_cpu_boot_mode_flag) -/* - * These values are written with the MMU off, but read with the MMU on. - * Writers will invalidate the corresponding address, discarding up to a - * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures - * sufficient alignment that the CWG doesn't overlap another section. - */ - .pushsection ".mmuoff.data.write", "aw" -/* - * We need to find out the CPU boot mode long after boot, so we need to - * store it in a writable variable. - * - * This is not in .bss, because we set it sufficiently early that the boot-time - * zeroing of .bss would clobber it. - */ -SYM_DATA_START(__boot_cpu_mode) - .long BOOT_CPU_MODE_EL2 - .long BOOT_CPU_MODE_EL1 -SYM_DATA_END(__boot_cpu_mode) -/* - * The booting CPU updates the failed status @__early_cpu_boot_status, - * with MMU turned off. - */ -SYM_DATA_START(__early_cpu_boot_status) - .quad 0 -SYM_DATA_END(__early_cpu_boot_status) - - .popsection - /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ SYM_FUNC_START(secondary_holding_pen) bl init_kernel_el // w0=cpu_boot_mode - bl set_cpu_boot_mode_flag - mrs x0, mpidr_el1 + mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK - and x0, x0, x1 + and x2, x2, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] - cmp x4, x0 + cmp x4, x2 b.eq secondary_startup wfe b pen @@ -610,7 +584,6 @@ SYM_FUNC_END(secondary_holding_pen) */ SYM_FUNC_START(secondary_entry) bl init_kernel_el // w0=cpu_boot_mode - bl set_cpu_boot_mode_flag b secondary_startup SYM_FUNC_END(secondary_entry) @@ -618,6 +591,7 @@ SYM_FUNC_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ + mov x20, x0 // preserve boot mode bl switch_to_vhe bl __cpu_secondary_check52bitva bl __cpu_setup // initialise processor @@ -629,6 +603,9 @@ SYM_FUNC_START_LOCAL(secondary_startup) SYM_FUNC_END(secondary_startup) SYM_FUNC_START_LOCAL(__secondary_switched) + mov x0, x20 + bl set_cpu_boot_mode_flag + str_l xzr, __early_cpu_boot_status, x3 adr_l x5, vectors msr vbar_el1, x5 isb @@ -691,7 +668,6 @@ SYM_FUNC_START(__enable_mmu) b.lt __no_granule_support cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX b.gt __no_granule_support - update_early_cpu_boot_status 0, x3, x4 phys_to_ttbr x2, x2 msr ttbr0_el1, x2 // load TTBR0 load_ttbr1 x1, x1, x3 diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 43d212618834..5bafb53fafb4 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -223,11 +223,11 @@ SYM_FUNC_END(__hyp_reset_vectors) /* * Entry point to switch to VHE if deemed capable + * + * w0: boot mode, as returned by init_kernel_el() */ SYM_FUNC_START(switch_to_vhe) // Need to have booted at EL2 - adr_l x1, __boot_cpu_mode - ldr w0, [x1] cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fd558cfcf3ad..05d77f2342a9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -56,6 +56,14 @@ EXPORT_SYMBOL(kimage_vaddr); u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); +u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 }; + +/* + * The booting CPU updates the failed status @__early_cpu_boot_status, + * with MMU turned off. + */ +long __section(".mmuoff.data.write") __early_cpu_boot_status; + /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. From fc5a89f75d2aad3e566e030675ac420aee49729c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:49 +0200 Subject: [PATCH 19/34] arm64: kaslr: defer initialization to initcall where permitted The early KASLR init code runs extremely early, and anything that could be deferred until later should be. So let's defer the randomization of the module region until much later - this also simplifies the arithmetic, given that we no longer have to reason about the link time vs load time placement of the core kernel explicitly. Also get rid of the global status variable, and infer the status reported by the diagnostic print from other KASLR related context. While at it, get rid of the special case for KASAN without KASAN_VMALLOC, which never occurs in practice. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-20-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/kaslr.c | 95 +++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index d5542666182f..3edee81d8ea7 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -20,14 +20,6 @@ #include #include -enum kaslr_status { - KASLR_ENABLED, - KASLR_DISABLED_CMDLINE, - KASLR_DISABLED_NO_SEED, - KASLR_DISABLED_FDT_REMAP, -}; - -static enum kaslr_status __initdata kaslr_status; u64 __ro_after_init module_alloc_base; u16 __initdata memstart_offset_seed; @@ -63,15 +55,9 @@ struct arm64_ftr_override kaslr_feature_override __initdata; u64 __init kaslr_early_init(void) { void *fdt; - u64 seed, offset, mask, module_range; + u64 seed, offset, mask; unsigned long raw; - /* - * Set a reasonable default for module_alloc_base in case - * we end up running with module randomization disabled. - */ - module_alloc_base = (u64)_etext - MODULES_VSIZE; - /* * Try to map the FDT early. If this fails, we simply bail, * and proceed with KASLR disabled. We will make another @@ -79,7 +65,6 @@ u64 __init kaslr_early_init(void) */ fdt = get_early_fdt_ptr(); if (!fdt) { - kaslr_status = KASLR_DISABLED_FDT_REMAP; return 0; } @@ -93,7 +78,6 @@ u64 __init kaslr_early_init(void) * return 0 if that is the case. */ if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { - kaslr_status = KASLR_DISABLED_CMDLINE; return 0; } @@ -106,7 +90,6 @@ u64 __init kaslr_early_init(void) seed ^= raw; if (!seed) { - kaslr_status = KASLR_DISABLED_NO_SEED; return 0; } @@ -126,19 +109,43 @@ u64 __init kaslr_early_init(void) /* use the top 16 bits to randomize the linear region */ memstart_offset_seed = seed >> 48; - if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) && - (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS))) - /* - * KASAN without KASAN_VMALLOC does not expect the module region - * to intersect the vmalloc region, since shadow memory is - * allocated for each module at load time, whereas the vmalloc - * region is shadowed by KASAN zero pages. So keep modules - * out of the vmalloc region if KASAN is enabled without - * KASAN_VMALLOC, and put the kernel well within 4 GB of the - * module region. - */ - return offset % SZ_2G; + return offset; +} + +static int __init kaslr_init(void) +{ + u64 module_range; + u32 seed; + + /* + * Set a reasonable default for module_alloc_base in case + * we end up running with module randomization disabled. + */ + module_alloc_base = (u64)_etext - MODULES_VSIZE; + + if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { + pr_info("KASLR disabled on command line\n"); + return 0; + } + + if (!kaslr_offset()) { + pr_warn("KASLR disabled due to lack of seed\n"); + return 0; + } + + pr_info("KASLR enabled\n"); + + /* + * KASAN without KASAN_VMALLOC does not expect the module region to + * intersect the vmalloc region, since shadow memory is allocated for + * each module at load time, whereas the vmalloc region will already be + * shadowed by KASAN zero pages. + */ + BUILD_BUG_ON((IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) && + !IS_ENABLED(CONFIG_KASAN_VMALLOC)); + + seed = get_random_u32(); if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { /* @@ -150,8 +157,7 @@ u64 __init kaslr_early_init(void) * resolved normally.) */ module_range = SZ_2G - (u64)(_end - _stext); - module_alloc_base = max((u64)_end + offset - SZ_2G, - (u64)MODULES_VADDR); + module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR); } else { /* * Randomize the module region by setting module_alloc_base to @@ -163,33 +169,12 @@ u64 __init kaslr_early_init(void) * when ARM64_MODULE_PLTS is enabled. */ module_range = MODULES_VSIZE - (u64)(_etext - _stext); - module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; } /* use the lower 21 bits to randomize the base of the module region */ module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; module_alloc_base &= PAGE_MASK; - return offset; -} - -static int __init kaslr_init(void) -{ - switch (kaslr_status) { - case KASLR_ENABLED: - pr_info("KASLR enabled\n"); - break; - case KASLR_DISABLED_CMDLINE: - pr_info("KASLR disabled on command line\n"); - break; - case KASLR_DISABLED_NO_SEED: - pr_warn("KASLR disabled due to lack of seed\n"); - break; - case KASLR_DISABLED_FDT_REMAP: - pr_warn("KASLR disabled due to FDT remapping failure\n"); - break; - } - return 0; } -core_initcall(kaslr_init) +subsys_initcall(kaslr_init) From aacd149b62382c63911060b8f64c1e3d89bd405a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:50 +0200 Subject: [PATCH 20/34] arm64: head: avoid relocating the kernel twice for KASLR Currently, when KASLR is in effect, we set up the kernel virtual address space twice: the first time, the KASLR seed is looked up in the device tree, and the kernel virtual mapping is torn down and recreated again, after which the relocations are applied a second time. The latter step means that statically initialized global pointer variables will be reset to their initial values, and to ensure that BSS variables are not set to values based on the initial translation, they are cleared again as well. All of this is needed because we need the command line (taken from the DT) to tell us whether or not to randomize the virtual address space before entering the kernel proper. However, this code has expanded little by little and now creates global state unrelated to the virtual randomization of the kernel before the mapping is torn down and set up again, and the BSS cleared for a second time. This has created some issues in the past, and it would be better to avoid this little dance if possible. So instead, let's use the temporary mapping of the device tree, and execute the bare minimum of code to decide whether or not KASLR should be enabled, and what the seed is. Only then, create the virtual kernel mapping, clear BSS, etc and proceed as normal. This avoids the issues around inconsistent global state due to BSS being cleared twice, and is generally more maintainable, as it permits us to defer all the remaining DT parsing and KASLR initialization to a later time. This means the relocation fixup code runs only a single time as well, allowing us to simplify the RELR handling code too, which is not idempotent and was therefore required to keep track of the offset that was applied the first time around. Note that this means we have to clone a pair of FDT library objects, so that we can control how they are built - we need the stack protector and other instrumentation disabled so that the code can tolerate being called this early. Note that only the kernel page tables and the temporary stack are mapped read-write at this point, which ensures that the early code does not modify any global state inadvertently. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-21-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/head.S | 73 ++++++------------- arch/arm64/kernel/image-vars.h | 4 ++ arch/arm64/kernel/kaslr.c | 87 ---------------------- arch/arm64/kernel/pi/Makefile | 33 +++++++++ arch/arm64/kernel/pi/kaslr_early.c | 112 +++++++++++++++++++++++++++++ 6 files changed, 171 insertions(+), 140 deletions(-) create mode 100644 arch/arm64/kernel/pi/Makefile create mode 100644 arch/arm64/kernel/pi/kaslr_early.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fa7981d0d917..88a96511580e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -59,7 +59,7 @@ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 8de346dd4470..5a2ff6466b6b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -86,15 +86,13 @@ * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset - * x24 __primary_switch() .. relocate_kernel() current RELR displacement + * x24 __primary_switch() linear map KASLR seed * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) bl preserve_boot_args bl init_kernel_el // w0=cpu_boot_mode mov x20, x0 - adrp x23, __PHYS_OFFSET - and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl create_idmap /* @@ -441,6 +439,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_RANDOMIZE_BASE + adrp x5, memstart_offset_seed // Save KASLR linear map seed + strh w24, [x5, :lo12:memstart_offset_seed] +#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif @@ -448,16 +450,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl early_fdt_map // Try mapping the FDT early mov x0, x22 // pass FDT address in x0 bl init_feature_override // Parse cpu feature overrides -#ifdef CONFIG_RANDOMIZE_BASE - tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? - b.ne 0f - bl kaslr_early_init // parse FDT for KASLR options - cbz x0, 0f // KASLR disabled? just proceed - orr x23, x23, x0 // record KASLR offset - ldp x29, x30, [sp], #16 // we must enable KASLR, return - ret // to __primary_switch() -0: -#endif mov x0, x20 bl switch_to_vhe // Prefer VHE if possible ldp x29, x30, [sp], #16 @@ -759,27 +751,17 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) * entry in x9, the address being relocated by the current address or * bitmap entry in x13 and the address being relocated by the current * bit in x14. - * - * Because addends are stored in place in the binary, RELR relocations - * cannot be applied idempotently. We use x24 to keep track of the - * currently applied displacement so that we can correctly relocate if - * __relocate_kernel is called twice with non-zero displacements (i.e. - * if there is both a physical misalignment and a KASLR displacement). */ adr_l x9, __relr_start adr_l x10, __relr_end - sub x15, x23, x24 // delta from previous offset - cbz x15, 7f // nothing to do if unchanged - mov x24, x23 // save new offset - 2: cmp x9, x10 b.hs 7f ldr x11, [x9], #8 tbnz x11, #0, 3f // branch to handle bitmaps add x13, x11, x23 ldr x12, [x13] // relocate address entry - add x12, x12, x15 + add x12, x12, x23 str x12, [x13], #8 // adjust to start of bitmap b 2b @@ -788,7 +770,7 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) cbz x11, 6f tbz x11, #0, 5f // skip bit if not set ldr x12, [x14] // relocate bit - add x12, x12, x15 + add x12, x12, x23 str x12, [x14] 5: add x14, x14, #8 // move to next bit's address @@ -812,40 +794,27 @@ SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu - +#ifdef CONFIG_RELOCATABLE + adrp x23, __PHYS_OFFSET + and x23, x23, MIN_KIMG_ALIGN - 1 +#ifdef CONFIG_RANDOMIZE_BASE + mov x0, x22 + adrp x1, init_pg_end + mov sp, x1 + mov x29, xzr + bl __pi_kaslr_early_init + and x24, x0, #SZ_2M - 1 // capture memstart offset seed + bic x0, x0, #SZ_2M - 1 + orr x23, x23, x0 // record kernel offset +#endif +#endif bl clear_page_tables bl create_kernel_mapping adrp x1, init_pg_dir load_ttbr1 x1, x1, x2 #ifdef CONFIG_RELOCATABLE -#ifdef CONFIG_RELR - mov x24, #0 // no RELR displacement yet -#endif bl __relocate_kernel -#ifdef CONFIG_RANDOMIZE_BASE - ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET - blr x8 - - /* - * If we return here, we have a KASLR displacement in x23 which we need - * to take into account by discarding the current kernel mapping and - * creating a new one. - */ - adrp x1, reserved_pg_dir // Disable translations via TTBR1 - load_ttbr1 x1, x1, x2 - bl clear_page_tables - bl create_kernel_mapping // Recreate kernel mapping - - tlbi vmalle1 // Remove any stale TLB entries - dsb nsh - isb - - adrp x1, init_pg_dir // Re-enable translations via TTBR1 - load_ttbr1 x1, x1, x2 - bl __relocate_kernel -#endif #endif ldr x8, =__primary_switched adrp x0, __PHYS_OFFSET diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 241c86b67d01..0c381a405bf0 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -41,6 +41,10 @@ __efistub_dcache_clean_poc = __pi_dcache_clean_poc; __efistub___memcpy = __pi_memcpy; __efistub___memmove = __pi_memmove; __efistub___memset = __pi_memset; + +__pi___memcpy = __pi_memcpy; +__pi___memmove = __pi_memmove; +__pi___memset = __pi_memset; #endif __efistub__text = _text; diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 3edee81d8ea7..325455d16dbc 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -23,95 +23,8 @@ u64 __ro_after_init module_alloc_base; u16 __initdata memstart_offset_seed; -static __init u64 get_kaslr_seed(void *fdt) -{ - int node, len; - fdt64_t *prop; - u64 ret; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - return 0; - - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); - if (!prop || len != sizeof(u64)) - return 0; - - ret = fdt64_to_cpu(*prop); - *prop = 0; - return ret; -} - struct arm64_ftr_override kaslr_feature_override __initdata; -/* - * This routine will be executed with the kernel mapped at its default virtual - * address, and if it returns successfully, the kernel will be remapped, and - * start_kernel() will be executed from a randomized virtual offset. The - * relocation will result in all absolute references (e.g., static variables - * containing function pointers) to be reinitialized, and zero-initialized - * .bss variables will be reset to 0. - */ -u64 __init kaslr_early_init(void) -{ - void *fdt; - u64 seed, offset, mask; - unsigned long raw; - - /* - * Try to map the FDT early. If this fails, we simply bail, - * and proceed with KASLR disabled. We will make another - * attempt at mapping the FDT in setup_machine() - */ - fdt = get_early_fdt_ptr(); - if (!fdt) { - return 0; - } - - /* - * Retrieve (and wipe) the seed from the FDT - */ - seed = get_kaslr_seed(fdt); - - /* - * Check if 'nokaslr' appears on the command line, and - * return 0 if that is the case. - */ - if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { - return 0; - } - - /* - * Mix in any entropy obtainable architecturally if enabled - * and supported. - */ - - if (arch_get_random_seed_long_early(&raw)) - seed ^= raw; - - if (!seed) { - return 0; - } - - /* - * OK, so we are proceeding with KASLR enabled. Calculate a suitable - * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. - * Even if we could randomize at page granularity for 16k and 64k pages, - * let's always round to 2 MB so we don't interfere with the ability to - * map using contiguous PTEs - */ - mask = ((1UL << (VA_BITS_MIN - 2)) - 1) & ~(SZ_2M - 1); - offset = BIT(VA_BITS_MIN - 3) + (seed & mask); - - /* use the top 16 bits to randomize the linear region */ - memstart_offset_seed = seed >> 48; - - return offset; -} - static int __init kaslr_init(void) { u64 module_range; diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile new file mode 100644 index 000000000000..839291430cb3 --- /dev/null +++ b/arch/arm64/kernel/pi/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2022 Google LLC + +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + $(call cc-option,-mbranch-protection=none) \ + -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ + -include $(srctree)/include/linux/hidden.h \ + -D__DISABLE_EXPORTS -ffreestanding -D__NO_FORTIFY \ + $(call cc-option,-fno-addrsig) + +# remove SCS flags from all objects in this directory +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) +# disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + +GCOV_PROFILE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ + --remove-section=.note.gnu.property \ + --prefix-alloc-sections=.init +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) + +obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c new file mode 100644 index 000000000000..6c3855e69395 --- /dev/null +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2022 Google LLC +// Author: Ard Biesheuvel + +// NOTE: code in this file runs *very* early, and is not permitted to use +// global variables or anything that relies on absolute addressing. + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* taken from lib/string.c */ +static char *__strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +static bool cmdline_contains_nokaslr(const u8 *cmdline) +{ + const u8 *str; + + str = __strstr(cmdline, "nokaslr"); + return str == cmdline || (str > cmdline && *(str - 1) == ' '); +} + +static bool is_kaslr_disabled_cmdline(void *fdt) +{ + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + + if (cmdline_contains_nokaslr(prop)) + return true; + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + goto out; + + return false; + } +out: + return cmdline_contains_nokaslr(CONFIG_CMDLINE); +} + +static u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +asmlinkage u64 kaslr_early_init(void *fdt) +{ + u64 seed; + + if (is_kaslr_disabled_cmdline(fdt)) + return 0; + + seed = get_kaslr_seed(fdt); + if (!seed) { +#ifdef CONFIG_ARCH_RANDOM + if (!__early_cpu_has_rndr() || + !__arm64_rndr((unsigned long *)&seed)) +#endif + return 0; + } + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of + * the lower and upper quarters to avoid colliding with other + * allocations. + */ + return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); +} From 7559d9f97581654fbd0c3fa21878b6d043bbe439 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Jun 2022 17:06:51 +0200 Subject: [PATCH 21/34] arm64: setup: drop early FDT pointer helpers We no longer need to call into the kernel to map the FDT before calling into the kernel so let's drop the helpers we added for this. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220624150651.1358849-22-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/setup.h | 3 --- arch/arm64/kernel/head.S | 2 -- arch/arm64/kernel/setup.c | 15 --------------- 3 files changed, 20 deletions(-) diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index 6437df661700..5f147a418281 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -5,9 +5,6 @@ #include -void *get_early_fdt_ptr(void); -void early_fdt_map(u64 dt_phys); - /* * These two variables are used in the head.S file. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 5a2ff6466b6b..6bf685f988f1 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -446,8 +446,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x21 // pass FDT address in x0 - bl early_fdt_map // Try mapping the FDT early mov x0, x22 // pass FDT address in x0 bl init_feature_override // Parse cpu feature overrides mov x0, x20 diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index fea3223704b6..d0e6c7a291da 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -163,21 +163,6 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } -static void *early_fdt_ptr __initdata; - -void __init *get_early_fdt_ptr(void) -{ - return early_fdt_ptr; -} - -asmlinkage void __init early_fdt_map(u64 dt_phys) -{ - int fdt_size; - - early_fixmap_init(); - early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); -} - static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; From fbf6ad5efe95665c188248b6abee94f4bf296604 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Jun 2022 10:32:46 +0200 Subject: [PATCH 22/34] arm64: lds: use PROVIDE instead of conditional definitions Currently, a build with CONFIG_EFI=n and CONFIG_KASAN=y will not complete successfully because of missing symbols. This is due to the fact that the __pi_ prefixed aliases for __memcpy/__memmove were put inside a #ifdef CONFIG_EFI block inadvertently, and are therefore missing from the build in question. These definitions should only be provided when needed, as they will otherwise clutter up the symbol table, kallsyms etc for no reason. Fortunately, instead of using CPP conditionals, we can achieve the same result by using the linker's PROVIDE() directive, which only defines a symbol if it is required to complete the link. So let's use that for all symbols alias definitions. Reported-by: kernel test robot Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220629083246.3729177-1-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/image-vars.h | 61 +++++++++++++++++----------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 0c381a405bf0..afa69e04e75e 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,11 +10,8 @@ #error This file should only be included in vmlinux.lds.S #endif -#ifdef CONFIG_EFI - -__efistub_kernel_size = _edata - _text; -__efistub_primary_entry_offset = primary_entry - _text; - +PROVIDE(__efistub_kernel_size = _edata - _text); +PROVIDE(__efistub_primary_entry_offset = primary_entry - _text); /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to @@ -25,35 +22,37 @@ __efistub_primary_entry_offset = primary_entry - _text; * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strnlen = __pi_strnlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub_strrchr = __pi_strrchr; -__efistub_dcache_clean_poc = __pi_dcache_clean_poc; +PROVIDE(__efistub_memcmp = __pi_memcmp); +PROVIDE(__efistub_memchr = __pi_memchr); +PROVIDE(__efistub_memcpy = __pi_memcpy); +PROVIDE(__efistub_memmove = __pi_memmove); +PROVIDE(__efistub_memset = __pi_memset); +PROVIDE(__efistub_strlen = __pi_strlen); +PROVIDE(__efistub_strnlen = __pi_strnlen); +PROVIDE(__efistub_strcmp = __pi_strcmp); +PROVIDE(__efistub_strncmp = __pi_strncmp); +PROVIDE(__efistub_strrchr = __pi_strrchr); +PROVIDE(__efistub_dcache_clean_poc = __pi_dcache_clean_poc); -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +PROVIDE(__efistub__text = _text); +PROVIDE(__efistub__end = _end); +PROVIDE(__efistub__edata = _edata); +PROVIDE(__efistub_screen_info = screen_info); +PROVIDE(__efistub__ctype = _ctype); -__pi___memcpy = __pi_memcpy; -__pi___memmove = __pi_memmove; -__pi___memset = __pi_memset; -#endif +/* + * The __ prefixed memcpy/memset/memmove symbols are provided by KASAN, which + * instruments the conventional ones. Therefore, any references from the EFI + * stub or other position independent, low level C code should be redirected to + * the non-instrumented versions as well. + */ +PROVIDE(__efistub___memcpy = __pi_memcpy); +PROVIDE(__efistub___memmove = __pi_memmove); +PROVIDE(__efistub___memset = __pi_memset); -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; -__efistub_screen_info = screen_info; -__efistub__ctype = _ctype; - -#endif +PROVIDE(__pi___memcpy = __pi_memcpy); +PROVIDE(__pi___memmove = __pi_memmove); +PROVIDE(__pi___memset = __pi_memset); #ifdef CONFIG_KVM From bdbcd22d491212c266589892f0818c65a2bc4704 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 29 Jun 2022 09:42:07 +0530 Subject: [PATCH 23/34] arm64: head: remove __PHYS_OFFSET It's very easy to confuse __PHYS_OFFSET and PHYS_OFFSET. To clarify things, let's remove __PHYS_OFFSET and use KERNEL_START directly, with comments to show that we're using physical address, as we do for other objects. At the same time, update the comment regarding the kernel entry address to mention __pa(KERNEL_START) rather than __pa(PAGE_OFFSET). There should be no functional change as a result of this patch. Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Will Deacon Acked-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20220629041207.1670133-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 6bf685f988f1..c300b43659dc 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -37,8 +37,6 @@ #include "efi-header.S" -#define __PHYS_OFFSET KERNEL_START - #if (PAGE_OFFSET & 0x1fffff) != 0 #error PAGE_OFFSET must be at least 2MB aligned #endif @@ -51,9 +49,6 @@ * MMU = off, D-cache = off, I-cache = on or off, * x0 = physical address to the FDT blob. * - * This code is mostly position independent so you call this at - * __pa(PAGE_OFFSET). - * * Note that the callee-saved registers are used for storing variables * that are useful before the MMU is enabled. The allocations are described * in the entry routines. @@ -409,7 +404,7 @@ SYM_FUNC_END(create_kernel_mapping) /* * The following fragment of code is executed with the MMU enabled. * - * x0 = __PHYS_OFFSET + * x0 = __pa(KERNEL_START) */ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x4, init_task @@ -793,7 +788,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) adrp x2, init_idmap_pg_dir bl __enable_mmu #ifdef CONFIG_RELOCATABLE - adrp x23, __PHYS_OFFSET + adrp x23, KERNEL_START and x23, x23, MIN_KIMG_ALIGN - 1 #ifdef CONFIG_RANDOMIZE_BASE mov x0, x22 @@ -815,6 +810,6 @@ SYM_FUNC_START_LOCAL(__primary_switch) bl __relocate_kernel #endif ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET + adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 SYM_FUNC_END(__primary_switch) From 0aaa68532e9da5cd6b1383e1535a5526253e359f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jul 2022 13:10:45 +0200 Subject: [PATCH 24/34] arm64: mm: fix booting with 52-bit address space Joey reports that booting 52-bit VA capable builds on 52-bit VA capable CPUs is broken since commit 0d9b1ffefabe ("arm64: mm: make vabits_actual a build time constant if possible"). This is due to the fact that the primary CPU reads the vabits_actual variable before it has been assigned. The reason for deferring the assignment of vabits_actual was that we try to perform as few stores to memory as we can with the MMU and caches off, due to the cache coherency issues it creates. Since __cpu_setup() [which is where the read of vabits_actual occurs] is also called on the secondary boot path, we cannot just read the CPU ID registers directly, given that the size of the VA space is decided by the capabilities of the primary CPU. So let's read vabits_actual only on the secondary boot path, and read the CPU ID registers directly on the primary boot path, by making it a function parameter of __cpu_setup(). To ensure that all users of vabits_actual (including kasan_early_init()) observe the correct value, move the assignment of vabits_actual back into asm code, but still defer it to after the MMU and caches have been enabled. Cc: Will Deacon Cc: Anshuman Khandual Cc: Mark Rutland Fixes: 0d9b1ffefabe ("arm64: mm: make vabits_actual a build time constant if possible") Reported-by: Joey Gouly Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220701111045.2944309-1-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 18 ++++++++++++++++++ arch/arm64/mm/init.c | 15 +-------------- arch/arm64/mm/proc.S | 5 +++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index c300b43659dc..ae0a9e44ca19 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -82,6 +82,7 @@ * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset * x24 __primary_switch() linear map KASLR seed + * x25 primary_entry() .. start_kernel() supported VA size * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) @@ -96,6 +97,14 @@ SYM_CODE_START(primary_entry) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ +#if VA_BITS > 48 + mrs_s x0, SYS_ID_AA64MMFR2_EL1 + tst x0, #0xf << ID_AA64MMFR2_LVA_SHIFT + mov x0, #VA_BITS + mov x25, #VA_BITS_MIN + csel x25, x25, x0, eq + mov x0, x25 +#endif bl __cpu_setup // initialise processor b __primary_switch SYM_CODE_END(primary_entry) @@ -434,6 +443,12 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW +#if VA_BITS > 48 + adr_l x8, vabits_actual // Set this early so KASAN early init + str x25, [x8] // ... observes the correct value + dc civac, x8 // Make visible to booting secondaries +#endif + #ifdef CONFIG_RANDOMIZE_BASE adrp x5, memstart_offset_seed // Save KASLR linear map seed strh w24, [x5, :lo12:memstart_offset_seed] @@ -579,6 +594,9 @@ SYM_FUNC_START_LOCAL(secondary_startup) mov x20, x0 // preserve boot mode bl switch_to_vhe bl __cpu_secondary_check52bitva +#if VA_BITS > 48 + ldr_l x0, vabits_actual +#endif bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir adrp x2, idmap_pg_dir diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1faa6760895e..339ee84e5a61 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -265,20 +265,7 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - s64 linear_region_size; - -#if VA_BITS > 48 - if (cpuid_feature_extract_unsigned_field( - read_sysreg_s(SYS_ID_AA64MMFR2_EL1), - ID_AA64MMFR2_LVA_SHIFT)) - vabits_actual = VA_BITS; - - /* make the variable visible to secondaries with the MMU off */ - dcache_clean_inval_poc((u64)&vabits_actual, - (u64)&vabits_actual + sizeof(vabits_actual)); -#endif - - linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); + s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* * Corner case: 52-bit VA capable systems running KVM in nVHE mode may diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 605c6640f94b..9eb490effb7f 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -397,6 +397,8 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings) * * Initialise the processor for turning the MMU on. * + * Input: + * x0 - actual number of VA bits (ignored unless VA_BITS > 48) * Output: * Return in x0 the value of the SCTLR_EL1 register. */ @@ -466,8 +468,7 @@ SYM_FUNC_START(__cpu_setup) tcr_clear_errata_bits tcr, x9, x5 #ifdef CONFIG_ARM64_VA_BITS_52 - ldr_l x9, vabits_actual - sub x9, xzr, x9 + sub x9, xzr, x0 add x9, x9, #64 tcr_set_t1sz tcr, x9 #else From 7ddb0c3df7881206dcd8339c8dabf0318a781f91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:52 +0100 Subject: [PATCH 25/34] arm64: Rename the VHE switch to "finalise_el2" as we are about to perform a lot more in 'mutate_to_vhe' than we currently do, this function really becomes the point where we finalise the basic EL2 configuration. Reflect this into the code by renaming a bunch of things: - HVC_VHE_RESTART -> HVC_FINALISE_EL2 - switch_to_vhe --> finalise_el2 - mutate_to_vhe -> __finalise_el2 No functional changes. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220630160500.1536744-2-maz@kernel.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hyp-abi.rst | 11 ++++++----- arch/arm64/include/asm/virt.h | 4 ++-- arch/arm64/kernel/head.S | 6 +++--- arch/arm64/kernel/hyp-stub.S | 21 ++++++++++----------- arch/arm64/kernel/sleep.S | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/virt/kvm/arm/hyp-abi.rst b/Documentation/virt/kvm/arm/hyp-abi.rst index 4d43fbc25195..412b276449d3 100644 --- a/Documentation/virt/kvm/arm/hyp-abi.rst +++ b/Documentation/virt/kvm/arm/hyp-abi.rst @@ -60,12 +60,13 @@ these functions (see arch/arm{,64}/include/asm/virt.h): * :: - x0 = HVC_VHE_RESTART (arm64 only) + x0 = HVC_FINALISE_EL2 (arm64 only) - Attempt to upgrade the kernel's exception level from EL1 to EL2 by enabling - the VHE mode. This is conditioned by the CPU supporting VHE, the EL2 MMU - being off, and VHE not being disabled by any other means (command line - option, for example). + Finish configuring EL2 depending on the command-line options, + including an attempt to upgrade the kernel's exception level from + EL1 to EL2 by enabling the VHE mode. This is conditioned by the CPU + supporting VHE, the EL2 MMU being off, and VHE not being disabled by + any other means (command line option, for example). Any other value of r0/x0 triggers a hypervisor-specific handling, which is not documented here. diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 0e80db4327b6..dec6eee0eda5 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -36,9 +36,9 @@ #define HVC_RESET_VECTORS 2 /* - * HVC_VHE_RESTART - Upgrade the CPU from EL1 to EL2, if possible + * HVC_FINALISE_EL2 - Upgrade the CPU from EL1 to EL2, if possible */ -#define HVC_VHE_RESTART 3 +#define HVC_FINALISE_EL2 3 /* Max number of HYP stub hypercalls */ #define HVC_STUB_HCALL_NR 4 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ae0a9e44ca19..6feac4ee105a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -459,7 +459,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) mov x0, x22 // pass FDT address in x0 bl init_feature_override // Parse cpu feature overrides mov x0, x20 - bl switch_to_vhe // Prefer VHE if possible + bl finalise_el2 // Prefer VHE if possible ldp x29, x30, [sp], #16 bl start_kernel ASM_BUG() @@ -542,7 +542,7 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) eret __cpu_stick_to_vhe: - mov x0, #HVC_VHE_RESTART + mov x0, #HVC_FINALISE_EL2 hvc #0 mov x0, #BOOT_CPU_MODE_EL2 ret @@ -592,7 +592,7 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ mov x20, x0 // preserve boot mode - bl switch_to_vhe + bl finalise_el2 bl __cpu_secondary_check52bitva #if VA_BITS > 48 ldr_l x0, vabits_actual diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 5bafb53fafb4..571286eb443c 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -51,8 +51,8 @@ SYM_CODE_START_LOCAL(elx_sync) msr vbar_el2, x1 b 9f -1: cmp x0, #HVC_VHE_RESTART - b.eq mutate_to_vhe +1: cmp x0, #HVC_FINALISE_EL2 + b.eq __finalise_el2 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f @@ -73,8 +73,8 @@ SYM_CODE_START_LOCAL(elx_sync) eret SYM_CODE_END(elx_sync) -// nVHE? No way! Give me the real thing! -SYM_CODE_START_LOCAL(mutate_to_vhe) +SYM_CODE_START_LOCAL(__finalise_el2) + // nVHE? No way! Give me the real thing! // Sanity check: MMU *must* be off mrs x1, sctlr_el2 tbnz x1, #0, 1f @@ -140,10 +140,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) msr spsr_el1, x0 b enter_vhe -SYM_CODE_END(mutate_to_vhe) +SYM_CODE_END(__finalise_el2) // At the point where we reach enter_vhe(), we run with - // the MMU off (which is enforced by mutate_to_vhe()). + // the MMU off (which is enforced by __finalise_el2()). // We thus need to be in the idmap, or everything will // explode when enabling the MMU. @@ -222,11 +222,11 @@ SYM_FUNC_START(__hyp_reset_vectors) SYM_FUNC_END(__hyp_reset_vectors) /* - * Entry point to switch to VHE if deemed capable + * Entry point to finalise EL2 and switch to VHE if deemed capable * * w0: boot mode, as returned by init_kernel_el() */ -SYM_FUNC_START(switch_to_vhe) +SYM_FUNC_START(finalise_el2) // Need to have booted at EL2 cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -236,9 +236,8 @@ SYM_FUNC_START(switch_to_vhe) cmp x0, #CurrentEL_EL1 b.ne 1f - // Turn the world upside down - mov x0, #HVC_VHE_RESTART + mov x0, #HVC_FINALISE_EL2 hvc #0 1: ret -SYM_FUNC_END(switch_to_vhe) +SYM_FUNC_END(finalise_el2) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index e36b09d942f7..617f78ad43a1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -100,7 +100,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) bl init_kernel_el - bl switch_to_vhe + bl finalise_el2 bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir From b65e411d6cc2f12a728cabe66b930c63c527a340 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:53 +0100 Subject: [PATCH 26/34] arm64: Save state of HCR_EL2.E2H before switch to EL1 As we're about to switch the way E2H-stuck CPUs boot, save the boot CPU E2H state as a flag tied to the boot mode that can then be checked by the idreg override code. This allows us to replace the is_kernel_in_hyp_mode() check with a simple comparison with this state, even when running at EL1. Note that this flag isn't saved in __boot_cpu_mode, and is only kept in a register in the assembly code. Use with caution. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220630160500.1536744-3-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/virt.h | 7 +++++++ arch/arm64/kernel/head.S | 7 +++++-- arch/arm64/kernel/idreg-override.c | 11 ++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index dec6eee0eda5..4eb601e7de50 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -49,6 +49,13 @@ #define BOOT_CPU_MODE_EL1 (0xe11) #define BOOT_CPU_MODE_EL2 (0xe12) +/* + * Flags returned together with the boot mode, but not preserved in + * __boot_cpu_mode. Used by the idreg override code to work out the + * boot state. + */ +#define BOOT_CPU_FLAG_E2H BIT_ULL(32) + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 6feac4ee105a..73eb7c96a245 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -457,6 +457,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl kasan_early_init #endif mov x0, x22 // pass FDT address in x0 + mov x1, x20 // pass the full boot status bl init_feature_override // Parse cpu feature overrides mov x0, x20 bl finalise_el2 // Prefer VHE if possible @@ -479,8 +480,9 @@ SYM_FUNC_END(__primary_switched) * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. * - * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if - * booted in EL1 or EL2 respectively. + * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x0 if + * booted in EL1 or EL2 respectively, with the top 32 bits containing + * potential context flags. These flags are *not* stored in __boot_cpu_mode. */ SYM_FUNC_START(init_kernel_el) mrs x0, CurrentEL @@ -545,6 +547,7 @@ __cpu_stick_to_vhe: mov x0, #HVC_FINALISE_EL2 hvc #0 mov x0, #BOOT_CPU_MODE_EL2 + orr x0, x0, #BOOT_CPU_FLAG_E2H ret SYM_FUNC_END(init_kernel_el) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index f92836e196e5..03185bc46d69 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -19,6 +19,8 @@ #define FTR_ALIAS_NAME_LEN 30 #define FTR_ALIAS_OPTION_LEN 116 +static u64 __boot_status __initdata; + struct ftr_set_desc { char name[FTR_DESC_NAME_LEN]; struct arm64_ftr_override *override; @@ -37,7 +39,8 @@ static bool __init mmfr1_vh_filter(u64 val) * the user was trying to force nVHE on us, proceed with * attitude adjustment. */ - return !(is_kernel_in_hyp_mode() && val == 0); + return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && + val == 0); } static const struct ftr_set_desc mmfr1 __initconst = { @@ -229,9 +232,9 @@ static __init void parse_cmdline(const void *fdt) } /* Keep checkers quiet */ -void init_feature_override(const void *fdt); +void init_feature_override(const void *fdt, u64 boot_status); -asmlinkage void __init init_feature_override(const void *fdt) +asmlinkage void __init init_feature_override(const void *fdt, u64 boot_status) { int i; @@ -242,6 +245,8 @@ asmlinkage void __init init_feature_override(const void *fdt) } } + __boot_status = boot_status; + parse_cmdline(fdt); for (i = 0; i < ARRAY_SIZE(regs); i++) { From ae4b7e38e9a94798f08f5d66b02077900ab92903 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:54 +0100 Subject: [PATCH 27/34] arm64: Allow sticky E2H when entering EL1 For CPUs that have the unfortunate mis-feature to be stuck in VHE mode, we perform a funny dance where we completely shortcut the normal boot process to enable VHE and run the kernel at EL2, and only then start booting the kernel. Not only this is pretty ugly, but it means that the EL2 finalisation occurs before we have processed the sysreg override. Instead, start executing the kernel as if it was an EL1 guest and rely on the normal EL2 finalisation to go back to EL2. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220630160500.1536744-4-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 73eb7c96a245..29d641290293 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -511,6 +511,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr vbar_el2, x0 isb + mov_q x1, INIT_SCTLR_EL1_MMU_OFF + /* * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, * making it impossible to start in nVHE mode. Is that @@ -520,35 +522,19 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) and x0, x0, #HCR_E2H cbz x0, 1f - /* Switching to VHE requires a sane SCTLR_EL1 as a start */ - mov_q x0, INIT_SCTLR_EL1_MMU_OFF - msr_s SYS_SCTLR_EL12, x0 - - /* - * Force an eret into a helper "function", and let it return - * to our original caller... This makes sure that we have - * initialised the basic PSTATE state. - */ - mov x0, #INIT_PSTATE_EL2 - msr spsr_el1, x0 - adr x0, __cpu_stick_to_vhe - msr elr_el1, x0 - eret + /* Set a sane SCTLR_EL1, the VHE way */ + msr_s SYS_SCTLR_EL12, x1 + mov x2, #BOOT_CPU_FLAG_E2H + b 2f 1: - mov_q x0, INIT_SCTLR_EL1_MMU_OFF - msr sctlr_el1, x0 - + msr sctlr_el1, x1 + mov x2, xzr +2: msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 + orr x0, x0, x2 eret - -__cpu_stick_to_vhe: - mov x0, #HVC_FINALISE_EL2 - hvc #0 - mov x0, #BOOT_CPU_MODE_EL2 - orr x0, x0, #BOOT_CPU_FLAG_E2H - ret SYM_FUNC_END(init_kernel_el) /* From fa8aa59ae6454b7a6a670acb2097e7800992cba1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:55 +0100 Subject: [PATCH 28/34] arm64: Factor out checking of a feature against the override into a macro Checking for a feature being supported from assembly code is a bit tedious if we need to factor in the idreg override. Since we already have such code written for forcing nVHE, move the whole thing into a macro. This heavily relies on the override structure being called foo_override for foo_el1. No functional change. Reviewed-by: Mark Brown Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220630160500.1536744-5-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 571286eb443c..43c94e7a2c1d 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -16,6 +16,25 @@ #include #include +// Warning, hardcoded register allocation +// This will clobber x1 and x2. +.macro check_override idreg, fld, pass, fail + mrs x1, \idreg\()_el1 + ubfx x1, x1, #\fld, #4 + cbz x1, \fail + + adr_l x1, \idreg\()_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + ubfx x2, x2, #\fld, #4 + ubfx x1, x1, #\fld, #4 + cmp x1, xzr + and x2, x2, x1 + csinv x2, x2, xzr, ne + cbnz x2, \pass + b \fail +.endm + .text .pushsection .hyp.text, "ax" @@ -80,20 +99,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) tbnz x1, #0, 1f // Needs to be VHE capable, obviously - mrs x1, id_aa64mmfr1_el1 - ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 - cbz x1, 1f - - // Check whether VHE is disabled from the command line - adr_l x1, id_aa64mmfr1_override - ldr x2, [x1, FTR_OVR_VAL_OFFSET] - ldr x1, [x1, FTR_OVR_MASK_OFFSET] - ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 - ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 - cmp x1, xzr - and x2, x2, x1 - csinv x2, x2, xzr, ne - cbnz x2, 2f + check_override id_aa64mmfr1 ID_AA64MMFR1_VHE_SHIFT 2f 1f 1: mov_q x0, HVC_STUB_ERR eret From 6b7ec18c09763f72dcd3d87e194860f9e9db8968 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:56 +0100 Subject: [PATCH 29/34] arm64: Allow the idreg override to deal with variable field width Currently, the override mechanism can only deal with 4bit fields, which is the most common case. However, we now have a bunch of ID registers that have more diverse field widths, such as ID_AA64SMFR0_EL1, which has fields that are a single bit wide. Add the support for variable width, and a macro that encodes a feature width of 4 for all existing override. No functional change. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20220630160500.1536744-6-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 03185bc46d69..1e5f3dba3f01 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -27,10 +27,13 @@ struct ftr_set_desc { struct { char name[FTR_DESC_FIELD_LEN]; u8 shift; + u8 width; bool (*filter)(u64 val); } fields[]; }; +#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -47,7 +50,7 @@ static const struct ftr_set_desc mmfr1 __initconst = { .name = "id_aa64mmfr1", .override = &id_aa64mmfr1_override, .fields = { - { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter }, + FIELD("vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter), {} }, }; @@ -56,8 +59,8 @@ static const struct ftr_set_desc pfr1 __initconst = { .name = "id_aa64pfr1", .override = &id_aa64pfr1_override, .fields = { - { "bt", ID_AA64PFR1_BT_SHIFT }, - { "mte", ID_AA64PFR1_MTE_SHIFT}, + FIELD("bt", ID_AA64PFR1_BT_SHIFT, NULL), + FIELD("mte", ID_AA64PFR1_MTE_SHIFT, NULL), {} }, }; @@ -66,10 +69,10 @@ static const struct ftr_set_desc isar1 __initconst = { .name = "id_aa64isar1", .override = &id_aa64isar1_override, .fields = { - { "gpi", ID_AA64ISAR1_GPI_SHIFT }, - { "gpa", ID_AA64ISAR1_GPA_SHIFT }, - { "api", ID_AA64ISAR1_API_SHIFT }, - { "apa", ID_AA64ISAR1_APA_SHIFT }, + FIELD("gpi", ID_AA64ISAR1_GPI_SHIFT, NULL), + FIELD("gpa", ID_AA64ISAR1_GPA_SHIFT, NULL), + FIELD("api", ID_AA64ISAR1_API_SHIFT, NULL), + FIELD("apa", ID_AA64ISAR1_APA_SHIFT, NULL), {} }, }; @@ -78,8 +81,8 @@ static const struct ftr_set_desc isar2 __initconst = { .name = "id_aa64isar2", .override = &id_aa64isar2_override, .fields = { - { "gpa3", ID_AA64ISAR2_GPA3_SHIFT }, - { "apa3", ID_AA64ISAR2_APA3_SHIFT }, + FIELD("gpa3", ID_AA64ISAR2_GPA3_SHIFT, NULL), + FIELD("apa3", ID_AA64ISAR2_APA3_SHIFT, NULL), {} }, }; @@ -92,7 +95,7 @@ static const struct ftr_set_desc kaslr __initconst = { .override = &kaslr_feature_override, #endif .fields = { - { "disabled", 0 }, + FIELD("disabled", 0, NULL), {} }, }; @@ -147,7 +150,8 @@ static void __init match_options(const char *cmdline) for (f = 0; strlen(regs[i]->fields[f].name); f++) { u64 shift = regs[i]->fields[f].shift; - u64 mask = 0xfUL << shift; + u64 width = regs[i]->fields[f].width ?: 4; + u64 mask = GENMASK_ULL(shift + width - 1, shift); u64 v; if (find_field(cmdline, regs[i], f, &v)) @@ -155,7 +159,7 @@ static void __init match_options(const char *cmdline) /* * If an override gets filtered out, advertise - * it by setting the value to 0xf, but + * it by setting the value to the all-ones while * clearing the mask... Yes, this is fragile. */ if (regs[i]->fields[f].filter && From 6ab7661e1d3930edc8462099f047ca2cfdbd343a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:57 +0100 Subject: [PATCH 30/34] arm64: Expose a __check_override primitive for oddball features In order to feal with early override of features that are not classically encoded in a standard ID register with a 4 bit wide field, add a primitive that takes a sysreg value as an input (instead of the usual sysreg name) as well as a bit field width (usually 4). No functional change. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20220630160500.1536744-7-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 43c94e7a2c1d..de1ab9843c31 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -17,17 +17,17 @@ #include // Warning, hardcoded register allocation -// This will clobber x1 and x2. -.macro check_override idreg, fld, pass, fail - mrs x1, \idreg\()_el1 - ubfx x1, x1, #\fld, #4 +// This will clobber x1 and x2, and expect x1 to contain +// the id register value as read from the HW +.macro __check_override idreg, fld, width, pass, fail + ubfx x1, x1, #\fld, #\width cbz x1, \fail adr_l x1, \idreg\()_override ldr x2, [x1, FTR_OVR_VAL_OFFSET] ldr x1, [x1, FTR_OVR_MASK_OFFSET] - ubfx x2, x2, #\fld, #4 - ubfx x1, x1, #\fld, #4 + ubfx x2, x2, #\fld, #\width + ubfx x1, x1, #\fld, #\width cmp x1, xzr and x2, x2, x1 csinv x2, x2, xzr, ne @@ -35,6 +35,11 @@ b \fail .endm +.macro check_override idreg, fld, pass, fail + mrs x1, \idreg\()_el1 + __check_override \idreg \fld 4 \pass \fail +.endm + .text .pushsection .hyp.text, "ax" From b3000e2133d878e586416e440642ca82d234c6fb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:58 +0100 Subject: [PATCH 31/34] arm64: Add the arm64.nosme command line option In order to be able to completely disable SME even if the HW seems to support it (most likely because the FW is broken), move the SME setup into the EL2 finalisation block, and use a new idreg override to deal with it. Note that we also nuke id_aa64smfr0_el1 as a byproduct. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20220630160500.1536744-8-maz@kernel.org Signed-off-by: Will Deacon --- .../admin-guide/kernel-parameters.txt | 3 ++ arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/include/asm/el2_setup.h | 45 ------------------- arch/arm64/kernel/cpufeature.c | 4 +- arch/arm64/kernel/hyp-stub.S | 41 +++++++++++++++++ arch/arm64/kernel/idreg-override.c | 17 +++++++ 6 files changed, 65 insertions(+), 46 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..301d2d0fee80 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -400,6 +400,9 @@ arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension support + arm64.nosme [ARM64] Unconditionally disable Scalable Matrix + Extension support + ataflop= [HW,M68k] atarimouse= [HW,MOUSE] Atari Mouse diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 14a8f3d93add..5adda12b1946 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -909,6 +909,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) extern struct arm64_ftr_override id_aa64mmfr1_override; extern struct arm64_ftr_override id_aa64pfr1_override; +extern struct arm64_ftr_override id_aa64smfr0_override; extern struct arm64_ftr_override id_aa64isar1_override; extern struct arm64_ftr_override id_aa64isar2_override; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 34ceff08cac4..18641dce5184 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -143,50 +143,6 @@ .Lskip_sve_\@: .endm -/* SME register access and priority mapping */ -.macro __init_el2_nvhe_sme - mrs x1, id_aa64pfr1_el1 - ubfx x1, x1, #ID_AA64PFR1_SME_SHIFT, #4 - cbz x1, .Lskip_sme_\@ - - bic x0, x0, #CPTR_EL2_TSM // Also disable SME traps - msr cptr_el2, x0 // Disable copro. traps to EL2 - isb - - mrs x1, sctlr_el2 - orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps - msr sctlr_el2, x1 - isb - - mov x1, #0 // SMCR controls - - mrs_s x2, SYS_ID_AA64SMFR0_EL1 - ubfx x2, x2, #ID_AA64SMFR0_FA64_SHIFT, #1 // Full FP in SM? - cbz x2, .Lskip_sme_fa64_\@ - - orr x1, x1, SMCR_ELx_FA64_MASK -.Lskip_sme_fa64_\@: - - orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector - msr_s SYS_SMCR_EL2, x1 // length for EL1. - - mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? - ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 - cbz x1, .Lskip_sme_\@ - - msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal - - mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? - ubfx x1, x1, #ID_AA64MMFR1_HCX_SHIFT, #4 - cbz x1, .Lskip_sme_\@ - - mrs_s x1, SYS_HCRX_EL2 - orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping - msr_s SYS_HCRX_EL2, x1 - -.Lskip_sme_\@: -.endm - /* Disable any fine grained traps */ .macro __init_el2_fgt mrs x1, id_aa64mmfr0_el1 @@ -251,7 +207,6 @@ __init_el2_nvhe_idregs __init_el2_nvhe_cptr __init_el2_nvhe_sve - __init_el2_nvhe_sme __init_el2_fgt __init_el2_nvhe_prepare_eret .endm diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a97913d19709..a7d0686123a6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -632,6 +632,7 @@ static const struct arm64_ftr_bits ftr_raz[] = { struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; struct arm64_ftr_override __ro_after_init id_aa64isar1_override; struct arm64_ftr_override __ro_after_init id_aa64isar2_override; @@ -672,7 +673,8 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), - ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, + &id_aa64smfr0_override), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index de1ab9843c31..0c69defa069e 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -98,6 +98,47 @@ SYM_CODE_START_LOCAL(elx_sync) SYM_CODE_END(elx_sync) SYM_CODE_START_LOCAL(__finalise_el2) + check_override id_aa64pfr1 ID_AA64PFR1_SME_SHIFT .Linit_sme .Lskip_sme + +.Linit_sme: /* SME register access and priority mapping */ + mrs x0, cptr_el2 // Disable SME traps + bic x0, x0, #CPTR_EL2_TSM + msr cptr_el2, x0 + isb + + mrs x1, sctlr_el2 + orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps + msr sctlr_el2, x1 + isb + + mov x1, #0 // SMCR controls + + mrs_s x2, SYS_ID_AA64SMFR0_EL1 + ubfx x2, x2, #ID_AA64SMFR0_FA64_SHIFT, #1 // Full FP in SM? + cbz x2, .Lskip_sme_fa64 + + orr x1, x1, SMCR_ELx_FA64_MASK +.Lskip_sme_fa64: + + orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x1 // length for EL1. + + mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? + ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 + cbz x1, .Lskip_sme + + msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal + + mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? + ubfx x1, x1, #ID_AA64MMFR1_HCX_SHIFT, #4 + cbz x1, .Lskip_sme + + mrs_s x1, SYS_HCRX_EL2 + orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping + msr_s SYS_HCRX_EL2, x1 + +.Lskip_sme: + // nVHE? No way! Give me the real thing! // Sanity check: MMU *must* be off mrs x1, sctlr_el2 diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 1e5f3dba3f01..9314f0a8561c 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -55,12 +55,28 @@ static const struct ftr_set_desc mmfr1 __initconst = { }, }; +static bool __init pfr1_sme_filter(u64 val) +{ + /* + * Similarly to SVE, disabling SME also means disabling all + * the features that are associated with it. Just set + * id_aa64smfr0_el1 to 0 and don't look back. + */ + if (!val) { + id_aa64smfr0_override.val = 0; + id_aa64smfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + static const struct ftr_set_desc pfr1 __initconst = { .name = "id_aa64pfr1", .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_BT_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_MTE_SHIFT, NULL), + FIELD("sme", ID_AA64PFR1_SME_SHIFT, pfr1_sme_filter), {} }, }; @@ -114,6 +130,7 @@ static const struct { } aliases[] __initconst = { { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " From 504ee23611c4bb27777576ec6c1170fd45026093 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:04:59 +0100 Subject: [PATCH 32/34] arm64: Add the arm64.nosve command line option In order to be able to completely disable SVE even if the HW seems to support it (most likely because the FW is broken), move the SVE setup into the EL2 finalisation block, and use a new idreg override to deal with it. Note that we also nuke id_aa64zfr0_el1 as a byproduct, and that SME also gets disabled, due to the dependency between the two features. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20220630160500.1536744-9-maz@kernel.org Signed-off-by: Will Deacon --- .../admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/include/asm/cpufeature.h | 2 ++ arch/arm64/include/asm/el2_setup.h | 15 ----------- arch/arm64/kernel/cpufeature.c | 8 ++++-- arch/arm64/kernel/hyp-stub.S | 11 ++++++++ arch/arm64/kernel/idreg-override.c | 26 +++++++++++++++++++ 6 files changed, 48 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 301d2d0fee80..0f1344eb7c2f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -400,6 +400,9 @@ arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension support + arm64.nosve [ARM64] Unconditionally disable Scalable Vector + Extension support + arm64.nosme [ARM64] Unconditionally disable Scalable Matrix Extension support diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5adda12b1946..0fc4f6e068e5 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -908,7 +908,9 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) } extern struct arm64_ftr_override id_aa64mmfr1_override; +extern struct arm64_ftr_override id_aa64pfr0_override; extern struct arm64_ftr_override id_aa64pfr1_override; +extern struct arm64_ftr_override id_aa64zfr0_override; extern struct arm64_ftr_override id_aa64smfr0_override; extern struct arm64_ftr_override id_aa64isar1_override; extern struct arm64_ftr_override id_aa64isar2_override; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 18641dce5184..2630faa5bc08 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -129,20 +129,6 @@ msr cptr_el2, x0 // Disable copro. traps to EL2 .endm -/* SVE register access */ -.macro __init_el2_nvhe_sve - mrs x1, id_aa64pfr0_el1 - ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 - cbz x1, .Lskip_sve_\@ - - bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps - msr cptr_el2, x0 // Disable copro. traps to EL2 - isb - mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector - msr_s SYS_ZCR_EL2, x1 // length for EL1. -.Lskip_sve_\@: -.endm - /* Disable any fine grained traps */ .macro __init_el2_fgt mrs x1, id_aa64mmfr0_el1 @@ -206,7 +192,6 @@ __init_el2_hstr __init_el2_nvhe_idregs __init_el2_nvhe_cptr - __init_el2_nvhe_sve __init_el2_fgt __init_el2_nvhe_prepare_eret .endm diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a7d0686123a6..e5afa9eba85d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -631,7 +631,9 @@ static const struct arm64_ftr_bits ftr_raz[] = { __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; struct arm64_ftr_override __ro_after_init id_aa64isar1_override; struct arm64_ftr_override __ro_after_init id_aa64isar2_override; @@ -669,10 +671,12 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ - ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0, + &id_aa64pfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), - ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, + &id_aa64zfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, &id_aa64smfr0_override), diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 0c69defa069e..d6b0a70a7080 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -98,6 +98,17 @@ SYM_CODE_START_LOCAL(elx_sync) SYM_CODE_END(elx_sync) SYM_CODE_START_LOCAL(__finalise_el2) + check_override id_aa64pfr0 ID_AA64PFR0_SVE_SHIFT .Linit_sve .Lskip_sve + +.Linit_sve: /* SVE register access */ + mrs x0, cptr_el2 // Disable SVE traps + bic x0, x0, #CPTR_EL2_TZ + msr cptr_el2, x0 + isb + mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector + msr_s SYS_ZCR_EL2, x1 // length for EL1. + +.Lskip_sve: check_override id_aa64pfr1 ID_AA64PFR1_SME_SHIFT .Linit_sme .Lskip_sme .Linit_sme: /* SME register access and priority mapping */ diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 9314f0a8561c..7cca82639606 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -55,6 +55,30 @@ static const struct ftr_set_desc mmfr1 __initconst = { }, }; +static bool __init pfr0_sve_filter(u64 val) +{ + /* + * Disabling SVE also means disabling all the features that + * are associated with it. The easiest way to do it is just to + * override id_aa64zfr0_el1 to be 0. + */ + if (!val) { + id_aa64zfr0_override.val = 0; + id_aa64zfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr0 __initconst = { + .name = "id_aa64pfr0", + .override = &id_aa64pfr0_override, + .fields = { + FIELD("sve", ID_AA64PFR0_SVE_SHIFT, pfr0_sve_filter), + {} + }, +}; + static bool __init pfr1_sme_filter(u64 val) { /* @@ -118,6 +142,7 @@ static const struct ftr_set_desc kaslr __initconst = { static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, + &pfr0, &pfr1, &isar1, &isar2, @@ -130,6 +155,7 @@ static const struct { } aliases[] __initconst = { { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "arm64.nosve", "id_aa64pfr0.sve=0 id_aa64pfr1.sme=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, { "arm64.nopauth", From 18c9aa490795745e67e30816fa9b4fafab2df7dc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2022 17:05:00 +0100 Subject: [PATCH 33/34] arm64: Add an override for ID_AA64SMFR0_EL1.FA64 Add a specific override for ID_AA64SMFR0_EL1.FA64, which disables the full A64 streaming SVE mode. Note that no alias is provided for this, as this is already covered by arm64.nosme, and is only added as a debugging facility. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20220630160500.1536744-10-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 15 ++++++++------- arch/arm64/kernel/idreg-override.c | 11 +++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index d6b0a70a7080..3dcc3272ce16 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -122,17 +122,18 @@ SYM_CODE_START_LOCAL(__finalise_el2) msr sctlr_el2, x1 isb - mov x1, #0 // SMCR controls + mov x0, #0 // SMCR controls - mrs_s x2, SYS_ID_AA64SMFR0_EL1 - ubfx x2, x2, #ID_AA64SMFR0_FA64_SHIFT, #1 // Full FP in SM? - cbz x2, .Lskip_sme_fa64 + // Full FP in SM? + mrs_s x1, SYS_ID_AA64SMFR0_EL1 + __check_override id_aa64smfr0 ID_AA64SMFR0_FA64_SHIFT 1 .Linit_sme_fa64 .Lskip_sme_fa64 - orr x1, x1, SMCR_ELx_FA64_MASK +.Linit_sme_fa64: + orr x0, x0, SMCR_ELx_FA64_MASK .Lskip_sme_fa64: - orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector - msr_s SYS_SMCR_EL2, x1 // length for EL1. + orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x0 // length for EL1. mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 7cca82639606..aa2a53d0d417 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -127,6 +127,16 @@ static const struct ftr_set_desc isar2 __initconst = { }, }; +static const struct ftr_set_desc smfr0 __initconst = { + .name = "id_aa64smfr0", + .override = &id_aa64smfr0_override, + .fields = { + /* FA64 is a one bit field... :-/ */ + { "fa64", ID_AA64SMFR0_FA64_SHIFT, 1, }, + {} + }, +}; + extern struct arm64_ftr_override kaslr_feature_override; static const struct ftr_set_desc kaslr __initconst = { @@ -146,6 +156,7 @@ static const struct ftr_set_desc * const regs[] __initconst = { &pfr1, &isar1, &isar2, + &smfr0, &kaslr, }; From 1191b6256e50a07e7d8ce36eb970708e42a4be1a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 Jul 2022 15:09:49 +0100 Subject: [PATCH 34/34] arm64: fix KASAN_INLINE Since commit: a004393f45d9a55e ("arm64: idreg-override: use early FDT mapping in ID map") Kernels built with KASAN_INLINE=y die early in boot before producing any console output. This is because the accesses made to the FDT (e.g. in generic string processing functions) are instrumented with KASAN, and with KASAN_INLINE=y any access to an address in TTBR0 results in a bogus shadow VA, resulting in a data abort. This patch fixes this by reverting commits: 7559d9f97581654f ("arm64: setup: drop early FDT pointer helpers") bd0c3fa21878b6d0 ("arm64: idreg-override: use early FDT mapping in ID map") ... and using the TTBR1 fixmap mapping of the FDT. Note that due to a later commit: b65e411d6cc2f12a ("arm64: Save state of HCR_EL2.E2H before switch to EL1") ... which altered the prototype of init_feature_override() (and invocation from head.S), commit bd0c3fa21878b6d0 does not revert cleanly, and I've fixed that up manually. Fixes: a004393f45d9 ("arm64: idreg-override: use early FDT mapping in ID map") Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Marc Zyngier Cc: Will Deacon Acked-by: Catalin Marinas Signed-off-by: Mark Rutland Link: https://lore.kernel.org/r/20220713140949.45440-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/setup.h | 3 +++ arch/arm64/kernel/head.S | 5 +++-- arch/arm64/kernel/idreg-override.c | 17 +++++++++++------ arch/arm64/kernel/setup.c | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index 5f147a418281..6437df661700 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -5,6 +5,9 @@ #include +void *get_early_fdt_ptr(void); +void early_fdt_map(u64 dt_phys); + /* * These two variables are used in the head.S file. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 29d641290293..cefe6a73ee54 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -456,8 +456,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x22 // pass FDT address in x0 - mov x1, x20 // pass the full boot status + mov x0, x21 // pass FDT address in x0 + bl early_fdt_map // Try mapping the FDT early + mov x0, x20 // pass the full boot status bl init_feature_override // Parse cpu feature overrides mov x0, x20 bl finalise_el2 // Prefer VHE if possible diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index aa2a53d0d417..7206fd0ed9eb 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -262,11 +262,16 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(const void *fdt) +static __init const u8 *get_bootargs_cmdline(void) { const u8 *prop; + void *fdt; int node; + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; + node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; @@ -278,9 +283,9 @@ static __init const u8 *get_bootargs_cmdline(const void *fdt) return strlen(prop) ? prop : NULL; } -static __init void parse_cmdline(const void *fdt) +static __init void parse_cmdline(void) { - const u8 *prop = get_bootargs_cmdline(fdt); + const u8 *prop = get_bootargs_cmdline(); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) __parse_cmdline(CONFIG_CMDLINE, true); @@ -290,9 +295,9 @@ static __init void parse_cmdline(const void *fdt) } /* Keep checkers quiet */ -void init_feature_override(const void *fdt, u64 boot_status); +void init_feature_override(u64 boot_status); -asmlinkage void __init init_feature_override(const void *fdt, u64 boot_status) +asmlinkage void __init init_feature_override(u64 boot_status) { int i; @@ -305,7 +310,7 @@ asmlinkage void __init init_feature_override(const void *fdt, u64 boot_status) __boot_status = boot_status; - parse_cmdline(fdt); + parse_cmdline(); for (i = 0; i < ARRAY_SIZE(regs); i++) { if (regs[i]->override) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index d0e6c7a291da..fea3223704b6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -163,6 +163,21 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } +static void *early_fdt_ptr __initdata; + +void __init *get_early_fdt_ptr(void) +{ + return early_fdt_ptr; +} + +asmlinkage void __init early_fdt_map(u64 dt_phys) +{ + int fdt_size; + + early_fixmap_init(); + early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); +} + static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size;