本文基于Linux-4.19.125, ARM V7,dual core, MMU采用2级页表(未开启LPAE)。
1 为什么需要fixmap
Linux内核启动过程中,经过汇编阶段后,mmu功能已经开启,后续只能通过虚拟地址来访问DDR,但是此时能够访问的地址范围有限,只有idmap和swapper部分可以找到物理地址,其他没有通过MMU映射的虚拟地址是无法访问的。
fixmap就是为了解决在正式建立所有物理内存映射之前,实现early console、FDT映射、early ioremap、paging init等过程中使用虚拟地址问题的。
fixmap是在一段固定的虚拟地址上建立到物理地址的映射,将一块固定地址的virtual address,映射到任意物理地址上(kernel在编译时即固定了一段虚拟地址,这段地址被用于早期内存管理体系还未完成之前各个模块对于内存的使用),以实现内核启动早期的log输出(early console)、读取fdt、early ioremap、建立paging init等功能。
2 fixmap功能初始化
start_kernel() -> setup_arch() -> early_fixmap_init()
@arch/arm/mm/mmu.c
/*
* Wire up the fixmap page table early in boot: install the statically
* allocated bm_pte table into the pmd entry that covers FIXADDR_TOP and
* select pte_offset_early_fixmap as the fixmap pte lookup helper.
*/
void __init early_fixmap_init(void)
{
pmd_t *pmd;
/*
* The early fixmap range spans multiple pmds, for which
* we are not prepared:
*/
BUILD_BUG_ON((__fix_to_virt(__end_of_early_ioremap_region) >> PMD_SHIFT)
!= FIXADDR_TOP >> PMD_SHIFT);
/* Find the pmd entry for the top of the fixmap range. */
pmd = fixmap_pmd(FIXADDR_TOP);
/* Point that pmd entry at bm_pte. */
pmd_populate_kernel(&init_mm, pmd, bm_pte);
/* Fixmap pte lookups now go through pte_offset_early_fixmap. */
pte_offset_fixmap = pte_offset_early_fixmap;
}
/*
* Walk the kernel page tables (init_mm) from pgd through pud to pmd and
* return the pmd entry that covers the virtual address addr.
*/
static inline pmd_t * __init fixmap_pmd(unsigned long addr)
{
/* pgd_offset_k() looks the address up in the kernel's page directory. */
pgd_t *pgd = pgd_offset_k(addr);
pud_t *pud = pud_offset(pgd, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
}
从fixmap_pmd的实现来看,其功能就是根据addr来计算addr对应的pmd页表的地址。
2.1 pgd计算
我们来看看pgd_offset_k的实现:
@arch/arm/include/asm/pgtable.h
/* to find an entry in a page-table-directory */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
其中PGDIR_SHIFT的定义位于arch/arm/include/asm/pgtable-2level.h
/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
*/
#define PMD_SHIFT 21
#define PGDIR_SHIFT 21
pgd_index宏的作用,就是计算addr对应的一级页表的index
而pgd_offset_k宏的作用,就是计算addr对应的一级页表的地址。
在early_fixmap_init函数中,调用了fixmap_pmd(FIXADDR_TOP),FIXADDR_TOP的定义位于
arch/arm/include/asm/fixmap.h文件
#define FIXADDR_START 0xffc00000UL
#define FIXADDR_END 0xfff00000UL
#define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE)
经过计算可知FIXADDR_TOP的值为0xffeff000。
那fixmap_pmd(FIXADDR_TOP)返回的值应该是多少呢?
根据pgd_offset_k的实现,我们可以计算出0xffeff000对应的一级页表地址:
init_mm.pgd + (0xffeff000>>21)
= init_mm.pgd + 2047
我们查看init_mm.pgd的数据结构类型为 pgd_t*,即pgd_t指针,而init_mm.pgd的值在内核启动的初期已经计算好了,是0xC0004000。pgd_t结构体定义如下:
//@arch/arm/include/asm/pgtable-2level.h
typedef u32 pmdval_t;
typedef struct { pmdval_t pgd[2]; } pgd_t;
那么init_mm.pgd + 2047应该等于:
0xC0004000 + sizeof(pgd_t)*2047
= 0xC0004000 + 8*2047
= 0xC0007FF8
由此,我们计算得到了pgd的值为0xC0007FF8。
接下来我们计算pud的值。
2.2 pud计算
pud_offset的实现位于文件 include/asm-generic/pgtable-nopud.h:
/*
* The pud level is not a separate table here: the incoming entry pointer
* is returned unchanged, merely cast to pud_t *. address is unused.
*/
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
return (pud_t *)p4d;
}
pud即为pgd。
2.3 pmd计算
最后计算pmd的值,pmd_offset的实现位于文件arch/arm/include/asm/pgtable-2level.h
/*
* The pmd level is not a separate table here: the pud pointer is returned
* unchanged, merely cast to pmd_t *. addr is unused.
*/
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
return (pmd_t *)pud;
}
pmd即为pud。
通过以上分析,我们就清楚了,pmd=pud=pgd=0xC0007FF8,这也验证了我们只有两级页表pgd和pte,pmd和pud实际就是pgd。
2.4 更新硬件页表
要使页表映射真正起作用,必须更新页表。early_fixmap_init中调用
pmd_populate_kernel(&init_mm, pmd, bm_pte);
的作用就是更新页表。
pmd_populate_kernel函数的3个参数:
init_mm:全局的mm,其定义位于mm/init-mm.c文件
pmd:一级页表地址
bm_pte:这是针对fixmap设置的一个全局数组,其定义位于arch/arm/mm/mmu.c文件:
static pte_t bm_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
__aligned(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE) __initdata;
bm_pte数组共有PTRS_PER_PTE + PTE_HWTABLE_PTRS个二级页表项(pte_t),这里包含两个部分:
PTRS_PER_PTE:512个给OS(Linux)用的二级页表项;
PTE_HWTABLE_PTRS:512个给ARM硬件MMU使用的二级页表项。
大致可以用下图来表示:
接下来我们分析一下pmd_populate_kernel函数是如何实现更新硬件页表的,pmd_populate_kernel是一个内联函数,它的实现位于arch/arm/include/asm/pgalloc.h文件:
/*
* Populate the pmdp entry with a pointer to the pte. This pmd is part
* of the mm address space.
*
* Ensure that we always set both PMD entries.
*
* mm:   address space the pmd belongs to (not used in this body)
* pmdp: pmd entry to fill in
* ptep: kernel virtual address of the pte table; converted with __pa()
*       before being written into the entry
*/
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
{
/*
* The pmd must be loaded with the physical address of the PTE table
*/
__pmd_populate(pmdp, __pa(ptep), _PAGE_KERNEL_TABLE);
}
pmd_populate_kernel调用了__pmd_populate,并传递了3个参数:
pmdp: 一级页表地址
__pa(ptep): 将bm_pte转换成了物理地址
_PAGE_KERNEL_TABLE:
#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_KERNEL))
细心的话,我们会发现,__pmd_populate丢弃了pmd_populate_kernel的mm参数。
__pmd_populate的实现同样位于arch/arm/include/asm/pgalloc.h文件:
/*
* Write the pmd entry (or entry pair) at pmdp so that it refers to the
* pte table at physical address pte, with attribute bits prot.
*
* pmdp: first pmd entry to write
* pte:  physical address of the pte table
* prot: bits OR-ed into the entry value
*/
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
/* The entry refers to the table PTE_HWTABLE_OFF bytes past pte. */
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
/* Without LPAE a second entry is written, 256 pte_t slots further on. */
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
#endif
/* Flush the updated entry; see flush_pmd_entry(). */
flush_pmd_entry(pmdp);
}
__pmd_populate函数第5行:计算pmdval
其中PTE_HWTABLE_OFF定义于arch/arm/include/asm/pgtable-2level.h文件
#define PTRS_PER_PTE 512
#define PTE_HWTABLE_PTRS (PTRS_PER_PTE)
#define PTE_HWTABLE_OFF (PTE_HWTABLE_PTRS * sizeof(pte_t))
__pmd_populate函数第6、8行: 填充一级页表pgd_t结构体。
__pmd定义于arch/arm/include/asm/pgtable-2level.h文件
#define __pmd(x) ((pmd_t) { (x) } )
__pmd_populate函数第10行: 调用flush_pmd_entry函数更新页表
flush_pmd_entry函数的实现位于arch/arm/include/asm/tlbflush.h文件:
/*
* flush_pmd_entry
*
* Flush a PMD entry (word aligned, or double-word aligned) to
* RAM if the TLB for the CPU we are running on requires this.
* This is typically used when we are creating PMD entries.
*
* clean_pmd_entry
*
* Clean (but don't drain the write buffer) if the CPU requires
* these operations. This is typically used when we are removing
* PMD entries.
*/
static inline void flush_pmd_entry(void *pmd)
{
/*
* NOTE(review): __tlb_flag is not referenced by name below; presumably
* the tlb_op()/tlb_l2_op()/tlb_flag() macros read it - confirm in
* tlbflush.h.
*/
const unsigned int __tlb_flag = __cpu_tlb_flags;
/* Data-cache clean of the entry, then the L2 variant. */
tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
/* Issue the dsb(ishst) barrier only when the TLB_WB flag is set. */
if (tlb_flag(TLB_WB))
dsb(ishst);
}
3 fixmap的使用
3.1 fixed_addresses
在arch/arm/include/asm/fixmap.h文件中定义了一个枚举类型fixed_addresses:
/*
* Index space of the fixmap area. __fix_to_virt(idx) turns an index into
* a virtual address by subtracting idx pages from FIXADDR_TOP, so a larger
* index means a lower address.
*
* The trailing numeric comments give each enumerator's value for
* KM_TYPE_NR == 16 and NR_CPUS == 2.
*/
enum fixed_addresses {
FIX_EARLYCON_MEM_BASE, // 0
__end_of_permanent_fixed_addresses, // 1
FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses, // 1
/* #define KM_TYPE_NR 16 @arch/arm/include/asm/kmap_types.h
* NR_CPUS 2
*/
FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, // 32 (1+16*2-1)
/* Support writing RO kernel text via kprobes, jump labels, etc. */
FIX_TEXT_POKE0, // 33
FIX_TEXT_POKE1, // 34
__end_of_fixmap_region, // 35
/*
* Share the kmap() region with early_ioremap(): this is guaranteed
* not to clash since early_ioremap() is only available before
* paging_init(), and kmap() only after.
*/
/* 7 slots of 32 pages each make up the early_ioremap index range. */
#define NR_FIX_BTMAPS 32
#define FIX_BTMAPS_SLOTS 7
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) // 224
/* The early_ioremap range starts at the same index as the kmap range. */
FIX_BTMAP_END = __end_of_permanent_fixed_addresses, // 1
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, // 224
__end_of_early_ioremap_region // 225
};
fixed_addresses实际就是fix map中的内存划分,根据fixed_addresses中的注释我们发现一个细节:kmap和early_ioremap区域是重叠的,这是因为 early_ioremap只在paging_init前有效,而kmap是在paging_init之后有效,这实际也是一种节省内存的有效手段。
接下来,我们通过early ioremap来探索fixmap是如何工作的。
3.2 early ioremap
3.2.1 early_ioremap_init
/*
* @arch/arm/mm/ioremap.c
*/
/*
* Architecture entry point for early ioremap setup; it only forwards to
* early_ioremap_setup().
*
* Must be called after early_fixmap_init
*/
void __init early_ioremap_init(void)
{
early_ioremap_setup();
}
early_ioremap_init直接调用了early_ioremap_setup,我们看看early_ioremap_setup的代码实现:
/*
* @arch/arm/include/asm/fixmap.h
*/
#define NR_FIX_BTMAPS 32
#define FIX_BTMAPS_SLOTS 7
/*
* @mm/early_ioremap.c
*/
/* Per-slot bookkeeping for early_ioremap(), one entry per slot. */
/* Address handed out for the mapping held in each slot; NULL when free. */
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
/* Size the caller requested for the mapping held in each slot. */
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
/* Base virtual address of each slot, filled in by early_ioremap_setup(). */
static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
/*
* Check that no slot is in use yet, then compute the base virtual address
* of every early-ioremap slot.
*/
void __init early_ioremap_setup(void)
{
int i;
/* Warn on the first slot that already holds a mapping and stop checking. */
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (WARN_ON(prev_map[i]))
break;
/* Slot i starts NR_FIX_BTMAPS fixmap indices below slot i - 1. */
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
}
early_ioremap_setup函数的核心在最后一行代码:
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
__fix_to_virt是一个宏定义:
@include/asm-generic/fixmap.h
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
根据
@arch/arm/include/asm/page.h
#define PAGE_SHIFT 12
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) //0x1000
@arch/arm/include/asm/fixmap.h
#define FIXADDR_START 0xffc00000UL
#define FIXADDR_END 0xfff00000UL
#define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) //0xffeff000
我们计算出FIXADDR_TOP的地址为0xffeff000,那么__fix_to_virt宏展开后就是:
(0xffeff000- ((x) << 12))
x应该是作为一个index,左移12位是一个page(4KB)的大小,那么__fix_to_virt宏的作用,就是从0xffeff000地址开始,从高地址向低地址按page进行偏移。
FIX_BTMAPS_SLOTS为7,所以early ioremap共占用7个slot,每个slot大小为NR_FIX_BTMAPS*PAGE_SIZE,从index为FIX_BTMAP_BEGIN开始,他们的虚拟地址依次是:
(0xffeff000- ((224-32*0) << 12)) = 0xffe1f000
(0xffeff000- ((224-32*1) << 12)) = 0xffe3f000
(0xffeff000- ((224-32*2) << 12)) = 0xffe5f000
(0xffeff000- ((224-32*3) << 12)) = 0xffe7f000
(0xffeff000- ((224-32*4) << 12)) = 0xffe9f000
(0xffeff000- ((224-32*5) << 12)) = 0xffebf000
(0xffeff000- ((224-32*6) << 12)) = 0xffedf000
这些虚拟地址被保存在slot_virt全局静态数组中。注意到slot_virt定义位置的上面两行还有prev_map和prev_size两个变量:
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
这两个变量在后面的early_ioremap函数和early_iounmap函数中都会使用到, prev_map数组记录的是7个slot中已经被ioremap过的地址信息,而prev_size数组记录了ioremap地址对应的size信息。
3.2.2 early_ioremap
early_ioremap的实现位于mm/early_ioremap.c
/*
* Remap an IO device
*
* Maps size bytes starting at phys_addr with FIXMAP_PAGE_IO protection by
* forwarding to __early_ioremap(); returns whatever that returns.
*/
void __init __iomem *
early_ioremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
}
early_ioremap直接调用了__early_ioremap, __early_ioremap的实现如下:
/*
* Map the physical range [phys_addr, phys_addr + size) into the first free
* early-ioremap slot with protection prot.
*
* Returns the slot's base virtual address plus the in-page offset of
* phys_addr. Returns NULL when no slot is free, when size is zero or the
* range wraps around, or when the page-aligned range needs more than
* NR_FIX_BTMAPS pages.
*/
static void __init __iomem *
__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset;
resource_size_t last_addr;
unsigned int nrpages;
enum fixed_addresses idx;
int i, slot;
/* Warn if called once system_state has reached SYSTEM_RUNNING. */
WARN_ON(system_state >= SYSTEM_RUNNING);
/* Take the first slot that has no mapping recorded. */
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (!prev_map[i]) {
slot = i;
break;
}
}
if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
__func__, (u64)phys_addr, size))
return NULL;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (WARN_ON(!size || last_addr < phys_addr))
return NULL;
/* Record the caller's unaligned size; early_iounmap() compares against it. */
prev_size[slot] = size;
/*
* Mappings have to be page-aligned
*/
offset = offset_in_page(phys_addr);
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
*/
nrpages = size >> PAGE_SHIFT;
if (WARN_ON(nrpages > NR_FIX_BTMAPS))
return NULL;
/*
* Ok, go for it..
*/
/* Start at the slot's first index; each further page uses the next lower index. */
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
if (after_paging_init)
__late_set_fixmap(idx, phys_addr, prot);
else
__early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
}
WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
__func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
/* Mark the slot as used and hand back the mapped address. */
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
函数13-18行:寻找未做map的slot
函数20-27行:参数合法性检查
函数29行:记录map的size信息到prev_size数组
函数33行:通过offset_in_page宏计算phys_addr在page中的offset,我们看一下offset_in_page宏的定义:
@include/linux/mm.h
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
@arch/arm/include/asm/page.h
/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT 12
#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
PAGE_MASK: (~((1 << 12) - 1)) = 0xfffff000
~PAGE_MASK: 0x00000fff
所以offset_in_page宏的实际作用就是获取phys_addr的低[0-11] bit,也就是在一个page中的offset值。
函数34-35行:清除phys_addr低[0-11] bit并按照页对齐重新计算size值
函数40行:计算map的size横跨的page个数
函数47-56行:以page为单位循环执行ioremap操作,这里的核心函数是__early_set_fixmap和__late_set_fixmap。
__early_set_fixmap
@arch/arm/include/asm/fixmap.h
#define __early_set_fixmap __set_fixmap
@arch/arm/mm/mmu.c
/*
* Install or clear the pte for fixmap index idx.
*
* idx:  fixmap index; must be below __end_of_fixed_addresses
* phys: physical address to map (only its page frame number is used)
* prot: protection for the mapping; a zero value clears the pte instead
*
* To avoid TLB flush broadcasts, this uses local_flush_tlb_kernel_range().
* As a result, this can only be called with preemption disabled, as under
* stop_machine().
*/
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
{
unsigned long vaddr = __fix_to_virt(idx);
/* Look up the pte for vaddr through the currently selected fixmap helper. */
pte_t *pte = pte_offset_fixmap(pmd_off_k(vaddr), vaddr);
/* Make sure fixmap region does not exceed available allocation. */
BUILD_BUG_ON(FIXADDR_START + (__end_of_fixed_addresses * PAGE_SIZE) >
FIXADDR_END);
BUG_ON(idx >= __end_of_fixed_addresses);
/* we only support device mappings until pgprot_kernel has been set */
if (WARN_ON(pgprot_val(prot) != pgprot_val(FIXMAP_PAGE_IO) &&
pgprot_val(pgprot_kernel) == 0))
return;
/* Non-zero prot: write the mapping. Zero prot: remove it. */
if (pgprot_val(prot))
set_pte_at(NULL, vaddr, pte,
pfn_pte(phys >> PAGE_SHIFT, prot));
else
pte_clear(NULL, vaddr, pte);
/* Flush the local TLB for the one page that changed. */
local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE);
}
__late_set_fixmap
在mm/early_ioremap.c 文件中__late_set_fixmap的实现如下,如果没有定义__late_set_fixmap,那么将直接触发BUG(),搜了一圈代码ARM arch下的确没有定义__late_set_fixmap宏,所以ARM arch下应该是不支持__late_set_fixmap的,也就是函数49行 if (after_paging_init)不应该成立,否则系统将崩溃,这说明不能在paging_init后再调用early_ioremap进行IO映射。
/*
* Generally, ioremap() is available after paging_init() has been called.
* Architectures wanting to allow early_ioremap after paging_init() can
* define __late_set_fixmap and __late_clear_fixmap to do the right thing.
*
* This fallback is compiled only when no __late_set_fixmap macro exists;
* calling it hits BUG(). All three parameters are ignored.
*/
#ifndef __late_set_fixmap
static inline void __init __late_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t prot)
{
BUG();
}
#endif
after_paging_init的赋值是在early_ioremap_reset完成的:
/*
* Call early_ioremap_shutdown() and then set after_paging_init, which
* switches __early_ioremap() and early_iounmap() to the __late_* helpers.
*/
void __init early_ioremap_reset(void)
{
early_ioremap_shutdown();
after_paging_init = 1;
}
而early_ioremap_reset的调用是在paging_init之后:
void __init setup_arch(char **cmdline_p)
{
...
early_ioremap_init();
...
paging_init();
...
early_ioremap_reset();
}
3.2.3 early_iounmap
early_iounmap是early_ioremap的逆向操作,完成IO映射的解除,不做详细分析。
/*
* Undo a mapping created by early_ioremap().
*
* addr: address previously returned for the mapping
* size: size passed when the mapping was created
*
* Returns without unmapping (after a warning) when addr matches no slot,
* when size differs from the recorded size, or when addr lies below
* fix_to_virt(FIX_BTMAP_BEGIN).
*/
void __init early_iounmap(void __iomem *addr, unsigned long size)
{
unsigned long virt_addr;
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
int i, slot;
/* Find the slot whose recorded mapping address equals addr. */
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i] == addr) {
slot = i;
break;
}
}
if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
addr, size))
return;
/* The size must match the one recorded when the slot was mapped. */
if (WARN(prev_size[slot] != size,
"early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
addr, size, slot, prev_size[slot]))
return;
WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
addr, size, slot);
virt_addr = (unsigned long)addr;
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
return;
/* Page count covers the in-page offset plus the requested size. */
offset = offset_in_page(virt_addr);
nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
/* Walk the slot's indices downwards, clearing one page per step. */
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
if (after_paging_init)
__late_clear_fixmap(idx);
else
__early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
--idx;
--nrpages;
}
/* Mark the slot free again. */
prev_map[slot] = NULL;
}