这个part是想详细走读一下用qemu运行kernel的最初始代码,也就是使用qemu运行kernel代码的详细逻辑,从qemu加载根目录下vmlinux镜像的逻辑,也就是运行arch/arm/kernel/head.S的整个过程,直到跳转到start_kernel,使用的kernel版本还是3.18。
初始运行状态
沿用上回《基于QEMU的vexpress-a9开发调试环境搭建》的最后一部分调试kernel汇编这一个part,我们需要将vmlinux的链接地址设置到0x60008000,设置方法是修改arch/arm/Kconfig中VMSPLIT_2G对应的值到0x60000000,并重新编译kernel,接着就可以直接b *0x60008000,并且执行到arch/arm/kernel/head.S,这样,我们就可以直接通过命令c运行到arch/arm/kernel/head.S的第一句代码(bl __hyp_stub_install),这其中跳过了compressed/vmlinux的解压内核等逻辑,后续章节再补充。
在此,我们打印一下运行到此处时的寄存器
// arch/arm/kernel/head.S main boot path (excerpt)
#ifdef CONFIG_ARM_VIRT_EXT
bl __hyp_stub_install // step 1: run the HYP-mode stub setup (only built when CONFIG_ARM_VIRT_EXT is set)
#endif
@ ensure svc mode and all interrupts masked
safe_svcmode_maskall r9 // step 2: force SVC mode and mask interrupts; r9 is the macro's scratch register
mrc p15, 0, r9, c0, c0 @ get processor id
bl __lookup_processor_type @ r5=procinfo r9=cpuid // step 3: find the proc_info entry matching the CPU id in r9
movs r10, r5 @ invalid processor (r5=0)?
1. __hyp_stub_install的执行
如果定义了CONFIG_ARM_VIRT_EXT宏,则进入arch/arm/kernel/head.S的第一行代码就是bl __hyp_stub_install。__hyp_stub_install首先执行store_primary_cpu_mode,这个方法是用来记录主cpu运行模式的,只有主CPU在初始化时才会调用这个方法,它其实是将主cpu当前cpsr寄存器中存的cpu模式(bit0-bit4,当前值为0x13,代表SVC_MODE)存入到__boot_cpu_mode的位置。后面的__hyp_stub_install_secondary其实是给其他CPU用的:检查当前cpu模式,如果和主CPU模式一致,并且也是HYP_MODE,则初始化hyp-stub(这个部分我们不展开讨论了,因为不会走到)。由于我们主CPU是SVC_MODE,不满足第二个条件,所以在执行到114行代码时,就会retne lr,返回head.S的主代码逻辑。
2. safe_svcmode_maskall的执行
这部分做的是强制让cpu进入SVC模式,并屏蔽中断。
/*
* Helper macro to enter SVC mode cleanly and mask interrupts. reg is
* a scratch register for the macro to overwrite.
*
* This macro is intended for forcing the CPU into SVC mode at boot time.
* you cannot return to the original mode.
*/
.macro safe_svcmode_maskall reg:req
#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_V7M)
mrs \reg , cpsr // read the current CPSR into the scratch register
eor \reg, \reg, #HYP_MODE // XOR with HYP_MODE (0x1a): the mode field becomes zero only if the CPU is currently in HYP mode
tst \reg, #MODE_MASK // AND with MODE_MASK (0x1f) and set flags: Z=1 if in HYP mode, Z=0 otherwise (Z=0 in this walkthrough, since the CPU is in SVC mode)
bic \reg , \reg , #MODE_MASK // clear the mode field (flags from tst are not touched)
orr \reg , \reg , #PSR_I_BIT | PSR_F_BIT | SVC_MODE // mask IRQ and FIQ and select SVC mode
THUMB( orr \reg , \reg , #PSR_T_BIT )
bne 1f // not in HYP mode (Z=0): a plain CPSR write at 1: is enough
orr \reg, \reg, #PSR_A_BIT // HYP path: additionally SET the A bit (bit 8) so asynchronous aborts are masked too
adr lr, BSYM(2f) // lr = address of label 2:, where execution resumes after the exception return below
msr spsr_cxsf, \reg // put the new SVC-mode PSR value into SPSR; the exception return copies it into CPSR
__MSR_ELR_HYP(14) // write r14 (lr) to ELR_hyp, the HYP-mode exception return address
__ERET
1: msr cpsr_c, \reg // non-HYP path: write the control field (low 8 bits) of CPSR directly
2:
......
通过上面代码分析可知,如果当前cpu模式为HYP模式,需要额外将新的cpsr设置到HYP模式的spsr中,否则,就只是屏蔽中断/快中断,并让cpu进入svc模式。下面图可以看出HYP模式有自己的spsr寄存器。
3 __lookup_processor_type的执行
这部分是在.proc.info.init数据中,查找对应cpu的私有执行方法。具体步骤为:
- 调用CP15寄存器方法(mrc p15, 0, r9, c0, c0),获得cpu id。
- 通过cpu id,到.proc.info.init段中查找对应的cpu的私有执行方法。
- 找到后存入r5寄存器返回
/*
* Read processor ID register (CP#15, CR0), and look up in the linker-built
* supported processor list. Note that we can't use the absolute addresses
* for the __proc_info lists since we aren't running with the MMU on
* (and therefore, we are not in the correct address space). We have to
* calculate the offset.
*
* r9 = cpuid
* Returns:
* r3, r4, r6 corrupted
* r5 = proc_info pointer in physical address space
* r9 = cpuid (preserved)
*/
__lookup_processor_type:
adr r3, __lookup_processor_type_data // r3 = run-time (physical) address of __lookup_processor_type_data
ldmia r3, {r4 - r6} // r4 = link address of __lookup_processor_type_data, r5 = link address of __proc_info_begin, r6 = link address of __proc_info_end
sub r3, r3, r4 @ get offset between virt&phys // r3 = physical address minus link address
add r5, r5, r3 @ convert virt addresses to // r5 = physical address of __proc_info_begin
add r6, r6, r3 @ physical address space // r6 = physical address of __proc_info_end
1: ldmia r5, {r3, r4} @ value, mask // r3 = CPU id value of this proc_info entry, r4 = its mask
and r4, r4, r9 @ mask wanted bits // r4 = cpuid (r9) with the entry's mask applied
teq r3, r4 // compare the entry's value with the masked cpuid
beq 2f // match: r5 already points at the matching proc_info entry, return it
add r5, r5, #PROC_INFO_SZ @ sizeof(proc_info_list) // no match: advance r5 to the next proc_info_list entry
cmp r5, r6 // reached the end of the table?
blo 1b // r5 still below r6 (unsigned): try the next entry
mov r5, #0 @ unknown processor // table exhausted without a match: r5 = 0 signals "not found"
2: ret lr // return to the caller
ENDPROC(__lookup_processor_type)
/*
* Look in <asm/procinfo.h> for information about the __proc_info structure.
*/
.align 2
.type __lookup_processor_type_data, %object
// Three words loaded by __lookup_processor_type with a single ldmia:
// own link address, then the link-time bounds of the proc_info table.
__lookup_processor_type_data:
.long .
.long __proc_info_begin // start of the .proc.info.init data, as laid out in arch/arm/kernel/vmlinux.lds.S
.long __proc_info_end
.size __lookup_processor_type_data, . - __lookup_processor_type_data
在arch/arm/kernel/vmlinux.lds.S可知,__proc_info_begin~__proc_info_end存放着.proc.info.init内容。
而实际的CPU执行方法配置项放在arch/arm/mm/proc-v7.S中:
.section ".proc.info.init", #alloc, #execinstr
// __v7_proc macro: emits the body of one proc_info entry that follows the
// CPU id value/mask words: MMU flag words, a branch to the CPU init
// function, and pointers to names and per-CPU function tables.
.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0, proc_fns = v7_processor_functions
// memory-mapping MMU flags word, in an SMP and a UP variant
ALT_SMP(.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
PMD_SECT_AF | PMD_FLAGS_SMP | \mm_mmuflags)
ALT_UP(.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
PMD_SECT_AF | PMD_FLAGS_UP | \mm_mmuflags)
// I/O-mapping MMU flags word
.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ | PMD_SECT_AF | \io_mmuflags
// branch to the CPU-specific init function passed as \initfunc
W(b) \initfunc
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \
HWCAP_EDSP | HWCAP_TLS | \hwcaps
.long cpu_v7_name
.long \proc_fns
.long v7wbi_tlb_fns
.long v6_user_fns
.long v7_cache_fns
.endm
#ifndef CONFIG_ARM_LPAE
/*
* ARM Ltd. Cortex A5 processor.
*/
.type __v7_ca5mp_proc_info, #object
__v7_ca5mp_proc_info:
.long 0x410fc050 // CPU id value expected for Cortex-A5 (compared after masking)
.long 0xff0ffff0 // CPU id mask for Cortex-A5: bits [23:20] and [3:0] are ignored
__v7_proc __v7_ca5mp_setup
.size __v7_ca5mp_proc_info, . - __v7_ca5mp_proc_info
/*
* ARM Ltd. Cortex A9 processor.
*/
.type __v7_ca9mp_proc_info, #object
__v7_ca9mp_proc_info:
.long 0x410fc090 // CPU id value expected for Cortex-A9; this is the entry matched on vexpress-a9 (r9 = 0x410fc090)
.long 0xff0ffff0 // CPU id mask for Cortex-A9: bits [23:20] and [3:0] are ignored
__v7_proc __v7_ca9mp_setup, proc_fns = ca9mp_processor_functions
.size __v7_ca9mp_proc_info, . - __v7_ca9mp_proc_info
#endif /* CONFIG_ARM_LPAE */
...
接下来我们继续看arch/arm/kernel/head.S的主线代码:
......
#ifndef CONFIG_XIP_KERNEL
adr r3, 2f // r3 = run-time address of local label 2: (its data words are not shown in this excerpt)
ldmia r3, {r4, r8}
sub r4, r3, r4 @ (PHYS_OFFSET - PAGE_OFFSET)
add r8, r8, r4 @ PHYS_OFFSET
#else
ldr r8, =PLAT_PHYS_OFFSET @ always constant in this case
#endif
/*
* r1 = machine no, r2 = atags or dtb,
* r8 = phys_offset, r9 = cpuid, r10 = procinfo
* Register values observed at this point in the debug session:
* (gdb) i r r1 r2 r8 r9 r10
r1 0x8e0 2272
r2 0x68000000 1744830464
r8 0x68000000 1744830464
r9 0x410fc090 1091551376
r10 0x6047d478 1615320184
* NOTE(review): r8 is shown as 0x68000000 here, but the __create_page_tables
* walkthrough later uses r8 = 0x60000000 -- verify this dump.
*/
bl __vet_atags // step 1: validate the atags/dtb pointer in r2
#ifdef CONFIG_SMP_ON_UP
bl __fixup_smp // step 2: patch SMP alternatives if running on a uniprocessor
#endif
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
bl __fixup_pv_table // step 3
#endif
bl __create_page_tables // step 4: build the initial page tables
/*
* The following calls CPU specific code in a position independent
* manner. See arch/arm/mm/proc-*.S for details. r10 = base of
* xxx_proc_info structure selected by __lookup_processor_type
* above. On return, the CPU will be ready for the MMU to be
* turned on, and r0 will hold the CPU control register value.
*/
ldr r13, =__mmap_switched @ address to jump to after
@ mmu has been enabled
adr lr, BSYM(1f) @ return (PIC) address
mov r8, r4 @ set TTBR1 to swapper_pg_dir
ARM( add pc, r10, #PROCINFO_INITFUNC ) // jump to r10 (0x6047d478) + PROCINFO_INITFUNC, i.e. the init-function branch inside the proc_info entry; r10 itself is not modified
THUMB( add r12, r10, #PROCINFO_INITFUNC )
THUMB( ret r12 )
1: b __enable_mmu // the init function returns here via lr; branch (without link) to __enable_mmu
4.__vet_atags
这部分直接看注释就能明白,这里是判断r2有效性的:r2存放的要么是DTB数据的地址,要么是以ATAG_CORE开头的ATAGS数据的地址,否则就是非法数据,r2赋值为0后返回。
/* Determine validity of the r2 atags pointer. The heuristic requires
* that the pointer be aligned, in the first 16k of physical RAM and
* that the ATAG_CORE marker is first and present. If CONFIG_OF_FLATTREE
* is selected, then it will also accept a dtb pointer. Future revisions
* of this function may be more lenient with the physical address and
* may also be able to move the ATAGS block if necessary.
*
* Returns:
* r2 either valid atags pointer, valid dtb pointer, or zero
* r5, r6 corrupted
*/
__vet_atags:
tst r2, #0x3 @ aligned? // are the low two bits zero, i.e. is r2 4-byte aligned?
bne 1f // Z=0 means not aligned: go to the error path at 1:
ldr r5, [r2, #0] // r5 = first word at the address held in r2
#ifdef CONFIG_OF_FLATTREE
ldr r6, =OF_DT_MAGIC @ is it a DTB? // r6 = OF_DT_MAGIC
cmp r5, r6 // does the first word equal OF_DT_MAGIC?
beq 2f // yes: r2 points at a DTB, accept it
#endif
cmp r5, #ATAG_CORE_SIZE @ is first tag ATAG_CORE? // otherwise check for a valid ATAG_CORE header: first word must be one of the two allowed sizes
cmpne r5, #ATAG_CORE_SIZE_EMPTY
bne 1f
ldr r5, [r2, #4] // r5 = second word at r2
ldr r6, =ATAG_CORE
cmp r5, r6 // second word must equal ATAG_CORE
bne 1f
2: ret lr @ atag/dtb pointer is ok
1: mov r2, #0 // error path: r2 is misaligned or points at neither a DTB nor ATAG_CORE data, so return r2 = 0
ret lr
ENDPROC(__vet_atags)
5. __fixup_smp
这部分是判断CPU架构是否支持smp(Symmetric Multiprocessing,即对称多处理),如果支持,但硬件为单核,则做指令替换,将所有ALT_SMP(xxx)指令替换为ALT_UP指令。否则不做任何替换处理。具体可以参考
- https://stackoverflow.com/questions/17083941/what-does-alt-smp-and-alt-up-does
- https://blog.csdn.net/fh400/article/details/8282841
#ifdef CONFIG_SMP_ON_UP
__HEAD
__fixup_smp:
and r3, r9, #0x000f0000 @ architecture version
teq r3, #0x000f0000 @ CPU ID supported? // check the architecture field of the CPU id in r9
bne __fixup_smp_on_up @ no, assume UP // not supported: take the uniprocessor (UP) patching path
bic r3, r9, #0x00ff0000
bic r3, r3, #0x0000000f @ mask 0xff00fff0
mov r4, #0x41000000
orr r4, r4, #0x0000b000
orr r4, r4, #0x00000020 @ val 0x4100b020
teq r3, r4 @ ARM 11MPCore?
reteq lr @ yes, assume SMP // ARM 11MPCore: return without patching
mrc p15, 0, r0, c0, c0, 5 @ read MPIDR // r0 = MPIDR (Multiprocessor Affinity Register), see https://developer.arm.com/documentation/ddi0406/b/System-Level-Architecture/Protected-Memory-System-Architecture--PMSA-/CP15-registers-for-a-PMSA-implementation/c0--Multiprocessor-Affinity-Register--MPIDR-
and r0, r0, #0xc0000000 @ multiprocessing extensions and
teq r0, #0x80000000 @ not part of a uniprocessor system?
bne __fixup_smp_on_up @ no, assume UP // MPIDR does not indicate a multiprocessor system
@ Core indicates it is SMP. Check for Aegis SOC where a single
@ Cortex-A9 CPU is present but SMP operations fault.
mov r4, #0x41000000
orr r4, r4, #0x0000c000
orr r4, r4, #0x00000090 // r4 = 0x4100c090
teq r3, r4 @ Check for ARM Cortex-A9
retne lr @ Not ARM Cortex-A9, // masked CPU id is not Cortex-A9: return without patching
@ If a future SoC *does* use 0x0 as the PERIPH_BASE, then the
@ below address check will need to be #ifdef'd or equivalent
@ for the Aegis platform.
mrc p15, 4, r0, c15, c0 @ get SCU base address
teq r0, #0x0 @ '0' on actual UP A9 hardware // observed in this run: r0 = 0x1e000000
beq __fixup_smp_on_up @ So its an A9 UP
ldr r0, [r0, #4] @ read SCU Config // observed in this run: r0 = 0x10
ARM_BE8(rev r0, r0) @ byteswap if big endian
and r0, r0, #0x3 @ number of CPUs // low two bits are 0 in this run, i.e. a single core
teq r0, #0x0 @ is 1?
retne lr
__fixup_smp_on_up: // uniprocessor path: replace the ALT_SMP(xx) instructions with their ALT_UP(xx) counterparts
adr r0, 1f // r0 = run-time address of the table at 1:
ldmia r0, {r3 - r5} // r3 = link address of 1:, r4 = __smpalt_begin, r5 = __smpalt_end
sub r3, r0, r3 // r3 = run-time minus link-time offset
add r4, r4, r3 // r4 = run-time address of __smpalt_begin
add r5, r5, r3 // r5 = run-time address of __smpalt_end
b __do_fixup_smp_on_up // perform the replacement
ENDPROC(__fixup_smp)
.align
// Table read by __fixup_smp_on_up: own link address, then the bounds of the SMP-alternatives table.
1: .word .
.word __smpalt_begin // corresponds to the .alt.smp.init section
.word __smpalt_end
.pushsection .data
.globl smp_on_up
// smp_on_up: holds 1 in the SMP alternative and 0 in the UP alternative.
smp_on_up:
ALT_SMP(.long 1)
ALT_UP(.long 0)
.popsection
#endif
.text
__do_fixup_smp_on_up: // patch the entries one by one; r4 = current entry, r5 = end of table, r3 = address offset
cmp r4, r5
reths lr // r4 >= r5 (unsigned): all entries processed, return
ldmia r4!, {r0, r6} // r0, r6 = the two words of this entry; r4 advances to the next entry
ARM( str r6, [r0, r3] ) // store word r6 at address r0 + r3
THUMB( add r0, r0, r3 )
#ifdef __ARMEB__
THUMB( mov r6, r6, ror #16 ) @ Convert word order for big-endian.
#endif
THUMB( strh r6, [r0], #2 ) @ For Thumb-2, store as two halfwords
THUMB( mov r6, r6, lsr #16 ) @ to be robust against misaligned r3.
THUMB( strh r6, [r0] )
b __do_fixup_smp_on_up // loop to the next entry
ENDPROC(__do_fixup_smp_on_up)
CPUID寄存器内容:
字段名:Implementer(vendor,厂商ID)| Variant(大版本号)| Architecture(架构版本)| Part Num(产品代码)| Revision(小版本号)
位域(bit): [31-24] | [23-20] | [19-16] | [15-4] | [3-0]
6. __create_page_tables
这个是给kernel创建临时页表,我们知道,要执行速度快,需要开启mmu和cache,前提是要创建页表做地址映射。这一步主要是段映射,可以参考(https://zhuanlan.zhihu.com/p/578336642)
/*
* Setup the initial page tables. We only setup the barest
* amount which are required to get the kernel running, which
* generally means mapping in the kernel code.
*
* r8 = phys_offset, r9 = cpuid, r10 = procinfo
*
* Returns:
* r0, r3, r5-r7 corrupted
* r4 = page table (see ARCH_PGD_SHIFT in asm/memory.h)
*/
/* 当前相关寄存器读值
r8 0x60000000
r9 0x410fc090
r10 0x6047d478
*/
__create_page_tables:
pgtbl r4, r8 @ page table address // derive the page table address from r8 (0x60000000 in this run): r4 = 0x60004000, the base of the first-level table
/*
* Clear the swapper page table
*/
mov r0, r4 // r0 = 0x60004000
mov r3, #0
add r6, r0, #PG_DIR_SIZE // r6 = 0x60008000
1: str r3, [r0], #4 // zero 0x60004000..0x60008000, i.e. the whole 16 KiB first-level table, four words per iteration. Each entry is 4 bytes, so there are 4K entries; each entry maps 1 MiB, so the table covers 4K * 1 MiB = 4 GiB.
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
teq r0, r6
bne 1b
#ifdef CONFIG_ARM_LPAE // CONFIG_ARM_LPAE is not set in this build
...
#endif
ldr r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags // r7 = mm_mmuflags from the proc_info entry (0xc0e in this run)
/*
* Create identity mapping to cater for __enable_mmu.
* This identity mapping will be removed by paging_init().
*/
adr r0, __turn_mmu_on_loc
ldmia r0, {r3, r5, r6} // r3 = link address of __turn_mmu_on_loc, r5 = link address of __turn_mmu_on, r6 = link address of __turn_mmu_on_end (r0 holds the run-time address of __turn_mmu_on_loc)
sub r0, r0, r3 @ virt->phys offset // r0 = run-time minus link-time offset, used to convert r5 and r6 to the physical addresses to be mapped
add r5, r5, r0 @ phys __turn_mmu_on // r5 = 0x6047c8f0 in this run
add r6, r6, r0 @ phys __turn_mmu_on_end // r6 = 0x6047c910 in this run
mov r5, r5, lsr #SECTION_SHIFT // shift r5 and r6 right by SECTION_SHIFT (20 bits): both become 0x604, the first-level table index of their 1 MiB section
mov r6, r6, lsr #SECTION_SHIFT
1: orr r3, r7, r5, lsl #SECTION_SHIFT @ flags + kernel base // r3 = (0x604 << 20) | 0xc0e = 0x60400c0e
str r3, [r4, r5, lsl #PMD_ORDER] @ identity mapping // store r3 (0x60400c0e) at 0x60004000 + (0x604 << 2) = 0x60005810, the entry for the section holding __turn_mmu_on
// i.e. the section containing the run-time address of __turn_mmu_on is mapped to itself through table index 0x604
cmp r5, r6
addlo r5, r5, #1 @ next section
blo 1b // repeat until every section from __turn_mmu_on to __turn_mmu_on_end is mapped
/*
* Map our RAM from the start to the end of the kernel .bss section.
* That is: map from the start of physical RAM (0x60000000 in this run)
* up to the end of the kernel image (_end).
*/
add r0, r4, #PAGE_OFFSET >> (SECTION_SHIFT - PMD_ORDER) // r0 = r4 + (0x60000000 >> (20-2)) = 0x60004000 + 0x1800 = 0x60005800
ldr r6, =(_end - 1) // link address of the end of the kernel image (0x6067eb33 in this run)
orr r3, r8, r7 // r3 = 0x60000000 | 0xc0e = 0x60000c0e
add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER) // r6 = 0x60004000 + (0x6067eb33 >> (20-2)) = 0x60005819
1: str r3, [r0], #1 << PMD_ORDER // store r3 at 0x60005800, then advance r0 by 4 to 0x60005804
add r3, r3, #1 << SECTION_SHIFT // r3 = r3 + (1 << 20) = 0x60100c0e, the descriptor for the next 1 MiB section
cmp r0, r6 // compare r0 (0x60005804) with r6 (0x60005819) to see whether the whole kernel is mapped
bls 1b
// Section-mapping rules illustrated by the code above:
// 1. Each first-level entry is 4 bytes and maps 1 MiB; covering 4 GiB takes 4096 entries = 0x4000 bytes (16 KiB).
//    0x4000 spans 14 bits and 32-14 = 18, so the top 18 bits of every entry address are fixed and equal the table base.
// 2. The top 12 bits of the address being mapped are the table index. E.g. kernel start 0x60000000: index 0x600,
//    shifted left by 2 gives 0x1800, so its entry lives at 0x60004000 + 0x1800 = 0x60005800.
// 3. The top 12 bits of an entry are the top 12 bits of the physical address; joined with the low 20 bits of the
//    virtual address they give the translated address. The entry's low 20 bits are attribute fields (see the
//    field reference given after this listing).
// 4. Worked example: with a table base of 0x60000000 and virtual address 0x12345678, the index is 0x123;
//    shifted left by 2 it is 0x48c, and OR-ed with the base it gives the entry address 0x6000048c.
//    If the word at 0x6000048c is 0xabcdef12, its top 12 bits plus the low 20 bits of the virtual address give
//    0xabc00000 + 0x45678 = 0xabc45678 as the physical address; the low 20 bits of 0xabcdef12 are MMU attribute bits.
#ifdef CONFIG_XIP_KERNEL
...
#endif
/*
* Then map boot params address in r2 if specified.
* We map 2 sections in case the ATAGs/DTB crosses a section boundary.
*/
// Map the memory holding the dtb that r2 (0x68000000) points at: two consecutive 1 MiB sections are written below.
mov r0, r2, lsr #SECTION_SHIFT
movs r0, r0, lsl #SECTION_SHIFT // r0 = r2 rounded down to a section boundary; Z=1 (zero) skips all the ne-conditional instructions below
subne r3, r0, r8
addne r3, r3, #PAGE_OFFSET // r3 = r0 - r8 + PAGE_OFFSET, the virtual address of that section
addne r3, r4, r3, lsr #(SECTION_SHIFT - PMD_ORDER) // r3 = address of the table entry for that virtual address
orrne r6, r7, r0 // r6 = section descriptor: physical section base | mm_mmuflags
strne r6, [r3], #1 << PMD_ORDER // write the first entry, advance to the next entry
addne r6, r6, #1 << SECTION_SHIFT // descriptor for the following 1 MiB section
strne r6, [r3] // write the second entry
#if defined(CONFIG_ARM_LPAE) && defined(CONFIG_CPU_ENDIAN_BE8)
...
#endif
#ifdef CONFIG_DEBUG_LL
....
#endif
#ifdef CONFIG_ARM_LPAE
...
#endif
ret lr // return: three regions are now mapped -- the __turn_mmu_on identity mapping, the kernel image, and the dtb
ENDPROC(__create_page_tables)
.ltorg
.align
// Three words loaded by __create_page_tables with a single ldmia:
// own link address, then the link addresses bounding __turn_mmu_on.
__turn_mmu_on_loc:
.long .
.long __turn_mmu_on
.long __turn_mmu_on_end
以上,一级页表中低20位字段内容参考(https://www.cnblogs.com/sky-heaven/p/15941382.html):
做完页表的映射,接下来就做开启mmu前的准备工作,并使能mmu,然后在__mmap_switched中跳转到start_kernel,正式执行C代码。这部分由于和ARM强相关,并且大多都是配置CP15寄存器,我们下一章再详细分析。