mm_init_owner(&init_mm, &init_task);
=> init_mm.owner = &init_task;
setup_command_line(command_line);
在setup_arch里把boot_command_line copy到了command_line里,然后把start_kernel里的这个command_line指针指向setup_arch里的command_line. 这个函数做的事就是分配两块内存,把boot_command_line和command_line分别再copy一份. (BOOT_IMAGE=/bzImage, len = 19)
// bochs: writemem "/tmp/memblock.memdump" 0xffffffff81b3e9a0 64 // bochs: writemem "/tmp/memblock.memory.memdump" 0xffffffff81b3f200 2048 // bochs: writemem "/tmp/memblock.reserved.memdump" 0xffffffff81b3e9e0 2048 $ ./print_memblock /tmp/memblock.memdump /tmp/memblock.memory.memdump /tmp/memblock.reserved.memdump sizeof(struct memblock) = 64 sizeof memblock.memory/reserved = INIT_MEMBLOCK_REGIONS * sizeof(struct memblock_region) = 2048 memblock.current_limit = 0x7ff0000 memblock.memory_size = 0x7f7f000 memblock.memory.cnt = 0x2 memblock.memory.max = 0x80 memblock.memory.regions = 0xffffffff81b3f200 memblock.reserved.cnt = 0x14 memblock.reserved.max = 0x80 memblock.reserved.regions = 0xffffffff81b3e9e0 --memory regions-- 0: start=0x10000, end=0x9f000, size=0x8f000 1: start=0x100000, end=0x7ff0000, size=0x7ef0000 --reserved regions-- 0: start=0x9a000, end=0x9f000, size=0x5000 // TRAMPOLINE 1: start=0x9fc00, end=0x100000, size=0x60400 // EBDA 2: start=0x1000000, end=0x1d04049, size=0xd04049 // Kernel TEXT DATA BSS and extend_brk 49 bytes 3: start=0x7200000, end=0x7400000, size=0x200000 // map_map[0] 第1个section的map 4: start=0x796e000, end=0x79eb000, size=0x7d000 // RAMDISK 5: start=0x7be6000, end=0x7be8000, size=0x2000 // 0xffea...的pud和pmd 6: start=0x7fb7000, end=0x7fee000, size=0x37000 // level2_fixmap_pgt[507] -> NEW PTE (0x7fb7000 0x1000) map __vsyscall_0 // ZONE_DMA32 wait_table (0x7fb8000 0x18000) // ZONE_DMA wait_table (0x7fd0000 0x18000) // node_data[0] pglist_data (0x7fe8000 0x5000) // THE PTE (0x7fed000 0x1000) 7: start=0x7fee9c0, end=0x7fee9d4, size=0x14 // **static_command_line** 8: start=0x7feea00, end=0x7feea14, size=0x14 // **saved_command_line** 9: start=0x7feea40, end=0x7feea60, size=0x20 // nosave_region 10: start=0x7feea80, end=0x7feeae8, size=0x68 // e820_saved firmware_map_entry 11: start=0x7feeb00, end=0x7feeb68, size=0x68 // e820_saved firmware_map_entry 12: start=0x7feeb80, end=0x7feebe8, size=0x68 // e820_saved firmware_map_entry 13: 
start=0x7feec00, end=0x7feec68, size=0x68 // e820_saved firmware_map_entry 14: start=0x7feec80, end=0x7feece8, size=0x68 // e820_saved firmware_map_entry 15: start=0x7feed00, end=0x7feed68, size=0x68 // e820_saved firmware_map_entry 16: start=0x7feed80, end=0x7feef08, size=0x188 // e820_res 17: start=0x7feef40, end=0x7feef83, size=0x43 // ioapic_resources 18: start=0x7feefc0, end=0x7feefd8, size=0x18 // 第1个section的usemap 19: start=0x7fef000, end=0x7ff0000, size=0x1000 // mem_section[0]
setup_nr_cpu_ids();
// @see kernel/smp.c#0665
// 注释里写的也很清楚了: An arch may set nr_cpu_ids earlier if needed, so this would be redundant.
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
setup_per_cpu_areas();
// syslog里打印出 setup_percpu: NR_CPUS:256 nr_cpumask_bits:256 nr_cpu_ids:1 nr_node_ids:1
pcpu_chosen_fc = PCPU_FC_AUTO; // 默认值,可以通过GRUB命令行参数 percpu_alloc 设定
PERCPU_FIRST_CHUNK_RESERVE = PERCPU_MODULE_RESERVE = 8 << 10 = 8K = 0x2000
PERCPU_DYNAMIC_RESERVE = 20 << 10 = 20K = 0x5000
pcpu_embed_first_chunk(
PERCPU_FIRST_CHUNK_RESERVE, // 0x2000 = 8K
dyn_size, // 0x5000 = 20K
atom_size, // PMD_SIZE = 0x200000 = 2M
pcpu_cpu_distance, // 这个函数返回 LOCAL_DISTANCE = 10
pcpu_fc_alloc, // 使用memblock分配内存
pcpu_fc_free // free掉memblock分配的内存
);
struct pcpu_alloc_info *ai = pcpu_build_alloc_info(
reserved_size, // 0x2000 = 8K
dyn_size, // 0x5000 = 20K
atom_size, // 0x200000 = 2M
cpu_distance_fn // pcpu_cpu_distance, 返回 10
);
static_size = __per_cpu_end (0x14d00) - __per_cpu_start (0) = 0x14d00; // @see arch/x86/kernel/vmlinux.lds.S
// @see include/asm-generic/vmlinux.lds.h#0714
size_sum = static_size(0x14d00) + reserved_size(8K) + dyn_size(20K) = 0x1bd00, 经PFN_ALIGN向上对齐到页边界后 = 0x1c000 = 112K
dyn_size = 0x5300
min_unit_size = 0x1c000 = 112K
alloc_size = 2M
max_upa = 16
// 最终
struct pcpu_alloc_info ai = {
u64 static_size = 0x14d00,
u64 reserved_size = 0x2000,
u64 dyn_size = 0x5300,
u64 unit_size = 0x200000,
u64 atom_size = 0x200000,
u64 alloc_size = 0x200000,
u64 __ai_size = 0x1000,
int nr_groups = 1,
struct pcpu_group_info groups[] = {
{
int nr_units = 1,
u64 base_offset = 0,
unsigned int *cpu_map = 0x7fb6058
}
}
};
// pcpu_build_alloc_info得到了nr_groups,nr_units,然后为每个group分配一块内存,将kernel里的percpu数据copy进去.
// 然后更新了每个group的base_offset
syslog里打印出 PERCPU: Embedded 0x1c pages/cpu @0x7c00000 s0x14d00 r0x2000 d0x5300 u0x200000
pcpu_setup_first_chunk(ai, base = 0x7c00000);
// 这个函数的注释里解释清楚了 static_size, reserved_size, dyn_size 的用处:
// static_size 就是kernel里定义的percpu变量数据, reserved_size 紧跟在 static_size 后边, 这块内存主要是留给 module 用的,
// dyn_size 才是用来动态分配内存的, 这之后一直到 unit_size 是没有使用的内存
pcpu_nr_groups = 1
pcpu_group_offsets [0=0] // memblock alloc
pcpu_group_sizes [0=0x200000] // memblock alloc
pcpu_unit_map [0=0x7fee900] // memblock alloc
pcpu_unit_offsets [0=0x7fee8c0] // memblock alloc
pcpu_unit_pages = 0x200
pcpu_unit_size = 0x200000
pcpu_atom_size = 0x200000
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(long)(=8) // 结构体后边紧跟一个bitmap, 每个page一个bit, 表示page的使用(populated)情况?
= 0x80
pcpu_nr_slots = 19 + 2 = 21 = 0x15
pcpu_slot = 21 * sizeof(pcpu_slot[0] struct list_head) = 0x7fee740 // memblock alloc
schunk(static percpu area chunk) = 0x7fee6c0// memblock alloc
// struct pcpu_chunk的后边紧跟的是记录这个chunk每个page的使用情况的bitmap
// 这样说的话, chunk和unit是一个意思???
schunk->free_size = reserved_size = 0x2000;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = static_size + reserved_size;
schunk->contig_hint = schunk->free_size = 0x2000;
dchunk = 0x7fee640 // memblock alloc
pcpu_first_chunk = dchunk = 0x7fee640 => struct pcpu_chunk {
struct list_head list, (0x7fee800)
int free_size = 0x5300,
int contig_hint = 0x5300,
void *base_addr = 0x7c00000,
int map_used = 2,
int map_alloc = 0x80,
int *map = 0x1b3df60,
void *data = 0,
bool immutable = 1,
unsigned long populated = 0xFFFF....FFFF;
};
pcpu_base_addr = 0x7c00000
/*
./print_memblock
--reserved regions--
0: start=0x9a000, end=0x9f000, size=0x5000 // TRAMPOLINE
1: start=0x9fc00, end=0x100000, size=0x60400 // EBDA
2: start=0x1000000, end=0x1d04049, size=0xd04049 // Kernel TEXT DATA BSS and extend_brk 49 bytes
3: start=0x7200000, end=0x7400000, size=0x200000 // map_map[0] 第1个section的map
4: start=0x796e000, end=0x79eb000, size=0x7d000 // RAMDISK
5: start=0x7be6000, end=0x7be8000, size=0x2000 // 0xffea...的pud和pmd
6: start=0x7c00000, end=0x7c1c000, size=0x1c000 // **static percpu area**
7: start=0x7fb7000, end=0x7fee000, size=0x37000 // level2_fixmap_pgt[507] -> NEW PTE (0x7fb7000 0x1000) map __vsyscall_0
// ZONE_DMA32 wait_table (0x7fb8000 0x18000)
// ZONE_DMA wait_table (0x7fd0000 0x18000)
// node_data[0] pglist_data (0x7fe8000 0x5000)
// THE PTE (0x7fed000 0x1000)
8: start=0x7fee640, end=0x7fee890, size=0x250 // **dchunk (0x7fee640 0x80)**
// **schunk (0x7fee6c0 0x80)**
// **pcpu_slot (0x7fee740 0x150)**
9: start=0x7fee8c0, end=0x7fee8c8, size=0x8 // **pcpu_unit_offsets**
10: start=0x7fee900, end=0x7fee904, size=0x4 // **pcpu_unit_map**
11: start=0x7fee940, end=0x7fee948, size=0x8 // **pcpu_group_sizes**
12: start=0x7fee980, end=0x7fee988, size=0x8 // **pcpu_group_offsets**
13: start=0x7fee9c0, end=0x7fee9d4, size=0x14 // static_command_line
14: start=0x7feea00, end=0x7feea14, size=0x14 // saved_command_line
15: start=0x7feea40, end=0x7feea60, size=0x20 // nosave_region
16: start=0x7feea80, end=0x7feeae8, size=0x68 // e820_saved firmware_map_entry
17: start=0x7feeb00, end=0x7feeb68, size=0x68 // e820_saved firmware_map_entry
18: start=0x7feeb80, end=0x7feebe8, size=0x68 // e820_saved firmware_map_entry
19: start=0x7feec00, end=0x7feec68, size=0x68 // e820_saved firmware_map_entry
20: start=0x7feec80, end=0x7feece8, size=0x68 // e820_saved firmware_map_entry
21: start=0x7feed00, end=0x7feed68, size=0x68 // e820_saved firmware_map_entry
22: start=0x7feed80, end=0x7feef08, size=0x188 // e820_res
23: start=0x7feef40, end=0x7feef83, size=0x43 // ioapic_resources
24: start=0x7feefc0, end=0x7feefd8, size=0x18 // 第1个section的usemap
25: start=0x7fef000, end=0x7ff0000, size=0x1000 // mem_section[0]
*/
分析了上边的 pcpu_embed_first_chunk 根本弄不懂这个percpu memory allocator是怎么回事,但结合 pcpu_alloc() 容易理解一些了.
percpu的关键在于每个cpu都有一个, static的很容易明白, int var[NR_CPUS]; dynamic的是这样的: 一块大内存看作 MemoryBlock[NR_CPUS].
然后需要一个办法把每个CPU的MemoryBlock的使用情况管理起来,因为每个CPU的MemoryBlock使用情况都是一致的,那么用一个数据结构管理就可以了.所以就有了
struct pcpu_chunk {
void *data; // 相当于 MemoryBlock[NR_CPUS]
}
当然需要知道一个MemoryBlock有多少空余空间可以用,以及有多少连续的空余空间可以用,这样当一个请求过来说要多大的内存块时,一下子就可以判断出来这个Block能不能满足.然后就有了
struct pcpu_chunk {
int free_size; // 每个MemoryBlock里有多少空余空间
int contig_hint; // 每个MemoryBlock里有多少连续的空余空间
void *data;
}
当这个MemoryBlock可以满足内存分配请求时,接下来就要分配出内存块来,那就需要有一个办法来详细的记录哪些内存已经分配出去了,哪些还空闲着.
struct pcpu_chunk {
int free_size;
int contig_hint;
int map_used;
int map_alloc;
int *map; // 额外分配个 int map[map_alloc] 数组出来,每分配出去一块内存,就记下来.
// 比如第一次分配出去100字节, map[0] = -100 (用负数表示分配出去了)
// 第二次分配出去200字节, map[1] = -200
// 那这个map_alloc定为多大呢?默认定为 16, map_used记录着已经用了几个了
// 当不够用时,就要分配一个更大的map,然后把现在的map数据copy到新的里去,原来的就free掉,有点realloc的意思
void *data;
}
当percpu真正使用起来后,会有很多个chunk,并且每个chunk的free_size大小就不一样了,这时一个请求过来,需要有个办法能很快知道哪些chunk能满足要求,所以就要按chunk的free_size大小给排个序.怎么排呢?
pcpu_slot[] = {
sizeA => {chunkA1, chunkA2, chunkA3 ...},
sizeB => {chunkB1, chunkB2, chunkB3 ...}
}
dynamic percpu memory alloc这样做了之后,那么原来的 static 的可不可以做下统一呢?当然答案是可以的.
分配一个chunk(static chunk),这个chunk立马有一个内存已经分配出去了,因为static的percpu变量数据编译完成后我们就知道大小了,然后这个chunk的free_size就等于reserved_size.
然后再分配一个chunk(dynamic chunk),这个chunk完全是个新chunk,只不过free_size大小为 dyn_size.
这两个chunk ready之后,接下来即使内核的内存管理还没有好,也可以动态分配percpu内存了.
pcpu_embed_first_chunk()返回0,继续向下执行.
delta = pcpu_base_addr - __per_cpu_start = 0x7c00000
接着一个for循环,把每个cpu的 per_cpu_offset, this_cpu_off都设好,并且在当前cpu(boot cpu)上用上了新的gdt.
实际上这个for循环做的事是这样的:
系统启动的时候,还不知道有多少个cpu,当定义percpu变量时,实际上只定义了一份,但标明这个变量是percpu变量,编译的时候要编译到一个特定的section里,
在这之前,boot cpu一直用着定义的这一份,前边 pcpu_embed_first_chunk 时,把 percpu section每个cpu都copy了一份,
现在虽然其它cpu还没有运行起来,但boot cpu有了自己的那一份,于是就用自己的,不再用先前定义的了.
这时看一下gdtr,指向 0x7c04000, 当前cpu的那份是从0x7c00000开始的,就对上了.
再理一下percpu的使用问题:
static定义的percpu变量在编译时那个percpu section按从地址0开始,现在访问这些变量通过宏定义 per_cpu(var, cpu),
这个宏定义做的事就是把地址 var + per_cpu_offset[cpu] 处的内容返回, 即 *(var + per_cpu_offset[cpu]). 就拿boot cpu的gdt_page举例吧:
编译后,gdt_page的地址是 0x4000,这个在System.map里可以查到,boot cpu的 per_cpu_offset[0] = 0x7c00000,
所以 per_cpu(gdt_page, 0) = 0x4000 + 0x7c00000 = 0x7c04000
对于dynamic alloc的变量,关键在于返回时要处理一下,pcpu_alloc时只在chunk的MemoryBlock[0]上分配,拿到的地址也是MemoryBlock[0]上的地址,
显然用这个地址是无法访问到其它cpu上对应的变量的,那就要把这个地址减去pcpu_base_addr,也就是0x7c00000.
然后再次使用时用per_cpu宏再加上offset就能找到对应cpu上的变量了.
上边写的比较乱,但确实搞明白了percpu是怎么回事,回头有了4核的bochs,再重新整理下.
// .config里没有定义 CONFIG_CPUMASK_OFFSTACK, 而下边这两个函数都要用到 alloc_bootmem_cpumask_var, 这个函数是个空函数,不产生任何实际指令
// 所以下边这两个函数啥都没做
setup_node_to_cpumask_map();
setup_cpu_local_masks();
setup_per_cpu_areas()初始化好了 pcpu memory allocator, 后边要想使用percpu变量就很容易了. 但有个疑问是虽然 static chunk 和 dynamic chunk有了,它们对应的内存也分配了,前者大小是 0x2000, 后者大小是 0x5300, 加上static size = 0x14d00,总大小是0x1c000,但是chunk.data还没有设,这后边怎么用呢?现在, pcpu_reserved_chunk = schunk, 注意 pcpu_setup_first_chunk() 的最后, pcpu_first_chunk = dchunk, 并且 pcpu_chunk_relocate(pcpu_first_chunk, -1) 把 dchunk 加到了 pcpu_slot 里边. 后边如果需要 alloc percpu memory, reserved 从 schunk 里分配, 非reserved从dchunk里分配.
smp_prepare_boot_cpu();
=> native_smp_prepare_boot_cpu(); 这个函数做事倒挺简单的: 在 arch/x86/kernel/setup_percpu.c#0024 里定义了 percpu 变量 cpu_number. DEFINE_PER_CPU(int, cpu_number); 在 pcpu_embed_first_chunk() 时把cpu_number给每个cpu都复制了一份, 然后在接下来的for循环里给这个变量赋值了. 再接下来,给cpu 0(也就是boot cpu了) switch_to_new_gdt(), 在这个函数里调用了 load_percpu_segment(). 这个函数 wrmsr 写入了 GS_BASE = 0x7c00000. 然后读取的时候 %gs:&cpu_number 就取到当前cpu的cpu_number的值了. 照这样说,static定义的percpu变量都可以通过%gs:var_addr取值. @see AMD-Volume2 page 72: Segment Registers in 64-Bit Mode 在64位环境里,当计算内存地址是,cs,ds,es,ss都被忽略了,但是fs和gs还都参与计算.这里有提到gs.base.
build_all_zonelists(NULL);
set_zonelist_order();
// user_zonelist_order 默认等于 ZONELIST_ORDER_DEFAULT, 可以通过GRUB命令行参数 numa_zonelist_order 来设定
current_zonelist_order = default_zonelist_order();
// setup_arch时我们知道0-16M是ZONE_DMA, 16M-4G是ZONE_DMA32,4G+是ZONE_NORMAL,我们只有一个dummy node,所以如果内存大于4G,就会有ZONE_NORMAL
// 这时返回 ZONELIST_ORDER_NODE. 如果小于4G, low_kmem_size == total_size > total_size / 2, 所以对我们来说, 不管内存多大,最终
// current_zonelist_order = ZONELIST_ORDER_NODE = 1
__build_all_zonelists(NULL);
build_zonelists(pgdat);
// 没太看明白这个函数是做什么的,不过可以肯定的是,它和 node_data.node_zonelists有关,后边把node_zonelists打印出来看看就知道了
mminit_verify_zonelist(); // 由于 mminit_loglevel < MMINIT_VERIFY, 这个函数早早的返回了, 什么也没做
cpuset_init_current_mems_allowed(); // 把current->mems_allowed的所有bits全置为1
vm_total_pages = nr_free_pagecache_pages(); // = 0x7bda = 0xf52 + 0x6e68
page_group_by_mobility_disabled = 0;
// syslog里打印出
Built 1 zonelists in Node order, mobility grouping on. Total pages: 0x7bda
Policy zone: DMA32
// bochs: writemem "/tmp/node_data.memdump" 0xffff880007fe8000 0x5000
page_alloc_init();
=> hotcpu_notifier(page_alloc_cpu_notify, 0); cpu_chain (0x1c2c7e0) 是一个单链表,链表里的每个元素struct notifier_block都有一个notifier_call和priority;
struct notifier_block {
int (*notifier_call)(struct notifier_block *, unsigned long, void *);
struct notifier_block __rcu *next;
int priority;
};
page_alloc_cpu_notify()做的事是当CPU_DEAD或CPU_DEAD_FROZEN时free掉它占用的内存及其它不太清楚是什么东西.
parse_early_param();
我们在setup_arch()里已经调用过这个函数了,所以这次调用发现done=1就返回了.
parse_args();
parse_args("Booting kernel", static_command_line, __start___param,
__stop___param - __start___param,
&unknown_bootoption);
/* Built-in module parameters. */ \
__param : AT(ADDR(__param) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start___param) = .; \
*(__param) \
VMLINUX_SYMBOL(__stop___param) = .; \
}
显然 __start___param 和 __stop___param 之间是很多的kernel_param, parse_args() 的逻辑我们很熟悉了, 根据参数名称匹配上之后,调用对应的处理函数.
我们没有通过GRUB给kernel传递任何参数,所以这个函数什么也做不了. 前边也提到,物理机会传递root= 和 ro 两个参数,
这两个参数对应的数据结构在 .init.setup section, 也不在这个函数里处理.
pidhash_init();
nr_kernel_pages = 0xf52 + 0x6e68 = 0x7bda
0x7bda + (1 << 8) - 1 = 0x7eb9
0x7eb9 >> 8 = 0x7e
0x7e << 8 = 0x7e00
0x7e00 >> 6 = 0x1f8
round_pow_of_two = 0x200
log2qty = 9
alloc_bootmem_nopanic( 8 << 9 = 4096 = 0x1000);
// syslog里打印出
PID hash table entries: 512 (order: 0, 4096 bytes)
pidhash_size = 9
/*
./print_memblock
--reserved regions--
0: start=0x9a000, end=0x9f000, size=0x5000 // TRAMPOLINE
1: start=0x9fc00, end=0x100000, size=0x60400 // EBDA
2: start=0x1000000, end=0x1d04049, size=0xd04049 // Kernel TEXT DATA BSS and extend_brk 49 bytes
3: start=0x7200000, end=0x7400000, size=0x200000 // map_map[0] 第1个section的map
4: start=0x796e000, end=0x79eb000, size=0x7d000 // RAMDISK
5: start=0x7be6000, end=0x7be8000, size=0x2000 // 0xffea...的pud和pmd
6: start=0x7c00000, end=0x7c1c000, size=0x1c000 // static percpu area
7: start=0x7fb6000, end=0x7fee000, size=0x38000 // **pid_hash (0x7fb6000 0x1000)**
// level2_fixmap_pgt[507] -> NEW PTE (0x7fb7000 0x1000) map __vsyscall_0
// ZONE_DMA32 wait_table (0x7fb8000 0x18000)
// ZONE_DMA wait_table (0x7fd0000 0x18000)
// node_data[0] pglist_data (0x7fe8000 0x5000)
// THE PTE (0x7fed000 0x1000)
8: start=0x7fee640, end=0x7fee890, size=0x250 // dchunk (0x7fee640 0x80)
// schunk (0x7fee6c0 0x80)
// pcpu_slot (0x7fee740 0x150)
9: start=0x7fee8c0, end=0x7fee8c8, size=0x8 // pcpu_unit_offsets
10: start=0x7fee900, end=0x7fee904, size=0x4 // pcpu_unit_map
11: start=0x7fee940, end=0x7fee948, size=0x8 // pcpu_group_sizes
12: start=0x7fee980, end=0x7fee988, size=0x8 // pcpu_group_offsets
13: start=0x7fee9c0, end=0x7fee9d4, size=0x14 // static_command_line
14: start=0x7feea00, end=0x7feea14, size=0x14 // saved_command_line
15: start=0x7feea40, end=0x7feea60, size=0x20 // nosave_region
16: start=0x7feea80, end=0x7feeae8, size=0x68 // e820_saved firmware_map_entry
17: start=0x7feeb00, end=0x7feeb68, size=0x68 // e820_saved firmware_map_entry
18: start=0x7feeb80, end=0x7feebe8, size=0x68 // e820_saved firmware_map_entry
19: start=0x7feec00, end=0x7feec68, size=0x68 // e820_saved firmware_map_entry
20: start=0x7feec80, end=0x7feece8, size=0x68 // e820_saved firmware_map_entry
21: start=0x7feed00, end=0x7feed68, size=0x68 // e820_saved firmware_map_entry
22: start=0x7feed80, end=0x7feef08, size=0x188 // e820_res
23: start=0x7feef40, end=0x7feef83, size=0x43 // ioapic_resources
24: start=0x7feefc0, end=0x7feefd8, size=0x18 // 第1个section的usemap
25: start=0x7fef000, end=0x7ff0000, size=0x1000 // mem_section[0]
*/
static struct hlist_head *pid_hash;
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
这个函数初始化了一个hash table,这个hash table有512个entries,每个entry指向 hlist_node;
vfs_caches_init_early();
由于 hashdist = 1, dcache_init_early() 和 inode_init_early() 早早的就返回了,什么也没做.
sort_main_extable();
Sort the kernel's built-in exception table 这个table的数据结构挺简单,但不清楚这个table是怎么弄出来的以及什么时间用得到.
/*
* The exception table consists of pairs of addresses: the first is the
* address of an instruction that is allowed to fault, and the second is
* the address at which the program should continue. No registers are
* modified, so it is entirely up to the continuation code to figure out
* what to do.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
* we don't even have to jump over them. Further, they do not intrude
* on our cache or tlb entries.
*/
struct exception_table_entry {
unsigned long insn, fixup;
};
trap_init();
注: UnderStandingKernel这本书显然有点老了,Initializing the Interrupt Descriptor Table部分的描述和代码已经差很多了.仔细阅读代码我们可以发现:set handler时只用了两个函数,一个是set_intr_gate(dpl=0,set_intr_gate_ist只是多了个stack),一个是set_system_intr_gate(dpl=3).
.config里定义了CONFIG_IA32_EMULATION=y,所以把0x80关联到ia32_syscall上,并在used_vectors里标记上.
cpu_init();
cpu = stack_smp_processor_id();
// 在arch/x86/kernel/head_64.S里,我们把%rsp(栈顶)设为init_thread_union+THREAD_SIZE-8, init_thread_union定义在
// arch/x86/kernel/init_task.c里, init_thread_union = {INIT_THREAD_INFO(init_task)}
// 在UnderStandingKernel的第85页,对thread_union有清楚的解释,默认THREAD_SIZE=8K, %rsp & 0xffffe000 就能得到 thread_info
// 然后就能取到cpu了
关于cpu方面的知识还不够,cpu_init先略过去吧.