PostgreSQL 源码解读(229)- Linux Kernel(进程虚拟内存#3)
PostgreSQL使用进程架构,每个连接对应一个后台进程,为了更好地理解这种架构,有必要深入理解进程的相关知识.本节主要介绍Linux下的进程虚拟内存结构,并使用C语言样例程序进行说明.
一、malloc
先前几节的样例代码通过malloc分配内存,进程虚拟内存中存在heap,如果不使用malloc,虚拟内存是否有heap呢?
[root@localhost linux]# cat 0-main.c
#include <stdlib.h>
#include <stdio.h>

/**
 * main - do nothing
 *
 * Return: EXIT_FAILURE if something failed. Otherwise EXIT_SUCCESS
 */
int main(void)
{
    getchar();
    return (EXIT_SUCCESS);
}
编译并执行,查看进程的maps
[root@localhost ~]# ps -ef|grep \ \./0
root 21802 18855 0 16:45 pts/7 00:00:00 ./0
root 21832 21806 0 16:45 pts/0 00:00:00 grep --color=auto ./0
[root@localhost ~]# cat /proc/21802/maps
00400000-00401000 r-xp 00000000 fd:00 252008457 /data/source/linux/0
00600000-00601000 r--p 00000000 fd:00 252008457 /data/source/linux/0
00601000-00602000 rw-p 00001000 fd:00 252008457 /data/source/linux/0
7fc6e03c5000-7fc6e057d000 r-xp 00000000 fd:00 153635 /usr/lib64/libc-2.17.so
7fc6e057d000-7fc6e077d000 ---p 001b8000 fd:00 153635 /usr/lib64/libc-2.17.so
7fc6e077d000-7fc6e0781000 r--p 001b8000 fd:00 153635 /usr/lib64/libc-2.17.so
7fc6e0781000-7fc6e0783000 rw-p 001bc000 fd:00 153635 /usr/lib64/libc-2.17.so
7fc6e0783000-7fc6e0788000 rw-p 00000000 00:00 0
7fc6e0788000-7fc6e07a9000 r-xp 00000000 fd:00 153628 /usr/lib64/ld-2.17.so
7fc6e099c000-7fc6e099f000 rw-p 00000000 00:00 0
7fc6e09a7000-7fc6e09a9000 rw-p 00000000 00:00 0
7fc6e09a9000-7fc6e09aa000 r--p 00021000 fd:00 153628 /usr/lib64/ld-2.17.so
7fc6e09aa000-7fc6e09ab000 rw-p 00022000 fd:00 153628 /usr/lib64/ld-2.17.so
7fc6e09ab000-7fc6e09ac000 rw-p 00000000 00:00 0
7ffe3c606000-7ffe3c627000 rw-p 00000000 00:00 0 [stack]
7ffe3c6b3000-7ffe3c6b5000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
[root@localhost ~]#
没有[heap]的存在。
malloc不是系统调用,man malloc解释如下
[…] allocate dynamic memory[…]
void *malloc(size_t size);
[…]
The malloc() function allocates size bytes and returns a pointer to the allocated memory.
malloc调用了什么系统调用?可以通过strace来分析
[root@localhost linux]# cat 3-main.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/**
 * main - let's find out which syscall malloc is using
 *
 * Return: EXIT_FAILURE if something failed. Otherwise EXIT_SUCCESS
 */
int main(void)
{
    void *p;

    write(1, "BEFORE MALLOC\n", 14);
    p = malloc(1);
    write(1, "AFTER MALLOC\n", 13);
    printf("%p\n", p);
    getchar();
    return (EXIT_SUCCESS);
}
编译执行,strace输出如下
[root@localhost ~]# strace ./3
execve("./3", ["./3"], [/* 25 vars */]) = 0
brk(NULL) = 0x1abe000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fe132467000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=34897, ...}) = 0
mmap(NULL, 34897, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fe13245e000
close(3) = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20\35\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2127336, ...}) = 0
mmap(NULL, 3940800, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fe131e84000
mprotect(0x7fe13203c000, 2097152, PROT_NONE) = 0
mmap(0x7fe13223c000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b8000) = 0x7fe13223c000
mmap(0x7fe132242000, 16832, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fe132242000
close(3) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fe13245d000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fe13245b000
arch_prctl(ARCH_SET_FS, 0x7fe13245b740) = 0
mprotect(0x7fe13223c000, 16384, PROT_READ) = 0
mprotect(0x600000, 4096, PROT_READ) = 0
mprotect(0x7fe132468000, 4096, PROT_READ) = 0
munmap(0x7fe13245e000, 34897) = 0
write(1, "BEFORE MALLOC\n", 14BEFORE MALLOC
) = 14
brk(NULL) = 0x1abe000
brk(0x1adf000) = 0x1adf000
brk(NULL) = 0x1adf000
write(1, "AFTER MALLOC\n", 13AFTER MALLOC
) = 13
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 5), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fe132466000
write(1, "0x1abe010\n", 100x1abe010
) = 10
fstat(0, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 5), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fe132465000
read(0,
可以看到,malloc调用了brk分配堆内存,大小为0x21000,查看进程的maps
[root@localhost linux]# cat /proc/14502/maps
00400000-00401000 r-xp 00000000 fd:00 36596343 /root/3
00600000-00601000 r--p 00000000 fd:00 36596343 /root/3
00601000-00602000 rw-p 00001000 fd:00 36596343 /root/3
01abe000-01adf000 rw-p 00000000 00:00 0 [heap]
7fe131e84000-7fe13203c000 r-xp 00000000 fd:00 153635 /usr/lib64/libc-2.17.so
7fe13203c000-7fe13223c000 ---p 001b8000 fd:00 153635 /usr/lib64/libc-2.17.so
7fe13223c000-7fe132240000 r--p 001b8000 fd:00 153635 /usr/lib64/libc-2.17.so
7fe132240000-7fe132242000 rw-p 001bc000 fd:00 153635 /usr/lib64/libc-2.17.so
7fe132242000-7fe132247000 rw-p 00000000 00:00 0
7fe132247000-7fe132268000 r-xp 00000000 fd:00 153628 /usr/lib64/ld-2.17.so
7fe13245b000-7fe13245e000 rw-p 00000000 00:00 0
7fe132465000-7fe132468000 rw-p 00000000 00:00 0
7fe132468000-7fe132469000 r--p 00021000 fd:00 153628 /usr/lib64/ld-2.17.so
7fe132469000-7fe13246a000 rw-p 00022000 fd:00 153628 /usr/lib64/ld-2.17.so
7fe13246a000-7fe13246b000 rw-p 00000000 00:00 0
7ffdfb7b5000-7ffdfb7d6000 rw-p 00000000 00:00 0 [stack]
7ffdfb7ef000-7ffdfb7f1000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
[root@localhost linux]#
01abe000-01adf000 rw-p 00000000 00:00 0 [heap]
与strace跟踪输出相符。
执行上面的样例代码
[root@localhost linux]# ./3
BEFORE MALLOC
AFTER MALLOC
0x1123010
输出为0x1123010,但堆实际的开始地址为0x1123000,多出来的0x10一共16个字节是什么呢?实际上,这16个字节是malloc chunk的头部:前8个字节为上一个chunk的大小(仅当上一个chunk未分配时有效,如已分配则为0x0),后8个字节为当前chunk的大小(其最低几位用作标记位)。
[root@localhost linux]# cat 5-main.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/**
 * pmem - print mem
 * @p: memory address to start printing from
 * @bytes: number of bytes to print
 *
 * Return: nothing
 */
void pmem(void *p, unsigned int bytes)
{
    unsigned char *ptr;
    unsigned int i;

    ptr = (unsigned char *)p;
    for (i = 0; i < bytes; i++)
    {
        if (i != 0)
        {
            printf(" ");
        }
        printf("%02x", *(ptr + i));
    }
    printf("\n");
}

/**
 * main - the 0x10 lost bytes
 *
 * Return: EXIT_FAILURE if something failed. Otherwise EXIT_SUCCESS
 */
int main(void)
{
    void *p;
    int i;

    for (i = 0; i < 10; i++)
    {
        p = malloc(1024 * (i + 1));
        printf("%p\n", p);
        printf("bytes at %p:\n", (void *)((char *)p - 0x10));
        pmem((char *)p - 0x10, 0x10);
    }
    return (EXIT_SUCCESS);
}
[root@localhost linux]#
编译执行
[root@localhost linux]# ./5
0x2416010
bytes at 0x2416000:
00 00 00 00 00 00 00 00 11 04 00 00 00 00 00 00
...
这是p指向的内存地址之前的16个字节中的内容。后8个字节(11 04 00 00 00 00 00 00)按小端序读出为0x411,其中0x410(即1040)是chunk的大小(1024个字节+16个字节的头部),最低位的0x1是标记位,用于标记上一个chunk是否正在使用。
二、参考资料
Virtual memory
Hack the Virtual Memory: malloc, the heap & the program break