Linux内核如何创建一个新进程

徐洁原创作品，转载请注明出处。《Linux内核分析》MOOC课程：http://mooc.study.163.com/course/USTC-1000029000 一、实验：使用gdb跟踪分析一个fork系统调用内核处理函数sys_clone，在实验楼Linux虚拟机环境下完成实验。MenuOS中添加了fork，用gdb跟踪fork分析其内核处理函数sys_clone。

XJ-2014

1586人浏览 · 2015-04-11 21:13:40

XJ-2014 · 2015-04-11 21:13:40 发布

徐洁原创作品，转载请注明出处。《Linux内核分析》MOOC课程：http://mooc.study.163.com/course/USTC-1000029000

一、实验

使用gdb跟踪分析一个fork系统调用内核处理函数sys_clone，在实验楼Linux虚拟机环境下完成实验。MenuOS中添加了fork，用gdb跟踪fork分析其内核处理函数sys_clone。

单步执行跟踪到ret_from_fork断点处继续执行汇编语句，jmp syscall_exit之后就无法跟踪了。

二、部分相关源码分析

clone()、fork()、vfork()三个系统调用最终都通过do_fork()来创建进程，调用链如下：

-> do_fork() -> copy_process() -> dup_task_struct() -> copy_thread() -> ret_from_fork -> syscall_exit

上图参考(http://www.ibm.com/developerworks/cn/linux/l-linux-process-management/)

/linux-3.18.6/kernel/fork.c

新创建的子进程处于可运行状态，需要调度程序把CPU控制权交给新创建的子进程才能实际运行。do_fork()利用copy_process来创建进程描述符以及子进程执行所需要的所有其他内核数据结构。

do_fork源码：

//stack_start是用户状态下栈的起始地址。stack_size是用户状态下栈的大小。parent_tidptr和child_tidptr是指向用户空间地址的两个指针，分别指向父子进程的TID。

/*
 * do_fork - create a new task and start it running.
 *
 * clone_flags:   CLONE_* flags plus the exit signal in the CSIGNAL bits.
 * stack_start,
 * stack_size:    forwarded unchanged to copy_process().
 * parent_tidptr: user-space address that receives the new pid value when
 *                CLONE_PARENT_SETTID is set.
 * child_tidptr:  forwarded unchanged to copy_process().
 *
 * Returns the pid value obtained from pid_vnr() on success, or
 * PTR_ERR(p) when copy_process() returns an error pointer.
 */
1623long do_fork(unsigned long clone_flags,
1624	      unsigned long stack_start,
1625	      unsigned long stack_size,	
1626	      int __user *parent_tidptr,
1627	      int __user *child_tidptr)
1628{
1629	struct task_struct *p;
1630	int trace = 0;
1631	long nr;
1632
1633	/*
1634	 * Determine whether and which event to report to ptracer.  When
1635	 * called from kernel_thread or CLONE_UNTRACED is explicitly
1636	 * requested, no event is reported; otherwise, report if the event
1637	 * for the type of forking is enabled.
1638	 */
1639	if (!(clone_flags & CLONE_UNTRACED)) {
1640		if (clone_flags & CLONE_VFORK)
1641			trace = PTRACE_EVENT_VFORK;
1642		else if ((clone_flags & CSIGNAL) != SIGCHLD)
1643			trace = PTRACE_EVENT_CLONE;
1644		else
1645			trace = PTRACE_EVENT_FORK;
1646
1647		if (likely(!ptrace_event_enabled(current, trace)))
1648			trace = 0;
1649	}
1650
1651	p = copy_process(clone_flags, stack_start, stack_size,
1652			 child_tidptr, NULL, trace);
1653	/*
1654	 * Do this prior waking up the new thread - the thread pointer
1655	 * might get invalid after that point, if the thread exits quickly.
1656	 */
1657	if (!IS_ERR(p)) { /* p is a valid task pointer, not an encoded error */
1658		struct completion vfork;
1659		struct pid *pid;
1660
1661		trace_sched_process_fork(current, p);
1662
1663		pid = get_task_pid(p, PIDTYPE_PID);
1664		nr = pid_vnr(pid); /* numeric pid of p within the pid namespace; this is the return value */
1665
1666		if (clone_flags & CLONE_PARENT_SETTID) /* caller asked for the TID: store nr at parent_tidptr */
1667			put_user(nr, parent_tidptr);
1668/* if CLONE_VFORK is set, initialise the completion stored in p->vfork_done */
1669		if (clone_flags & CLONE_VFORK) {
1670			p->vfork_done = &vfork;
1671			init_completion(&vfork);
1672			get_task_struct(p);
1673		}
1674
1675		wake_up_new_task(p); /* wake-up function: puts the new task on the run queue */
1676
1677		/* forking complete and child started to run, tell ptracer */
1678		if (unlikely(trace))
1679			ptrace_event_pid(trace, pid);
1680
1681		if (clone_flags & CLONE_VFORK) {
1682			if (!wait_for_vfork_done(p, &vfork))
1683				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1684		}
1685
1686		put_pid(pid);
1687	} else {
1688		nr = PTR_ERR(p);
1689	}
1690	return nr;
1691}

dup_task_struct()为子进程获取进程描述符。copy_process()主要完成进程数据结构，各种资源的初始化。初始化方式可以重新分配，也可以共享父进程资源，主要根据传入clone_flags参数来确定。

部分源码如下：

static struct task_struct *dup_task_struct(struct task_struct *orig)
306{
307	struct task_struct *tsk;
308	struct thread_info *ti;
309	int node = tsk_fork_get_node(orig);
310	int err;
311
312	tsk = alloc_task_struct_node(node);
313	if (!tsk)
314		return NULL;
315/*给新进程分配一个新的内核堆栈*/
316	ti = alloc_thread_info_node(tsk, node);
317	if (!ti) /*如果thread info结构没申请到，释放tsk*/
318		goto free_tsk;
319
320	err = arch_dup_task_struct(tsk, orig);/*复制task_struct*/
321	if (err)
322		goto free_ti;
323
324	tsk->stack = ti; /*task对应栈*/
325#ifdef CONFIG_SECCOMP
326	/*
327	 * We must handle setting up seccomp filters once we're under
328	 * the sighand lock in case orig has changed between now and
329	 * then. Until then, filter must be NULL to avoid messing up
330	 * the usage counts on the error path calling free_task.
331	 */
332	tsk->seccomp.filter = NULL;
333#endif
334
335	setup_thread_stack(tsk, orig);/*初始化thread info结构*/
336	clear_user_return_notifier(tsk);
337	clear_tsk_need_resched(tsk);
338	set_task_stack_end_magic(tsk);
339
340#ifdef CONFIG_CC_STACKPROTECTOR
341	tsk->stack_canary = get_random_int(); /*初始化stack_canary变量*/
copy_thread()
//新进程有自己的堆栈且会根据task_pt_regs中的内容进行修改。
int copy_thread(unsigned long clone_flags, unsigned long sp,
133	unsigned long arg, struct task_struct *p)
134{
135	struct pt_regs *childregs = task_pt_regs(p);
136	struct task_struct *tsk;
137	int err;
138
139	p->thread.sp = (unsigned long) childregs;//调度到子进程时的内核栈顶
140	p->thread.sp0 = (unsigned long) (childregs+1);
141	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
142
143	if (unlikely(p->flags & PF_KTHREAD)) {
144		/* kernel thread */
145		memset(childregs, 0, sizeof(struct pt_regs));
146		p->thread.ip = (unsigned long)ret_from_kernel_thread;
147		task_user_gs(p) = __KERNEL_STACK_CANARY;
148		childregs->ds = __USER_DS;
149		childregs->es = __USER_DS;
150		childregs->fs = __KERNEL_PERCPU;
151		childregs->bx = sp;	/* function */
152		childregs->bp = arg;
153		childregs->orig_ax = -1;
154		childregs->cs = __KERNEL_CS | get_kernel_rpl();
155		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
156		p->thread.io_bitmap_ptr = NULL;
157		return 0;
158	}
159	*childregs = *current_pt_regs();//复制内核堆栈
160	childregs->ax = 0;//eax寄存器值强置为0，即子进程返回到用户态时返回值为0
161	if (sp)
162		childregs->sp = sp;//sp为父进程传给子进程的用户态栈，可以与父进程共享
163
164	p->thread.ip = (unsigned long) ret_from_fork; //调度到子进程时的第一条指令地址
165	task_user_gs(p) = get_user_gs(current_pt_regs());
166
167	p->thread.io_bitmap_ptr = NULL;
168	tsk = current;

p被调度运行，从ret_from_fork开始执行子进程。

/linux-3.18.6/arch/x86/kernel/entry_32.S

290ENTRY(ret_from_fork)
291	CFI_STARTPROC
292	pushl_cfi %eax
293	call schedule_tail
294	GET_THREAD_INFO(%ebp)
295	popl_cfi %eax
296	pushl_cfi $0x0202		# Reset kernel eflags
297	popfl_cfi
298	jmp syscall_exit
299	CFI_ENDPROC
300END(ret_from_fork)

接着跳转到syscall_exit

/linux-3.18.6/arch/x86/kernel/entry_32.S

syscall_exit:
506	LOCKDEP_SYS_EXIT
507	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
508					# setting need_resched or sigpending
509					# between sampling and the iret
510	TRACE_IRQS_OFF
511	movl TI_flags(%ebp), %ecx
512	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
513	jne syscall_exit_work
514
515restore_all:
516	TRACE_IRQS_IRET
••••••
530restore_nocheck:
531	RESTORE_REGS 4			# skip orig_eax/error_code
532irq_return:
533	INTERRUPT_RETURN

ret_from_fork通过jmp跳转到syscall_exit，最后以INTERRUPT_RETURN返回。

三、总结

实际上，用户空间的寄存器、用户态堆栈等信息在切换到内核态时保存在内核栈中。父进程在内核态通过dup_task_struct()复制出子进程的进程描述符，但子进程作为一个独立的进程，之后被调度运行时必须有一个起始指令地址。进程切换时用到的ip及内核栈位置sp保存在进程描述符的thread字段中（p->thread.ip、p->thread.sp），copy_thread()将thread.ip设置为ret_from_fork，作为子进程被调度后执行的第一条指令；子进程随后经syscall_exit完成从内核态到用户态的返回。

进程创建由系统调用来建立新进程，归根结底都是调用do_fork来实现。do_fork主要就是调用copy_process，copy_process初始化task_struct结构体分配给子进程，并为其分配pid；之后do_fork调用wake_up_new_task()将子进程加入可运行队列。在内核栈方面，调用顺序为dup_task_struct() -> copy_thread()：copy_thread()函数将父进程内核栈中保存的寄存器现场（pt_regs）复制到子进程中，同时设置子进程被调度后执行的第一条指令地址为ret_from_fork，并将保存返回值的寄存器eax值置为0，因此子进程返回值为0；而父进程继续执行之后的初始化，最后返回子进程的pid（tgid）。