open系统调用源码解析

科技2022-07-16 104

内核版本为3.2.69

文章目录

引言 · do_sys_open · do_filp_open · path_openat · link_path_walk · walk_component · handle_dots · follow_dotdot_rcu · do_lookup · do_last · 总结

引言

为了更好的理解VFS的运作过程，决定看看open的源码以捋顺自己的思路。还记得大一时老师为了让大家对所学的东西有更深的兴趣，经常拿open内部的调用关系图来激励大家，一个小小的open都如此复杂，你又怎能停滞不前？停止学习？哈哈哈，开个玩笑，open并不小，你也没有停滞不前！

所以这其实也算是解决一个大一时就留下的疑惑，让我们开始吧！

do_sys_open

/*
 * do_sys_open - resolve a user-supplied path and bind it to a new descriptor.
 *
 * @dfd:      directory descriptor passed through to do_filp_open()
 * @filename: path name in user space
 * @flags:    open flags; also used when reserving the descriptor
 * @mode:     creation mode, folded into @op by build_open_flags()
 *
 * Returns the new file descriptor (>= 0) on success, or a negative errno.
 *
 * NOTE(review): this excerpt passes a struct filename * and three arguments to
 * do_filp_open(), whereas the do_filp_open() excerpt shown later in this
 * article takes a const char * and a fourth "flags" argument. The two
 * excerpts look like they come from different kernel versions -- confirm
 * against the 3.2.69 tree.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	int fd = build_open_flags(flags, mode, &op);
	struct filename *tmp;

	/* A non-zero result from build_open_flags() is returned as-is. */
	if (fd)
		return fd;

	/*
	 * Bring the path name in from user space (per the article, getname()
	 * also allocates the buffer that holds it).
	 */
	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	/*
	 * Reserve an unused descriptor (per the article this goes through
	 * alloc_fd() and the fd table -- not visible in this excerpt).
	 */
	fd = get_unused_fd_flags(flags);
	if (fd >= 0) {
		/* Resolve the name into a struct file. */
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			/* Open failed: give back the descriptor reserved above. */
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			/* Open succeeded: send the open notification ... */
			fsnotify_open(f);
			/* ... and publish the fd -> file mapping for this process. */
			fd_install(fd, f);
		}
	}

	/* Release the kernel copy of the path name. */
	putname(tmp);
	return fd;
}

do_filp_open

显然do_filp_open是最核心的函数，它可以通过name返回一个file结构体，当然也对应着dentry和inode。

/*
 * do_filp_open - turn a path name into an opened struct file.
 *
 * Calls path_openat() up to three times with the same nameidata:
 *   1. with LOOKUP_RCU added to @flags;
 *   2. with plain @flags, only if the first attempt returned -ECHILD;
 *   3. with LOOKUP_REVAL added, only if the previous result was -ESTALE.
 *
 * Returns whatever the last path_openat() call returned (a struct file
 * pointer or an ERR_PTR value).
 */
struct file *do_filp_open(int dfd, const char *pathname,
		const struct open_flags *op, int flags)
{
	struct nameidata nd;
	struct file *filp;

	/* First attempt: RCU mode. */
	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
	/* -ECHILD from the RCU attempt: retry without LOOKUP_RCU. */
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(dfd, pathname, &nd, op, flags);
	/* -ESTALE: retry once more with LOOKUP_REVAL set. */
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
	return filp;
}

在这个函数中，path_openat有可能会被调用三次。通常内核为了提高效率，会首先在RCU模式（rcu-walk）下进行文件打开操作；如果在此方式下打开失败，则进入普通模式（ref-walk）。第三次就比较少见了。

我们看到这里除了声明了要返回的file以外还声明了一个结构体nameidata，它的定义如下：

struct nameidata { struct path path; //该结构体标识了文件路径相关的信息 struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; unsigned seq; int last_type; unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; /* Intent data */ union { struct open_intent open; } intent; };

其主要作用为在整个路径查找过程中充当中间变量，它既可以为当前查找输入数据，又可以保存本次查找的结果。

path_openat

我们来看看path_openat：

static struct file *path_openat(int dfd, const char *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { struct file *base = NULL; struct file *filp; struct path path; int error; // 声明一个新的file结构，分配前会对当前进程的权限和当前系统的文件最大数进行检测 filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); filp->f_flags = op->open_flag; nd->intent.open.file = filp; nd->intent.open.flags = open_to_namei_flags(op->open_flag); nd->intent.open.create_mode = op->mode; // 对路径遍历做准备工作，主要是判断路径遍历的起始位置 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; // 把这个task_struct的一项设置为零，找了半天也没找到这东西是干什么的 current->total_link_count = 0; // 对所打开文件路径进行逐一解析，每个目录项的解析结果都存在nd参数中 error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; // 根据最后一个目录项的结果，do_last()将填充filp所指向的file结构 filp = do_last(nd, &path, op, pathname); // filp为空，说明当前文件为符号链接文件 while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; void *cookie; // 如果设置了LOOKUP_FOLLOW标志，则通过follow_link()进入符号链接文件所指文件，填充file // 否则，直接返回当前符号链接文件的filp； if (!(nd->flags & LOOKUP_FOLLOW)) { path_put_conditional(&path, nd); path_put(&nd->path); filp = ERR_PTR(-ELOOP); break; } nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = follow_link(&link, nd, &cookie); if (unlikely(error)) filp = ERR_PTR(error); else filp = do_last(nd, &path, op, pathname); put_link(nd, &link, cookie); } out: ........ return filp; out_filp: ........ goto out; }

现在可以清楚地看到，path_openat实际上就是在调用path_init确定查找起点以后，沿着路径调用link_path_walk逐项遍历，并在最后一项使用do_last得到相对应的file；如果是链接文件的话还需要额外做一些事情，这个后面再说。

然后就是path_init代码太长了，它主要就是做了一个事情，就是执行以下三种情况之一：

如果路径名name以/为起始，则表示当前路径是一个绝对路径，通过set_root设置nd；否则，表示路径name是一个相对路径

如果dfd为AT_FDCWD，那么表示这个相对路径是以当前路径pwd作为起始的，因此通过pwd设置nd

如果dfd不是AT_FDCWD，表示这个相对路径是用户设置的，需要通过dfd获取具体相对路径信息，进而设置nd

link_path_walk

link_path_walk负责对各目录进行遍历，我们来看看实现：

static int link_path_walk(const char *name, struct nameidata *nd) { // name指向被搜索的路径 struct path next; // 指向下一个目录项 int err; // 如果是绝对路径则会过滤掉一个多余的/ while (*name=='/') name++; if (!*name) return 0; /* At this point we know we have a real path component. */ for(;;) { unsigned long hash; struct qstr this; // 当前搜索路径所处目录项的哈希值 unsigned int c; int type; // type指明当前目录项类型 err = may_lookup(nd); if (err) break; this.name = name; c = *(const unsigned char *)name; hash = init_name_hash(); do { name++; hash = partial_name_hash(c, hash); c = *(const unsigned char *)name; } while (c && (c != '/')); this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); // 为当前目录项更新哈希值，并保存在this中 type = LAST_NORM; if (this.name[0] == '.') switch (this.len) { case 2: if (this.name[1] == '.') { type = LAST_DOTDOT; nd->flags |= LOOKUP_JUMPED; } break; case 1: type = LAST_DOT; } /* * 如果当前目录项为“.”，则type为LAST_DOT * 如果目录项为“..”，则type为LAST_DOTDOT * 否则，type默认为LAST_NORM */ if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, nd->inode, &this); if (err < 0) break; } } /* remove trailing slashes? */ if (!c) goto last_component; while (*++name == '/'); // 分隔符有多个的时候进行过滤，name总指向最后一个 if (!*name) goto last_component; // 处理当前目录项，更新nd和next；如果当前目录项为符号链接文件，则只更新next； err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) return err; if (err) { // 当前目录项为符号链接文件，则通过nested_symlink()进行处理，更新nd err = nested_symlink(&next, nd); if (err) return err; } if (can_lookup(nd->inode)) continue; err = -ENOTDIR; break; /* here ends the main loop */ last_component: nd->last = this; nd->last_type = type; return 0; } terminate_walk(nd); return err; }

walk_component

walk_component驱动这个循环的进行，我们来看一看其实现：

/* 在每次循环中，它将获取当前目录项的dentry结构以及inode结构等信息，即更新nd。如果当前目录项对应的inode不存在，那么将向用户态返回ENOENT；在该函数中，定义了变量inode，它将保存当前目录项对应的索引节点 */ static inline int walk_component(struct nameidata *nd, struct path *path, struct qstr *name, int type, int follow) { struct inode *inode; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ // LAST_DOT和LAST_DOTDOT特殊处理 if (unlikely(type != LAST_NORM)) return handle_dots(nd, type); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // 重点函数，do_lookup实现路径分量dentry的查找，先在dcache缓存中查找 // 找不到则执行inode->i_op->lookup在磁盘中去找，还找不到就说明路径是错的了 err = do_lookup(nd, name, path, &inode); if (unlikely(err)) { terminate_walk(nd); return err; } if (!inode) { path_to_nameidata(path, nd); terminate_walk(nd); return -ENOENT; } /* 如果should_follow_link()获知当前目录项为符号链接文件，则退出当前函数。具体的，如果当前walk模式为rcu，则直接返回-ECHILD，否则返回1 返回-ECHILD时候，将直接返回到do_filp_open()，进行ref-walk模式重新查找；如果返回1，则返回至上层函数link_path_walk()，进入netsted_symlink()进行符号链接目录项的处理；也就是说，一旦当前目录项为符号链接文件，则需要通过ref-walk进行处理。这是因为处理符号链接文件需要通过具体文件的处理函数进行实现，这个过程可能会导致阻塞，这与rcu方式是违背的，因此需要先转换到ref-walk； */ if (should_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { terminate_walk(nd); return -ECHILD; } } BUG_ON(inode != path->dentry->d_inode); return 1; } // 如果查找成功则通过如下函数更新nd path_to_nameidata(path, nd); nd->inode = inode; return 0; }

handle_dots

我们来看看对于“.”和“..”的特殊处理函数handle_dots，它会根据type的不同做不同的处理。这里有两种特殊类型：LAST_DOT和LAST_DOTDOT：

static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { // 第一次进入path_openat if (follow_dotdot_rcu(nd)) return -ECHILD; } else follow_dotdot(nd); } return 0; }

follow_dotdot_rcu

因为两个函数处理逻辑差不多，我们看看follow_dotdot_rcu，还有这里其实并不容易，因为涉及到文件系统之间的转换：

// 该函数是在rcu模式下获取父目录项信息，如果搜索成功，则返回0；否则，返回ECHILD static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; // 如果有需要的话，首先通过set_root_rcu()设置当前路径的根目录信息 // 只有在搜索路径是绝对路径的时候，nd中的root才会被设置。 // 因为此处是向上搜索，可能会一直找到根目录处 if (!nd->root.mnt) set_root_rcu(nd); // 通过情况下，这个循环体只会被执行一次即退出，只有当父目录项为一个挂载点时才有可能不断进行循环 while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { // 到达根节点退出 break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; unsigned seq; inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; nd->path.dentry = parent; // 交换完毕 nd->seq = seq; break; } // 如果“..”所代表的目录项正好是一个挂载点时，那么需要将当前的遍历从 // 当前的文件系统向上（follow up）切换到父文件系统 if (!follow_up_rcu(&nd->path)) break; inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } // 如果退出循环体，至此已经获取到了当前目录项的上一级目录项（即“..”所代表的父目录项） // 如果这个父目录项是一个挂载点，那么还需做一些特殊检查。 // 因为在特殊情况下，当前这个父目录项又被挂载了其他的文件系统， // 那么返回上级目录这个操作获取的应该是最新文件系统的内容而不是之前那个文件系统的内容 // 但是跨越多个文件系统的事情follow_up_rcu已经做了呀。这里代码挺奇怪的。 while (d_mountpoint(nd->path.dentry)) { struct vfsmount *mounted; // 挂载点结构体 // 通过__lookup_mnt()检查父目录下挂载的文件系统是否为最新的文件系统，如果是则检查结束 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); if (!mounted) break; nd->path.mnt = mounted; nd->path.dentry = mounted->mnt_root; inode = nd->path.dentry->d_inode; // 更新path中的denty与inode nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } nd->inode = inode; return 0; failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); br_read_unlock(vfsmount_lock); return -ECHILD; }

我们来看看当遇到文件系统类型转换时的处理函数follow_up_rcu：

/*
 * follow_up_rcu - step from the root of a mount up into its parent mount.
 *
 * Returns 0 (leaving @path untouched) when the mount is its own parent,
 * i.e. there is nothing above it. Otherwise rewrites @path to the mount
 * point dentry inside the parent mount and returns 1, telling
 * follow_dotdot_rcu() to run its loop again.
 */
static int follow_up_rcu(struct path *path)
{
	struct vfsmount *parent;
	struct dentry *mountpoint;

	parent = path->mnt->mnt_parent;
	/* The mount is its own parent: nowhere further up to go. */
	if (parent == path->mnt)
		return 0;
	/* Move to the dentry this mount is attached to ... */
	mountpoint = path->mnt->mnt_mountpoint;
	path->dentry = mountpoint;
	/* ... within the parent mount. */
	path->mnt = parent;
	return 1;
}

举例说明follow_dotdot_rcu()中循环体反复执行的情形。假设fs1文件系统存在路径/home/edsionte/work，首先将fs2文件系统挂载于work下，再将fs3挂载在work下，那么此时fs3对应的父vfsmount为fs2对应的结构；再将fs4文件系统挂载在work下，那么fs4的父vfsmount指向fs3。此刻，/home/edsionte/work可以访问fs4的内容，而之前挂载在这里的其他文件系统将被隐藏。假设用户在work目录下执行“cd ../”，用户想得到的结果是fs1文件系统下edsionte/下的内容。而此刻work位于fs4中，那么他必须向上逐步跨越文件系统，即fs4通过follow_up_rcu()跨越到父文件系统fs3，fs3再跨越到fs2，fs2再跨越到fs1。

do_lookup

我们再来看看查找dentry的函数，也是open中最重要的函数之一，也就是do_lookup，因为普通项才会执行这个函数，所以这才是walk_component中OS最需要优化的函数，这也许是这里设置这么多优化手段的原因吧，这也导致这里的代码很复杂，作者在源码上面也写到：

It’s more convoluted than I’d like it to be, but… it’s still fairly small and for now I’d prefer to have fast path as straight as possible. It is time-critical.（译：它比我希望的要曲折，但是……它仍然相当小，而且目前我更希望让快速路径尽可能直接。这段代码对时间非常敏感。）

static int do_lookup(struct nameidata *nd, struct qstr *name, struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int need_reval = 1; int status = 1; int err; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ if (nd->flags & LOOKUP_RCU) { // 以RCU模式进入，也就是第一次进入path_openat unsigned seq; *inode = nd->inode; // 目录项缓存(dcache)中查找 dentry = __d_lookup_rcu(parent, name, &seq, inode); if (!dentry) goto unlazy; // 失败的话进入unlazy标号 ........ // 找到的话就万事大吉，直接退出 path->mnt = mnt; path->dentry = dentry; ........ return 0; unlazy: // unlazy标号是将当前的rcu-walk切换成ref-walk模式 // 如果切换成功，则继续；否则，返回ECHILD if (unlazy_walk(nd, dentry)) return -ECHILD; // 返回到do_filp_open()处，重新进行ref模式的查找 } else { dentry = __d_lookup(parent, name); // ref-walk模式 } if (dentry && unlikely(d_need_lookup(dentry))) { dput(dentry); dentry = NULL; } retry: if (unlikely(!dentry)) { struct inode *dir = parent->d_inode; BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); // d_lookup进行一次内存查找 // 因为retry标号下的代码有互斥锁，很可能该函数再此处会阻塞 // 而在阻塞阶段就目标目录项就有可能被载入内存，这样就可以省去在磁盘上查找目录项的工作 // 可以看出作者确实想了很多 dentry = d_lookup(parent, name); if (likely(!dentry)) { // 查找失败，通过d_alloc_and_lookup()分配并在磁盘上查找dentry // 其实就是调用一个文件系统的钩子函数，也就是inode.lookup // 这个如果找不到就是路径错误了 dentry = d_alloc_and_lookup(parent, name, nd); if (IS_ERR(dentry)) { mutex_unlock(&dir->i_mutex); return PTR_ERR(dentry); } /* known good */ need_reval = 0; status = 1; } else if (unlikely(d_need_lookup(dentry))) { // 已经有了dentry，需要遍历parent目录填充inode // 和d_alloc_and_lookup一样都调用了文件系统的lookup钩子函数 dentry = d_inode_lookup(parent, dentry, nd); if (IS_ERR(dentry)) { mutex_unlock(&dir->i_mutex); return PTR_ERR(dentry); } /* known good */ need_reval = 0; status = 1; } mutex_unlock(&dir->i_mutex); } .............. path->mnt = mnt; path->dentry = dentry; ............ *inode = path->dentry->d_inode; return 0; }

do_last

这个函数其实就是对路径解析的最后一项，也就是我们要打开的文件，执行的特殊操作。其中最重要的是调用nameidata_to_filp，它会调用文件系统预留的open，也就是一个钩子函数。

总结

其实这也只是简单地了解一下open的源码，其中很多细节还是没有涉及。我们再来温习一遍：

do_sys_open会调用get_unused_fd_flags得到一个空闲fd，调用do_filp_open得到一个file结构体，并把这两项关联起来。do_filp_open会先后以RCU模式和普通模式调用path_openat，以返回file，同时蕴含着inode和dentry。path_openat会先调用path_init根据传入的参数确定查找的起始路径，然后调用link_path_walk对每一项进行遍历，最后调用do_last执行一个钩子函数，也就是文件系统自带的inode.open。link_path_walk会调用walk_component来驱动循环。walk_component会根据不同的遍历项类型调用不同的处理函数，比如“.”和“..”就会调用handle_dots，而对于一般项就会调用do_lookup查找dentry和inode。do_lookup会先查找dcache，如果不存在的话就会调用文件系统的钩子函数inode.lookup查找对应的inode和dentry。handle_dots在处理“..”的时候可能会遇到文件系统之间的切换，这里需要做一点处理。对于符号链接，我没有看这部分的细节，具体可参考[7]。

参考：

[1] 博文《open系统调用源码剖析》；[2] 博文《do_lookup》；[3] 博文《open()在Linux内核的实现(2)-路径查找》；[4] 博文《open()在Linux内核的实现(3)-“.”和“..”的处理》；[5] 博文《open()在Linux内核的实现(4)-普通目录项的处理》；[6] 博文《open()在Linux内核的实现(6)-打开操作分析》；[7] 博文《open()在Linux内核的实现(5)-符号链接目录项的处理》

Processed: 0.010, SQL: 8