关于linux的mount的问题

[复制链接]

gaosmile 发布时间：2020-9-16 22:05

技术帖
文章封面:	-
文章简介:	-

最近都有碰到一些关于linux的mount的问题，需要去解决，先把整个mount流程理清楚后，查一个问题会比较清楚，同时也是为了巩固自己的知识点。

先上图，看完整个文章后再倒过来看这个，会更清晰哦！

如何注册与mount回调文件系统函数

如果要了解mount，需要知道super block，dentry，file等概念。这就跟学习linux的网络子系统是一个道理，需要先关注几个结构体。

其中block_device一般和块设备操作有关，如读取super block、读写数据等，所以block_device掌管着文件系统的底层设备。

struct block_device {
dev_tbd_dev; /* not a kdev_t - it's a search key */
intbd_openers;
struct inode *bd_inode;/* will die */
struct super_block *bd_super;
struct mutexbd_mutex;/* open/close mutex */
struct list_headbd_inodes;
void *bd_claiming;
void *bd_holder;
intbd_holders;
boolbd_write_holder;
#ifdef CONFIG_SYSFS
struct list_headbd_holder_disks;
#endif
struct block_device *bd_contains;
unsignedbd_block_size;
struct hd_struct *bd_part;
/* number of times partitions within this device have been opened. */
unsignedbd_part_count;
intbd_invalidated;
struct gendisk *bd_disk;
struct request_queue * bd_queue;
struct list_headbd_list;
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned longbd_private;

/* The counter of freeze processes */
intbd_fsfreeze_count;
/* Mutex for freeze */
struct mutexbd_fsfreeze_mutex;
};

暂时先抛开块设备结构体，关注一下文件系统类型的结构体。笔者用//标注了一下结构体

/*
 * Describes one filesystem type: its name, behaviour flags, the callbacks
 * used to mount it and to tear down its super block, and the list linkage
 * that chains all filesystem types together.
 */
struct file_system_type {
	const char *name;	/* name of the filesystem, e.g. yaffs2 */
	int fs_flags;		/* flags describing the filesystem type (FS_* below) */
#define FS_REQUIRES_DEV		1	/* filesystem must sit on a physical device */
#define FS_BINARY_MOUNTDATA	2	/* mount data is a binary structure with fixed
					 * field positions/meanings (see mount_fs()
					 * in fs/super.c) */
#define FS_HAS_SUBTYPE		4	/* filesystem has subtypes; the usual example is
					 * FUSE, which is not a real filesystem itself,
					 * so a subtype tells apart the filesystems
					 * implemented through the FUSE interface */
#define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
#define FS_USERNS_DEV_MOUNT	16	/* A userns mount does not imply MNT_NODEV */
#define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename()
					 * internally; mainly used by network
					 * filesystems */
	/* callback invoked when a user mounts this filesystem */
	struct dentry *(*mount) (struct file_system_type *, int,
		       const char *, void *);
	/* drops the in-memory super block; used when unmounting */
	void (*kill_sb) (struct super_block *);
	struct module *owner;		/* module implementing this filesystem,
					 * usually the THIS_MODULE macro */
	struct file_system_type * next;	/* next entry in the filesystem-type list */
	struct hlist_head fs_supers;	/* head of the list of super blocks that
					 * share this filesystem type */

	struct lock_class_key s_lock_key;
	struct lock_class_key s_umount_key;
	struct lock_class_key s_vfs_rename_key;
	struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

	struct lock_class_key i_lock_key;
	struct lock_class_key i_mutex_key;
	struct lock_class_key i_mutex_dir_key;
};

file_system_type的基本操作都在fs/filesystems.c文件里.其中最重要的得看register_filesystem

/**
 *	register_filesystem - add a filesystem type to the kernel's list
 *	@fs: the file system structure to link in
 *
 *	Makes @fs known to mount and the other syscalls that look up
 *	filesystems by name.  Returns 0 on success, or -EBUSY when @fs is
 *	already linked (its ->next is set) or its name is already taken.
 *
 *	The structure is linked directly into the kernel's list, so it must
 *	stay allocated until the file system has been unregistered.
 */

int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int err;

	/* A '.' in the name is treated as a programming error. */
	BUG_ON(strchr(fs->name, '.'));
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name, strlen(fs->name));
	if (*slot) {
		/* A non-NULL slot means the name is already registered. */
		err = -EBUSY;
	} else {
		*slot = fs;
		err = 0;
	}
	write_unlock(&file_systems_lock);

	return err;
}

EXPORT_SYMBOL(register_filesystem);

此函数将文件系统注册进系统中。如果要分析某个文件系统，这个函数一定会被调用到。它告诉内核这个文件系统叫什么名字，并且告诉内核如何使用超级块。因此，file_system_type中的mount和kill_sb回调函数就是文件系统实现的重点。上面所说的内容，请跳转到具体文件系统的代码中自行查看。这里笔者用的是yaffs2文件系统。

可以看到mount的回调函数:

static struct dentry *yaffs2_mount(struct file_system_type *fs_type, intflags,
const char *dev_name, void *data)
{
return mount_bdev(fs_type, flags, dev_name, data,yaffs2_internal_read_super_mtd);
}

看起来似乎很简单啊，只有一个mount_bdev函数。那么关注一下参数。

fs_type就是file_system_type的信息，这里传递它主要是因为它携带了super block的链表和很多锁变量。
flags文件系统的通用挂载选项
dev_name是mount操作时的设备名，后面会用到这个设备名找到对应的设备信息，从而从中获得super block，如/dev/mtdblock7
data是挂载时指定的挂载选项信息
yaffs2_internal_read_super_mtd 指yaffs2特定实现的fill_super方法，用来根据yaffs2文件系统的特性解析mount data并继续填充super block的字段，并且初始化挂载点的根索引节点对象和目录项对象。

mount的系统调用的实现

在上面一节中，笔者已经向大家介绍了注册file_system_type的时候我们主要提供两个成员给内核，一个是文件系统的名字，一个是mount这个文件系统的方法。

命令行：

mount -t yaffs2 /dev/mtdblock7 /demo/

此刻我们需要探求一下mount又是如何被系统调用的。

在linux中的shell输入

man 2 mount

此时可以得到系统的mount API介绍

#include <sys/mount.h>

   int mount(const char *source, const char *target,
            const char *filesystemtype, unsigned long mountflags,
            const void *data);

看一下， source是要挂载的设备名，target是要挂载到哪，filesystemtype顾名思义就是文件系统类型名。还有2个参数也是我们所要关注的。

先来看一下mountflags，其主要取值来自include/uapi/linux/fs.h

/*
 * These are the fs-independent mount-flags: up to 32 flags are supported
 */
#define MS_RDONLY	 1	/* Mount read-only */
#define MS_NOSUID	 2	/* Ignore suid and sgid bits */
#define MS_NODEV	 4	/* Disallow access to device special files */
#define MS_NOEXEC	 8	/* Disallow program execution */
#define MS_SYNCHRONOUS	16	/* Writes are synced at once */
#define MS_REMOUNT	32	/* Alter flags of a mounted FS */
#define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
#define MS_DIRSYNC	128	/* Directory modifications are synchronous */
#define MS_NOATIME	1024	/* Do not update access times. */
#define MS_NODIRATIME	2048	/* Do not update directory access times */
#define MS_BIND		4096	/* matches -B/--bind: this is a bind operation */
#define MS_MOVE		8192	/* matches -M/--move: this is a move operation */
#define MS_REC		16384	/* "recursive"; normally not used alone but
				 * combined with other flags to make the
				 * operation recursive */
#define MS_VERBOSE	32768	/* War is peace. Verbosity is silence.
				   MS_VERBOSE is deprecated. */
#define MS_SILENT	32768
#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
#define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
#define MS_PRIVATE	(1<<18)	/* change to private */
#define MS_SLAVE	(1<<19)	/* change to slave */
#define MS_SHARED	(1<<20)	/* change to shared */
#define MS_RELATIME	(1<<21)	/* Update atime relative to mtime/ctime. */
#define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
#define MS_I_VERSION	(1<<23) /* Update inode I_version field */
#define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define MS_NOSEC	(1<<28)	/* some filesystems do not support security
				 * markers such as suid or security xattrs */
#define MS_BORN		(1<<29)	/* in-memory superblock has been fully created */
#define MS_ACTIVE	(1<<30)	/* in-memory superblock is currently active */
#define MS_NOUSER	(1<<31)	/* filesystem cannot be mounted from user
				 * space, only used by the kernel (e.g. rootfs) */

/*
 * Superblock flags that can be altered by MS_REMOUNT
 */
#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
			 MS_LAZYTIME)

/*
 * Old magic mount flag and mask
 */
#define MS_MGC_VAL 0xC0ED0000
#define MS_MGC_MSK 0xffff0000

从定义上看，这些标志基本上是各个文件系统通用的，绝大部分可以被VFS层解析掉。而data则是某个文件系统特定的一些参数。

mount系统调用定义在fs/namespace.c中

/*
 * mount(2) entry point: copies the filesystem type name, the device name
 * and the fs-specific option data in from user space, then calls do_mount().
 * The copies are released in reverse order of acquisition.  Returns the
 * do_mount() result, or the error from whichever copy step failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	unsigned long opts_page;
	char *type_copy;
	char *dev_copy;
	int rc;

	/* filesystem type name */
	type_copy = copy_mount_string(type);
	if (IS_ERR(type_copy)) {
		rc = PTR_ERR(type_copy);
		goto out_type;
	}

	/* name of the device holding the filesystem */
	dev_copy = copy_mount_string(dev_name);
	if (IS_ERR(dev_copy)) {
		rc = PTR_ERR(dev_copy);
		goto out_dev;
	}

	/* filesystem-specific mount data */
	rc = copy_mount_options(data, &opts_page);
	if (rc < 0)
		goto out_data;

	/*
	 * Type, device name and mount data are now kernel-side copies;
	 * dir_name is passed on as the user pointer it arrived as.
	 */
	rc = do_mount(dev_copy, dir_name, type_copy, flags,
		      (void *) opts_page);

	free_page(opts_page);
out_data:
	kfree(dev_copy);
out_dev:
	kfree(type_copy);
out_type:
	return rc;
}

继续，来看看do_mount

/*
 * do_mount - validate a mount request and dispatch it to the matching helper.
 *
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * @dev_name:  device name string
 * @dir_name:  mount point path (declared as a user-space pointer)
 * @type_page: filesystem type name, may be NULL
 * @flags:     MS_* mount flags
 * @data_page: fs-specific option page, may be NULL
 *
 * Returns the result of the selected do_remount/do_loopback/do_change_type/
 * do_move_mount/do_new_mount helper, or a negative errno from the earlier
 * checks.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
int retval = 0;
int mnt_flags = 0;
#if defined(CONFIG_DM_NFSB)
/* State for the CONFIG_DM_NFSB-only "nfsb" path below. */
int is_nfsb = 0;
dev_t nfsb_dev;
char mapper_path[256] = {0};
#endif

/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;

/* Basic sanity checks */

/*
 * NOTE(review): this check is compiled only under CONFIG_DM_NFSB, and it
 * dereferences dir_name and passes it to memchr() although dir_name is
 * declared __user - confirm this is intended.
 */
#if defined(CONFIG_DM_NFSB)
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
#endif

/* Force NUL termination in the last byte of the option page. */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;

#if defined(CONFIG_DM_NFSB)
/*
 * Taken when the type name starts with "nfsb" (prefix compare): runs the
 * nvt_dm_* check/setup helpers in sequence.  A zero return from any helper
 * is treated as failure and mapped to -EINVAL; hdr is freed on every path
 * through nfsb_out.
 */
if(type_page != NULL && (strncmp(type_page,"nfsb",strlen("nfsb")) == 0)) {
struct nfsb_header *hdr = NULL;
int nfsb_ret = 1;
nfsb_dev = name_to_dev_t(dev_name);

is_nfsb = 1;

hdr = kmalloc(sizeof(struct nfsb_header), GFP_KERNEL);
if(!hdr)
return -ENOMEM;

nfsb_ret = nvt_dm_check_nfsb(hdr, dev_name);
if(!nfsb_ret) {
nfsb_ret = -EINVAL;
goto nfsb_out;
}

nfsb_ret = nvt_dm_setup_linear(hdr, &nfsb_dev);
if(!nfsb_ret) {
nfsb_ret = -EINVAL;
goto nfsb_out;
}

/* NOTE(review): dir_name (a __user pointer) is handed to this helper as-is. */
nfsb_ret = nvt_dm_setup_nfsb(hdr, &nfsb_dev, dir_name, mapper_path);
if(!nfsb_ret) {
nfsb_ret = -EINVAL;
goto nfsb_out;
}
nfsb_out:
kfree(hdr);
if(nfsb_ret < 0)
return nfsb_ret;
}
#endif

/* ... and get the mountpoint */
retval = user_path(dir_name, &path);
if (retval)
return retval;

/* Security hook first, then the may_mount() permission check. */
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
if (!retval && !may_mount())
retval = -EPERM;
if (retval)
goto dput_out;
/* From here on: translate the MS_* flags into per-mount MNT_* flags. */
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;

/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;

/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
MS_STRICTATIME)) == 0)) {
mnt_flags &= ~MNT_ATIME_MASK;
mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
}

/* Strip the flags already folded into mnt_flags and the internal-only ones. */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);

/*
 * Dispatch in priority order: remount, bind, propagation-type change,
 * move, and otherwise a brand-new mount.
 */
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE |MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
#if defined(CONFIG_DM_NFSB)
{
/* For "nfsb" requests the new mount uses type "ext4" on mapper_path. */
if(is_nfsb)
retval = do_new_mount(&path, "ext4", flags, mnt_flags,
mapper_path, data_page);
else
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
}
#else
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
#endif
dput_out:
/* Drop the reference taken by user_path(). */
path_put(&path);
return retval;
}

看完上面的代码：大约可以提炼出do_mount要做的事情：

1.将要挂载的目录提取到内核中的path结构体中去

2.将传入的flags中的通用标记分解出来

3.根据这些标记执行相应的操作：do_remount、do_loopback、do_change_type、do_move_mount、do_new_mount

接下来我们来分析一下do_new_mount。

/*
 * Create a new mount on behalf of userspace and ask for it to be added to
 * the namespace's tree.
 *
 * Looks up the filesystem type by name, applies the user-namespace
 * restrictions, builds the mount via vfs_kern_mount() and finally attaches
 * it at @path with do_add_mount().  Returns 0 on success or a negative errno.
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct file_system_type *fs_type;
	struct vfsmount *new_mnt;
	int rc;

	if (!fstype)
		return -EINVAL;

	/* Resolve the type name to its file_system_type; holds a reference. */
	fs_type = get_fs_type(fstype);
	if (!fs_type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(fs_type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(fs_type);
			return -EPERM;
		}
		/*
		 * Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 */
		if (!(fs_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
		}
	}

	/*
	 * Build the mount from type, flags, device name and option data.
	 * The mount point is not passed here; it is only used when the
	 * result is attached below.
	 */
	new_mnt = vfs_kern_mount(fs_type, flags, name, data);
	if (!IS_ERR(new_mnt)) {
		if ((fs_type->fs_flags & FS_HAS_SUBTYPE) &&
		    !new_mnt->mnt_sb->s_subtype)
			new_mnt = fs_set_subtype(new_mnt, fstype);
	}

	put_filesystem(fs_type);
	if (IS_ERR(new_mnt))
		return PTR_ERR(new_mnt);

	rc = do_add_mount(real_mount(new_mnt), path, mnt_flags);
	if (rc)
		mntput(new_mnt);
	return rc;
}

由上面的源码可知do_new_mount所做的3件事。

1.根据fstype从全局文件系统类型(file_system_type)链表中找到对应的文件系统类型结构。
2.调用该文件系统类型结构中的mount回调函数执行后续的挂载操作，最终构建一个mount结构体，其中包含vfsmount信息。
3.将得到的mount结构体加入全局文件系统树中。

vfs_kern_mount和do_add_mount是接下来重要的两个步骤，vfs_kern_mount继续解析superblock并填充mnt结构，do_add_mount将创建好的mnt加入到全局文件系统树中。

/*
 * Allocate a struct mount for @name, let mount_fs() produce the root dentry
 * for filesystem @type, and wire the two together.
 *
 * Returns the embedded vfsmount on success.  On failure returns an ERR_PTR:
 * -ENODEV for a NULL @type, -ENOMEM when an allocation fails, or the error
 * carried by mount_fs().
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *new_mnt;
	struct dentry *root_dentry;
	struct vfsmount *failure;

	if (!type)
		return ERR_PTR(-ENODEV);

	new_mnt = alloc_vfsmnt(name);
	if (!new_mnt)
		return ERR_PTR(-ENOMEM);

	/* Optional per-mount private data supplied by the filesystem type. */
	if (type->alloc_mnt_data) {
		new_mnt->mnt.data = type->alloc_mnt_data();
		if (!new_mnt->mnt.data) {
			failure = ERR_PTR(-ENOMEM);
			goto out_free;
		}
	}

	if (flags & MS_KERNMOUNT)
		new_mnt->mnt.mnt_flags = MNT_INTERNAL;

	/* Hand over to mount_fs() to obtain the root dentry. */
	root_dentry = mount_fs(type, flags, name, &new_mnt->mnt, data);
	if (IS_ERR(root_dentry)) {
		/*
		 * NOTE(review): mnt.data may have been allocated above; whether
		 * free_vfsmnt() releases it is not visible here - confirm.
		 */
		failure = ERR_CAST(root_dentry);
		goto out_free;
	}

	/* Fill in the remaining fields; the mount starts as its own parent. */
	new_mnt->mnt.mnt_root = root_dentry;
	new_mnt->mnt.mnt_sb = root_dentry->d_sb;
	new_mnt->mnt_mountpoint = new_mnt->mnt.mnt_root;
	new_mnt->mnt_parent = new_mnt;

	lock_mount_hash();
	list_add_tail(&new_mnt->mnt_instance, &root_dentry->d_sb->s_mounts);
	unlock_mount_hash();

	return &new_mnt->mnt;

out_free:
	mnt_free_id(new_mnt);
	free_vfsmnt(new_mnt);
	return failure;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

主要作用：

alloc_vfsmnt创造一个新的struct mount结构
在mount_fs函数里调用特定文件系统的mount回调函数构造一个root dentry，包含特定文件系统的super block信息
用第二步得到的结果完成对struct mount的构造，返回vfsmount结构。

mount_fs主要就做一件事，调用type->mount回调函数。

mount再往下就是每个文件系统自己实现的mount回调函数了。

前面有讲到相关的mount_bdev，虽然它是一个通用函数，但是其最后一个参数是一个函数指针，yaffs2传入yaffs2_internal_read_super_mtd作为参数。yaffs2_internal_read_super_mtd是yaffs2自己实现的代码，也就是这里还是需要一个每个文件系统各异的处理函数。