Read调用的全程

2019年9月28日 147次阅读来源: xcshuan

当我们在C程序中用到某些库函数进行文件读取操作时，后续的整个过程都是透明的，为了了解文件系统在其中起到了什么作用，又是如何和内核的其他部分进行协作的，我们可以对Read()函数进行追踪，下面的代码均来自linux2.6.11.10版本的内核。

首先，我们写下如下的测试程序，test.c，其中1.txt里只有一句Hello,World。

#include <stdio.h>
#include <stdlib.h> 
/*
 * Read the first whitespace-delimited word from 1.txt and print it.
 *
 * The file is opened in "a+" mode on purpose: that is what yields the
 * O_RDWR|O_CREAT|O_APPEND flags in the strace output discussed in this
 * article.  The stream is likewise not closed explicitly; it is flushed when
 * main() returns, which keeps the traced system calls as shown.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be opened
 * or contains no word to read.
 */
int main() {
    char word[20];
    FILE *fp;

    if ((fp = fopen("1.txt", "a+")) == NULL) {
        fprintf(stderr, "ERROR!");
        exit(EXIT_FAILURE);
    }

    /*
     * Limit the conversion to sizeof(word) - 1 characters so that a long
     * token cannot overflow the buffer, and make sure something was read
     * before printing: on an empty file word would be uninitialised.
     */
    if (fscanf(fp, "%19s", word) != 1) {
        fprintf(stderr, "ERROR!");
        exit(EXIT_FAILURE);
    }

    printf("%s\n", word);
    return 0;
}

然后进行编译，并通过strace工具查看程序运行时用到了哪些系统调用，并将结果输出到hello.txt中。

~/test$ gcc test.c -o hello

~/test$ strace -o hello.txt ./hello

查看hello.txt中的主要内容如下

……
openat(AT_FDCWD, "x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT
……
openat(AT_FDCWD, "1.txt", O_RDWR|O_CREAT|O_APPEND, 0666) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=13, ...}) = 0
read(3, "Hello,World!\n", 4096)         = 13
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
write(1, "Hello,World!\n", 13)          = 13
lseek(3, -1, SEEK_CUR)                  = 12  
exit_group(0)                           = ?

可以看到首先打开了libc.so，这里面封装了我们需要的库函数，而后依次调用了read、write、lseek等系统调用。

我们知道，系统调用有两种实现方式，一种是老旧的int $0x80方式，还有一种是sysenter，具体细节这里不深究，但过程总是先将系统调用号存入eax寄存器，然后陷入内核，这部分实现已经完全封装在库函数里了。进入内核后，会根据系统调用号查系统调用表，比如read的系统调用号是3，那么查表就能找到对应的服务例程。

比如i386处理器的系统调用号局部如下所示

/linux-2.6.11.10/include/asm-i386/unistd.h
#define __NR_restart_syscall      0
#define __NR_exit         1
#define __NR_fork         2
#define __NR_read         3
#define __NR_write        4
#define __NR_open         5
#define __NR_close        6
#define __NR_waitpid      7
#define __NR_creat        8
#define __NR_link         9                          
#define __NR_unlink      10                           
#define __NR_execve      11                            
#define __NR_chdir       12                           
#define __NR_time        13                            
#define __NR_mknod       14                           
#define __NR_chmod       15                           
#define __NR_lchown      16                            
#define __NR_break       17

由上我们看到，调用read的系统调用号为3，在这个文件的下面我们还能看到比较老旧的系统调用实现代码，现在这个功能好像已经放到库中去实现了，不在内核中实现，这里内核版本较老，所以在内核中还能看到，这里用的是通过系统调用需要的参数个数来进行区分的。

/linux-2.6.11.10/include/asm-i386/unistd.h
/*
 * __syscall_return - convert a raw system-call result into the C library
 * convention.
 * @type: return type of the enclosing wrapper function
 * @res:  lvalue holding the raw result of the system call
 *
 * A result in the range [-129, -1] (i.e. >= -(128 + 1) when compared as
 * unsigned long) is treated as a negated error code: errno is set to -res
 * and -1 is returned.  Any other value is returned cast to @type.
 *
 * The macro expands to a `return`, so it must be the last statement of the
 * wrapper.  Nothing may follow the line-continuation backslashes, not even
 * blanks, otherwise the definition is cut short in standard C.
 */
#define __syscall_return(type, res) \
do { \
    if ((unsigned long)(res) >= (unsigned long)(-(128 + 1))) { \
        errno = -(res); \
        res = -1; \
    } \
    return (type) (res); \
} while (0)
/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
/*
 * _syscall0 - define a wrapper function for a system call that takes no
 * arguments.
 * @type: return type of the generated function
 * @name: function name; the system-call number is taken from __NR_<name>
 *
 * The generated function executes "int $0x80" with the system-call number
 * in the "a" register (the "0" input constraint ties it to the same operand
 * as the "=a" output), reads the result back from that register and hands it
 * to __syscall_return().
 *
 * Nothing may follow the line-continuation backslashes, not even blanks.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
    : "=a" (__res) \
    : "0" (__NR_##name)); \
__syscall_return(type,__res); \
}

而之后，read会调用相应的服务例程sys_read，此函数定义如下。

/linux-2.6.11.10/fs/read_write.c
/*
 * sys_read - kernel service routine behind the read() system call.
 * @fd:    file descriptor to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 *
 * Returns whatever vfs_read() returns, or -EBADF when fget_light() yields
 * no file object for @fd.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
    struct file *file;
    ssize_t ret = -EBADF;
    int fput_needed;

    file = fget_light(fd, &fput_needed);   // look up the file object to read from; fput_needed is passed on to fput_light()
    if (file) {
        loff_t pos = file_pos_read(file);  // fetch the file's current offset
        ret = vfs_read(file, buf, count, &pos); // buf is the user-space buffer, count the requested length
        file_pos_write(file, pos);  // store the new offset back into the file
        fput_light(file, fput_needed); // release the file obtained by fget_light()
    }

    return ret;
}
EXPORT_SYMBOL_GPL(sys_read);

该函数首先通过fget_light（light表示轻量级的）通过文件描述符，来返回一个文件地址，类型为虚拟文件系统层的struct file，然后获取文件偏移地址，并调用vfs_read即虚拟文件系统的读操作，从这里我们可以看到，无论底层是什么文件系统，由于有VFS这个中间层存在，对文件进行操作都可以把事情交给VFS来处理，这是抽象的好处。

我们可以在sys_read所在的文件里找到vfs_read。

/linux-2.6.11.10/fs/read_write.c
/*
 * vfs_read - filesystem-independent read entry point.
 * @file:  file object to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   in/out file offset
 *
 * Validates the request and then dispatches to the file's own read method,
 * or to do_sync_read() when only aio_read is provided.
 *
 * Returns -EBADF if the file is not open for reading, -EINVAL if it has no
 * read/aio_read method, -EFAULT if the user buffer fails access_ok(), a
 * non-zero result of rw_verify_area() or security_file_permission(), and
 * otherwise the value returned by the read method.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))  // the file must have been opened for reading
        return -EBADF;
    if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) // the file must provide a read or aio_read method
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) // coarse check that the user buffer is valid for the kernel to write into
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);  // verify the region may be read (original note: checks for locks on it)
    if (!ret) {
        ret = security_file_permission (file, MAY_READ);  // security hook: is reading permitted?
        if (!ret) {
            if (file->f_op->read)
                ret = file->f_op->read(file, buf, count, pos);   // use the filesystem's own read method when it has one
            else
                ret = do_sync_read(file, buf, count, pos);   // otherwise fall back to do_sync_read()
            if (ret > 0) {
                dnotify_parent(file->f_dentry, DN_ACCESS);   // notify the parent directory that the file was accessed
                current->rchar += ret;   // add the bytes read to the current task's counter
            }
            current->syscr++;   // per-task I/O statistics: one more read call
        }
    }
    return ret;
}

EXPORT_SYMBOL(vfs_read);

我们可以看到，vfs_read函数只是检查了一些状态，就使用回调函数 file->f_op->read，使用相应文件系统的read函数继续进行操作，这个file_operations应该是open file的时候就已经填好的，我们可以在/linux-2.6.11.10/fs/ext2/file.c里找到ext2所有的文件操作，如下，其实在新内核里，read和write之类的操作已经改了。

/linux-2.6.11.10/fs/ext2/file.c
/*
 * File operation table for ext2 files.  Apart from ioctl, release and
 * fsync, every entry points at a generic_* routine rather than an
 * ext2-specific one; in particular .read is generic_file_read.
 */
struct file_operations ext2_file_operations = {
    .llseek        = generic_file_llseek,
    .read        = generic_file_read,
    .write        = generic_file_write,
    .aio_read    = generic_file_aio_read,
    .aio_write    = generic_file_aio_write,
    .ioctl        = ext2_ioctl,
    .mmap        = generic_file_mmap,
    .open        = generic_file_open,
    .release    = ext2_release_file,
    .fsync        = ext2_sync_file,
    .readv        = generic_file_readv,
    .writev        = generic_file_writev,
    .sendfile    = generic_file_sendfile,
};

可以看到，ext2的read操作并没有额外定义，而是使用了一个通用文件读函数，在/linux-2.6.11.10/mm/filemap.c文件里可以找到这个函数，因为读写是基于页操作的。

/linux-2.6.11.10/mm/filemap.c
/*
 * generic_file_read - synchronous read built on the generic AIO read path.
 * @filp:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @ppos:  in/out file offset
 *
 * Wraps the single user buffer in a one-element iovec, issues the read via
 * __generic_file_aio_read() and, if that reports -EIOCBQUEUED, waits for the
 * request with wait_on_sync_kiocb().  Returns the resulting value.
 */
ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
    struct iovec local_iov = { .iov_base = buf, .iov_len = count }; // local_iov carries the user buffer and the requested length
    struct kiocb kiocb; // descriptor used for both synchronous and asynchronous I/O
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp); // initialise the descriptor for this file
    ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); // common routine, called here with a single segment
    if (-EIOCBQUEUED == ret)  // the request was queued rather than completed
        ret = wait_on_sync_kiocb(&kiocb);
    return ret;
}
EXPORT_SYMBOL(generic_file_read);

这个函数继续调用了一个通用例程，即__generic_file_aio_read，字面理解就是异步I/O读，它不是立即读取，而是会先在一个链表里排队，如果在排队就需要继续等。

/linux-2.6.11.10/mm/filemap.c
/*
 * __generic_file_aio_read - common read routine operating on an iovec array.
 * @iocb:    I/O control block; iocb->ki_filp is the file being read
 * @iov:     array of user-space buffer segments
 * @nr_segs: number of entries in @iov
 * @ppos:    in/out file offset
 *
 * Works in three stages:
 *   1. validate the segments and add up their lengths;
 *   2. if the file was opened with O_DIRECT, read via
 *      generic_file_direct_IO();
 *   3. otherwise read each segment through do_generic_file_read() with
 *      file_read_actor as the copy routine.
 *
 * Returns the number of bytes read, 0 when nothing was requested, -EINVAL
 * for a negative/overflowing length, -EFAULT if the first segment is not
 * writable, -EIOCBQUEUED for a non-synchronous direct read, or the error
 * recorded in the read descriptor when no bytes were transferred.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t *ppos)
{
    struct file *filp = iocb->ki_filp;  // file object associated with this read
    ssize_t retval;
    unsigned long seg;  
    size_t count;

    count = 0;
    for (seg = 0; seg < nr_segs; seg++) {
        const struct iovec *iv = &iov[seg];

        /*
         * If any segment has a negative length, or the cumulative
         * length ever wraps negative then return -EINVAL.
         */
        count += iv->iov_len;
        if (unlikely((ssize_t)(count|iv->iov_len) < 0))
            return -EINVAL;
        if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))   // check that the user buffer described by this iovec is valid
            continue;
        if (seg == 0)
            return -EFAULT;
        /* A later segment is bad: keep only the segments before it. */
        nr_segs = seg;
        count -= iv->iov_len;    /* This segment is no good */
        break;
    }
    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    if (filp->f_flags & O_DIRECT) {    // direct I/O mode
        loff_t pos = *ppos, size;
        struct address_space *mapping;
        struct inode *inode;

        mapping = filp->f_mapping;
        inode = mapping->host;
        retval = 0;
        if (!count)
            goto out; /* skip atime */
        size = i_size_read(inode);
        if (pos < size) {
            retval = generic_file_direct_IO(READ, iocb,
                        iov, pos, nr_segs);
            if (retval >= 0 && !is_sync_kiocb(iocb))
                retval = -EIOCBQUEUED;
            if (retval > 0)
                *ppos = pos + retval;
        }
        file_accessed(filp);
        goto out;
    }
    retval = 0;    // not direct I/O: go through the page cache
    if (count) {
        for (seg = 0; seg < nr_segs; seg++) {
            read_descriptor_t desc;  // descriptor for this segment's read

            desc.written = 0;
            desc.arg.buf = iov[seg].iov_base;  // user buffer
            desc.count = iov[seg].iov_len;   // number of bytes to read
            if (desc.count == 0)
                continue;
            desc.error = 0;
            do_generic_file_read(filp,ppos,&desc,file_read_actor); // read the file data for this segment
            retval += desc.written;
            if (!retval) {
                /* Nothing transferred so far: report the descriptor's error and stop. */
                retval = desc.error;
                break;
            }
        }
    }
out:
    return retval;
}
EXPORT_SYMBOL(__generic_file_aio_read);

我们可以将上面这个函数粗略划分为三部分，检查部分，以及直接I/O读取，和页高速缓存读取，如果设置了O_DIRECT标志，则直接读取调用generic_file_direct_IO()，否则要使用页高速缓存，调用do_generic_file_read，我们主要关注页高速缓存读取。

/linux-2.6.11.10/include/linux/fs.h
/*
 * do_generic_file_read - read from a file through its address space.
 * @filp:  file to read from
 * @ppos:  in/out file offset
 * @desc:  read descriptor (destination buffer, remaining count, status)
 * @actor: routine that copies data out of each page
 *
 * Thin wrapper: forwards to do_generic_mapping_read() using the file's own
 * f_mapping and f_ra members.
 */
static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
                    read_descriptor_t * desc,
                    read_actor_t actor)
{
    do_generic_mapping_read(filp->f_mapping,
                &filp->f_ra,
                filp,
                ppos,
                desc,
                actor);
}

do_generic_file_read会继续调用do_generic_mapping_read，这个调用表示对文件的读操作转换为对页高速缓存的读操作。

之所以要在I/O过程中加入页高速缓存这么一个缓冲层，是为了提高读取的效率，我们希望能尽量减少对磁盘的读取，而将读取放到内存中进行，所以引入页高速缓存这么一个中间层。

上面的参数中有一个filp->f_mapping，这个是一个地址空间变量，其定义如下。

/linux-2.6.11.10/include/linux/fs.h
/*
 * struct address_space - ties an owner (host) to its pages.
 *
 * host names the owning inode or block device and page_tree is the root of
 * the radix tree holding all of its pages; a_ops is the table of methods
 * (such as readpage) used to operate on those pages.
 */
struct address_space {
    struct inode        *host;        /* owner: inode, block_device */
    struct radix_tree_root    page_tree;    /* radix tree of all pages */
    spinlock_t        tree_lock;    /* and spinlock protecting it */
    unsigned int        i_mmap_writable;/* count VM_SHARED mappings */
    struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */
    struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
    spinlock_t        i_mmap_lock;    /* protect tree, count, list */
    unsigned int        truncate_count;    /* Cover race condition with truncate */
    unsigned long        nrpages;    /* number of total pages */
    pgoff_t            writeback_index;/* writeback starts here */
    struct address_space_operations *a_ops;    /* methods */
    unsigned long        flags;        /* error bits/gfp mask */
    struct backing_dev_info *backing_dev_info; /* device readahead, etc */
    spinlock_t        private_lock;    /* for use by the address_space */
    struct list_head    private_list;    /* ditto */
    struct address_space    *assoc_mapping;    /* ditto */
} __attribute__((aligned(sizeof(long))));

通过host和page_tree两个属性，一个address_space结构体可以将一个文件和属于它的缓存页联系起来，page_tree是struct radix_tree_root类型的，就是一棵树的根，它指向一棵基树，相应的页都存在叶子节点上，这样找页就很简单了。

《Read调用的全程》

在do_generic_mapping_read里，检查完基础数据后，会建立一个循环，这个循环每次读一页内容，直到读完所有内容。

首先是find_page，它会通过关联有页的基树找到相应的页，如果没找到，就跳到no_cached_page重新分配一个页插入到基树里去；如果找到了但页中的数据不是最新的（PageUptodate为假，注意这并不是“脏页”），则需要先从磁盘读入进行更新；如果既能找到，数据又是最新的，那么直接跳到page_ok将数据拷贝到用户态即可。

/linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
/* Excerpt from do_generic_mapping_read(): locate the page for the current index. */
find_page:
        page = find_get_page(mapping, index);  // look the page up in the page cache first
        if (unlikely(page == NULL)) {
            /* Not cached: report the miss (presumably read-ahead bookkeeping) and go allocate a page. */
            handle_ra_miss(mapping, &ra, index);
            goto no_cached_page;
        }
        if (!PageUptodate(page)) // cached, but its contents are not up to date (this is not a dirty-page test)
            goto page_not_up_to_date;

找到页以后开始读页，主要的重点语句是这一句

linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
/*
 * Excerpt from do_generic_mapping_read(): fill the page through the
 * filesystem's readpage method, then work out how many of its bytes may be
 * copied to the caller.
 */
readpage:
        /* Start the actual read. The read will unlock the page. */
        error = mapping->a_ops->readpage(filp, page);
        if (unlikely(error))
            goto readpage_error;
        if (!PageUptodate(page)) {
            /* Take the page lock and re-check; presumably this waits for the read to complete. */
            lock_page(page);
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                    /*
                     * invalidate_inode_pages got it
                     */
                    unlock_page(page);
                    page_cache_release(page);
                    goto find_page;
                }
                /* Still attached to a mapping but not up to date: report an I/O error. */
                unlock_page(page);
                error = -EIO;
                goto readpage_error;
            }
            unlock_page(page);
        }
        /*
         * i_size must be checked after we have done ->readpage.
         *
         * Checking i_size after the readpage allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */
        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(!isize || index > end_index)) {
            /* Empty file, or the page lies beyond the last page of the file. */
            page_cache_release(page);
            goto out;
        }
        /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_CACHE_SIZE;
        if (index == end_index) {
            /* Last page of the file: only the bytes within i_size count. */
            nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
            if (nr <= offset) {
                page_cache_release(page);
                goto out;
            }
        }
        nr = nr - offset;
        goto page_ok;

这里又出现了一个回调函数，又调用了相关文件系统的相关函数，我们可以找到ext2的a_ops定义如下：

/linux-2.6.11.10/fs/ext2/inode.c
/*
 * Address-space operation table for ext2.  The .readpage entry,
 * ext2_readpage, is what the mapping->a_ops->readpage() call resolves to
 * for an ext2 file.
 */
struct address_space_operations ext2_aops = {
    .readpage        = ext2_readpage,
    .readpages        = ext2_readpages,
    .writepage        = ext2_writepage,
    .sync_page        = block_sync_page,
    .prepare_write        = ext2_prepare_write,
    .commit_write        = generic_commit_write,
    .bmap            = ext2_bmap,
    .direct_IO        = ext2_direct_IO,
    .writepages        = ext2_writepages,
};

再找到ext2_readpage开始我们的读页操作。

/linux-2.6.11.10/fs/ext2/inode.c
/*
 * ext2_readpage - readpage method for ext2.
 * @file: file being read (not used here)
 * @page: page to fill
 *
 * Delegates to the generic mpage_readpage(), supplying ext2_get_block as
 * the block-lookup callback.  Returns mpage_readpage()'s result.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
    return mpage_readpage(page, ext2_get_block);
}

这里它又继续调用了一个通用例程mpage_readpage，传入了页地址以及ext2的数据块寻址函数。

/linux-2.6.11.10/fs/mpage.c
/*
 * mpage_readpage - generic single-page read.
 * @page:      page to fill
 * @get_block: filesystem callback passed through to do_mpage_readpage()
 *
 * Calls do_mpage_readpage() for the one page and, if it hands back a bio,
 * submits that bio as a READ via mpage_bio_submit().  Always returns 0.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
    struct bio *bio = NULL;
    sector_t last_block_in_bio = 0;

    bio = do_mpage_readpage(bio, page, 1,
            &last_block_in_bio, get_block);
    if (bio)
        mpage_bio_submit(READ, bio);
    return 0;
}
EXPORT_SYMBOL(mpage_readpage);

这里就两步操作，申请一个struct bio对象，然后提交这个任务。bio是通用块层用来管理传输数据的，它把一个磁盘存储区和一块内存区域联系起来。

然后提交这个任务，这里面其实还有一个调度过程，所有的bio请求都在一个队列里，它可以重排读写数据块的请求，在重复访问文件同一个部分或多进程访问同一数据，可以大大提高读取效率。

最终，这个读操作会交给磁盘的设备驱动程序来进行真正的数据操作。

读完以后，再回到do_generic_mapping_read，跳到page_ok，它会通过actor回调（这里传入的是file_read_actor）调用__copy_to_user()函数，将数据拷贝到用户态缓冲区：

linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
/*
 * Excerpt from do_generic_mapping_read(): the page is present and up to
 * date, so hand it to the actor for copying and advance the position.
 */
page_ok:
        /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);
        /*
         * When (part of) the same page is read multiple times
         * in succession, only mark it as accessed the first time.
         */
        if (prev_index != index)
            mark_page_accessed(page);
        prev_index = index;
        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         *
         * The actor routine returns how many bytes were actually used..
         * NOTE! This may not be the same as how much of a user buffer
         * we filled up (we may be padding etc), so we can only update
         * "pos" here (the actor routine has to update the user buffer
         * pointers and the remaining count).
         */
        ret = actor(desc, page, offset, nr);
        /* Advance by the bytes consumed: whole pages carry into index, the remainder stays in offset. */
        offset += ret;
        index += offset >> PAGE_CACHE_SHIFT;
        offset &= ~PAGE_CACHE_MASK;
        page_cache_release(page);
        /* Everything offered was consumed and more is still wanted: next loop iteration. */
        if (ret == nr && desc->count)
            continue;
        goto out;

linux-2.6.11.10/mm/filemap.c
/*
 * file_read_actor - copy data from a page to the user buffer of a read
 * descriptor.
 * @desc:   read descriptor; arg.buf is the destination, count the bytes
 *          still wanted
 * @page:   source page
 * @offset: byte offset of the data within @page
 * @size:   number of bytes available from @page
 *
 * Copies at most min(@size, desc->count) bytes.  It first tries an atomic
 * mapping with __copy_to_user_inatomic(); if that cannot be used or leaves
 * bytes uncopied, it repeats the copy with kmap() and __copy_to_user().
 * A short copy on the slow path sets desc->error to -EFAULT.
 *
 * On return desc->count, desc->written and desc->arg.buf have been advanced
 * by the number of bytes copied, which is also the return value.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page,
            unsigned long offset, unsigned long size)
{
    char *kaddr;
    unsigned long left, count = desc->count;

    /* Never copy more than the caller still wants. */
    if (size > count)
        size = count;

    /*
     * Faults on the destination of a read are common, so do it before
     * taking the kmap.
     */
    if (!fault_in_pages_writeable(desc->arg.buf, size)) {
        kaddr = kmap_atomic(page, KM_USER0);
        left = __copy_to_user_inatomic(desc->arg.buf,
                        kaddr + offset, size);
        kunmap_atomic(kaddr, KM_USER0);
        if (left == 0)
            goto success;
    }

    /* Do it the slow way */
    kaddr = kmap(page);
    left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
    kunmap(page);

    if (left) {
        /* Partial copy: count only what was transferred and record the fault. */
        size -= left;
        desc->error = -EFAULT;
    }
success:
    desc->count = count - size;
    desc->written += size;
    desc->arg.buf += size;
    return size;
}

然后更新一些计数，再一步步往上返回到最开始的read()系统调用，调用就结束了。

    原文作者：xcshuan
    原文地址: https://segmentfault.com/a/1190000016820890
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。