CVE-2022-0847-DirtyPipe分析

0收藏

0点赞

浏览量：114

2022-03-25

漏洞成因
pipe维护了一个struct pipe_buffer的数组，每个pipe_buffer指向一个page，page里存的就是pipe的数据
正常情况下，往pipe里写数据时会申请一个page，把数据拷贝到page里后再让pipe_buffer指向这个page。splice系统调用实现了一种零拷贝的技术，直接让pipe_buffer指向这个原始的数据page，这样就省去了内存拷贝的过程，提升效率
往pipe里写数据时不可能每次都正好是page_size的整数倍，如果每次写数据都要重新分配一个新的page来存，必然会造成空间的浪费。但是如果pipe_buffer的PIPE_BUF_FLAG_CAN_MERGEflag被置位，数据就会接着上一次的数据在同一个page中写入，而不是申请新的page，减少了空间的浪费
但是splice在给pipe_buffer赋值时没有初始化flag，这就造成之前被置位的PIPE_BUF_FLAG_CAN_MERGEflag不会被清除，所以只要先让所有的pipe_buffer的PIPE_BUF_FLAG_CAN_MERGEflag被置位，然后调用splice让pipe_buffer指向目标文件page cache，这时再向pipe里写数据就会直接修改page cache里的内容，造成任意文件覆盖漏洞
源码分析
以下源码来自Linux5.8.1

pipe
关键数据结构
pipe_inode_info
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
/*
 * head and tail are ring positions, not direct array indices: pipe_write()
 * and copy_page_to_iter_pipe() mask them with (ring_size - 1) before
 * indexing bufs.
 */
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
/* pipe_write() takes its next data page from here before falling back to alloc_page() */
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
/* ring of pipe_buffer slots; each slot references one page of pipe data */
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};
其中bufs是一个struct pipe_buffer的数组，默认数量为16，每个pipe_buffer能存储一个page的数据。这16个page组成一个环形缓冲区，用来存储管道里的数据。

pipe_buffer
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
/* pipe_write() appends merged data at offset + len inside the page */
unsigned int offset, len;
const struct pipe_buf_operations *ops;
/*
 * pipe_write() sets this to PIPE_BUF_FLAG_PACKET or PIPE_BUF_FLAG_CAN_MERGE
 * when it fills a slot. copy_page_to_iter_pipe() assigns page, offset, len
 * and ops but never writes this field, so the slot's previous value survives.
 */
unsigned int flags;
unsigned long private;
};
PIPE_BUF_FLAG_CAN_MERGE就包含在flags字段中，它将影响page指向的内存页

写pipe
调用write向pipe里写数据时会经过层层调用，最终实际调用pipe_write

/*
 * pipe_write - write the data described by @from into the pipe behind @iocb.
 *
 * Visible flow: a zero-length write returns 0 immediately. Otherwise, under the
 * pipe lock, the function first tries to append the sub-page remainder of the
 * write to the most recently filled buffer (only when that buffer carries
 * PIPE_BUF_FLAG_CAN_MERGE and the data fits in its page), then loops filling
 * fresh pages one ring slot at a time.
 *
 * NOTE: the end of the loop and the "out" label are elided ("......") in this
 * excerpt, so the final return path is not shown here.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;

/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;

__pipe_lock(pipe);

// With no readers left, raise SIGPIPE and fail with -EPIPE
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}

#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
ret = -EXDEV;
goto out;
}
#endif

/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
* page-aligs the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1); // low bits of total_len: its remainder relative to the page size
// Only attempt a merge when there is a remainder and the pipe is not empty
if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
// The slot just before head (the most recently filled buffer) is the merge target
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;

// Merge only if PIPE_BUF_FLAG_CAN_MERGE is set and chars more bytes still fit in the page
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;

// Copy chars bytes into the existing page, right after the data already in it
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}

buf->len += ret;
// Nothing left to write: done
if (!iov_iter_count(from))
goto out;
}
}

for (;;) {
// Re-check on every iteration that the pipe still has readers
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}

head = pipe->head;
// Proceed only if the pipe is not full
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page; // reuse the page cached in tmp_page, if any
int copied;

// No cached page: allocate one with alloc_page() and remember it in tmp_page
if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}

/* Allocate a slot in the ring in advance and attach an
* empty buffer. If we fault or otherwise fail to use
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
// Take the spinlock of the reader wait queue (rd_wait.lock)
spin_lock_irq(&pipe->rd_wait.lock);

head = pipe->head;
// The pipe filled up in the meantime: drop the lock and retry the loop
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

// Claim the slot by advancing head first
pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);// release the spinlock

/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page; // attach the page obtained above to this slot
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
// is_packetized() is not shown in this excerpt; presumably it is true when the pipe
// was created with O_DIRECT (confirm against fs/pipe.c). In every other case the slot
// gets PIPE_BUF_FLAG_CAN_MERGE, so an ordinary pipe marks each buffer filled here as mergeable.
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL; // the page now belongs to the slot, so clear tmp_page

// Copy up to PAGE_SIZE bytes from the iterator into the new page
copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;

if (!iov_iter_count(from))
break;
}
......
}
splice
splice系统调用主要由do_splice函数完成，do_splice根据输入的文件描述符进入不同的分支，在本次漏洞利用中因为in是普通文件，out是pipe，所以会进入if (opipe)这个分支

/*
* Determine where to splice to/from.
*
* Dispatches on which of @in / @out is a pipe. Only the "out is a pipe"
* branch is shown in full here; the other two branches are elided ("......").
* Returns -EBADF if @in is not readable or @out is not writable, and -EINVAL
* if no branch applies.
*/
long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
long ret;

if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
return -EBADF;

ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);

// both in and out are pipes
if (ipipe && opipe) {
......
}

// only in is a pipe
if (ipipe) {
......
}
// only out is a pipe
if (opipe) {
// Resolve the offsets: an output offset is rejected, the input offset comes
// from user space (off_in) or from the file position
if (off_out)
return -ESPIPE;
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
} else {
offset = in->f_pos;
}

if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;

pipe_lock(opipe);
// Wait until the pipe has a free buffer (wait_for_space() is not shown in this excerpt)
ret = wait_for_space(opipe, flags);
if (!ret) {
unsigned int p_space;

/* Don't try to read more the pipe has space for. */
p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);// free slots left in the pipe
len = min_t(size_t, len, p_space << PAGE_SHIFT);// cap the length at the pipe's free space

ret = do_splice_to(in, &offset, opipe, len, flags); // do_splice_to() does the main work
}
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
if (!off_in)
in->f_pos = offset;
else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;

return ret;
}

return -EINVAL;
}
在do_splice_to中又调用了输入文件的splice_read函数，之后又经过一系列的调用，最终由copy_page_to_iter_pipe完成关联page cache和pipe缓冲区的工作

tatic size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;

if (unlikely(bytes > i->count))
bytes = i->count;

if (unlikely(!bytes))
return 0;

if (!sanity(i))
return 0;

off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];
if (off) {
// 如果要求的offset和实际的offset相同，且头部的buffer指向的就是当前的page cache
// 则直接移动offset即可
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;

buf->ops = &page_cache_pipe_buf_ops;
// 增加page的应用计数
get_page(page);
// 将pipe缓冲区的page指针指向文件的page cache
buf->page = page;
buf->offset = offset;
buf->len = bytes;

pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}
可以看到copy_page_to_iter_pipe函数直接将page cache赋值给对应buffer的page指针，而没有对buffer的flags做初始化，使得之前被设置的PIPE_BUF_FLAG_CAN_MERGE仍然有效

能真正地覆盖文件内容吗
由上面的漏洞分析可知，最终完成的是对page cache的覆写，而不是覆盖磁盘上的文件内容。当文件的page cache存在时，之后读取该文件都将直接从page cache中获取，所以只要该page cache存在，就相当于覆盖了文件内容。经测试，只要重启系统后page cache就会消失，此时再读取文件将会得到原文件内容。

但是page cache不是有writeback机制吗，只要触发该机制是不是就能将覆写后的page cache写回磁盘呢？

为了验证这个问题，我调用sync来手动触发writeback

观察程序输出结果发现，调用sync之后读取文件内容仍然是篡改过后的内容，看起来sync似乎真的把page cache里写回到了磁盘里

但当我重启系统之后发现文件内容又复原了，说明sync既没有把page cache写回到磁盘，也没有清除缓存中的内容，相当于直接忽略了这个被篡改过的page，这是为什么呢？

经过调试发现，在向普通文件写入数据时，调用的是generic_file_write_iter函数

经过如下图所示的调用，最终会调用set_page_dirty函数将该page置为dirty状态，所以最终会被writeback机制写回到磁盘中

正如源码分析中所说的，向pipe中写入数据时调用的是pipe_write，这时我给set_page_dirty函数设置断点发现，程序之后都没有调用这个函数，这点从源码中也可以证明。

这说明当我们利用漏洞修改page cache中的内容时，系统并没有将对应的page设置为dirty，所以这个修改对writeback机制来说是不可见的，自然会被忽略掉。

那为什么重启系统文件内容又会恢复呢？那是因为重启系统将所有的缓存都回收了，执行echo 1 > /proc/sys/vm/drop_caches命令能手动回收缓存，也能将文件内容恢复

参考资料
The Dirty Pipe Vulnerability

CVE-2022-0847-DirtyPipe-Exploit

Linux5.8.1源码

CVE-2022-0847 漏洞分析

VFS源码分析-Page Cache Writeback机制

（来源：先知社区）

（原文链接：https://xz.aliyun.com/t/11038）

CVE-2022-0847-DirtyPipe分析

发表评论