From: Andrea Arcangeli <andrea@cpushare.com>
Subject: seccomp for 2.6.11-rc3

Add seccomp mode to safely compute untrusted code.

Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>

--- 2.6.11-rc3/arch/i386/Kconfig	2005-02-11 03:51:27.962342554 +0100
+++ seccomp/arch/i386/Kconfig	2005-02-11 03:56:40.050875895 +0100
@@ -888,6 +888,23 @@ config REGPARM
 	generate incorrect output with certain kernel constructs when
 	-mregparm=3 is used.
 
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	depends on PROC_FS
+	default y
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
+	  If unsure, say Y. Only embedded should say N here.
+
 endmenu
 
 
--- 2.6.11-rc3/arch/i386/kernel/entry.S	2005-02-11 03:51:28.158278238 +0100
+++ seccomp/arch/i386/kernel/entry.S	2005-02-11 03:56:40.049876223 +0100
@@ -219,7 +219,8 @@ sysenter_past_esp:
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -243,7 +244,8 @@ ENTRY(system_call)
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
--- 2.6.11-rc3/arch/i386/kernel/ptrace.c	2005-02-11 03:51:28.186269050 +0100
+++ seccomp/arch/i386/kernel/ptrace.c	2005-02-11 03:56:40.049876223 +0100
@@ -15,6 +15,7 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
+#include <linux/seccomp.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -678,6 +679,9 @@ void send_sigtrap(struct task_struct *ts
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	/* do the secure computing check first */
+	secure_computing(regs->orig_eax);
+
 	if (unlikely(current->audit_context)) {
 		if (!entryexit)
 			audit_syscall_entry(current, regs->orig_eax,
--- 2.6.11-rc3/arch/x86_64/ia32/ia32entry.S	2005-02-11 03:51:31.864061813 +0100
+++ seccomp/arch/x86_64/ia32/ia32entry.S	2005-02-11 03:56:40.051875566 +0100
@@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz  sysenter_tracesys
 sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls),%eax
@@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz   cstar_tracesys
 cstar_do_call:	
 	cmpl $IA32_NR_syscalls,%eax
@@ -236,7 +236,7 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls),%eax
--- 2.6.11-rc3/arch/x86_64/Kconfig	2005-02-11 03:51:31.817077236 +0100
+++ seccomp/arch/x86_64/Kconfig	2005-02-11 03:56:40.052875238 +0100
@@ -350,6 +350,24 @@ config X86_MCE_INTEL
 	help
 	   Additional support for intel specific MCE features such as
 	   the thermal monitor.
+
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	depends on PROC_FS
+	default y
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
+	  If unsure, say Y. Only embedded should say N here.
+
 endmenu
 
 #
--- 2.6.11-rc3/arch/x86_64/kernel/entry.S	2005-02-11 03:51:31.914045406 +0100
+++ seccomp/arch/x86_64/kernel/entry.S	2005-02-11 03:56:40.052875238 +0100
@@ -184,7 +184,7 @@ ENTRY(system_call)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)  
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
--- 2.6.11-rc3/arch/x86_64/kernel/ptrace.c	2005-02-11 03:51:31.962029655 +0100
+++ seccomp/arch/x86_64/kernel/ptrace.c	2005-02-11 03:56:40.052875238 +0100
@@ -17,6 +17,7 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
+#include <linux/seccomp.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -521,6 +522,9 @@ static void syscall_trace(struct pt_regs
 
 asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 {
+	/* do the secure computing check first */
+	secure_computing(regs->orig_rax);
+
 	if (unlikely(current->audit_context))
 		audit_syscall_entry(current, regs->orig_rax,
 				    regs->rdi, regs->rsi,
--- 2.6.11-rc3/fs/proc/base.c	2005-02-11 03:51:42.431592978 +0100
+++ seccomp/fs/proc/base.c	2005-02-11 03:56:40.054874581 +0100
@@ -32,6 +32,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/seccomp.h>
 #include "internal.h"
 
 /*
@@ -49,6 +50,9 @@ enum pid_directory_inos {
 	PROC_TGID_TASK,
 	PROC_TGID_STATUS,
 	PROC_TGID_MEM,
+#ifdef CONFIG_SECCOMP
+	PROC_TGID_SECCOMP,
+#endif
 	PROC_TGID_CWD,
 	PROC_TGID_ROOT,
 	PROC_TGID_EXE,
@@ -80,6 +84,9 @@ enum pid_directory_inos {
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
+#ifdef CONFIG_SECCOMP
+	PROC_TID_SECCOMP,
+#endif
 	PROC_TID_CWD,
 	PROC_TID_ROOT,
 	PROC_TID_EXE,
@@ -130,6 +137,9 @@ static struct pid_entry tgid_base_stuff[
 	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_SECCOMP
+	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
+#endif
 	E(PROC_TGID_CWD,       "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
@@ -160,6 +170,9 @@ static struct pid_entry tid_base_stuff[]
 	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_SECCOMP
+	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
+#endif
 	E(PROC_TID_CWD,        "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
@@ -808,6 +821,61 @@ static struct file_operations proc_login
 };
 #endif
 
+#ifdef CONFIG_SECCOMP
+static ssize_t seccomp_read(struct file * file, char * buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20];
+	loff_t __ppos = *ppos;
+	size_t len;
+
+	/* no need to print the trailing zero, so use only len */
+	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+	if (__ppos >= len)
+		return 0;
+	if (count > len-__ppos)
+		count = len-__ppos;
+	if (copy_to_user(buf, __buf + __ppos, count))
+		return -EFAULT;
+	*ppos = __ppos + count;
+	return count;
+}
+
+static ssize_t seccomp_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20], * end;
+	unsigned int seccomp_mode;
+
+	/* can set it only once to be even more secure */
+	if (unlikely(tsk->seccomp.mode))
+		return -EPERM;
+
+	memset(__buf, 0, 20);
+	if (count > 19)
+		count = 19;
+	if (copy_from_user(__buf, buf, count))
+		return -EFAULT;
+	seccomp_mode = simple_strtoul(__buf, &end, 0);
+	if (*end == '\n')
+		end++;
+	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+		tsk->seccomp.mode = seccomp_mode;
+		set_tsk_thread_flag(tsk, TIF_SECCOMP);
+	}
+	if (unlikely(!(end - __buf)))
+		return -EIO;
+	return end - __buf;
+}
+
+static struct file_operations proc_seccomp_operations = {
+	.read		= seccomp_read,
+	.write		= seccomp_write,
+};
+#endif /* CONFIG_SECCOMP */
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1443,6 +1511,12 @@ static struct dentry *proc_pident_lookup
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
+#ifdef CONFIG_SECCOMP
+		case PROC_TID_SECCOMP:
+		case PROC_TGID_SECCOMP:
+			inode->i_fop = &proc_seccomp_operations;
+			break;
+#endif /* CONFIG_SECCOMP */
 		case PROC_TID_MOUNTS:
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
--- 2.6.11-rc3/include/asm-i386/thread_info.h	2005-02-11 03:51:42.979413152 +0100
+++ seccomp/include/asm-i386/thread_info.h	2005-02-11 03:56:40.054874581 +0100
@@ -140,6 +140,7 @@ register unsigned long current_stack_poi
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+#define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17
 
@@ -150,12 +151,14 @@ register unsigned long current_stack_poi
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
-#define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
+/* work to do on any return to u-space */
+#define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP)
 
 /*
  * Thread-synchronous status.
--- 2.6.11-rc3/include/asm-x86_64/thread_info.h	2005-02-11 03:51:43.624201496 +0100
+++ seccomp/include/asm-x86_64/thread_info.h	2005-02-11 03:56:40.055874253 +0100
@@ -102,6 +102,7 @@ static inline struct thread_info *stack_
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+#define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -115,6 +116,7 @@ static inline struct thread_info *stack_
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
@@ -122,9 +124,9 @@ static inline struct thread_info *stack_
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
 /* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK 0x0000FFFF	
+#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 
 #define PREEMPT_ACTIVE     0x10000000
 
--- 2.6.11-rc3/include/linux/sched.h	2005-02-11 03:51:43.951094191 +0100
+++ seccomp/include/linux/sched.h	2005-02-11 03:56:40.056873925 +0100
@@ -32,6 +32,7 @@
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
+#include <linux/seccomp.h>
 
 struct exec_domain;
 
@@ -643,6 +644,7 @@ struct task_struct {
 	
 	void *security;
 	struct audit_context *audit_context;
+	seccomp_t seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
--- 2.6.11-rc3/include/linux/seccomp.h	1970-01-01 01:00:00.000000000 +0100
+++ seccomp/include/linux/seccomp.h	2005-02-11 03:56:40.059872940 +0100
@@ -0,0 +1,33 @@
+#ifndef _LINUX_SECCOMP_H
+#define _LINUX_SECCOMP_H
+
+#include <linux/config.h>
+
+#ifdef CONFIG_SECCOMP
+
+#define NR_SECCOMP_MODES 1
+
+#include <linux/thread_info.h>
+
+typedef struct { int mode; } seccomp_t;
+
+extern void __secure_computing(int);
+static inline void secure_computing(int this_syscall)
+{
+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+		__secure_computing(this_syscall);
+}
+
+#else /* CONFIG_SECCOMP */
+
+#if (__GNUC__ > 2)
+  typedef struct { } seccomp_t;
+#else
+  typedef struct { int gcc_is_buggy; } seccomp_t;
+#endif
+
+#define secure_computing(x) do { } while (0)
+
+#endif /* CONFIG_SECCOMP */
+
+#endif /* _LINUX_SECCOMP_H */
--- 2.6.11-rc3/kernel/Makefile	2005-01-04 01:13:30.000000000 +0100
+++ seccomp/kernel/Makefile	2005-02-11 03:56:40.059872940 +0100
@@ -26,6 +26,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_SECCOMP) += seccomp.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
--- 2.6.11-rc3/kernel/seccomp.c	1970-01-01 01:00:00.000000000 +0100
+++ seccomp/kernel/seccomp.c	2005-02-11 03:56:40.060872612 +0100
@@ -0,0 +1,74 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+#include <asm/unistd.h>
+#ifdef TIF_IA32
+#include <asm/ia32_unistd.h>
+#endif
+
+/* #define SECCOMP_DEBUG 1 */
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+	__NR_read, __NR_write, __NR_exit,
+	/*
+	 * Allow either sigreturn or rt_sigreturn, newer archs
+	 * like x86-64 only defines __NR_rt_sigreturn.
+	 */
+#ifdef __NR_sigreturn
+	__NR_sigreturn,
+#else
+	__NR_rt_sigreturn,
+#endif
+	0, /* null terminated */
+};
+
+#ifdef TIF_IA32
+static int mode1_syscalls_32bit[] = {
+	__NR_ia32_read, __NR_ia32_write, __NR_ia32_exit,
+	/*
+	 * Allow either sigreturn or rt_sigreturn, newer archs
+	 * like x86-64 only defines __NR_rt_sigreturn.
+	 */
+	__NR_ia32_sigreturn,
+	0, /* null terminated */
+};
+#endif
+
+void __secure_computing(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+	int * syscall;
+
+	switch (mode) {
+	case 1:
+		syscall = mode1_syscalls;
+#ifdef TIF_IA32
+		if (test_thread_flag(TIF_IA32))
+			syscall = mode1_syscalls_32bit;
+#endif
+		do {
+			if (*syscall == this_syscall)
+				return;
+		} while (*++syscall);
+		break;
+	default:
+		BUG();
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);
+}
