2.6.12-rc4-mm2 - sleeping function called from invalid context at mm/slab.c:2502

Chris Wright chrisw at osdl.org
Thu May 19 08:30:08 UTC 2005


* David Woodhouse (dwmw2 at infradead.org) wrote:
> On Tue, 2005-05-17 at 18:04 +0100, David Woodhouse wrote:
> > I'm really not fond of the refcount trick -- I suspect I'd be happier if
> > we were just to try to keep track of sk_rmem_alloc so we never hit the
> > condition in netlink_attachskb() which might cause it to fail.
> 
> Or even better, use a kernel thread and set an infinite timeout so it'll
> never fail...

Here it is against the git tree.  It doesn't work quite right yet,
but it compiles.  It doesn't work against my own auditd, which likely
means the nl header is slightly off.
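
Roughly, the shape of the thing (an untested sketch using the same names
as the patch below; the patch open-codes the wait with a waitqueue entry,
which is collapsed here to wait_event_interruptible()/kthread_should_stop()
purely for brevity):

#include <linux/kthread.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <net/sock.h>

/* The usual globals from kernel/audit.c. */
static struct sock *audit_sock;
static int audit_pid;

/* Log-time code no longer touches the socket: it formats an skb,
 * queues it here and wakes kauditd. */
static struct sk_buff_head audit_skb_queue;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);

static int kauditd_thread(void *dummy)
{
	struct sk_buff *skb;

	while (!kthread_should_stop()) {
		wait_event_interruptible(kauditd_wait,
					 skb_queue_len(&audit_skb_queue));
		while ((skb = skb_dequeue(&audit_skb_queue)) != NULL) {
			/* Skbs are only queued while a daemon is registered
			 * (audit_log_move falls back to printk otherwise).
			 * audit_sock->sk_sndtimeo is MAX_SCHEDULE_TIMEOUT,
			 * set in audit_init(), so this sleeps rather than
			 * failing with -EAGAIN; only -ECONNREFUSED (the
			 * daemon went away) is left to deal with. */
			if (netlink_unicast(audit_sock, skb, audit_pid, 0) < 0)
				audit_pid = 0;
		}
	}
	return 0;
}

Because audit records are now only ever pushed to the socket from a thread
that is always allowed to sleep, the -EAGAIN/backlog retry dance in
audit_log_drain() goes away entirely; the logging paths just do
skb_queue_tail() and wake_up_interruptible(), whatever context they're in.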

kernel/audit.c: e6d88635032c3d298fc30d5e1a55903491ca189b
--- k/kernel/audit.c
+++ l/kernel/audit.c
@@ -46,6 +46,7 @@
 #include <asm/types.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #include <linux/audit.h>
 
@@ -77,7 +78,6 @@ static int	audit_rate_limit;
 
 /* Number of outstanding audit_buffers allowed. */
 static int	audit_backlog_limit = 64;
-static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
 
 /* The identity of the user shutting down the audit system. */
 uid_t		audit_sig_uid = -1;
@@ -95,19 +95,17 @@ static atomic_t    audit_lost = ATOMIC_I
 /* The netlink socket. */
 static struct sock *audit_sock;
 
-/* There are two lists of audit buffers.  The txlist contains audit
- * buffers that cannot be sent immediately to the netlink device because
- * we are in an irq context (these are sent later in a tasklet).
- *
- * The second list is a list of pre-allocated audit buffers (if more
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_txlist_lock);
 static DEFINE_SPINLOCK(audit_freelist_lock);
 static int	   audit_freelist_count = 0;
-static LIST_HEAD(audit_txlist);
 static LIST_HEAD(audit_freelist);
 
+static struct sk_buff_head audit_skb_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
 /* There are three lists of rules -- one to search at task creation
  * time, one to search at syscall entry time, and another to search at
  * syscall exit time. */
@@ -136,14 +134,18 @@ static DECLARE_MUTEX(audit_netlink_sem);
  * use simultaneously. */
 struct audit_buffer {
 	struct list_head     list;
-	struct sk_buff       *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
+	int		     len;	/* used area of tmp */
+	int		     size;	/* size of tmp */
+	char		     *tmp;	/* Always NUL-terminated */
+	int		     type;
+	int		     pid;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 {
-	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
-	nlh->nlmsg_pid = pid;
+	if (ab)
+		ab->pid = pid;
 }
 
 struct audit_entry {
@@ -224,10 +226,8 @@ void audit_log_lost(const char *message)
 
 	if (print) {
 		printk(KERN_WARNING
-		       "audit: audit_lost=%d audit_backlog=%d"
-		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
 		       atomic_read(&audit_lost),
-		       atomic_read(&audit_backlog),
 		       audit_rate_limit,
 		       audit_backlog_limit);
 		audit_panic(message);
@@ -281,6 +281,37 @@ static int audit_set_failure(int state, 
 	return old;
 }
 
+int kauditd_thread(void *dummy)
+{
+	struct sk_buff *skb;
+
+	while (1) {
+		skb = skb_dequeue(&audit_skb_queue);
+		if (skb) {
+			int err;
+			if (audit_pid) {
+				err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+				if (err < 0) {
+					BUG_ON(err != -ECONNREFUSED);
+					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+					audit_pid = 0;
+				}
+			}
+		} else {
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&kauditd_wait, &wait);
+
+			if (!skb_queue_len(&audit_skb_queue))
+				schedule();
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&kauditd_wait, &wait);
+		}
+	}
+}
+
+
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
@@ -299,7 +330,10 @@ void audit_send_reply(int pid, int seq, 
 	nlh->nlmsg_flags = flags;
 	data		 = NLMSG_DATA(nlh);
 	memcpy(data, payload, size);
-	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
 
 nlmsg_failure:			/* Used by NLMSG_PUT */
@@ -351,6 +385,15 @@ static int audit_receive_msg(struct sk_b
 	if (err)
 		return err;
 
+	/* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+	if (!kauditd_task)
+		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+	if (IS_ERR(kauditd_task)) {
+		err = PTR_ERR(kauditd_task);
+		kauditd_task = NULL;
+		return err;
+	}
+
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
@@ -365,7 +408,7 @@ static int audit_receive_msg(struct sk_b
 		status_set.rate_limit	 = audit_rate_limit;
 		status_set.backlog_limit = audit_backlog_limit;
 		status_set.lost		 = atomic_read(&audit_lost);
-		status_set.backlog	 = atomic_read(&audit_backlog);
+		status_set.backlog	 = skb_queue_len(&audit_skb_queue);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
 				 &status_set, sizeof(status_set));
 		break;
@@ -471,43 +514,48 @@ static void audit_receive(struct sock *s
 	up(&audit_netlink_sem);
 }
 
-/* Grab skbuff from the audit_buffer and send to user space. */
-static inline int audit_log_drain(struct audit_buffer *ab)
+/* Move data from tmp buffer into an skb. This is an extra copy, but
+ * there's no point in trying to log directly into an skb because
+ * netlink_trim() would only reallocate and copy it anyway. So we use
+ * the temporary buffer, then allocate optimally-sized skbs for netlink
+ * and check against the receiving socket's sk_rmem_alloc to ensure
+ * that we don't ever call netlink_unicast() if it would fail. */
+static void audit_log_move(struct audit_buffer *ab, int gfp_mask)
 {
-	struct sk_buff *skb = ab->skb;
-
-	if (skb) {
-		int retval = 0;
-
-		if (audit_pid) {
-			struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-			nlh->nlmsg_len = skb->len - NLMSG_SPACE(0);
-			skb_get(skb); /* because netlink_* frees */
-			retval = netlink_unicast(audit_sock, skb, audit_pid,
-						 MSG_DONTWAIT);
-		}
-		if (retval == -EAGAIN &&
-		    (atomic_read(&audit_backlog)) < audit_backlog_limit) {
-			audit_log_end_irq(ab);
-			return 1;
-		}
-		if (retval < 0) {
-			if (retval == -ECONNREFUSED) {
-				printk(KERN_ERR
-				       "audit: *NO* daemon at audit_pid=%d\n",
-				       audit_pid);
-				audit_pid = 0;
-			} else
-				audit_log_lost("netlink socket too busy");
-		}
-		if (!audit_pid) { /* No daemon */
-			int offset = NLMSG_SPACE(0);
-			int len    = skb->len - offset;
-			skb->data[offset + len] = '\0';
-			printk(KERN_ERR "%s\n", skb->data + offset);
-		}
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	char *start;
+	int len = NLMSG_SPACE(0) + ab->len + 1;
+ 
+	if (!audit_pid) {
+		skb = NULL;
+	} else if (skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING "audit: audit_backlog_limit %d reached\n", audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		skb = NULL;
+	} else {
+		skb = alloc_skb(len, gfp_mask);
+		if (!skb)
+			audit_log_lost("out of memory in audit_log_move");
+	}
+	if (!skb) {
+		ab->tmp[ab->len] = '\0';
+		printk(KERN_ERR "%s\n", ab->tmp);
+		return;
 	}
-	return 0;
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = ab->type;
+	nlh->nlmsg_len = ab->len;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = ab->pid;
+	nlh->nlmsg_seq = 0;
+	start = skb_put(skb, ab->len);
+	memcpy(start, ab->tmp, ab->len);
+	start[ab->len]=0;
+
+	skb_queue_tail(&audit_skb_queue, skb);
+	wake_up_interruptible(&kauditd_wait);
 }
 
 /* Initialize audit support at boot time. */
@@ -519,7 +567,9 @@ static int __init audit_init(void)
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 
+	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 	audit_initialized = 1;
+	skb_queue_head_init(&audit_skb_queue);
 	audit_enabled = audit_default;
 	audit_log(NULL, AUDIT_KERNEL, "initialized");
 	return 0;
@@ -547,9 +597,8 @@ static void audit_buffer_free(struct aud
 	if (!ab)
 		return;
 
-	if (ab->skb)
-		kfree_skb(ab->skb);
-	atomic_dec(&audit_backlog);
+	kfree(ab->tmp);
+
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (++audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
@@ -563,7 +612,6 @@ static struct audit_buffer * audit_buffe
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
-	struct nlmsghdr *nlh;
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (!list_empty(&audit_freelist)) {
@@ -579,18 +627,16 @@ static struct audit_buffer * audit_buffe
 		if (!ab)
 			goto err;
 	}
-	atomic_inc(&audit_backlog);
 
-	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
-	if (!ab->skb)
+	ab->tmp = kmalloc(AUDIT_BUFSIZ, gfp_mask);
+	if (!ab->tmp)
 		goto err;
 
 	ab->ctx   = ctx;
-	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = type;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = 0;
-	nlh->nlmsg_seq = 0;
+	ab->len   = 0;
+	ab->size  = AUDIT_BUFSIZ;
+	ab->type  = type;
+	ab->pid   = 0;
 	return ab;
 err:
 	audit_buffer_free(ab);
@@ -612,18 +658,6 @@ struct audit_buffer *audit_log_start(str
 	if (!audit_initialized)
 		return NULL;
 
-	if (audit_backlog_limit
-	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
-		if (audit_rate_check())
-			printk(KERN_WARNING
-			       "audit: audit_backlog=%d > "
-			       "audit_backlog_limit=%d\n",
-			       atomic_read(&audit_backlog),
-			       audit_backlog_limit);
-		audit_log_lost("backlog limit exceeded");
-		return NULL;
-	}
-
 	ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
@@ -649,14 +683,17 @@ struct audit_buffer *audit_log_start(str
  */
 static inline int audit_expand(struct audit_buffer *ab, int extra)
 {
-	struct sk_buff *skb = ab->skb;
-	int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
-				   GFP_ATOMIC);
-	if (ret < 0) {
-		audit_log_lost("out of memory in audit_expand");
+	char *tmp;
+	int len = ab->size + extra;
+
+	tmp = kmalloc(len, GFP_ATOMIC);
+	if (!tmp)
 		return 0;
-	}
-	return skb_tailroom(skb);
+	memcpy(tmp, ab->tmp, ab->len);
+	kfree(ab->tmp);
+	ab->tmp = tmp;
+	ab->size = len;
+	return ab->size - ab->len;
 }
 
 /* Format an audit message into the audit buffer.  If there isn't enough
@@ -667,22 +704,19 @@ static void audit_log_vformat(struct aud
 			      va_list args)
 {
 	int len, avail;
-	struct sk_buff *skb;
 	va_list args2;
 
 	if (!ab)
 		return;
 
-	BUG_ON(!ab->skb);
-	skb = ab->skb;
-	avail = skb_tailroom(skb);
+	avail = ab->size - ab->len;
 	if (avail == 0) {
 		avail = audit_expand(ab, AUDIT_BUFSIZ);
 		if (!avail)
 			goto out;
 	}
 	va_copy(args2, args);
-	len = vsnprintf(skb->tail, avail, fmt, args);
+	len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
 	if (len >= avail) {
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
@@ -690,9 +724,9 @@ static void audit_log_vformat(struct aud
 		avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
 			goto out;
-		len = vsnprintf(skb->tail, avail, fmt, args2);
+		len = vsnprintf(ab->tmp + ab->len, avail, fmt, args2);
 	}
-	skb_put(skb, (len < avail) ? len : avail);
+	ab->len += (len < avail) ? len : avail;
 out:
 	return;
 }
@@ -740,60 +774,38 @@ void audit_log_d_path(struct audit_buffe
 		      struct dentry *dentry, struct vfsmount *vfsmnt)
 {
 	char *p;
-	struct sk_buff *skb = ab->skb;
 	int  len, avail;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
-	avail = skb_tailroom(skb);
-	p = d_path(dentry, vfsmnt, skb->tail, avail);
+	avail = ab->size - ab->len;
+	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
 	if (IS_ERR(p)) {
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<toolong>");
 	} else {
 		/* path isn't at start of buffer */
-		len = ((char *)skb->tail + avail - 1) - p;
-		memmove(skb->tail, p, len);
-		skb_put(skb, len);
+		len = (ab->tmp + ab->size - 1) - p;
+		memmove(ab->tmp + ab->len, p, len);
+		ab->len += len;
 	}
 }
 
-/* Remove queued messages from the audit_txlist and send them to user space. */
-static void audit_tasklet_handler(unsigned long arg)
-{
-	LIST_HEAD(list);
-	struct audit_buffer *ab;
-	unsigned long	    flags;
-
-	spin_lock_irqsave(&audit_txlist_lock, flags);
-	list_splice_init(&audit_txlist, &list);
-	spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
-	while (!list_empty(&list)) {
-		ab = list_entry(list.next, struct audit_buffer, list);
-		list_del(&ab->list);
-		audit_log_end_fast(ab);
-	}
-}
-
-static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
-
 /* The netlink_* functions cannot be called inside an irq context, so
 * the audit buffer is placed on a queue and a tasklet is scheduled to
  * remove them from the queue outside the irq context.  May be called in
  * any context. */
 static void audit_log_end_irq(struct audit_buffer *ab)
 {
-	unsigned long flags;
-
 	if (!ab)
 		return;
-	spin_lock_irqsave(&audit_txlist_lock, flags);
-	list_add_tail(&ab->list, &audit_txlist);
-	spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
-	tasklet_schedule(&audit_tasklet);
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		audit_log_move(ab, GFP_ATOMIC);
+	}
+	audit_buffer_free(ab);
 }
 
 /* Send the message in the audit buffer directly to user space.  May not
@@ -806,8 +818,7 @@ static void audit_log_end_fast(struct au
 	if (!audit_rate_check()) {
 		audit_log_lost("rate limit exceeded");
 	} else {
-		if (audit_log_drain(ab))
-			return;
+		audit_log_move(ab, GFP_KERNEL);
 	}
 	audit_buffer_free(ab);
 }
@@ -817,10 +828,8 @@ static void audit_log_end_fast(struct au
  * context.) */
 void audit_log_end(struct audit_buffer *ab)
 {
-	if (in_irq())
-		audit_log_end_irq(ab);
-	else
-		audit_log_end_fast(ab);
+	/* In a non-preemptible kernel, we have no way of knowing if a spinlock is held. */
+	audit_log_end_irq(ab);
 }
 
 /* Log an audit record.  This is a convenience function that calls



