[Crash-utility] [PATCH] Fix bugs in runq

Dave Anderson anderson at redhat.com
Fri Aug 24 18:17:45 UTC 2012



----- Original Message -----
> 
> 
> ----- Original Message -----
> > Hello Dave,
> > 
> > In runq command, when dumping cfs and rt runqueues,
> > it seems that we get the wrong nr_running values of rq
> > and cfs_rq.
> > 
> > Please refer to the attached patch.
> > 
> > Thanks
> > Zhang Yanfei
> 
> Hello Zhang,
> 
> I understand what you are trying to accomplish with this patch, but
> none of my test dumpfiles can actually verify it because there is no
> difference with or without your patch.  What failure mode did you see
> in your testing?  I presume that it just showed "[no tasks queued]"
> for the RT runqueue when there were actually tasks queued there?
> 
> The reason I ask is that I'm thinking that a better solution would
> be to simplify dump_CFS_runqueues() by *not* accessing and using
> rq_nr_running, cfs_rq_nr_running or cfs_rq_h_nr_running.
> 
> Those counters are only read to determine the "active" argument to
> pass to dump_RT_prio_array(), which returns immediately if it is
> FALSE.  However, if we get rid of the "active" argument and simply
> allow dump_RT_prio_array() to always check its queues every time,
> it still works just fine.
> 
> For example, I tested my set of sample dumpfiles with this patch:
>  
>  diff -u -r1.205 task.c
>  --- task.c      12 Jul 2012 20:04:00 -0000      1.205
>  +++ task.c      22 Aug 2012 15:33:32 -0000
>  @@ -7636,7 +7636,7 @@
>                                  OFFSET(cfs_rq_tasks_timeline));
>                  }
>   
>  -               dump_RT_prio_array(nr_running != cfs_rq_nr_running,
>  +               dump_RT_prio_array(TRUE,
>                          runq + OFFSET(rq_rt) + OFFSET(rt_rq_active),
>                          &runqbuf[OFFSET(rq_rt) +
>                          OFFSET(rt_rq_active)]);
>   
> and the output is identical to testing with, and without, your patch.
> 
> So the question is whether dump_CFS_runqueues() should be needlessly
> complicated with all of the "nr_running" references?
> 
> In fact, it also seems possible that a crash could happen at a point in
> the scheduler code where those counters are not
> valid/current/trustworthy.
> 
> So unless you can convince me otherwise, I'd prefer to just remove
> the "nr_running" business completely.

Hello Zhang,

Here's the patch I've got queued, which resolves the bug you encountered
by simplifying things:


--- task.c	12 Jul 2012 20:04:00 -0000	1.205
+++ task.c	24 Aug 2012 18:05:13 -0000
@@ -67,7 +67,7 @@
 static int dump_tasks_in_cfs_rq(ulong);
 static void dump_on_rq_tasks(void);
 static void dump_CFS_runqueues(void);
-static void dump_RT_prio_array(int, ulong, char *);
+static void dump_RT_prio_array(ulong, char *);
 static void task_struct_member(struct task_context *,unsigned int, struct reference *);
 static void signal_reference(struct task_context *, ulong, struct reference *);
 static void do_sig_thread_group(ulong);
@@ -7547,7 +7547,6 @@
 	char *runqbuf, *cfs_rq_buf;
 	ulong tasks_timeline ATTRIBUTE_UNUSED;
 	struct task_context *tc;
-	long nr_running, cfs_rq_nr_running;
 	struct rb_root *root;
 	struct syment *rq_sp, *init_sp;
 
@@ -7622,22 +7621,15 @@
 
 			readmem(cfs_rq, KVADDR, cfs_rq_buf, SIZE(cfs_rq),
 				"per-cpu cfs_rq", FAULT_ON_ERROR);
-			nr_running = LONG(cfs_rq_buf + OFFSET(rq_nr_running));
-	                cfs_rq_nr_running = ULONG(cfs_rq_buf + 
-				OFFSET(cfs_rq_nr_running));
 			root = (struct rb_root *)(cfs_rq + 
 				OFFSET(cfs_rq_tasks_timeline));
 		} else {
 			cfs_rq = runq + OFFSET(rq_cfs);
-			nr_running = LONG(runqbuf + OFFSET(rq_nr_running));
-	                cfs_rq_nr_running = ULONG(runqbuf + OFFSET(rq_cfs) + 
-				OFFSET(cfs_rq_nr_running));
 			root = (struct rb_root *)(runq + OFFSET(rq_cfs) + 
 				OFFSET(cfs_rq_tasks_timeline));
 		}
 
-		dump_RT_prio_array(nr_running != cfs_rq_nr_running,
-			runq + OFFSET(rq_rt) + OFFSET(rt_rq_active), 
+		dump_RT_prio_array(runq + OFFSET(rq_rt) + OFFSET(rt_rq_active), 
 			&runqbuf[OFFSET(rq_rt) + OFFSET(rt_rq_active)]);
 
 		fprintf(fp, "  CFS RB_ROOT: %lx\n", (ulong)root);
@@ -7657,7 +7649,7 @@
 }
 
 static void
-dump_RT_prio_array(int active, ulong k_prio_array, char *u_prio_array)
+dump_RT_prio_array(ulong k_prio_array, char *u_prio_array)
 {
 	int i, c, tot, cnt, qheads;
 	ulong offset, kvaddr, uvaddr;
@@ -7668,12 +7660,6 @@
 
 	fprintf(fp, "  RT PRIO_ARRAY: %lx\n",  k_prio_array);
 
-	if (!active) {
-		INDENT(5);
-		fprintf(fp, "[no tasks queued]\n");	
-		return;
-	}
-
         qheads = (i = ARRAY_LENGTH(rt_prio_array_queue)) ?
                 i : get_array_length("rt_prio_array.queue", NULL, SIZE(list_head));
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: RT_runq.patch
Type: text/x-patch
Size: 2238 bytes
Desc: not available
URL: <http://listman.redhat.com/archives/crash-utility/attachments/20120824/e7d77a58/attachment.bin>


More information about the Crash-utility mailing list