[Cluster-devel] [PATCH] gfs2_lockcapture: Fixed rhbz#987019, modified option -o, added private option -R, removed lsof data capture.

sbradley at redhat.com sbradley at redhat.com
Mon Aug 19 14:10:48 UTC 2013


From: Shane Bradley <sbradley at redhat.com>

Fix the shebang to use an absolute path to python for rhbz#987019. The "-o"
option has been modified so that it takes a path that will be used as the root
directory where the captured data is written and as the location of the
tarball that is created. Added an undocumented (private) "-R" option that will
capture only the required data. Removed the data capture of the command "lsof"
because it might hang when capturing lockdumps due to symlink lookups.

Signed-off-by: Shane Bradley <sbradley at redhat.com>
---
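For reference, a minimal sketch of how the reworked "-o" handling derives the
per-capture directory, mirroring the hunk below that builds pathToOutputDir;
the variable names and example values here are illustrative only:

    import os
    import time

    path_to_output_dir = "/tmp"        # value given with -o (the new default is /tmp)
    script_name = "gfs2_lockcapture"   # os.path.basename(sys.argv[0]) in the script
    capture_dir = os.path.join(path_to_output_dir,
                               "%s-%s" % (script_name, time.strftime("%Y-%m-%d")))
    print(capture_dir)                 # e.g. /tmp/gfs2_lockcapture-2013-08-19
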
 gfs2/scripts/gfs2_lockcapture |  181 +++++++++++++++++++++++------------------
 1 files changed, 101 insertions(+), 80 deletions(-)

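Also for reference, a minimal sketch of the sleep floor added near the end of
the diff: the script now sleeps at least 2 seconds between runs so that log
timestamps differ between captures. The helper name below is hypothetical.

    # Hypothetical helper mirroring the new secondsToSleep handling.
    def effective_sleep_seconds(seconds_to_sleep):
        seconds = seconds_to_sleep + 2
        if seconds < 2:
            seconds = 2
        return seconds

    print(effective_sleep_seconds(0))    # 2
    print(effective_sleep_seconds(5))    # 7
    print(effective_sleep_seconds(-10))  # 2
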
diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture
index 81a0aeb..f8ace76 100644
--- a/gfs2/scripts/gfs2_lockcapture
+++ b/gfs2/scripts/gfs2_lockcapture
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python
 """
 The script "gfs2_lockcapture" will capture locking information from GFS2 file
 systems and DLM.
@@ -13,7 +13,7 @@ import os
 import os.path
 import logging
 import logging.handlers
-from optparse import OptionParser, Option
+from optparse import OptionParser, Option, SUPPRESS_HELP
 import time
 import platform
 import shutil
@@ -34,7 +34,7 @@ import tarfile
 sure only 1 instance of this script is running at any time.
 @type PATH_TO_PID_FILENAME: String
 """
-VERSION_NUMBER = "0.9-7"
+VERSION_NUMBER = "0.9-8"
 MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0]))
 PATH_TO_DEBUG_DIR="/sys/kernel/debug"
 PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0]))
@@ -190,12 +190,11 @@ def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, stand
         commandOptionString = ""
         for option in listOfCommandOptions:
             commandOptionString += "%s " %(option)
-        message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString)
-        if (len(stdout) > 0):
-            message += stdout
-        message += "\n"
-        if (len(stderr) > 0):
-            message += stderr
+        message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
+        if (len(stdout.rstrip()) > 0):
+            message += "\n%s" %(stdout.rstrip())
+        if (len(stderr.rstrip()) > 0):
+            message += "\n%s" %(stderr.rstrip())
         logging.getLogger(MAIN_LOGGER_NAME).error(message)
     return False
 
@@ -232,12 +231,11 @@ def runCommandOutput(command, listOfCommandOptions, standardOut=subprocess.PIPE,
         commandOptionString = ""
         for option in listOfCommandOptions:
             commandOptionString += "%s " %(option)
-        message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString)
-        if (len(stdout) > 0):
-            message += stdout
-        message += "\n"
-        if (len(stderr) > 0):
-            message += stderr
+        message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
+        if (len(stdout.rstrip()) > 0):
+            message += "\n%s" %(stdout.rstrip())
+        if (len(stderr.rstrip()) > 0):
+            message += "\n%s" %(stderr.rstrip())
         logging.getLogger(MAIN_LOGGER_NAME).error(message)
         return None
     return stdout.strip().rstrip()
@@ -790,12 +788,11 @@ def getLabelMapForMountedFilesystems(clusterName, listOfMountedFilesystems):
 # #####################################################################
 # Gather output from command functions
 # #####################################################################
-def gatherGeneralInformation(pathToDSTDir):
+def gatherHostData(pathToDSTDir):
     """
     This function will gather general information about the cluster and write
     the results to a file. The following data will be captured: hostname, date,
-    uname -a, uptime, contents of /proc/mounts, and ps h -AL -o tid,s,cmd.
-
+    uname -a, uptime.
 
     @param pathToDSTDir: This is the path to directory where the files will be
     written to.
@@ -811,19 +808,16 @@ def gatherGeneralInformation(pathToDSTDir):
         systemString += "UPTIME=%s" %(stdout)
     writeToFile(os.path.join(pathToDSTDir, "hostinformation.txt"), systemString, createFile=True)
 
-    # Copy misc files
-    pathToSrcFile = "/proc/mounts"
-    copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
-    pathToSrcFile = "/proc/slabinfo"
-    copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+def gatherDiagnosticData(pathToDSTDir):
+    """
+    This function will gather diagnostic information about the cluster and write
+    (or copy) the results to files.
 
-    # Copy the DLM hash table sizes:
-    pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size",
-                            "/sys/kernel/config/dlm/cluster/rsbtbl_size"]
-    for pathToSrcFile in pathToHashTableFiles:
-        if (os.path.exists(pathToSrcFile)):
-            copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+    @param pathToDSTDir: This is the path to directory where the files will be
+    written to.
+    @type pathToDSTDir: String
 
+    """
     # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data.
     # Get " ps h -AL -o tid,s,cmd
     command = "ps"
@@ -837,6 +831,28 @@ def gatherGeneralInformation(pathToDSTDir):
         message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
         logging.getLogger(MAIN_LOGGER_NAME).error(message)
 
+    # Copy misc files
+    pathToSrcFile = "/proc/mounts"
+    copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+    pathToSrcFile = "/proc/slabinfo"
+    copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+
+    # Copy the DLM hash table sizes:
+    pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size",
+                            "/sys/kernel/config/dlm/cluster/rsbtbl_size"]
+    for pathToSrcFile in pathToHashTableFiles:
+        if (os.path.exists(pathToSrcFile)):
+            copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+
+def gatherOptionalDiagnosticData(pathToDSTDir):
+    """
+    This function will gather optional information about the cluster and write
+    the results to a file.
+
+    @param pathToDSTDir: This is the path to directory where the files will be
+    written to.
+    @type pathToDSTDir: String
+    """
     # Get df -h ouput
     command = "df"
     pathToCommandOutput = os.path.join(pathToDSTDir, "df-h.cmd")
@@ -848,17 +864,6 @@ def gatherGeneralInformation(pathToDSTDir):
         message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
         logging.getLogger(MAIN_LOGGER_NAME).error(message)
 
-    # Get lsof ouput
-    command = "lsof"
-    pathToCommandOutput = os.path.join(pathToDSTDir, "lsof.cmd")
-    try:
-        fout = open(pathToCommandOutput, "w")
-        runCommand(command, [], standardOut=fout)
-        fout.close()
-    except IOError:
-        message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
-        logging.getLogger(MAIN_LOGGER_NAME).error(message)
-
     # Write the status of all the nodes in the cluster out.
     if (runCommand("which", ["cman_tool"])):
         command = "cman_tool"
@@ -1087,7 +1092,12 @@ def __getOptions(version) :
     cmdParser.add_option("-P", "--disable_process_gather",
                          action="store_true",
                          dest="disableProcessGather",
-                         help="the gathering of process information will be disabled",
+                         help=SUPPRESS_HELP,
+                         default=False)
+    cmdParser.add_option("-m", "--diagnostic_data",
+                         action="store_true",
+                         dest="enableDiagnosticData",
+                         help=SUPPRESS_HELP,
                          default=False)
     cmdParser.add_option("-o", "--path_to_output_dir",
                          action="store",
@@ -1095,7 +1105,7 @@ def __getOptions(version) :
                          help="the directory where all the collect data will be stored",
                          type="string",
                          metavar="<output directory>",
-                         default="")
+                         default="/tmp")
     cmdParser.add_option("-r", "--num_of_runs",
                          action="store",
                          dest="numberOfRuns",
@@ -1264,7 +1274,7 @@ if __name__ == "__main__":
             message = "Debugging has been enabled."
             logging.getLogger(MAIN_LOGGER_NAME).debug(message)
         if (cmdLineOpts.disableLoggingToConsole):
-            logging.disable(logging.CRITICAL)
+            streamHandler.setLevel(logging.CRITICAL)
         # #######################################################################
         # Check to see if pid file exists and error if it does.
         # #######################################################################
@@ -1305,7 +1315,7 @@ if __name__ == "__main__":
         # #######################################################################
         # Verify they want to continue because this script will trigger sysrq events.
         # #######################################################################
-        if (not cmdLineOpts.disableQuestions):
+        if (not cmdLineOpts.disableQuestions and not cmdLineOpts.disableProcessGather):
             valid = {"yes":True, "y":True, "no":False, "n":False}
             question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
             prompt = " [y/n] "
@@ -1326,14 +1336,11 @@ if __name__ == "__main__":
         # Create the output directory to verify it can be created before
         # proceeding unless it is already created from a previous run data needs
         # to be analyzed. Probably could add more debugging on if file or dir.
-        # #######################################################################
-        pathToOutputDir = cmdLineOpts.pathToOutputDir
-        if (not len(pathToOutputDir) > 0):
-            pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), os.path.basename(sys.argv[0]))))
-        # #######################################################################
+
         # Backup any existing directory with same name as current output
         # directory.
         # #######################################################################
+        pathToOutputDir = "%s" %(os.path.join(cmdLineOpts.pathToOutputDir, "%s-%s" %(os.path.basename(sys.argv[0]), time.strftime("%Y-%m-%d"))))
         if (backupOutputDirectory(pathToOutputDir)):
             message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir)
             logging.getLogger(MAIN_LOGGER_NAME).info(message)
@@ -1388,38 +1395,13 @@ if __name__ == "__main__":
             logging.getLogger(MAIN_LOGGER_NAME).status(message)
 
             # Gather various bits of data from the clusternode.
-            message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns)
+            message = "Pass (%d/%d): Gathering simple data about the host." %(i, cmdLineOpts.numberOfRuns)
             logging.getLogger(MAIN_LOGGER_NAME).debug(message)
-            gatherGeneralInformation(pathToOutputRunDir)
+            gatherHostData(pathToOutputRunDir)
             # Write the clusternode name and id to the general information file.
             writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"),
                         "NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()),
                         appendToFile=True, createFile=True)
-
-            # Going to sleep for 2 seconds, so that TIMESTAMP should be in the
-            # past in the logs so that capturing sysrq data will be guaranteed.
-            time.sleep(2)
-
-            # If enabled then gather the process data.
-            if (not cmdLineOpts.disableProcessGather):
-                # Gather the backtraces for all the pids, by grabbing the /proc/<pid
-                # number> or triggering sysrq events to capture task bask traces
-                # from log.
-                # Gather the data in the /proc/<pid> directory if the file
-                # </proc/<pid>/stack exists. If file exists we will not trigger
-                # sysrq events.
-
-                # Should I gather anyhow and only capture sysrq if needed.
-                pathToPidData = "/proc"
-                if (isProcPidStackEnabled(pathToPidData)):
-                    message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData)
-                    logging.getLogger(MAIN_LOGGER_NAME).debug(message)
-                    gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/")))
-                else:
-                    message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns)
-                    logging.getLogger(MAIN_LOGGER_NAME).debug(message)
-                    triggerSysRQEvents()
-
             # #######################################################################
             # Gather the DLM data and lock-dumps
             # #######################################################################
@@ -1444,16 +1426,55 @@ if __name__ == "__main__":
             logging.getLogger(MAIN_LOGGER_NAME).debug(message)
             if(gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())):
                 exitCode = 0
+            # If enabled then gather the process data. This will be included even if -R option is enabled.
+            if (not cmdLineOpts.disableProcessGather):
+                # Gather the backtraces for all the pids, by grabbing the /proc/<pid
+                # number> or triggering sysrq events to capture task back traces
+                # from log.
+                # Gather the data in the /proc/<pid> directory if the file
+                # </proc/<pid>/stack exists. If file exists we will not trigger
+                # sysrq events.
+
+                # Should I gather anyhow and only capture sysrq if needed.
+                pathToPidData = "/proc"
+                if (isProcPidStackEnabled(pathToPidData)):
+                    message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData)
+                    logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+                    gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/")))
+                else:
+                    message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns)
+                    logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+                    triggerSysRQEvents()
+
             # Gather log files
             message = "Pass (%d/%d): Gathering the log files for the host." %(i, cmdLineOpts.numberOfRuns)
             logging.getLogger(MAIN_LOGGER_NAME).debug(message)
             gatherLogs(os.path.join(pathToOutputRunDir, "logs"))
+
+            # Gather diagnostic data
+            message = "Pass (%d/%d): Gathering diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns)
+            logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+            gatherDiagnosticData(pathToOutputRunDir)
+            if (cmdLineOpts.enableDiagnosticData):
+                # Gather diagnostic data
+                message = "Pass (%d/%d): Gathering optional diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns)
+                logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+                gatherOptionalDiagnosticData(pathToOutputRunDir)
+
+            # #######################################################################
+            # Sleep for X seconds between runs
+            # #######################################################################
             # Sleep between each run if secondsToSleep is greater than or equal
-            # to 0 and current run is not the last run.
-            if ((cmdLineOpts.secondsToSleep >= 0) and (i < (cmdLineOpts.numberOfRuns))):
-                message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(cmdLineOpts.secondsToSleep)
+            # to 0 and current run is not the last run. Add 2 seconds to each sleep so
+            # that we know that there is a timestamp difference in logs between runs.
+            # The minimal sleep is 2 seconds.
+            secondsToSleep = cmdLineOpts.secondsToSleep + 2
+            if (secondsToSleep < 2):
+                secondsToSleep = 2
+            if (i < cmdLineOpts.numberOfRuns):
+                message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(secondsToSleep)
                 logging.getLogger(MAIN_LOGGER_NAME).info(message)
-                time.sleep(cmdLineOpts.secondsToSleep)
+                time.sleep(secondsToSleep)
             # Remove the handler:
             logging.getLogger(MAIN_LOGGER_NAME).removeHandler(currentRunFileHandler)
 
-- 
1.7.1
