[Cluster-devel] [PATCH] gfs2_lockcapture: Fixed rhbz#987019, modified option -o, added private option -R, removed lsof data capture.
sbradley at redhat.com
sbradley at redhat.com
Mon Aug 19 14:10:48 UTC 2013
From: Shane Bradley <sbradley at redhat.com>
Fix the header to use an absolute path for rhbz#987019. The "-o" option has been
modified so that it takes a path that will be used to create the root directory
where the data will be written, and that will be the location of the tarball that
is created. Added an undocumented (private) "-R" option that will capture only the
required data. Removed the data capture for the command "lsof" because it might
cause hangs when capturing lockdumps due to symlink lookup.
Signed-off-by: Shane Bradley <sbradley at redhat.com>
---
gfs2/scripts/gfs2_lockcapture | 181 +++++++++++++++++++++++------------------
1 files changed, 101 insertions(+), 80 deletions(-)
diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture
index 81a0aeb..f8ace76 100644
--- a/gfs2/scripts/gfs2_lockcapture
+++ b/gfs2/scripts/gfs2_lockcapture
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python
"""
The script "gfs2_lockcapture" will capture locking information from GFS2 file
systems and DLM.
@@ -13,7 +13,7 @@ import os
import os.path
import logging
import logging.handlers
-from optparse import OptionParser, Option
+from optparse import OptionParser, Option, SUPPRESS_HELP
import time
import platform
import shutil
@@ -34,7 +34,7 @@ import tarfile
sure only 1 instance of this script is running at any time.
@type PATH_TO_PID_FILENAME: String
"""
-VERSION_NUMBER = "0.9-7"
+VERSION_NUMBER = "0.9-8"
MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0]))
PATH_TO_DEBUG_DIR="/sys/kernel/debug"
PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0]))
@@ -190,12 +190,11 @@ def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, stand
commandOptionString = ""
for option in listOfCommandOptions:
commandOptionString += "%s " %(option)
- message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString)
- if (len(stdout) > 0):
- message += stdout
- message += "\n"
- if (len(stderr) > 0):
- message += stderr
+ message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
+ if (len(stdout.rstrip()) > 0):
+ message += "\n%s" %(stdout.rstrip())
+ if (len(stderr.rstrip()) > 0):
+ message += "\n%s" %(stderr.rstrip())
logging.getLogger(MAIN_LOGGER_NAME).error(message)
return False
@@ -232,12 +231,11 @@ def runCommandOutput(command, listOfCommandOptions, standardOut=subprocess.PIPE,
commandOptionString = ""
for option in listOfCommandOptions:
commandOptionString += "%s " %(option)
- message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString)
- if (len(stdout) > 0):
- message += stdout
- message += "\n"
- if (len(stderr) > 0):
- message += stderr
+ message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
+ if (len(stdout.rstrip()) > 0):
+ message += "\n%s" %(stdout.rstrip())
+ if (len(stderr.rstrip()) > 0):
+ message += "\n%s" %(stderr.rstrip())
logging.getLogger(MAIN_LOGGER_NAME).error(message)
return None
return stdout.strip().rstrip()
@@ -790,12 +788,11 @@ def getLabelMapForMountedFilesystems(clusterName, listOfMountedFilesystems):
# #####################################################################
# Gather output from command functions
# #####################################################################
-def gatherGeneralInformation(pathToDSTDir):
+def gatherHostData(pathToDSTDir):
"""
This function will gather general information about the cluster and write
the results to a file. The following data will be captured: hostname, date,
- uname -a, uptime, contents of /proc/mounts, and ps h -AL -o tid,s,cmd.
-
+ uname -a, uptime.
@param pathToDSTDir: This is the path to directory where the files will be
written to.
@@ -811,19 +808,16 @@ def gatherGeneralInformation(pathToDSTDir):
systemString += "UPTIME=%s" %(stdout)
writeToFile(os.path.join(pathToDSTDir, "hostinformation.txt"), systemString, createFile=True)
- # Copy misc files
- pathToSrcFile = "/proc/mounts"
- copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
- pathToSrcFile = "/proc/slabinfo"
- copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+def gatherDiagnosticData(pathToDSTDir):
+ """
+ This function will gather general information about the cluster and write (or
+ copy) the results to a file.
- # Copy the DLM hash table sizes:
- pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size",
- "/sys/kernel/config/dlm/cluster/rsbtbl_size"]
- for pathToSrcFile in pathToHashTableFiles:
- if (os.path.exists(pathToSrcFile)):
- copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+ @param pathToDSTDir: This is the path to directory where the files will be
+ written to.
+ @type pathToDSTDir: String
+ """
# Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data.
# Get " ps h -AL -o tid,s,cmd
command = "ps"
@@ -837,6 +831,28 @@ def gatherGeneralInformation(pathToDSTDir):
message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Copy misc files
+ pathToSrcFile = "/proc/mounts"
+ copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+ pathToSrcFile = "/proc/slabinfo"
+ copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+
+ # Copy the DLM hash table sizes:
+ pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size",
+ "/sys/kernel/config/dlm/cluster/rsbtbl_size"]
+ for pathToSrcFile in pathToHashTableFiles:
+ if (os.path.exists(pathToSrcFile)):
+ copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/")))
+
+def gatherOptionalDiagnosticData(pathToDSTDir):
+ """
+ This function will gather optional information about the cluster and write
+ the results to a file.
+
+ @param pathToDSTDir: This is the path to directory where the files will be
+ written to.
+ @type pathToDSTDir: String
+ """
# Get df -h ouput
command = "df"
pathToCommandOutput = os.path.join(pathToDSTDir, "df-h.cmd")
@@ -848,17 +864,6 @@ def gatherGeneralInformation(pathToDSTDir):
message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
logging.getLogger(MAIN_LOGGER_NAME).error(message)
- # Get lsof ouput
- command = "lsof"
- pathToCommandOutput = os.path.join(pathToDSTDir, "lsof.cmd")
- try:
- fout = open(pathToCommandOutput, "w")
- runCommand(command, [], standardOut=fout)
- fout.close()
- except IOError:
- message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
- logging.getLogger(MAIN_LOGGER_NAME).error(message)
-
# Write the status of all the nodes in the cluster out.
if (runCommand("which", ["cman_tool"])):
command = "cman_tool"
@@ -1087,7 +1092,12 @@ def __getOptions(version) :
cmdParser.add_option("-P", "--disable_process_gather",
action="store_true",
dest="disableProcessGather",
- help="the gathering of process information will be disabled",
+ help=SUPPRESS_HELP,
+ default=False)
+ cmdParser.add_option("-m", "--diagnostic_data",
+ action="store_true",
+ dest="enableDiagnosticData",
+ help=SUPPRESS_HELP,
default=False)
cmdParser.add_option("-o", "--path_to_output_dir",
action="store",
@@ -1095,7 +1105,7 @@ def __getOptions(version) :
help="the directory where all the collect data will be stored",
type="string",
metavar="<output directory>",
- default="")
+ default="/tmp")
cmdParser.add_option("-r", "--num_of_runs",
action="store",
dest="numberOfRuns",
@@ -1264,7 +1274,7 @@ if __name__ == "__main__":
message = "Debugging has been enabled."
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
if (cmdLineOpts.disableLoggingToConsole):
- logging.disable(logging.CRITICAL)
+ streamHandler.setLevel(logging.CRITICAL)
# #######################################################################
# Check to see if pid file exists and error if it does.
# #######################################################################
@@ -1305,7 +1315,7 @@ if __name__ == "__main__":
# #######################################################################
# Verify they want to continue because this script will trigger sysrq events.
# #######################################################################
- if (not cmdLineOpts.disableQuestions):
+ if (not cmdLineOpts.disableQuestions and not cmdLineOpts.disableProcessGather):
valid = {"yes":True, "y":True, "no":False, "n":False}
question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
prompt = " [y/n] "
@@ -1326,14 +1336,11 @@ if __name__ == "__main__":
# Create the output directory to verify it can be created before
# proceeding unless it is already created from a previous run data needs
# to be analyzed. Probably could add more debugging on if file or dir.
- # #######################################################################
- pathToOutputDir = cmdLineOpts.pathToOutputDir
- if (not len(pathToOutputDir) > 0):
- pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), os.path.basename(sys.argv[0]))))
- # #######################################################################
+
# Backup any existing directory with same name as current output
# directory.
# #######################################################################
+ pathToOutputDir = "%s" %(os.path.join(cmdLineOpts.pathToOutputDir, "%s-%s" %(os.path.basename(sys.argv[0]), time.strftime("%Y-%m-%d"))))
if (backupOutputDirectory(pathToOutputDir)):
message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir)
logging.getLogger(MAIN_LOGGER_NAME).info(message)
@@ -1388,38 +1395,13 @@ if __name__ == "__main__":
logging.getLogger(MAIN_LOGGER_NAME).status(message)
# Gather various bits of data from the clusternode.
- message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns)
+ message = "Pass (%d/%d): Gathering simple data about the host." %(i, cmdLineOpts.numberOfRuns)
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
- gatherGeneralInformation(pathToOutputRunDir)
+ gatherHostData(pathToOutputRunDir)
# Write the clusternode name and id to the general information file.
writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"),
"NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()),
appendToFile=True, createFile=True)
-
- # Going to sleep for 2 seconds, so that TIMESTAMP should be in the
- # past in the logs so that capturing sysrq data will be guaranteed.
- time.sleep(2)
-
- # If enabled then gather the process data.
- if (not cmdLineOpts.disableProcessGather):
- # Gather the backtraces for all the pids, by grabbing the /proc/<pid
- # number> or triggering sysrq events to capture task bask traces
- # from log.
- # Gather the data in the /proc/<pid> directory if the file
- # </proc/<pid>/stack exists. If file exists we will not trigger
- # sysrq events.
-
- # Should I gather anyhow and only capture sysrq if needed.
- pathToPidData = "/proc"
- if (isProcPidStackEnabled(pathToPidData)):
- message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData)
- logging.getLogger(MAIN_LOGGER_NAME).debug(message)
- gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/")))
- else:
- message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns)
- logging.getLogger(MAIN_LOGGER_NAME).debug(message)
- triggerSysRQEvents()
-
# #######################################################################
# Gather the DLM data and lock-dumps
# #######################################################################
@@ -1444,16 +1426,55 @@ if __name__ == "__main__":
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
if(gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())):
exitCode = 0
+ # If enabled then gather the process data. This will be included even if -R option is enabled.
+ if (not cmdLineOpts.disableProcessGather):
+ # Gather the backtraces for all the pids, by grabbing the /proc/<pid
+ # number> or triggering sysrq events to capture task bask traces
+ # from log.
+ # Gather the data in the /proc/<pid> directory if the file
+ # </proc/<pid>/stack exists. If file exists we will not trigger
+ # sysrq events.
+
+ # Should I gather anyhow and only capture sysrq if needed.
+ pathToPidData = "/proc"
+ if (isProcPidStackEnabled(pathToPidData)):
+ message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/")))
+ else:
+ message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ triggerSysRQEvents()
+
# Gather log files
message = "Pass (%d/%d): Gathering the log files for the host." %(i, cmdLineOpts.numberOfRuns)
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
gatherLogs(os.path.join(pathToOutputRunDir, "logs"))
+
+ # Gather diagnostic data
+ message = "Pass (%d/%d): Gathering diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ gatherDiagnosticData(pathToOutputRunDir)
+ if (cmdLineOpts.enableDiagnosticData):
+ # Gather diagnostic data
+ message = "Pass (%d/%d): Gathering optional diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ gatherOptionalDiagnosticData(pathToOutputRunDir)
+
+ # #######################################################################
+ # Sleep for X seconds between runs
+ # #######################################################################
# Sleep between each run if secondsToSleep is greater than or equal
- # to 0 and current run is not the last run.
- if ((cmdLineOpts.secondsToSleep >= 0) and (i < (cmdLineOpts.numberOfRuns))):
- message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(cmdLineOpts.secondsToSleep)
+ # to 0 and current run is not the last run. Add 2 seconds to each sleep so
+ # that we know that there is a timestamp difference in logs between runs.
+ # The minimal sleep is 2 seconds.
+ secondsToSleep = cmdLineOpts.secondsToSleep + 2
+ if (secondsToSleep < 2):
+ secondsToSleep = 2
+ if (i < cmdLineOpts.numberOfRuns):
+ message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(secondsToSleep)
logging.getLogger(MAIN_LOGGER_NAME).info(message)
- time.sleep(cmdLineOpts.secondsToSleep)
+ time.sleep(secondsToSleep)
# Remove the handler:
logging.getLogger(MAIN_LOGGER_NAME).removeHandler(currentRunFileHandler)
--
1.7.1
More information about the Cluster-devel
mailing list