#!/usr/bin/env bash
# This program is part of Aspersa (http://code.google.com/p/aspersa/)

# ########################################################################
# A script to collect information from a server for some period of time.
# Focused on gathering diagnostic data during a MySQL performance problem.
# Typically executed from the 'stalk' program.
#
# Author: Baron Schwartz
# ########################################################################

# Print a usage message and exit.
#
# Globals:  OPT_ERR (read) - optional error message printed before the help.
# Outputs:  the error message (if any) and the help text, on stdout.
# Returns:  never returns; always terminates the shell with status 1.
usage() {
   if [ "${OPT_ERR}" ]; then
      echo "${OPT_ERR}"
   fi
   # A plain (non-dash) heredoc is used so the text does not depend on tab
   # indentation surviving in the file.  The switches -g, -o, -s and -t each
   # consume a value, and the rest of the script enables the feature only
   # when that value is exactly "yes".
   cat <<USAGE
Usage: $0 OPTIONS [MYSQL-OPTIONS]
   Collects diagnostic data on a MySQL server and stores it into files.
   The MYSQL-OPTIONS are standard options to connect to MySQL: -uhPpS.
   Any options or arguments that follow this tool's own options will
   be treated as options to pass directly to mysql and mysqladmin.
Options: (required: -dfigmos)
   -d DESTINATION Where to store the resulting data; must already exist.
   -f PERCENT     Exit if the disk is more than this percent full.
   -i INTERVAL    How many seconds to collect data.
   -g yes|no      Collect GDB stack traces.
   -m MEGABYTES   Exit unless there are this many megabytes free disk space.
   -o yes|no      Collect oprofile data; disables -s.
   -p PREFIX      Store the data into files with this prefix (optional).
   -s yes|no      Collect strace data.
   -t yes|no      Collect tcpdump data.
USAGE
   exit 1
}

# Make sure the disk isn't getting too full.  Exit if the disk is more than $1
# percent full, or there is less than $2 megabytes of free space on $3 drive.
check_disk_space() {
   # Arguments:
   #   $1 - maximum allowed percent-full (an empty value skips this test)
   #   $2 - minimum required free megabytes (an empty value skips this test)
   #   $3 - path whose filesystem is inspected with df
   # Returns 1, after printing the reason, when a threshold is violated;
   # returns 0 otherwise, including when the usage cannot be determined,
   # because this check is best-effort.
   local max_pct="$1"
   local min_mb="$2"
   local dest="$3"
   local df_fields avail full too_full=""

   # df -P prints one line per filesystem, so the figures are on line 2
   # whatever the device is called (tmpfs, host:/export, /dev/...).
   df_fields=$(df -m -P "${dest}" 2>/dev/null \
      | awk 'NR == 2 { sub(/%/, "", $5); print $4, $5 }')
   avail="${df_fields%% *}"
   full="${df_fields##* }"

   if ! [[ "${avail}" =~ ^[0-9]+$ && "${full}" =~ ^[0-9]+$ ]]; then
      echo "Could not determine disk usage for '${dest}'; skipping the check"
      return 0
   fi

   if [[ "${min_mb}" =~ ^[0-9]+$ ]] && [ "${avail}" -le "${min_mb}" ]; then
      too_full="yes"
   fi
   if [[ "${max_pct}" =~ ^[0-9]+$ ]] && [ "${full}" -ge "${max_pct}" ]; then
      too_full="yes"
   fi

   if [ "${too_full}" ]; then
      echo "Not enough free space (${full}% full, ${avail}MB free)"
      echo "Wanted less than ${max_pct}% full and more than ${min_mb}MB"
      return 1
   fi
   return 0
}

# Parse this tool's own options.  They must come first; parsing stops at "--"
# or at the first argument that is not one of our switches, and everything
# left in "$@" is handed to mysql and mysqladmin untouched.  Consuming "$1"
# directly (rather than iterating over a snapshot of the arguments while
# shifting) keeps the switch and its value in step.
while [ $# -gt 0 ]; do
   case "$1" in
      --)
         shift
         break
         ;;
      --help)
         usage
         ;;
      -d|-f|-i|-g|-m|-o|-p|-s|-t)
         # Every switch takes exactly one value, stored in OPT_<letter>.
         opt_name="OPT_${1#-}"
         shift
         printf -v "${opt_name}" '%s' "${1}"
         shift
         ;;
      *)
         break
         ;;
   esac
done

if [ -z "${OPT_d}" ] || [ -z "${OPT_i}" ] || [ -z "${OPT_o}" ] \
   || [ -z "${OPT_g}" ] || [ -z "${OPT_s}" ]; then
   OPT_ERR="Missing command-line option."
   usage
fi

# All output files share this prefix: either the one given with -p, or a
# timestamp with ':' and '-' turned into '_'.
if [ "${OPT_p}" ]; then
   d="${OPT_p}"
else
   d=$(date +%F-%T | tr :- _)
fi

# Check disk space up-front.
check_disk_space "${OPT_f}" "${OPT_m}" "${OPT_d}" || exit 1

echo "Gathering info for $d"

# Make sure there's only one of me.
(
   flock 200

   # Get pidof mysqld; pidof doesn't exist on some systems.  We try our best...
   p=$(pidof -s mysqld)
   if [ -z "${p}" ]; then
      p=$(pgrep -o -x mysqld)
   fi
   if [ -z "${p}" ]; then
      p=$(ps -eaf | grep 'mysql[d]' | grep -v mysqld_safe \
         | awk '{print $2}' | head -n1)
   fi

   # Get memory allocation info before anything else.
   if [ "${p}" ]; then
      if pmap --help 2>&1 | grep -- -x >/dev/null 2>&1; then
         pmap -x "$p" > "$OPT_d/$d-pmap"
      else
         # Some pmap's apparently don't support -x (issue 116).
         pmap "$p" > "$OPT_d/$d-pmap"
      fi
   fi

   # Getting a GDB stacktrace can be an intensive operation, so do this only if
   # necessary.
   if [ "${OPT_g}" = "yes" ] && [ "${p}" ]; then
      gdb -ex "set pagination 0" -ex "thread apply all bt" --batch -p "$p" \
         >> "$OPT_d/$d-stacktrace"
   else
      echo "GDB (-g) was not enabled" >> "$OPT_d/$d-stacktrace"
   fi

   # Get MySQL's variables if possible.  Then sleep long enough that we probably
   # complete SHOW VARIABLES if all's well.  (We don't want to run mysql in the
   # foreground, because it could hang.)
   mysql "$@" -e 'SHOW GLOBAL VARIABLES' >> "$OPT_d/$d-variables" 2>&1 &
   sleep .2

   # Get the major.minor version number.  Version 3.23 doesn't matter for our
   # purposes, and other releases have x.x.x* version conventions so far.
   VER="$(awk '/^version[^_]/{print substr($2,1,3)}' "$OPT_d/$d-variables")"

   # Pick the mutex-status statement once.  [[ ]] is required here: inside
   # [ ] a bare > is a redirection, which would create a file named "5.1"
   # and merely test that VER is non-empty.
   # NOTE(review): this is a string comparison of the first three characters
   # of the version, so a two-digit major such as "10." sorts below "5.1".
   if [[ "${VER}" > "5.1" ]]; then
      MUTEX_SQL='SHOW ENGINE INNODB MUTEX'
   else
      MUTEX_SQL='SHOW MUTEX STATUS'
   fi

   # Is MySQL logging its errors to a file?  If so, tail that file.  Match the
   # variable name exactly so that names which merely contain "log_error"
   # (e.g. log_error_verbosity) do not add extra lines.
   errfile="$(awk '$1 == "log_error" {print $2}' "$OPT_d/$d-variables")"
   if [ -z "${errfile}" ] && [ "${p}" ]; then
      # Try getting it from the open filehandle...
      errfile="$(ls -l "/proc/${p}/fd" | awk '/ 2 ->/{print $NF}')"
   fi

   if [ "${errfile}" ]; then
      echo "The error file seems to be ${errfile}"
      tail -f "${errfile}" >"$OPT_d/$d-log_error" 2>&1 &
      error_pid=$!
      # Send a mysqladmin debug to the server so we can potentially learn about
      # locking etc.
      mysqladmin debug "$@"
   else
      echo "Could not detect error file; will not tail MySQL's log file"
   fi

   # Get a sample of these right away, so we can get these without interaction
   # with the other commands we're about to run.
   INNOSTAT="SHOW /*!40100 ENGINE*/ INNODB STATUS\G"
   mysql "$@" -e "${INNOSTAT}" >> "$OPT_d/$d-innodbstatus1" 2>&1 &
   mysql "$@" -e 'SHOW FULL PROCESSLIST\G' >> "$OPT_d/$d-processlist1" 2>&1 &
   mysql "$@" -e 'SHOW OPEN TABLES' >> "$OPT_d/$d-opentables1" 2>&1 &
   mysql "$@" -e "${MUTEX_SQL}" >> "$OPT_d/$d-mutex-status1" 2>&1 &

   # If TCP dumping is specified, start that on the server's port.
   if [ "${OPT_t}" = "yes" ]; then
      port=$(awk '$1 == "port" {print $2}' "$OPT_d/$d-variables")
      if [ "${port}" ]; then
         tcpdump -i any -s 4096 -w "$OPT_d/$d-tcpdump" port "${port}" &
         tcpdump_pid=$!
      fi
   fi

   # Next, start oprofile gathering data during the whole rest of this process.
   # The --init should be a no-op if it has already been init-ed.
   if [ "${OPT_o}" = "yes" ]; then
      if opcontrol --init; then
         opcontrol --start --no-vmlinux
      else
         OPT_o="no"
      fi
   elif [ "${OPT_s}" = "yes" ]; then
      # Don't run oprofile and strace at the same time.
      if [ "${p}" ]; then
         # Write next to the other samples; OPT_d is the destination directory.
         strace -T -s 0 -f -p "$p" > "$OPT_d/$d-strace" 2>&1 &
         strace_pid=$!
      else
         echo "Could not find a mysqld process; strace (-s) was not started"
      fi
   fi

   # Grab a few general things first.  Background all of these so we can start
   # them all up as quickly as possible.  We use mysqladmin -c even though it is
   # buggy and won't stop on its own in 5.1 and newer, because there is a chance
   # that we will get and keep a connection to the database; in troubled times
   # the database tends to exceed max_connections, so reconnecting in the loop
   # tends not to work very well.
   ps -eaf                      >> "$OPT_d/$d-ps"             2>&1 &
   sysctl -a                    >> "$OPT_d/$d-sysctl"         2>&1 &
   top -bn1                     >> "$OPT_d/$d-top"            2>&1 &
   vmstat 1 $OPT_i              >> "$OPT_d/$d-vmstat"         2>&1 &
   vmstat $OPT_i 2              >> "$OPT_d/$d-vmstat-overall" 2>&1 &
   iostat -dx 1 $OPT_i          >> "$OPT_d/$d-iostat"         2>&1 &
   iostat -dx $OPT_i 2          >> "$OPT_d/$d-iostat-overall" 2>&1 &
   mpstat -P ALL 1 $OPT_i       >> "$OPT_d/$d-mpstat"         2>&1 &
   mpstat -P ALL $OPT_i 1       >> "$OPT_d/$d-mpstat-overall" 2>&1 &
   lsof -nP -p $p -bw           >> "$OPT_d/$d-lsof"           2>&1 &
   mysqladmin "$@" ext -i1 -c$OPT_i >> "$OPT_d/$d-mysqladmin" 2>&1 &
   mysqladmin_pid=$!

   # This loop gathers data for the rest of the duration, and defines the time
   # of the whole job.
   echo "Loop start: $(date +'TS %s.%N %F %T')"
   for a in $(seq 1 $OPT_i); do
      # We check the disk, but don't exit, because we need to stop jobs if we
      # need to exit.
      check_disk_space "${OPT_f}" "${OPT_m}" "${OPT_d}" || break

      # Synchronize ourselves onto the clock tick, so the sleeps are 1-second
      sleep $(date +%s.%N | awk '{print 1 - ($1 % 1)}')
      ts="$(date +"TS %s.%N %F %T")"

      # Collect the stuff for this cycle
      (cat /proc/diskstats  2>&1; echo "$ts") >> "$OPT_d/$d-diskstats"  &
      (cat /proc/stat       2>&1; echo "$ts") >> "$OPT_d/$d-procstat"   &
      (cat /proc/vmstat     2>&1; echo "$ts") >> "$OPT_d/$d-procvmstat" &
      (cat /proc/meminfo    2>&1; echo "$ts") >> "$OPT_d/$d-meminfo"    &
      (cat /proc/slabinfo   2>&1; echo "$ts") >> "$OPT_d/$d-slabinfo"   &
      (cat /proc/interrupts 2>&1; echo "$ts") >> "$OPT_d/$d-interrupts" &
      (df -h                2>&1; echo "$ts") >> "$OPT_d/$d-df"         &
      (netstat -antp        2>&1; echo "$ts") >> "$OPT_d/$d-netstat"    &
      (netstat -s           2>&1; echo "$ts") >> "$OPT_d/$d-netstat_s"  &
   done
   echo "Loop end: $(date +'TS %s.%N %F %T')"

   if [ "${OPT_o}" = "yes" ]; then
      opcontrol --stop
      opcontrol --dump
      kill $(pidof oprofiled)
      opcontrol --save=aspersa_collect_$d

      # Attempt to generate a report; if this fails, then just tell the user how
      # to generate the report.
      path_to_binary=$(command -v mysqld)
      if [ "${path_to_binary}" ] && [ -f "${path_to_binary}" ]; then
         opreport --demangle=smart --symbols --merge tgid \
            session:aspersa_collect_$d "${path_to_binary}" \
            > "$OPT_d/$d-opreport"
      else
         echo "oprofile data saved to aspersa_collect_$d; you should now be able to get a report" > "$OPT_d/$d-opreport"
         echo "by running something like" >> "$OPT_d/$d-opreport"
         echo "opreport --demangle=smart --symbols --merge tgid session:aspersa_collect_$d /path/to/mysqld" >> "$OPT_d/$d-opreport"
      fi
   elif [ "${OPT_s}" = "yes" ] && [ "${strace_pid}" ]; then
      # Signal names instead of numbers: the numeric value of SIGCONT is not
      # the same on every platform.
      kill -s INT "${strace_pid}"
      sleep 1
      kill -s TERM "${strace_pid}"
      # Sometimes strace leaves threads/processes in T status.
      kill -s CONT "$p"
   fi

   # Take the second sample of the same statements now that the run is over.
   mysql "$@" -e "${INNOSTAT}" >> "$OPT_d/$d-innodbstatus2" 2>&1 &
   mysql "$@" -e 'SHOW FULL PROCESSLIST\G' >> "$OPT_d/$d-processlist2" 2>&1 &
   mysql "$@" -e 'SHOW OPEN TABLES' >> "$OPT_d/$d-opentables2" 2>&1 &
   mysql "$@" -e "${MUTEX_SQL}" >> "$OPT_d/$d-mutex-status2" 2>&1 &

   # Kill backgrounded tasks.
   kill $mysqladmin_pid
   [ "$error_pid" ] && kill $error_pid
   [ "$tcpdump_pid" ] && kill $tcpdump_pid

   # Finally, record what system we collected this data from.
   hostname > "$OPT_d/$d-hostname"
)200>/tmp/aspersa-collect-lockfile >> "$OPT_d/$d-output" 2>&1