#!/bin/bash
######################################################################################
# Copyright 2022, ThinkParQ GmbH. All rights reserved
# THE SCRIPT IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# This script collects logs from a BeeGFS server/client for troubleshooting by ThinkParQ.
#
# Last update 23.02.2022, Alexander Lutz, ThinkParQ.
#####################################################################################
##############################
# set up and export variables
##############################
# Script release; written into the header of the command log.
version="1.1.5"
# Host name as reported by hostname(1); used in file names and user messages.
hostname="$(hostname)"
# Extra arguments for the BeeGFS tools, filled in by the option parsing:
#   parameter  -> appended to the beegfs-ctl calls (--cfgFile=... or --mount=...)
#   parameter1 -> appended to beegfs-net (-n)
#   parameter2 -> appended to beegfs-df / beegfs-check-servers (-c ... or -p ...)
parameter=""
parameter1=""
parameter2=""
# "0" = normal run; the disclaimer prompt switches it to "1" for a dry run.
dryrun="0"
# "0" = ask the disclaimer question; option -y sets it to "y" to skip the prompt.
disclaimer="0"
# Receives the md5sum output of the finished archive.
mdsum=""
# Name of the collection directory, relative to /tmp:
# beegfs-support-<host>-<ddmmYYYYHHMM>
pathname="beegfs-support-${hostname}-$(date +%d%m%Y%H%M)"
# Absolute locations derived from the name above.
work_dir="/tmp/${pathname}"
tar_ball="${work_dir}.tar"
compressed="${tar_ball}.xz"
# File that receives the output of every executed command.
command_output="${work_dir}/commands.out"
# Timestamp format for shell history entries, exported to the environment.
HISTTIMEFORMAT="%d/%m/%y %T "
export HISTTIMEFORMAT
#############################
# Help
#############################
# Print the usage text to stderr.
# The text starts and ends with an empty line. A quoted here-document is used
# so that the double quotes around "df" and "yes" are printed literally; inside
# a double-quoted echo argument they ended the string and were dropped.
function print_help {
  cat >&2 <<'EOF'

Usage: beegfs-collect-log [< -h | -p | -m | -n | -y >]
-h Print this help message
-p <path> Full path to the client configuration file (also for multimount clients), e.g. /etc/beegfs/beegfs-client.conf
-m <mountpoint> BeeGFS mount point (also for multimount clients), e.g. /mnt/beegfs
-n No "df" running beegfs-net... To avoid the script hanging due to unreachable storage servers.
-y Disclaimer default "yes"... In order to disable the Disclaimer for cluster commands like mpi or pdsh.

EOF
}
###################################
# Check if tar and xz are available
###################################
# Both tools are needed at the end of the run to build the archive, so stop
# before anything is collected when one of them is missing.
# The tool name is printed inside escaped quotes; the unescaped inner quotes
# used before were swallowed by the shell and never reached the message.
for required_tool in tar xz; do
  if ! type "$required_tool" &> /dev/null; then
    echo -e "\nERROR: Command \"$required_tool\" not found, please install it.\n" >&2
    print_help ; exit 1
  fi
done
##############################
# parse command line options
##############################
# parse_options ARG...
# Evaluates the script options and stores the result in global variables:
#   -h         print the help text and exit
#   -p <path>  client configuration file; must exist, otherwise exit 1.
#              Sets parameter="--cfgFile=<path>" and parameter2="-c <path>"
#   -m <dir>   mount point; must be a directory, otherwise exit 1.
#              Sets parameter="--mount=<dir>" and parameter2="-p <dir>"
#   -n         sets parameter1="-n"
#   -y         sets disclaimer="y" (skips the disclaimer question)
# A missing option argument or an unknown option prints the help and exits 1.
# When both -p and -m are given, the one given last wins, because both write
# the same two variables.
function parse_options {
  # Local getopts state, so the function can be called more than once.
  local opt OPTARG OPTIND=1
  # Leading ":" = silent mode: getopts reports errors via ":" and "?" instead
  # of printing its own messages.
  while getopts ":hp:m:ny" opt ; do
    case "$opt" in
      h) print_help ; exit
         ;;
      p) if ! [[ -e "$OPTARG" ]] ; then
           echo "ERROR: the beegfs-client configuration file \"$OPTARG\" doesn't exist on host: $hostname." >&2
           exit 1
         fi
         parameter="--cfgFile=$OPTARG"
         parameter2="-c $OPTARG"
         ;;
      m) if ! [[ -d "$OPTARG" ]] ; then
           echo "ERROR: the mount point \"$OPTARG\" doesn't exist on host: $hostname." >&2
           exit 1
         fi
         parameter="--mount=$OPTARG"
         parameter2="-p $OPTARG"
         ;;
      n) printf '%s\n' "beegfs-net -n will be executed on host: $hostname." >&2
         parameter1="-n"
         ;;
      y) printf '%s\n' "No Disclaimer question on host: $hostname." >&2
         disclaimer="y"
         ;;
      :) echo "ERROR: option \"-$OPTARG\" requires an argument." >&2
         print_help ; exit 1
         ;;
      *) echo "ERROR: unknown option: \"-$OPTARG\"." >&2
         print_help ; exit 1
         ;;
    esac
  done
}
parse_options "$@"
######################################
# Respond to Disclaimer Y/N or Dry run
######################################
# confirm_disclaimer
# Asks whether the collection may run and repeats the question until the
# answer starts with y/Y, n/N or d/D:
#   y -> continue normally
#   n -> print the help text and exit 1
#   d -> dry run: sets the global dryrun="1" and continues
# If no answer can be read at all (stdin closed or at end of file) the function
# exits 1 instead of asking again forever; non-interactive callers should pass
# -y to skip the question.
function confirm_disclaimer {
  local answer=""
  while true; do
    printf "\n\n\n"
    # read fails at end of input; an empty answer together with a failed read
    # means there is nothing more to read, so asking again cannot succeed.
    if ! read -r -p " Please be aware that some of the files collected could contain sensitive informations like IP's, timestamps or Group/User names! Do you wish to run this script y/n or d for dry run?" answer && [[ -z "$answer" ]]; then
      echo -e "\nERROR: no answer received (end of input). Use option -y to skip the Disclaimer in non-interactive runs." >&2
      exit 1
    fi
    case "$answer" in
      [Yy]* ) echo -e "\nBeeGFS script beegfs-collect-log.sh running on host:$hostname.\n"; break;;
      [Nn]* ) print_help; echo -e "\nBy By, no log set created"; exit 1;;
      [Dd]* ) dryrun="1"
              echo -e "\nDry run...\n\nDisplay and execute the commands and files on host: $hostname and delete the created log file $pathname.tar.xz afterwards.\n"
              echo -e "\nAll commands and files which are not allowed to be part of the log set please comment with # in the script beegfs-collect-log.sh\n"; break;;
      * ) echo "Please answer yes (y|Y)or no (n|N) or (d|D) Dry run and press return."
          print_help;;
    esac
  done
}
# disclaimer stays "0" unless option -y was given.
if [[ "${disclaimer:-}" == "0" ]]; then
  confirm_disclaimer
fi
############################################
# Creating additional config & logfiles
############################################
# Writes up to three helper files into /tmp (kernel messages, InfiniBand
# topology, client running configuration). They match the /tmp/bee-* entry of
# the paths list and are removed again by the final cleanup.
# NOTE(review): the file names in /tmp are predictable; confirm this is
# acceptable on multi-user hosts.
printf "\nCreating additional logfiles...\n"
log_host=$(hostname)
dmesg -T > "/tmp/bee-dmesg-T-$log_host"
ibnetdiscover -f > "/tmp/bee-ibnetdiscover-$log_host"
# The configuration file given with -p is recovered from $parameter
# ("--cfgFile=<path>"). $OPTARG cannot be used here: getopts unsets it once
# the end of the options is reached, so a test on it is always false.
client_cfg=""
if [[ "${parameter:-}" == --cfgFile=* ]]; then
  client_cfg="${parameter#--cfgFile=}"
fi
# The running client configuration is dumped when either the user-supplied or
# the default client configuration file exists.
if [[ -n "$client_cfg" && -e "$client_cfg" ]] || [[ -e /etc/beegfs/beegfs-client.conf ]] ; then
  cat /proc/fs/beegfs/*/* > "/tmp/bee-client-runningconfig-$log_host"
  printf '\n/tmp/bee-client-runningconfig-%s' "$log_host"
else
  printf '\nNo BeeGFS-client configuration file found on: %s' "$hostname"
fi
printf '\n/tmp/bee-dmesg-T-%s\n' "$log_host"
printf '\n/tmp/bee-ibnetdiscover-%s\n' "$log_host"
############################
# BeeGFS commands to execute
############################
# commands1 holds command lines as strings; they are run later through eval.
# $parameter, $parameter1 and $parameter2 are expanded here, at definition
# time. Tool detection uses the builtin "command -v" rather than the external
# "which", which is not installed on every system and would make the whole
# list be skipped silently.
commands1=( )
if command -v beegfs-ctl &> /dev/null ; then
  commands1+=(
    "cat /proc/fs/beegfs/*/log_levels"
    "beegfs-df $parameter2"
    "beegfs-net $parameter1"
    "beegfs-check-servers $parameter2"
    "beegfs-ctl --listtargets --nodetype=meta --state --spaceinfo --longnodes --pools $parameter"
    "beegfs-ctl --listtargets --nodetype=storage --state --spaceinfo --longnodes --pools $parameter"
    "beegfs-ctl --liststoragepools $parameter"
    "beegfs-ctl --listmirrorgroups --nodetype=metadata $parameter"
    "beegfs-ctl --listmirrorgroups --nodetype=storage $parameter"
    "beegfs-ctl --listnodes --nodetype=management --nicdetails --route $parameter"
    "beegfs-ctl --listnodes --nodetype=metadata --nicdetails --route $parameter"
    "beegfs-ctl --listnodes --nodetype=storage --nicdetails --route $parameter"
    "beegfs-ctl --listnodes --ping --nodetype=metadata --pingretries=10 $parameter"
    "beegfs-ctl --listnodes --ping --nodetype=storage --pingretries=10 $parameter"
    "beegfs-ctl --getquota --defaultlimits $parameter"
  )
fi
############################
# FHGFS commands to execute
############################
# Appended to the same list when the fhgfs-ctl tool is installed.
if command -v fhgfs-ctl &> /dev/null ; then
  commands1+=(
    "fhgfs-df"
    "fhgfs-net"
    "fhgfs-check-servers"
    "fhgfs-ctl --listnodes --details --nodetype=metadata"
  )
fi
############################
# System commands to execute
############################
# Each entry is one complete command line kept as a single string. The list is
# executed later through eval, which is why an entry may contain a pipe or a
# glob pattern.
commands2=(
  # uptime, kernel and limits
  "uptime -s"
  "uname -a"
  "ulimit -a"
  # block devices, file systems and I/O
  "lsblk"
  "ls -lisa /sys/block/*"
  "fdisk -l"
  "df -h"
  "df -ih"
  "mount"
  "showmount --all"
  "iostat -xtc"
  # ZFS
  "zpool status -v"
  "zdb"
  "zfs list"
  "zpool get all"
  "zfs get all"
  # network
  "netstat -lptu"
  "ifconfig"
  "ip a"
  "ip route"
  "ip rule"
  "ip route show table all"
  "route -n"
  # fabric / RDMA
  "ibstat"
  "ibswitches"
  "rdma -d dev show"
  "rdma -d link show"
  "rdma sys show"
  "rdma res show"
  "rdma stat show"
  "rdma stat qp show"
  "rdma stat qp mode"
  "opainfo"
  # processes, services and cluster tools
  "ps -axuf"
  "service --status-all"
  "systemctl --all --state=failed"
  "crm configure show xml"
  "crm status"
  "pcs config show"
  "pcs status"
  # CPU and memory
  "cpu-info"
  "numactl -s"
  "mpstat -P ALL"
  "free"
  "sysctl vm.max_map_count"
  "vmstat -a"
  "cat /proc/pagetypeinfo"
  "cat /proc/meminfo"
  # kernel modules, crash dumps and installed packages
  "lsmod"
  "ls -lisa /var/crash"
  "dpkg -l | grep ^ii"
  "yum list installed"
)
# Files and directories copied into the support archive.
# Entries with a wildcard are expanded by the shell right here, when the array
# is assigned; a pattern without a match stays in the list as literal text and
# is skipped later by the existence test of the copy loop.
paths=(
  # configuration below /etc
  /etc/apt/sources.list
  /etc/beegfs
  /etc/corosync
  /etc/exports
  /etc/fhgfs
  /etc/fstab
  /etc/grafana
  /etc/hosts
  /etc/influxdb
  /etc/iproute2/rt_tables
  /etc/issue
  /etc/pacemaker
  /etc/rc.d/rc.local
  /etc/rdma
  /etc/network*
  /etc/security
  /etc/selinux/config
  /etc/sysconfig/network*
  /etc/yum.repos.d
  /etc/os-release
  /etc/sysctl.conf
  # kernel information from /proc and /sys, plus root's shell history
  /proc/cpuinfo
  /proc/meminfo
  /proc/sys/vm
  /root/.bash_history
  /sys/block/*/queue/max_sectors_kb
  /sys/block/*/queue/nr_requests
  /sys/block/*/queue/read_ahead_kb
  /sys/block/*/queue/scheduler
  /sys/kernel/mm/transparent_hugepage
  # log files
  /var/log/beegfs*
  /var/log/cluster
  /var/log/dmesg*
  /var/log/fhgfs*
  /var/log/grafana
  /var/log/influxdb
  /var/log/messages*
  /var/log/sa
  /var/log/syslog*
  /var/log/sysstat/
  /var/log/yum*
  # helper files written to /tmp earlier in this script
  /tmp/bee-*
)
printf "\nCollecting support data from host: %s\n" "$hostname"
############################################
# clean & setup local environment
############################################
# Remove what is left of an earlier run with the same name (same host, same
# minute) and create a fresh working directory.
printf "\nClean & setup local environment...\n"
rm -rf -- "$work_dir" "$tar_ball" "$compressed"
# Every later step writes below $work_dir, so stop here if it cannot be created.
if ! mkdir -- "$work_dir"; then
  echo "ERROR: cannot create working directory \"$work_dir\" on host: $hostname." >&2
  exit 1
fi
############################################
# running commands on local host
############################################
# Start the command log with a short header: script version, host and date.
printf "\nCreating logfile...\n"
{
  printf "Script version: %s\n" "$version"
  printf "Host: %s\n" "$hostname"
  printf "Date: %s\n\n" "$(date)"
} >> "$command_output"
###########################################################################################
# Per-process checks for all running beegfs-storage and beegfs-meta services
###########################################################################################
# Three passes over the running services, each appending to the command log:
#   memory     - number of lines printed by "pmap -q" (allocated memory regions)
#   infiniband - number of entries below /proc/<pid>/fd whose listing contains "inf"
#   limits     - the "open" line(s) of /proc/<pid>/limits (FD limit)
# The checks form the outer loop so that the log keeps all results of one check
# together, storage before meta.
running_services=0
for check in memory infiniband limits ; do
  for service in beegfs-{storage,meta} ; do
    # pidof may return several PIDs separated by blanks; word splitting is intended.
    for pid in $(pidof "/opt/beegfs/sbin/$service") ; do
      case "$check" in
        memory)
          echo -e "\nAllocated memory regions for $service [PID $pid]:" >> "$command_output"
          pmap -q "$pid" | wc -l >> "$command_output"
          ;;
        infiniband)
          echo -e "\nOpen InfiniBand file descriptors for $service [PID $pid]:" >> "$command_output"
          ls -lahR /proc/"$pid"/fd/* | grep -i inf | wc -l >> "$command_output"
          ;;
        limits)
          echo -e "\nmax file descriptors for $service [PID $pid]:" >> "$command_output"
          grep open "/proc/$pid/limits" >> "$command_output"
          ;;
      esac
      running_services=1
    done
  done
done
# Leave a note in the log when no service process was found at all.
if (( ! running_services )) ; then
  echo -e "\nNo running BeeGFS storage or meta service found." >> "$command_output"
fi
###############################################
# running BeeGFS/FHGFS and Linux System commands
###############################################
# run_command_set HEADING [COMMAND...]
# Prints HEADING, then for every COMMAND:
#   - shows the command line on stdout,
#   - appends a separator and the command line to $command_output,
#   - runs the command through eval (entries may contain pipes and globs) and
#     appends its stdout and stderr to $command_output.
# Command text is always passed to printf as an argument, never as the format
# string, and is handed to eval as one quoted string so that it is parsed
# exactly once. A failing command does not stop the run.
function run_command_set {
  local heading=$1
  local cmd
  shift
  printf '\n\n%s\n\n\n' "$heading"
  for cmd in "$@" ; do
    printf ' %s\n' "$cmd"
    printf '\n\n-------------------------\n%s\n\n' "$cmd" >> "$command_output"
    eval "$cmd" &>> "$command_output"
  done
}
run_command_set "--------running BeeGFS/FHGFS commands----------" "${commands1[@]}"
run_command_set "--------running Linux System commands-----------" "${commands2[@]}"
#############################################################################
# Copy every existing entry of the paths list into the collection directory.
#
# The chmod is needed to fix the permissions of the parent directories that
# "cp --parents" creates (--no-preserve=mode only affects the final member of
# the path, not its parents). It is repeated after every copy so that the
# permissions taken over from one copied path cannot block the copy of another
# path that shares some parent directories.
##############################################################################
printf "\n\n--------Collecting files-----------\n%s\n\n"
# $pathname is relative, so the copies end up in /tmp/$pathname.
cd /tmp
for source_path in "${paths[@]}" ; do
  # Skip list entries that do not exist on this host (this includes wildcard
  # patterns that matched nothing).
  [[ -e "$source_path" ]] || continue
  printf " %s\n" "$source_path"
  # Best effort: copy errors (e.g. unreadable files) are discarded on purpose.
  cp -r --parents "$source_path" "$pathname" &> /dev/null
  chmod -R u+rw "$pathname"
done
#############################################
# creating tar archive, compress and cleanup
#############################################
# $pathname is relative: the archive is built from the current directory
# (/tmp after the file collection), so the archive members start with the
# collection directory name instead of a full path.
printf "\nCompressing files on host %s...\n" "$hostname"
tar --mode="u+rw" -cf "$tar_ball" "$pathname"
# xz replaces the tar file with <tar file>.xz, i.e. $compressed.
xz -9 "$tar_ball"
printf "\nGenerated file "
ls "$compressed"
###############################################
# calculating md5sum from archive & print size
###############################################
# "command -v" is a shell builtin and therefore always available, unlike the
# external "which".
if command -v md5sum &> /dev/null ; then
  printf "\nCalculating md5sum...\n\n"
  mdsum=$(md5sum "$compressed")
  # Printed exactly as md5sum reports it (checksum, two blanks, file name).
  printf '\nmd5sum: %s\n' "$mdsum"
else
  printf "\nmd5sum not installed!\n\n"
fi
printf "\nsize: "
du -h "$compressed"
################################################
# dry run -> delete created archive afterwards
################################################
# Only a dry run (dryrun set to "1" by the disclaimer prompt) has work to do
# here: the archive that was just created is deleted again and the user is
# reminded how to exclude commands or files. A normal run keeps the archive.
# The test is written without an empty "then" branch, which bash rejects as a
# syntax error.
if [[ "${dryrun:-0}" != "0" ]]; then
  echo -e "\nDry run - deleting Thinkparq Support Logfile: $compressed"
  rm -rf -- "$compressed"
  echo -e "\n********************************************************************************************************************************************************************************************\n"
  echo -e "All commands and files which are not allowed to be part of the log set $compressed please comment with # in the script beegfs-collect-log.sh."
  echo -e "\n Thank you"
  echo -e "\n********************************************************************************************************************************************************************************************\n"
  print_help
fi
#########################
# final cleanup in /tmp/
#########################
# Removes every /tmp/bee-* entry (the helper files written at the start of the
# run match this pattern) and the working directory. The compressed archive is
# not touched here.
printf "\nClean up temporary files and folders on host %s... Done!\n" "$hostname"
echo ""
rm -rf /tmp/bee-*
rm -rf -- "$work_dir"