Skip to content

Instantly share code, notes, and snippets.

@oleewere
Last active December 8, 2021 13:54
Show Gist options
  • Save oleewere/9948250bf3ef4e87bfd89092f6334ebc to your computer and use it in GitHub Desktop.
cop_logging_agent_check.sh
#!/bin/bash
# cdp-logging-agent doctor script: health-checks the logging agent on minion
# nodes, dumps diagnostics to cloud storage and distributes itself via salt.
# NOTE: shebang changed from /bin/sh to /bin/bash - the script relies on
# bashisms ([[ ]], the 'function' keyword, 'local') that break under dash.
: ${LOGFILE_FOLDER:="/var/log/cdp-telemetry-logging-doctor"}
MINION_SCRIPTS_FOLDER="/opt/salt/scripts"
SALT_CMD_TIMEOUT=120
LIVENESS_THRESHOLD_SECONDS=86400 # 1 day in seconds
BUFFER_LIMIT_BYTES=10485760 # 10 MB in bytes
# Bump VERSION to force redistribution of this script to minions on the next
# weekly 'distribute' run (minions reporting a lower version get reinstalled).
# todo: find better solution - pass by cli option?
VERSION=1
readlinkf(){
perl -MCwd -e 'print Cwd::abs_path shift' "$1";
}
# Determine this script's absolute path: GNU readlink -f on Linux,
# the perl-based readlinkf fallback elsewhere.
case "$(uname -s)" in
  Linux) SCRIPT_LOCATION=$(readlink -f "$0") ;;
  *)     SCRIPT_LOCATION=$(readlinkf "$0") ;;
esac
function print_help() {
cat << EOF
Usage: [<command>]
commands:
distribute distribute the script itself on minion nodes (by salt)
install create required crontabs on minion nodes (locally)
doctor run cdp-logging-agent checks
dump distributed call that dump details about worker processes for troubleshooting
dump-local dump details about local worker processes for troubleshooting
version print version of the script
test test doctor commands (from master on minions)
help print usage
EOF
}
function do_exit() {
  # Log the exit code (plus an optional status message) and terminate
  # the whole script with that code.
  local code=$1
  local message=$2
  if [[ -n "$message" ]]; then
    info "Exit code: $code, --- STATUS MESSAGE --- $message --- STATUS MESSAGE ---"
  else
    info "Exit code: $code"
  fi
  exit $code
}
function init_logfile() {
  # Create a fresh timestamped log file (mode 600, owner-only) under
  # $LOGFILE_FOLDER, prune older doctor logs, and announce the path.
  # Sets the global LOGFILE used by log()/run_command().
  # Fix: all path expansions quoted so folders with spaces don't word-split.
  mkdir -p "$LOGFILE_FOLDER"
  local timestamp=$(date +"%Y%m%d-%H%M%S")
  LOGFILE="$LOGFILE_FOLDER/cdp-logging-agent-doctor-${timestamp}.log"
  touch "$LOGFILE"
  chmod 600 "$LOGFILE"
  cleanup_old_logs
  info "The following log file will be used: $LOGFILE"
}
function init_salt_prefix() {
  # Locate the salt bin directory (e.g. /opt/salt_3000.2/bin) and store it
  # in the global SALT_BIN_PREFIX; fall back to a glob pattern if not found.
  # Fix: head -n 1 guards against multiple salt_* dirs producing a
  # multi-line (and thus broken) SALT_BIN_PREFIX; find errors are silenced.
  SALT_BIN_PREFIX=$(find /opt -maxdepth 1 -type d -iname "salt_*" 2>/dev/null | head -n 1 | xargs -I{} echo "{}/bin")
  if [[ "$SALT_BIN_PREFIX" == "" ]]; then
    SALT_BIN_PREFIX="/opt/salt_*/bin"
  fi
}
function cleanup_old_logs() {
  # Keep only the 5 newest doctor log files in $LOGFILE_FOLDER.
  # ls -1tr lists oldest-first; head -n -5 (GNU) drops the 5 newest from the
  # deletion list; xargs --no-run-if-empty (GNU) skips rm when nothing is old.
  # Fix: quoted folder var; 2>/dev/null silences ls when no logs exist yet;
  # rm -f -- protects against odd filenames and missing files.
  ls -1tr "$LOGFILE_FOLDER"/cdp-logging-agent-doctor*.log 2>/dev/null | head -n -5 | xargs --no-run-if-empty rm -f --
}
function info() {
  # Log a message to the logfile AND echo it to stdout.
  log "${1}"
}
function debug() {
  # Log a message to the logfile only (the non-empty second argument
  # suppresses the stdout echo in log()).
  log "${1}" "true"
}
function log() {
  # Append a timestamped line to $LOGFILE; additionally echo the raw
  # message to stdout unless the second (debug-flag) argument is non-empty.
  # Fix: test the declared local 'debug' instead of re-reading $2 (the local
  # was previously declared but never used); quote $LOGFILE.
  local timestamp=$(date +"%Y-%m-%dT%H:%M:%S.%3N%z")
  local debug=$2
  echo "$timestamp $1" >> "$LOGFILE"
  if [[ "$debug" == "" ]]; then
    echo "$1"
  fi
}
function run_command() {
# Execute a command string, appending its stdout and stderr to $LOGFILE.
# NOTE(review): uses eval on an interpolated string - callers must pass only
# trusted, internally-built command lines (all call sites here build salt
# invocations from script-controlled variables).
local cmd=${1:?"usage: <command>"}
debug "The following command will be executed: $1"
eval $1 >> $LOGFILE 2>&1
}
function install() {
  # Register the daily doctor cron job (01:00, as root) and exit.
  log "Installing cdp_logging_agent_doctor to cron.d ..."
  # Fix: overwrite (>) instead of append (>>) so repeated installs/upgrades
  # stay idempotent rather than accumulating duplicate cron entries.
  cat <<EOF >/etc/cron.d/cdp_logging_agent_doctor
0 1 * * * root sh /opt/salt/scripts/cdp_logging_agent_check.sh doctor
EOF
  chmod 600 /etc/cron.d/cdp_logging_agent_doctor
  do_exit 0 "INSTALLATION FINISHED"
}
function get_fluentd_config_val() {
  # Extract the value of a fluentd config directive: first line matching
  # <config_key>, leading whitespace stripped, second space-separated field.
  # Fix: quote $config_key (previously unquoted - subject to word-splitting
  # and glob expansion) and use -- so keys can't be parsed as grep options.
  local config_content=${1:?"usage: <config_content>"}
  local config_key=${2:?"usage: <config_key>"}
  echo "$config_content" | grep -- "$config_key" | head -n 1 | sed -e 's/^[[:space:]]*//' | cut -f2 -d' '
}
function get_base_path() {
  # Return the static prefix of a fluentd time-templated storage path:
  # strip any double quotes, then cut at the first "/%Y" placeholder.
  # Implemented with parameter expansion instead of a tr|awk pipeline.
  local full_ccloud_storage_path=${1:?"usage: <full_ccloud_storage_path>"}
  local unquoted=${full_ccloud_storage_path//\"/}
  echo "${unquoted%%"/%Y"*}"
}
function upload_to_cloud_storage() {
# Upload a dump archive to whichever cloud storage sink (S3, ABFS or GCS)
# the logging agent's fluentd output config declares; skip silently when no
# config or no recognized sink type is present.
local dump_file_name=${1:?"usage: <dump_file_name>"}
local dump_base_filename=$(basename -- "$dump_file_name")
if [[ ! -f /etc/cdp-logging-agent/output.conf ]]; then
info "Logging agent config does not exist. Skip uploading data to cloud storage."
return
fi
# Dumps land under <base_path>/dumps/<YYYY-MM-DD>/ beside the shipped logs.
local timestamp_dump=$(date +"%Y-%m-%d")
local dump_suffix="dumps/${timestamp_dump}"
local fluent_out_config=$(cat /etc/cdp-logging-agent/output.conf)
# Detect the configured sink from the fluentd "@type" directives; exactly
# one branch below is taken based on which marker line is present.
local s3_type=$(echo "$fluent_out_config" | grep "@type s3")
local azure_type=$(echo "$fluent_out_config" | grep "@type azurestorage_gen2")
local gcs_type=$(echo "$fluent_out_config" | grep "@type gcs")
if [[ "$s3_type" != "" ]]; then
info "Uploading dump to s3 ..."
local s3_bucket=$(get_fluentd_config_val "$fluent_out_config" "s3_bucket")
# The path-pattern line is located by its %Y-%m-%d time placeholder.
local s3_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
local s3_base_path=$(get_base_path "$s3_path_pattern")
local region=$(get_fluentd_config_val "$fluent_out_config" "region")
info "Detected S3 configs: Bucket=$s3_bucket, BasePath=$s3_base_path, Region=$region"
local target_location="$s3_base_path/$dump_suffix/"
local additional_params=""
if [[ "$region" != "" ]]; then
additional_params="--region $region"
fi
# First attempt uses server-side encryption (-e); on failure, retry plain.
cdp-telemetry storage s3 upload -e --bucket "$s3_bucket" --file "$dump_file_name" $additional_params --location "$target_location"
local s3_upload_result="$?"
if [[ "$s3_upload_result" == "0" ]]; then
info "S3 upload COMPLETED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
else
info "S3 upload failed with encryption, try without that parameter."
cdp-telemetry storage s3 upload --bucket "$s3_bucket" --file "$dump_file_name" $additional_params --location "$target_location"
local s3_upload_second_result="$?"
if [[ "$s3_upload_second_result" == "0" ]]; then
info "S3 upload COMPLETED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
else
info "S3 upload FAILED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
fi
fi
elif [[ "$azure_type" != "" ]]; then
# ABFS (ADLS Gen2) sink: account + container come from the fluentd config.
local azure_storage_account=$(get_fluentd_config_val "$fluent_out_config" "azure_storage_account")
local azure_container=$(get_fluentd_config_val "$fluent_out_config" "azure_container")
local azure_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
local azure_base_path=$(get_base_path "$azure_path_pattern")
local target_location="$azure_base_path/$dump_suffix/"
info "Detected ABFS configs: Account=$azure_storage_account, Container=$azure_container, BasePath=$azure_base_path"
cdp-telemetry storage abfs --file "$dump_file_name" --location "${target_location}" --account "${azure_storage_account}" --container "${azure_container}"
local abfs_upload_result="$?"
if [[ "$abfs_upload_result" == "0" ]]; then
info "ABFS upload COMPLETED: Account=$azure_storage_account, Container=$azure_container, Path=${target_location}${dump_base_filename}"
else
info "ABFS upload FAILED: Account=$azure_storage_account, Container=$azure_container, Path=${target_location}${dump_base_filename}"
fi
elif [[ "$gcs_type" != "" ]]; then
# GCS sink: only bucket + path pattern are needed.
local gcs_bucket=$(get_fluentd_config_val "$fluent_out_config" "bucket")
local gcs_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
local gcs_base_path=$(get_base_path "$gcs_path_pattern")
local target_location="$gcs_base_path/$dump_suffix/"
info "Detected GCS configs: Bucket=$gcs_bucket, BasePath=${gcs_base_path}"
cdp-telemetry storage gcs upload --bucket "$gcs_bucket" --file "$dump_file_name" --location "${target_location}"
local gcs_upload_result="$?"
if [[ "$gcs_upload_result" == "0" ]]; then
info "GCS upload COMPLETED: Bucket=$gcs_bucket, Path=${target_location}${dump_base_filename}"
else
info "GCS upload FAILED: Bucket=$gcs_bucket, Path=${target_location}${dump_base_filename}"
fi
else
info "No configured cloud storage log shipping is detected. Skip uploading dump."
fi
}
function create_dump_file_and_upload() {
  # Compress the dump folder under /tmp into <dump_name>.tar.gz, prune old
  # dump archives (keep the 3 newest), upload the archive, then exit 0.
  local dump_name=${1:?"usage: <dump_name>"}
  info "Compress /tmp/${dump_name}.tar.gz file ..."
  # Subshell keeps the cd local; the source folder is removed after packing.
  ( cd /tmp && tar -czvf "${dump_name}.tar.gz" "${dump_name}" && rm -r "${dump_name}" )
  info "Compression complete. Cleanup old dump collections ..."
  # Fix: silence ls when no previous dumps exist; rm -f -- for safety.
  ls -1tr /tmp/cdp-logging-dump*.tar.gz 2>/dev/null | head -n -3 | xargs --no-run-if-empty rm -f --
  upload_to_cloud_storage "/tmp/${dump_name}.tar.gz"
  # Fix: corrected typo in status message (was "FINSHED").
  do_exit 0 "LOCAL DUMP FINISHED"
}
function local_dump() {
# Collect troubleshooting data (disk usage, stat output, process table,
# per-worker thread dumps and open file descriptors) for every running
# cdp-logging-agent worker into /tmp/<dump_name>, then archive and upload it.
local timestamp_for_folder=$(date +"%Y%m%d-%H%M%S")
local hostname_short=$(hostname)
local dump_name="cdp-logging-dump_${hostname_short}-${timestamp_for_folder}"
local dump_folder="/tmp/${dump_name}"
local logging_agent_pids_resp=$(get_logging_agent_worker_pids)
if [[ "${logging_agent_pids_resp}" == "" ]]; then
info "No any logging agents are running. Skip worker dump."
else
if [[ ! -d "${dump_folder}" ]]; then
info "Creating dump folder: ${dump_folder}"
mkdir -p "${dump_folder}"
fi
local worker=0
info "Execute command du -d ..."
local disk_check_out=$(du -h /var/log/cdp-logging-agent/*)
info "Execute command: stat ..."
local access_check_out=$(stat /var/log/cdp-logging-agent/*)
info "Gather process details dump for workers..."
# Converts the RSS/VSZ columns of ps output to megabytes before filtering
# for worker ("under-supervisor") processes.
local proc_dump=$(ps aux | grep cdp-logging-agent | awk 'NR>1 {$5=int($5/1024)"M (RSS)"; $6=int($6/1024)"M (VSZ)";}{ print;}' | grep "under-supervisor")
echo "$disk_check_out" >> "${dump_folder}/du.txt"
echo "$access_check_out" >> "${dump_folder}/stat.txt"
echo "$proc_dump" >> "${dump_folder}/proc.txt"
for logging_agent_pid in $logging_agent_pids_resp; do
((worker=worker+1))
local thread_dump_file="/tmp/sigdump-${logging_agent_pid}.log"
info "Thread dump logging agent worker #${worker} to ${thread_dump_file}"
# SIGCONT presumably triggers the agent's embedded sigdump handler, which
# writes /tmp/sigdump-<pid>.log - TODO confirm against the agent build.
kill -CONT $logging_agent_pid
# NOTE(review): thread_dump_content is read but never used - it was likely
# meant to be logged between the START/END markers below. No delay is given
# for the dump file to be (re)written before it is read/copied - confirm.
local thread_dump_content=$(cat "$thread_dump_file")
info "START OF WORKER #${worker} (Pid: ${logging_agent_pid}) THREAD DUMP"
info "END OF WORKER #${worker} (Pid: ${logging_agent_pid}) THREAD DUMP"
local timestamp=$(date +"%Y%m%d-%H%M%S")
local thread_dump_log_file="${dump_folder}/worker-thread-dump-${worker}-${logging_agent_pid}-${timestamp}.txt"
info "Copying thread dump to $thread_dump_log_file"
cp -r $thread_dump_file "${dump_folder}/worker-thread-dump-${worker}-${logging_agent_pid}-${timestamp}.txt"
info "Collect addition file descriptors:"
local fd_output=$(ls -la /proc/$logging_agent_pid/fd)
info "Gather file descriptor data for #${worker} (Pid: ${logging_agent_pid})"
echo "${fd_output}" >> "${dump_folder}/fd-${worker}-${logging_agent_pid}.txt"
done
# NOTE(review): create_dump_file_and_upload calls do_exit, so this function
# does not return to its caller when workers were found.
create_dump_file_and_upload $dump_name
fi
}
function dump() {
  # Push the current script to every minion, trigger a local dump on each
  # of them via salt, then remove the distributed copy and exit.
  local dist_dir="/srv/salt/distribution"
  distribute_script "${dist_dir}" "'*'"
  run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'chmod 750 $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh dump-local' timeout=$SALT_CMD_TIMEOUT"
  cleanup_distribution_script "${dist_dir}"
  do_exit 0 "DISTRIBUTED DUMP FINISHED"
}
function get_logging_agent_worker_pids() {
# Print the PIDs of cdp-logging-agent worker ("under-supervisor") processes,
# one per line. The two-stage grep is deliberate: neither grep's own command
# line contains BOTH patterns, so the pipeline never matches itself in the
# ps snapshot (a single combined filter would).
ps aux | grep cdp-logging-agent | grep under-supervisor | awk '{print $2}'
}
function install_dist_crontab() {
  # Register the weekly self-distribution cron job (Sunday 02:00, as root)
  # unless the cron.d entry already exists.
  if [[ -f "/etc/cron.d/cdp_logging_agent_check_distribute" ]]; then
    debug "No need for updating cron.d with cdp_logging_agent_check_distribute script"
    return
  fi
  # Fix: log message said "cron.weekly" but the entry is written to cron.d;
  # also overwrite (>) rather than append for idempotence.
  log "Installing cdp_logging_agent_check_distribute to cron.d ..."
  cat <<EOF >/etc/cron.d/cdp_logging_agent_check_distribute
0 2 * * SUN root sh /opt/salt/scripts/cdp_logging_agent_check.sh distribute
EOF
  chmod 600 /etc/cron.d/cdp_logging_agent_check_distribute
}
function update_metering_conf() {
  # Ensure the metering fluentd config contains retry_forever and
  # retry_max_interval directives, inserting whichever are missing
  # (in-place via GNU sed -z so \n can anchor whole config lines).
  local file=${1:?"usage: <metering config file location>"}
  [[ -f "${file}" ]] || return 0
  if grep -q "retry_forever" "${file}"; then
    # retry_forever already present: add retry_max_interval after it if absent.
    grep -q "retry_max_interval" "${file}" && return 0
    sed -z 's/retry_forever true\n/retry_forever true\n retry_max_interval 1200\n/' -i "$file"
  else
    # Neither directive present: insert both after flush_at_shutdown.
    sed -z 's/flush_at_shutdown true\n/flush_at_shutdown true\n retry_forever true\n retry_max_interval 1200\n/' -i "$file"
  fi
}
function doctor() {
  # Health-check the cdp-logging-agent service and self-heal common metering
  # buffer problems: missing retry config, a buffer folder untouched for over
  # LIVENESS_THRESHOLD_SECONDS, or a buffer larger than BUFFER_LIMIT_BYTES.
  # Always exits the process via do_exit at the end.
  if [[ -d "/srv/salt/fluent" ]]; then
    debug "As this node is a salt-master, do additional operations on it."
    update_metering_conf "/srv/salt/fluent/template/databus_metering.conf.j2"
  fi
  local_metering_fluent_conf="/etc/cdp-logging-agent/databus_metering.conf"
  # A datahub node is identified by the metering heartbeats source path.
  is_datahub="false"
  if grep -q "/var/log/metering/heartbeats.json" "$local_metering_fluent_conf"; then
    is_datahub="true"
  fi
  # Capture exit statuses: 0 = installed / active respectively.
  local is_installed=$(rpm -q "cdp-logging-agent" 2>&1 >/dev/null; echo $?)
  local is_active=$(systemctl is-active --quiet cdp-logging-agent; echo $?)
  if [[ "$is_active" == "0" ]]; then
    debug "Service cdp-logging-agent is active."
    buffer_folder="/var/log/cdp-logging-agent/metering_databus"
    if [[ -d "$buffer_folder" && "$is_datahub" == "true" ]]; then
      # Fix: corrected typo in user-facing message (was "dathub").
      info "Do additional checks as this is a datahub node..."
      # NOTE(review): local_dump exits the process (via do_exit) when worker
      # pids are found, so the restarts below only run when no workers were
      # dumped - confirm this is the intended flow.
      if ! grep -q "retry_max_interval" "$local_metering_fluent_conf"; then
        update_metering_conf "${local_metering_fluent_conf}"
        local_dump
        systemctl restart cdp-logging-agent
        return
      fi
      # Liveness check: restart when the buffer folder has not been modified
      # within the threshold window.
      last_modified_date=`stat "$buffer_folder" | grep Modify | sed -r "s/Modify: (.*)/\1/"`;
      last_modified_timestamp=`date -d "$last_modified_date" +%s`;
      info "Metering buffer folder /var/log/cdp-logging-agent/metering_databus was accessed: $last_modified_date"
      if [ `date +%s` -gt `expr $last_modified_timestamp + $LIVENESS_THRESHOLD_SECONDS` ]; then
        info "Metering buffer folder was not accessed for long time. Restarting logging agent."
        local_dump
        systemctl restart cdp-logging-agent
        return
      fi
      info "Check if metering buffer folder is too large. (max: 10MB)"
      buffer_folder_size=$(du -bs "$buffer_folder" | cut -f1)
      local max_buffer_size=$(expr $BUFFER_LIMIT_BYTES + 0)
      local act_buffer_size=$(expr $buffer_folder_size + 0)
      if [[ $act_buffer_size -ge $max_buffer_size ]]; then
        info "Metering buffer size is too large: $act_buffer_size Bytes. Restarting logging agent."
        local_dump
        systemctl restart cdp-logging-agent
      else
        info "Metering buffer size: $act_buffer_size Bytes"
      fi
    fi
  elif [[ "$is_installed" == "0" ]]; then
    info "Service cdp-logging-agent is installed, but not active. Check if it can be started."
    if [[ -f "/etc/cdp-logging-agent/cdp-logging-agent_simple_profile.conf" ]]; then
      info "Service cdp-logging-agent already configured by salt and not running, so starting it"
      systemctl start cdp-logging-agent
    fi
  fi
  do_exit 0 "LOGGING AGENT DOCTOR OPERATION FINISHED"
}
function distribute_script() {
  # Copy this script into the salt file server root, then push it from there
  # to $MINION_SCRIPTS_FOLDER on the targeted minions via cp.get_file.
  # Fix: quoted path expansions (previously unquoted in mkdir/cp).
  local distribution_folder=${1:?"distribution_folder: <distribution_folder>"}
  local target=${2:?"usage: <target>"}
  log "Creating $distribution_folder if does not exist."
  mkdir -p "$distribution_folder"
  log "Copying $SCRIPT_LOCATION into $distribution_folder."
  cp -r "$SCRIPT_LOCATION" "$distribution_folder/"
  run_command "$SALT_BIN_PREFIX/salt $target cp.get_file salt:///distribution/cdp_logging_agent_check.sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh"
}
function cleanup_distribution_script() {
  # Remove the distributed script copy from the salt file server folder;
  # no-op when the folder does not exist.
  local folder="$1"
  if [[ -d "$folder" ]]; then
    rm -rf "$folder/cdp_logging_agent_check.sh"
  fi
}
function distribute() {
# Salt-master-only operation: query every minion's installed script version,
# classify minions (not responding / up to date / needs install), then push
# this script to outdated minions and run its install subcommand there.
install_dist_crontab
local distribution_folder="/srv/salt/distribution"
local is_salt_master_active=$(systemctl is-active --quiet salt-master; echo $?)
if [[ "$is_salt_master_active" != "0" ]]; then
debug "Salt master is not active in this node. Skipping distribution"
do_exit 0 "DISTRIBUTION SKIPPED"
fi
# Collect each minion's reported script version ("-1" when files missing)
# as JSON into $WORKING_DIR (set by run_operation before this is called).
run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'test -f /etc/cron.d/cdp_logging_agent_doctor && test -f $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh version || echo -1' --out-indent=-1 --out=json --out-file=$WORKING_DIR/cdp-logging-agent-versions.json timeout=$SALT_CMD_TIMEOUT"
rm -rf $WORKING_DIR/logging_check_install.txt
rm -rf $WORKING_DIR/logging_check_not_responding.txt
rm -rf $WORKING_DIR/logging_check_do_nothing.txt
# base64-encode each "minion:version" pair so the loop survives values that
# contain whitespace; decode inside the loop body.
for row in $(cat "$WORKING_DIR/cdp-logging-agent-versions.json" | jq -r 'to_entries | . []|=.key+":"+.value | @base64'); do
local decoded_row=$(echo "${row}" | base64 -d | jq .[] | tr -d '"')
local salt_minion_node=$(echo "${decoded_row}" | cut -d ':' -f1)
local version_value=$(echo "${decoded_row}" | cut -d ':' -f2)
log "Found node/logging-agent check script version pair: ${salt_minion_node} - ${version_value}"
if [[ "${version_value}" == *"Minion did not return"* ]];then
log "Salt minion with name '${salt_minion_node}' is not responding."
echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_not_responding.txt
elif [[ "${version_value}" -ge "${VERSION}" ]]; then
log "Salt minion with name '${salt_minion_node}' is up to date (version: ${version_value}, expected: ${VERSION})."
echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_do_nothing.txt
elif [[ "${version_value}" != "" ]]; then
log "Salt minion with name '${salt_minion_node}' needs cdp-logging-agent-check script installation (version: ${version_value}, expected: ${VERSION})."
echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_install.txt
fi
done
if [[ -s $WORKING_DIR/logging_check_not_responding.txt ]]; then
local not_responding_hosts=$(cat $WORKING_DIR/logging_check_not_responding.txt | tr '\n' ',' | sed 's/.$//')
log "Warning: the following hosts are not responding: $not_responding_hosts"
fi
if [[ ! -s $WORKING_DIR/logging_check_install.txt ]]; then
log "Install/upgrade file $WORKING_DIR/logging_check_install.txt is empty."
do_exit 0 "No need for performing upgrade on minions."
fi
# Build a comma-separated salt -L target list from the install file.
local install_targets=$(cat "$WORKING_DIR/logging_check_install.txt" | paste -sd "," -)
if [[ "$install_targets" != "" ]]; then
log "Targets are not empty for cdp-logging-agent checks."
distribute_script "${distribution_folder}" "-L $install_targets"
#run_command "$SALT_BIN_PREFIX/salt -L $install_targets cp.get_file salt:///distribution/cdp_logging_agent_check.sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh"
run_command "$SALT_BIN_PREFIX/salt -L $install_targets cmd.run 'chmod 750 $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh install' timeout=$SALT_CMD_TIMEOUT"
else
log "Targets are empty for cdp-logging-agent checks. Skip running any salt operations on them."
fi
cleanup_distribution_script $distribution_folder
do_exit 0 "DISTRIBUTION FINISHED"
}
function test_doctor() {
# Run the doctor subcommand on all minions via salt; used from the master to
# verify the distributed script end-to-end. Exits the process when done.
run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh doctor'"
do_exit 0 "TEST DOCTOR COMMAND FINISHED"
}
function run_operation() {
  # Dispatch a single subcommand. "version" short-circuits before any
  # logging/salt initialization so it stays cheap and side-effect free
  # (it is invoked remotely on every minion during distribution).
  local operation=${1:?"usage: <operation>"}
  if [[ "${operation}" == "version" ]]; then
    echo "${VERSION}"
    return
  fi
  init_logfile
  init_salt_prefix
  # Default / create the working directory used by the distribute flow.
  if [[ "$WORKING_DIR" == "" ]]; then
    WORKING_DIR="/tmp"
  elif [[ ! -d "$WORKING_DIR" ]]; then
    log "Working directory does not exists. Creating it..."
    mkdir -p "$WORKING_DIR"
  fi
  case "${operation}" in
    distribute) distribute ;;
    install)    install ;;
    doctor)     doctor ;;
    dump)       dump ;;
    dump-local) local_dump ;;
    test)       test_doctor ;;
  esac
}
function main() {
  # Entry point: validate the subcommand and hand it to run_operation;
  # unknown commands print the list of available ones.
  command="$1"
  case $command in
    distribute|doctor|install|version|dump|dump-local|test)
      run_operation "$command"
      ;;
    "help")
      print_help
      ;;
    *)
      echo "Available commands: (distribute | doctor | dump | dump-local | install | version | test | help)"
      ;;
  esac
}
# Invoke main with all CLI arguments. ("$@" alone is safe in bash; the old
# ${1+"$@"} form only guarded against ancient shells expanding "$@" to "".)
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment