Last active
December 8, 2021 13:54
-
-
Save oleewere/9948250bf3ef4e87bfd89092f6334ebc to your computer and use it in GitHub Desktop.
cop_logging_agent_check.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# cdp_logging_agent_check.sh - health checks and maintenance helpers for the
# cdp-logging-agent service (distributed via salt, run locally via cron).
# Fix: the script uses bashisms ([[ ]], the `function` keyword), so the
# shebang must be bash, not sh.

# Folder holding per-run doctor logs; overridable from the environment.
: "${LOGFILE_FOLDER:="/var/log/cdp-telemetry-logging-doctor"}"
# Location where salt installs this script on minion nodes.
MINION_SCRIPTS_FOLDER="/opt/salt/scripts"
# Timeout (seconds) passed to distributed salt cmd.run calls.
SALT_CMD_TIMEOUT=120
LIVENESS_THRESHOLD_SECONDS=86400 # 1 day in seconds
BUFFER_LIMIT_BYTES=10485760      # 10 MB in bytes
# increase this to make sure to override the script in case of diagnostics
# todo: find better solution - pass by cli option?
VERSION=1
readlinkf(){ | |
perl -MCwd -e 'print Cwd::abs_path shift' "$1"; | |
} | |
# Resolve the absolute path of this script: GNU `readlink -f` exists on
# Linux; elsewhere fall back to the perl-based readlinkf helper above.
case "$(uname -s)" in
  'Linux') SCRIPT_LOCATION=$(readlink -f "$0") ;;
  *)       SCRIPT_LOCATION=$(readlinkf "$0") ;;
esac
# Print usage and the list of supported sub-commands to stdout.
print_help() {
  cat <<'EOF'
Usage: [<command>]
   commands:
     distribute         distribute the script itself on minion nodes (by salt)
     install            create required crontabs on minion nodes (locally)
     doctor             run cdp-logging-agent checks
     dump               distributed call that dump details about worker processes for troubleshooting
     dump-local         dump details about local worker processes for troubleshooting
     version            print version of the script
     test               test doctor commands (from master on minions)
     help               print usage
EOF
}
# Log the exit code (plus an optional status message) and terminate the
# whole script with that code.
#   $1 - numeric exit code
#   $2 - optional status message to embed in the log line
function do_exit() {
  local code=$1
  local message=$2
  if [[ -n "$message" ]]; then
    info "Exit code: $code, --- STATUS MESSAGE --- $message --- STATUS MESSAGE ---"
  else
    info "Exit code: $code"
  fi
  exit $code
}
# Create the run-specific log file (owner-only, mode 600) under
# $LOGFILE_FOLDER, prune old run logs, and publish its path in the global
# LOGFILE variable used by log()/info()/debug().
# Fix: quote all path expansions so folder paths cannot word-split or glob.
function init_logfile() {
  local timestamp
  mkdir -p "$LOGFILE_FOLDER"
  timestamp=$(date +"%Y%m%d-%H%M%S")
  LOGFILE="$LOGFILE_FOLDER/cdp-logging-agent-doctor-${timestamp}.log"
  touch "$LOGFILE"
  chmod 600 "$LOGFILE"  # logs may contain host details; keep them private
  cleanup_old_logs
  info "The following log file will be used: $LOGFILE"
}
# Detect the versioned salt install under /opt (salt_<version>/bin) and store
# it in the global SALT_BIN_PREFIX; fall back to a glob when none is found.
function init_salt_prefix() {
  local detected
  detected=$(find /opt -maxdepth 1 -type d -iname "salt_*" | xargs -I{} echo "{}/bin")
  if [[ -n "$detected" ]]; then
    SALT_BIN_PREFIX="$detected"
  else
    SALT_BIN_PREFIX="/opt/salt_*/bin"
  fi
}
# Keep only the 5 most recent doctor log files in $LOGFILE_FOLDER; older
# files are removed (oldest-first order via ls -1tr, drop all but last 5).
# Fix: quote $LOGFILE_FOLDER so a path with spaces cannot word-split; the
# filename glob itself must stay unquoted so it expands.
function cleanup_old_logs() {
  ls -1tr "$LOGFILE_FOLDER"/cdp-logging-agent-doctor*.log | head -n -5 | xargs --no-run-if-empty rm
}
# Log a message to $LOGFILE and echo it to stdout (see log()).
function info() {
  log "$1"
}
# Log a message to $LOGFILE only; the non-empty second argument makes log()
# skip the stdout echo.
function debug() {
  log "$1" "true"
}
# Append a timestamped message to $LOGFILE and echo it to stdout unless a
# second ("debug") argument is supplied.
#   $1 - message text
#   $2 - when non-empty, suppress the stdout echo (file-only logging)
# Fixes: the `debug` local was assigned but the raw $2 was tested instead;
# $LOGFILE is now quoted; declaration split from command substitution.
function log() {
  local timestamp
  local debug=$2
  timestamp=$(date +"%Y-%m-%dT%H:%M:%S.%3N%z")
  echo "$timestamp $1" >> "$LOGFILE"
  if [[ -z "$debug" ]]; then
    echo "$1"
  fi
}
# Evaluate a command string, appending both its stdout and stderr to
# $LOGFILE; the command itself is recorded (file-only) beforehand.
# NOTE: uses eval, so the argument is fully shell-interpreted — callers must
# only pass trusted, internally-built command strings, never external input.
function run_command() {
  local cmd=${1:?"usage: <command>"}
  debug "The following command will be executed: $1"
  eval $1 >> $LOGFILE 2>&1
}
# Install a daily (01:00) root cron entry that runs the doctor check on this
# node, restrict the cron file to root (600), and exit the script.
# Note: the heredoc appends (>>), so repeated installs would duplicate the
# entry — callers gate this via the distribute version check.
function install() {
  log "Installing cdp_logging_agent_doctor to cron.d ..."
  cat <<EOF >>/etc/cron.d/cdp_logging_agent_doctor
0 1 * * * root sh /opt/salt/scripts/cdp_logging_agent_check.sh doctor
EOF
  chmod 600 /etc/cron.d/cdp_logging_agent_doctor
  do_exit 0 "INSTALLATION FINISHED"
}
# Extract a fluentd config directive value from a config string: take the
# first line matching the key, strip leading whitespace, and return the
# second space-separated field.
#   $1 - full config file content
#   $2 - directive name (used as a grep pattern)
# Fix: quote the grep pattern so keys cannot word-split or glob-expand.
function get_fluentd_config_val() {
  local config_content=${1:?"usage: <config_content>"}
  local config_key=${2:?"usage: <config_key>"}
  echo "$config_content" | grep "$config_key" | head -n 1 | sed -e 's/^[[:space:]]*//' | cut -f2 -d' '
}
# Strip surrounding double quotes from a cloud-storage path pattern and
# return everything before the first "/%Y" date placeholder.
#   $1 - full (possibly quoted) storage path pattern
function get_base_path() {
  local full_ccloud_storage_path=${1:?"usage: <full_ccloud_storage_path>"}
  local unquoted
  unquoted=$(echo "$full_ccloud_storage_path" | tr -d '"')
  echo "$unquoted" | awk -F'/%Y' '{print $1}'
}
# Upload a dump archive to whichever cloud storage target the logging agent
# ships logs to (S3, ABFS/ADLS Gen2 or GCS), detected from the fluentd
# output config at /etc/cdp-logging-agent/output.conf. No-op when the config
# file is missing or no known output plugin is configured.
#   $1 - path of the dump file to upload
function upload_to_cloud_storage() {
  local dump_file_name=${1:?"usage: <dump_file_name>"}
  local dump_base_filename=$(basename -- "$dump_file_name")
  if [[ ! -f /etc/cdp-logging-agent/output.conf ]]; then
    info "Logging agent config does not exist. Skip uploading data to cloud storage."
    return
  fi
  # Dumps land under <base path>/dumps/<YYYY-MM-DD>/ next to the shipped logs.
  local timestamp_dump=$(date +"%Y-%m-%d")
  local dump_suffix="dumps/${timestamp_dump}"
  local fluent_out_config=$(cat /etc/cdp-logging-agent/output.conf)
  # Probe which fluentd output plugin is configured (non-empty = present).
  local s3_type=$(echo "$fluent_out_config" | grep "@type s3")
  local azure_type=$(echo "$fluent_out_config" | grep "@type azurestorage_gen2")
  local gcs_type=$(echo "$fluent_out_config" | grep "@type gcs")
  if [[ "$s3_type" != "" ]]; then
    info "Uploading dump to s3 ..."
    local s3_bucket=$(get_fluentd_config_val "$fluent_out_config" "s3_bucket")
    # The path-pattern line is located by its date placeholder, not a key name.
    local s3_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
    local s3_base_path=$(get_base_path "$s3_path_pattern")
    local region=$(get_fluentd_config_val "$fluent_out_config" "region")
    info "Detected S3 configs: Bucket=$s3_bucket, BasePath=$s3_base_path, Region=$region"
    local target_location="$s3_base_path/$dump_suffix/"
    local additional_params=""
    if [[ "$region" != "" ]]; then
      additional_params="--region $region"
    fi
    # First attempt uses server-side encryption (-e) ...
    cdp-telemetry storage s3 upload -e --bucket "$s3_bucket" --file "$dump_file_name" $additional_params --location "$target_location"
    local s3_upload_result="$?"
    if [[ "$s3_upload_result" == "0" ]]; then
      info "S3 upload COMPLETED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
    else
      # ... then retry once without encryption before reporting failure.
      info "S3 upload failed with encryption, try without that parameter."
      cdp-telemetry storage s3 upload --bucket "$s3_bucket" --file "$dump_file_name" $additional_params --location "$target_location"
      local s3_upload_second_result="$?"
      if [[ "$s3_upload_second_result" == "0" ]]; then
        info "S3 upload COMPLETED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
      else
        info "S3 upload FAILED: Bucket=$s3_bucket, Path=${target_location}${dump_base_filename}"
      fi
    fi
  elif [[ "$azure_type" != "" ]]; then
    local azure_storage_account=$(get_fluentd_config_val "$fluent_out_config" "azure_storage_account")
    local azure_container=$(get_fluentd_config_val "$fluent_out_config" "azure_container")
    local azure_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
    local azure_base_path=$(get_base_path "$azure_path_pattern")
    local target_location="$azure_base_path/$dump_suffix/"
    info "Detected ABFS configs: Account=$azure_storage_account, Container=$azure_container, BasePath=$azure_base_path"
    # NOTE(review): unlike the s3/gcs branches there is no "upload"
    # subcommand here — confirm against the cdp-telemetry abfs CLI syntax.
    cdp-telemetry storage abfs --file "$dump_file_name" --location "${target_location}" --account "${azure_storage_account}" --container "${azure_container}"
    local abfs_upload_result="$?"
    if [[ "$abfs_upload_result" == "0" ]]; then
      info "ABFS upload COMPLETED: Account=$azure_storage_account, Container=$azure_container, Path=${target_location}${dump_base_filename}"
    else
      info "ABFS upload FAILED: Account=$azure_storage_account, Container=$azure_container, Path=${target_location}${dump_base_filename}"
    fi
  elif [[ "$gcs_type" != "" ]]; then
    local gcs_bucket=$(get_fluentd_config_val "$fluent_out_config" "bucket")
    local gcs_path_pattern=$(get_fluentd_config_val "$fluent_out_config" "%Y-%m-%d")
    local gcs_base_path=$(get_base_path "$gcs_path_pattern")
    local target_location="$gcs_base_path/$dump_suffix/"
    info "Detected GCS configs: Bucket=$gcs_bucket, BasePath=${gcs_base_path}"
    cdp-telemetry storage gcs upload --bucket "$gcs_bucket" --file "$dump_file_name" --location "${target_location}"
    local gcs_upload_result="$?"
    if [[ "$gcs_upload_result" == "0" ]]; then
      info "GCS upload COMPLETED: Bucket=$gcs_bucket, Path=${target_location}${dump_base_filename}"
    else
      info "GCS upload FAILED: Bucket=$gcs_bucket, Path=${target_location}${dump_base_filename}"
    fi
  else
    info "No configured cloud storage log shipping is detected. Skip uploading dump."
  fi
}
# Compress the dump folder /tmp/<dump_name> into a tar.gz (removing the
# folder), prune old dump archives (keep the 3 newest), upload the archive
# to cloud storage, then exit the script.
#   $1 - dump folder name under /tmp
function create_dump_file_and_upload() {
  local dump_name=${1:?"usage: <dump_name>"}
  info "Compress /tmp/${dump_name}.tar.gz file ..."
  ( cd /tmp && tar -czvf "${dump_name}.tar.gz" "${dump_name}" && rm -r "${dump_name}" )
  info "Compression complete. Cleanup old dump collections ..."
  ls -1tr /tmp/cdp-logging-dump*.tar.gz | head -n -3 | xargs --no-run-if-empty rm
  upload_to_cloud_storage "/tmp/${dump_name}.tar.gz"
  # Fix: status message typo "FINSHED" -> "FINISHED".
  do_exit 0 "LOCAL DUMP FINISHED"
}
# Collect troubleshooting data for every running logging-agent worker into
# /tmp/<dump_name>: disk usage, file stats, process list, per-worker thread
# dumps (triggered via SIGCONT) and open file descriptors; then archive and
# upload the folder. No-op when no workers are running.
function local_dump() {
  local timestamp_for_folder=$(date +"%Y%m%d-%H%M%S")
  local hostname_short=$(hostname)
  local dump_name="cdp-logging-dump_${hostname_short}-${timestamp_for_folder}"
  local dump_folder="/tmp/${dump_name}"
  local logging_agent_pids_resp=$(get_logging_agent_worker_pids)
  if [[ "${logging_agent_pids_resp}" == "" ]]; then
    info "No any logging agents are running. Skip worker dump."
  else
    if [[ ! -d "${dump_folder}" ]]; then
      info "Creating dump folder: ${dump_folder}"
      mkdir -p "${dump_folder}"
    fi
    local worker=0
    info "Execute command du -d ..."
    local disk_check_out=$(du -h /var/log/cdp-logging-agent/*)
    info "Execute command: stat ..."
    local access_check_out=$(stat /var/log/cdp-logging-agent/*)
    info "Gather process details dump for workers..."
    # Humanize the RSS/VSZ columns of matching worker processes.
    local proc_dump=$(ps aux | grep cdp-logging-agent | awk 'NR>1 {$5=int($5/1024)"M (RSS)"; $6=int($6/1024)"M (VSZ)";}{ print;}' | grep "under-supervisor")
    echo "$disk_check_out" >> "${dump_folder}/du.txt"
    echo "$access_check_out" >> "${dump_folder}/stat.txt"
    echo "$proc_dump" >> "${dump_folder}/proc.txt"
    for logging_agent_pid in $logging_agent_pids_resp; do
      ((worker=worker+1))
      local thread_dump_file="/tmp/sigdump-${logging_agent_pid}.log"
      info "Thread dump logging agent worker #${worker} to ${thread_dump_file}"
      # SIGCONT triggers the agent's sigdump handler writing to
      # $thread_dump_file. NOTE(review): the file is read immediately after
      # signalling — presumably the handler writes synchronously; confirm,
      # otherwise the dump may be read before it is complete.
      kill -CONT "$logging_agent_pid"
      local thread_dump_content=$(cat "$thread_dump_file")
      info "START OF WORKER #${worker} (Pid: ${logging_agent_pid}) THREAD DUMP"
      # Fix: the captured thread dump was never logged, so the START/END
      # markers bracketed nothing.
      info "$thread_dump_content"
      info "END OF WORKER #${worker} (Pid: ${logging_agent_pid}) THREAD DUMP"
      local timestamp=$(date +"%Y%m%d-%H%M%S")
      local thread_dump_log_file="${dump_folder}/worker-thread-dump-${worker}-${logging_agent_pid}-${timestamp}.txt"
      info "Copying thread dump to $thread_dump_log_file"
      # Fix: reuse the computed target path instead of rebuilding it inline.
      cp -r "$thread_dump_file" "$thread_dump_log_file"
      info "Collect addition file descriptors:"
      local fd_output=$(ls -la "/proc/$logging_agent_pid/fd")
      info "Gather file descriptor data for #${worker} (Pid: ${logging_agent_pid})"
      echo "${fd_output}" >> "${dump_folder}/fd-${worker}-${logging_agent_pid}.txt"
    done
    create_dump_file_and_upload "$dump_name"
  fi
}
# Distributed dump: push this script to all minions via salt, run
# "dump-local" on each of them, then remove the distributed copy and exit.
function dump() {
  local distribution_folder="/srv/salt/distribution"
  distribute_script "${distribution_folder}" "'*'"
  run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'chmod 750 $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh dump-local' timeout=$SALT_CMD_TIMEOUT"
  cleanup_distribution_script $distribution_folder
  do_exit 0 "DISTRIBUTED DUMP FINISHED"
}
# Print the PIDs of running cdp-logging-agent worker processes, one per line
# (workers run "under-supervisor"); prints nothing when none are running.
# The two separate greps keep the pipeline's own grep processes out of the
# result set (each grep's command line matches only one of the patterns).
function get_logging_agent_worker_pids() {
  ps aux | grep "under-supervisor" | grep "cdp-logging-agent" | awk '{print $2}'
}
# Install a weekly (Sunday 02:00) root cron entry that re-runs distribution
# from the master; idempotent — skipped when the cron file already exists.
# NOTE(review): the log message mentions "cron.weekly" but the entry is
# written to /etc/cron.d with a weekly schedule — confirm wording intended.
function install_dist_crontab() {
  if [[ -f "/etc/cron.d/cdp_logging_agent_check_distribute" ]]; then
    debug "No need for updating cron.d with cdp_logging_agent_check_distribute script"
    return
  fi
  log "Installing cdp_logging_agent_check_distribute to cron.weekly ..."
  cat <<EOF >>/etc/cron.d/cdp_logging_agent_check_distribute
0 2 * * SUN root sh /opt/salt/scripts/cdp_logging_agent_check.sh distribute
EOF
  chmod 600 /etc/cron.d/cdp_logging_agent_check_distribute
}
# Ensure the metering fluentd config contains buffer retry settings:
# inserts "retry_forever true" + "retry_max_interval 1200" after
# "flush_at_shutdown true" when both are missing, or only the max-interval
# line when retry_forever is already present. No-op for a missing file.
#   $1 - metering config file location
# Fixes: quote the sed -i target file; flatten the nested if into elif.
function update_metering_conf() {
  local file=${1:?"usage: <metering config file location>"}
  if [[ ! -f "${file}" ]]; then
    return
  fi
  # sed -z treats the file as one NUL-separated record so \n can be matched.
  if ! grep -q "retry_forever" "${file}"; then
    sed -z 's/flush_at_shutdown true\n/flush_at_shutdown true\n retry_forever true\n retry_max_interval 1200\n/' -i "${file}"
  elif ! grep -q "retry_max_interval" "${file}"; then
    sed -z 's/retry_forever true\n/retry_forever true\n retry_max_interval 1200\n/' -i "${file}"
  fi
}
# Health-check cdp-logging-agent and self-heal where possible:
#  - on a salt master (detected via /srv/salt/fluent), patch the metering
#    config template so newly-rendered configs get retry settings
#  - on an active datahub node, restart the agent (with a dump first) when
#    retry settings are missing, when the metering buffer folder looks stuck
#    (unmodified for > LIVENESS_THRESHOLD_SECONDS), or when it exceeds
#    BUFFER_LIMIT_BYTES
#  - start the agent when it is installed and configured but inactive.
# Always exits the script via do_exit.
function doctor() {
  if [[ -d "/srv/salt/fluent" ]]; then
    debug "As this node is a salt-master, do additional operations on it."
    update_metering_conf "/srv/salt/fluent/template/databus_metering.conf.j2"
  fi
  local_metering_fluent_conf="/etc/cdp-logging-agent/databus_metering.conf"
  # Datahub nodes ship metering heartbeats from this JSON file.
  is_datahub="false"
  if grep -q "/var/log/metering/heartbeats.json" "$local_metering_fluent_conf"; then
    is_datahub="true"
  fi
  # Capture exit codes only; "2>&1 >/dev/null" keeps rpm's stderr out of the
  # captured value while discarding its stdout.
  local is_installed=$(rpm -q "cdp-logging-agent" 2>&1 >/dev/null; echo $?)
  local is_active=$(systemctl is-active --quiet cdp-logging-agent; echo $?)
  if [[ "$is_active" == "0" ]]; then
    debug "Service cdp-logging-agent is active."
    buffer_folder="/var/log/cdp-logging-agent/metering_databus"
    if [[ -d "$buffer_folder" && "$is_datahub" == "true" ]]; then
      info "Do additional checks as this is a dathub node..."
      # Missing retry settings: patch config, dump state, restart to apply.
      if ! grep -q "retry_max_interval" "$local_metering_fluent_conf"; then
        update_metering_conf "${local_metering_fluent_conf}"
        local_dump
        systemctl restart cdp-logging-agent
        return
      fi
      # Liveness: restart when the buffer folder's mtime is older than the
      # threshold (1 day) — the agent has likely stopped flushing.
      last_modified_date=`stat "$buffer_folder" | grep Modify | sed -r "s/Modify: (.*)/\1/"`;
      last_modified_timestamp=`date -d "$last_modified_date" +%s`;
      info "Metering buffer folder /var/log/cdp-logging-agent/metering_databus was accessed: $last_modified_date"
      if [ `date +%s` -gt `expr $last_modified_timestamp + $LIVENESS_THRESHOLD_SECONDS` ]; then
        info "Metering buffer folder was not accessed for long time. Restarting logging agent."
        local_dump
        systemctl restart cdp-logging-agent
        return
      fi
      # Size: restart when the buffer exceeds BUFFER_LIMIT_BYTES (10 MB).
      info "Check if metering buffer folder is too large. (max: 10MB)"
      buffer_folder_size=$(du -bs "$buffer_folder" | cut -f1)
      local max_buffer_size=$(expr $BUFFER_LIMIT_BYTES + 0)
      local act_buffer_size=$(expr $buffer_folder_size + 0)
      if [[ $act_buffer_size -ge $max_buffer_size ]]; then
        info "Metering buffer size is too large: $act_buffer_size Bytes. Restarting logging agent."
        local_dump
        systemctl restart cdp-logging-agent
      else
        info "Metering buffer size: $act_buffer_size Bytes"
      fi
    fi
  elif [[ "$is_installed" == "0" ]]; then
    info "Service cdp-logging-agent is installed, but not active. Check if it can be started."
    if [[ -f "/etc/cdp-logging-agent/cdp-logging-agent_simple_profile.conf" ]]; then
      info "Service cdp-logging-agent already configured by salt and not running, so starting it"
      systemctl start cdp-logging-agent
    fi
  fi
  do_exit 0 "LOGGING AGENT DOCTOR OPERATION FINISHED"
}
# Copy this script into the salt file-server distribution folder and push it
# to the target minions with cp.get_file.
#   $1 - distribution folder on the master (served by the salt file server)
#   $2 - salt target expression (e.g. "'*'" or "-L host1,host2")
function distribute_script() {
  local distribution_folder=${1:?"distribution_folder: <distribution_folder>"}
  local target=${2:?"usage: <target>"}
  log "Creating $distribution_folder if does not exist."
  mkdir -p $distribution_folder
  log "Copying $SCRIPT_LOCATION into $distribution_folder."
  cp -r $SCRIPT_LOCATION $distribution_folder/
  run_command "$SALT_BIN_PREFIX/salt $target cp.get_file salt:///distribution/cdp_logging_agent_check.sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh"
}
# Remove the distributed copy of this script from the salt distribution
# folder; silently does nothing when the folder does not exist.
#   $1 - distribution folder path
function cleanup_distribution_script() {
  local distribution_folder=$1
  [[ -d "$distribution_folder" ]] || return 0
  rm -rf $distribution_folder/cdp_logging_agent_check.sh
}
# Master-side distribution: query the installed check-script version on every
# minion, then push and install the script on minions that are missing it or
# run an older VERSION. Exits early when this node is not an active
# salt master. Results are bucketed into three files under $WORKING_DIR:
# install / not_responding / do_nothing.
function distribute() {
  install_dist_crontab
  local distribution_folder="/srv/salt/distribution"
  local is_salt_master_active=$(systemctl is-active --quiet salt-master; echo $?)
  if [[ "$is_salt_master_active" != "0" ]]; then
    debug "Salt master is not active in this node. Skipping distribution"
    do_exit 0 "DISTRIBUTION SKIPPED"
  fi
  # Collect per-minion script versions as JSON; minions without the script
  # report "-1".
  run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'test -f /etc/cron.d/cdp_logging_agent_doctor && test -f $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh version || echo -1' --out-indent=-1 --out=json --out-file=$WORKING_DIR/cdp-logging-agent-versions.json timeout=$SALT_CMD_TIMEOUT"
  rm -rf $WORKING_DIR/logging_check_install.txt
  rm -rf $WORKING_DIR/logging_check_not_responding.txt
  rm -rf $WORKING_DIR/logging_check_do_nothing.txt
  # base64-encode each "<minion>:<version>" entry so the for-loop's
  # word-splitting cannot break on whitespace in salt output; decode per row.
  for row in $(cat "$WORKING_DIR/cdp-logging-agent-versions.json" | jq -r 'to_entries | . []|=.key+":"+.value | @base64'); do
    local decoded_row=$(echo "${row}" | base64 -d | jq .[] | tr -d '"')
    local salt_minion_node=$(echo "${decoded_row}" | cut -d ':' -f1)
    local version_value=$(echo "${decoded_row}" | cut -d ':' -f2)
    log "Found node/logging-agent check script version pair: ${salt_minion_node} - ${version_value}"
    if [[ "${version_value}" == *"Minion did not return"* ]];then
      log "Salt minion with name '${salt_minion_node}' is not responding."
      echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_not_responding.txt
    elif [[ "${version_value}" -ge "${VERSION}" ]]; then
      log "Salt minion with name '${salt_minion_node}' is up to date (version: ${version_value}, expected: ${VERSION})."
      echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_do_nothing.txt
    elif [[ "${version_value}" != "" ]]; then
      log "Salt minion with name '${salt_minion_node}' needs cdp-logging-agent-check script installation (version: ${version_value}, expected: ${VERSION})."
      echo "${salt_minion_node}" >> $WORKING_DIR/logging_check_install.txt
    fi
  done
  if [[ -s $WORKING_DIR/logging_check_not_responding.txt ]]; then
    local not_responding_hosts=$(cat $WORKING_DIR/logging_check_not_responding.txt | tr '\n' ',' | sed 's/.$//')
    log "Warning: the following hosts are not responding: $not_responding_hosts"
  fi
  if [[ ! -s $WORKING_DIR/logging_check_install.txt ]]; then
    log "Install/upgrade file $WORKING_DIR/logging_check_install.txt is empty."
    do_exit 0 "No need for performing upgrade on minions."
  fi
  # Join the pending hosts into a comma-separated salt -L target list.
  local install_targets=$(cat "$WORKING_DIR/logging_check_install.txt" | paste -sd "," -)
  if [[ "$install_targets" != "" ]]; then
    log "Targets are not empty for cdp-logging-agent checks."
    distribute_script "${distribution_folder}" "-L $install_targets"
    #run_command "$SALT_BIN_PREFIX/salt -L $install_targets cp.get_file salt:///distribution/cdp_logging_agent_check.sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh"
    run_command "$SALT_BIN_PREFIX/salt -L $install_targets cmd.run 'chmod 750 $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh && $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh install' timeout=$SALT_CMD_TIMEOUT"
  else
    log "Targets are empty for cdp-logging-agent checks. Skip running any salt operations on them."
  fi
  cleanup_distribution_script $distribution_folder
  do_exit 0 "DISTRIBUTION FINISHED"
}
# Smoke test from the master: run the doctor command on every minion via salt.
function test_doctor() {
  run_command "$SALT_BIN_PREFIX/salt '*' cmd.run 'sh $MINION_SCRIPTS_FOLDER/cdp_logging_agent_check.sh doctor'"
  do_exit 0 "TEST DOCTOR COMMAND FINISHED"
}
# Dispatch a sub-command. "version" short-circuits before any logging or
# salt setup so it stays cheap for the distribute version probe; every other
# command first initializes the log file, salt prefix and working directory.
function run_operation() {
  local operation=${1:?"usage: <operation>"}
  if [[ "${operation}" == "version" ]]; then
    echo "${VERSION}"
    return
  fi
  init_logfile
  init_salt_prefix
  if [[ "$WORKING_DIR" == "" ]]; then
    WORKING_DIR="/tmp"
  elif [[ ! -d "$WORKING_DIR" ]]; then
    log "Working directory does not exists. Creating it..."
    mkdir -p "$WORKING_DIR"
  fi
  case "${operation}" in
    "distribute") distribute ;;
    "install")    install ;;
    "doctor")     doctor ;;
    "dump")       dump ;;
    "dump-local") local_dump ;;
    "test")       test_doctor ;;
  esac
}
# Entry point: forward recognized sub-commands to run_operation, show help on
# "help", and list the available commands for anything else.
function main() {
  local command="$1"
  case "$command" in
    "distribute"|"doctor"|"install"|"version"|"dump"|"dump-local"|"test")
      run_operation "$command"
      ;;
    "help")
      print_help
      ;;
    *)
      echo "Available commands: (distribute | doctor | dump | dump-local | install | version | test | help)"
      ;;
  esac
}
main ${1+"$@"} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment