一、mysqld服务监控
一、mysqld服务监控
监控项:mysqld进程、3306监听端口、是否能创建连接(异常情况邮件报警)
机制说明:定时任务+脚本,20秒探测一次mysqld进程
1.1)定时任务:
- * * * /bin/bash /data/software/script/cron_check.sh #每分钟执行一次cron_check.sh脚本
1.2)脚本:cron_check.sh #1分钟内执行三次check_con.sh脚本,间隔时间20秒(实现20秒执行一次的功能)
- cat /data/software/script/cron_check.sh
#!/bin/bash
for ((i=1; i<=3; i++));do
/data/software/script/check_rep.sh >> /data/software/script/check_rep.log 2>&1 #主从复制延迟监控脚本
/data/software/script/check_con.sh >> /data/software/script/check_con.log 2>&1 #mysqld附加监控脚本
sleep 20
done
1.3)脚本:check_con.sh #探测mysqld服务
- cat /data/software/script/check_con.sh
#!/bin/bash
BIN_PATH='/usr/local/mysql/bin'
HOST='localhost'
PORT=3306
USER='orzuser'
PASSWD='XXXXXX'
IP=`/sbin/ifconfig|sed -n '/inet addr/s/^[^:]*:\([0-9.]\{7,15\}\) .*/\1/p'|grep -v '127.0.0.1'|sed -n 1p` #取本地IP地址
mail_list=xiaochou13f@126.com #收件人邮箱
DT=`date +"%Y-%m-%d %H:%M:%S"` #获取当前时间
fun_mail(){ #定义发生邮件函数
echo -e "$DT\n$1" | mail -s "$2" $mail_list
}
- check mysqld #检测mysqld进程服务是否存在
mysqld_num=`ps -ef|egrep -i "mysqld"|egrep -v "mysqld_safe"|grep -v grep| grep $PORT|wc -l`
if [ $mysqld_num == 1 ];then #存在返回正常,否则发送邮件
echo "The mysqld is alived! at:$DT"
else
echo "The mysqld has gone away"
fun_mail "The mysqld has gone away" "mail from: $IP, The mysqld has gone away"
fi
- check tcp 3306 #检测3306端口的监听是否存储;存在返回正常,否则邮件
num_listen=`netstat -tunl|grep $PORT|wc -l`
if [ $num_listen == 1 ];then
echo "The listen is alived"
else
echo "The listen has gone away"
fun_mail "The listen has gone away" "mail from: $IP, The listen has gone away"
fi
- check connect #尝试创建连接,并进行一次select检索,检测是否能够创建connect,并正常查询
mysqld_cmd="select 1;" #成功返回正常,否者邮件
mysqld_con=$($BIN_PATH/mysql -u$USER -p$PASSWD -h$HOST -P$PORT -e "$mysqld_cmd")
#echo $mysqld_con
com_res=$(echo $mysqld_con | awk '{print $2}')
#echo $com_res
if [ $com_res == 1 ];then
echo "The connect is ok!"
else
echo "The connect is flase"
fun_mail "The connect is flase" "mail from: $IP, The connect is flase "
fi
二、主从复制延迟监控
监控项:slave两个thread、延迟情况(异常情况邮件报警,并附slave info、process、transaction、lock、innodb status状态信息)
机制说明:定时任务+脚本,20秒探测一次mysqld进程
2.1)如上
2.2)如上
2.3)脚本:check_rep.sh #检测slave
- cat /data/software/script/check_rep.sh
#!/bin/bash
BIN_PATH='/usr/local/mysql/bin'
HOST='localhost'
PORT=3306
USER='orzuser'
PASSWD='XXXXXX'
SDT=`date +"%Y-%m-%d %H:%M:%S"` #获取当前时间
IP=`/sbin/ifconfig|sed -n '/inet addr/s/^[^:]*:\([0-9.]\{7,15\}\) .*/\1/p'|grep -v '127.0.0.1'|sed -n 1p` #获取本地IP地址
mail_list=xiaochou13f@126.com,zhiming.han@fumubang.com #收件人地址
- check replication #检测IO_Thread和SQL_Thread两个复制进程,是否都为Yes
M_com="show slave status\G"
M_res=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "$M_com"|grep -E "Running|File|Pos|Behind")
IO_env=$(echo $M_res|awk '{print $12}') #IO_Thread进程状态
SQL_env=$(echo $M_res|awk '{print $14}') #SQL_Thread进程装
if [ "$IO_env" = "Yes" -a "$SQL_env" = "Yes" ] #如果是双Yes返回正常,否则邮件,附件2个进程的状态
then
echo "Slave is normal at:$SDT"
else
echo -e "$SDT\nSlave is stoped\nThe IO_Thread is:$IO_env\nThe SQL_Thread is:$SQL_env" | mail -s "mail from:$IP, Replication Error" $mail_list
fi
- check relay #检测延迟
Behind_sec=$(echo $M_res | awk '{print $22}') #延迟的秒数,正常为0
- Behind_info=$(echo $M_res | awk '{print $1,$2"\n"$9,$10"\n"$3,$4"\n"$15,$16"\n"$21,$22}')
Slave_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -h$HOST -P$PORT -e "show slave status\G") #salve整体状态
Process_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "select * from information_schema.processlist where command='Query'\G") #当前活跃connect的情况
Lock_com="SELECT lw.requesting_trx_id AS request_ID, trx.trx_mysql_thread_id as request_mysql_ID,trx.trx_query AS request_command,lw.blocking_trx_id AS blocking_ID, trx1.trx_mysql_thread_id as blocking_mysql_ID,trx1.trx_query AS blocking_command, lo.lock_index AS lock_index FROM information_schema.innodb_lock_waits lw INNER JOIN information_schema.innodb_locks lo ON lw.requesting_trx_id = lo.lock_trx_id INNER JOIN information_schema.innodb_locks lo1 ON lw.blocking_trx_id = lo1.lock_trx_id INNER JOIN information_schema.innodb_trx trx ON lo.lock_trx_id = trx.trx_id INNER JOIN information_schema.innodb_trx trx1 ON lo1.lock_trx_id = trx1.trx_id\G"
Lock_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "$Lock_com") #当前锁等待情况#当前锁等待情况
Myisam_lock_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "show status like 'Table_locks%'\G") #Table_locks_waited(针对Table_locks_waited),需对比上次值
Trx_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "select * from information_schema.innodb_trx\G") #当前活跃事务的情况(innodb表)
Innodb_info=$($BIN_PATH/mysql -u$USER -p$PASSWD -e "show engine innodb status\G") #当前InnoDB引擎整体情况
- Outfile="1.The Slave info:"$Slave_info"2.The Process info:"$Process_info"3.The Lock info:"$Lock_info"4.The Innodb info:"$Innodb_info
- 如果延迟为0,反正正常;如果延迟大于5秒,邮件并附加以上信息,以便分析导致延迟的原因;延迟0-5之间日志中warning
if [ $Behind_sec -eq 0 ];then
echo "No delay"
elif [ $Behind_sec -ge 5 ];then
echo "The slave is laged: $Behind_sec s!"
echo -e "$SDT\n1.The Slave info:\n$Slave_info\n\n2.The Process info:\n$Process_info\n\n3.1 The Lock info:\n$Lock_info\n\n3.1The MyISAM lock info:\n$Myisam_lock_info\n\n4.The Trx info:\n$Trx\n\n5.The Innodb info:\n$Innodb_info" | mail -s "mail from:$IP, Slave has behind" $mail_list
elif [ $Behind_sec -gt 0 -a $Behind_sec -lt 5 ]; then
echo "Slight Waring"
fi