3. Shell代码
处理alert文件的调用脚本check_alert.sh
#!/bin/sh
#设置alert日志文件的环境变量
ORIGINAL_ALERT="/arraybill/home/oracle/app/oracle/admin/hnbill/bdump/alert_hnbill.log"
#判断输入的参数:天
if [ $# -ne 1 ]
then
echo "参数不对,用法: sh check_alert.sh 3"
exit 1
fi
if [ $1 -lt 0 ]
then
echo "对不起,请输入一个非负整数"
exit 1
fi
#查询参数对应时间以前的系统时间
sh dated_alert.sh $1 >/dev/null
#分解时间格式,注意,其中天可能是一位,也可能是两位,月份是3位,月份与天组成的字符串长度固定
str_day=$(sed 1d tmp_date.log|awk '{if(substr($1,1,1) == 0) print substr($1,2,1);else print substr($1,1,2) }')
str_length=$(echo $str_day|awk '{print length($1)}')
#将三个字母表示的月份分解成首字母大写,其他小写
str_month=$(sed 1d tmp_date.log|awk '{print substr($1,4,3)}'|sed '1,$y/[JFMASODN]ABCDEFGHIJKLMNOPQRSTUVWXYZ/[JFMASODN]abcdefghijklmnopqrstuvwxyz/')
if [ $str_length -eq 1 ]
then
echo "awk '/""$str_month $str_day""/{print NR}' "$ORIGINAL_ALERT"" >tmp.sh
#echo "$str_month $str_day"
fi
if [ $str_length -eq 2 ]
then
echo "awk '/""$str_month $str_day""/{print NR}' "$ORIGINAL_ALERT"" >tmp.sh
#echo "$str_month $str_day"
fi
chmod 777 tmp.sh
chmod 777 tmp.sh
sh tmp.sh |awk 'NR==1{print $1}' >rownum.lst
#获取要处理的alert日志的开始行号和结束行号
alert_row_begin=$(cat rownum.lst |awk '{print $1}')
alert_row_end=$(wc -l $ORIGINAL_ALERT|awk '{print $1}')
echo "awk 'NR==$alert_row_begin,NR==$alert_row_end{print \$0}' "$ORIGINAL_ALERT"" >tmp.sh
#根据行号范围,截取alert日志内容,并保存到临时文件tmp_alert.tmp中
sh tmp.sh >tmp_alert.tmp
echo
echo "-----------------------------------------------"
echo "错误汇总 开始时间:$begin_date_time"
echo "-----------------------------------------------"
echo
echo "数量 错误号"
echo "---- -------------------"
grep ORA- tmp_alert.tmp |awk '{print substr($0,0,9)}'|sort -nr| uniq -c|sort -nr
#如果有600错误,调用600错误的处理脚本
if [ $(awk '/ORA-00600/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_00600.sh
fi
#如果有07445错误,调用07445错误的处理脚本
if [ $(awk '/ORA-07445/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_07445.sh
fi
#如果有 000060 错误,调用000060错误的处理脚本
if [ $(awk '/ORA-000060/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_000060.sh
fi
#如果有 01555 错误,调用01555错误的处理脚本
if [ $(awk '/ORA-01555/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_01555.sh
fi
echo
#删除调用过程中产生的临时文件
rm -f tmp_3_line.sh
rm -f tmp_date.log
rm -f tmp.sh
rm -f ora01555_trc.log
rm -f ora006000_trc.log
rm -f ora07445_trc.log
rm -f ora000060_trc.log
rm -f trc_tmp
rm -f tmp_alert.tmp
rm -f ora006000_rownum.log
rm -f rownum.lst
#设置alert日志文件的环境变量
ORIGINAL_ALERT="/arraybill/home/oracle/app/oracle/admin/hnbill/bdump/alert_hnbill.log"
#判断输入的参数:天
if [ $# -ne 1 ]
then
echo "参数不对,用法: sh check_alert.sh 3"
exit 1
fi
if [ $1 -lt 0 ]
then
echo "对不起,请输入一个非负整数"
exit 1
fi
#查询参数对应时间以前的系统时间
sh dated_alert.sh $1 >/dev/null
#分解时间格式,注意,其中天可能是一位,也可能是两位,月份是3位,月份与天组成的字符串长度固定
str_day=$(sed 1d tmp_date.log|awk '{if(substr($1,1,1) == 0) print substr($1,2,1);else print substr($1,1,2) }')
str_length=$(echo $str_day|awk '{print length($1)}')
#将三个字母表示的月份分解成首字母大写,其他小写
str_month=$(sed 1d tmp_date.log|awk '{print substr($1,4,3)}'|sed '1,$y/[JFMASODN]ABCDEFGHIJKLMNOPQRSTUVWXYZ/[JFMASODN]abcdefghijklmnopqrstuvwxyz/')
if [ $str_length -eq 1 ]
then
echo "awk '/""$str_month $str_day""/{print NR}' "$ORIGINAL_ALERT"" >tmp.sh
#echo "$str_month $str_day"
fi
if [ $str_length -eq 2 ]
then
echo "awk '/""$str_month $str_day""/{print NR}' "$ORIGINAL_ALERT"" >tmp.sh
#echo "$str_month $str_day"
fi
chmod 777 tmp.sh
chmod 777 tmp.sh
sh tmp.sh |awk 'NR==1{print $1}' >rownum.lst
#获取要处理的alert日志的开始行号和结束行号
alert_row_begin=$(cat rownum.lst |awk '{print $1}')
alert_row_end=$(wc -l $ORIGINAL_ALERT|awk '{print $1}')
echo "awk 'NR==$alert_row_begin,NR==$alert_row_end{print \$0}' "$ORIGINAL_ALERT"" >tmp.sh
#根据行号范围,截取alert日志内容,并保存到临时文件tmp_alert.tmp中
sh tmp.sh >tmp_alert.tmp
echo
echo "-----------------------------------------------"
echo "错误汇总 开始时间:$begin_date_time"
echo "-----------------------------------------------"
echo
echo "数量 错误号"
echo "---- -------------------"
grep ORA- tmp_alert.tmp |awk '{print substr($0,0,9)}'|sort -nr| uniq -c|sort -nr
#如果有600错误,调用600错误的处理脚本
if [ $(awk '/ORA-00600/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_00600.sh
fi
#如果有07445错误,调用07445错误的处理脚本
if [ $(awk '/ORA-07445/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_07445.sh
fi
#如果有 000060 错误,调用000060错误的处理脚本
if [ $(awk '/ORA-000060/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_000060.sh
fi
#如果有 01555 错误,调用01555错误的处理脚本
if [ $(awk '/ORA-01555/{print NR}' tmp_alert.tmp|wc -l|awk '{print $1}') -gt 0 ]
then
sh check_01555.sh
fi
echo
#删除调用过程中产生的临时文件
rm -f tmp_3_line.sh
rm -f tmp_date.log
rm -f tmp.sh
rm -f ora01555_trc.log
rm -f ora006000_trc.log
rm -f ora07445_trc.log
rm -f ora000060_trc.log
rm -f trc_tmp
rm -f tmp_alert.tmp
rm -f ora006000_rownum.log
rm -f rownum.lst
产生格式化日期的脚本dated_alert.sh
echo "set heading off" >get_date.sql
echo "set feedback off" >>get_date.sql
echo "set term off" >>get_date.sql
echo "spool tmp_date.log" >>get_date.sql
echo "select sysdate-"$1" from dual;" >>get_date.sql
echo "spool off" >>get_date.sql
echo "exit" >>get_date.sql
export NLS_LANG=american
sqlplus "/as sysdba" @get_date.sql <<EOF
EOF
rm -f get_date.sql
echo "set feedback off" >>get_date.sql
echo "set term off" >>get_date.sql
echo "spool tmp_date.log" >>get_date.sql
echo "select sysdate-"$1" from dual;" >>get_date.sql
echo "spool off" >>get_date.sql
echo "exit" >>get_date.sql
export NLS_LANG=american
sqlplus "/as sysdba" @get_date.sql <<EOF
EOF
rm -f get_date.sql
处理ORA-00600错误的脚本check_00600.sh
本只给出ORA-00600错误的处理脚本,其他错误类似,限于篇幅,其他错误的处理脚本在此处省略。
echo
echo "----------------------"
echo "ORA-00600错误跟踪信息"
echo "----------------------"
awk '/ORA-00600/{print NR}' tmp_alert.tmp|awk '{print "awk ""'\''""NR=="$1-1" {print ""\"mget \"""substr($4,1,length($4)-1)}""'\''"" tmp_alert.tmp"}' >tmp1.sh
chmod 777 tmp1.sh
chmod 777 tmp1.sh
echo
echo
sh tmp1.sh |sort -r| uniq >ora006000_trc.log
#对所有的ORA-00600跟踪文件进行处理,去掉重复的文件,判断文件是否存在,并生成ftp可用的格式
echo "" >trc_tmp
for trc_file in $(cat ora006000_trc.log|awk '{print $2}')
do
if [ -f $trc_file ]
then
echo "mget $trc_file" >>trc_tmp
else
echo "找不到跟踪文件: $trc_file" >>trc_tmp
fi
done
cat trc_tmp|sort -r
echo
echo
echo >tmp_3_line.sh
chmod 777 tmp_3_line.sh
chmod 777 tmp_3_line.sh
#对于每个ORA-00600,打印时间戳、跟踪文件和具体报错信息
for tmp_row_num in $(awk '/ORA-00600/{print NR}' tmp_alert.tmp)
do
let tmp_time=$tmp_row_num-2
let tmp_trc=$tmp_row_num-1
echo "awk 'NR=="$tmp_time"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "awk 'NR=="$tmp_trc"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "awk 'NR=="$tmp_row_num"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "echo" >>tmp_3_line.sh
done
sh tmp_3_line.sh
echo
echo
#错误信息提示
echo "---------------"
echo "关于ORA-00600"
echo "---------------"
oerr ora 00600
echo
echo "----------------------"
echo "ORA-00600错误跟踪信息"
echo "----------------------"
awk '/ORA-00600/{print NR}' tmp_alert.tmp|awk '{print "awk ""'\''""NR=="$1-1" {print ""\"mget \"""substr($4,1,length($4)-1)}""'\''"" tmp_alert.tmp"}' >tmp1.sh
chmod 777 tmp1.sh
chmod 777 tmp1.sh
echo
echo
sh tmp1.sh |sort -r| uniq >ora006000_trc.log
#对所有的ORA-00600跟踪文件进行处理,去掉重复的文件,判断文件是否存在,并生成ftp可用的格式
echo "" >trc_tmp
for trc_file in $(cat ora006000_trc.log|awk '{print $2}')
do
if [ -f $trc_file ]
then
echo "mget $trc_file" >>trc_tmp
else
echo "找不到跟踪文件: $trc_file" >>trc_tmp
fi
done
cat trc_tmp|sort -r
echo
echo
echo >tmp_3_line.sh
chmod 777 tmp_3_line.sh
chmod 777 tmp_3_line.sh
#对于每个ORA-00600,打印时间戳、跟踪文件和具体报错信息
for tmp_row_num in $(awk '/ORA-00600/{print NR}' tmp_alert.tmp)
do
let tmp_time=$tmp_row_num-2
let tmp_trc=$tmp_row_num-1
echo "awk 'NR=="$tmp_time"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "awk 'NR=="$tmp_trc"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "awk 'NR=="$tmp_row_num"{print \$0}' tmp_alert.tmp" >>tmp_3_line.sh
echo "echo" >>tmp_3_line.sh
done
sh tmp_3_line.sh
echo
echo
#错误信息提示
echo "---------------"
echo "关于ORA-00600"
echo "---------------"
oerr ora 00600
echo
输出样例
$ sh check_alert.sh 2
----------------------------------------------
错误汇总 开始时间:Wed Sep 10 00:11:39 2008
-----------------------------------------------
数量 错误号
---- -------------------
3 ORA-00600
1 ORA-01555
----------------------
ORA-00600 错误跟踪信息
----------------------
mget /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_5984.trc
找不到跟踪文件: /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_10650.trc
Fri Sep 10 11:55:42 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_15196.trc:
ORA-00600: internal error code, arguments: [qerrmObnd1], [932], [ORA-00932: inconsistent datatypes: expected got
Fri Sep 10 11:56:16 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_15196.trc:
ORA-00600: internal error code, arguments: [qerrmObnd1], [932], [ORA-00932: inconsistent datatypes: expected got
Tue Sep 10 19:48:49 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_5984.trc:
ORA-00600: internal error code, arguments: [729], [1200], [space leak], [], [], [], [], []
---------------
关于ORA-00600
---------------
00600, 00000, "internal error code, arguments: [%s], [%s], [%s], [%s], [%s], [%s], [%s], [%s]"
// *Cause: This is the generic internal error number for Oracle program
// exceptions. This indicates that a process has encountered an
// exceptional condition.
// *Action: Report as a bug - the first argument is the internal error number
-------------------------
ORA-01555 错误跟踪信息
-------------------------
Thu Sep 11 04:38:18 2008
ORA-01555 caused by SQL statement below (Query Duration=12429 sec, SCN: 0x0240.ba061afa):
Thu Sep 11 04:38:18 2008
SELECT * FROM BB_SERVICE_RELATION_T WHERE CITY_CODE = :B4 AND SERVICE_KIND = :B3 AND IF_VALID = 1 AND SERVING_STATUS <> 10 AND SERVING_STATUS <> 20 AND (PRESERVE02 <> 1 OR PRESERVE02 IS NULL) AND MOD(USER_ID,:B2 ) = :B1 - 1
---------------
关于ORA-01555
---------------
01555, 00000, "snapshot too old: rollback segment number %s with name \"%s\" too small"
// *Cause: rollback records needed by a reader for consistent read are
// overwritten by other writers
// *Action: If in Automatic Undo Management mode, increase undo_retention
// setting. Otherwise, use larger rollback segments
----------------------------------------------
错误汇总 开始时间:Wed Sep 10 00:11:39 2008
-----------------------------------------------
数量 错误号
---- -------------------
3 ORA-00600
1 ORA-01555
----------------------
ORA-00600 错误跟踪信息
----------------------
mget /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_5984.trc
找不到跟踪文件: /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_10650.trc
Fri Sep 10 11:55:42 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_15196.trc:
ORA-00600: internal error code, arguments: [qerrmObnd1], [932], [ORA-00932: inconsistent datatypes: expected got
Fri Sep 10 11:56:16 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_15196.trc:
ORA-00600: internal error code, arguments: [qerrmObnd1], [932], [ORA-00932: inconsistent datatypes: expected got
Tue Sep 10 19:48:49 2008
Errors in file /arraybill/home/oracle/app/oracle/admin/hnbill/udump/hnbill_ora_5984.trc:
ORA-00600: internal error code, arguments: [729], [1200], [space leak], [], [], [], [], []
---------------
关于ORA-00600
---------------
00600, 00000, "internal error code, arguments: [%s], [%s], [%s], [%s], [%s], [%s], [%s], [%s]"
// *Cause: This is the generic internal error number for Oracle program
// exceptions. This indicates that a process has encountered an
// exceptional condition.
// *Action: Report as a bug - the first argument is the internal error number
-------------------------
ORA-01555 错误跟踪信息
-------------------------
Thu Sep 11 04:38:18 2008
ORA-01555 caused by SQL statement below (Query Duration=12429 sec, SCN: 0x0240.ba061afa):
Thu Sep 11 04:38:18 2008
SELECT * FROM BB_SERVICE_RELATION_T WHERE CITY_CODE = :B4 AND SERVICE_KIND = :B3 AND IF_VALID = 1 AND SERVING_STATUS <> 10 AND SERVING_STATUS <> 20 AND (PRESERVE02 <> 1 OR PRESERVE02 IS NULL) AND MOD(USER_ID,:B2 ) = :B1 - 1
---------------
关于ORA-01555
---------------
01555, 00000, "snapshot too old: rollback segment number %s with name \"%s\" too small"
// *Cause: rollback records needed by a reader for consistent read are
// overwritten by other writers
// *Action: If in Automatic Undo Management mode, increase undo_retention
// setting. Otherwise, use larger rollback segments
4. 后处理与展望
本文只是对符合时间格式的alert文档进行处理,脚本还需要进一步完善,以提高其可扩展性,即能定义一个错误列表,相当于一个配置表,只需要修改错误列表,就能决定哪些错误是需要重点关注的。此外,还需要添加一个模块,如果在alert文件对应的时间戳上没有找到记录,需要进行一个异常处理。
对查找到的错误,要及时进行后处理,这些处理包括:对错误进行分析,查找文档,需要优化的语句进行优化,需要打补丁的bug,视情况下载并打补丁。