dbms_hm.run_check遇到ORA-00604、ORA-01427

11.2.0.3 下尝试使用11g health monitor新特性时出现了ORA-00604、ORA-01427, 查询MOS发现 (Bug 12385172: ORA-01427 WHEN EXECUTING DBMS_HM.RUN_CHECK),当 DB中存在case when then的function index时会触发该BUG:

 

SQL> select * from v$version;

BANNER
--------------------------------------------------------------------------------
Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit Production
PL/SQL Release 11.2.0.3.0 - Production
CORE    11.2.0.3.0      Production
TNS for Linux: Version 11.2.0.3.0 - Production
NLSRTL Version 11.2.0.3.0 - Production

SQL> select * from global_name;

GLOBAL_NAME
--------------------------------------------------------------------------------
www.askmaclean.com

SQL> exec dbms_hm.run_check('Dictionary Integrity Check','check-2');
BEGIN dbms_hm.run_check('Dictionary Integrity Check','check-2'); END;

*
ERROR at line 1:
ORA-00604: error occurred at recursive SQL level 1
ORA-01427: single-row subquery returns more than one row
ORA-06512: at "SYS.DBMS_HM", line 191
ORA-06512: at line 1

 

 

可以通过以下脚本找出 DB中case when then类型的函数索引:

 

 

-- Determine DDL statements (note: this will take a while to return results!)

 set long 100000

 exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'PRETTY',true);
 exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'TABLESPACE',false);
 exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'SEGMENT_ATTRIBUTES',false);
 exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'STORAGE',false);

 -- Checking the DDL statement
 col DDL form a100 word_wrapped
 select dbms_metadata.get_ddl(RTRIM(UPPER(object_type)),
                              RTRIM(UPPER(object_name)),
                              RTRIM(UPPER(owner))) DDL
  from DBA_OBJECTS
 where object_type='INDEX'
   and object_id
    in (select x from (select obj# x, obj#||','||intcol#,  count(obj#||','||intcol#)
          from ICOLDEP$
         group by obj#, obj#||','||intcol# having count(*) > 1)
 );

 

 

对于安装了APEX 组件或者在DBCA创建数据库时选择了General Purpose从Seed中clone数据库而非Custom Database的DB ,都会创建有”APEX_030200″.”WWV_FLOW_WORKSHEETS_UNQ_IDX”、”APEX_030200″.”WWV_FLOW_WS_UNQ_ALIAS_IDX”、”APEX_030200″.”WWV_FLOW_WORKSHEET_RPTS_UK” 三个函数索引。

如果没有实际使用APEX组件的话,我们可以直接DROP掉APEX_030200:

 

SQL> drop user "APEX_030200" cascade;

User dropped.

SQL> set long 100000
SQL>
SQL> exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'PRETTY',true);

PL/SQL procedure successfully completed.

SQL> exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'TABLESPACE',false);

PL/SQL procedure successfully completed.

SQL> exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'SEGMENT_ATTRIBUTES',false);

PL/SQL procedure successfully completed.

SQL> exec dbms_metadata.set_transform_param(dbms_metadata.session_transform,'STORAGE',false);

PL/SQL procedure successfully completed.

SQL>
SQL> -- Checking the DDL statement
SQL> col DDL form a100 word_wrapped
SQL> select dbms_metadata.get_ddl(RTRIM(UPPER(object_type)),
  2                               RTRIM(UPPER(object_name)),
  3                               RTRIM(UPPER(owner))) DDL
  4   from DBA_OBJECTS
  5  where object_type='INDEX'
  6    and object_id
  7     in (select x from (select obj# x, obj#||','||intcol#,  count(obj#||','||intcol#)
  8           from ICOLDEP$
  9          group by obj#, obj#||','||intcol# having count(*) > 1)
 10  );

no rows selected

 

 

再次尝试测试health check dictionary 发现问题仍存在:

 

SQL>  exec dbms_hm.run_check('Dictionary Integrity Check','check-mac3');
BEGIN dbms_hm.run_check('Dictionary Integrity Check','check-mac3'); END;

*
ERROR at line 1:
ORA-00604: error occurred at recursive SQL level 1
ORA-01427: single-row subquery returns more than one row
ORA-06512: at "SYS.DBMS_HM", line 191
ORA-06512: at line 1

 

 

到这一步决定自己来诊断这个ORA-01427错误的根源, 因为是递归SQL层出现故障,所以这里我们可以用到ERRORSTACK来深入了解问题:

 

 

SQL> oradebug setmypid;
Statement processed.

SQL> oradebug event 1427 trace name errorstack level 4;
Statement processed.

/* 以上我们设置当触发1427错误事件时TRACE level 4的错误堆栈ERRORSTACK */

SQL> exec dbms_hm.run_check('Dictionary Integrity Check','check-mac4');
BEGIN dbms_hm.run_check('Dictionary Integrity Check','check-mac4'); END;

*
ERROR at line 1:
ORA-00604: error occurred at recursive SQL level 1
ORA-01427: single-row subquery returns more than one row
ORA-06512: at "SYS.DBMS_HM", line 191
ORA-06512: at line 1

/* 触发ORA-01427 错误 将生成相关TRACE 信息*/

SQL> oradebug tracefile_name
/s01/orabase/diag/rdbms/vprod/VPROD1/trace/VPROD1_ora_7781.trc

 

来进一步观察生成的TRACE文件:

 

*** 2012-04-30 09:20:55.438
dbms_hm: (In run_check)
Begin dbkhicd_run_check
dbkh_run_check_internal: BEGIN; check_namep=Dictionary Integrity Check, run_namep=check-mac4
dbkh_run_check_internal: BEGIN; timeout=0
dbkh_run_check_internal: AFTER RUN CREATE; run_id=1281

*** 2012-04-30 09:20:55.603
dbkedDefDump(): Starting a non-incident diagnostic dump (flags=0x0, level=4, mask=0x0)
----- Error Stack Dump -----
ORA-01427: single-row subquery returns more than one row
----- Current SQL Statement for this session (sql_id=gxjzd1s7m8xfj) -----
select 52, rowid, 'ind$.obj#'
  from IND$
 where obj# < 0
union all
select 57, rowid, 'ind$.type#'
  from IND$
 where type# not between 1 and 9
union all
select 58, rowid, 'ind$.pctfree$'
  from IND$
 where pctfree$ not between 0 and 99
union all
select 59, rowid, 'ind$.analyzetime <= SYSDATE'
  from IND$
 where analyzetime > SYSDATE
union all
select 51, rowid, 'ind$.obj# pk'
  from IND$
 where obj# is null
union all
select 51, rowid, 'ind$.obj# pk'
  from IND$
 where 1 > (select obj# from IND$ group by obj# having count(*) > 1)
union all
select 53, rowid, 'ind$.dataobj# range'
  from IND$
 where 1 >
       (select dataobj# from IND$ group by dataobj# having count(*) > 1)
union all
select 54, rowid, 'ind$.ts# fk'
  from IND$
 where (ts#) in (select ts#
                   from IND$
                  where (ts#) not in (select ts# from ts$)
                    and ts# != 2147483647)
union all
select 55, rowid, 'ind$.ts,file,block fk'
  from IND$
 where (ts#, file#, block#) in (select ts#, file#, block#
                                  from IND$
                                 where (ts#, file#, block#) not in
                                       (select ts#, file#, block# from seg$)
                                   and file# != 0
                                   and block# != 0)
union all
select 56, rowid, 'ind$.obj# fk_obj$'
  from IND$
 where (obj#) in
       (select obj# from IND$ where (obj#) not in (select obj# from obj$))

----- PL/SQL Stack -----
----- PL/SQL Call Stack -----
  object      line  object
  handle    number  name
0xb1269160       191  package body SYS.DBMS_HM
0xb1d9f600         1  anonymous block

 

实际触发ORA-01427的是一条较长的递归SQL语句,该SQL由多个部分UNION ALL组合而成负责检测IND$基表是否存在逻辑不一致, 实际检测可以发现真真存在问题的是 这一段SQL:

 

select 53, rowid, 'ind$.dataobj# range'
  from IND$
 where 1 >
       (select dataobj# from IND$ group by dataobj# having count(*) > 1)

ERROR at line 4:
ORA-01427: single-row subquery returns more than one row

SQL>  select dataobj# from IND$ group by dataobj# having count(*) > 1;

  DATAOBJ#
----------

     75601
     75599
     75594
     75605

 

IND$ 基表上居然存在多条dataobj#重复的记录,我们来看看是哪些对象:

 

select /*+ first_rows */
 owner, object_name, data_object_id
  from dba_objects
 where data_object_id in
       (select dataobj# from IND$ group by dataobj# having count(*) > 1)
       order by 3 ;

OWNER                          OBJECT_NAME                    DATA_OBJECT_ID
------------------------------ ------------------------------ --------------
SYS                            SYS_C0010990                            75594
OE                             WHS_LOCATION_IX                         75594
OE                             ORD_CUSTOMER_IX                         75599
SYS                            SYS_IOT_TOP_75598                       75599
SYS                            SYS_IOT_TOP_75600                       75601
OE                             CUST_ACCOUNT_MANAGER_IX                 75601
OE                             PROD_SUPPLIER_IX                        75605
SYS                            SYS_IOT_TOP_75603                       75605

8 rows selected.

 

 

OE这个Sample Schema下的多个索引居然和SYS用户的一些索引的DATA_OBJECT_ID重号; 我们不可能去改动SYS下的对象,而OE这个Schema则无关紧要,删除这些OE下的问题索引:

 

SQL> drop index oe.WHS_LOCATION_IX;

Index dropped.

SQL> drop index oe.ORD_CUSTOMER_IX;

Index dropped.

SQL> drop index oe.CUST_ACCOUNT_MANAGER_IX;

Index dropped.

SQL> drop index oe.PROD_SUPPLIER_IX;

Index dropped.

SQL> select dataobj# from IND$ group by dataobj# having count(*) > 1;

  DATAOBJ#
----------

 

再次测试后成功执行Dictionary Integrity Check

 

SQL> exec dbms_hm.run_check('Dictionary Integrity Check','check-mac5');
PL/SQL procedure successfully completed.
SQL> set pause on;
SQL> spool dic_check
SQL> SET LONG 100000
SQL> SET LONGCHUNKSIZE 1000
SQL> SET PAGESIZE 100
SQL> SET LINESIZE 512
SQL> SELECT DBMS_HM.GET_RUN_REPORT('CHECK-MAC5') FROM DUAL;
DBMS_HM.GET_RUN_REPORT('CHECK-MAC5')
-----------------------------------------------------
Basic Run Information
 Run Name                     : check-mac5
 Run Id                       : 1301
 Check Name                   : Dictionary Integrity Check
 Mode                         : MANUAL
 Status                       : COMPLETED
 Start Time                   : 2012-04-30 09:33:28.540140 -04:00
 End Time                     : 2012-04-30 09:33:32.303679 -04:00
 Error Encountered            : 0
 Source Incident Id           : 0
 Number of Incidents Created  : 0
Input Paramters for the Run
 TABLE_NAME=ALL_CORE_TABLES
 CHECK_MASK=ALL
Run Findings And Recommendations
 Finding
 Finding Name  : Dictionary Inconsistency
 Finding ID    : 1302
 Type          : FAILURE
 Status        : OPEN
 Priority      : CRITICAL
 Message       : SQL dictionary health check: syn$.owner fk 95 on object SYN$
               failed
 Message       : Damaged rowid is AAAABEAABAAANWgAB7 - description: Synonymn
               APEX is referenced
 Finding
 Finding Name  : Dictionary Inconsistency
 Finding ID    : 1305
 Type          : FAILURE
 Status        : OPEN
 Priority      : CRITICAL
 Message       : SQL dictionary health check: syn$.owner fk 95 on object SYN$
               failed
 Message       : Damaged rowid is AAAABEAABAAANWhAAu - description: Synonymn
               APEXWS is referenced
 Finding
 Finding Name  : Dictionary Inconsistency
 Finding ID    : 1308
 Type          : FAILURE
 Status        : OPEN
 Priority      : CRITICAL
 Message       : SQL dictionary health check: syn$.owner fk 95 on object SYN$
               failed
 Message       : Damaged rowid is AAAABEAABAAANWgACO - description: Synonymn
               APEX_ACTIVITY_LOG is referenced
 Finding
 Finding Name  : Dictionary Inconsistency
 Finding ID    : 1311
 Type          : FAILURE
 Status        : OPEN
 Priority      : CRITICAL
 Message       : SQL dictionary health check: syn$.owner fk 95 on object SYN$
               failed
 Message       : Damaged rowid is AAAABEAABAAANWgABl - description: Synonymn
               APEX_ADMIN is referenced
 Finding
 Finding Name  : Dictionary Inconsistency
 Finding ID    : 1314
 Type          : FAILURE
 Status        : OPEN
 Priority      : CRITICAL
 Message       : SQL dictionary health check: syn$.owner fk 95 on object SYN$
               failed
 Message       : Damaged rowid is AAAABEAABAAANWgACB - description: Synonymn
               APEX_APPLICATION is referenced

 

 

这个case希望大家能了解的是对于ORA-00604这类递归SQL层的错误,报错信息本身给出的诊断信息是不完整的,需要我们通过一些工具来深入了解实际引发错误的是哪一条SQL语句,这些Recusive SQL出错的主要原因往往是BUG、或者数据字典存在不一致。如何在脱离MOS和SR帮助的情况下,安全地WorkAround绕过这个错误。

了解你所不知道的SMON功能(七):清理IND$字典基表

SMON的作用还包括清理IND$字典基表(cleanup ind$):
ind$_flags

触发场景

当我们在线创建或重建索引时(create or rebuild index online),服务进程会到IND$字典基表中将该索引对应的记录的FLAGS字段修改为十进制的256或者512(见上图0x100=256,0×200=512),如:

SQL> create index macleans_index on larges(owner,object_name) online;
SQL> select obj# from obj$ where name='MACLEANS_INDEX';
OBJ#
----------
1343842
SQL> select FLAGS from ind$ where obj#=1343842;
FLAGS
----------
256
ind_online$字典基表记录了索引在线创建/重建的历史
SQL> select * from ind_online$;
OBJ#      TYPE#      FLAGS
---------- ---------- ----------
1343839          1        256
1343842          1        256
create table ind_online$
( obj#          number not null,
type#         number not null,              /* what kind of index is this? */
/* normal : 1 */
/* bitmap : 2 */
/* cluster : 3 */
/* iot - top : 4 */
/* iot - nested : 5 */
/* secondary : 6 */
/* ansi : 7 */
/* lob : 8 */
/* cooperative index method : 9 */
flags         number not null
/* index is being online built : 0x100 */
/* index is being online rebuilt : 0x200 */
)

原则上online create/rebuild index的的清理工作由实际操作的服务进程负责完成,这种清理在DDL语句成功的情况下包括一系列数据字典的维护,在该DDL语句失败的情形中包括对临时段的清理和数据字典的维护,无论如何都需要drop在线日志中间表 SYS_JOURNAL_nnnnn(nnnn为该索引的obj#)。数据字典的维护工作就包含对IND$基表中相应索引记录的FLAGS标志位的恢复,但是如果服务进程在语句执行过程中意外终止的话,那么短时间内FLAGS标志位字段就无法得到恢复,这将导致对该索引的后续操作因ORA-8104错误而无法继续:

SQL> drop index macleans_index;
drop index macleans_index
*
ERROR at line 1:
ORA-08104: this index object 1343842 is being online built or rebuilt
08104, 00000, "this index object %s is being online built or rebuilt"
// *Cause:  the index is being created or rebuild or waited for recovering
//          from the online (re)build
// *Action: wait the online index build or recovery to complete

SMON负责在启动后(startup)的每小时执行一次对IND$基表中因在线创建/重建索引失败所留下记录的清理,这种清理工作由kdicclean函数驱动(kdicclean is run by smon every 1 hour,called from SMON to find if there is any online builder death and cleanup our ind$ and obj$ and drop the journal table, stop journaling)。
这种清理工作典型的调用堆栈stack call如下:

ksbrdp -> ktmSmonMain  ktmmon -> kdicclean -> kdic_cleanup -> ktssdrp_segment

注意因为SMON进程的清理工作每小时才执行一次,而且在工作负载很高的情况下可能实际很久都不会得到清理,在这种情景中我们总是希望能尽快完成对索引的在线创建或重建,在10gr2以后的版本中我们可以直接使用dbms_repair.online_index_clean来手动清理online index rebuild的遗留问题:

SQL> drop index macleans_index;
drop index macleans_index
*
ERROR at line 1:
ORA-08104: this index object 1343842 is being online built or rebuilt
DECLARE
isClean BOOLEAN;
BEGIN
isClean := FALSE;
WHILE isClean=FALSE
LOOP
isClean := dbms_repair.online_index_clean(
dbms_repair.all_index_id, dbms_repair.lock_wait);
dbms_lock.sleep(10);
END LOOP;
END;
/
SQL>  drop index macleans_index;
drop index macleans_index
*
ERROR at line 1:
ORA-01418: specified index does not exist
成功清理

但是如果在9i中的话就比较麻烦,可以尝试用以下方法(不是很推荐,除非你已经等了很久):

1.首先手工删除在线日志表,通过以下手段找出这个中间表的名字
select object_name
from dba_objects
where object_name like
(select '%' || object_id || '%'
from dba_objects
where object_name = '&INDEX_NAME')
/
Enter value for index_name: MACLEANS_INDEX
old   6:          where object_name = '&INDEX_NAME')
new   6:          where object_name = 'MACLEANS_INDEX')
OBJECT_NAME
--------------------------------------------------------------------------------
SYS_JOURNAL_1343845
SQL> drop table SYS_JOURNAL_1343845;
Table dropped.
2.第二步要手动修改IND$字典基表
!!!!!! 注意!手动修改数据字典要足够小心!!
select flags from ind$ where obj#=&INDEX_OBJECT_ID;
Enter value for index_object_id: 1343845
old   1: select flags from ind$ where obj#=&INDEX_OBJECT_ID
new   1: select flags from ind$ where obj#=1343845
FLAGS
----------
256
a) 针对online create index,手动删除对应的记录
delete from IND$ where obj#=&INDEX_OBJECT_ID
b) 针对online rebuild index,手动恢复对应记录的FLAGS标志位
update IND$ set FLAGS=FLAGS-512 where obj#=&INDEX_OBJECT_ID

接下来我们实际观察一下清理工作的细节:

SQL> select obj# from obj$ where name='MACLEANS_INDEX';
OBJ#
----------
1343854
SQL> select FLAGS from ind$ where obj#=1343854;
FLAGS
----------
256
SQL> oradebug setmypid;
Statement processed.
SQL> oradebug event 10046 trace name context forever,level 8;
Statement processed.
SQL> DECLARE
2   isClean BOOLEAN;
3  BEGIN
4    isClean := FALSE;
5    WHILE isClean=FALSE
6    LOOP
7      isClean := dbms_repair.online_index_clean(
8      dbms_repair.all_index_id, dbms_repair.lock_wait);
9
10      dbms_lock.sleep(10);
11    END LOOP;
12  END;
13  /
PL/SQL procedure successfully completed.
===============================10046 trace=============================
select i.obj#, i.flags, u.name, o.name, o.type#
from sys.obj$ o, sys.user$ u, sys.ind_online$ i
where (bitand(i.flags, 256) = 256 or bitand(i.flags, 512) = 512)
and (not ((i.type# = 9) and bitand(i.flags, 8) = 8))
and o.obj# = i.obj#
and o.owner# = u.user#
select u.name,
o.name,
o.namespace,
o.type#,
decode(bitand(i.property, 1024), 0, 0, 1)
from ind$ i, obj$ o, user$ u
where i.obj# = :1
and o.obj# = i.bo#
and o.owner# = u.user#
delete from object_usage
where obj# in (select a.obj#
from object_usage a, ind$ b
where a.obj# = b.obj#
and b.bo# = :1)
drop table "SYS"."SYS_JOURNAL_1343854" purge
delete from icoldep$ where obj# in (select obj# from ind$ where bo#=:1)
delete from ind$ where bo#=:1
delete from ind$ where obj#=:1

我们可以利用以下语句找出系统中可能需要恢复的IND$记录,注意不要看到查询有结果就认为这是操作失败的征兆,很可能是有人在线创建或重建索引:

select i.obj#, i.flags, u.name, o.name, o.type#
from sys.obj$ o, sys.user$ u, sys.ind_online$ i
where (bitand(i.flags, 256) = 256 or bitand(i.flags, 512) = 512)
and (not ((i.type# = 9) and bitand(i.flags, 8) = 8))
and o.obj# = i.obj#
and o.owner# = u.user#
/

相关诊断事件可以通过设置诊断事件event=’8105 trace name context forever’
来禁止SMON清理IND$(Oracle event to turn off smon cleanup for online index build)

 alter system set events '8105 trace name context forever';

沪ICP备14014813号

沪公网安备 31010802001379号