United StatesChange Country, Oracle Worldwide Web Sites Communities I am a... I want to...
Bug ID: 7042740 CMS: assert(n> q) failed: Looping at: ... blockOffsetTable.cpp:557
7042740 : CMS: assert(n> q) failed: Looping at: ... blockOffsetTable.cpp:557

Details
Type:
Bug
Submit Date:
2011-05-07
Status:
Closed
Updated Date:
2012-12-05
Project Name:
JDK
Resolved Date:
2011-06-09
Component:
hotspot
OS:
generic
Sub-Component:
gc
CPU:
generic
Priority:
P3
Resolution:
Fixed
Affected Versions:
7
Fixed Versions:
hs21

Related Reports
Backport:
Backport:
Backport:
Duplicate:
Relates:
Relates:
Relates:

Sub Tasks

Description
;; Using jvm: "/export/local/common/jdk/baseline/solaris-sparcv9/jre/lib/sparcv9/server/libjvm.so"
#
# A fatal error has been detected by the Java Runtime Environment:
#
#  Internal Error (/tmp/jprt/P1/B/212141.brutisso/source/src/share/vm/memory/blockOffsetTable.cpp:557), pid=15208, tid=8
#  assert(n> q) failed: Looping at: 0xf56df77e62925778
#
# JRE version: 7.0-b140
# Java VM: Java HotSpot(TM) 64-Bit Server VM (21.0-b10-internal-201105032121.brutisso.hs-gc-boauub-fastdebug mixed mode solaris-sparc )
# Core dump written. Default location: /export/local/49527.JDK7.NIGHTLY.VM+solaris-sparcv9_vm_server_mixed_nsk.stress.testlist/results/ResultDir/gcl001/core or core.15208
#
# If you would like to submit a bug report, please visit:
#   http://bugreport.sun.com/bugreport/crash.jsp
#

---------------  T H R E A D  ---------------

Current thread (0x000000010015e800):  GCTaskThread [stack: 0x0000000000000000,0x0000000000000000] [id=8]

Stack: 
[error occurred during error reporting (printing stack bounds), id 0xe0000000]

Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.so+0x1236d8c]  void VMError::report_and_die()+0x794;;  void VMError::report_and_die()+0x794
V  [libjvm.so+0x6ea0c0]  void report_vm_error(const char*,int,const char*,const char*)+0x78;;  void report_vm_error(const char*,int,const char*,const char*)+0x78
V  [libjvm.so+0x1235a38]  void VMError::report(outputStream*)+0x9a8;;  void VMError::report(outputStream*)+0x9a8
V  [libjvm.so+0x1236d8c]  void VMError::report_and_die()+0x794;;  void VMError::report_and_die()+0x794
V  [libjvm.so+0x6ea0c0]  void report_vm_error(const char*,int,const char*,const char*)+0x78;;  void report_vm_error(const char*,int,const char*,const char*)+0x78
V  [libjvm.so+0x36c8fc]  HeapWord*BlockOffsetArrayNonContigSpace::block_start_unsafe(const void*)const+0x2b4;;  HeapWord*BlockOffsetArrayNonContigSpace::block_start_unsafe(const void*)const+0x2b4
V  [libjvm.so+0xf0bb38]  void CardTableModRefBS::process_stride(Space*,MemRegion,int,int,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*,signed char**,unsigned long,unsigned long)+0x3b0;;  void CardTableModRefBS::process_stride(Space*,MemRegion,int,int,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*,signed char**,unsigned long,unsigned long)+0x3b0
V  [libjvm.so+0xf0b6a4]  void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space*,MemRegion,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*,int)+0x13c;;  void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space*,MemRegion,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*,int)+0x13c
V  [libjvm.so+0x4d1c94]  void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space*,MemRegion,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*)+0x64;;  void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space*,MemRegion,DirtyCardToOopClosure*,ClearNoncleanCardWrapper*)+0x64
V  [libjvm.so+0x4d54c4]  void CardTableRS::younger_refs_in_space_iterate(Space*,OopsInGenClosure*)+0x354;;  void CardTableRS::younger_refs_in_space_iterate(Space*,OopsInGenClosure*)+0x354
V  [libjvm.so+0x67de98]  void ConcurrentMarkSweepGeneration::younger_refs_iterate(OopsInGenClosure*)+0x90;;  void ConcurrentMarkSweepGeneration::younger_refs_iterate(OopsInGenClosure*)+0x90
V  [libjvm.so+0x80e9cc]  void GenCollectedHeap::gen_process_strong_roots(int,bool,bool,bool,SharedHeap::ScanningOption,OopsInGenClosure*,bool,OopsInGenClosure*)+0x234;;  void GenCollectedHeap::gen_process_strong_roots(int,bool,bool,bool,SharedHeap::ScanningOption,OopsInGenClosure*,bool,OopsInGenClosure*)+0x234
V  [libjvm.so+0xf16fd4]  void ParNewGenTask::work(int)+0x1d4;;  void ParNewGenTask::work(int)+0x1d4
V  [libjvm.so+0x126af8c]  void GangWorker::loop()+0x27c;;  void GangWorker::loop()+0x27c
V  [libjvm.so+0xee3110]  java_start+0x270;;  java_start+0x270


...

Other Threads:
  0x0000000101692800 VMThread [stack: 0xffffffff75200000,0xffffffff75300000] [id=16]
  0x0000000101777800 WatcherThread [stack: 0xffffffff5ee00000,0xffffffff5ef00000] [id=26]

=>0x000000010015e800 (exited) GCTaskThread [stack: 0x0000000000000000,0x0000000000000000] [id=8]

VM state:at safepoint (normal execution)

VM Mutex/Monitor currently owned by a thread:  ([mutex/lock_event])
[0x000000010014c0b8] Threads_lock - owner thread: 0x0000000101692800
[0x000000010014c9a8] Heap_lock - owner thread: 0x0000000101c7b800

Heap
 par new generation   total 61184K, used 61134K [0xffffffff60000000, 0xffffffff64260000, 0xffffffff65000000)
  eden space 54400K,  99% used [0xffffffff60000000, 0xffffffff6351eb08, 0xffffffff63520000)
  from space 6784K,  99% used [0xffffffff63bc0000, 0xffffffff64254f98, 0xffffffff64260000)
  to   space 6784K,   0% used [0xffffffff63520000, 0xffffffff63520000, 0xffffffff63bc0000)
 concurrent mark-sweep generation total 133784K, used 25565K [0xffffffff65000000, 0xffffffff6d2a6000, 0xffffffff6f400000)
 concurrent-mark-sweep perm gen total 21248K, used 2923K [0xffffffff6f400000, 0xffffffff708c0000, 0xffffffff74400000)

Code Cache  [0xffffffff76000000, 0xffffffff76400000, 0xffffffff79000000)
 total_blobs=198 nmethods=26 adapters=134 free_code_cache=48019Kb largest_free_block=49131264

...
VM Arguments:
jvm_args: -Xmixed -XX:-PrintVMOptions -Xconcgc -XX:+CMSClassUnloadingEnabled -XX:+StartAttachListener -XX:+IgnoreUnrecognizedVMOptions -XX:-UseCompressedOops -Djava.library.path=/export/local/common/testbase/7/vm/vm/bin/lib/solaris-sparcv9/nsk/stress/jni/gclocker/ 
java_command: nsk.stress.jni.gclocker.gcl001
Launcher Type: SUN_STANDARD

...
---------------  S Y S T E M  ---------------

OS:                   Oracle Solaris 10 9/10 s10s_u9wos_14a SPARC
     Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
                            Assembled 11 August 2010

uname:SunOS 5.10 Generic_142909-17 sun4v  (T2 libthread)
rlimit: STACK 10000k, CORE infinity, NOFILE 65536, AS infinity
load average:32.80 60.86 32.73

CPU:total 11 has_v8, has_v9, has_blk_init, is_sun4v, is_niagara

Memory: 8k page, physical 993280k(57192k free)

vm_info: Java HotSpot(TM) 64-Bit Server VM (21.0-b10-internal-201105032121.brutisso.hs-gc-boauub-fastdebug) for solaris-sparc JRE (1.7.0), built on May  4 2011 00:30:58 by "jprtadm" with Sun Studio 12u1

time: Thu May  5 16:12:54 2011
elapsed time: 125 seconds

# Host info: SunOS vm-t1000-02a 5.10 Generic_142909-17 sun4v sparc SUNW,Sun-Fire-T1000
Internal Error (/tmp/jprt/P1/B/212141.brutisso/source/src/share/vm/memory/blockOffsetTable.cpp:557), pid=15208, tid=8
assert(n> q) failed: Looping at: 0xf56df77e62925778

Happened twice:
http://sqeweb.us.oracle.com/nfs/results/vm/gtee/JDK7/NIGHTLY/VM/2011-05-04/GC_Baseline-Xconc/vm/solaris-sparcv9/server/mixed/solaris-sparcv9_vm_server_mixed_nsk.stress.testlist/ResultDir/gcl001/hs_err_pid15208.log

http://sqeweb.us.oracle.com/nfs/results/vm/gtee/JDK7/NIGHTLY/VM/2011-05-04/GC_Baseline-Xinc/vm/solaris-sparcv9/server/mixed/solaris-sparcv9_vm_server_mixed_nsk.stress.testlist/ResultDir/gcl001/hs_err_pid21770.log
nsk/stress/jni/gclocker/gcl001

bigapps/OpenDS/stability
nsk/stress/jni/gclocker/gcl001

                                    

Comments
EVALUATION

http://hg.openjdk.java.net/jdk7/hotspot-rt/hotspot/rev/8cbcd406c42e
                                     
2011-06-09
EVALUATION

http://hg.openjdk.java.net/hsx/hotspot-rt/hotspot/rev/537a4053b0f9
                                     
2011-06-08
EVALUATION

http://hg.openjdk.java.net/jdk7/hotspot/hotspot/rev/8cbcd406c42e
                                     
2011-05-28
EVALUATION

http://hg.openjdk.java.net/hsx/hotspot-gc/hotspot/rev/537a4053b0f9
                                     
2011-05-24
SUGGESTED FIX

http://cr.openjdk.java.net/~ysr/7042740/webrev.0

What follows is some background on this problem, for those who
may not be familiar with or may have forgotten the details:
The sweeper must occasionally "yield"
during its sweep -- these might be either to allow a foreground
scavenge to happen, a direct allocation of a large object or
because of a JNI critical section, or when sweeping perm gen, for
metadata allocation during class loading. Typically, the sweeper
can then continue exactly from the point at which it had yielded.
If these yields are at a block boundary, the sweeper can restart
there and be sure that it is still at a block boundary. This is
because blocks will not spontaneously coalesce together rendeing
block boundaries non-boundaries.

Furthermore, if the high water mark of the heap is recorded when a concurrent
cycle starts, then by current allocate-live policy, there can't be
unmarked objects above that recorded high water mark during the sweeping phase
of that cycle, because all such objects must have been allocated black.
Thus, the sweeper can terminate its sweep at this previously recorded
high water mark. This can be useful to limit unnecessary sweeping
work when the heap is rapidly expanding.

However, when the high water mark is recorded at the end of the heap,
and the heap ends in a free chunk, then a subsequent expansion
coalesces the previously coterminal chunk with the expansion delta,
so as to reduce fragmentation. Unfortunately, this renders the recorded
high water mark a non-block-boundary, so that it is dangerous for
the sweeper to assume otherwise and try to determine the length of
the "following" block. This is how the problem originally began.

We realized this problem in a previous CR (see webrev for the
Zeno-like trail of associated CRs), but that fix was incorrect
because although it avoided stepping past the recorded
high watermark and getting into trouble, it would return
only the prefix ending at the high water mark to the free
lists. Not only would this lose some space in the form of
a one-time leak, but this would result in the block offset table
for the suffix now potentially pointing a walker off into
never-never land because of landing at a non-block boundary on
a backward logarithmic jump landing at an arbitrary point in
the prefix. Once this happened, one could run into any manner
of assertion failures or crashes in the debug vm, and crashes
in the product vm. There could be other failure modes
depending on where the "lost space" lay on a card that
might have been dirtied by a later store, as object iteration
stepped into the "no man's land" created by the leaked
space for example between two bona-fide objects.

The problem has existed forever of course but became easier
to reproduce because of setting BlockOffsetArrayUnallocatedBlock
to false allowed the recorded high water mark (the sweep limit)
to move to the end of the heap during an inflationary phase
in the heap, exposing us to the problem, whereas previously
the setting of that boolean often prevented us from going
right up to the end because we would instead stop at the top
of the allocated part, making it much less likely -- but not
impossible -- to hit this bug.

One of the problems with fixing this bug was that the sweep
closure does not remember the last block boundary examined
when it steps to the end of that block, so it did not have
sufficient information to sweep up the full block that
straddled the recorded high water mark as we stepped over it.

The fix was for the closure to do a one-step lookahead when it
handled free or newly garbage blocks to see if it would reach
the high water mark at the next step, and if so determine
the size of the block correctly at that point, so it could
be returned in its entirety, potentially coalesced with a set
of preceding blocks.

While debugging this code, I added a few newer asserts that check
various invariants that should hold at various points of the sweep,
elaborated the error messages that issue for some other existing asserts,
as well as added some sweeping status messages under the non-product sweep-tracing
flag. Additionally, I const'd some variables in the associated/affected
methods.

Testing: gclocker001; jprt; (will test with OpenDS prior to pushing fix)
                                     
2011-05-20
EVALUATION

When the heap expands "during" an ongoing CMS cycle and the expansion delta is
coalesced with a then-terminal chunk, the previously recorded limit of the sweep
may not be a block boundary anymore. Thus, the code in the SweepClosure destructor
that picks up and flushes the free chunk that is coterminal with limit
canniot assume that limit is a block boundary; rather it must use the end
of the last block that was below limit. This bug was hidden until now
because the unallocated block boundary's use made it extremely unlikely
(but not impossible) that such a coalition would occur redering
the previously recorded limit a non-boundary because of coalition.

This very same problem was flagged earlier via asserts on two previous
occasions in the not-too-distant past, but my previous
fixes for it had each been incomplete, so we had left this bug
incompletely fixed. I have linked these bugs to this CR for reference:-

6977970: CMS: concurrentMarkSweepGeneration.cpp:7947 assert(addr <= _limit) failed: sweep invariant
7008136: CMS: assert((HeapWord*)nextChunk <= _limit) failed: sweep invariant

Note: I am of two minds about leaving the "introduced in build" field at
hs21-b12. Strictly speaking it is not true, since the bug has existed forever.
However, it is also true that for the most part this bug was mostly (but not
completely) masked because until hs21-b12 we used to use
BlockOffsetArrayUseUnallocatedBlock = true ( at least on MP boxes, which
are by far the more common beast today; see 6977970 for related commentary,
and an appropriate sense of deja vu).
                                     
2011-05-14
PUBLIC COMMENTS

Appears related to the way the sweeper use "limit" to save on sweeping to the end.
This appears to be screwing up the free block sizes towards the end, possibly
as a result of incorrect sweeper actions interleaving with heap expansion.
It's a bit difficult to reproduced because sweeper actions and heap expansion
have to interleave in the right manner.

gcl001 seems to be the best test wrt reproducibility. This appears to be an
existing bug that was exposed late last week as a result of the recent change
in the default setting of BlockOffsetArrayUseUnallocatedBlock.
                                     
2011-05-13
WORK AROUND

Fix the size of the old gen and of the perm gen:-

    -Xms<N> -Xmx<N> -XX:PermSize=<P> -XX:MaxPermSize=<P>
                                     
2011-05-13
PUBLIC COMMENTS

Two more failures of the same kind from the 2011-05-06 nightlies:

http://sqeweb.us.oracle.com/nfs/results/vm/gtee/JDK7/NIGHTLY/VM/2011-05-06/GC_Baseline-Xinc/vm/solaris-sparc/server/mixed/solaris-sparc_vm_server_mixed_nsk.stress.testlist/ResultDir/gcl001/hs_err_pid7688.log

http://sqeweb.us.oracle.com/nfs/results/vm/gtee/JDK7/NIGHTLY/VM/2011-05-06/GC_Baseline-Xinc/vm/solaris-i586/client/mixed/solaris-i586_vm_client_mixed_vm.parallel_class_loading.testlist/ResultDir/interface/hs_err_pid29585.log
                                     
2011-05-09
PUBLIC COMMENTS

Another hs_err file from GC-Xconc baseline of the nightlies of 2011-05-06:
http://sqeweb.us.oracle.com/nfs/results/vm/gtee/JDK7/NIGHTLY/VM/2011-05-06/GC_Baseline-Xconc/bigapps/solaris-sparc/server/mixed/solaris-sparc_bigapps_server_mixed_OpenDS/OpenDS/hs_err_pid22994.log

#  Internal Error (/tmp/jprt/P1/B/122902.brutisso/source/src/share/vm/memory/blockOffsetTable.cpp:557), pid=22994, tid=4
#  assert(n> q) failed: Looping at: 0x430c5bbc
                                     
2011-05-09
PUBLIC COMMENTS

521  HeapWord* BlockOffsetArrayNonContigSpace::block_start_unsafe(
   522    const void* addr) const {
   523    assert(_array->offset_array(0) == 0, "objects can't cross covered areas");
   524    assert(_bottom <= addr && addr < _end,
   525           "addr must be covered by this Array");
   526    // Must read this exactly once because it can be modified by parallel
   527    // allocation.
   528    HeapWord* ub = _unallocated_block;
   529    if (BlockOffsetArrayUseUnallocatedBlock && addr >= ub) {
   530      assert(ub < _end, "tautology (see above)");
   531      return ub;
   532    }
   533
   534    // Otherwise, find the block start using the table.
   535    size_t index = _array->index_for(addr);
   536    HeapWord* q = _array->address_for_index(index);
   537
   538    uint offset = _array->offset_array(index);    // Extend u_char to uint.
   539    while (offset >= N_words) {
   540      // The excess of the offset from N_words indicates a power of Base
   541      // to go back by.
   542      size_t n_cards_back = entry_to_cards_back(offset);
   543      q -= (N_words * n_cards_back);
   544      assert(q >= _sp->bottom(), "Went below bottom!");
   545      index -= n_cards_back;
   546      offset = _array->offset_array(index);
   547    }
   548    assert(offset < N_words, "offset too large");
   549    index--;
   550    q -= offset;
   551    HeapWord* n = q;
   552
   553    while (n <= addr) {
   554      debug_only(HeapWord* last = q);   // for debugging
   555      q = n;
   556      n += _sp->block_size(n);
   557      assert(n > q, err_msg("Looping at: " INTPTR_FORMAT, n));
   558    }
   559    assert(q <= addr, err_msg("wrong order for current (" INTPTR_FORMAT ") <= arg (" INTPTR_FORMAT ")", q, addr));
   560    assert(addr <= n, err_msg("wrong order for arg (" INTPTR_FORMAT ") <= next (" INTPTR_FORMAT ")", addr, n));
   561    return q;
   562  }


So it would seem at first blush as though this started happening after BlockOffsetArrayUseUnallocatedBlock was
turned off. As a result, we do not have the cutout/switchback at line 531 above.
Instead we seem to run into some trouble perhaps above what may have been the high
end of the heap where there may be some allocation activity concurrent with
card-scanning. A closer examination however reveals something much more sinister:-

  #  assert(n> q) failed: Looping at: 0xf56df77e62925778

However:-

Heap
 par new generation   total 61184K, used 61134K [0xffffffff60000000, 0xffffffff64260000, 0xffffffff65000000)
  eden space 54400K,  99% used [0xffffffff60000000, 0xffffffff6351eb08, 0xffffffff63520000)
  from space 6784K,  99% used [0xffffffff63bc0000, 0xffffffff64254f98, 0xffffffff64260000)
  to   space 6784K,   0% used [0xffffffff63520000, 0xffffffff63520000, 0xffffffff63bc0000)
 concurrent mark-sweep generation total 133784K, used 25565K [0xffffffff65000000, 0xffffffff6d2a6000, 0xffffffff6f400000)
 concurrent-mark-sweep perm gen total 21248K, used 2923K [0xffffffff6f400000, 0xffffffff708c0000, 0xffffffff74400000)

which means that our heap walk went quite awry and landed us way outside the heap bounds, at an address
much lower than the heap.
                                     
2011-05-07



Hardware and Software, Engineered to Work Together