SUGGESTED FIX
Here are the essential changes:
*** /net/spot/archive02/ysr/clone/webrev/src/share/vm/memory/concurrentMarkSweepGeneration.cpp- Mon Jan 5 10:42:03 2004
--- concurrentMarkSweepGeneration.cpp Mon Jan 5 10:37:12 2004
*** 1,7 ****
#ifdef USE_PRAGMA_IDENT_SRC
! #pragma ident "@(#)concurrentMarkSweepGeneration.cpp 1.171 03/12/17 17:59:05 JVM"
#endif
/*
* Copyright 2003 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*/
--- 1,7 ----
#ifdef USE_PRAGMA_IDENT_SRC
! #pragma ident "@(#)concurrentMarkSweepGeneration.cpp 1.173 04/01/05 10:37:13 JVM"
#endif
/*
* Copyright 2003 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*/
*** 1627,1636 ****
--- 1629,1665 ----
assert_lock_strong(_permGen->freelistLock());
PRODUCT_ONLY(ShouldNotReachHere());
return true;
}
+ // A utility class that is used by the CMS collector to
+ // temporarily "release" the foreground collector from its
+ // usual obligation to wait for the background collector to
+ // complete an ongoing phase before proceeding.
+ class ReleaseForegroundGC: public StackObj {
+ private:
+ CMSCollector* _c;
+ public:
+ ReleaseForegroundGC(CMSCollector* c) : _c(c) {
+ assert(_c->_foregroundGCShouldWait, "Else should not need to call");
+ MutexLockerEx x(CMS_lock, Mutex::_no_safepoint_check_flag);
+ // allow a potentially blocked foreground collector to proceed
+ _c->_foregroundGCShouldWait = false;
+ if (_c->_foregroundGCIsActive) {
+ CMS_lock->notify();
+ }
+ assert(!ConcurrentMarkSweepThread::cms_thread_has_cms_token(),
+ "Possible deadlock");
+ }
+
+ ~ReleaseForegroundGC() {
+ assert(!_c->_foregroundGCShouldWait, "Usage protocol violation?");
+ MutexLockerEx x(CMS_lock, Mutex::_no_safepoint_check_flag);
+ _c->_foregroundGCShouldWait = true;
+ }
+ };
+
// There are separate collect_in_background and collect_in_foreground because of
// the different locking requirements of the background collector and the
// foreground collector. There was originally an attempt to share
// one "collect" method between the background collector and the foreground
// collector but the if-then-else required made it cleaner to have
*** 1760,1816 ****
abortable_preclean();
assert(_collectorState == FinalMarking, "Collector state should "
"have changed");
break;
case FinalMarking:
{
// If a foreground collection is in progress, it already has
// the pending list lock. This is similar to the situation
! // with the Heap_lock. See comments in stopWorldAndDo()
// about racing for the Heap_lock.
- {
- MutexLockerEx x(CMS_lock, Mutex::_no_safepoint_check_flag);
- // allow a potentially blocked foreground collector to proceed
- _foregroundGCShouldWait = false;
- if (_foregroundGCIsActive) {
- CMS_lock->notify();
- }
- assert(!ConcurrentMarkSweepThread::cms_thread_has_cms_token(),
- "Possible deadlock");
- }
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::acquirePLL);
- bool didSomeWork = false;
- {
- MutexLockerEx x(CMS_lock, Mutex::_no_safepoint_check_flag);
- // For regularity, set _foregroundGCShouldWait
- // The background collector is grabbing the locks
- // it needs to do the collection but on releasing the
- // locks still make the foreground collector wait for
- // the _foregroundGCShouldWait flag.
- _foregroundGCShouldWait = true;
}
if (_collectorState == FinalMarking) {
// we didn't lose a race to FG thread
! stopWorldAndDo(CMS_op_checkpointRootsFinal);
! didSomeWork = true;
} else {
// else we did lose a race to FG thread
assert(_collectorState == Idling, "The foreground collector"
" should have finished the collection");
}
! // Check if we need to post a notification on PLL;
! if (didSomeWork &&
_ref_processor->read_and_reset_notify_ref_lock()) {
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::releaseAndNotifyPLL);
} else {
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::releasePLL);
}
}
break;
case Sweeping:
// final marking in checkpointRootsFinal has been completed
sweep(true);
assert(_collectorState == Resetting, "Collector state change "
--- 1792,1843 ----
abortable_preclean();
assert(_collectorState == FinalMarking, "Collector state should "
"have changed");
break;
case FinalMarking:
+ assert(_foregroundGCShouldWait, "block pre-condition");
{
// If a foreground collection is in progress, it already has
// the pending list lock. This is similar to the situation
! // with the Heap_lock. See comments in stop_world_and_do()
// about racing for the Heap_lock.
+ // We may block while trying to communicate with the
+ // SLT thread in order to manipulate the PLL. We make
+ // sure that the foreground collector will not block
+ // waiting for us to complete communication with the
+ // SLT thread. See, for instance, bug XXXX.
+ {
+ ReleaseForegroundGC x(this);
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::acquirePLL);
}
+ bool did_some_work = false;
if (_collectorState == FinalMarking) {
// we didn't lose a race to FG thread
! did_some_work = stop_world_and_do(CMS_op_checkpointRootsFinal);
} else {
// else we did lose a race to FG thread
assert(_collectorState == Idling, "The foreground collector"
" should have finished the collection");
}
! // Post a notification on PLL, as necessary, taking
! // care to make sure that the foreground collector will
! // not stall waiting for us to return promptly from the call.
! {
! ReleaseForegroundGC x(this);
! if (did_some_work &&
_ref_processor->read_and_reset_notify_ref_lock()) {
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::releaseAndNotifyPLL);
} else {
ConcurrentMarkSweepThread::manipulatePLL(
SurrogateLockerThread::releasePLL);
}
}
+ }
+ assert(_foregroundGCShouldWait, "block post-condition");
break;
case Sweeping:
// final marking in checkpointRootsFinal has been completed
sweep(true);
assert(_collectorState == Resetting, "Collector state change "
----
*** /net/spot/archive02/ysr/clone/webrev/src/share/vm/memory/concurrentMarkSweepGeneration.hpp- Mon Jan 5 10:42:07 2004
--- concurrentMarkSweepGeneration.hpp Mon Jan 5 10:33:08 2004
*** 1,7 ****
#ifdef USE_PRAGMA_IDENT_HDR
! #pragma ident "@(#)concurrentMarkSweepGeneration.hpp 1.99 03/12/02 13:54:31 JVM"
#endif
/*
* Copyright 2003 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*/
--- 1,7 ----
#ifdef USE_PRAGMA_IDENT_HDR
! #pragma ident "@(#)concurrentMarkSweepGeneration.hpp 1.100 04/01/05 10:33:10 JVM"
#endif
/*
* Copyright 2003 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*/
*** 386,395 ****
--- 386,396 ----
friend class Par_PushAndMarkClosure; // -- ditto --
friend class CMSKeepAliveClosure; // -- ditto --
friend class CMSDrainMarkingStackClosure; // -- ditto --
friend class CMSInnerParMarkAndPushClosure; // -- ditto --
NOT_PRODUCT(friend class ScanMarkedObjectsAgainClosure;) // assertion on _overflow_list
+ friend class ReleaseForegroundGC; // to access _foregroundGCShouldWait
private:
jlong _time_of_last_gc;
void update_time_of_last_gc(jlong now) {
_time_of_last_gc = now;
*** 502,513 ****
enum CMS_op_type {
CMS_op_checkpointRootsInitial,
CMS_op_checkpointRootsFinal
};
! void doCMSOperation(CMS_op_type op);
! bool stopWorldAndDo(CMS_op_type op);
OopTaskQueueSet* task_queues() { return _task_queues; }
int* hash_seed(int i) { return &_hash_seed[i]; }
// Support for parallelizing young gen rescan in CMS remark phase
--- 503,514 ----
enum CMS_op_type {
CMS_op_checkpointRootsInitial,
CMS_op_checkpointRootsFinal
};
! void do_CMS_operation(CMS_op_type op);
! bool stop_world_and_do(CMS_op_type op);
OopTaskQueueSet* task_queues() { return _task_queues; }
int* hash_seed(int i) { return &_hash_seed[i]; }
// Support for parallelizing young gen rescan in CMS remark phase
###@###.### 2004-01-14: Ignore the above; a simpler fix has been
engineered, but the diffs are too numerous (as indeed the above were) to list
here in their entirety. Please refer to:
http://analemma.sfbay/net/spot/archive02/ysr/clone/webrev
Event: putback-to
Parent workspace: /net/jano.sfbay/export/disk05/hotspot/ws/main/gc_baseline
(jano.sfbay:/export/disk05/hotspot/ws/main/gc_baseline)
Child workspace: /prt-workspaces/20040114174425.ysr.clone/workspace
(prt-web:/prt-workspaces/20040114174425.ysr.clone/workspace)
User: ysr
Comment:
---------------------------------------------------------
Original workspace: neeraja:/net/spot/archive02/ysr/clone
Submitter: ysr
Archived data: /net/prt-archiver.sfbay/export2/archived_workspaces/main/gc_baseline/2004/20040114174425.ysr.clone/
Webrev: http://analemma.sfbay.sun.com/net/prt-archiver.sfbay/export2/archived_workspaces/main/gc_baseline/2004/20040114174425.ysr.clone/workspace/webrevs/webrev-2004.01.15/index.html
Fixed: 4962516 CMS thread/SLT deadlock problem
Webrev: http://analemma.sfbay/net/spot/archive02/ysr/clone/webrev
The deadlock was first identified and analyzed by Amit Nene of
HP early December. Since this is a day-one bug, which we
had never seen in practice, we thought we'd defer the fix to 1.5.1,
reasoning that the bug needed quite adversarial scheduling
to manifest. Alas, the new linux kernel exposes the
deadlock within a few hours running ATG (ATG is run with a low
CMSInitiatingOccupancy setting so that it's essentially doing
CMS collections all the time, which increases the probability
of the deadlock event; the new linux scheduler probably helps
quite a bit as well, since that's the only configuration
that shows up the problem).
The problem arises because when the CMS thread is communicating
with the SLT (surrogate locker thread) thread (which is a
JavaThread that does PLL (pending list lock) locking on behalf
of the CMS thread around the CMS remark phase), the
VM thread may have initiated a safepoint and the SLT thread
might suspend in SLT_lock->wait() after having released the
PLL and notified the CMS thread waiting on the SLT_lock,
but while itself holding the mutex underlying the SLT_lock
monitor. At this point, the CMS thread has not released
the "baton" to the foreground collector (the vm thread),
resulting in a 3-way deadlock between the SLT, CMS and
VM threads.
Our solution is to release the foreground gc thread
around all stop-world phases, and a fortiori around the
SLT communication above. During these stop-world phases,
the PLL lock and the Heap_lock prevent interference with
the foreground thread.
(Our initial solution was to recognize that the CMS<->SLT
2-way handshake protocol is subject to delay because of
safepointing and to "release the baton" (to the foreground
collector) around such communication. This is safe because
CMS<->SLT communication is used for PLL lock/notify/unlock which
is done outside of the remark phase. It can be shown
that the temporal phase during which the PLL is held
is not subject to interruption. The window of vulnerability,
which we avoid with this fix, is when the PLL has been
released by the SLT thread but before the SLT thread has
completed the "ack" in the 2-way handshake protocol in
its communication with the CMS thread.
The final solution followed from an optimization suggestion
of Jon Masamitsu when reviewing the initial solution.)
Thanks to June Zhong and Dxo-Shin Chen for testing help.
Reviewed by: Jon Masamitsu (more reviews welcome)
Approved by: server project team
Fix Verified: yes
Verification Testing: ATG on linux, -client and -server, CMS
ongoing for > 24 hours (without fix would deadlock -- on
specific Linux machines -- within ~4-5 hour)
Other testing: (linux/intel,sparc/solaris,fastdebug/product,c1/c2,cms):
. refworkload
. ATG
. volano
. quicklook -full
Files:
update: src/share/vm/memory/concurrentMarkSweepGeneration.cpp
update: src/share/vm/memory/concurrentMarkSweepGeneration.hpp
Examined files: 3150
Contents Summary:
2 update
3148 no action (unchanged)
|