// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a MIT
// license that can be found in the LICENSE file.

/*
 * Project: cockroach
 * Issue or PR  : https://github.com/cockroachdb/cockroach/pull/10214
 * Buggy version: 7207111aa3a43df0552509365fdec741a53f873f
 * fix commit-id: 27e863d90ab0660494778f1c35966cc5ddc38e32
 * Flaky: 3/100
 * Description: This goroutine leak is caused by different order when acquiring
 * coalescedMu.Lock() and raftMu.Lock(). The fix is to refactor sendQueuedHeartbeats()
 * so that cockroachdb can unlock coalescedMu before locking raftMu.
 *
 * NOTE: the lock-order inversion below is intentional. This file reproduces the
 * buggy version so that the leaked goroutines show up in the profile written by
 * Cockroach10214; do not "fix" the locking here.
 *
 * Lock labels used in the comments:
 *   L1 = Store.coalescedMu
 *   L2 = Replica.raftMu
 *   L3 = Replica.mu
 */
package main

import (
	"os"
	"runtime/pprof"
	"sync"
	"time"
	"unsafe"
)

// init registers the reproducer under the name "Cockroach10214".
// register is defined elsewhere in this package.
func init() {
	register("Cockroach10214", Cockroach10214)
}

// Store_cockroach10214 is a stripped-down stand-in for the store involved in
// the bug. It carries only the state the reproducer touches.
type Store_cockroach10214 struct {
	// coalescedMu embeds the mutex (L1) declared alongside the queued
	// heartbeat responses.
	coalescedMu struct {
		sync.Mutex // L1
		heartbeatResponses []int
	}
	// mu groups the replica table. Despite the name, this struct declares no
	// mutex of its own, so accesses to replicas are not locked by it.
	mu struct {
		replicas map[int]*Replica_cockroach10214
	}
}

// sendQueuedHeartbeats acquires coalescedMu (L1) and keeps it held, via defer,
// until the function returns. While holding it, it calls
// sendQueuedHeartbeatsToNode once per entry in heartbeatResponses, which in
// turn acquires each replica's raftMu (L2). This is the L1 -> L2 half of the
// lock-order inversion (goroutine G1 in the trace at the end of the file).
func (s *Store_cockroach10214) sendQueuedHeartbeats() {
	s.coalescedMu.Lock()         // L1 acquire
	defer s.coalescedMu.Unlock() // L1 release
	for i := 0; i < len(s.coalescedMu.heartbeatResponses); i++ {
		s.sendQueuedHeartbeatsToNode() // L2
	}
}

// sendQueuedHeartbeatsToNode calls reportUnreachable on the replicas stored
// under keys 0..len(replicas)-1.
//
// NOTE(review): the map is indexed by loop counter rather than ranged over, so
// this relies on the keys being exactly 0..n-1 (as Cockroach10214 sets them
// up); a missing key would yield a nil *Replica_cockroach10214.
func (s *Store_cockroach10214) sendQueuedHeartbeatsToNode() {
	for i := 0; i < len(s.mu.replicas); i++ {
		r := s.mu.replicas[i]
		r.reportUnreachable() // L2
	}
}

// Replica_cockroach10214 is a stripped-down stand-in for a replica. It holds
// two mutexes and a back-pointer to its owning store.
type Replica_cockroach10214 struct {
	raftMu sync.Mutex // L2
	mu     sync.Mutex // L3
	store  *Store_cockroach10214
}

// reportUnreachable acquires raftMu (L2), sleeps for one millisecond while
// holding it, and releases it on return. The defer is registered after the
// sleep; the unlock still runs when the function returns.
func (r *Replica_cockroach10214) reportUnreachable() {
	r.raftMu.Lock() // L2 acquire
	// Presumably the sleep widens the window in which the competing
	// goroutine can interleave — TODO confirm against the original report.
	time.Sleep(time.Millisecond)
	defer r.raftMu.Unlock() // L2 release
}

// tick acquires raftMu (L2) and holds it for the duration of
// tickRaftMuLocked. Through that call chain it eventually acquires the
// store's coalescedMu (L1), forming the L2 -> L1 half of the lock-order
// inversion (goroutine G2 in the trace at the end of the file).
func (r *Replica_cockroach10214) tick() {
	r.raftMu.Lock()         // L2 acquire
	defer r.raftMu.Unlock() // L2 release
	r.tickRaftMuLocked()
}

// tickRaftMuLocked acquires r.mu (L3) and calls maybeQuiesceLocked while
// holding it. As the name indicates, the caller (tick) already holds raftMu.
func (r *Replica_cockroach10214) tickRaftMuLocked() {
	r.mu.Lock()         // L3 acquire
	defer r.mu.Unlock() // L3 release
	// Both outcomes fall through to the end of the function; the branch only
	// mirrors the shape of the original code.
	if r.maybeQuiesceLocked() {
		return
	}
}

// maybeQuiesceLocked calls maybeCoalesceHeartbeat up to two times. It returns
// true as soon as one call returns false, and false if both calls return true.
func (r *Replica_cockroach10214) maybeQuiesceLocked() bool {
	for i := 0; i < 2; i++ {
		if !r.maybeCoalesceHeartbeat() {
			return true
		}
	}
	return false
}

// maybeCoalesceHeartbeat acquires and then immediately releases the store's
// coalescedMu (L1), returning true. It is reached with raftMu (L2) and mu (L3)
// already held by the tick call chain.
func (r *Replica_cockroach10214) maybeCoalesceHeartbeat() bool {
	// msgtype is derived from the receiver's address modulo 3, so it is
	// always 0, 1 or 2.
	msgtype := uintptr(unsafe.Pointer(r)) % 3
	switch msgtype {
	case 0, 1, 2:
		r.store.coalescedMu.Lock() // L1 acquire
	default:
		// NOTE(review): unreachable given the modulo above; kept as in the
		// original.
		return false
	}
	r.store.coalescedMu.Unlock() // L1 release
	return true
}

// Cockroach10214 runs the reproducer. It starts 1000 goroutines; each builds
// its own store with two replicas (keys 0 and 1) and two queued heartbeat
// responses, then starts G1 (store.sendQueuedHeartbeats) and G2 (rp1.tick)
// concurrently. The function does not wait for those goroutines: on return,
// the deferred function sleeps 100ms and then writes the "goroutineleak"
// profile to standard output with debug level 2.
//
// NOTE(review): pprof.Lookup returns nil when no profile with the given name
// exists, in which case the deferred prof.WriteTo call would be made on a nil
// *pprof.Profile — confirm that the "goroutineleak" profile is available in
// the toolchain/configuration this runs under.
func Cockroach10214() {
	prof := pprof.Lookup("goroutineleak")
	defer func() {
		time.Sleep(100 * time.Millisecond)
		prof.WriteTo(os.Stdout, 2)
	}()

	for i := 0; i < 1000; i++ {
		go func() {
			store := &Store_cockroach10214{}
			responses := &store.coalescedMu.heartbeatResponses
			*responses = append(*responses, 1, 2)
			store.mu.replicas = make(map[int]*Replica_cockroach10214)

			rp1 := &Replica_cockroach10214{ // L2,3[0]
				store: store,
			}
			rp2 := &Replica_cockroach10214{ // L2,3[1]
				store: store,
			}
			store.mu.replicas[0] = rp1
			store.mu.replicas[1] = rp2

			go store.sendQueuedHeartbeats() // G1
			go rp1.tick()                   // G2
		}()
	}
}

// Example of goroutine leak trace:
//
// G1                                      G2
//------------------------------------------------------------------------------------
// s.sendQueuedHeartbeats()                .
// s.coalescedMu.Lock() [L1]               .
// s.sendQueuedHeartbeatsToNode()          .
// s.mu.replicas[0].reportUnreachable()    .
// s.mu.replicas[0].raftMu.Lock() [L2]     .
// .                                       s.mu.replicas[0].tick()
// .                                       s.mu.replicas[0].raftMu.Lock() [L2]
// .                                       s.mu.replicas[0].tickRaftMuLocked()
// .                                       s.mu.replicas[0].mu.Lock() [L3]
// .                                       s.mu.replicas[0].maybeQuiesceLocked()
// .                                       s.mu.replicas[0].maybeCoalesceHeartbeat()
// .                                       s.coalescedMu.Lock() [L1]
//--------------------------------G1,G2 leak------------------------------------------