// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.pagetrace

// Page tracer.
//
// This file contains an implementation of page trace instrumentation for tracking
// the way the Go runtime manages pages of memory. The trace may be enabled at program
// startup with the GODEBUG option pagetrace.
//
// Each page trace event is either 8 or 16 bytes wide. The first
// 8 bytes follow this format for non-sync events:
//
//	[16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
//
// If the "large" bit is set then the event is 16 bytes wide, with the second 8-byte word
// containing the full npages value (the npages bitfield is 0).
//
// The base address's bottom pageShift bits are always zero, which is why we can pack other
// data in there. We ignore the top 16 bits, assuming a 48-bit address space for the
// heap.
//
// The timestamp delta is computed from the difference between the current nanotime
// timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits bits of
// this delta are removed and only the next pageTraceTimeDeltaBits bits are kept.
//
// A sync event is emitted at the beginning of each trace buffer and whenever the
// timestamp delta would not fit in an event.
//
// Sync events have the following structure:
//
//	[61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
//
// In essence, the "large" bit is repurposed to indicate whether the payload is a timestamp
// or a P ID (P IDs are 32 bits wide). Note that we only have 61 bits for the 64-bit
// timestamp, but as with the delta we drop the bottom pageTraceTimeLostBits bits here as well.

package runtime

import (
	"runtime/internal/sys"
	"unsafe"
)

// pageTraceAlloc records a page trace allocation event.
// pp may be nil. Call only if debug.pagetracefd != 0.
//
// Must run on the system stack as a crude way to prevent preemption.
// //go:systemstack func pageTraceAlloc(pp *p, now int64, base, npages uintptr) { if pageTrace.enabled { if now == 0 { now = nanotime() } pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent) } } // pageTraceFree records a page trace free event. // pp may be nil. Call only if debug.pagetracefd != 0. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func pageTraceFree(pp *p, now int64, base, npages uintptr) { if pageTrace.enabled { if now == 0 { now = nanotime() } pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent) } } // pageTraceScav records a page trace scavenge event. // pp may be nil. Call only if debug.pagetracefd != 0. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func pageTraceScav(pp *p, now int64, base, npages uintptr) { if pageTrace.enabled { if now == 0 { now = nanotime() } pageTraceEmit(pp, now, base, npages, pageTraceScavEvent) } } // pageTraceEventType is a page trace event type. type pageTraceEventType uint8 const ( pageTraceSyncEvent pageTraceEventType = iota // Timestamp emission. pageTraceAllocEvent // Allocation of pages. pageTraceFreeEvent // Freeing pages. pageTraceScavEvent // Scavenging pages. ) // pageTraceEmit emits a page trace event. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) { // Get a buffer. var tbp *pageTraceBuf pid := int32(-1) if pp == nil { // We have no P, so take the global buffer. lock(&pageTrace.lock) tbp = &pageTrace.buf } else { tbp = &pp.pageTraceBuf pid = pp.id } // Initialize the buffer if necessary. tb := *tbp if tb.buf == nil { tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys)) tb = tb.writePid(pid) } // Handle timestamp and emit a sync event if necessary. 
if now < tb.timeBase { now = tb.timeBase } if now-tb.timeBase >= pageTraceTimeMaxDelta { tb.timeBase = now tb = tb.writeSync(pid) } // Emit the event. tb = tb.writeEvent(pid, now, base, npages, typ) // Write back the buffer. *tbp = tb if pp == nil { unlock(&pageTrace.lock) } } const ( pageTraceBufSize = 32 << 10 // These constants describe the per-event timestamp delta encoding. pageTraceTimeLostBits = 7 // How many bits of precision we lose in the delta. pageTraceTimeDeltaBits = 16 // Size of the delta in bits. pageTraceTimeMaxDelta = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits) ) // pageTraceEvents is the low-level buffer containing the trace data. type pageTraceEvents struct { _ sys.NotInHeap events [pageTraceBufSize / 8]uint64 } // pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events // to the buffer. It tracks state necessary to do so. type pageTraceBuf struct { buf *pageTraceEvents len int // How many events have been written so far. timeBase int64 // The current timestamp base from which deltas are produced. finished bool // Whether this trace buf should no longer flush anything out. } // writePid writes a P ID event indicating which P we're running on. // // Assumes there's always space in the buffer since this is only called at the // beginning of a new buffer. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf { e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent) tb.buf.events[tb.len] = e tb.len++ return tb } // writeSync writes a sync event, which is just a timestamp. Handles flushing. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf { if tb.len+1 > len(tb.buf.events) { // N.B. flush will writeSync again. 
return tb.flush(pid, tb.timeBase) } e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent) tb.buf.events[tb.len] = e tb.len++ return tb } // writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary. // // pid indicates the P we're currently running on. Necessary in case we need to flush. // now is the current nanotime timestamp. // base is the base address of whatever group of pages this event is happening to. // npages is the length of the group of pages this event is happening to. // typ is the event that's happening to these pages. // // Must run on the system stack as a crude way to prevent preemption. // //go:systemstack func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf { large := 0 np := npages if npages >= 1024 { large = 1 np = 0 } if tb.len+1+large > len(tb.buf.events) { tb = tb.flush(pid, now) } if base%pageSize != 0 { throw("base address not page aligned") } e := uint64(base) // The pageShift low-order bits are zero. e |= uint64(typ) // 2 bits e |= uint64(large) << 2 // 1 bit e |= uint64(np) << 3 // 10 bits // Write the timestamp delta in the upper pageTraceTimeDeltaBits. e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits) tb.buf.events[tb.len] = e if large != 0 { // npages doesn't fit in 10 bits, so write an additional word with that data. tb.buf.events[tb.len+1] = uint64(npages) } tb.len += 1 + large return tb } // flush writes out the contents of the buffer to pageTrace.fd and resets the buffer. // It then writes out a P ID event and the first sync event for the new buffer. // // Must run on the system stack as a crude way to prevent preemption. 
// //go:systemstack func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf { if !tb.finished { lock(&pageTrace.fdLock) writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8) unlock(&pageTrace.fdLock) } tb.len = 0 tb.timeBase = now return tb.writePid(pid).writeSync(pid) } var pageTrace struct { // enabled indicates whether tracing is enabled. If true, fd >= 0. // // Safe to read without synchronization because it's only set once // at program initialization. enabled bool // buf is the page trace buffer used if there is no P. // // lock protects buf. lock mutex buf pageTraceBuf // fdLock protects writing to fd. // // fd is the file to write the page trace to. fdLock mutex fd int32 } // initPageTrace initializes the page tracing infrastructure from GODEBUG. // // env must be the value of the GODEBUG environment variable. func initPageTrace(env string) { var value string for env != "" { elt, rest := env, "" for i := 0; i < len(env); i++ { if env[i] == ',' { elt, rest = env[:i], env[i+1:] break } } env = rest if hasPrefix(elt, "pagetrace=") { value = elt[len("pagetrace="):] break } } pageTrace.fd = -1 if canCreateFile && value != "" { var tmp [4096]byte if len(value) != 0 && len(value) < 4096 { copy(tmp[:], value) pageTrace.fd = create(&tmp[0], 0o664) } } pageTrace.enabled = pageTrace.fd >= 0 } // finishPageTrace flushes all P's trace buffers and disables page tracing. func finishPageTrace() { if !pageTrace.enabled { return } // Grab worldsema as we're about to execute a ragged barrier. semacquire(&worldsema) systemstack(func() { // Disable tracing. This isn't strictly necessary and it's best-effort. pageTrace.enabled = false // Execute a ragged barrier, flushing each trace buffer. forEachP(waitReasonPageTraceFlush, func(pp *p) { if pp.pageTraceBuf.buf != nil { pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime()) } pp.pageTraceBuf.finished = true }) // Write the global have-no-P buffer. 
lock(&pageTrace.lock) if pageTrace.buf.buf != nil { pageTrace.buf = pageTrace.buf.flush(-1, nanotime()) } pageTrace.buf.finished = true unlock(&pageTrace.lock) // Safely close the file as nothing else should be allowed to write to the fd. lock(&pageTrace.fdLock) closefd(pageTrace.fd) pageTrace.fd = -1 unlock(&pageTrace.fdLock) }) semrelease(&worldsema) } // writeFull ensures that a complete write of bn bytes from b is made to fd. func writeFull(fd uintptr, b *byte, bn int) { for bn > 0 { n := write(fd, unsafe.Pointer(b), int32(bn)) if n == -_EINTR || n == -_EAGAIN { continue } if n < 0 { print("errno=", -n, "\n") throw("writeBytes: bad write") } bn -= int(n) b = addb(b, uintptr(n)) } }