onepass.go

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package regexp
     6  
     7  import (
     8  	"regexp/syntax"
     9  	"slices"
    10  	"strings"
    11  	"unicode"
    12  	"unicode/utf8"
    13  )
    14  
    15  // "One-pass" regexp execution.
    16  // Some regexps can be analyzed to determine that they never need
    17  // backtracking: they are guaranteed to run in one pass over the string
    18  // without bothering to save all the usual NFA state.
    19  // Detect those and execute them more quickly.
    20  
    21  // A onePassProg is a compiled one-pass regular expression program.
    22  // It is the same as syntax.Prog except for the use of onePassInst.
    23  type onePassProg struct {
    24  	Inst   []onePassInst
    25  	Start  int // index of start instruction
    26  	NumCap int // number of InstCapture insts in re
    27  }
    28  
    29  // A onePassInst is a single instruction in a one-pass regular expression program.
    30  // It is the same as syntax.Inst except for the new 'Next' field.
    31  type onePassInst struct {
    32  	syntax.Inst
    33  	Next []uint32
    34  }
    35  
    36  // onePassPrefix returns a literal string that all matches for the
    37  // regexp must start with. Complete is true if the prefix
    38  // is the entire match. Pc is the index of the last rune instruction
    39  // in the string. The onePassPrefix skips over the mandatory
    40  // EmptyBeginText.
    41  func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
    42  	i := &p.Inst[p.Start]
    43  	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
    44  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    45  	}
    46  	pc = i.Out
    47  	i = &p.Inst[pc]
    48  	for i.Op == syntax.InstNop {
    49  		pc = i.Out
    50  		i = &p.Inst[pc]
    51  	}
    52  	// Avoid allocation of buffer if prefix is empty.
    53  	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
    54  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    55  	}
    56  
    57  	// Have prefix; gather characters.
    58  	var buf strings.Builder
    59  	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
    60  		buf.WriteRune(i.Rune[0])
    61  		pc, i = i.Out, &p.Inst[i.Out]
    62  	}
    63  	if i.Op == syntax.InstEmptyWidth &&
    64  		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
    65  		p.Inst[i.Out].Op == syntax.InstMatch {
    66  		complete = true
    67  	}
    68  	return buf.String(), complete, pc
    69  }
    70  
    71  // onePassNext selects the next actionable state of the prog, based on the input character.
    72  // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
    73  // One of the alternates may ultimately lead without input to end of line. If the instruction
    74  // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
    75  func onePassNext(i *onePassInst, r rune) uint32 {
    76  	next := i.MatchRunePos(r)
    77  	if next >= 0 {
    78  		return i.Next[next]
    79  	}
    80  	if i.Op == syntax.InstAltMatch {
    81  		return i.Out
    82  	}
    83  	return 0
    84  }
    85  
    86  func iop(i *syntax.Inst) syntax.InstOp {
    87  	op := i.Op
    88  	switch op {
    89  	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    90  		op = syntax.InstRune
    91  	}
    92  	return op
    93  }
    94  
    95  // Sparse Array implementation is used as a queueOnePass.
    96  type queueOnePass struct {
    97  	sparse          []uint32
    98  	dense           []uint32
    99  	size, nextIndex uint32
   100  }
   101  
   102  func (q *queueOnePass) empty() bool {
   103  	return q.nextIndex >= q.size
   104  }
   105  
   106  func (q *queueOnePass) next() (n uint32) {
   107  	n = q.dense[q.nextIndex]
   108  	q.nextIndex++
   109  	return
   110  }
   111  
   112  func (q *queueOnePass) clear() {
   113  	q.size = 0
   114  	q.nextIndex = 0
   115  }
   116  
   117  func (q *queueOnePass) contains(u uint32) bool {
   118  	if u >= uint32(len(q.sparse)) {
   119  		return false
   120  	}
   121  	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
   122  }
   123  
   124  func (q *queueOnePass) insert(u uint32) {
   125  	if !q.contains(u) {
   126  		q.insertNew(u)
   127  	}
   128  }
   129  
   130  func (q *queueOnePass) insertNew(u uint32) {
   131  	if u >= uint32(len(q.sparse)) {
   132  		return
   133  	}
   134  	q.sparse[u] = q.size
   135  	q.dense[q.size] = u
   136  	q.size++
   137  }
   138  
   139  func newQueue(size int) (q *queueOnePass) {
   140  	return &queueOnePass{
   141  		sparse: make([]uint32, size),
   142  		dense:  make([]uint32, size),
   143  	}
   144  }
   145  
   146  // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
   147  // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
   148  // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
   149  // NextIp array with the single element mergeFailed is returned.
   150  // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
   151  const mergeFailed = uint32(0xffffffff)
   152  
   153  var (
   154  	noRune = []rune{}
   155  	noNext = []uint32{mergeFailed}
   156  )
   157  
   158  func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
   159  	leftLen := len(*leftRunes)
   160  	rightLen := len(*rightRunes)
   161  	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
   162  		panic("mergeRuneSets odd length []rune")
   163  	}
   164  	var (
   165  		lx, rx int
   166  	)
   167  	merged := make([]rune, 0)
   168  	next := make([]uint32, 0)
   169  	ok := true
   170  	defer func() {
   171  		if !ok {
   172  			merged = nil
   173  			next = nil
   174  		}
   175  	}()
   176  
   177  	ix := -1
   178  	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
   179  		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
   180  			return false
   181  		}
   182  		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
   183  		*newLow += 2
   184  		ix += 2
   185  		next = append(next, pc)
   186  		return true
   187  	}
   188  
   189  	for lx < leftLen || rx < rightLen {
   190  		switch {
   191  		case rx >= rightLen:
   192  			ok = extend(&lx, leftRunes, leftPC)
   193  		case lx >= leftLen:
   194  			ok = extend(&rx, rightRunes, rightPC)
   195  		case (*rightRunes)[rx] < (*leftRunes)[lx]:
   196  			ok = extend(&rx, rightRunes, rightPC)
   197  		default:
   198  			ok = extend(&lx, leftRunes, leftPC)
   199  		}
   200  		if !ok {
   201  			return noRune, noNext
   202  		}
   203  	}
   204  	return merged, next
   205  }
   206  
   207  // cleanupOnePass drops working memory, and restores certain shortcut instructions.
   208  func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
   209  	for ix, instOriginal := range original.Inst {
   210  		switch instOriginal.Op {
   211  		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
   212  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
   213  			prog.Inst[ix].Next = nil
   214  		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   215  			prog.Inst[ix].Next = nil
   216  			prog.Inst[ix] = onePassInst{Inst: instOriginal}
   217  		}
   218  	}
   219  }
   220  
   221  // onePassCopy creates a copy of the original Prog, as we'll be modifying it.
   222  func onePassCopy(prog *syntax.Prog) *onePassProg {
   223  	p := &onePassProg{
   224  		Start:  prog.Start,
   225  		NumCap: prog.NumCap,
   226  		Inst:   make([]onePassInst, len(prog.Inst)),
   227  	}
   228  	for i, inst := range prog.Inst {
   229  		p.Inst[i] = onePassInst{Inst: inst}
   230  	}
   231  
   232  	// rewrites one or more common Prog constructs that enable some otherwise
   233  	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
   234  	// ip A, that points to ips B & C.
   235  	// A:BC + B:DA => A:BC + B:CD
   236  	// A:BC + B:DC => A:DC + B:DC
   237  	for pc := range p.Inst {
   238  		switch p.Inst[pc].Op {
   239  		default:
   240  			continue
   241  		case syntax.InstAlt, syntax.InstAltMatch:
   242  			// A:Bx + B:Ay
   243  			p_A_Other := &p.Inst[pc].Out
   244  			p_A_Alt := &p.Inst[pc].Arg
   245  			// make sure a target is another Alt
   246  			instAlt := p.Inst[*p_A_Alt]
   247  			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   248  				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
   249  				instAlt = p.Inst[*p_A_Alt]
   250  				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   251  					continue
   252  				}
   253  			}
   254  			instOther := p.Inst[*p_A_Other]
   255  			// Analyzing both legs pointing to Alts is for another day
   256  			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
   257  				// too complicated
   258  				continue
   259  			}
   260  			// simple empty transition loop
   261  			// A:BC + B:DA => A:BC + B:DC
   262  			p_B_Alt := &p.Inst[*p_A_Alt].Out
   263  			p_B_Other := &p.Inst[*p_A_Alt].Arg
   264  			patch := false
   265  			if instAlt.Out == uint32(pc) {
   266  				patch = true
   267  			} else if instAlt.Arg == uint32(pc) {
   268  				patch = true
   269  				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
   270  			}
   271  			if patch {
   272  				*p_B_Alt = *p_A_Other
   273  			}
   274  
   275  			// empty transition to common target
   276  			// A:BC + B:DC => A:DC + B:DC
   277  			if *p_A_Other == *p_B_Alt {
   278  				*p_A_Alt = *p_B_Other
   279  			}
   280  		}
   281  	}
   282  	return p
   283  }
   284  
   285  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
   286  var anyRune = []rune{0, unicode.MaxRune}
   287  
   288  // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
   289  // the match engine can always tell which branch to take. The routine may modify
   290  // p if it is turned into a onepass Prog. If it isn't possible for this to be a
   291  // onepass Prog, the Prog nil is returned. makeOnePass is recursive
   292  // to the size of the Prog.
   293  func makeOnePass(p *onePassProg) *onePassProg {
   294  	// If the machine is very long, it's not worth the time to check if we can use one pass.
   295  	if len(p.Inst) >= 1000 {
   296  		return nil
   297  	}
   298  
   299  	var (
   300  		instQueue    = newQueue(len(p.Inst))
   301  		visitQueue   = newQueue(len(p.Inst))
   302  		check        func(uint32, []bool) bool
   303  		onePassRunes = make([][]rune, len(p.Inst))
   304  	)
   305  
   306  	// check that paths from Alt instructions are unambiguous, and rebuild the new
   307  	// program as a onepass program
   308  	check = func(pc uint32, m []bool) (ok bool) {
   309  		ok = true
   310  		inst := &p.Inst[pc]
   311  		if visitQueue.contains(pc) {
   312  			return
   313  		}
   314  		visitQueue.insert(pc)
   315  		switch inst.Op {
   316  		case syntax.InstAlt, syntax.InstAltMatch:
   317  			ok = check(inst.Out, m) && check(inst.Arg, m)
   318  			// check no-input paths to InstMatch
   319  			matchOut := m[inst.Out]
   320  			matchArg := m[inst.Arg]
   321  			if matchOut && matchArg {
   322  				ok = false
   323  				break
   324  			}
   325  			// Match on empty goes in inst.Out
   326  			if matchArg {
   327  				inst.Out, inst.Arg = inst.Arg, inst.Out
   328  				matchOut, matchArg = matchArg, matchOut
   329  			}
   330  			if matchOut {
   331  				m[pc] = true
   332  				inst.Op = syntax.InstAltMatch
   333  			}
   334  
   335  			// build a dispatch operator from the two legs of the alt.
   336  			onePassRunes[pc], inst.Next = mergeRuneSets(
   337  				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
   338  			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
   339  				ok = false
   340  				break
   341  			}
   342  		case syntax.InstCapture, syntax.InstNop:
   343  			ok = check(inst.Out, m)
   344  			m[pc] = m[inst.Out]
   345  			// pass matching runes back through these no-ops.
   346  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   347  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   348  			for i := range inst.Next {
   349  				inst.Next[i] = inst.Out
   350  			}
   351  		case syntax.InstEmptyWidth:
   352  			ok = check(inst.Out, m)
   353  			m[pc] = m[inst.Out]
   354  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   355  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   356  			for i := range inst.Next {
   357  				inst.Next[i] = inst.Out
   358  			}
   359  		case syntax.InstMatch, syntax.InstFail:
   360  			m[pc] = inst.Op == syntax.InstMatch
   361  		case syntax.InstRune:
   362  			m[pc] = false
   363  			if len(inst.Next) > 0 {
   364  				break
   365  			}
   366  			instQueue.insert(inst.Out)
   367  			if len(inst.Rune) == 0 {
   368  				onePassRunes[pc] = []rune{}
   369  				inst.Next = []uint32{inst.Out}
   370  				break
   371  			}
   372  			runes := make([]rune, 0)
   373  			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   374  				r0 := inst.Rune[0]
   375  				runes = append(runes, r0, r0)
   376  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   377  					runes = append(runes, r1, r1)
   378  				}
   379  				slices.Sort(runes)
   380  			} else {
   381  				runes = append(runes, inst.Rune...)
   382  			}
   383  			onePassRunes[pc] = runes
   384  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   385  			for i := range inst.Next {
   386  				inst.Next[i] = inst.Out
   387  			}
   388  			inst.Op = syntax.InstRune
   389  		case syntax.InstRune1:
   390  			m[pc] = false
   391  			if len(inst.Next) > 0 {
   392  				break
   393  			}
   394  			instQueue.insert(inst.Out)
   395  			runes := []rune{}
   396  			// expand case-folded runes
   397  			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   398  				r0 := inst.Rune[0]
   399  				runes = append(runes, r0, r0)
   400  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   401  					runes = append(runes, r1, r1)
   402  				}
   403  				slices.Sort(runes)
   404  			} else {
   405  				runes = append(runes, inst.Rune[0], inst.Rune[0])
   406  			}
   407  			onePassRunes[pc] = runes
   408  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   409  			for i := range inst.Next {
   410  				inst.Next[i] = inst.Out
   411  			}
   412  			inst.Op = syntax.InstRune
   413  		case syntax.InstRuneAny:
   414  			m[pc] = false
   415  			if len(inst.Next) > 0 {
   416  				break
   417  			}
   418  			instQueue.insert(inst.Out)
   419  			onePassRunes[pc] = append([]rune{}, anyRune...)
   420  			inst.Next = []uint32{inst.Out}
   421  		case syntax.InstRuneAnyNotNL:
   422  			m[pc] = false
   423  			if len(inst.Next) > 0 {
   424  				break
   425  			}
   426  			instQueue.insert(inst.Out)
   427  			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
   428  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   429  			for i := range inst.Next {
   430  				inst.Next[i] = inst.Out
   431  			}
   432  		}
   433  		return
   434  	}
   435  
   436  	instQueue.clear()
   437  	instQueue.insert(uint32(p.Start))
   438  	m := make([]bool, len(p.Inst))
   439  	for !instQueue.empty() {
   440  		visitQueue.clear()
   441  		pc := instQueue.next()
   442  		if !check(pc, m) {
   443  			p = nil
   444  			break
   445  		}
   446  	}
   447  	if p != nil {
   448  		for i := range p.Inst {
   449  			p.Inst[i].Rune = onePassRunes[i]
   450  		}
   451  	}
   452  	return p
   453  }
   454  
   455  // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
   456  // can be recharacterized as a one-pass regexp program, or syntax.nil if the
   457  // Prog cannot be converted. For a one pass prog, the fundamental condition that must
   458  // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
   459  func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
   460  	if prog.Start == 0 {
   461  		return nil
   462  	}
   463  	// onepass regexp is anchored
   464  	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
   465  		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
   466  		return nil
   467  	}
   468  	hasAlt := false
   469  	for _, inst := range prog.Inst {
   470  		if inst.Op == syntax.InstAlt || inst.Op == syntax.InstAltMatch {
   471  			hasAlt = true
   472  			break
   473  		}
   474  	}
   475  	// If we have alternates, every instruction leading to InstMatch must be EmptyEndText.
   476  	// Also, any match on empty text must be $.
   477  	for _, inst := range prog.Inst {
   478  		opOut := prog.Inst[inst.Out].Op
   479  		switch inst.Op {
   480  		default:
   481  			if opOut == syntax.InstMatch && hasAlt {
   482  				return nil
   483  			}
   484  		case syntax.InstAlt, syntax.InstAltMatch:
   485  			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
   486  				return nil
   487  			}
   488  		case syntax.InstEmptyWidth:
   489  			if opOut == syntax.InstMatch {
   490  				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
   491  					continue
   492  				}
   493  				return nil
   494  			}
   495  		}
   496  	}
   497  	// Creates a slightly optimized copy of the original Prog
   498  	// that cleans up some Prog idioms that block valid onepass programs
   499  	p = onePassCopy(prog)
   500  
   501  	// checkAmbiguity on InstAlts, build onepass Prog if possible
   502  	p = makeOnePass(p)
   503  
   504  	if p != nil {
   505  		cleanupOnePass(p, prog)
   506  	}
   507  	return p
   508  }
   509
View as plain text