Source file src/cmd/compile/internal/syntax/source.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements source, a buffered rune reader
     6  // specialized for scanning Go code: Reading
     7  // ASCII characters, maintaining current (line, col)
     8  // position information, and recording of the most
     9  // recently read source segment are highly optimized.
    10  // This file is self-contained (go tool compile source.go
    11  // compiles) and thus could be made into its own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"io"
    17  	"unicode/utf8"
    18  )
    19  
    20  // The source buffer is accessed using three indices b (begin),
    21  // r (read), and e (end):
    22  //
    23  // - If b >= 0, it points to the beginning of a segment of most
    24  //   recently read characters (typically a Go literal).
    25  //
    26  // - r points to the byte immediately following the most recently
    27  //   read character ch, which starts at r-chw.
    28  //
    29  // - e points to the byte immediately following the last byte that
    30  //   was read into the buffer.
    31  //
    32  // The buffer content is terminated at buf[e] with the sentinel
    33  // character utf8.RuneSelf. This makes it possible to test for
    34  // the common case of ASCII characters with a single 'if' (see
    35  // nextch method).
    36  //
    37  //                +------ content in use -------+
    38  //                v                             v
    39  // buf [...read...|...segment...|ch|...unread...|s|...free...]
    40  //                ^             ^  ^            ^
    41  //                |             |  |            |
    42  //                b         r-chw  r            e
    43  //
    44  // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
    45  
    46  type source struct {
    47  	in   io.Reader
    48  	errh func(line, col uint, msg string)
    49  
    50  	buf       []byte // source buffer
    51  	ioerr     error  // pending I/O error, or nil
    52  	b, r, e   int    // buffer indices (see comment above)
    53  	line, col uint   // source position of ch (0-based)
    54  	ch        rune   // most recently read character
    55  	chw       int    // width of ch
    56  }
    57  
// sentinel is the byte stored at buf[e] to terminate the buffer content.
// Because it equals utf8.RuneSelf, the ASCII fast-path test in nextch
// (s.ch < sentinel) fails both for the end of the buffered content and
// for the first byte of any non-ASCII character.
const sentinel = utf8.RuneSelf
    59  
    60  func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
    61  	s.in = in
    62  	s.errh = errh
    63  
    64  	if s.buf == nil {
    65  		s.buf = make([]byte, nextSize(0))
    66  	}
    67  	s.buf[0] = sentinel
    68  	s.ioerr = nil
    69  	s.b, s.r, s.e = -1, 0, 0
    70  	s.line, s.col = 0, 0
    71  	s.ch = ' '
    72  	s.chw = 0
    73  }
    74  
    75  // starting points for line and column numbers
    76  const linebase = 1
    77  const colbase = 1
    78  
    79  // pos returns the (line, col) source position of s.ch.
    80  func (s *source) pos() (line, col uint) {
    81  	return linebase + s.line, colbase + s.col
    82  }
    83  
    84  // error reports the error msg at source position s.pos().
    85  func (s *source) error(msg string) {
    86  	line, col := s.pos()
    87  	s.errh(line, col, msg)
    88  }
    89  
    90  // start starts a new active source segment (including s.ch).
    91  // As long as stop has not been called, the active segment's
    92  // bytes (excluding s.ch) may be retrieved by calling segment.
    93  func (s *source) start()          { s.b = s.r - s.chw }
    94  func (s *source) stop()           { s.b = -1 }
    95  func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
    96  
    97  // rewind rewinds the scanner's read position and character s.ch
    98  // to the start of the currently active segment, which must not
    99  // contain any newlines (otherwise position information will be
   100  // incorrect). Currently, rewind is only needed for handling the
   101  // source sequence ".."; it must not be called outside an active
   102  // segment.
   103  func (s *source) rewind() {
   104  	// ok to verify precondition - rewind is rarely called
   105  	if s.b < 0 {
   106  		panic("no active segment")
   107  	}
   108  	s.col -= uint(s.r - s.b)
   109  	s.r = s.b
   110  	s.nextch()
   111  }
   112  
// nextch reads the next character into s.ch, sets s.chw to its width
// in bytes, and updates s.line and s.col to the position of that
// character. At the end of the input s.ch is set to -1 and s.chw to 0.
//
// NUL characters and invalid UTF-8 bytes are reported via s.error and
// then skipped. A BOM is skipped as well; it is reported as an error
// unless it appears at line 0, column 0. An I/O error other than
// io.EOF is reported once and then cleared.
func (s *source) nextch() {
redo:
	// Advance the position past the previous character: the column grows
	// by its width in bytes, and a newline starts the next line at column 0.
	// This also runs when a character is skipped via "goto redo", so
	// skipped characters still count toward the column.
	s.col += uint(s.chw)
	if s.ch == '\n' {
		s.line++
		s.col = 0
	}

	// fast common case: at least one ASCII character
	// (the sentinel at buf[e] fails this test, so an exhausted buffer
	// falls through to the general case below)
	if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
		s.r++
		s.chw = 1
		if s.ch == 0 {
			s.error("invalid NUL character")
			goto redo
		}
		return
	}

	// slower general case: add more bytes to buffer if we don't have a full rune
	// (stop once UTFMax bytes are available or an I/O error is pending)
	for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
		s.fill()
	}

	// EOF
	// (no unread bytes are left; the loop above only exits in this state
	// with s.ioerr != nil)
	if s.r == s.e {
		if s.ioerr != io.EOF {
			// ensure we never start with a '/' (e.g., rooted path) in the error message
			s.error("I/O error: " + s.ioerr.Error())
			s.ioerr = nil
		}
		s.ch = -1
		s.chw = 0
		return
	}

	s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
	s.r += s.chw

	// RuneError with width 1 is how DecodeRune signals an invalid encoding;
	// report it and skip the offending byte.
	if s.ch == utf8.RuneError && s.chw == 1 {
		s.error("invalid UTF-8 encoding")
		goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if s.ch == BOM {
		if s.line > 0 || s.col > 0 {
			s.error("invalid BOM in the middle of the file")
		}
		// the BOM itself is never delivered as s.ch
		goto redo
	}
}
   166  
   167  // fill reads more source bytes into s.buf.
   168  // It returns with at least one more byte in the buffer, or with s.ioerr != nil.
   169  func (s *source) fill() {
   170  	// determine content to preserve
   171  	b := s.r
   172  	if s.b >= 0 {
   173  		b = s.b
   174  		s.b = 0 // after buffer has grown or content has been moved down
   175  	}
   176  	content := s.buf[b:s.e]
   177  
   178  	// grow buffer or move content down
   179  	if len(content)*2 > len(s.buf) {
   180  		s.buf = make([]byte, nextSize(len(s.buf)))
   181  		copy(s.buf, content)
   182  	} else if b > 0 {
   183  		copy(s.buf, content)
   184  	}
   185  	s.r -= b
   186  	s.e -= b
   187  
   188  	// read more data: try a limited number of times
   189  	for i := 0; i < 10; i++ {
   190  		var n int
   191  		n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
   192  		if n < 0 {
   193  			panic("negative read") // incorrect underlying io.Reader implementation
   194  		}
   195  		if n > 0 || s.ioerr != nil {
   196  			s.e += n
   197  			s.buf[s.e] = sentinel
   198  			return
   199  		}
   200  		// n == 0
   201  	}
   202  
   203  	s.buf[s.e] = sentinel
   204  	s.ioerr = io.ErrNoProgress
   205  }
   206  
   207  // nextSize returns the next bigger size for a buffer of a given size.
   208  func nextSize(size int) int {
   209  	const min = 4 << 10 // 4K: minimum buffer size
   210  	const max = 1 << 20 // 1M: maximum buffer size which is still doubled
   211  	if size < min {
   212  		return min
   213  	}
   214  	if size <= max {
   215  		return size << 1
   216  	}
   217  	return size + max
   218  }
   219  

View as plain text