wire.go

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.jsonv2
     6  
     7  // Package jsonwire implements stateless functionality for handling JSON text.
     8  package jsonwire
     9  
    10  import (
    11  	"cmp"
    12  	"errors"
    13  	"strconv"
    14  	"strings"
    15  	"unicode"
    16  	"unicode/utf16"
    17  	"unicode/utf8"
    18  )
    19  
    20  // TrimSuffixWhitespace trims JSON from the end of b.
    21  func TrimSuffixWhitespace(b []byte) []byte {
    22  	// NOTE: The arguments and logic are kept simple to keep this inlinable.
    23  	n := len(b) - 1
    24  	for n >= 0 && (b[n] == ' ' || b[n] == '\t' || b[n] == '\r' || b[n] == '\n') {
    25  		n--
    26  	}
    27  	return b[:n+1]
    28  }
    29  
    30  // TrimSuffixString trims a valid JSON string at the end of b.
    31  // The behavior is undefined if there is not a valid JSON string present.
    32  func TrimSuffixString(b []byte) []byte {
    33  	// NOTE: The arguments and logic are kept simple to keep this inlinable.
    34  	if len(b) > 0 && b[len(b)-1] == '"' {
    35  		b = b[:len(b)-1]
    36  	}
    37  	for len(b) >= 2 && !(b[len(b)-1] == '"' && b[len(b)-2] != '\\') {
    38  		b = b[:len(b)-1] // trim all characters except an unescaped quote
    39  	}
    40  	if len(b) > 0 && b[len(b)-1] == '"' {
    41  		b = b[:len(b)-1]
    42  	}
    43  	return b
    44  }
    45  
    46  // HasSuffixByte reports whether b ends with c.
    47  func HasSuffixByte(b []byte, c byte) bool {
    48  	// NOTE: The arguments and logic are kept simple to keep this inlinable.
    49  	return len(b) > 0 && b[len(b)-1] == c
    50  }
    51  
    52  // TrimSuffixByte removes c from the end of b if it is present.
    53  func TrimSuffixByte(b []byte, c byte) []byte {
    54  	// NOTE: The arguments and logic are kept simple to keep this inlinable.
    55  	if len(b) > 0 && b[len(b)-1] == c {
    56  		return b[:len(b)-1]
    57  	}
    58  	return b
    59  }
    60  
    61  // QuoteRune quotes the first rune in the input.
    62  func QuoteRune[Bytes ~[]byte | ~string](b Bytes) string {
    63  	r, n := utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
    64  	if r == utf8.RuneError && n == 1 {
    65  		return `'\x` + strconv.FormatUint(uint64(b[0]), 16) + `'`
    66  	}
    67  	return strconv.QuoteRune(r)
    68  }
    69  
    70  // CompareUTF16 lexicographically compares x to y according
    71  // to the UTF-16 codepoints of the UTF-8 encoded input strings.
    72  // This implements the ordering specified in RFC 8785, section 3.2.3.
    73  func CompareUTF16[Bytes ~[]byte | ~string](x, y Bytes) int {
    74  	// NOTE: This is an optimized, mostly allocation-free implementation
    75  	// of CompareUTF16Simple in wire_test.go. FuzzCompareUTF16 verifies that the
    76  	// two implementations agree on the result of comparing any two strings.
    77  	isUTF16Self := func(r rune) bool {
    78  		return ('\u0000' <= r && r <= '\uD7FF') || ('\uE000' <= r && r <= '\uFFFF')
    79  	}
    80  
    81  	for {
    82  		if len(x) == 0 || len(y) == 0 {
    83  			return cmp.Compare(len(x), len(y))
    84  		}
    85  
    86  		// ASCII fast-path.
    87  		if x[0] < utf8.RuneSelf || y[0] < utf8.RuneSelf {
    88  			if x[0] != y[0] {
    89  				return cmp.Compare(x[0], y[0])
    90  			}
    91  			x, y = x[1:], y[1:]
    92  			continue
    93  		}
    94  
    95  		// Decode next pair of runes as UTF-8.
    96  		rx, nx := utf8.DecodeRuneInString(string(truncateMaxUTF8(x)))
    97  		ry, ny := utf8.DecodeRuneInString(string(truncateMaxUTF8(y)))
    98  
    99  		selfx := isUTF16Self(rx)
   100  		selfy := isUTF16Self(ry)
   101  		switch {
   102  		// The x rune is a single UTF-16 codepoint, while
   103  		// the y rune is a surrogate pair of UTF-16 codepoints.
   104  		case selfx && !selfy:
   105  			ry, _ = utf16.EncodeRune(ry)
   106  		// The y rune is a single UTF-16 codepoint, while
   107  		// the x rune is a surrogate pair of UTF-16 codepoints.
   108  		case selfy && !selfx:
   109  			rx, _ = utf16.EncodeRune(rx)
   110  		}
   111  		if rx != ry {
   112  			return cmp.Compare(rx, ry)
   113  		}
   114  
   115  		// Check for invalid UTF-8, in which case,
   116  		// we just perform a byte-for-byte comparison.
   117  		if isInvalidUTF8(rx, nx) || isInvalidUTF8(ry, ny) {
   118  			if x[0] != y[0] {
   119  				return cmp.Compare(x[0], y[0])
   120  			}
   121  		}
   122  		x, y = x[nx:], y[ny:]
   123  	}
   124  }
   125  
   126  // truncateMaxUTF8 truncates b such it contains at least one rune.
   127  //
   128  // The utf8 package currently lacks generic variants, which complicates
   129  // generic functions that operates on either []byte or string.
   130  // As a hack, we always call the utf8 function operating on strings,
   131  // but always truncate the input such that the result is identical.
   132  //
   133  // Example usage:
   134  //
   135  //	utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
   136  //
   137  // Converting a []byte to a string is stack allocated since
   138  // truncateMaxUTF8 guarantees that the []byte is short.
   139  func truncateMaxUTF8[Bytes ~[]byte | ~string](b Bytes) Bytes {
   140  	// TODO(https://go.dev/issue/56948): Remove this function and
   141  	// instead directly call generic utf8 functions wherever used.
   142  	if len(b) > utf8.UTFMax {
   143  		return b[:utf8.UTFMax]
   144  	}
   145  	return b
   146  }
   147  
// ErrInvalidUTF8 is the error value used to report invalid UTF-8.
// Its message is "invalid UTF-8".
//
// TODO(https://go.dev/issue/70547): Use utf8.ErrInvalid instead.
var ErrInvalidUTF8 = errors.New("invalid UTF-8")
   150  
   151  func NewInvalidCharacterError[Bytes ~[]byte | ~string](prefix Bytes, where string) error {
   152  	what := QuoteRune(prefix)
   153  	return errors.New("invalid character " + what + " " + where)
   154  }
   155  
   156  func NewInvalidEscapeSequenceError[Bytes ~[]byte | ~string](what Bytes) error {
   157  	label := "escape sequence"
   158  	if len(what) > 6 {
   159  		label = "surrogate pair"
   160  	}
   161  	needEscape := strings.IndexFunc(string(what), func(r rune) bool {
   162  		return r == '`' || r == utf8.RuneError || unicode.IsSpace(r) || !unicode.IsPrint(r)
   163  	}) >= 0
   164  	if needEscape {
   165  		return errors.New("invalid " + label + " " + strconv.Quote(string(what)) + " in string")
   166  	} else {
   167  		return errors.New("invalid " + label + " `" + string(what) + "` in string")
   168  	}
   169  }
   170  
   171  // TruncatePointer optionally truncates the JSON pointer,
   172  // enforcing that the length roughly does not exceed n.
   173  func TruncatePointer(s string, n int) string {
   174  	if len(s) <= n {
   175  		return s
   176  	}
   177  	i := n / 2
   178  	j := len(s) - n/2
   179  
   180  	// Avoid truncating a name if there are multiple names present.
   181  	if k := strings.LastIndexByte(s[:i], '/'); k > 0 {
   182  		i = k
   183  	}
   184  	if k := strings.IndexByte(s[j:], '/'); k >= 0 {
   185  		j += k + len("/")
   186  	}
   187  
   188  	// Avoid truncation in the middle of a UTF-8 rune.
   189  	for i > 0 && isInvalidUTF8(utf8.DecodeLastRuneInString(s[:i])) {
   190  		i--
   191  	}
   192  	for j < len(s) && isInvalidUTF8(utf8.DecodeRuneInString(s[j:])) {
   193  		j++
   194  	}
   195  
   196  	// Determine the right middle fragment to use.
   197  	var middle string
   198  	switch strings.Count(s[i:j], "/") {
   199  	case 0:
   200  		middle = "…"
   201  	case 1:
   202  		middle = "…/…"
   203  	default:
   204  		middle = "…/…/…"
   205  	}
   206  	if strings.HasPrefix(s[i:j], "/") && middle != "…" {
   207  		middle = strings.TrimPrefix(middle, "…")
   208  	}
   209  	if strings.HasSuffix(s[i:j], "/") && middle != "…" {
   210  		middle = strings.TrimSuffix(middle, "…")
   211  	}
   212  	return s[:i] + middle + s[j:]
   213  }
   214  
   215  func isInvalidUTF8(r rune, rn int) bool {
   216  	return r == utf8.RuneError && rn == 1
   217  }
   218
View as plain text