Source file src/strconv/quote.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"unicode/utf8"
    11  )
    12  
    13  const (
    14  	lowerhex = "0123456789abcdef"
    15  	upperhex = "0123456789ABCDEF"
    16  )
    17  
    18  // contains reports whether the string contains the byte c.
    19  func contains(s string, c byte) bool {
    20  	return index(s, c) != -1
    21  }
    22  
    23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    29  }
    30  
    31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    32  	// Often called with big strings, so preallocate. If there's quoting,
    33  	// this is conservative but still helps a lot.
    34  	if cap(buf)-len(buf) < len(s) {
    35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    36  		copy(nBuf, buf)
    37  		buf = nBuf
    38  	}
    39  	buf = append(buf, quote)
    40  	for r, width := rune(0), 0; len(s) > 0; s = s[width:] {
    41  		r, width = utf8.DecodeRuneInString(s)
    42  		if width == 1 && r == utf8.RuneError {
    43  			buf = append(buf, `\x`...)
    44  			buf = append(buf, lowerhex[s[0]>>4])
    45  			buf = append(buf, lowerhex[s[0]&0xF])
    46  			continue
    47  		}
    48  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    49  	}
    50  	buf = append(buf, quote)
    51  	return buf
    52  }
    53  
    54  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    55  	buf = append(buf, quote)
    56  	if !utf8.ValidRune(r) {
    57  		r = utf8.RuneError
    58  	}
    59  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    60  	buf = append(buf, quote)
    61  	return buf
    62  }
    63  
    64  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    65  	if r == rune(quote) || r == '\\' { // always backslashed
    66  		buf = append(buf, '\\')
    67  		buf = append(buf, byte(r))
    68  		return buf
    69  	}
    70  	if ASCIIonly {
    71  		if r < utf8.RuneSelf && IsPrint(r) {
    72  			buf = append(buf, byte(r))
    73  			return buf
    74  		}
    75  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    76  		return utf8.AppendRune(buf, r)
    77  	}
    78  	switch r {
    79  	case '\a':
    80  		buf = append(buf, `\a`...)
    81  	case '\b':
    82  		buf = append(buf, `\b`...)
    83  	case '\f':
    84  		buf = append(buf, `\f`...)
    85  	case '\n':
    86  		buf = append(buf, `\n`...)
    87  	case '\r':
    88  		buf = append(buf, `\r`...)
    89  	case '\t':
    90  		buf = append(buf, `\t`...)
    91  	case '\v':
    92  		buf = append(buf, `\v`...)
    93  	default:
    94  		switch {
    95  		case r < ' ' || r == 0x7f:
    96  			buf = append(buf, `\x`...)
    97  			buf = append(buf, lowerhex[byte(r)>>4])
    98  			buf = append(buf, lowerhex[byte(r)&0xF])
    99  		case !utf8.ValidRune(r):
   100  			r = 0xFFFD
   101  			fallthrough
   102  		case r < 0x10000:
   103  			buf = append(buf, `\u`...)
   104  			for s := 12; s >= 0; s -= 4 {
   105  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   106  			}
   107  		default:
   108  			buf = append(buf, `\U`...)
   109  			for s := 28; s >= 0; s -= 4 {
   110  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   111  			}
   112  		}
   113  	}
   114  	return buf
   115  }
   116  
   117  // Quote returns a double-quoted Go string literal representing s. The
   118  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   119  // control characters and non-printable characters as defined by
   120  // [IsPrint].
   121  func Quote(s string) string {
   122  	return quoteWith(s, '"', false, false)
   123  }
   124  
   125  // AppendQuote appends a double-quoted Go string literal representing s,
   126  // as generated by [Quote], to dst and returns the extended buffer.
   127  func AppendQuote(dst []byte, s string) []byte {
   128  	return appendQuotedWith(dst, s, '"', false, false)
   129  }
   130  
   131  // QuoteToASCII returns a double-quoted Go string literal representing s.
   132  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   133  // non-ASCII characters and non-printable characters as defined by [IsPrint].
   134  func QuoteToASCII(s string) string {
   135  	return quoteWith(s, '"', true, false)
   136  }
   137  
   138  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   139  // as generated by [QuoteToASCII], to dst and returns the extended buffer.
   140  func AppendQuoteToASCII(dst []byte, s string) []byte {
   141  	return appendQuotedWith(dst, s, '"', true, false)
   142  }
   143  
   144  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   145  // The returned string leaves Unicode graphic characters, as defined by
   146  // [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
   147  // for non-graphic characters.
   148  func QuoteToGraphic(s string) string {
   149  	return quoteWith(s, '"', false, true)
   150  }
   151  
   152  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   153  // as generated by [QuoteToGraphic], to dst and returns the extended buffer.
   154  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   155  	return appendQuotedWith(dst, s, '"', false, true)
   156  }
   157  
   158  // QuoteRune returns a single-quoted Go character literal representing the
   159  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   160  // for control characters and non-printable characters as defined by [IsPrint].
   161  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   162  // replacement character U+FFFD.
   163  func QuoteRune(r rune) string {
   164  	return quoteRuneWith(r, '\'', false, false)
   165  }
   166  
   167  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   168  // as generated by [QuoteRune], to dst and returns the extended buffer.
   169  func AppendQuoteRune(dst []byte, r rune) []byte {
   170  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   171  }
   172  
   173  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   174  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   175  // \u0100) for non-ASCII characters and non-printable characters as defined
   176  // by [IsPrint].
   177  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   178  // replacement character U+FFFD.
   179  func QuoteRuneToASCII(r rune) string {
   180  	return quoteRuneWith(r, '\'', true, false)
   181  }
   182  
   183  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   184  // as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
   185  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   186  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   187  }
   188  
   189  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   190  // the rune. If the rune is not a Unicode graphic character,
   191  // as defined by [IsGraphic], the returned string will use a Go escape sequence
   192  // (\t, \n, \xFF, \u0100).
   193  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   194  // replacement character U+FFFD.
   195  func QuoteRuneToGraphic(r rune) string {
   196  	return quoteRuneWith(r, '\'', false, true)
   197  }
   198  
   199  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   200  // as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
   201  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   202  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   203  }
   204  
   205  // CanBackquote reports whether the string s can be represented
   206  // unchanged as a single-line backquoted string without control
   207  // characters other than tab.
   208  func CanBackquote(s string) bool {
   209  	for len(s) > 0 {
   210  		r, wid := utf8.DecodeRuneInString(s)
   211  		s = s[wid:]
   212  		if wid > 1 {
   213  			if r == '\ufeff' {
   214  				return false // BOMs are invisible and should not be quoted.
   215  			}
   216  			continue // All other multibyte runes are correctly encoded and assumed printable.
   217  		}
   218  		if r == utf8.RuneError {
   219  			return false
   220  		}
   221  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   222  			return false
   223  		}
   224  	}
   225  	return true
   226  }
   227  
   228  func unhex(b byte) (v rune, ok bool) {
   229  	c := rune(b)
   230  	switch {
   231  	case '0' <= c && c <= '9':
   232  		return c - '0', true
   233  	case 'a' <= c && c <= 'f':
   234  		return c - 'a' + 10, true
   235  	case 'A' <= c && c <= 'F':
   236  		return c - 'A' + 10, true
   237  	}
   238  	return
   239  }
   240  
   241  // UnquoteChar decodes the first character or byte in the escaped string
   242  // or character literal represented by the string s.
   243  // It returns four values:
   244  //
   245  //  1. value, the decoded Unicode code point or byte value;
   246  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   247  //  3. tail, the remainder of the string after the character; and
   248  //  4. an error that will be nil if the character is syntactically valid.
   249  //
   250  // The second argument, quote, specifies the type of literal being parsed
   251  // and therefore which escaped quote character is permitted.
   252  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   253  // If set to a double quote, it permits \" and disallows unescaped ".
   254  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   255  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   256  	// easy cases
   257  	if len(s) == 0 {
   258  		err = ErrSyntax
   259  		return
   260  	}
   261  	switch c := s[0]; {
   262  	case c == quote && (quote == '\'' || quote == '"'):
   263  		err = ErrSyntax
   264  		return
   265  	case c >= utf8.RuneSelf:
   266  		r, size := utf8.DecodeRuneInString(s)
   267  		return r, true, s[size:], nil
   268  	case c != '\\':
   269  		return rune(s[0]), false, s[1:], nil
   270  	}
   271  
   272  	// hard case: c is backslash
   273  	if len(s) <= 1 {
   274  		err = ErrSyntax
   275  		return
   276  	}
   277  	c := s[1]
   278  	s = s[2:]
   279  
   280  	switch c {
   281  	case 'a':
   282  		value = '\a'
   283  	case 'b':
   284  		value = '\b'
   285  	case 'f':
   286  		value = '\f'
   287  	case 'n':
   288  		value = '\n'
   289  	case 'r':
   290  		value = '\r'
   291  	case 't':
   292  		value = '\t'
   293  	case 'v':
   294  		value = '\v'
   295  	case 'x', 'u', 'U':
   296  		n := 0
   297  		switch c {
   298  		case 'x':
   299  			n = 2
   300  		case 'u':
   301  			n = 4
   302  		case 'U':
   303  			n = 8
   304  		}
   305  		var v rune
   306  		if len(s) < n {
   307  			err = ErrSyntax
   308  			return
   309  		}
   310  		for j := 0; j < n; j++ {
   311  			x, ok := unhex(s[j])
   312  			if !ok {
   313  				err = ErrSyntax
   314  				return
   315  			}
   316  			v = v<<4 | x
   317  		}
   318  		s = s[n:]
   319  		if c == 'x' {
   320  			// single-byte string, possibly not UTF-8
   321  			value = v
   322  			break
   323  		}
   324  		if !utf8.ValidRune(v) {
   325  			err = ErrSyntax
   326  			return
   327  		}
   328  		value = v
   329  		multibyte = true
   330  	case '0', '1', '2', '3', '4', '5', '6', '7':
   331  		v := rune(c) - '0'
   332  		if len(s) < 2 {
   333  			err = ErrSyntax
   334  			return
   335  		}
   336  		for j := 0; j < 2; j++ { // one digit already; two more
   337  			x := rune(s[j]) - '0'
   338  			if x < 0 || x > 7 {
   339  				err = ErrSyntax
   340  				return
   341  			}
   342  			v = (v << 3) | x
   343  		}
   344  		s = s[2:]
   345  		if v > 255 {
   346  			err = ErrSyntax
   347  			return
   348  		}
   349  		value = v
   350  	case '\\':
   351  		value = '\\'
   352  	case '\'', '"':
   353  		if c != quote {
   354  			err = ErrSyntax
   355  			return
   356  		}
   357  		value = rune(c)
   358  	default:
   359  		err = ErrSyntax
   360  		return
   361  	}
   362  	tail = s
   363  	return
   364  }
   365  
   366  // QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
   367  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
   368  func QuotedPrefix(s string) (string, error) {
   369  	out, _, err := unquote(s, false)
   370  	return out, err
   371  }
   372  
   373  // Unquote interprets s as a single-quoted, double-quoted,
   374  // or backquoted Go string literal, returning the string value
   375  // that s quotes.  (If s is single-quoted, it would be a Go
   376  // character literal; Unquote returns the corresponding
   377  // one-character string. For an empty character literal
   378  // Unquote returns the empty string.)
   379  func Unquote(s string) (string, error) {
   380  	out, rem, err := unquote(s, true)
   381  	if len(rem) > 0 {
   382  		return "", ErrSyntax
   383  	}
   384  	return out, err
   385  }
   386  
   387  // unquote parses a quoted string at the start of the input,
   388  // returning the parsed prefix, the remaining suffix, and any parse errors.
   389  // If unescape is true, the parsed prefix is unescaped,
   390  // otherwise the input prefix is provided verbatim.
   391  func unquote(in string, unescape bool) (out, rem string, err error) {
   392  	// Determine the quote form and optimistically find the terminating quote.
   393  	if len(in) < 2 {
   394  		return "", in, ErrSyntax
   395  	}
   396  	quote := in[0]
   397  	end := index(in[1:], quote)
   398  	if end < 0 {
   399  		return "", in, ErrSyntax
   400  	}
   401  	end += 2 // position after terminating quote; may be wrong if escape sequences are present
   402  
   403  	switch quote {
   404  	case '`':
   405  		switch {
   406  		case !unescape:
   407  			out = in[:end] // include quotes
   408  		case !contains(in[:end], '\r'):
   409  			out = in[len("`") : end-len("`")] // exclude quotes
   410  		default:
   411  			// Carriage return characters ('\r') inside raw string literals
   412  			// are discarded from the raw string value.
   413  			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
   414  			for i := len("`"); i < end-len("`"); i++ {
   415  				if in[i] != '\r' {
   416  					buf = append(buf, in[i])
   417  				}
   418  			}
   419  			out = string(buf)
   420  		}
   421  		// NOTE: Prior implementations did not verify that raw strings consist
   422  		// of valid UTF-8 characters and we continue to not verify it as such.
   423  		// The Go specification does not explicitly require valid UTF-8,
   424  		// but only mention that it is implicitly valid for Go source code
   425  		// (which must be valid UTF-8).
   426  		return out, in[end:], nil
   427  	case '"', '\'':
   428  		// Handle quoted strings without any escape sequences.
   429  		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
   430  			var valid bool
   431  			switch quote {
   432  			case '"':
   433  				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
   434  			case '\'':
   435  				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
   436  				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
   437  			}
   438  			if valid {
   439  				out = in[:end]
   440  				if unescape {
   441  					out = out[1 : end-1] // exclude quotes
   442  				}
   443  				return out, in[end:], nil
   444  			}
   445  		}
   446  
   447  		// Handle quoted strings with escape sequences.
   448  		var buf []byte
   449  		in0 := in
   450  		in = in[1:] // skip starting quote
   451  		if unescape {
   452  			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
   453  		}
   454  		for len(in) > 0 && in[0] != quote {
   455  			// Process the next character,
   456  			// rejecting any unescaped newline characters which are invalid.
   457  			r, multibyte, rem, err := UnquoteChar(in, quote)
   458  			if in[0] == '\n' || err != nil {
   459  				return "", in0, ErrSyntax
   460  			}
   461  			in = rem
   462  
   463  			// Append the character if unescaping the input.
   464  			if unescape {
   465  				if r < utf8.RuneSelf || !multibyte {
   466  					buf = append(buf, byte(r))
   467  				} else {
   468  					buf = utf8.AppendRune(buf, r)
   469  				}
   470  			}
   471  
   472  			// Single quoted strings must be a single character.
   473  			if quote == '\'' {
   474  				break
   475  			}
   476  		}
   477  
   478  		// Verify that the string ends with a terminating quote.
   479  		if !(len(in) > 0 && in[0] == quote) {
   480  			return "", in0, ErrSyntax
   481  		}
   482  		in = in[1:] // skip terminating quote
   483  
   484  		if unescape {
   485  			return string(buf), in, nil
   486  		}
   487  		return in0[:len(in0)-len(in)], in, nil
   488  	default:
   489  		return "", in, ErrSyntax
   490  	}
   491  }
   492  
   493  // bsearch is semantically the same as [slices.BinarySearch] (without NaN checks)
   494  // We copied this function because we can not import "slices" here.
   495  func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
   496  	n := len(s)
   497  	i, j := 0, n
   498  	for i < j {
   499  		h := i + (j-i)>>1
   500  		if s[h] < v {
   501  			i = h + 1
   502  		} else {
   503  			j = h
   504  		}
   505  	}
   506  	return i, i < n && s[i] == v
   507  }
   508  
   509  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   510  // to give the same answer. It allows this package not to depend on unicode,
   511  // and therefore not pull in all the Unicode tables. If the linker were better
   512  // at tossing unused tables, we could get rid of this implementation.
   513  // That would be nice.
   514  
   515  // IsPrint reports whether the rune is defined as printable by Go, with
   516  // the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
   517  // symbols and ASCII space.
   518  func IsPrint(r rune) bool {
   519  	// Fast check for Latin-1
   520  	if r <= 0xFF {
   521  		if 0x20 <= r && r <= 0x7E {
   522  			// All the ASCII is printable from space through DEL-1.
   523  			return true
   524  		}
   525  		if 0xA1 <= r && r <= 0xFF {
   526  			// Similarly for ¡ through ÿ...
   527  			return r != 0xAD // ...except for the bizarre soft hyphen.
   528  		}
   529  		return false
   530  	}
   531  
   532  	// Same algorithm, either on uint16 or uint32 value.
   533  	// First, find first i such that isPrint[i] >= x.
   534  	// This is the index of either the start or end of a pair that might span x.
   535  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   536  	// If we find x in a range, make sure x is not in isNotPrint list.
   537  
   538  	if 0 <= r && r < 1<<16 {
   539  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   540  		i, _ := bsearch(isPrint, rr)
   541  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   542  			return false
   543  		}
   544  		_, found := bsearch(isNotPrint, rr)
   545  		return !found
   546  	}
   547  
   548  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   549  	i, _ := bsearch(isPrint, rr)
   550  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   551  		return false
   552  	}
   553  	if r >= 0x20000 {
   554  		return true
   555  	}
   556  	r -= 0x10000
   557  	_, found := bsearch(isNotPrint, uint16(r))
   558  	return !found
   559  }
   560  
   561  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   562  // characters include letters, marks, numbers, punctuation, symbols, and
   563  // spaces, from categories L, M, N, P, S, and Zs.
   564  func IsGraphic(r rune) bool {
   565  	if IsPrint(r) {
   566  		return true
   567  	}
   568  	return isInGraphicList(r)
   569  }
   570  
   571  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   572  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   573  // Should be called only if IsPrint fails.
   574  func isInGraphicList(r rune) bool {
   575  	// We know r must fit in 16 bits - see makeisprint.go.
   576  	if r > 0xFFFF {
   577  		return false
   578  	}
   579  	_, found := bsearch(isGraphic, uint16(r))
   580  	return found
   581  }
   582  

View as plain text