Source file src/cmd/vendor/golang.org/x/text/unicode/norm/forminfo.go
1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package norm 6 7 import "encoding/binary" 8 9 // This file contains Form-specific logic and wrappers for data in tables.go. 10 11 // Rune info is stored in a separate trie per composing form. A composing form 12 // and its corresponding decomposing form share the same trie. Each trie maps 13 // a rune to a uint16. The values take two forms. For v >= 0x8000: 14 // bits 15 // 15: 1 (inverse of NFD_QC bit of qcInfo) 16 // 12..7: qcInfo (see below). isYesD is always true (no decomposition). 17 // 6..0: ccc (compressed CCC value). 18 // For v < 0x8000, the respective rune has a decomposition and v is an index 19 // into a byte array of UTF-8 decomposition sequences and additional info and 20 // has the form: 21 // <header> <decomp_byte>* [<tccc> [<lccc>]] 22 // The header contains the number of bytes in the decomposition (excluding this 23 // length byte), with 33 mapped to 31 to fit in 5 bits. 24 // (If any 31- or 32-byte decompositions come along, we could switch to using 25 // use a general lookup table as long as there are at most 32 distinct lengths.) 26 // The three most significant bits of this length byte correspond 27 // to bit 5, 4, and 3 of qcInfo (see below). The byte sequence itself starts at v+1. 28 // The byte sequence is followed by a trailing and leading CCC if the values 29 // for these are not zero. The value of v determines which ccc are appended 30 // to the sequences. For v < firstCCC, there are none, for v >= firstCCC, 31 // the sequence is followed by a trailing ccc, and for v >= firstLeadingCC 32 // there is an additional leading ccc. The value of tccc itself is the 33 // trailing CCC shifted left 2 bits. The two least-significant bits of tccc 34 // are the number of trailing non-starters. 35 36 const ( 37 qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo 38 headerLenMask = 0x1F // extract the length value from the header byte (31 => 33) 39 headerFlagsMask = 0xE0 // extract the qcInfo bits from the header byte 40 ) 41 42 // Properties provides access to normalization properties of a rune. 43 type Properties struct { 44 pos uint8 // start position in reorderBuffer; used in composition.go 45 size uint8 // length of UTF-8 encoding of this rune 46 ccc uint8 // leading canonical combining class (ccc if not decomposition) 47 tccc uint8 // trailing canonical combining class (ccc if not decomposition) 48 nLead uint8 // number of leading non-starters. 49 flags qcInfo // quick check flags 50 index uint16 51 } 52 53 // functions dispatchable per form 54 type lookupFunc func(b input, i int) Properties 55 56 // formInfo holds Form-specific functions and tables. 57 type formInfo struct { 58 form Form 59 composing, compatibility bool // form type 60 info lookupFunc 61 nextMain iterFunc 62 } 63 64 var formTable = []*formInfo{{ 65 form: NFC, 66 composing: true, 67 compatibility: false, 68 info: lookupInfoNFC, 69 nextMain: nextComposed, 70 }, { 71 form: NFD, 72 composing: false, 73 compatibility: false, 74 info: lookupInfoNFC, 75 nextMain: nextDecomposed, 76 }, { 77 form: NFKC, 78 composing: true, 79 compatibility: true, 80 info: lookupInfoNFKC, 81 nextMain: nextComposed, 82 }, { 83 form: NFKD, 84 composing: false, 85 compatibility: true, 86 info: lookupInfoNFKC, 87 nextMain: nextDecomposed, 88 }} 89 90 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid 91 // unexpected behavior for the user. For example, in NFD, there is a boundary 92 // after 'a'. However, 'a' might combine with modifiers, so from the application's 93 // perspective it is not a good boundary. We will therefore always use the 94 // boundaries for the combining variants. 95 96 // BoundaryBefore returns true if this rune starts a new segment and 97 // cannot combine with any rune on the left. 98 func (p Properties) BoundaryBefore() bool { 99 if p.ccc == 0 && !p.combinesBackward() { 100 return true 101 } 102 // We assume that the CCC of the first character in a decomposition 103 // is always non-zero if different from info.ccc and that we can return 104 // false at this point. This is verified by maketables. 105 return false 106 } 107 108 // BoundaryAfter returns true if runes cannot combine with or otherwise 109 // interact with this or previous runes. 110 func (p Properties) BoundaryAfter() bool { 111 // TODO: loosen these conditions. 112 return p.isInert() 113 } 114 115 // We pack quick check data in 6 bits: 116 // 117 // 5: Combines forward (0 == false, 1 == true) 118 // 4..3: NFC_QC Yes(00), No (10), or Maybe (11) 119 // 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. 120 // 1..0: Number of trailing non-starters. 121 // 122 // When all 6 bits are zero, the character is inert, meaning it is never 123 // influenced by normalization. 124 type qcInfo uint8 125 126 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } 127 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } 128 129 func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } 130 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe 131 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD 132 133 func (p Properties) isInert() bool { 134 return p.flags&qcInfoMask == 0 && p.ccc == 0 135 } 136 137 func (p Properties) multiSegment() bool { 138 return p.index >= firstMulti && p.index < endMulti 139 } 140 141 func (p Properties) nLeadingNonStarters() uint8 { 142 return p.nLead 143 } 144 145 func (p Properties) nTrailingNonStarters() uint8 { 146 return uint8(p.flags & 0x03) 147 } 148 149 // Decomposition returns the decomposition for the underlying rune 150 // or nil if there is none. 151 func (p Properties) Decomposition() []byte { 152 // TODO: create the decomposition for Hangul? 153 if p.index == 0 { 154 return nil 155 } 156 i := p.index 157 n := decomps[i] & headerLenMask 158 if n == 31 { 159 n = 33 160 } 161 i++ 162 return decomps[i : i+uint16(n)] 163 } 164 165 // Size returns the length of UTF-8 encoding of the rune. 166 func (p Properties) Size() int { 167 return int(p.size) 168 } 169 170 // CCC returns the canonical combining class of the underlying rune. 171 func (p Properties) CCC() uint8 { 172 if p.index >= firstCCCZeroExcept { 173 return 0 174 } 175 return ccc[p.ccc] 176 } 177 178 // LeadCCC returns the CCC of the first rune in the decomposition. 179 // If there is no decomposition, LeadCCC equals CCC. 180 func (p Properties) LeadCCC() uint8 { 181 return ccc[p.ccc] 182 } 183 184 // TrailCCC returns the CCC of the last rune in the decomposition. 185 // If there is no decomposition, TrailCCC equals CCC. 186 func (p Properties) TrailCCC() uint8 { 187 return ccc[p.tccc] 188 } 189 190 func buildRecompMap() { 191 recompMap = make(map[uint32]rune, len(recompMapPacked)/8) 192 var buf [8]byte 193 for i := 0; i < len(recompMapPacked); i += 8 { 194 copy(buf[:], recompMapPacked[i:i+8]) 195 key := binary.BigEndian.Uint32(buf[:4]) 196 val := binary.BigEndian.Uint32(buf[4:]) 197 recompMap[key] = rune(val) 198 } 199 } 200 201 // Recomposition 202 // We use 32-bit keys instead of 64-bit for the two codepoint keys. 203 // This clips off the bits of three entries, but we know this will not 204 // result in a collision. In the unlikely event that changes to 205 // UnicodeData.txt introduce collisions, the compiler will catch it. 206 // Note that the recomposition map for NFC and NFKC are identical. 207 208 // combine returns the combined rune or 0 if it doesn't exist. 209 // 210 // The caller is responsible for calling 211 // recompMapOnce.Do(buildRecompMap) sometime before this is called. 212 func combine(a, b rune) rune { 213 key := uint32(uint16(a))<<16 + uint32(uint16(b)) 214 if recompMap == nil { 215 panic("caller error") // see func comment 216 } 217 return recompMap[key] 218 } 219 220 func lookupInfoNFC(b input, i int) Properties { 221 v, sz := b.charinfoNFC(i) 222 return compInfo(v, sz) 223 } 224 225 func lookupInfoNFKC(b input, i int) Properties { 226 v, sz := b.charinfoNFKC(i) 227 return compInfo(v, sz) 228 } 229 230 // Properties returns properties for the first rune in s. 231 func (f Form) Properties(s []byte) Properties { 232 if f == NFC || f == NFD { 233 return compInfo(nfcData.lookup(s)) 234 } 235 return compInfo(nfkcData.lookup(s)) 236 } 237 238 // PropertiesString returns properties for the first rune in s. 239 func (f Form) PropertiesString(s string) Properties { 240 if f == NFC || f == NFD { 241 return compInfo(nfcData.lookupString(s)) 242 } 243 return compInfo(nfkcData.lookupString(s)) 244 } 245 246 // compInfo converts the information contained in v and sz 247 // to a Properties. See the comment at the top of the file 248 // for more information on the format. 249 func compInfo(v uint16, sz int) Properties { 250 if v == 0 { 251 return Properties{size: uint8(sz)} 252 } else if v >= 0x8000 { 253 p := Properties{ 254 size: uint8(sz), 255 ccc: uint8(v), 256 tccc: uint8(v), 257 flags: qcInfo(v >> 8), 258 } 259 if p.ccc > 0 || p.combinesBackward() { 260 p.nLead = uint8(p.flags & 0x3) 261 } 262 return p 263 } 264 // has decomposition 265 h := decomps[v] 266 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 267 p := Properties{size: uint8(sz), flags: f, index: v} 268 if v >= firstCCC { 269 n := uint16(h & headerLenMask) 270 if n == 31 { 271 n = 33 272 } 273 v += n + 1 274 c := decomps[v] 275 p.tccc = c >> 2 276 p.flags |= qcInfo(c & 0x3) 277 if v >= firstLeadingCCC { 278 p.nLead = c & 0x3 279 if v >= firstStarterWithNLead { 280 // We were tricked. Remove the decomposition. 281 p.flags &= 0x03 282 p.index = 0 283 return p 284 } 285 p.ccc = decomps[v+1] 286 } 287 } 288 return p 289 } 290