Source file src/image/jpeg/dct.go
1 // Copyright 2025 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package jpeg 6 7 // Discrete Cosine Transformation (DCT) implementations using the algorithm from 8 // Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz, 9 // “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989. 10 // https://ieeexplore.ieee.org/document/266596 11 // 12 // Since the paper is paywalled, the rest of this comment gives a summary. 13 // 14 // A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7 15 // and transforms them in place into the result values. 16 // 17 // The mathematical definition of the N-point 1D FDCT is: 18 // 19 // X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N 20 // 21 // where α₀ = √2 and α_k = 1 for k > 0. 22 // 23 // For our purposes, N=8, so the angles end up being multiples of π/16. 24 // The most direct implementation of this definition would require 64 multiplications. 25 // 26 // Loeffler's paper presents a more efficient computation that requires only 27 // 11 multiplications and works in terms of three basic operations: 28 // 29 // - A “butterfly” x0, x1 = x0+x1, x0-x1. 30 // The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2. 31 // 32 // - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k. 33 // 34 // - A rotation of x0, x1 by θ, defined as: 35 // x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ. 36 // The inverse is rotation by -θ. 37 // 38 // The algorithm proceeds in four stages: 39 // 40 // Stage 1: 41 // - butterfly x0, x7; x1, x6; x2, x5; x3, x4. 42 // 43 // Stage 2: 44 // - butterfly x0, x3; x1, x2 45 // - rotate x4, x7 by 3π/16 46 // - rotate x5, x6 by π/16. 47 // 48 // Stage 3: 49 // - butterfly x0, x1; x4, x6; x7, x5 50 // - rotate x2, x3 by 6π/16 and scale by √2. 51 // 52 // Stage 4: 53 // - butterfly x7, x4 54 // - scale x5, x6 by √2. 55 // 56 // Finally, the values are permuted. The permutation can be read as either: 57 // - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form) 58 // - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS) 59 // The code below uses the second form to make it easier to merge adjacent stores. 60 // (Note that unlike in recursive FFT implementations, the permutation here is 61 // not always mapping indexes to their bit reversals.) 62 // 63 // As written above, the rotation requires four multiplications, but it can be 64 // reduced to three by refactoring (see [dctBox] below), and the scaling in 65 // stage 3 can be merged into the rotation constants, so the overall cost 66 // of a 1D FDCT is 11 multiplies. 67 // 68 // The 1D inverse DCT (IDCT) is the 1D FDCT run backward 69 // with all the basic operations inverted. 70 71 // dctBox implements a 3-multiply, 3-add rotation+scaling. 72 // Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the 73 // rotated and scaled coordinates. 74 // (It is called dctBox because the rotate+scale operation 75 // is drawn as a box in Figures 1 and 2 in the paper.) 76 func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) { 77 // y0 = x0*kcos + x1*ksin 78 // y1 = -x0*ksin + x1*kcos 79 ksum := kcos * (x0 + x1) 80 y0 = ksum + (ksin-kcos)*x1 81 y1 = ksum - (kcos+ksin)*x0 82 return y0, y1 83 } 84 85 // A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT). 86 // The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16, 87 // but it is convenient to use int32s for intermediate storage, 88 // so we define only a single block type of [8*8]int32. 89 // 90 // A 2D DCT is implemented as 1D DCTs over the rows and columns. 91 // 92 // dct_test.go defines a String method for nice printing in tests. 93 type block [blockSize]int32 94 95 const blockSize = 8 * 8 96 97 // Note on Numerical Precision 98 // 99 // The inputs to both the FDCT and IDCT are uint8 values stored in a block, 100 // and the outputs are int16s in the same block, but the overall operation 101 // uses int32 values as fixed-point intermediate values. 102 // In the code comments below, the notation “QN.M” refers to a 103 // signed value of 1+N+M significant bits, one of which is the sign bit, 104 // and M of which hold fractional (sub-integer) precision. 105 // For example, 255 as a Q8.0 value is stored as int32(255), 106 // while 255 as a Q8.1 value is stored as int32(510), 107 // and 255.5 as a Q8.1 value is int32(511). 108 // The notation UQN.M refers to an unsigned value of N+M significant bits. 109 // See https://en.wikipedia.org/wiki/Q_(number_format) for more. 110 // 111 // In general we only need to keep about 16 significant bits, but it is more 112 // efficient and somewhat more precise to let unnecessary fractional bits 113 // accumulate and shift them away in bulk rather than after every operation. 114 // As such, it is important to keep track of the number of fractional bits 115 // in each variable at different points in the code, to avoid mistakes like 116 // adding numbers with different fractional precisions, as well as to keep 117 // track of the total number of bits, to avoid overflow. A comment like: 118 // 119 // // x[123] now Q8.2. 120 // 121 // means that x1, x2, and x3 are all Q8.2 (11-bit) values. 122 // Keeping extra precision bits also reduces the size of the errors introduced 123 // by using right shift to approximate rounded division. 124 125 // Constants needed for the implementation. 126 // These are all 60-bit precision fixed-point constants. 127 // The function c(val, b) rounds the constant to b bits. 128 // c is simple enough that calls to it with constant args 129 // are inlined and constant-propagated down to an inline constant. 130 // Each constant is commented with its Ivy definition (see robpike.io/ivy), 131 // using this scaling helper function: 132 // 133 // op fix x = floor 0.5 + x * 2**60 134 const ( 135 cos1 = 1130768441178740757 // fix cos 1*pi/16 136 sin1 = 224923827593068887 // fix sin 1*pi/16 137 cos3 = 958619196450722178 // fix cos 3*pi/16 138 sin3 = 640528868967736374 // fix sin 3*pi/16 139 sqrt2 = 1630477228166597777 // fix sqrt 2 140 sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)*cos 6*pi/16 141 sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16 142 sqrt2inv = 815238614083298888 // fix 1/sqrt 2 143 sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)*cos 6*pi/16 144 sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)*sin 6*pi/16 145 ) 146 147 func c(x uint64, bits int) int32 { 148 return int32((x + (1 << (59 - bits))) >> (60 - bits)) 149 } 150 151 // fdct implements the forward DCT. 152 // Inputs are UQ8.0; outputs are Q13.0. 153 func fdct(b *block) { 154 fdctCols(b) 155 fdctRows(b) 156 } 157 158 // fdctCols applies the 1D DCT to the columns of b. 159 // Inputs are UQ8.0 in [0,255] but interpreted as [-128,127]. 160 // Outputs are Q10.18. 161 func fdctCols(b *block) { 162 for i := range 8 { 163 x0 := b[0*8+i] 164 x1 := b[1*8+i] 165 x2 := b[2*8+i] 166 x3 := b[3*8+i] 167 x4 := b[4*8+i] 168 x5 := b[5*8+i] 169 x6 := b[6*8+i] 170 x7 := b[7*8+i] 171 172 // x[01234567] are UQ8.0 in [0,255]. 173 174 // Stage 1: four butterflies. 175 // In general a butterfly of QN.M inputs produces Q(N+1).M outputs. 176 // A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference. 177 178 x0, x7 = x0+x7, x0-x7 179 x1, x6 = x1+x6, x1-x6 180 x2, x5 = x2+x5, x2-x5 181 x3, x4 = x3+x4, x3-x4 182 // x[0123] now UQ9.0 in [0, 510]. 183 // x[4567] now Q8.0 in [-255,255]. 184 185 // Stage 2: two boxes and two butterflies. 186 // A box on QN.M inputs with B-bit constants 187 // produces Q(N+1).(M+B) outputs. 188 // (The +1 is from the addition.) 189 190 x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18)) 191 x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18)) 192 // x[47] now Q9.18 in [-354, 354]. 193 // x[56] now Q9.18 in [-300, 300]. 194 195 x0, x3 = x0+x3, x0-x3 196 x1, x2 = x1+x2, x1-x2 197 // x[01] now UQ10.0 in [0, 1020]. 198 // x[23] now Q9.0 in [-510, 510]. 199 200 // Stage 3: one box and three butterflies. 201 202 x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18)) 203 // x[23] now Q10.18 in [-943, 943]. 204 205 x0, x1 = x0+x1, x0-x1 206 // x0 now UQ11.0 in [0, 2040]. 207 // x1 now Q10.0 in [-1020, 1020]. 208 209 // Store x0, x1, x2, x3 to their permuted targets. 210 // The original +128 in every input value 211 // has cancelled out except in the “DC signal” x0. 212 // Subtracting 128*8 here is equivalent to subtracting 128 213 // from every input before we started, but cheaper. 214 // It also converts x0 from UQ11.18 to Q10.18. 215 b[0*8+i] = (x0 - 128*8) << 18 216 b[4*8+i] = x1 << 18 217 b[2*8+i] = x2 218 b[6*8+i] = x3 219 220 x4, x6 = x4+x6, x4-x6 221 x7, x5 = x7+x5, x7-x5 222 // x[4567] now Q10.18 in [-654, 654]. 223 224 // Stage 4: two √2 scalings and one butterfly. 225 226 x5 = (x5 >> 12) * c(sqrt2, 12) 227 x6 = (x6 >> 12) * c(sqrt2, 12) 228 // x[56] still Q10.18 in [-925, 925] (= 654√2). 229 x7, x4 = x7+x4, x7-x4 230 // x[47] still Q10.18 in [-925, 925] (not Q11.18!). 231 // This is not obvious at all! See “Note on 925” below. 232 233 // Store x4 x5 x6 x7 to their permuted targets. 234 b[1*8+i] = x7 235 b[3*8+i] = x5 236 b[5*8+i] = x6 237 b[7*8+i] = x4 238 } 239 } 240 241 // fdctRows applies the 1D DCT to the rows of b. 242 // Inputs are Q10.18; outputs are Q13.0. 243 func fdctRows(b *block) { 244 for i := range 8 { 245 x := b[8*i : 8*i+8 : 8*i+8] 246 x0 := x[0] 247 x1 := x[1] 248 x2 := x[2] 249 x3 := x[3] 250 x4 := x[4] 251 x5 := x[5] 252 x6 := x[6] 253 x7 := x[7] 254 255 // x[01234567] are Q10.18 [-1020, 1020]. 256 257 // Stage 1: four butterflies. 258 259 x0, x7 = x0+x7, x0-x7 260 x1, x6 = x1+x6, x1-x6 261 x2, x5 = x2+x5, x2-x5 262 x3, x4 = x3+x4, x3-x4 263 // x[01234567] now Q11.18 in [-2040, 2040]. 264 265 // Stage 2: two boxes and two butterflies. 266 267 x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14)) 268 x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14)) 269 // x[47] now Q12.18 in [-2830, 2830]. 270 // x[56] now Q12.18 in [-2400, 2400]. 271 x0, x3 = x0+x3, x0-x3 272 x1, x2 = x1+x2, x1-x2 273 // x[01234567] now Q12.18 in [-4080, 4080]. 274 275 // Stage 3: one box and three butterflies. 276 277 x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14)) 278 // x[23] now Q13.18 in [-7539, 7539]. 279 x0, x1 = x0+x1, x0-x1 280 // x[01] now Q13.18 in [-8160, 8160]. 281 x4, x6 = x4+x6, x4-x6 282 x7, x5 = x7+x5, x7-x5 283 // x[4567] now Q13.18 in [-5230, 5230]. 284 285 // Stage 4: two √2 scalings and one butterfly. 286 287 x5 = (x5 >> 14) * c(sqrt2, 14) 288 x6 = (x6 >> 14) * c(sqrt2, 14) 289 // x[56] still Q13.18 in [-7397, 7397] (= 5230√2). 290 x7, x4 = x7+x4, x7-x4 291 // x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246). 292 // See “Note on 925” below. 293 294 // Cut from Q13.18 to Q13.0. 295 x0 = (x0 + 1<<17) >> 18 296 x1 = (x1 + 1<<17) >> 18 297 x2 = (x2 + 1<<17) >> 18 298 x3 = (x3 + 1<<17) >> 18 299 x4 = (x4 + 1<<17) >> 18 300 x5 = (x5 + 1<<17) >> 18 301 x6 = (x6 + 1<<17) >> 18 302 x7 = (x7 + 1<<17) >> 18 303 304 // Note: Unlike in fdctCols, saved all stores for the end 305 // because they are adjacent memory locations and some systems 306 // can use multiword stores. 307 x[0] = x0 308 x[1] = x7 309 x[2] = x2 310 x[3] = x5 311 x[4] = x1 312 x[5] = x6 313 x[6] = x3 314 x[7] = x4 315 } 316 } 317 318 // “Note on 925”, deferred from above to avoid interrupting code. 319 // 320 // In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255]. 321 // Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve: 322 // 323 // Stage 2: 324 // x4 = b4*cos3 + b7*sin3 325 // x7 = -b4*sin3 + b7*cos3 326 // x5 = b5*cos1 + b6*sin1 327 // x6 = -b5*sin1 + b6*cos1 328 // 329 // Stage 3: 330 // 331 // x4 = x4+x6 = b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1 332 // x6 = x4-x6 = b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1 333 // x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 334 // x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1 335 // 336 // Stage 4: 337 // 338 // x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1 339 // = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3) 340 // < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925. 341 // 342 // x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1 343 // = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3) 344 // < same 925. 345 // 346 // The fact that x5, x6 are also at most 925 is not a coincidence: we are computing 347 // the same kinds of numbers for all four, just with different paths to them. 348 // 349 // In fdctRows, the same analysis applies, but the initial values are 350 // in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395. 351 352 // idct implements the inverse DCT. 353 // Inputs are UQ8.0; outputs are Q10.3. 354 func idct(b *block) { 355 // A 2D IDCT is a 1D IDCT on rows followed by columns. 356 idctRows(b) 357 idctCols(b) 358 } 359 360 // idctRows applies the 1D IDCT to the rows of b. 361 // Inputs are UQ8.0; outputs are Q9.20. 362 func idctRows(b *block) { 363 for i := range 8 { 364 x := b[8*i : 8*i+8 : 8*i+8] 365 x0 := x[0] 366 x7 := x[1] 367 x2 := x[2] 368 x5 := x[3] 369 x1 := x[4] 370 x6 := x[5] 371 x3 := x[6] 372 x4 := x[7] 373 374 // Run FDCT backward. 375 // Independent operations have been reordered somewhat 376 // to make precision tracking easier. 377 // 378 // Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly 379 // and carries with it an implicit divide by two: the extra bit 380 // is added to the precision, not the value size. 381 382 // x[01234567] are UQ8.0 in [0, 255]. 383 384 // Stages 4, 3, 2: x0, x1, x2, x3. 385 386 x0 <<= 17 387 x1 <<= 17 388 // x0, x1 now UQ8.17. 389 x0, x1 = x0+x1, x0-x1 390 // x0 now UQ8.18 in [0, 255]. 391 // x1 now Q7.18 in [-127½, 127½]. 392 393 // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit. 394 x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18)) 395 // x[23] now Q8.18 in [-236, 236]. 396 x1, x2 = x1+x2, x1-x2 397 x0, x3 = x0+x3, x0-x3 398 // x[0123] now Q8.19 in [-246, 246]. 399 400 // Stages 4, 3, 2: x4, x5, x6, x7. 401 402 x4 <<= 7 403 x7 <<= 7 404 // x[47] now UQ8.7 405 x7, x4 = x7+x4, x7-x4 406 // x7 now UQ8.8 in [0, 255]. 407 // x4 now Q7.8 in [-127½, 127½]. 408 409 x6 = x6 * c(sqrt2inv, 8) 410 x5 = x5 * c(sqrt2inv, 8) 411 // x[56] now UQ8.8 in [0, 181]. 412 // Note that 1/√2 has five 0s in its binary representation after 413 // the 8th bit, so this multipliy is actually producing 12 bits of precision. 414 415 x7, x5 = x7+x5, x7-x5 416 x4, x6 = x4+x6, x4-x6 417 // x[4567] now Q8.9 in [-218, 218]. 418 419 x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12)) 420 x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12)) 421 // x[4567] now Q9.19 in [-303, 303]. 422 423 // Stage 1. 424 425 x0, x7 = x0+x7, x0-x7 426 x1, x6 = x1+x6, x1-x6 427 x2, x5 = x2+x5, x2-x5 428 x3, x4 = x3+x4, x3-x4 429 // x[01234567] now Q9.20 in [-275, 275]. 430 431 // Note: we don't need all 20 bits of “precision”, 432 // but it is faster to let idctCols shift it away as part 433 // of other operations rather than downshift here. 434 435 x[0] = x0 436 x[1] = x1 437 x[2] = x2 438 x[3] = x3 439 x[4] = x4 440 x[5] = x5 441 x[6] = x6 442 x[7] = x7 443 } 444 } 445 446 // idctCols applies the 1D IDCT to the columns of b. 447 // Inputs are Q9.20. 448 // Outputs are Q10.3. That is, the result is the IDCT*8. 449 func idctCols(b *block) { 450 for i := range 8 { 451 x0 := b[0*8+i] 452 x7 := b[1*8+i] 453 x2 := b[2*8+i] 454 x5 := b[3*8+i] 455 x1 := b[4*8+i] 456 x6 := b[5*8+i] 457 x3 := b[6*8+i] 458 x4 := b[7*8+i] 459 460 // x[012345678] are Q9.20. 461 462 // Start by adding 0.5 to x0 (the incoming DC signal). 463 // The butterflies will add it to all the other values, 464 // and then the final shifts will round properly. 465 x0 += 1 << 19 466 467 // Stages 4, 3, 2: x0, x1, x2, x3. 468 469 x0, x1 = (x0+x1)>>2, (x0-x1)>>2 470 // x[01] now Q9.19. 471 // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit. 472 x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12)) 473 // x[0123] now Q9.19. 474 475 x1, x2 = x1+x2, x1-x2 476 x0, x3 = x0+x3, x0-x3 477 // x[0123] now Q9.20. 478 479 // Stages 4, 3, 2: x4, x5, x6, x7. 480 481 x7, x4 = x7+x4, x7-x4 482 // x[47] now Q9.21. 483 484 x5 = (x5 >> 13) * c(sqrt2inv, 14) 485 x6 = (x6 >> 13) * c(sqrt2inv, 14) 486 // x[56] now Q9.21. 487 488 x7, x5 = x7+x5, x7-x5 489 x4, x6 = x4+x6, x4-x6 490 // x[4567] now Q9.22. 491 492 x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12)) 493 x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12)) 494 // x[4567] now Q10.20. 495 496 x0, x7 = x0+x7, x0-x7 497 x1, x6 = x1+x6, x1-x6 498 x2, x5 = x2+x5, x2-x5 499 x3, x4 = x3+x4, x3-x4 500 // x[01234567] now Q10.21. 501 502 x0 >>= 18 503 x1 >>= 18 504 x2 >>= 18 505 x3 >>= 18 506 x4 >>= 18 507 x5 >>= 18 508 x6 >>= 18 509 x7 >>= 18 510 // x[01234567] now Q10.3. 511 512 b[0*8+i] = x0 513 b[1*8+i] = x1 514 b[2*8+i] = x2 515 b[3*8+i] = x3 516 b[4*8+i] = x4 517 b[5*8+i] = x5 518 b[6*8+i] = x6 519 b[7*8+i] = x7 520 } 521 } 522