dct.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package jpeg
     6  
     7  // Discrete Cosine Transformation (DCT) implementations using the algorithm from
     8  // Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
     9  // “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
    10  // https://ieeexplore.ieee.org/document/266596
    11  //
    12  // Since the paper is paywalled, the rest of this comment gives a summary.
    13  //
    14  // A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
    15  // and transforms them in place into the result values.
    16  //
    17  // The mathematical definition of the N-point 1D FDCT is:
    18  //
    19  //	X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
    20  //
    21  // where α₀ = √2 and α_k = 1 for k > 0.
    22  //
    23  // For our purposes, N=8, so the angles end up being multiples of π/16.
    24  // The most direct implementation of this definition would require 64 multiplications.
    25  //
    26  // Loeffler's paper presents a more efficient computation that requires only
    27  // 11 multiplications and works in terms of three basic operations:
    28  //
    29  //  - A “butterfly” x0, x1 = x0+x1, x0-x1.
    30  //    The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
    31  //
    32  //  - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
    33  //
    34  //  - A rotation of x0, x1 by θ, defined as:
    35  //    x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
    36  //    The inverse is rotation by -θ.
    37  //
    38  // The algorithm proceeds in four stages:
    39  //
    40  // Stage 1:
    41  //  - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
    42  //
    43  // Stage 2:
    44  //  - butterfly x0, x3; x1, x2
    45  //  - rotate x4, x7 by 3π/16
    46  //  - rotate x5, x6 by π/16.
    47  //
    48  // Stage 3:
    49  //  - butterfly x0, x1; x4, x6; x7, x5
    50  //  - rotate x2, x3 by 6π/16 and scale by √2.
    51  //
    52  // Stage 4:
    53  //  - butterfly x7, x4
    54  //  - scale x5, x6 by √2.
    55  //
    56  // Finally, the values are permuted. The permutation can be read as either:
    57  //  - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
    58  //  - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
    59  // The code below uses the second form to make it easier to merge adjacent stores.
    60  // (Note that unlike in recursive FFT implementations, the permutation here is
    61  // not always mapping indexes to their bit reversals.)
    62  //
    63  // As written above, the rotation requires four multiplications, but it can be
    64  // reduced to three by refactoring (see [dctBox] below), and the scaling in
    65  // stage 3 can be merged into the rotation constants, so the overall cost
    66  // of a 1D FDCT is 11 multiplies.
    67  //
    68  // The 1D inverse DCT (IDCT) is the 1D FDCT run backward
    69  // with all the basic operations inverted.
    70  
    71  // dctBox implements a 3-multiply, 3-add rotation+scaling.
    72  // Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
    73  // rotated and scaled coordinates.
    74  // (It is called dctBox because the rotate+scale operation
    75  // is drawn as a box in Figures 1 and 2 in the paper.)
    76  func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
    77  	// y0 = x0*kcos + x1*ksin
    78  	// y1 = -x0*ksin + x1*kcos
    79  	ksum := kcos * (x0 + x1)
    80  	y0 = ksum + (ksin-kcos)*x1
    81  	y1 = ksum - (kcos+ksin)*x0
    82  	return y0, y1
    83  }
    84  
    85  // A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
    86  // The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
    87  // but it is convenient to use int32s for intermediate storage,
    88  // so we define only a single block type of [8*8]int32.
    89  //
    90  // A 2D DCT is implemented as 1D DCTs over the rows and columns.
    91  //
    92  // dct_test.go defines a String method for nice printing in tests.
    93  type block [blockSize]int32
    94  
    95  const blockSize = 8 * 8
    96  
    97  // Note on Numerical Precision
    98  //
    99  // The inputs to both the FDCT and IDCT are uint8 values stored in a block,
   100  // and the outputs are int16s in the same block, but the overall operation
   101  // uses int32 values as fixed-point intermediate values.
   102  // In the code comments below, the notation “QN.M” refers to a
   103  // signed value of 1+N+M significant bits, one of which is the sign bit,
   104  // and M of which hold fractional (sub-integer) precision.
   105  // For example, 255 as a Q8.0 value is stored as int32(255),
   106  // while 255 as a Q8.1 value is stored as int32(510),
   107  // and 255.5 as a Q8.1 value is int32(511).
   108  // The notation UQN.M refers to an unsigned value of N+M significant bits.
   109  // See https://en.wikipedia.org/wiki/Q_(number_format) for more.
   110  //
   111  // In general we only need to keep about 16 significant bits, but it is more
   112  // efficient and somewhat more precise to let unnecessary fractional bits
   113  // accumulate and shift them away in bulk rather than after every operation.
   114  // As such, it is important to keep track of the number of fractional bits
   115  // in each variable at different points in the code, to avoid mistakes like
   116  // adding numbers with different fractional precisions, as well as to keep
   117  // track of the total number of bits, to avoid overflow. A comment like:
   118  //
   119  //	// x[123] now Q8.2.
   120  //
   121  // means that x1, x2, and x3 are all Q8.2 (11-bit) values.
   122  // Keeping extra precision bits also reduces the size of the errors introduced
   123  // by using right shift to approximate rounded division.
   124  
   125  // Constants needed for the implementation.
   126  // These are all 60-bit precision fixed-point constants.
   127  // The function c(val, b) rounds the constant to b bits.
   128  // c is simple enough that calls to it with constant args
   129  // are inlined and constant-propagated down to an inline constant.
   130  // Each constant is commented with its Ivy definition (see robpike.io/ivy),
   131  // using this scaling helper function:
   132  //
   133  //	op fix x = floor 0.5 + x * 2**60
   134  const (
   135  	cos1          = 1130768441178740757 // fix cos 1*pi/16
   136  	sin1          = 224923827593068887  // fix sin 1*pi/16
   137  	cos3          = 958619196450722178  // fix cos 3*pi/16
   138  	sin3          = 640528868967736374  // fix sin 3*pi/16
   139  	sqrt2         = 1630477228166597777 // fix sqrt 2
   140  	sqrt2_cos6    = 623956622067911264  // fix (sqrt 2)*cos 6*pi/16
   141  	sqrt2_sin6    = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
   142  	sqrt2inv      = 815238614083298888  // fix 1/sqrt 2
   143  	sqrt2inv_cos6 = 311978311033955632  // fix (1/sqrt 2)*cos 6*pi/16
   144  	sqrt2inv_sin6 = 753182269664427492  // fix (1/sqrt 2)*sin 6*pi/16
   145  )
   146  
   147  func c(x uint64, bits int) int32 {
   148  	return int32((x + (1 << (59 - bits))) >> (60 - bits))
   149  }
   150  
   151  // fdct implements the forward DCT.
   152  // Inputs are UQ8.0; outputs are Q13.0.
   153  func fdct(b *block) {
   154  	fdctCols(b)
   155  	fdctRows(b)
   156  }
   157  
   158  // fdctCols applies the 1D DCT to the columns of b.
   159  // Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
   160  // Outputs are Q10.18.
   161  func fdctCols(b *block) {
   162  	for i := range 8 {
   163  		x0 := b[0*8+i]
   164  		x1 := b[1*8+i]
   165  		x2 := b[2*8+i]
   166  		x3 := b[3*8+i]
   167  		x4 := b[4*8+i]
   168  		x5 := b[5*8+i]
   169  		x6 := b[6*8+i]
   170  		x7 := b[7*8+i]
   171  
   172  		// x[01234567] are UQ8.0 in [0,255].
   173  
   174  		// Stage 1: four butterflies.
   175  		// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
   176  		// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.
   177  
   178  		x0, x7 = x0+x7, x0-x7
   179  		x1, x6 = x1+x6, x1-x6
   180  		x2, x5 = x2+x5, x2-x5
   181  		x3, x4 = x3+x4, x3-x4
   182  		// x[0123] now UQ9.0 in [0, 510].
   183  		// x[4567] now Q8.0 in [-255,255].
   184  
   185  		// Stage 2: two boxes and two butterflies.
   186  		// A box on QN.M inputs with B-bit constants
   187  		// produces Q(N+1).(M+B) outputs.
   188  		// (The +1 is from the addition.)
   189  
   190  		x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
   191  		x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
   192  		// x[47] now Q9.18 in [-354, 354].
   193  		// x[56] now Q9.18 in [-300, 300].
   194  
   195  		x0, x3 = x0+x3, x0-x3
   196  		x1, x2 = x1+x2, x1-x2
   197  		// x[01] now UQ10.0 in [0, 1020].
   198  		// x[23] now Q9.0 in [-510, 510].
   199  
   200  		// Stage 3: one box and three butterflies.
   201  
   202  		x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
   203  		// x[23] now Q10.18 in [-943, 943].
   204  
   205  		x0, x1 = x0+x1, x0-x1
   206  		// x0 now UQ11.0 in [0, 2040].
   207  		// x1 now Q10.0 in [-1020, 1020].
   208  
   209  		// Store x0, x1, x2, x3 to their permuted targets.
   210  		// The original +128 in every input value
   211  		// has cancelled out except in the “DC signal” x0.
   212  		// Subtracting 128*8 here is equivalent to subtracting 128
   213  		// from every input before we started, but cheaper.
   214  		// It also converts x0 from UQ11.18 to Q10.18.
   215  		b[0*8+i] = (x0 - 128*8) << 18
   216  		b[4*8+i] = x1 << 18
   217  		b[2*8+i] = x2
   218  		b[6*8+i] = x3
   219  
   220  		x4, x6 = x4+x6, x4-x6
   221  		x7, x5 = x7+x5, x7-x5
   222  		// x[4567] now Q10.18 in [-654, 654].
   223  
   224  		// Stage 4: two √2 scalings and one butterfly.
   225  
   226  		x5 = (x5 >> 12) * c(sqrt2, 12)
   227  		x6 = (x6 >> 12) * c(sqrt2, 12)
   228  		// x[56] still Q10.18 in [-925, 925] (= 654√2).
   229  		x7, x4 = x7+x4, x7-x4
   230  		// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
   231  		// This is not obvious at all! See “Note on 925” below.
   232  
   233  		// Store x4 x5 x6 x7 to their permuted targets.
   234  		b[1*8+i] = x7
   235  		b[3*8+i] = x5
   236  		b[5*8+i] = x6
   237  		b[7*8+i] = x4
   238  	}
   239  }
   240  
   241  // fdctRows applies the 1D DCT to the rows of b.
   242  // Inputs are Q10.18; outputs are Q13.0.
   243  func fdctRows(b *block) {
   244  	for i := range 8 {
   245  		x := b[8*i : 8*i+8 : 8*i+8]
   246  		x0 := x[0]
   247  		x1 := x[1]
   248  		x2 := x[2]
   249  		x3 := x[3]
   250  		x4 := x[4]
   251  		x5 := x[5]
   252  		x6 := x[6]
   253  		x7 := x[7]
   254  
   255  		// x[01234567] are Q10.18 [-1020, 1020].
   256  
   257  		// Stage 1: four butterflies.
   258  
   259  		x0, x7 = x0+x7, x0-x7
   260  		x1, x6 = x1+x6, x1-x6
   261  		x2, x5 = x2+x5, x2-x5
   262  		x3, x4 = x3+x4, x3-x4
   263  		// x[01234567] now Q11.18 in [-2040, 2040].
   264  
   265  		// Stage 2: two boxes and two butterflies.
   266  
   267  		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
   268  		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
   269  		// x[47] now Q12.18 in [-2830, 2830].
   270  		// x[56] now Q12.18 in [-2400, 2400].
   271  		x0, x3 = x0+x3, x0-x3
   272  		x1, x2 = x1+x2, x1-x2
   273  		// x[01234567] now Q12.18 in [-4080, 4080].
   274  
   275  		// Stage 3: one box and three butterflies.
   276  
   277  		x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
   278  		// x[23] now Q13.18 in [-7539, 7539].
   279  		x0, x1 = x0+x1, x0-x1
   280  		// x[01] now Q13.18 in [-8160, 8160].
   281  		x4, x6 = x4+x6, x4-x6
   282  		x7, x5 = x7+x5, x7-x5
   283  		// x[4567] now Q13.18 in [-5230, 5230].
   284  
   285  		// Stage 4: two √2 scalings and one butterfly.
   286  
   287  		x5 = (x5 >> 14) * c(sqrt2, 14)
   288  		x6 = (x6 >> 14) * c(sqrt2, 14)
   289  		// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
   290  		x7, x4 = x7+x4, x7-x4
   291  		// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
   292  		// See “Note on 925” below.
   293  
   294  		// Cut from Q13.18 to Q13.0.
   295  		x0 = (x0 + 1<<17) >> 18
   296  		x1 = (x1 + 1<<17) >> 18
   297  		x2 = (x2 + 1<<17) >> 18
   298  		x3 = (x3 + 1<<17) >> 18
   299  		x4 = (x4 + 1<<17) >> 18
   300  		x5 = (x5 + 1<<17) >> 18
   301  		x6 = (x6 + 1<<17) >> 18
   302  		x7 = (x7 + 1<<17) >> 18
   303  
   304  		// Note: Unlike in fdctCols, saved all stores for the end
   305  		// because they are adjacent memory locations and some systems
   306  		// can use multiword stores.
   307  		x[0] = x0
   308  		x[1] = x7
   309  		x[2] = x2
   310  		x[3] = x5
   311  		x[4] = x1
   312  		x[5] = x6
   313  		x[6] = x3
   314  		x[7] = x4
   315  	}
   316  }
   317  
   318  // “Note on 925”, deferred from above to avoid interrupting code.
   319  //
   320  // In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
   321  // Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
   322  //
   323  // Stage 2:
   324  //	x4 = b4*cos3 + b7*sin3
   325  //	x7 = -b4*sin3 + b7*cos3
   326  //	x5 = b5*cos1 + b6*sin1
   327  //	x6 = -b5*sin1 + b6*cos1
   328  //
   329  // Stage 3:
   330  //
   331  //	x4 = x4+x6 =  b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
   332  //	x6 = x4-x6 =  b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
   333  //	x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
   334  //	x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
   335  //
   336  // Stage 4:
   337  //
   338  //	x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
   339  //	   = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
   340  //	   < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
   341  //
   342  //	x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
   343  //	   = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
   344  //	   < same 925.
   345  //
   346  // The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
   347  // the same kinds of numbers for all four, just with different paths to them.
   348  //
   349  // In fdctRows, the same analysis applies, but the initial values are
   350  // in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.
   351  
   352  // idct implements the inverse DCT.
   353  // Inputs are UQ8.0; outputs are Q10.3.
   354  func idct(b *block) {
   355  	// A 2D IDCT is a 1D IDCT on rows followed by columns.
   356  	idctRows(b)
   357  	idctCols(b)
   358  }
   359  
   360  // idctRows applies the 1D IDCT to the rows of b.
   361  // Inputs are UQ8.0; outputs are Q9.20.
   362  func idctRows(b *block) {
   363  	for i := range 8 {
   364  		x := b[8*i : 8*i+8 : 8*i+8]
   365  		x0 := x[0]
   366  		x7 := x[1]
   367  		x2 := x[2]
   368  		x5 := x[3]
   369  		x1 := x[4]
   370  		x6 := x[5]
   371  		x3 := x[6]
   372  		x4 := x[7]
   373  
   374  		// Run FDCT backward.
   375  		// Independent operations have been reordered somewhat
   376  		// to make precision tracking easier.
   377  		//
   378  		// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
   379  		// and carries with it an implicit divide by two: the extra bit
   380  		// is added to the precision, not the value size.
   381  
   382  		// x[01234567] are UQ8.0 in [0, 255].
   383  
   384  		// Stages 4, 3, 2: x0, x1, x2, x3.
   385  
   386  		x0 <<= 17
   387  		x1 <<= 17
   388  		// x0, x1 now UQ8.17.
   389  		x0, x1 = x0+x1, x0-x1
   390  		// x0 now UQ8.18 in [0, 255].
   391  		// x1 now Q7.18 in [-127½, 127½].
   392  
   393  		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
   394  		x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
   395  		// x[23] now Q8.18 in [-236, 236].
   396  		x1, x2 = x1+x2, x1-x2
   397  		x0, x3 = x0+x3, x0-x3
   398  		// x[0123] now Q8.19 in [-246, 246].
   399  
   400  		// Stages 4, 3, 2: x4, x5, x6, x7.
   401  
   402  		x4 <<= 7
   403  		x7 <<= 7
   404  		// x[47] now UQ8.7
   405  		x7, x4 = x7+x4, x7-x4
   406  		// x7 now UQ8.8 in [0, 255].
   407  		// x4 now Q7.8 in [-127½, 127½].
   408  
   409  		x6 = x6 * c(sqrt2inv, 8)
   410  		x5 = x5 * c(sqrt2inv, 8)
   411  		// x[56] now UQ8.8 in [0, 181].
   412  		// Note that 1/√2 has five 0s in its binary representation after
   413  		// the 8th bit, so this multipliy is actually producing 12 bits of precision.
   414  
   415  		x7, x5 = x7+x5, x7-x5
   416  		x4, x6 = x4+x6, x4-x6
   417  		// x[4567] now Q8.9 in [-218, 218].
   418  
   419  		x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
   420  		x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
   421  		// x[4567] now Q9.19 in [-303, 303].
   422  
   423  		// Stage 1.
   424  
   425  		x0, x7 = x0+x7, x0-x7
   426  		x1, x6 = x1+x6, x1-x6
   427  		x2, x5 = x2+x5, x2-x5
   428  		x3, x4 = x3+x4, x3-x4
   429  		// x[01234567] now Q9.20 in [-275, 275].
   430  
   431  		// Note: we don't need all 20 bits of “precision”,
   432  		// but it is faster to let idctCols shift it away as part
   433  		// of other operations rather than downshift here.
   434  
   435  		x[0] = x0
   436  		x[1] = x1
   437  		x[2] = x2
   438  		x[3] = x3
   439  		x[4] = x4
   440  		x[5] = x5
   441  		x[6] = x6
   442  		x[7] = x7
   443  	}
   444  }
   445  
   446  // idctCols applies the 1D IDCT to the columns of b.
   447  // Inputs are Q9.20.
   448  // Outputs are Q10.3. That is, the result is the IDCT*8.
   449  func idctCols(b *block) {
   450  	for i := range 8 {
   451  		x0 := b[0*8+i]
   452  		x7 := b[1*8+i]
   453  		x2 := b[2*8+i]
   454  		x5 := b[3*8+i]
   455  		x1 := b[4*8+i]
   456  		x6 := b[5*8+i]
   457  		x3 := b[6*8+i]
   458  		x4 := b[7*8+i]
   459  
   460  		// x[012345678] are Q9.20.
   461  
   462  		// Start by adding 0.5 to x0 (the incoming DC signal).
   463  		// The butterflies will add it to all the other values,
   464  		// and then the final shifts will round properly.
   465  		x0 += 1 << 19
   466  
   467  		// Stages 4, 3, 2: x0, x1, x2, x3.
   468  
   469  		x0, x1 = (x0+x1)>>2, (x0-x1)>>2
   470  		// x[01] now Q9.19.
   471  		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
   472  		x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
   473  		// x[0123] now Q9.19.
   474  
   475  		x1, x2 = x1+x2, x1-x2
   476  		x0, x3 = x0+x3, x0-x3
   477  		// x[0123] now Q9.20.
   478  
   479  		// Stages 4, 3, 2: x4, x5, x6, x7.
   480  
   481  		x7, x4 = x7+x4, x7-x4
   482  		// x[47] now Q9.21.
   483  
   484  		x5 = (x5 >> 13) * c(sqrt2inv, 14)
   485  		x6 = (x6 >> 13) * c(sqrt2inv, 14)
   486  		// x[56] now Q9.21.
   487  
   488  		x7, x5 = x7+x5, x7-x5
   489  		x4, x6 = x4+x6, x4-x6
   490  		// x[4567] now Q9.22.
   491  
   492  		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
   493  		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
   494  		// x[4567] now Q10.20.
   495  
   496  		x0, x7 = x0+x7, x0-x7
   497  		x1, x6 = x1+x6, x1-x6
   498  		x2, x5 = x2+x5, x2-x5
   499  		x3, x4 = x3+x4, x3-x4
   500  		// x[01234567] now Q10.21.
   501  
   502  		x0 >>= 18
   503  		x1 >>= 18
   504  		x2 >>= 18
   505  		x3 >>= 18
   506  		x4 >>= 18
   507  		x5 >>= 18
   508  		x6 >>= 18
   509  		x7 >>= 18
   510  		// x[01234567] now Q10.3.
   511  
   512  		b[0*8+i] = x0
   513  		b[1*8+i] = x1
   514  		b[2*8+i] = x2
   515  		b[3*8+i] = x3
   516  		b[4*8+i] = x4
   517  		b[5*8+i] = x5
   518  		b[6*8+i] = x6
   519  		b[7*8+i] = x7
   520  	}
   521  }
   522
View as plain text