Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

jfdctint.c

Last change on this file was 846, checked in by Dmitry A. Kuminov, 14 years ago
trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.
File size: 155.0 KB

Rev	Line
[2]	1	/*
	2	* jfdctint.c
	3	*
	4	* Copyright (C) 1991-1996, Thomas G. Lane.
[846]	5	* Modification developed 2003-2009 by Guido Vollbeding.
[2]	6	* This file is part of the Independent JPEG Group's software.
	7	* For conditions of distribution and use, see the accompanying README file.
	8	*
	9	* This file contains a slow-but-accurate integer implementation of the
	10	* forward DCT (Discrete Cosine Transform).
	11	*
	12	* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
	13	* on each column. Direct algorithms are also available, but they are
	14	* much more complex and seem not to be any faster when reduced to code.
	15	*
	16	* This implementation is based on an algorithm described in
	17	* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
	18	* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
	19	* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
	20	* The primary algorithm described there uses 11 multiplies and 29 adds.
	21	* We use their alternate method with 12 multiplies and 32 adds.
	22	* The advantage of this method is that no data path contains more than one
	23	* multiplication; this allows a very simple and accurate implementation in
	24	* scaled fixed-point arithmetic, with a minimal number of shifts.
[846]	25	*
	26	* We also provide FDCT routines with various input sample block sizes for
	27	* direct resolution reduction or enlargement and for direct resolving the
	28	* common 2x1 and 1x2 subsampling cases without additional resampling: NxN
	29	* (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
	30	*
	31	* For N<8 we fill the remaining block coefficients with zero.
	32	* For N>8 we apply a partial N-point FDCT on the input samples, computing
	33	* just the lower 8 frequency coefficients and discarding the rest.
	34	*
	35	* We must scale the output coefficients of the N-point FDCT appropriately
	36	* to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
	37	* is folded into the constant multipliers (pass 2) and/or final/initial
	38	* shifting.
	39	*
	40	* CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
	41	* since there would be too many additional constants to pre-calculate.
[2]	42	*/
	43
	44	#define JPEG_INTERNALS
	45	#include "jinclude.h"
	46	#include "jpeglib.h"
	47	#include "jdct.h" /* Private declarations for DCT subsystem */
	48
	49	#ifdef DCT_ISLOW_SUPPORTED
	50
	51
	52	/*
	53	* This module is specialized to the case DCTSIZE = 8.
	54	*/
	55
	56	#if DCTSIZE != 8
[846]	57	Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
[2]	58	#endif
	59
	60
	61	/*
	62	* The poop on this scaling stuff is as follows:
	63	*
	64	* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
	65	* larger than the true DCT outputs. The final outputs are therefore
	66	* a factor of N larger than desired; since N=8 this can be cured by
	67	* a simple right shift at the end of the algorithm. The advantage of
	68	* this arrangement is that we save two multiplications per 1-D DCT,
	69	* because the y0 and y4 outputs need not be divided by sqrt(N).
	70	* In the IJG code, this factor of 8 is removed by the quantization step
	71	* (in jcdctmgr.c), NOT in this module.
	72	*
	73	* We have to do addition and subtraction of the integer inputs, which
	74	* is no problem, and multiplication by fractional constants, which is
	75	* a problem to do in integer arithmetic. We multiply all the constants
	76	* by CONST_SCALE and convert them to integer constants (thus retaining
	77	* CONST_BITS bits of precision in the constants). After doing a
	78	* multiplication we have to divide the product by CONST_SCALE, with proper
	79	* rounding, to produce the correct output. This division can be done
	80	* cheaply as a right shift of CONST_BITS bits. We postpone shifting
	81	* as long as possible so that partial sums can be added together with
	82	* full fractional precision.
	83	*
	84	* The outputs of the first pass are scaled up by PASS1_BITS bits so that
	85	* they are represented to better-than-integral precision. These outputs
	86	* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
	87	* with the recommended scaling. (For 12-bit sample data, the intermediate
	88	* array is INT32 anyway.)
	89	*
	90	* To avoid overflow of the 32-bit intermediate results in pass 2, we must
	91	* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
	92	* shows that the values given below are the most effective.
	93	*/
	94
	95	#if BITS_IN_JSAMPLE == 8
	96	#define CONST_BITS 13
	97	#define PASS1_BITS 2
	98	#else
	99	#define CONST_BITS 13
	100	#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
	101	#endif
	102
	103	/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
	104	* causing a lot of useless floating-point operations at run time.
	105	* To get around this we use the following pre-calculated constants.
	106	* If you change CONST_BITS you may want to add appropriate values.
	107	* (With a reasonable C compiler, you can just rely on the FIX() macro...)
	108	*/
	109
	110	#if CONST_BITS == 13
	111	#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
	112	#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
	113	#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
	114	#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
	115	#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
	116	#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
	117	#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
	118	#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
	119	#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
	120	#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
	121	#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
	122	#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
	123	#else
	124	#define FIX_0_298631336 FIX(0.298631336)
	125	#define FIX_0_390180644 FIX(0.390180644)
	126	#define FIX_0_541196100 FIX(0.541196100)
	127	#define FIX_0_765366865 FIX(0.765366865)
	128	#define FIX_0_899976223 FIX(0.899976223)
	129	#define FIX_1_175875602 FIX(1.175875602)
	130	#define FIX_1_501321110 FIX(1.501321110)
	131	#define FIX_1_847759065 FIX(1.847759065)
	132	#define FIX_1_961570560 FIX(1.961570560)
	133	#define FIX_2_053119869 FIX(2.053119869)
	134	#define FIX_2_562915447 FIX(2.562915447)
	135	#define FIX_3_072711026 FIX(3.072711026)
	136	#endif
	137
	138
	139	/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
	140	* For 8-bit samples with the recommended scaling, all the variable
	141	* and constant values involved are no more than 16 bits wide, so a
	142	* 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
	143	* For 12-bit samples, a full 32-bit multiplication will be needed.
	144	*/
	145
	146	#if BITS_IN_JSAMPLE == 8
	147	#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
	148	#else
	149	#define MULTIPLY(var,const) ((var) * (const))
	150	#endif
	151
	152
	153	/*
	154	* Perform the forward DCT on one block of samples.
	155	*/
	156
	157	GLOBAL(void)
[846]	158	jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
[2]	159	{
[846]	160	INT32 tmp0, tmp1, tmp2, tmp3;
[2]	161	INT32 tmp10, tmp11, tmp12, tmp13;
[846]	162	INT32 z1;
[2]	163	DCTELEM *dataptr;
[846]	164	JSAMPROW elemptr;
[2]	165	int ctr;
	166	SHIFT_TEMPS
	167
	168	/* Pass 1: process rows. */
	169	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	170	/* furthermore, we scale the results by 2*PASS1_BITS. /
	171
	172	dataptr = data;
[846]	173	for (ctr = 0; ctr < DCTSIZE; ctr++) {
	174	elemptr = sample_data[ctr] + start_col;
	175
[2]	176	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	177	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	178	*/
[846]	179
	180	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
	181	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
	182	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
	183	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
	184
[2]	185	tmp10 = tmp0 + tmp3;
[846]	186	tmp12 = tmp0 - tmp3;
[2]	187	tmp11 = tmp1 + tmp2;
[846]	188	tmp13 = tmp1 - tmp2;
	189
	190	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
	191	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
	192	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
	193	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
	194
	195	/* Apply unsigned->signed conversion */
	196	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
[2]	197	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
[846]	198
[2]	199	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
[846]	200	/* Add fudge factor here for final descale. */
	201	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
	202	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
	203	CONST_BITS-PASS1_BITS);
	204	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
	205	CONST_BITS-PASS1_BITS);
	206
[2]	207	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
[846]	208	* cK represents sqrt(2) * cos(K*pi/16).
	209	* i0..i3 in the paper are tmp0..tmp3 here.
[2]	210	*/
[846]	211
	212	tmp10 = tmp0 + tmp3;
	213	tmp11 = tmp1 + tmp2;
	214	tmp12 = tmp0 + tmp2;
	215	tmp13 = tmp1 + tmp3;
	216	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	217	/* Add fudge factor here for final descale. */
	218	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
	219
	220	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	221	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	222	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	223	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	224	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	225	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	226	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	227	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	228
	229	tmp12 += z1;
	230	tmp13 += z1;
	231
	232	dataptr[1] = (DCTELEM)
	233	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
	234	dataptr[3] = (DCTELEM)
	235	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
	236	dataptr[5] = (DCTELEM)
	237	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
	238	dataptr[7] = (DCTELEM)
	239	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
	240
[2]	241	dataptr += DCTSIZE; /* advance pointer to next row */
	242	}
	243
	244	/* Pass 2: process columns.
	245	* We remove the PASS1_BITS scaling, but leave the results scaled up
	246	* by an overall factor of 8.
	247	*/
	248
	249	dataptr = data;
	250	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
[846]	251	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	252	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	253	*/
	254
[2]	255	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
	256	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
	257	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
	258	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
[846]	259
	260	/* Add fudge factor here for final descale. */
	261	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
	262	tmp12 = tmp0 - tmp3;
	263	tmp11 = tmp1 + tmp2;
	264	tmp13 = tmp1 - tmp2;
	265
	266	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
	267	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
	268	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
	269	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
	270
	271	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
	272	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
	273
	274	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
	275	/* Add fudge factor here for final descale. */
	276	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
	277	dataptr[DCTSIZE*2] = (DCTELEM)
	278	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
	279	dataptr[DCTSIZE*6] = (DCTELEM)
	280	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
	281
	282	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
	283	* cK represents sqrt(2) * cos(K*pi/16).
	284	* i0..i3 in the paper are tmp0..tmp3 here.
	285	*/
	286
	287	tmp10 = tmp0 + tmp3;
	288	tmp11 = tmp1 + tmp2;
	289	tmp12 = tmp0 + tmp2;
	290	tmp13 = tmp1 + tmp3;
	291	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	292	/* Add fudge factor here for final descale. */
	293	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
	294
	295	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	296	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	297	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	298	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	299	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	300	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	301	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	302	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	303
	304	tmp12 += z1;
	305	tmp13 += z1;
	306
	307	dataptr[DCTSIZE*1] = (DCTELEM)
	308	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
	309	dataptr[DCTSIZE*3] = (DCTELEM)
	310	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
	311	dataptr[DCTSIZE*5] = (DCTELEM)
	312	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
	313	dataptr[DCTSIZE*7] = (DCTELEM)
	314	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
	315
	316	dataptr++; /* advance pointer to next column */
	317	}
	318	}
	319
	320	#ifdef DCT_SCALING_SUPPORTED
	321
	322
	323	/*
	324	* Perform the forward DCT on a 7x7 sample block.
	325	*/
	326
	327	GLOBAL(void)
	328	jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	329	{
	330	INT32 tmp0, tmp1, tmp2, tmp3;
	331	INT32 tmp10, tmp11, tmp12;
	332	INT32 z1, z2, z3;
	333	DCTELEM *dataptr;
	334	JSAMPROW elemptr;
	335	int ctr;
	336	SHIFT_TEMPS
	337
	338	/* Pre-zero output coefficient block. */
	339	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	340
	341	/* Pass 1: process rows. */
	342	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	343	/* furthermore, we scale the results by 2*PASS1_BITS. /
	344	/* cK represents sqrt(2) * cos(Kpi/14). /
	345
	346	dataptr = data;
	347	for (ctr = 0; ctr < 7; ctr++) {
	348	elemptr = sample_data[ctr] + start_col;
	349
	350	/* Even part */
	351
	352	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
	353	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
	354	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
	355	tmp3 = GETJSAMPLE(elemptr[3]);
	356
	357	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
	358	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
	359	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
	360
	361	z1 = tmp0 + tmp2;
	362	/* Apply unsigned->signed conversion */
	363	dataptr[0] = (DCTELEM)
	364	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
	365	tmp3 += tmp3;
	366	z1 -= tmp3;
	367	z1 -= tmp3;
	368	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
	369	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
	370	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
	371	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
	372	z1 -= z2;
	373	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
	374	dataptr[4] = (DCTELEM)
	375	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
	376	CONST_BITS-PASS1_BITS);
	377	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
	378
	379	/* Odd part */
	380
	381	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
	382	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
	383	tmp0 = tmp1 - tmp2;
	384	tmp1 += tmp2;
	385	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
	386	tmp1 += tmp2;
	387	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
	388	tmp0 += tmp3;
	389	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
	390
	391	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
	392	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
	393	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
	394
	395	dataptr += DCTSIZE; /* advance pointer to next row */
	396	}
	397
	398	/* Pass 2: process columns.
	399	* We remove the PASS1_BITS scaling, but leave the results scaled up
	400	* by an overall factor of 8.
	401	* We must also scale the output by (8/7)**2 = 64/49, which we fold
	402	* into the constant multipliers:
	403	* cK now represents sqrt(2) * cos(Kpi/14) 64/49.
	404	*/
	405
	406	dataptr = data;
	407	for (ctr = 0; ctr < 7; ctr++) {
	408	/* Even part */
	409
	410	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
	411	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
	412	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
	413	tmp3 = dataptr[DCTSIZE*3];
	414
	415	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
	416	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
	417	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
	418
	419	z1 = tmp0 + tmp2;
	420	dataptr[DCTSIZE*0] = (DCTELEM)
	421	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
	422	CONST_BITS+PASS1_BITS);
	423	tmp3 += tmp3;
	424	z1 -= tmp3;
	425	z1 -= tmp3;
	426	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
	427	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
	428	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
	429	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
	430	z1 -= z2;
	431	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
	432	dataptr[DCTSIZE*4] = (DCTELEM)
	433	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
	434	CONST_BITS+PASS1_BITS);
	435	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
	436
	437	/* Odd part */
	438
	439	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
	440	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
	441	tmp0 = tmp1 - tmp2;
	442	tmp1 += tmp2;
	443	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
	444	tmp1 += tmp2;
	445	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
	446	tmp0 += tmp3;
	447	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
	448
	449	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
	450	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
	451	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
	452
	453	dataptr++; /* advance pointer to next column */
	454	}
	455	}
	456
	457
	458	/*
	459	* Perform the forward DCT on a 6x6 sample block.
	460	*/
	461
	462	GLOBAL(void)
	463	jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	464	{
	465	INT32 tmp0, tmp1, tmp2;
	466	INT32 tmp10, tmp11, tmp12;
	467	DCTELEM *dataptr;
	468	JSAMPROW elemptr;
	469	int ctr;
	470	SHIFT_TEMPS
	471
	472	/* Pre-zero output coefficient block. */
	473	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	474
	475	/* Pass 1: process rows. */
	476	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	477	/* furthermore, we scale the results by 2*PASS1_BITS. /
	478	/* cK represents sqrt(2) * cos(Kpi/12). /
	479
	480	dataptr = data;
	481	for (ctr = 0; ctr < 6; ctr++) {
	482	elemptr = sample_data[ctr] + start_col;
	483
	484	/* Even part */
	485
	486	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
	487	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
	488	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
	489
	490	tmp10 = tmp0 + tmp2;
	491	tmp12 = tmp0 - tmp2;
	492
	493	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
	494	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
	495	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
	496
	497	/* Apply unsigned->signed conversion */
	498	dataptr[0] = (DCTELEM)
	499	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
	500	dataptr[2] = (DCTELEM)
	501	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
	502	CONST_BITS-PASS1_BITS);
	503	dataptr[4] = (DCTELEM)
	504	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	505	CONST_BITS-PASS1_BITS);
	506
	507	/* Odd part */
	508
	509	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
	510	CONST_BITS-PASS1_BITS);
	511
	512	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
	513	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
	514	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
	515
	516	dataptr += DCTSIZE; /* advance pointer to next row */
	517	}
	518
	519	/* Pass 2: process columns.
	520	* We remove the PASS1_BITS scaling, but leave the results scaled up
	521	* by an overall factor of 8.
	522	* We must also scale the output by (8/6)**2 = 16/9, which we fold
	523	* into the constant multipliers:
	524	* cK now represents sqrt(2) * cos(Kpi/12) 16/9.
	525	*/
	526
	527	dataptr = data;
	528	for (ctr = 0; ctr < 6; ctr++) {
	529	/* Even part */
	530
	531	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
	532	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
	533	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
	534
	535	tmp10 = tmp0 + tmp2;
	536	tmp12 = tmp0 - tmp2;
	537
	538	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
	539	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
	540	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
	541
	542	dataptr[DCTSIZE*0] = (DCTELEM)
	543	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
	544	CONST_BITS+PASS1_BITS);
	545	dataptr[DCTSIZE*2] = (DCTELEM)
	546	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
	547	CONST_BITS+PASS1_BITS);
	548	dataptr[DCTSIZE*4] = (DCTELEM)
	549	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	550	CONST_BITS+PASS1_BITS);
	551
	552	/* Odd part */
	553
	554	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
	555
	556	dataptr[DCTSIZE*1] = (DCTELEM)
	557	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
	558	CONST_BITS+PASS1_BITS);
	559	dataptr[DCTSIZE*3] = (DCTELEM)
	560	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
	561	CONST_BITS+PASS1_BITS);
	562	dataptr[DCTSIZE*5] = (DCTELEM)
	563	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
	564	CONST_BITS+PASS1_BITS);
	565
	566	dataptr++; /* advance pointer to next column */
	567	}
	568	}
	569
	570
	571	/*
	572	* Perform the forward DCT on a 5x5 sample block.
	573	*/
	574
	575	GLOBAL(void)
	576	jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	577	{
	578	INT32 tmp0, tmp1, tmp2;
	579	INT32 tmp10, tmp11;
	580	DCTELEM *dataptr;
	581	JSAMPROW elemptr;
	582	int ctr;
	583	SHIFT_TEMPS
	584
	585	/* Pre-zero output coefficient block. */
	586	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	587
	588	/* Pass 1: process rows. */
	589	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	590	/* furthermore, we scale the results by 2*PASS1_BITS. /
	591	/* We scale the results further by 2 as part of output adaption */
	592	/* scaling for different DCT size. */
	593	/* cK represents sqrt(2) * cos(Kpi/10). /
	594
	595	dataptr = data;
	596	for (ctr = 0; ctr < 5; ctr++) {
	597	elemptr = sample_data[ctr] + start_col;
	598
	599	/* Even part */
	600
	601	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
	602	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
	603	tmp2 = GETJSAMPLE(elemptr[2]);
	604
	605	tmp10 = tmp0 + tmp1;
	606	tmp11 = tmp0 - tmp1;
	607
	608	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
	609	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
	610
	611	/* Apply unsigned->signed conversion */
	612	dataptr[0] = (DCTELEM)
	613	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
	614	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
	615	tmp10 -= tmp2 << 2;
	616	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
	617	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
	618	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
	619
	620	/* Odd part */
	621
	622	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
	623
	624	dataptr[1] = (DCTELEM)
	625	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
	626	CONST_BITS-PASS1_BITS-1);
	627	dataptr[3] = (DCTELEM)
	628	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
	629	CONST_BITS-PASS1_BITS-1);
	630
	631	dataptr += DCTSIZE; /* advance pointer to next row */
	632	}
	633
	634	/* Pass 2: process columns.
	635	* We remove the PASS1_BITS scaling, but leave the results scaled up
	636	* by an overall factor of 8.
	637	* We must also scale the output by (8/5)**2 = 64/25, which we partially
	638	* fold into the constant multipliers (other part was done in pass 1):
	639	* cK now represents sqrt(2) * cos(Kpi/10) 32/25.
	640	*/
	641
	642	dataptr = data;
	643	for (ctr = 0; ctr < 5; ctr++) {
	644	/* Even part */
	645
	646	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
	647	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
	648	tmp2 = dataptr[DCTSIZE*2];
	649
	650	tmp10 = tmp0 + tmp1;
	651	tmp11 = tmp0 - tmp1;
	652
	653	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
	654	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
	655
	656	dataptr[DCTSIZE*0] = (DCTELEM)
	657	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
	658	CONST_BITS+PASS1_BITS);
	659	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
	660	tmp10 -= tmp2 << 2;
	661	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
	662	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
	663	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
	664
	665	/* Odd part */
	666
	667	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
	668
	669	dataptr[DCTSIZE*1] = (DCTELEM)
	670	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
	671	CONST_BITS+PASS1_BITS);
	672	dataptr[DCTSIZE*3] = (DCTELEM)
	673	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
	674	CONST_BITS+PASS1_BITS);
	675
	676	dataptr++; /* advance pointer to next column */
	677	}
	678	}
	679
	680
	681	/*
	682	* Perform the forward DCT on a 4x4 sample block.
	683	*/
	684
	685	GLOBAL(void)
	686	jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	687	{
	688	INT32 tmp0, tmp1;
	689	INT32 tmp10, tmp11;
	690	DCTELEM *dataptr;
	691	JSAMPROW elemptr;
	692	int ctr;
	693	SHIFT_TEMPS
	694
	695	/* Pre-zero output coefficient block. */
	696	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	697
	698	/* Pass 1: process rows. */
	699	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	700	/* furthermore, we scale the results by 2*PASS1_BITS. /
	701	/* We must also scale the output by (8/4)2 = 22, which we add here. */
	702	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
	703
	704	dataptr = data;
	705	for (ctr = 0; ctr < 4; ctr++) {
	706	elemptr = sample_data[ctr] + start_col;
	707
	708	/* Even part */
	709
	710	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
	711	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
	712
	713	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
	714	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
	715
	716	/* Apply unsigned->signed conversion */
	717	dataptr[0] = (DCTELEM)
	718	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
	719	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
	720
	721	/* Odd part */
	722
	723	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	724	/* Add fudge factor here for final descale. */
	725	tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
	726
	727	dataptr[1] = (DCTELEM)
	728	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	729	CONST_BITS-PASS1_BITS-2);
	730	dataptr[3] = (DCTELEM)
	731	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	732	CONST_BITS-PASS1_BITS-2);
	733
	734	dataptr += DCTSIZE; /* advance pointer to next row */
	735	}
	736
	737	/* Pass 2: process columns.
	738	* We remove the PASS1_BITS scaling, but leave the results scaled up
	739	* by an overall factor of 8.
	740	*/
	741
	742	dataptr = data;
	743	for (ctr = 0; ctr < 4; ctr++) {
	744	/* Even part */
	745
	746	/* Add fudge factor here for final descale. */
	747	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
	748	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
	749
	750	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
	751	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
	752
	753	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
	754	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
	755
	756	/* Odd part */
	757
	758	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	759	/* Add fudge factor here for final descale. */
	760	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
	761
	762	dataptr[DCTSIZE*1] = (DCTELEM)
	763	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	764	CONST_BITS+PASS1_BITS);
	765	dataptr[DCTSIZE*3] = (DCTELEM)
	766	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	767	CONST_BITS+PASS1_BITS);
	768
	769	dataptr++; /* advance pointer to next column */
	770	}
	771	}
	772
	773
	774	/*
	775	* Perform the forward DCT on a 3x3 sample block.
	776	*/
	777
	778	GLOBAL(void)
	779	jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	780	{
	781	INT32 tmp0, tmp1, tmp2;
	782	DCTELEM *dataptr;
	783	JSAMPROW elemptr;
	784	int ctr;
	785	SHIFT_TEMPS
	786
	787	/* Pre-zero output coefficient block. */
	788	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	789
	790	/* Pass 1: process rows. */
	791	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	792	/* furthermore, we scale the results by 2*PASS1_BITS. /
	793	/* We scale the results further by 2*2 as part of output adaption /
	794	/* scaling for different DCT size. */
	795	/* cK represents sqrt(2) * cos(Kpi/6). /
	796
	797	dataptr = data;
	798	for (ctr = 0; ctr < 3; ctr++) {
	799	elemptr = sample_data[ctr] + start_col;
	800
	801	/* Even part */
	802
	803	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
	804	tmp1 = GETJSAMPLE(elemptr[1]);
	805
	806	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
	807
	808	/* Apply unsigned->signed conversion */
	809	dataptr[0] = (DCTELEM)
	810	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
	811	dataptr[2] = (DCTELEM)
	812	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
	813	CONST_BITS-PASS1_BITS-2);
	814
	815	/* Odd part */
	816
	817	dataptr[1] = (DCTELEM)
	818	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
	819	CONST_BITS-PASS1_BITS-2);
	820
	821	dataptr += DCTSIZE; /* advance pointer to next row */
	822	}
	823
	824	/* Pass 2: process columns.
	825	* We remove the PASS1_BITS scaling, but leave the results scaled up
	826	* by an overall factor of 8.
	827	* We must also scale the output by (8/3)**2 = 64/9, which we partially
	828	* fold into the constant multipliers (other part was done in pass 1):
	829	* cK now represents sqrt(2) * cos(Kpi/6) 16/9.
	830	*/
	831
	832	dataptr = data;
	833	for (ctr = 0; ctr < 3; ctr++) {
	834	/* Even part */
	835
	836	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
	837	tmp1 = dataptr[DCTSIZE*1];
	838
	839	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
	840
	841	dataptr[DCTSIZE*0] = (DCTELEM)
	842	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
	843	CONST_BITS+PASS1_BITS);
	844	dataptr[DCTSIZE*2] = (DCTELEM)
	845	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
	846	CONST_BITS+PASS1_BITS);
	847
	848	/* Odd part */
	849
	850	dataptr[DCTSIZE*1] = (DCTELEM)
	851	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
	852	CONST_BITS+PASS1_BITS);
	853
	854	dataptr++; /* advance pointer to next column */
	855	}
	856	}
	857
	858
	859	/*
	860	* Perform the forward DCT on a 2x2 sample block.
	861	*/
	862
	863	GLOBAL(void)
	864	jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	865	{
	866	INT32 tmp0, tmp1, tmp2, tmp3;
	867	JSAMPROW elemptr;
	868
	869	/* Pre-zero output coefficient block. */
	870	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	871
	872	/* Pass 1: process rows. */
	873	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	874
	875	/* Row 0 */
	876	elemptr = sample_data[0] + start_col;
	877
	878	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
	879	tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
	880
	881	/* Row 1 */
	882	elemptr = sample_data[1] + start_col;
	883
	884	tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
	885	tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
	886
	887	/* Pass 2: process columns.
	888	* We leave the results scaled up by an overall factor of 8.
	889	* We must also scale the output by (8/2)2 = 24.
	890	*/
	891
	892	/* Column 0 */
	893	/* Apply unsigned->signed conversion */
	894	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp2 - 4 CENTERJSAMPLE) << 4);
	895	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
	896
	897	/* Column 1 */
	898	data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
	899	data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
	900	}
	901
	902
	903	/*
	904	* Perform the forward DCT on a 1x1 sample block.
	905	*/
	906
	907	GLOBAL(void)
	908	jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	909	{
	910	/* Pre-zero output coefficient block. */
	911	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	912
	913	/* We leave the result scaled up by an overall factor of 8. */
	914	/* We must also scale the output by (8/1)2 = 26. */
	915	/* Apply unsigned->signed conversion */
	916	data[0] = (DCTELEM)
	917	((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
	918	}
	919
	920
	921	/*
	922	* Perform the forward DCT on a 9x9 sample block.
	923	*/
	924
	925	GLOBAL(void)
	926	jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	927	{
	928	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
	929	INT32 tmp10, tmp11, tmp12, tmp13;
	930	INT32 z1, z2;
	931	DCTELEM workspace[8];
	932	DCTELEM *dataptr;
	933	DCTELEM *wsptr;
	934	JSAMPROW elemptr;
	935	int ctr;
	936	SHIFT_TEMPS
	937
	938	/* Pass 1: process rows. */
	939	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	940	/* we scale the results further by 2 as part of output adaption */
	941	/* scaling for different DCT size. */
	942	/* cK represents sqrt(2) * cos(Kpi/18). /
	943
	944	dataptr = data;
	945	ctr = 0;
	946	for (;;) {
	947	elemptr = sample_data[ctr] + start_col;
	948
	949	/* Even part */
	950
	951	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
	952	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
	953	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
	954	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
	955	tmp4 = GETJSAMPLE(elemptr[4]);
	956
	957	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
	958	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
	959	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
	960	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
	961
	962	z1 = tmp0 + tmp2 + tmp3;
	963	z2 = tmp1 + tmp4;
	964	/* Apply unsigned->signed conversion */
	965	dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
	966	dataptr[6] = (DCTELEM)
	967	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
	968	CONST_BITS-1);
	969	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
	970	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
	971	dataptr[2] = (DCTELEM)
	972	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
	973	+ z1 + z2, CONST_BITS-1);
	974	dataptr[4] = (DCTELEM)
	975	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
	976	+ z1 - z2, CONST_BITS-1);
	977
	978	/* Odd part */
	979
	980	dataptr[3] = (DCTELEM)
	981	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
	982	CONST_BITS-1);
	983
	984	tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
	985	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
	986	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
	987
	988	dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
	989
	990	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
	991
	992	dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
	993	dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
	994
	995	ctr++;
	996
	997	if (ctr != DCTSIZE) {
	998	if (ctr == 9)
	999	break; /* Done. */
	1000	dataptr += DCTSIZE; /* advance pointer to next row */
	1001	} else
	1002	dataptr = workspace; /* switch pointer to extended workspace */
	1003	}
	1004
	1005	/* Pass 2: process columns.
	1006	* We leave the results scaled up by an overall factor of 8.
	1007	* We must also scale the output by (8/9)**2 = 64/81, which we partially
	1008	* fold into the constant multipliers and final/initial shifting:
	1009	* cK now represents sqrt(2) * cos(Kpi/18) 128/81.
	1010	*/
	1011
	1012	dataptr = data;
	1013	wsptr = workspace;
	1014	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1015	/* Even part */
	1016
	1017	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE0];
	1018	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE7];
	1019	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE6];
	1020	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE5];
	1021	tmp4 = dataptr[DCTSIZE*4];
	1022
	1023	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE0];
	1024	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE7];
	1025	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE6];
	1026	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE5];
	1027
	1028	z1 = tmp0 + tmp2 + tmp3;
	1029	z2 = tmp1 + tmp4;
	1030	dataptr[DCTSIZE*0] = (DCTELEM)
	1031	DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
	1032	CONST_BITS+2);
	1033	dataptr[DCTSIZE*6] = (DCTELEM)
	1034	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
	1035	CONST_BITS+2);
	1036	z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
	1037	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
	1038	dataptr[DCTSIZE*2] = (DCTELEM)
	1039	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
	1040	+ z1 + z2, CONST_BITS+2);
	1041	dataptr[DCTSIZE*4] = (DCTELEM)
	1042	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
	1043	+ z1 - z2, CONST_BITS+2);
	1044
	1045	/* Odd part */
	1046
	1047	dataptr[DCTSIZE*3] = (DCTELEM)
	1048	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
	1049	CONST_BITS+2);
	1050
	1051	tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
	1052	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
	1053	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
	1054
	1055	dataptr[DCTSIZE*1] = (DCTELEM)
	1056	DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
	1057
	1058	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
	1059
	1060	dataptr[DCTSIZE*5] = (DCTELEM)
	1061	DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
	1062	dataptr[DCTSIZE*7] = (DCTELEM)
	1063	DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
	1064
	1065	dataptr++; /* advance pointer to next column */
	1066	wsptr++; /* advance pointer to next column */
	1067	}
	1068	}
	1069
	1070
	1071	/*
	1072	* Perform the forward DCT on a 10x10 sample block.
	1073	*/
	1074
	1075	GLOBAL(void)
	1076	jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1077	{
	1078	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
	1079	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
	1080	DCTELEM workspace[8*2];
	1081	DCTELEM *dataptr;
	1082	DCTELEM *wsptr;
	1083	JSAMPROW elemptr;
	1084	int ctr;
	1085	SHIFT_TEMPS
	1086
	1087	/* Pass 1: process rows. */
	1088	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	1089	/* we scale the results further by 2 as part of output adaption */
	1090	/* scaling for different DCT size. */
	1091	/* cK represents sqrt(2) * cos(Kpi/20). /
	1092
	1093	dataptr = data;
	1094	ctr = 0;
	1095	for (;;) {
	1096	elemptr = sample_data[ctr] + start_col;
	1097
	1098	/* Even part */
	1099
	1100	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
	1101	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
	1102	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
	1103	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
	1104	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
	1105
	1106	tmp10 = tmp0 + tmp4;
	1107	tmp13 = tmp0 - tmp4;
	1108	tmp11 = tmp1 + tmp3;
	1109	tmp14 = tmp1 - tmp3;
	1110
	1111	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
	1112	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
	1113	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
	1114	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
	1115	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
	1116
	1117	/* Apply unsigned->signed conversion */
	1118	dataptr[0] = (DCTELEM)
	1119	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
	1120	tmp12 += tmp12;
	1121	dataptr[4] = (DCTELEM)
	1122	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
	1123	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
	1124	CONST_BITS-1);
	1125	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
	1126	dataptr[2] = (DCTELEM)
	1127	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
	1128	CONST_BITS-1);
	1129	dataptr[6] = (DCTELEM)
	1130	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
	1131	CONST_BITS-1);
	1132
	1133	/* Odd part */
	1134
	1135	tmp10 = tmp0 + tmp4;
	1136	tmp11 = tmp1 - tmp3;
	1137	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
	1138	tmp2 <<= CONST_BITS;
	1139	dataptr[1] = (DCTELEM)
	1140	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
	1141	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
	1142	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
	1143	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
	1144	CONST_BITS-1);
	1145	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
	1146	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
	1147	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
	1148	(tmp11 << (CONST_BITS - 1)) - tmp2;
	1149	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
	1150	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
	1151
	1152	ctr++;
	1153
	1154	if (ctr != DCTSIZE) {
	1155	if (ctr == 10)
	1156	break; /* Done. */
	1157	dataptr += DCTSIZE; /* advance pointer to next row */
	1158	} else
	1159	dataptr = workspace; /* switch pointer to extended workspace */
	1160	}
	1161
	1162	/* Pass 2: process columns.
	1163	* We leave the results scaled up by an overall factor of 8.
	1164	* We must also scale the output by (8/10)**2 = 16/25, which we partially
	1165	* fold into the constant multipliers and final/initial shifting:
	1166	* cK now represents sqrt(2) * cos(Kpi/20) 32/25.
	1167	*/
	1168
	1169	dataptr = data;
	1170	wsptr = workspace;
	1171	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1172	/* Even part */
	1173
	1174	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
	1175	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
	1176	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
	1177	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
	1178	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
	1179
	1180	tmp10 = tmp0 + tmp4;
	1181	tmp13 = tmp0 - tmp4;
	1182	tmp11 = tmp1 + tmp3;
	1183	tmp14 = tmp1 - tmp3;
	1184
	1185	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
	1186	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
	1187	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
	1188	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
	1189	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
	1190
	1191	dataptr[DCTSIZE*0] = (DCTELEM)
	1192	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
	1193	CONST_BITS+2);
	1194	tmp12 += tmp12;
	1195	dataptr[DCTSIZE*4] = (DCTELEM)
	1196	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
	1197	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
	1198	CONST_BITS+2);
	1199	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
	1200	dataptr[DCTSIZE*2] = (DCTELEM)
	1201	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
	1202	CONST_BITS+2);
	1203	dataptr[DCTSIZE*6] = (DCTELEM)
	1204	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
	1205	CONST_BITS+2);
	1206
	1207	/* Odd part */
	1208
	1209	tmp10 = tmp0 + tmp4;
	1210	tmp11 = tmp1 - tmp3;
	1211	dataptr[DCTSIZE*5] = (DCTELEM)
	1212	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
	1213	CONST_BITS+2);
	1214	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
	1215	dataptr[DCTSIZE*1] = (DCTELEM)
	1216	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
	1217	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
	1218	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
	1219	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
	1220	CONST_BITS+2);
	1221	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
	1222	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
	1223	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
	1224	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
	1225	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
	1226	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
	1227
	1228	dataptr++; /* advance pointer to next column */
	1229	wsptr++; /* advance pointer to next column */
	1230	}
	1231	}
	1232
	1233
	1234	/*
	1235	* Perform the forward DCT on an 11x11 sample block.
	1236	*/
	1237
	1238	GLOBAL(void)
	1239	jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1240	{
	1241	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
	1242	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
	1243	INT32 z1, z2, z3;
	1244	DCTELEM workspace[8*3];
	1245	DCTELEM *dataptr;
	1246	DCTELEM *wsptr;
	1247	JSAMPROW elemptr;
	1248	int ctr;
	1249	SHIFT_TEMPS
	1250
	1251	/* Pass 1: process rows. */
	1252	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	1253	/* we scale the results further by 2 as part of output adaption */
	1254	/* scaling for different DCT size. */
	1255	/* cK represents sqrt(2) * cos(Kpi/22). /
	1256
	1257	dataptr = data;
	1258	ctr = 0;
	1259	for (;;) {
	1260	elemptr = sample_data[ctr] + start_col;
	1261
	1262	/* Even part */
	1263
	1264	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
	1265	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
	1266	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
	1267	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
	1268	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
	1269	tmp5 = GETJSAMPLE(elemptr[5]);
	1270
	1271	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
	1272	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
	1273	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
	1274	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
	1275	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
	1276
	1277	/* Apply unsigned->signed conversion */
	1278	dataptr[0] = (DCTELEM)
	1279	((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
	1280	tmp5 += tmp5;
	1281	tmp0 -= tmp5;
	1282	tmp1 -= tmp5;
	1283	tmp2 -= tmp5;
	1284	tmp3 -= tmp5;
	1285	tmp4 -= tmp5;
	1286	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
	1287	MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
	1288	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
	1289	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
	1290	dataptr[2] = (DCTELEM)
	1291	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
	1292	- MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
	1293	CONST_BITS-1);
	1294	dataptr[4] = (DCTELEM)
	1295	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
	1296	- MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
	1297	+ MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
	1298	CONST_BITS-1);
	1299	dataptr[6] = (DCTELEM)
	1300	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
	1301	- MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
	1302	CONST_BITS-1);
	1303
	1304	/* Odd part */
	1305
	1306	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
	1307	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
	1308	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
	1309	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
	1310	+ MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
	1311	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
	1312	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
	1313	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
	1314	- MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
	1315	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
	1316	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
	1317	+ MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
	1318	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
	1319	- MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
	1320
	1321	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
	1322	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
	1323	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
	1324	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
	1325
	1326	ctr++;
	1327
	1328	if (ctr != DCTSIZE) {
	1329	if (ctr == 11)
	1330	break; /* Done. */
	1331	dataptr += DCTSIZE; /* advance pointer to next row */
	1332	} else
	1333	dataptr = workspace; /* switch pointer to extended workspace */
	1334	}
	1335
	1336	/* Pass 2: process columns.
	1337	* We leave the results scaled up by an overall factor of 8.
	1338	* We must also scale the output by (8/11)**2 = 64/121, which we partially
	1339	* fold into the constant multipliers and final/initial shifting:
	1340	* cK now represents sqrt(2) * cos(Kpi/22) 128/121.
	1341	*/
	1342
	1343	dataptr = data;
	1344	wsptr = workspace;
	1345	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1346	/* Even part */
	1347
	1348	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE2];
	1349	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE1];
	1350	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE0];
	1351	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE7];
	1352	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE6];
	1353	tmp5 = dataptr[DCTSIZE*5];
	1354
	1355	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE2];
	1356	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE1];
	1357	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE0];
	1358	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE7];
	1359	tmp14 = dataptr[DCTSIZE4] - dataptr[DCTSIZE6];
	1360
	1361	dataptr[DCTSIZE*0] = (DCTELEM)
	1362	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
	1363	FIX(1.057851240)), /* 128/121 */
	1364	CONST_BITS+2);
	1365	tmp5 += tmp5;
	1366	tmp0 -= tmp5;
	1367	tmp1 -= tmp5;
	1368	tmp2 -= tmp5;
	1369	tmp3 -= tmp5;
	1370	tmp4 -= tmp5;
	1371	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
	1372	MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
	1373	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
	1374	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
	1375	dataptr[DCTSIZE*2] = (DCTELEM)
	1376	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
	1377	- MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
	1378	CONST_BITS+2);
	1379	dataptr[DCTSIZE*4] = (DCTELEM)
	1380	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
	1381	- MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
	1382	+ MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
	1383	CONST_BITS+2);
	1384	dataptr[DCTSIZE*6] = (DCTELEM)
	1385	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
	1386	- MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
	1387	CONST_BITS+2);
	1388
	1389	/* Odd part */
	1390
	1391	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
	1392	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
	1393	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
	1394	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
	1395	+ MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
	1396	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
	1397	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
	1398	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
	1399	- MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
	1400	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
	1401	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
	1402	+ MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
	1403	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
	1404	- MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
	1405
	1406	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
	1407	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
	1408	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
	1409	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
	1410
	1411	dataptr++; /* advance pointer to next column */
	1412	wsptr++; /* advance pointer to next column */
	1413	}
	1414	}
	1415
	1416
	1417	/*
	1418	* Perform the forward DCT on a 12x12 sample block.
	1419	*/
	1420
	1421	GLOBAL(void)
	1422	jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1423	{
	1424	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
	1425	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
	1426	DCTELEM workspace[8*4];
	1427	DCTELEM *dataptr;
	1428	DCTELEM *wsptr;
	1429	JSAMPROW elemptr;
	1430	int ctr;
	1431	SHIFT_TEMPS
	1432
	1433	/* Pass 1: process rows. */
	1434	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	1435	/* cK represents sqrt(2) * cos(Kpi/24). /
	1436
	1437	dataptr = data;
	1438	ctr = 0;
	1439	for (;;) {
	1440	elemptr = sample_data[ctr] + start_col;
	1441
	1442	/* Even part */
	1443
	1444	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
	1445	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
	1446	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
	1447	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
	1448	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
	1449	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
	1450
	1451	tmp10 = tmp0 + tmp5;
	1452	tmp13 = tmp0 - tmp5;
	1453	tmp11 = tmp1 + tmp4;
	1454	tmp14 = tmp1 - tmp4;
	1455	tmp12 = tmp2 + tmp3;
	1456	tmp15 = tmp2 - tmp3;
	1457
	1458	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
	1459	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
	1460	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
	1461	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
	1462	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
	1463	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
	1464
	1465	/* Apply unsigned->signed conversion */
	1466	dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
	1467	dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
	1468	dataptr[4] = (DCTELEM)
	1469	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
	1470	CONST_BITS);
	1471	dataptr[2] = (DCTELEM)
	1472	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
	1473	CONST_BITS);
	1474
	1475	/* Odd part */
	1476
	1477	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
	1478	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
	1479	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
	1480	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
	1481	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
	1482	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
	1483	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
	1484	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
	1485	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
	1486	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
	1487	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
	1488	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
	1489	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
	1490	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
	1491
	1492	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
	1493	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
	1494	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
	1495	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
	1496
	1497	ctr++;
	1498
	1499	if (ctr != DCTSIZE) {
	1500	if (ctr == 12)
	1501	break; /* Done. */
	1502	dataptr += DCTSIZE; /* advance pointer to next row */
	1503	} else
	1504	dataptr = workspace; /* switch pointer to extended workspace */
	1505	}
	1506
	1507	/* Pass 2: process columns.
	1508	* We leave the results scaled up by an overall factor of 8.
	1509	* We must also scale the output by (8/12)**2 = 4/9, which we partially
	1510	* fold into the constant multipliers and final shifting:
	1511	* cK now represents sqrt(2) * cos(Kpi/24) 8/9.
	1512	*/
	1513
	1514	dataptr = data;
	1515	wsptr = workspace;
	1516	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1517	/* Even part */
	1518
	1519	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
	1520	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
	1521	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
	1522	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
	1523	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
	1524	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
	1525
	1526	tmp10 = tmp0 + tmp5;
	1527	tmp13 = tmp0 - tmp5;
	1528	tmp11 = tmp1 + tmp4;
	1529	tmp14 = tmp1 - tmp4;
	1530	tmp12 = tmp2 + tmp3;
	1531	tmp15 = tmp2 - tmp3;
	1532
	1533	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
	1534	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
	1535	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
	1536	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
	1537	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
	1538	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
	1539
	1540	dataptr[DCTSIZE*0] = (DCTELEM)
	1541	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
	1542	CONST_BITS+1);
	1543	dataptr[DCTSIZE*6] = (DCTELEM)
	1544	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
	1545	CONST_BITS+1);
	1546	dataptr[DCTSIZE*4] = (DCTELEM)
	1547	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
	1548	CONST_BITS+1);
	1549	dataptr[DCTSIZE*2] = (DCTELEM)
	1550	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
	1551	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
	1552	CONST_BITS+1);
	1553
	1554	/* Odd part */
	1555
	1556	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
	1557	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
	1558	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
	1559	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
	1560	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
	1561	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
	1562	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
	1563	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
	1564	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
	1565	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
	1566	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
	1567	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
	1568	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
	1569	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
	1570
	1571	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
	1572	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
	1573	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
	1574	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
	1575
	1576	dataptr++; /* advance pointer to next column */
	1577	wsptr++; /* advance pointer to next column */
	1578	}
	1579	}
	1580
	1581
	1582	/*
	1583	* Perform the forward DCT on a 13x13 sample block.
	1584	*/
	1585
	1586	GLOBAL(void)
	1587	jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1588	{
	1589	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	1590	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
	1591	INT32 z1, z2;
	1592	DCTELEM workspace[8*5];
	1593	DCTELEM *dataptr;
	1594	DCTELEM *wsptr;
	1595	JSAMPROW elemptr;
	1596	int ctr;
	1597	SHIFT_TEMPS
	1598
	1599	/* Pass 1: process rows. */
	1600	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	1601	/* cK represents sqrt(2) * cos(K*pi/26). */
	1602
	1603	dataptr = data;
	1604	ctr = 0;
	1605	for (;;) {
	1606	elemptr = sample_data[ctr] + start_col;
	1607
	1608	/* Even part */
	1609
	1610	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
	1611	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
	1612	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
	1613	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
	1614	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
	1615	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
	1616	tmp6 = GETJSAMPLE(elemptr[6]);
	1617
	1618	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
	1619	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
	1620	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
	1621	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
	1622	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
	1623	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
	1624
	1625	/* Apply unsigned->signed conversion */
	1626	dataptr[0] = (DCTELEM)
	1627	(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
	1628	tmp6 += tmp6;
	1629	tmp0 -= tmp6;
	1630	tmp1 -= tmp6;
	1631	tmp2 -= tmp6;
	1632	tmp3 -= tmp6;
	1633	tmp4 -= tmp6;
	1634	tmp5 -= tmp6;
	1635	dataptr[2] = (DCTELEM)
	1636	DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
	1637	MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
	1638	MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
	1639	MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
	1640	MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
	1641	MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
	1642	CONST_BITS);
	1643	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
	1644	MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
	1645	MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
	1646	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
	1647	MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
	1648	MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
	1649
	1650	dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
	1651	dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
	1652
	1653	/* Odd part */
	1654
	1655	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
	1656	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
	1657	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
	1658	MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
	1659	tmp0 = tmp1 + tmp2 + tmp3 -
	1660	MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
	1661	MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
	1662	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
	1663	MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
	1664	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
	1665	tmp1 += tmp4 + tmp5 +
	1666	MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
	1667	MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
	1668	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
	1669	tmp2 += tmp4 + tmp6 -
	1670	MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
	1671	MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
	1672	tmp3 += tmp5 + tmp6 +
	1673	MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
	1674	MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
	1675
	1676	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
	1677	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
	1678	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
	1679	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
	1680
	1681	ctr++;
	1682
	1683	if (ctr != DCTSIZE) {
	1684	if (ctr == 13)
	1685	break; /* Done. */
	1686	dataptr += DCTSIZE; /* advance pointer to next row */
	1687	} else
	1688	dataptr = workspace; /* switch pointer to extended workspace */
	1689	}
	1690
	1691	/* Pass 2: process columns.
	1692	* We leave the results scaled up by an overall factor of 8.
	1693	* We must also scale the output by (8/13)**2 = 64/169, which we partially
	1694	* fold into the constant multipliers and final shifting:
	1695	* cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
	1696	*/
	1697
	1698	dataptr = data;
	1699	wsptr = workspace;
	1700	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1701	/* Even part */
	1702
	1703	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE4];
	1704	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE3];
	1705	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE2];
	1706	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE1];
	1707	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE0];
	1708	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE7];
	1709	tmp6 = dataptr[DCTSIZE*6];
	1710
	1711	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE4];
	1712	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE3];
	1713	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE2];
	1714	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE1];
	1715	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE0];
	1716	tmp15 = dataptr[DCTSIZE5] - dataptr[DCTSIZE7];
	1717
	1718	dataptr[DCTSIZE*0] = (DCTELEM)
	1719	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
	1720	FIX(0.757396450)), /* 128/169 */
	1721	CONST_BITS+1);
	1722	tmp6 += tmp6;
	1723	tmp0 -= tmp6;
	1724	tmp1 -= tmp6;
	1725	tmp2 -= tmp6;
	1726	tmp3 -= tmp6;
	1727	tmp4 -= tmp6;
	1728	tmp5 -= tmp6;
	1729	dataptr[DCTSIZE*2] = (DCTELEM)
	1730	DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
	1731	MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
	1732	MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
	1733	MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
	1734	MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
	1735	MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
	1736	CONST_BITS+1);
	1737	z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
	1738	MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
	1739	MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
	1740	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
	1741	MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
	1742	MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
	1743
	1744	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
	1745	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
	1746
	1747	/* Odd part */
	1748
	1749	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
	1750	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
	1751	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
	1752	MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
	1753	tmp0 = tmp1 + tmp2 + tmp3 -
	1754	MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
	1755	MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
	1756	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
	1757	MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
	1758	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
	1759	tmp1 += tmp4 + tmp5 +
	1760	MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
	1761	MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
	1762	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
	1763	tmp2 += tmp4 + tmp6 -
	1764	MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
	1765	MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
	1766	tmp3 += tmp5 + tmp6 +
	1767	MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
	1768	MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
	1769
	1770	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
	1771	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
	1772	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
	1773	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
	1774
	1775	dataptr++; /* advance pointer to next column */
	1776	wsptr++; /* advance pointer to next column */
	1777	}
	1778	}
	1779
	1780
	1781	/*
	1782	* Perform the forward DCT on a 14x14 sample block.
	1783	*/
	1784
	1785	GLOBAL(void)
	1786	jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1787	{
	1788	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	1789	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
	1790	DCTELEM workspace[8*6];
	1791	DCTELEM *dataptr;
	1792	DCTELEM *wsptr;
	1793	JSAMPROW elemptr;
	1794	int ctr;
	1795	SHIFT_TEMPS
	1796
	1797	/* Pass 1: process rows. */
	1798	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	1799	/* cK represents sqrt(2) * cos(Kpi/28). /
	1800
	1801	dataptr = data;
	1802	ctr = 0;
	1803	for (;;) {
	1804	elemptr = sample_data[ctr] + start_col;
	1805
	1806	/* Even part */
	1807
	1808	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
	1809	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
	1810	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
	1811	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
	1812	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
	1813	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
	1814	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
	1815
	1816	tmp10 = tmp0 + tmp6;
	1817	tmp14 = tmp0 - tmp6;
	1818	tmp11 = tmp1 + tmp5;
	1819	tmp15 = tmp1 - tmp5;
	1820	tmp12 = tmp2 + tmp4;
	1821	tmp16 = tmp2 - tmp4;
	1822
	1823	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
	1824	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
	1825	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
	1826	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
	1827	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
	1828	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
	1829	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
	1830
	1831	/* Apply unsigned->signed conversion */
	1832	dataptr[0] = (DCTELEM)
	1833	(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
	1834	tmp13 += tmp13;
	1835	dataptr[4] = (DCTELEM)
	1836	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
	1837	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
	1838	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
	1839	CONST_BITS);
	1840
	1841	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
	1842
	1843	dataptr[2] = (DCTELEM)
	1844	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
	1845	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
	1846	CONST_BITS);
	1847	dataptr[6] = (DCTELEM)
	1848	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
	1849	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
	1850	CONST_BITS);
	1851
	1852	/* Odd part */
	1853
	1854	tmp10 = tmp1 + tmp2;
	1855	tmp11 = tmp5 - tmp4;
	1856	dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
	1857	tmp3 <<= CONST_BITS;
	1858	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
	1859	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
	1860	tmp10 += tmp11 - tmp3;
	1861	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
	1862	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
	1863	dataptr[5] = (DCTELEM)
	1864	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
	1865	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
	1866	CONST_BITS);
	1867	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
	1868	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
	1869	dataptr[3] = (DCTELEM)
	1870	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
	1871	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
	1872	CONST_BITS);
	1873	dataptr[1] = (DCTELEM)
	1874	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
	1875	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
	1876	CONST_BITS);
	1877
	1878	ctr++;
	1879
	1880	if (ctr != DCTSIZE) {
	1881	if (ctr == 14)
	1882	break; /* Done. */
	1883	dataptr += DCTSIZE; /* advance pointer to next row */
	1884	} else
	1885	dataptr = workspace; /* switch pointer to extended workspace */
	1886	}
	1887
	1888	/* Pass 2: process columns.
	1889	* We leave the results scaled up by an overall factor of 8.
	1890	* We must also scale the output by (8/14)**2 = 16/49, which we partially
	1891	* fold into the constant multipliers and final shifting:
	1892	* cK now represents sqrt(2) * cos(Kpi/28) 32/49.
	1893	*/
	1894
	1895	dataptr = data;
	1896	wsptr = workspace;
	1897	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	1898	/* Even part */
	1899
	1900	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
	1901	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
	1902	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
	1903	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
	1904	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
	1905	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
	1906	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
	1907
	1908	tmp10 = tmp0 + tmp6;
	1909	tmp14 = tmp0 - tmp6;
	1910	tmp11 = tmp1 + tmp5;
	1911	tmp15 = tmp1 - tmp5;
	1912	tmp12 = tmp2 + tmp4;
	1913	tmp16 = tmp2 - tmp4;
	1914
	1915	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
	1916	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
	1917	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
	1918	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
	1919	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
	1920	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
	1921	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
	1922
	1923	dataptr[DCTSIZE*0] = (DCTELEM)
	1924	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
	1925	FIX(0.653061224)), /* 32/49 */
	1926	CONST_BITS+1);
	1927	tmp13 += tmp13;
	1928	dataptr[DCTSIZE*4] = (DCTELEM)
	1929	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
	1930	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
	1931	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
	1932	CONST_BITS+1);
	1933
	1934	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
	1935
	1936	dataptr[DCTSIZE*2] = (DCTELEM)
	1937	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
	1938	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
	1939	CONST_BITS+1);
	1940	dataptr[DCTSIZE*6] = (DCTELEM)
	1941	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
	1942	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
	1943	CONST_BITS+1);
	1944
	1945	/* Odd part */
	1946
	1947	tmp10 = tmp1 + tmp2;
	1948	tmp11 = tmp5 - tmp4;
	1949	dataptr[DCTSIZE*7] = (DCTELEM)
	1950	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
	1951	FIX(0.653061224)), /* 32/49 */
	1952	CONST_BITS+1);
	1953	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
	1954	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
	1955	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
	1956	tmp10 += tmp11 - tmp3;
	1957	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
	1958	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
	1959	dataptr[DCTSIZE*5] = (DCTELEM)
	1960	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
	1961	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
	1962	CONST_BITS+1);
	1963	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
	1964	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
	1965	dataptr[DCTSIZE*3] = (DCTELEM)
	1966	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
	1967	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
	1968	CONST_BITS+1);
	1969	dataptr[DCTSIZE*1] = (DCTELEM)
	1970	DESCALE(tmp11 + tmp12 + tmp3
	1971	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
	1972	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
	1973	CONST_BITS+1);
	1974
	1975	dataptr++; /* advance pointer to next column */
	1976	wsptr++; /* advance pointer to next column */
	1977	}
	1978	}
	1979
	1980
	1981	/*
	1982	* Perform the forward DCT on a 15x15 sample block.
	1983	*/
	1984
	1985	GLOBAL(void)
	1986	jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	1987	{
	1988	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	1989	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
	1990	INT32 z1, z2, z3;
	1991	DCTELEM workspace[8*7];
	1992	DCTELEM *dataptr;
	1993	DCTELEM *wsptr;
	1994	JSAMPROW elemptr;
	1995	int ctr;
	1996	SHIFT_TEMPS
	1997
	1998	/* Pass 1: process rows. */
	1999	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	2000	/* cK represents sqrt(2) * cos(Kpi/30). /
	2001
	2002	dataptr = data;
	2003	ctr = 0;
	2004	for (;;) {
	2005	elemptr = sample_data[ctr] + start_col;
	2006
	2007	/* Even part */
	2008
	2009	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
	2010	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
	2011	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
	2012	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
	2013	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
	2014	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
	2015	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
	2016	tmp7 = GETJSAMPLE(elemptr[7]);
	2017
	2018	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
	2019	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
	2020	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
	2021	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
	2022	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
	2023	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
	2024	tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
	2025
	2026	z1 = tmp0 + tmp4 + tmp5;
	2027	z2 = tmp1 + tmp3 + tmp6;
	2028	z3 = tmp2 + tmp7;
	2029	/* Apply unsigned->signed conversion */
	2030	dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
	2031	z3 += z3;
	2032	dataptr[6] = (DCTELEM)
	2033	DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
	2034	MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
	2035	CONST_BITS);
	2036	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
	2037	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
	2038	MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
	2039	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
	2040	MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
	2041	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
	2042	MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
	2043	MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
	2044
	2045	dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
	2046	dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
	2047
	2048	/* Odd part */
	2049
	2050	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
	2051	FIX(1.224744871)); /* c5 */
	2052	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
	2053	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
	2054	tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
	2055	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
	2056	MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
	2057	MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
	2058	tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
	2059	MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
	2060	MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
	2061	tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
	2062	MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
	2063	MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
	2064
	2065	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
	2066	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
	2067	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
	2068	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
	2069
	2070	ctr++;
	2071
	2072	if (ctr != DCTSIZE) {
	2073	if (ctr == 15)
	2074	break; /* Done. */
	2075	dataptr += DCTSIZE; /* advance pointer to next row */
	2076	} else
	2077	dataptr = workspace; /* switch pointer to extended workspace */
	2078	}
	2079
	2080	/* Pass 2: process columns.
	2081	* We leave the results scaled up by an overall factor of 8.
	2082	* We must also scale the output by (8/15)**2 = 64/225, which we partially
	2083	* fold into the constant multipliers and final shifting:
	2084	* cK now represents sqrt(2) * cos(Kpi/30) 256/225.
	2085	*/
	2086
	2087	dataptr = data;
	2088	wsptr = workspace;
	2089	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	2090	/* Even part */
	2091
	2092	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE6];
	2093	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE5];
	2094	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE4];
	2095	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE3];
	2096	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE2];
	2097	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE1];
	2098	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE0];
	2099	tmp7 = dataptr[DCTSIZE*7];
	2100
	2101	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE6];
	2102	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE5];
	2103	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE4];
	2104	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE3];
	2105	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE2];
	2106	tmp15 = dataptr[DCTSIZE5] - wsptr[DCTSIZE1];
	2107	tmp16 = dataptr[DCTSIZE6] - wsptr[DCTSIZE0];
	2108
	2109	z1 = tmp0 + tmp4 + tmp5;
	2110	z2 = tmp1 + tmp3 + tmp6;
	2111	z3 = tmp2 + tmp7;
	2112	dataptr[DCTSIZE*0] = (DCTELEM)
	2113	DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
	2114	CONST_BITS+2);
	2115	z3 += z3;
	2116	dataptr[DCTSIZE*6] = (DCTELEM)
	2117	DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
	2118	MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
	2119	CONST_BITS+2);
	2120	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
	2121	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
	2122	MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
	2123	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
	2124	MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
	2125	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
	2126	MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
	2127	MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
	2128
	2129	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
	2130	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
	2131
	2132	/* Odd part */
	2133
	2134	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
	2135	FIX(1.393487498)); /* c5 */
	2136	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
	2137	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
	2138	tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
	2139	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
	2140	MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
	2141	MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
	2142	tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
	2143	MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
	2144	MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
	2145	tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
	2146	MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
	2147	MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
	2148
	2149	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
	2150	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
	2151	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
	2152	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
	2153
	2154	dataptr++; /* advance pointer to next column */
	2155	wsptr++; /* advance pointer to next column */
	2156	}
	2157	}
	2158
	2159
	2160	/*
	2161	* Perform the forward DCT on a 16x16 sample block.
	2162	*/
	2163
	2164	GLOBAL(void)
	2165	jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2166	{
	2167	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	2168	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
	2169	DCTELEM workspace[DCTSIZE2];
	2170	DCTELEM *dataptr;
	2171	DCTELEM *wsptr;
	2172	JSAMPROW elemptr;
	2173	int ctr;
	2174	SHIFT_TEMPS
	2175
	2176	/* Pass 1: process rows. */
	2177	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	2178	/* furthermore, we scale the results by 2*PASS1_BITS. /
	2179	/* cK represents sqrt(2) * cos(Kpi/32). /
	2180
	2181	dataptr = data;
	2182	ctr = 0;
	2183	for (;;) {
	2184	elemptr = sample_data[ctr] + start_col;
	2185
	2186	/* Even part */
	2187
	2188	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
	2189	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
	2190	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
	2191	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
	2192	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
	2193	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
	2194	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
	2195	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
	2196
	2197	tmp10 = tmp0 + tmp7;
	2198	tmp14 = tmp0 - tmp7;
	2199	tmp11 = tmp1 + tmp6;
	2200	tmp15 = tmp1 - tmp6;
	2201	tmp12 = tmp2 + tmp5;
	2202	tmp16 = tmp2 - tmp5;
	2203	tmp13 = tmp3 + tmp4;
	2204	tmp17 = tmp3 - tmp4;
	2205
	2206	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
	2207	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
	2208	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
	2209	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
	2210	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
	2211	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
	2212	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
	2213	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
	2214
	2215	/* Apply unsigned->signed conversion */
	2216	dataptr[0] = (DCTELEM)
	2217	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
	2218	dataptr[4] = (DCTELEM)
	2219	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	2220	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
	2221	CONST_BITS-PASS1_BITS);
	2222
	2223	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
	2224	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
	2225
	2226	dataptr[2] = (DCTELEM)
	2227	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
	2228	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
	2229	CONST_BITS-PASS1_BITS);
	2230	dataptr[6] = (DCTELEM)
	2231	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
	2232	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
	2233	CONST_BITS-PASS1_BITS);
	2234
	2235	/* Odd part */
	2236
	2237	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
	2238	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
	2239	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
	2240	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
	2241	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
	2242	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
	2243	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
	2244	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
	2245	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
	2246	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
	2247	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
	2248	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
	2249	tmp10 = tmp11 + tmp12 + tmp13 -
	2250	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
	2251	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
	2252	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	2253	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
	2254	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	2255	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
	2256	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	2257	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
	2258
	2259	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
	2260	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
	2261	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
	2262	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
	2263
	2264	ctr++;
	2265
	2266	if (ctr != DCTSIZE) {
	2267	if (ctr == DCTSIZE * 2)
	2268	break; /* Done. */
	2269	dataptr += DCTSIZE; /* advance pointer to next row */
	2270	} else
	2271	dataptr = workspace; /* switch pointer to extended workspace */
	2272	}
	2273
	2274	/* Pass 2: process columns.
	2275	* We remove the PASS1_BITS scaling, but leave the results scaled up
	2276	* by an overall factor of 8.
	2277	* We must also scale the output by (8/16)2 = 1/22.
	2278	*/
	2279
	2280	dataptr = data;
	2281	wsptr = workspace;
	2282	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	2283	/* Even part */
	2284
	2285	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
	2286	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
	2287	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
	2288	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
	2289	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
	2290	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
	2291	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
	2292	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
	2293
	2294	tmp10 = tmp0 + tmp7;
	2295	tmp14 = tmp0 - tmp7;
	2296	tmp11 = tmp1 + tmp6;
	2297	tmp15 = tmp1 - tmp6;
	2298	tmp12 = tmp2 + tmp5;
	2299	tmp16 = tmp2 - tmp5;
	2300	tmp13 = tmp3 + tmp4;
	2301	tmp17 = tmp3 - tmp4;
	2302
	2303	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
	2304	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
	2305	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
	2306	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
	2307	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
	2308	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
	2309	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
	2310	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
	2311
	2312	dataptr[DCTSIZE*0] = (DCTELEM)
	2313	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
	2314	dataptr[DCTSIZE*4] = (DCTELEM)
	2315	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	2316	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
	2317	CONST_BITS+PASS1_BITS+2);
	2318
	2319	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
	2320	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
	2321
	2322	dataptr[DCTSIZE*2] = (DCTELEM)
	2323	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
	2324	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
	2325	CONST_BITS+PASS1_BITS+2);
	2326	dataptr[DCTSIZE*6] = (DCTELEM)
	2327	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
	2328	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
	2329	CONST_BITS+PASS1_BITS+2);
	2330
	2331	/* Odd part */
	2332
	2333	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
	2334	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
	2335	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
	2336	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
	2337	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
	2338	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
	2339	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
	2340	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
	2341	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
	2342	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
	2343	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
	2344	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
	2345	tmp10 = tmp11 + tmp12 + tmp13 -
	2346	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
	2347	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
	2348	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	2349	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
	2350	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	2351	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
	2352	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	2353	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
	2354
	2355	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
	2356	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
	2357	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
	2358	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
	2359
	2360	dataptr++; /* advance pointer to next column */
	2361	wsptr++; /* advance pointer to next column */
	2362	}
	2363	}
	2364
	2365
	2366	/*
	2367	* Perform the forward DCT on a 16x8 sample block.
	2368	*
	2369	* 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
	2370	*/
	2371
	2372	GLOBAL(void)
	2373	jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2374	{
	2375	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	2376	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
	2377	INT32 z1;
	2378	DCTELEM *dataptr;
	2379	JSAMPROW elemptr;
	2380	int ctr;
	2381	SHIFT_TEMPS
	2382
	2383	/* Pass 1: process rows. */
	2384	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	2385	/* furthermore, we scale the results by 2**PASS1_BITS. */
	2386	/* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
	2387
	2388	dataptr = data;
	2389	ctr = 0;
	2390	for (ctr = 0; ctr < DCTSIZE; ctr++) {
	2391	elemptr = sample_data[ctr] + start_col;
	2392
	2393	/* Even part */
	2394
	2395	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
	2396	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
	2397	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
	2398	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
	2399	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
	2400	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
	2401	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
	2402	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
	2403
	2404	tmp10 = tmp0 + tmp7;
	2405	tmp14 = tmp0 - tmp7;
	2406	tmp11 = tmp1 + tmp6;
	2407	tmp15 = tmp1 - tmp6;
	2408	tmp12 = tmp2 + tmp5;
	2409	tmp16 = tmp2 - tmp5;
	2410	tmp13 = tmp3 + tmp4;
	2411	tmp17 = tmp3 - tmp4;
	2412
	2413	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
	2414	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
	2415	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
	2416	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
	2417	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
	2418	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
	2419	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
	2420	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
	2421
	2422	/* Apply unsigned->signed conversion */
	2423	dataptr[0] = (DCTELEM)
	2424	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
	2425	dataptr[4] = (DCTELEM)
	2426	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	2427	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
	2428	CONST_BITS-PASS1_BITS);
	2429
	2430	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
	2431	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
	2432
	2433	dataptr[2] = (DCTELEM)
	2434	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
	2435	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
	2436	CONST_BITS-PASS1_BITS);
	2437	dataptr[6] = (DCTELEM)
	2438	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
	2439	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
	2440	CONST_BITS-PASS1_BITS);
	2441
	2442	/* Odd part */
	2443
	2444	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
	2445	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
	2446	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
	2447	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
	2448	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
	2449	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
	2450	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
	2451	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
	2452	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
	2453	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
	2454	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
	2455	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
	2456	tmp10 = tmp11 + tmp12 + tmp13 -
	2457	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
	2458	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
	2459	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	2460	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
	2461	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	2462	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
	2463	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	2464	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
	2465
	2466	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
	2467	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
	2468	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
	2469	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
	2470
	2471	dataptr += DCTSIZE; /* advance pointer to next row */
	2472	}
	2473
	2474	/* Pass 2: process columns.
	2475	* We remove the PASS1_BITS scaling, but leave the results scaled up
	2476	* by an overall factor of 8.
	2477	* We must also scale the output by 8/16 = 1/2.
	2478	*/
	2479
	2480	dataptr = data;
	2481	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
[2]	2482	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	2483	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	2484	*/
[846]	2485
	2486	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
	2487	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
	2488	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
	2489	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
	2490
[2]	2491	tmp10 = tmp0 + tmp3;
[846]	2492	tmp12 = tmp0 - tmp3;
[2]	2493	tmp11 = tmp1 + tmp2;
[846]	2494	tmp13 = tmp1 - tmp2;
	2495
	2496	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
	2497	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
	2498	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
	2499	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
	2500
	2501	dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
	2502	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
	2503
[2]	2504	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
[846]	2505	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
	2506	CONST_BITS+PASS1_BITS+1);
	2507	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
	2508	CONST_BITS+PASS1_BITS+1);
	2509
[2]	2510	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
[846]	2511	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
	2512	* i0..i3 in the paper are tmp0..tmp3 here.
[2]	2513	*/
[846]	2514
	2515	tmp10 = tmp0 + tmp3;
	2516	tmp11 = tmp1 + tmp2;
	2517	tmp12 = tmp0 + tmp2;
	2518	tmp13 = tmp1 + tmp3;
	2519	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	2520
	2521	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	2522	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	2523	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	2524	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	2525	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	2526	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	2527	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	2528	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	2529
	2530	tmp12 += z1;
	2531	tmp13 += z1;
	2532
	2533	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
	2534	CONST_BITS+PASS1_BITS+1);
	2535	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
	2536	CONST_BITS+PASS1_BITS+1);
	2537	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
	2538	CONST_BITS+PASS1_BITS+1);
	2539	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
	2540	CONST_BITS+PASS1_BITS+1);
	2541
[2]	2542	dataptr++; /* advance pointer to next column */
	2543	}
	2544	}
	2545
[846]	2546
	2547	/*
	2548	* Perform the forward DCT on a 14x7 sample block.
	2549	*
	2550	* 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
	2551	*/
	2552
	2553	GLOBAL(void)
	2554	jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2555	{
	2556	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	2557	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
	2558	INT32 z1, z2, z3;
	2559	DCTELEM *dataptr;
	2560	JSAMPROW elemptr;
	2561	int ctr;
	2562	SHIFT_TEMPS
	2563
	2564	/* Zero bottom row of output coefficient block. */
	2565	MEMZERO(&data[DCTSIZE7], SIZEOF(DCTELEM) DCTSIZE);
	2566
	2567	/* Pass 1: process rows. */
	2568	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	2569	/* furthermore, we scale the results by 2*PASS1_BITS. /
	2570	/* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28). /
	2571
	2572	dataptr = data;
	2573	for (ctr = 0; ctr < 7; ctr++) {
	2574	elemptr = sample_data[ctr] + start_col;
	2575
	2576	/* Even part */
	2577
	2578	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
	2579	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
	2580	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
	2581	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
	2582	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
	2583	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
	2584	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
	2585
	2586	tmp10 = tmp0 + tmp6;
	2587	tmp14 = tmp0 - tmp6;
	2588	tmp11 = tmp1 + tmp5;
	2589	tmp15 = tmp1 - tmp5;
	2590	tmp12 = tmp2 + tmp4;
	2591	tmp16 = tmp2 - tmp4;
	2592
	2593	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
	2594	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
	2595	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
	2596	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
	2597	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
	2598	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
	2599	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
	2600
	2601	/* Apply unsigned->signed conversion */
	2602	dataptr[0] = (DCTELEM)
	2603	((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
	2604	tmp13 += tmp13;
	2605	dataptr[4] = (DCTELEM)
	2606	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
	2607	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
	2608	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
	2609	CONST_BITS-PASS1_BITS);
	2610
	2611	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
	2612
	2613	dataptr[2] = (DCTELEM)
	2614	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
	2615	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
	2616	CONST_BITS-PASS1_BITS);
	2617	dataptr[6] = (DCTELEM)
	2618	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
	2619	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
	2620	CONST_BITS-PASS1_BITS);
	2621
	2622	/* Odd part */
	2623
	2624	tmp10 = tmp1 + tmp2;
	2625	tmp11 = tmp5 - tmp4;
	2626	dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
	2627	tmp3 <<= CONST_BITS;
	2628	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
	2629	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
	2630	tmp10 += tmp11 - tmp3;
	2631	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
	2632	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
	2633	dataptr[5] = (DCTELEM)
	2634	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
	2635	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
	2636	CONST_BITS-PASS1_BITS);
	2637	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
	2638	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
	2639	dataptr[3] = (DCTELEM)
	2640	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
	2641	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
	2642	CONST_BITS-PASS1_BITS);
	2643	dataptr[1] = (DCTELEM)
	2644	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
	2645	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
	2646	CONST_BITS-PASS1_BITS);
	2647
	2648	dataptr += DCTSIZE; /* advance pointer to next row */
	2649	}
	2650
	2651	/* Pass 2: process columns.
	2652	* We remove the PASS1_BITS scaling, but leave the results scaled up
	2653	* by an overall factor of 8.
	2654	* We must also scale the output by (8/14)*(8/7) = 32/49, which we
	2655	* partially fold into the constant multipliers and final shifting:
	2656	* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14) 64/49.
	2657	*/
	2658
	2659	dataptr = data;
	2660	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	2661	/* Even part */
	2662
	2663	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
	2664	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
	2665	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
	2666	tmp3 = dataptr[DCTSIZE*3];
	2667
	2668	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
	2669	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
	2670	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
	2671
	2672	z1 = tmp0 + tmp2;
	2673	dataptr[DCTSIZE*0] = (DCTELEM)
	2674	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
	2675	CONST_BITS+PASS1_BITS+1);
	2676	tmp3 += tmp3;
	2677	z1 -= tmp3;
	2678	z1 -= tmp3;
	2679	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
	2680	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
	2681	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
	2682	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
	2683	z1 -= z2;
	2684	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
	2685	dataptr[DCTSIZE*4] = (DCTELEM)
	2686	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
	2687	CONST_BITS+PASS1_BITS+1);
	2688	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
	2689
	2690	/* Odd part */
	2691
	2692	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
	2693	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
	2694	tmp0 = tmp1 - tmp2;
	2695	tmp1 += tmp2;
	2696	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
	2697	tmp1 += tmp2;
	2698	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
	2699	tmp0 += tmp3;
	2700	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
	2701
	2702	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
	2703	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
	2704	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
	2705
	2706	dataptr++; /* advance pointer to next column */
	2707	}
	2708	}
	2709
	2710
	2711	/*
	2712	* Perform the forward DCT on a 12x6 sample block.
	2713	*
	2714	* 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
	2715	*/
	2716
	2717	GLOBAL(void)
	2718	jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2719	{
	2720	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
	2721	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
	2722	DCTELEM *dataptr;
	2723	JSAMPROW elemptr;
	2724	int ctr;
	2725	SHIFT_TEMPS
	2726
	2727	/* Zero 2 bottom rows of output coefficient block. */
	2728	MEMZERO(&data[DCTSIZE6], SIZEOF(DCTELEM) DCTSIZE * 2);
	2729
	2730	/* Pass 1: process rows. */
	2731	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	2732	/* furthermore, we scale the results by 2*PASS1_BITS. /
	2733	/* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24). /
	2734
	2735	dataptr = data;
	2736	for (ctr = 0; ctr < 6; ctr++) {
	2737	elemptr = sample_data[ctr] + start_col;
	2738
	2739	/* Even part */
	2740
	2741	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
	2742	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
	2743	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
	2744	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
	2745	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
	2746	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
	2747
	2748	tmp10 = tmp0 + tmp5;
	2749	tmp13 = tmp0 - tmp5;
	2750	tmp11 = tmp1 + tmp4;
	2751	tmp14 = tmp1 - tmp4;
	2752	tmp12 = tmp2 + tmp3;
	2753	tmp15 = tmp2 - tmp3;
	2754
	2755	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
	2756	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
	2757	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
	2758	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
	2759	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
	2760	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
	2761
	2762	/* Apply unsigned->signed conversion */
	2763	dataptr[0] = (DCTELEM)
	2764	((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
	2765	dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
	2766	dataptr[4] = (DCTELEM)
	2767	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
	2768	CONST_BITS-PASS1_BITS);
	2769	dataptr[2] = (DCTELEM)
	2770	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
	2771	CONST_BITS-PASS1_BITS);
	2772
	2773	/* Odd part */
	2774
	2775	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
	2776	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
	2777	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
	2778	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
	2779	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
	2780	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
	2781	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
	2782	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
	2783	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
	2784	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
	2785	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
	2786	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
	2787	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
	2788	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
	2789
	2790	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
	2791	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
	2792	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
	2793	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
	2794
	2795	dataptr += DCTSIZE; /* advance pointer to next row */
	2796	}
	2797
	2798	/* Pass 2: process columns.
	2799	* We remove the PASS1_BITS scaling, but leave the results scaled up
	2800	* by an overall factor of 8.
	2801	* We must also scale the output by (8/12)*(8/6) = 8/9, which we
	2802	* partially fold into the constant multipliers and final shifting:
	2803	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
	2804	*/
	2805
	2806	dataptr = data;
	2807	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	2808	/* Even part */
	2809
	2810	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
	2811	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
	2812	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
	2813
	2814	tmp10 = tmp0 + tmp2;
	2815	tmp12 = tmp0 - tmp2;
	2816
	2817	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
	2818	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
	2819	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
	2820
	2821	dataptr[DCTSIZE*0] = (DCTELEM)
	2822	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
	2823	CONST_BITS+PASS1_BITS+1);
	2824	dataptr[DCTSIZE*2] = (DCTELEM)
	2825	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
	2826	CONST_BITS+PASS1_BITS+1);
	2827	dataptr[DCTSIZE*4] = (DCTELEM)
	2828	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	2829	CONST_BITS+PASS1_BITS+1);
	2830
	2831	/* Odd part */
	2832
	2833	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
	2834
	2835	dataptr[DCTSIZE*1] = (DCTELEM)
	2836	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
	2837	CONST_BITS+PASS1_BITS+1);
	2838	dataptr[DCTSIZE*3] = (DCTELEM)
	2839	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
	2840	CONST_BITS+PASS1_BITS+1);
	2841	dataptr[DCTSIZE*5] = (DCTELEM)
	2842	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
	2843	CONST_BITS+PASS1_BITS+1);
	2844
	2845	dataptr++; /* advance pointer to next column */
	2846	}
	2847	}
	2848
	2849
	2850	/*
	2851	* Perform the forward DCT on a 10x5 sample block.
	2852	*
	2853	* 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
	2854	*/
	2855
	2856	GLOBAL(void)
	2857	jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2858	{
	2859	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
	2860	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
	2861	DCTELEM *dataptr;
	2862	JSAMPROW elemptr;
	2863	int ctr;
	2864	SHIFT_TEMPS
	2865
	2866	/* Zero 3 bottom rows of output coefficient block. */
	2867	MEMZERO(&data[DCTSIZE5], SIZEOF(DCTELEM) DCTSIZE * 3);
	2868
	2869	/* Pass 1: process rows. */
	2870	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	2871	/* furthermore, we scale the results by 2*PASS1_BITS. /
	2872	/* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20). /
	2873
	2874	dataptr = data;
	2875	for (ctr = 0; ctr < 5; ctr++) {
	2876	elemptr = sample_data[ctr] + start_col;
	2877
	2878	/* Even part */
	2879
	2880	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
	2881	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
	2882	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
	2883	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
	2884	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
	2885
	2886	tmp10 = tmp0 + tmp4;
	2887	tmp13 = tmp0 - tmp4;
	2888	tmp11 = tmp1 + tmp3;
	2889	tmp14 = tmp1 - tmp3;
	2890
	2891	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
	2892	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
	2893	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
	2894	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
	2895	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
	2896
	2897	/* Apply unsigned->signed conversion */
	2898	dataptr[0] = (DCTELEM)
	2899	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
	2900	tmp12 += tmp12;
	2901	dataptr[4] = (DCTELEM)
	2902	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
	2903	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
	2904	CONST_BITS-PASS1_BITS);
	2905	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
	2906	dataptr[2] = (DCTELEM)
	2907	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
	2908	CONST_BITS-PASS1_BITS);
	2909	dataptr[6] = (DCTELEM)
	2910	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
	2911	CONST_BITS-PASS1_BITS);
	2912
	2913	/* Odd part */
	2914
	2915	tmp10 = tmp0 + tmp4;
	2916	tmp11 = tmp1 - tmp3;
	2917	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
	2918	tmp2 <<= CONST_BITS;
	2919	dataptr[1] = (DCTELEM)
	2920	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
	2921	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
	2922	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
	2923	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
	2924	CONST_BITS-PASS1_BITS);
	2925	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
	2926	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
	2927	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
	2928	(tmp11 << (CONST_BITS - 1)) - tmp2;
	2929	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
	2930	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
	2931
	2932	dataptr += DCTSIZE; /* advance pointer to next row */
	2933	}
	2934
	2935	/* Pass 2: process columns.
	2936	* We remove the PASS1_BITS scaling, but leave the results scaled up
	2937	* by an overall factor of 8.
	2938	* We must also scale the output by (8/10)*(8/5) = 32/25, which we
	2939	* fold into the constant multipliers:
	2940	* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10) 32/25.
	2941	*/
	2942
	2943	dataptr = data;
	2944	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	2945	/* Even part */
	2946
	2947	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
	2948	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
	2949	tmp2 = dataptr[DCTSIZE*2];
	2950
	2951	tmp10 = tmp0 + tmp1;
	2952	tmp11 = tmp0 - tmp1;
	2953
	2954	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
	2955	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
	2956
	2957	dataptr[DCTSIZE*0] = (DCTELEM)
	2958	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
	2959	CONST_BITS+PASS1_BITS);
	2960	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
	2961	tmp10 -= tmp2 << 2;
	2962	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
	2963	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
	2964	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
	2965
	2966	/* Odd part */
	2967
	2968	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
	2969
	2970	dataptr[DCTSIZE*1] = (DCTELEM)
	2971	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
	2972	CONST_BITS+PASS1_BITS);
	2973	dataptr[DCTSIZE*3] = (DCTELEM)
	2974	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
	2975	CONST_BITS+PASS1_BITS);
	2976
	2977	dataptr++; /* advance pointer to next column */
	2978	}
	2979	}
	2980
	2981
	2982	/*
	2983	* Perform the forward DCT on an 8x4 sample block.
	2984	*
	2985	* 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
	2986	*/
	2987
	2988	GLOBAL(void)
	2989	jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	2990	{
	2991	INT32 tmp0, tmp1, tmp2, tmp3;
	2992	INT32 tmp10, tmp11, tmp12, tmp13;
	2993	INT32 z1;
	2994	DCTELEM *dataptr;
	2995	JSAMPROW elemptr;
	2996	int ctr;
	2997	SHIFT_TEMPS
	2998
	2999	/* Zero 4 bottom rows of output coefficient block. */
	3000	MEMZERO(&data[DCTSIZE4], SIZEOF(DCTELEM) DCTSIZE * 4);
	3001
	3002	/* Pass 1: process rows. */
	3003	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3004	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3005	/* We must also scale the output by 8/4 = 2, which we add here. */
	3006
	3007	dataptr = data;
	3008	for (ctr = 0; ctr < 4; ctr++) {
	3009	elemptr = sample_data[ctr] + start_col;
	3010
	3011	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	3012	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	3013	*/
	3014
	3015	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
	3016	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
	3017	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
	3018	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
	3019
	3020	tmp10 = tmp0 + tmp3;
	3021	tmp12 = tmp0 - tmp3;
	3022	tmp11 = tmp1 + tmp2;
	3023	tmp13 = tmp1 - tmp2;
	3024
	3025	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
	3026	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
	3027	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
	3028	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
	3029
	3030	/* Apply unsigned->signed conversion */
	3031	dataptr[0] = (DCTELEM)
	3032	((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
	3033	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
	3034
	3035	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
	3036	/* Add fudge factor here for final descale. */
	3037	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
	3038	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
	3039	CONST_BITS-PASS1_BITS-1);
	3040	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
	3041	CONST_BITS-PASS1_BITS-1);
	3042
	3043	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
	3044	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
	3045	* i0..i3 in the paper are tmp0..tmp3 here.
	3046	*/
	3047
	3048	tmp10 = tmp0 + tmp3;
	3049	tmp11 = tmp1 + tmp2;
	3050	tmp12 = tmp0 + tmp2;
	3051	tmp13 = tmp1 + tmp3;
	3052	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	3053	/* Add fudge factor here for final descale. */
	3054	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
	3055
	3056	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	3057	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	3058	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	3059	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	3060	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	3061	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	3062	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	3063	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	3064
	3065	tmp12 += z1;
	3066	tmp13 += z1;
	3067
	3068	dataptr[1] = (DCTELEM)
	3069	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
	3070	dataptr[3] = (DCTELEM)
	3071	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
	3072	dataptr[5] = (DCTELEM)
	3073	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
	3074	dataptr[7] = (DCTELEM)
	3075	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
	3076
	3077	dataptr += DCTSIZE; /* advance pointer to next row */
	3078	}
	3079
	3080	/* Pass 2: process columns.
	3081	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3082	* by an overall factor of 8.
	3083	* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
	3084	*/
	3085
	3086	dataptr = data;
	3087	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	3088	/* Even part */
	3089
	3090	/* Add fudge factor here for final descale. */
	3091	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
	3092	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
	3093
	3094	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
	3095	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
	3096
	3097	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
	3098	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
	3099
	3100	/* Odd part */
	3101
	3102	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	3103	/* Add fudge factor here for final descale. */
	3104	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
	3105
	3106	dataptr[DCTSIZE*1] = (DCTELEM)
	3107	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	3108	CONST_BITS+PASS1_BITS);
	3109	dataptr[DCTSIZE*3] = (DCTELEM)
	3110	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	3111	CONST_BITS+PASS1_BITS);
	3112
	3113	dataptr++; /* advance pointer to next column */
	3114	}
	3115	}
	3116
	3117
	3118	/*
	3119	* Perform the forward DCT on a 6x3 sample block.
	3120	*
	3121	* 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
	3122	*/
	3123
	3124	GLOBAL(void)
	3125	jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3126	{
	3127	INT32 tmp0, tmp1, tmp2;
	3128	INT32 tmp10, tmp11, tmp12;
	3129	DCTELEM *dataptr;
	3130	JSAMPROW elemptr;
	3131	int ctr;
	3132	SHIFT_TEMPS
	3133
	3134	/* Pre-zero output coefficient block. */
	3135	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3136
	3137	/* Pass 1: process rows. */
	3138	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3139	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3140	/* We scale the results further by 2 as part of output adaption */
	3141	/* scaling for different DCT size. */
	3142	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
	3143
	3144	dataptr = data;
	3145	for (ctr = 0; ctr < 3; ctr++) {
	3146	elemptr = sample_data[ctr] + start_col;
	3147
	3148	/* Even part */
	3149
	3150	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
	3151	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
	3152	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
	3153
	3154	tmp10 = tmp0 + tmp2;
	3155	tmp12 = tmp0 - tmp2;
	3156
	3157	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
	3158	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
	3159	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
	3160
	3161	/* Apply unsigned->signed conversion */
	3162	dataptr[0] = (DCTELEM)
	3163	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
	3164	dataptr[2] = (DCTELEM)
	3165	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
	3166	CONST_BITS-PASS1_BITS-1);
	3167	dataptr[4] = (DCTELEM)
	3168	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	3169	CONST_BITS-PASS1_BITS-1);
	3170
	3171	/* Odd part */
	3172
	3173	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
	3174	CONST_BITS-PASS1_BITS-1);
	3175
	3176	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
	3177	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
	3178	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
	3179
	3180	dataptr += DCTSIZE; /* advance pointer to next row */
	3181	}
	3182
	3183	/* Pass 2: process columns.
	3184	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3185	* by an overall factor of 8.
	3186	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
	3187	* fold into the constant multipliers (other part was done in pass 1):
	3188	* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6) 16/9.
	3189	*/
	3190
	3191	dataptr = data;
	3192	for (ctr = 0; ctr < 6; ctr++) {
	3193	/* Even part */
	3194
	3195	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
	3196	tmp1 = dataptr[DCTSIZE*1];
	3197
	3198	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
	3199
	3200	dataptr[DCTSIZE*0] = (DCTELEM)
	3201	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
	3202	CONST_BITS+PASS1_BITS);
	3203	dataptr[DCTSIZE*2] = (DCTELEM)
	3204	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
	3205	CONST_BITS+PASS1_BITS);
	3206
	3207	/* Odd part */
	3208
	3209	dataptr[DCTSIZE*1] = (DCTELEM)
	3210	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
	3211	CONST_BITS+PASS1_BITS);
	3212
	3213	dataptr++; /* advance pointer to next column */
	3214	}
	3215	}
	3216
	3217
	3218	/*
	3219	* Perform the forward DCT on a 4x2 sample block.
	3220	*
	3221	* 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
	3222	*/
	3223
	3224	GLOBAL(void)
	3225	jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3226	{
	3227	INT32 tmp0, tmp1;
	3228	INT32 tmp10, tmp11;
	3229	DCTELEM *dataptr;
	3230	JSAMPROW elemptr;
	3231	int ctr;
	3232	SHIFT_TEMPS
	3233
	3234	/* Pre-zero output coefficient block. */
	3235	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3236
	3237	/* Pass 1: process rows. */
	3238	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3239	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3240	/* We must also scale the output by (8/4)(8/2) = 23, which we add here. /
	3241	/* 4-point FDCT kernel, */
	3242	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
	3243
	3244	dataptr = data;
	3245	for (ctr = 0; ctr < 2; ctr++) {
	3246	elemptr = sample_data[ctr] + start_col;
	3247
	3248	/* Even part */
	3249
	3250	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
	3251	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
	3252
	3253	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
	3254	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
	3255
	3256	/* Apply unsigned->signed conversion */
	3257	dataptr[0] = (DCTELEM)
	3258	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
	3259	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
	3260
	3261	/* Odd part */
	3262
	3263	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	3264	/* Add fudge factor here for final descale. */
	3265	tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
	3266
	3267	dataptr[1] = (DCTELEM)
	3268	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	3269	CONST_BITS-PASS1_BITS-3);
	3270	dataptr[3] = (DCTELEM)
	3271	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	3272	CONST_BITS-PASS1_BITS-3);
	3273
	3274	dataptr += DCTSIZE; /* advance pointer to next row */
	3275	}
	3276
	3277	/* Pass 2: process columns.
	3278	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3279	* by an overall factor of 8.
	3280	*/
	3281
	3282	dataptr = data;
	3283	for (ctr = 0; ctr < 4; ctr++) {
	3284	/* Even part */
	3285
	3286	/* Add fudge factor here for final descale. */
	3287	tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
	3288	tmp1 = dataptr[DCTSIZE*1];
	3289
	3290	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
	3291
	3292	/* Odd part */
	3293
	3294	dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
	3295
	3296	dataptr++; /* advance pointer to next column */
	3297	}
	3298	}
	3299
	3300
	3301	/*
	3302	* Perform the forward DCT on a 2x1 sample block.
	3303	*
	3304	* 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
	3305	*/
	3306
	3307	GLOBAL(void)
	3308	jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3309	{
	3310	INT32 tmp0, tmp1;
	3311	JSAMPROW elemptr;
	3312
	3313	/* Pre-zero output coefficient block. */
	3314	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3315
	3316	elemptr = sample_data[0] + start_col;
	3317
	3318	tmp0 = GETJSAMPLE(elemptr[0]);
	3319	tmp1 = GETJSAMPLE(elemptr[1]);
	3320
	3321	/* We leave the results scaled up by an overall factor of 8.
	3322	* We must also scale the output by (8/2)(8/1) = 2*5.
	3323	*/
	3324
	3325	/* Even part */
	3326	/* Apply unsigned->signed conversion */
	3327	data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
	3328
	3329	/* Odd part */
	3330	data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
	3331	}
	3332
	3333
	3334	/*
	3335	* Perform the forward DCT on an 8x16 sample block.
	3336	*
	3337	* 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
	3338	*/
	3339
	3340	GLOBAL(void)
	3341	jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3342	{
	3343	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	3344	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
	3345	INT32 z1;
	3346	DCTELEM workspace[DCTSIZE2];
	3347	DCTELEM *dataptr;
	3348	DCTELEM *wsptr;
	3349	JSAMPROW elemptr;
	3350	int ctr;
	3351	SHIFT_TEMPS
	3352
	3353	/* Pass 1: process rows. */
	3354	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3355	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3356
	3357	dataptr = data;
	3358	ctr = 0;
	3359	for (;;) {
	3360	elemptr = sample_data[ctr] + start_col;
	3361
	3362	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	3363	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	3364	*/
	3365
	3366	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
	3367	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
	3368	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
	3369	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
	3370
	3371	tmp10 = tmp0 + tmp3;
	3372	tmp12 = tmp0 - tmp3;
	3373	tmp11 = tmp1 + tmp2;
	3374	tmp13 = tmp1 - tmp2;
	3375
	3376	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
	3377	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
	3378	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
	3379	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
	3380
	3381	/* Apply unsigned->signed conversion */
	3382	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
	3383	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
	3384
	3385	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
	3386	dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
	3387	CONST_BITS-PASS1_BITS);
	3388	dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
	3389	CONST_BITS-PASS1_BITS);
	3390
	3391	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
	3392	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
	3393	* i0..i3 in the paper are tmp0..tmp3 here.
	3394	*/
	3395
	3396	tmp10 = tmp0 + tmp3;
	3397	tmp11 = tmp1 + tmp2;
	3398	tmp12 = tmp0 + tmp2;
	3399	tmp13 = tmp1 + tmp3;
	3400	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	3401
	3402	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	3403	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	3404	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	3405	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	3406	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	3407	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	3408	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	3409	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	3410
	3411	tmp12 += z1;
	3412	tmp13 += z1;
	3413
	3414	dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
	3415	dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
	3416	dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
	3417	dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
	3418
	3419	ctr++;
	3420
	3421	if (ctr != DCTSIZE) {
	3422	if (ctr == DCTSIZE * 2)
	3423	break; /* Done. */
	3424	dataptr += DCTSIZE; /* advance pointer to next row */
	3425	} else
	3426	dataptr = workspace; /* switch pointer to extended workspace */
	3427	}
	3428
	3429	/* Pass 2: process columns.
	3430	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3431	* by an overall factor of 8.
	3432	* We must also scale the output by 8/16 = 1/2.
	3433	* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
	3434	*/
	3435
	3436	dataptr = data;
	3437	wsptr = workspace;
	3438	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
	3439	/* Even part */
	3440
	3441	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
	3442	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
	3443	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
	3444	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
	3445	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
	3446	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
	3447	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
	3448	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
	3449
	3450	tmp10 = tmp0 + tmp7;
	3451	tmp14 = tmp0 - tmp7;
	3452	tmp11 = tmp1 + tmp6;
	3453	tmp15 = tmp1 - tmp6;
	3454	tmp12 = tmp2 + tmp5;
	3455	tmp16 = tmp2 - tmp5;
	3456	tmp13 = tmp3 + tmp4;
	3457	tmp17 = tmp3 - tmp4;
	3458
	3459	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
	3460	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
	3461	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
	3462	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
	3463	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
	3464	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
	3465	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
	3466	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
	3467
	3468	dataptr[DCTSIZE*0] = (DCTELEM)
	3469	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
	3470	dataptr[DCTSIZE*4] = (DCTELEM)
	3471	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	3472	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
	3473	CONST_BITS+PASS1_BITS+1);
	3474
	3475	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
	3476	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
	3477
	3478	dataptr[DCTSIZE*2] = (DCTELEM)
	3479	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
	3480	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
	3481	CONST_BITS+PASS1_BITS+1);
	3482	dataptr[DCTSIZE*6] = (DCTELEM)
	3483	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
	3484	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
	3485	CONST_BITS+PASS1_BITS+1);
	3486
	3487	/* Odd part */
	3488
	3489	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
	3490	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
	3491	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
	3492	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
	3493	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
	3494	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
	3495	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
	3496	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
	3497	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
	3498	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
	3499	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
	3500	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
	3501	tmp10 = tmp11 + tmp12 + tmp13 -
	3502	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
	3503	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
	3504	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	3505	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
	3506	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	3507	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
	3508	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	3509	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
	3510
	3511	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
	3512	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
	3513	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
	3514	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
	3515
	3516	dataptr++; /* advance pointer to next column */
	3517	wsptr++; /* advance pointer to next column */
	3518	}
	3519	}
	3520
	3521
	3522	/*
	3523	* Perform the forward DCT on a 7x14 sample block.
	3524	*
	3525	* 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
	3526	*/
	3527
	3528	GLOBAL(void)
	3529	jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3530	{
	3531	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	3532	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
	3533	INT32 z1, z2, z3;
	3534	DCTELEM workspace[8*6];
	3535	DCTELEM *dataptr;
	3536	DCTELEM *wsptr;
	3537	JSAMPROW elemptr;
	3538	int ctr;
	3539	SHIFT_TEMPS
	3540
	3541	/* Pre-zero output coefficient block. */
	3542	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3543
	3544	/* Pass 1: process rows. */
	3545	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3546	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3547	/* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14). /
	3548
	3549	dataptr = data;
	3550	ctr = 0;
	3551	for (;;) {
	3552	elemptr = sample_data[ctr] + start_col;
	3553
	3554	/* Even part */
	3555
	3556	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
	3557	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
	3558	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
	3559	tmp3 = GETJSAMPLE(elemptr[3]);
	3560
	3561	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
	3562	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
	3563	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
	3564
	3565	z1 = tmp0 + tmp2;
	3566	/* Apply unsigned->signed conversion */
	3567	dataptr[0] = (DCTELEM)
	3568	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
	3569	tmp3 += tmp3;
	3570	z1 -= tmp3;
	3571	z1 -= tmp3;
	3572	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
	3573	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
	3574	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
	3575	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
	3576	z1 -= z2;
	3577	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
	3578	dataptr[4] = (DCTELEM)
	3579	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
	3580	CONST_BITS-PASS1_BITS);
	3581	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
	3582
	3583	/* Odd part */
	3584
	3585	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
	3586	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
	3587	tmp0 = tmp1 - tmp2;
	3588	tmp1 += tmp2;
	3589	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
	3590	tmp1 += tmp2;
	3591	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
	3592	tmp0 += tmp3;
	3593	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
	3594
	3595	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
	3596	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
	3597	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
	3598
	3599	ctr++;
	3600
	3601	if (ctr != DCTSIZE) {
	3602	if (ctr == 14)
	3603	break; /* Done. */
	3604	dataptr += DCTSIZE; /* advance pointer to next row */
	3605	} else
	3606	dataptr = workspace; /* switch pointer to extended workspace */
	3607	}
	3608
	3609	/* Pass 2: process columns.
	3610	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3611	* by an overall factor of 8.
	3612	* We must also scale the output by (8/7)*(8/14) = 32/49, which we
	3613	* fold into the constant multipliers:
	3614	* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28) 32/49.
	3615	*/
	3616
	3617	dataptr = data;
	3618	wsptr = workspace;
	3619	for (ctr = 0; ctr < 7; ctr++) {
	3620	/* Even part */
	3621
	3622	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
	3623	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
	3624	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
	3625	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
	3626	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
	3627	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
	3628	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
	3629
	3630	tmp10 = tmp0 + tmp6;
	3631	tmp14 = tmp0 - tmp6;
	3632	tmp11 = tmp1 + tmp5;
	3633	tmp15 = tmp1 - tmp5;
	3634	tmp12 = tmp2 + tmp4;
	3635	tmp16 = tmp2 - tmp4;
	3636
	3637	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
	3638	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
	3639	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
	3640	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
	3641	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
	3642	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
	3643	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
	3644
	3645	dataptr[DCTSIZE*0] = (DCTELEM)
	3646	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
	3647	FIX(0.653061224)), /* 32/49 */
	3648	CONST_BITS+PASS1_BITS);
	3649	tmp13 += tmp13;
	3650	dataptr[DCTSIZE*4] = (DCTELEM)
	3651	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
	3652	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
	3653	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
	3654	CONST_BITS+PASS1_BITS);
	3655
	3656	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
	3657
	3658	dataptr[DCTSIZE*2] = (DCTELEM)
	3659	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
	3660	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
	3661	CONST_BITS+PASS1_BITS);
	3662	dataptr[DCTSIZE*6] = (DCTELEM)
	3663	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
	3664	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
	3665	CONST_BITS+PASS1_BITS);
	3666
	3667	/* Odd part */
	3668
	3669	tmp10 = tmp1 + tmp2;
	3670	tmp11 = tmp5 - tmp4;
	3671	dataptr[DCTSIZE*7] = (DCTELEM)
	3672	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
	3673	FIX(0.653061224)), /* 32/49 */
	3674	CONST_BITS+PASS1_BITS);
	3675	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
	3676	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
	3677	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
	3678	tmp10 += tmp11 - tmp3;
	3679	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
	3680	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
	3681	dataptr[DCTSIZE*5] = (DCTELEM)
	3682	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
	3683	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
	3684	CONST_BITS+PASS1_BITS);
	3685	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
	3686	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
	3687	dataptr[DCTSIZE*3] = (DCTELEM)
	3688	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
	3689	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
	3690	CONST_BITS+PASS1_BITS);
	3691	dataptr[DCTSIZE*1] = (DCTELEM)
	3692	DESCALE(tmp11 + tmp12 + tmp3
	3693	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
	3694	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
	3695	CONST_BITS+PASS1_BITS);
	3696
	3697	dataptr++; /* advance pointer to next column */
	3698	wsptr++; /* advance pointer to next column */
	3699	}
	3700	}
	3701
	3702
	3703	/*
	3704	* Perform the forward DCT on a 6x12 sample block.
	3705	*
	3706	* 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
	3707	*/
	3708
	3709	GLOBAL(void)
	3710	jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3711	{
	3712	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
	3713	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
	3714	DCTELEM workspace[8*4];
	3715	DCTELEM *dataptr;
	3716	DCTELEM *wsptr;
	3717	JSAMPROW elemptr;
	3718	int ctr;
	3719	SHIFT_TEMPS
	3720
	3721	/* Pre-zero output coefficient block. */
	3722	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3723
	3724	/* Pass 1: process rows. */
	3725	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3726	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3727	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
	3728
	3729	dataptr = data;
	3730	ctr = 0;
	3731	for (;;) {
	3732	elemptr = sample_data[ctr] + start_col;
	3733
	3734	/* Even part */
	3735
	3736	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
	3737	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
	3738	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
	3739
	3740	tmp10 = tmp0 + tmp2;
	3741	tmp12 = tmp0 - tmp2;
	3742
	3743	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
	3744	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
	3745	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
	3746
	3747	/* Apply unsigned->signed conversion */
	3748	dataptr[0] = (DCTELEM)
	3749	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
	3750	dataptr[2] = (DCTELEM)
	3751	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
	3752	CONST_BITS-PASS1_BITS);
	3753	dataptr[4] = (DCTELEM)
	3754	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	3755	CONST_BITS-PASS1_BITS);
	3756
	3757	/* Odd part */
	3758
	3759	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
	3760	CONST_BITS-PASS1_BITS);
	3761
	3762	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
	3763	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
	3764	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
	3765
	3766	ctr++;
	3767
	3768	if (ctr != DCTSIZE) {
	3769	if (ctr == 12)
	3770	break; /* Done. */
	3771	dataptr += DCTSIZE; /* advance pointer to next row */
	3772	} else
	3773	dataptr = workspace; /* switch pointer to extended workspace */
	3774	}
	3775
	3776	/* Pass 2: process columns.
	3777	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3778	* by an overall factor of 8.
	3779	* We must also scale the output by (8/6)*(8/12) = 8/9, which we
	3780	* fold into the constant multipliers:
	3781	* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24) 8/9.
	3782	*/
	3783
	3784	dataptr = data;
	3785	wsptr = workspace;
	3786	for (ctr = 0; ctr < 6; ctr++) {
	3787	/* Even part */
	3788
	3789	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
	3790	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
	3791	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
	3792	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
	3793	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
	3794	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
	3795
	3796	tmp10 = tmp0 + tmp5;
	3797	tmp13 = tmp0 - tmp5;
	3798	tmp11 = tmp1 + tmp4;
	3799	tmp14 = tmp1 - tmp4;
	3800	tmp12 = tmp2 + tmp3;
	3801	tmp15 = tmp2 - tmp3;
	3802
	3803	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
	3804	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
	3805	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
	3806	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
	3807	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
	3808	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
	3809
	3810	dataptr[DCTSIZE*0] = (DCTELEM)
	3811	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
	3812	CONST_BITS+PASS1_BITS);
	3813	dataptr[DCTSIZE*6] = (DCTELEM)
	3814	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
	3815	CONST_BITS+PASS1_BITS);
	3816	dataptr[DCTSIZE*4] = (DCTELEM)
	3817	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
	3818	CONST_BITS+PASS1_BITS);
	3819	dataptr[DCTSIZE*2] = (DCTELEM)
	3820	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
	3821	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
	3822	CONST_BITS+PASS1_BITS);
	3823
	3824	/* Odd part */
	3825
	3826	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
	3827	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
	3828	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
	3829	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
	3830	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
	3831	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
	3832	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
	3833	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
	3834	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
	3835	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
	3836	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
	3837	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
	3838	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
	3839	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
	3840
	3841	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
	3842	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
	3843	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
	3844	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
	3845
	3846	dataptr++; /* advance pointer to next column */
	3847	wsptr++; /* advance pointer to next column */
	3848	}
	3849	}
	3850
	3851
	3852	/*
	3853	* Perform the forward DCT on a 5x10 sample block.
	3854	*
	3855	* 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
	3856	*/
	3857
	3858	GLOBAL(void)
	3859	jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	3860	{
	3861	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
	3862	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
	3863	DCTELEM workspace[8*2];
	3864	DCTELEM *dataptr;
	3865	DCTELEM *wsptr;
	3866	JSAMPROW elemptr;
	3867	int ctr;
	3868	SHIFT_TEMPS
	3869
	3870	/* Pre-zero output coefficient block. */
	3871	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	3872
	3873	/* Pass 1: process rows. */
	3874	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	3875	/* furthermore, we scale the results by 2*PASS1_BITS. /
	3876	/* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10). /
	3877
	3878	dataptr = data;
	3879	ctr = 0;
	3880	for (;;) {
	3881	elemptr = sample_data[ctr] + start_col;
	3882
	3883	/* Even part */
	3884
	3885	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
	3886	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
	3887	tmp2 = GETJSAMPLE(elemptr[2]);
	3888
	3889	tmp10 = tmp0 + tmp1;
	3890	tmp11 = tmp0 - tmp1;
	3891
	3892	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
	3893	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
	3894
	3895	/* Apply unsigned->signed conversion */
	3896	dataptr[0] = (DCTELEM)
	3897	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
	3898	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
	3899	tmp10 -= tmp2 << 2;
	3900	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
	3901	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
	3902	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
	3903
	3904	/* Odd part */
	3905
	3906	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
	3907
	3908	dataptr[1] = (DCTELEM)
	3909	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
	3910	CONST_BITS-PASS1_BITS);
	3911	dataptr[3] = (DCTELEM)
	3912	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
	3913	CONST_BITS-PASS1_BITS);
	3914
	3915	ctr++;
	3916
	3917	if (ctr != DCTSIZE) {
	3918	if (ctr == 10)
	3919	break; /* Done. */
	3920	dataptr += DCTSIZE; /* advance pointer to next row */
	3921	} else
	3922	dataptr = workspace; /* switch pointer to extended workspace */
	3923	}
	3924
	3925	/* Pass 2: process columns.
	3926	* We remove the PASS1_BITS scaling, but leave the results scaled up
	3927	* by an overall factor of 8.
	3928	* We must also scale the output by (8/5)*(8/10) = 32/25, which we
	3929	* fold into the constant multipliers:
	3930	* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20) 32/25.
	3931	*/
	3932
	3933	dataptr = data;
	3934	wsptr = workspace;
	3935	for (ctr = 0; ctr < 5; ctr++) {
	3936	/* Even part */
	3937
	3938	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
	3939	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
	3940	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
	3941	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
	3942	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
	3943
	3944	tmp10 = tmp0 + tmp4;
	3945	tmp13 = tmp0 - tmp4;
	3946	tmp11 = tmp1 + tmp3;
	3947	tmp14 = tmp1 - tmp3;
	3948
	3949	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
	3950	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
	3951	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
	3952	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
	3953	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
	3954
	3955	dataptr[DCTSIZE*0] = (DCTELEM)
	3956	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
	3957	CONST_BITS+PASS1_BITS);
	3958	tmp12 += tmp12;
	3959	dataptr[DCTSIZE*4] = (DCTELEM)
	3960	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
	3961	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
	3962	CONST_BITS+PASS1_BITS);
	3963	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
	3964	dataptr[DCTSIZE*2] = (DCTELEM)
	3965	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
	3966	CONST_BITS+PASS1_BITS);
	3967	dataptr[DCTSIZE*6] = (DCTELEM)
	3968	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
	3969	CONST_BITS+PASS1_BITS);
	3970
	3971	/* Odd part */
	3972
	3973	tmp10 = tmp0 + tmp4;
	3974	tmp11 = tmp1 - tmp3;
	3975	dataptr[DCTSIZE*5] = (DCTELEM)
	3976	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
	3977	CONST_BITS+PASS1_BITS);
	3978	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
	3979	dataptr[DCTSIZE*1] = (DCTELEM)
	3980	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
	3981	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
	3982	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
	3983	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
	3984	CONST_BITS+PASS1_BITS);
	3985	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
	3986	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
	3987	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
	3988	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
	3989	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
	3990	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
	3991
	3992	dataptr++; /* advance pointer to next column */
	3993	wsptr++; /* advance pointer to next column */
	3994	}
	3995	}
	3996
	3997
	3998	/*
	3999	* Perform the forward DCT on a 4x8 sample block.
	4000	*
	4001	* 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
	4002	*/
	4003
	4004	GLOBAL(void)
	4005	jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	4006	{
	4007	INT32 tmp0, tmp1, tmp2, tmp3;
	4008	INT32 tmp10, tmp11, tmp12, tmp13;
	4009	INT32 z1;
	4010	DCTELEM *dataptr;
	4011	JSAMPROW elemptr;
	4012	int ctr;
	4013	SHIFT_TEMPS
	4014
	4015	/* Pre-zero output coefficient block. */
	4016	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	4017
	4018	/* Pass 1: process rows. */
	4019	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	4020	/* furthermore, we scale the results by 2*PASS1_BITS. /
	4021	/* We must also scale the output by 8/4 = 2, which we add here. */
	4022	/* 4-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/16). /
	4023
	4024	dataptr = data;
	4025	for (ctr = 0; ctr < DCTSIZE; ctr++) {
	4026	elemptr = sample_data[ctr] + start_col;
	4027
	4028	/* Even part */
	4029
	4030	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
	4031	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
	4032
	4033	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
	4034	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
	4035
	4036	/* Apply unsigned->signed conversion */
	4037	dataptr[0] = (DCTELEM)
	4038	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
	4039	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
	4040
	4041	/* Odd part */
	4042
	4043	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	4044	/* Add fudge factor here for final descale. */
	4045	tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
	4046
	4047	dataptr[1] = (DCTELEM)
	4048	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	4049	CONST_BITS-PASS1_BITS-1);
	4050	dataptr[3] = (DCTELEM)
	4051	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	4052	CONST_BITS-PASS1_BITS-1);
	4053
	4054	dataptr += DCTSIZE; /* advance pointer to next row */
	4055	}
	4056
	4057	/* Pass 2: process columns.
	4058	* We remove the PASS1_BITS scaling, but leave the results scaled up
	4059	* by an overall factor of 8.
	4060	*/
	4061
	4062	dataptr = data;
	4063	for (ctr = 0; ctr < 4; ctr++) {
	4064	/* Even part per LL&M figure 1 --- note that published figure is faulty;
	4065	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
	4066	*/
	4067
	4068	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
	4069	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
	4070	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
	4071	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
	4072
	4073	/* Add fudge factor here for final descale. */
	4074	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
	4075	tmp12 = tmp0 - tmp3;
	4076	tmp11 = tmp1 + tmp2;
	4077	tmp13 = tmp1 - tmp2;
	4078
	4079	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
	4080	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
	4081	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
	4082	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
	4083
	4084	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
	4085	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
	4086
	4087	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
	4088	/* Add fudge factor here for final descale. */
	4089	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
	4090	dataptr[DCTSIZE*2] = (DCTELEM)
	4091	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
	4092	dataptr[DCTSIZE*6] = (DCTELEM)
	4093	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
	4094
	4095	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
	4096	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
	4097	* i0..i3 in the paper are tmp0..tmp3 here.
	4098	*/
	4099
	4100	tmp10 = tmp0 + tmp3;
	4101	tmp11 = tmp1 + tmp2;
	4102	tmp12 = tmp0 + tmp2;
	4103	tmp13 = tmp1 + tmp3;
	4104	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
	4105	/* Add fudge factor here for final descale. */
	4106	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
	4107
	4108	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
	4109	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
	4110	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
	4111	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
	4112	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
	4113	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
	4114	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
	4115	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
	4116
	4117	tmp12 += z1;
	4118	tmp13 += z1;
	4119
	4120	dataptr[DCTSIZE*1] = (DCTELEM)
	4121	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
	4122	dataptr[DCTSIZE*3] = (DCTELEM)
	4123	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
	4124	dataptr[DCTSIZE*5] = (DCTELEM)
	4125	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
	4126	dataptr[DCTSIZE*7] = (DCTELEM)
	4127	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
	4128
	4129	dataptr++; /* advance pointer to next column */
	4130	}
	4131	}
	4132
	4133
	4134	/*
	4135	* Perform the forward DCT on a 3x6 sample block.
	4136	*
	4137	* 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
	4138	*/
	4139
	4140	GLOBAL(void)
	4141	jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	4142	{
	4143	INT32 tmp0, tmp1, tmp2;
	4144	INT32 tmp10, tmp11, tmp12;
	4145	DCTELEM *dataptr;
	4146	JSAMPROW elemptr;
	4147	int ctr;
	4148	SHIFT_TEMPS
	4149
	4150	/* Pre-zero output coefficient block. */
	4151	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	4152
	4153	/* Pass 1: process rows. */
	4154	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	4155	/* furthermore, we scale the results by 2*PASS1_BITS. /
	4156	/* We scale the results further by 2 as part of output adaption */
	4157	/* scaling for different DCT size. */
	4158	/* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6). /
	4159
	4160	dataptr = data;
	4161	for (ctr = 0; ctr < 6; ctr++) {
	4162	elemptr = sample_data[ctr] + start_col;
	4163
	4164	/* Even part */
	4165
	4166	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
	4167	tmp1 = GETJSAMPLE(elemptr[1]);
	4168
	4169	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
	4170
	4171	/* Apply unsigned->signed conversion */
	4172	dataptr[0] = (DCTELEM)
	4173	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
	4174	dataptr[2] = (DCTELEM)
	4175	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
	4176	CONST_BITS-PASS1_BITS-1);
	4177
	4178	/* Odd part */
	4179
	4180	dataptr[1] = (DCTELEM)
	4181	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
	4182	CONST_BITS-PASS1_BITS-1);
	4183
	4184	dataptr += DCTSIZE; /* advance pointer to next row */
	4185	}
	4186
	4187	/* Pass 2: process columns.
	4188	* We remove the PASS1_BITS scaling, but leave the results scaled up
	4189	* by an overall factor of 8.
	4190	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
	4191	* fold into the constant multipliers (other part was done in pass 1):
	4192	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
	4193	*/
	4194
	4195	dataptr = data;
	4196	for (ctr = 0; ctr < 3; ctr++) {
	4197	/* Even part */
	4198
	4199	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
	4200	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
	4201	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
	4202
	4203	tmp10 = tmp0 + tmp2;
	4204	tmp12 = tmp0 - tmp2;
	4205
	4206	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
	4207	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
	4208	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
	4209
	4210	dataptr[DCTSIZE*0] = (DCTELEM)
	4211	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
	4212	CONST_BITS+PASS1_BITS);
	4213	dataptr[DCTSIZE*2] = (DCTELEM)
	4214	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
	4215	CONST_BITS+PASS1_BITS);
	4216	dataptr[DCTSIZE*4] = (DCTELEM)
	4217	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	4218	CONST_BITS+PASS1_BITS);
	4219
	4220	/* Odd part */
	4221
	4222	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
	4223
	4224	dataptr[DCTSIZE*1] = (DCTELEM)
	4225	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
	4226	CONST_BITS+PASS1_BITS);
	4227	dataptr[DCTSIZE*3] = (DCTELEM)
	4228	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
	4229	CONST_BITS+PASS1_BITS);
	4230	dataptr[DCTSIZE*5] = (DCTELEM)
	4231	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
	4232	CONST_BITS+PASS1_BITS);
	4233
	4234	dataptr++; /* advance pointer to next column */
	4235	}
	4236	}
	4237
	4238
	4239	/*
	4240	* Perform the forward DCT on a 2x4 sample block.
	4241	*
	4242	* 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
	4243	*/
	4244
	4245	GLOBAL(void)
	4246	jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	4247	{
	4248	INT32 tmp0, tmp1;
	4249	INT32 tmp10, tmp11;
	4250	DCTELEM *dataptr;
	4251	JSAMPROW elemptr;
	4252	int ctr;
	4253	SHIFT_TEMPS
	4254
	4255	/* Pre-zero output coefficient block. */
	4256	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	4257
	4258	/* Pass 1: process rows. */
	4259	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
	4260	/* We must also scale the output by (8/2)(8/4) = 23, which we add here. /
	4261
	4262	dataptr = data;
	4263	for (ctr = 0; ctr < 4; ctr++) {
	4264	elemptr = sample_data[ctr] + start_col;
	4265
	4266	/* Even part */
	4267
	4268	tmp0 = GETJSAMPLE(elemptr[0]);
	4269	tmp1 = GETJSAMPLE(elemptr[1]);
	4270
	4271	/* Apply unsigned->signed conversion */
	4272	dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
	4273
	4274	/* Odd part */
	4275
	4276	dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
	4277
	4278	dataptr += DCTSIZE; /* advance pointer to next row */
	4279	}
	4280
	4281	/* Pass 2: process columns.
	4282	* We leave the results scaled up by an overall factor of 8.
	4283	* 4-point FDCT kernel,
	4284	* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
	4285	*/
	4286
	4287	dataptr = data;
	4288	for (ctr = 0; ctr < 2; ctr++) {
	4289	/* Even part */
	4290
	4291	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3];
	4292	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
	4293
	4294	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
	4295	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
	4296
	4297	dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
	4298	dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
	4299
	4300	/* Odd part */
	4301
	4302	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
	4303	/* Add fudge factor here for final descale. */
	4304	tmp0 += ONE << (CONST_BITS-1);
	4305
	4306	dataptr[DCTSIZE*1] = (DCTELEM)
	4307	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
	4308	CONST_BITS);
	4309	dataptr[DCTSIZE*3] = (DCTELEM)
	4310	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
	4311	CONST_BITS);
	4312
	4313	dataptr++; /* advance pointer to next column */
	4314	}
	4315	}
	4316
	4317
	4318	/*
	4319	* Perform the forward DCT on a 1x2 sample block.
	4320	*
	4321	* 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
	4322	*/
	4323
	4324	GLOBAL(void)
	4325	jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
	4326	{
	4327	INT32 tmp0, tmp1;
	4328
	4329	/* Pre-zero output coefficient block. */
	4330	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
	4331
	4332	tmp0 = GETJSAMPLE(sample_data[0][start_col]);
	4333	tmp1 = GETJSAMPLE(sample_data[1][start_col]);
	4334
	4335	/* We leave the results scaled up by an overall factor of 8.
	4336	* We must also scale the output by (8/1)(8/2) = 2*5.
	4337	*/
	4338
	4339	/* Even part */
	4340	/* Apply unsigned->signed conversion */
	4341	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp1 - 2 CENTERJSAMPLE) << 5);
	4342
	4343	/* Odd part */
	4344	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
	4345	}
	4346
	4347	#endif /* DCT_SCALING_SUPPORTED */
[2]	4348	#endif /* DCT_ISLOW_SUPPORTED */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/3rdparty/libjpeg/jfdctint.c

Download in other formats: