Context Navigation

mpi.c

Visit:

Last change on this file was 21363, checked in by vladest, 16 years ago
Added RSA security interface
File size: 101.7 KB

Rev	Line
[21363]	1	/*
	2	* dlls/rsaenh/mpi.c
	3	* Multi Precision Integer functions
	4	*
	5	* Copyright 2004 Michael Jung
	6	* Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
	7	*
	8	* This library is free software; you can redistribute it and/or
	9	* modify it under the terms of the GNU Lesser General Public
	10	* License as published by the Free Software Foundation; either
	11	* version 2.1 of the License, or (at your option) any later version.
	12	*
	13	* This library is distributed in the hope that it will be useful,
	14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	16	* Lesser General Public License for more details.
	17	*
	18	* You should have received a copy of the GNU Lesser General Public
	19	* License along with this library; if not, write to the Free Software
	20	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
	21	*/
	22
	23	/*
	24	* This file contains code from the LibTomCrypt cryptographic
	25	* library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
	26	* is in the public domain. The code in this file is tailored to
	27	* special requirements. Take a look at http://libtomcrypt.org for the
	28	* original version.
	29	*/
	30
	31	#include <stdarg.h>
	32	#include "tomcrypt.h"
	33
	34	/* Known optimal configurations
	35	CPU /Compiler /MUL CUTOFF/SQR CUTOFF
	36	-------------------------------------------------------------
	37	Intel P4 Northwood /GCC v3.4.1 / 88/ 128/LTM 0.32 ;-)
	38	*/
	39	static const int KARATSUBA_MUL_CUTOFF = 88, /* Min. number of digits before Karatsuba multiplication is used. */
	40	KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */
	41
	42	static void bn_reverse(unsigned char *s, int len);
	43	static int s_mp_add(mp_int a, mp_int b, mp_int *c);
	44	static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y);
	45	#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
	46	static int s_mp_mul_digs(const mp_int a, const mp_int b, mp_int *c, int digs);
	47	static int s_mp_mul_high_digs(const mp_int a, const mp_int b, mp_int *c, int digs);
	48	static int s_mp_sqr(const mp_int a, mp_int b);
	49	static int s_mp_sub(const mp_int a, const mp_int b, mp_int *c);
	50	static int mp_exptmod_fast(const mp_int G, const mp_int X, mp_int P, mp_int Y, int mode);
	51	static int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c);
	52	static int mp_karatsuba_mul(const mp_int a, const mp_int b, mp_int *c);
	53	static int mp_karatsuba_sqr(const mp_int a, mp_int b);
	54
	55	/* computes the modular inverse via binary extended euclidean algorithm,
	56	* that is c = 1/a mod b
	57	*
	58	* Based on slow invmod except this is optimized for the case where b is
	59	* odd as per HAC Note 14.64 on pp. 610
	60	*/
	61	static int
	62	fast_mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
	63	{
	64	mp_int x, y, u, v, B, D;
	65	int res, neg;
	66
	67	/* 2. [modified] b must be odd */
	68	if (mp_iseven (b) == 1) {
	69	return MP_VAL;
	70	}
	71
	72	/* init all our temps */
	73	if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
	74	return res;
	75	}
	76
	77	/* x == modulus, y == value to invert */
	78	if ((res = mp_copy (b, &x)) != MP_OKAY) {
	79	goto __ERR;
	80	}
	81
	82	/* we need y = \|a\| */
	83	if ((res = mp_abs (a, &y)) != MP_OKAY) {
	84	goto __ERR;
	85	}
	86
	87	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
	88	if ((res = mp_copy (&x, &u)) != MP_OKAY) {
	89	goto __ERR;
	90	}
	91	if ((res = mp_copy (&y, &v)) != MP_OKAY) {
	92	goto __ERR;
	93	}
	94	mp_set (&D, 1);
	95
	96	top:
	97	/* 4. while u is even do */
	98	while (mp_iseven (&u) == 1) {
	99	/* 4.1 u = u/2 */
	100	if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
	101	goto __ERR;
	102	}
	103	/* 4.2 if B is odd then */
	104	if (mp_isodd (&B) == 1) {
	105	if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
	106	goto __ERR;
	107	}
	108	}
	109	/* B = B/2 */
	110	if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
	111	goto __ERR;
	112	}
	113	}
	114
	115	/* 5. while v is even do */
	116	while (mp_iseven (&v) == 1) {
	117	/* 5.1 v = v/2 */
	118	if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
	119	goto __ERR;
	120	}
	121	/* 5.2 if D is odd then */
	122	if (mp_isodd (&D) == 1) {
	123	/* D = (D-x)/2 */
	124	if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
	125	goto __ERR;
	126	}
	127	}
	128	/* D = D/2 */
	129	if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
	130	goto __ERR;
	131	}
	132	}
	133
	134	/* 6. if u >= v then */
	135	if (mp_cmp (&u, &v) != MP_LT) {
	136	/* u = u - v, B = B - D */
	137	if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
	138	goto __ERR;
	139	}
	140
	141	if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
	142	goto __ERR;
	143	}
	144	} else {
	145	/* v - v - u, D = D - B */
	146	if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
	147	goto __ERR;
	148	}
	149
	150	if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
	151	goto __ERR;
	152	}
	153	}
	154
	155	/* if not zero goto step 4 */
	156	if (mp_iszero (&u) == 0) {
	157	goto top;
	158	}
	159
	160	/* now a = C, b = D, gcd == gv /
	161
	162	/* if v != 1 then there is no inverse */
	163	if (mp_cmp_d (&v, 1) != MP_EQ) {
	164	res = MP_VAL;
	165	goto __ERR;
	166	}
	167
	168	/* b is now the inverse */
	169	neg = a->sign;
	170	while (D.sign == MP_NEG) {
	171	if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
	172	goto __ERR;
	173	}
	174	}
	175	mp_exch (&D, c);
	176	c->sign = neg;
	177	res = MP_OKAY;
	178
	179	__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
	180	return res;
	181	}
	182
	183	/* computes xR**-1 == x (mod N) via Montgomery Reduction
	184	*
	185	* This is an optimized implementation of montgomery_reduce
	186	* which uses the comba method to quickly calculate the columns of the
	187	* reduction.
	188	*
	189	* Based on Algorithm 14.32 on pp.601 of HAC.
	190	*/
	191	static int
	192	fast_mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
	193	{
	194	int ix, res, olduse;
	195	mp_word W[MP_WARRAY];
	196
	197	/* get old used count */
	198	olduse = x->used;
	199
	200	/* grow a as required */
	201	if (x->alloc < n->used + 1) {
	202	if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
	203	return res;
	204	}
	205	}
	206
	207	/* first we have to get the digits of the input into
	208	* an array of double precision words W[...]
	209	*/
	210	{
	211	register mp_word *_W;
	212	register mp_digit *tmpx;
	213
	214	/* alias for the W[] array */
	215	_W = W;
	216
	217	/* alias for the digits of x*/
	218	tmpx = x->dp;
	219
	220	/* copy the digits of a into W[0..a->used-1] */
	221	for (ix = 0; ix < x->used; ix++) {
	222	_W++ = tmpx++;
	223	}
	224
	225	/* zero the high words of W[a->used..m->used2] /
	226	for (; ix < n->used * 2 + 1; ix++) {
	227	*_W++ = 0;
	228	}
	229	}
	230
	231	/* now we proceed to zero successive digits
	232	* from the least significant upwards
	233	*/
	234	for (ix = 0; ix < n->used; ix++) {
	235	/* mu = ai * m' mod b
	236	*
	237	* We avoid a double precision multiplication (which isn't required)
	238	* by casting the value down to a mp_digit. Note this requires
	239	* that W[ix-1] have the carry cleared (see after the inner loop)
	240	*/
	241	register mp_digit mu;
	242	mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
	243
	244	/* a = a + mu * m * b**i
	245	*
	246	* This is computed in place and on the fly. The multiplication
	247	* by b**i is handled by offsetting which columns the results
	248	* are added to.
	249	*
	250	* Note the comba method normally doesn't handle carries in the
	251	* inner loop In this case we fix the carry from the previous
	252	* column since the Montgomery reduction requires digits of the
	253	* result (so far) [see above] to work. This is
	254	* handled by fixing up one carry after the inner loop. The
	255	* carry fixups are done in order so after these loops the
	256	* first m->used words of W[] have the carries fixed
	257	*/
	258	{
	259	register int iy;
	260	register mp_digit *tmpn;
	261	register mp_word *_W;
	262
	263	/* alias for the digits of the modulus */
	264	tmpn = n->dp;
	265
	266	/* Alias for the columns set by an offset of ix */
	267	_W = W + ix;
	268
	269	/* inner loop */
	270	for (iy = 0; iy < n->used; iy++) {
	271	_W++ += ((mp_word)mu) ((mp_word)*tmpn++);
	272	}
	273	}
	274
	275	/* now fix carry for next digit, W[ix+1] */
	276	W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
	277	}
	278
	279	/* now we have to propagate the carries and
	280	* shift the words downward [all those least
	281	* significant digits we zeroed].
	282	*/
	283	{
	284	register mp_digit *tmpx;
	285	register mp_word _W, _W1;
	286
	287	/* nox fix rest of carries */
	288
	289	/* alias for current word */
	290	_W1 = W + ix;
	291
	292	/* alias for next word, where the carry goes */
	293	_W = W + ++ix;
	294
	295	for (; ix <= n->used * 2 + 1; ix++) {
	296	_W++ += _W1++ >> ((mp_word) DIGIT_BIT);
	297	}
	298
	299	/* copy out, A = A/b**n
	300	*
	301	* The result is A/b**n but instead of converting from an
	302	* array of mp_word to mp_digit than calling mp_rshd
	303	* we just copy them in the right order
	304	*/
	305
	306	/* alias for destination word */
	307	tmpx = x->dp;
	308
	309	/* alias for shifted double precision result */
	310	_W = W + n->used;
	311
	312	for (ix = 0; ix < n->used + 1; ix++) {
	313	tmpx++ = (mp_digit)(_W++ & ((mp_word) MP_MASK));
	314	}
	315
	316	/* zero oldused digits, if the input a was larger than
	317	* m->used+1 we'll have to clear the digits
	318	*/
	319	for (; ix < olduse; ix++) {
	320	*tmpx++ = 0;
	321	}
	322	}
	323
	324	/* set the max used and clamp */
	325	x->used = n->used + 1;
	326	mp_clamp (x);
	327
	328	/* if A >= m then A = A - m */
	329	if (mp_cmp_mag (x, n) != MP_LT) {
	330	return s_mp_sub (x, n, x);
	331	}
	332	return MP_OKAY;
	333	}
	334
	335	/* Fast (comba) multiplier
	336	*
	337	* This is the fast column-array [comba] multiplier. It is
	338	* designed to compute the columns of the product first
	339	* then handle the carries afterwards. This has the effect
	340	* of making the nested loops that compute the columns very
	341	* simple and schedulable on super-scalar processors.
	342	*
	343	* This has been modified to produce a variable number of
	344	* digits of output so if say only a half-product is required
	345	* you don't have to compute the upper half (a feature
	346	* required for fast Barrett reduction).
	347	*
	348	* Based on Algorithm 14.12 on pp.595 of HAC.
	349	*
	350	*/
	351	static int
	352	fast_s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
	353	{
	354	int olduse, res, pa, ix, iz;
	355	mp_digit W[MP_WARRAY];
	356	register mp_word _W;
	357
	358	/* grow the destination as required */
	359	if (c->alloc < digs) {
	360	if ((res = mp_grow (c, digs)) != MP_OKAY) {
	361	return res;
	362	}
	363	}
	364
	365	/* number of output digits to produce */
	366	pa = MIN(digs, a->used + b->used);
	367
	368	/* clear the carry */
	369	_W = 0;
	370	for (ix = 0; ix <= pa; ix++) {
	371	int tx, ty;
	372	int iy;
	373	mp_digit tmpx, tmpy;
	374
	375	/* get offsets into the two bignums */
	376	ty = MIN(b->used-1, ix);
	377	tx = ix - ty;
	378
	379	/* setup temp aliases */
	380	tmpx = a->dp + tx;
	381	tmpy = b->dp + ty;
	382
	383	/* This is the number of times the loop will iterate, essentially it's
	384	while (tx++ < a->used && ty-- >= 0) { ... }
	385	*/
	386	iy = MIN(a->used-tx, ty+1);
	387
	388	/* execute loop */
	389	for (iz = 0; iz < iy; ++iz) {
	390	_W += ((mp_word)tmpx++)((mp_word)*tmpy--);
	391	}
	392
	393	/* store term */
	394	W[ix] = ((mp_digit)_W) & MP_MASK;
	395
	396	/* make next carry */
	397	_W = _W >> ((mp_word)DIGIT_BIT);
	398	}
	399
	400	/* setup dest */
	401	olduse = c->used;
	402	c->used = digs;
	403
	404	{
	405	register mp_digit *tmpc;
	406	tmpc = c->dp;
	407	for (ix = 0; ix < digs; ix++) {
	408	/* now extract the previous digit [below the carry] */
	409	*tmpc++ = W[ix];
	410	}
	411
	412	/* clear unused digits [that existed in the old copy of c] */
	413	for (; ix < olduse; ix++) {
	414	*tmpc++ = 0;
	415	}
	416	}
	417	mp_clamp (c);
	418	return MP_OKAY;
	419	}
	420
	421	/* this is a modified version of fast_s_mul_digs that only produces
	422	* output digits above digs. See the comments for fast_s_mul_digs
	423	* to see how it works.
	424	*
	425	* This is used in the Barrett reduction since for one of the multiplications
	426	* only the higher digits were needed. This essentially halves the work.
	427	*
	428	* Based on Algorithm 14.12 on pp.595 of HAC.
	429	*/
	430	static int
	431	fast_s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
	432	{
	433	int olduse, res, pa, ix, iz;
	434	mp_digit W[MP_WARRAY];
	435	mp_word _W;
	436
	437	/* grow the destination as required */
	438	pa = a->used + b->used;
	439	if (c->alloc < pa) {
	440	if ((res = mp_grow (c, pa)) != MP_OKAY) {
	441	return res;
	442	}
	443	}
	444
	445	/* number of output digits to produce */
	446	pa = a->used + b->used;
	447	_W = 0;
	448	for (ix = digs; ix <= pa; ix++) {
	449	int tx, ty, iy;
	450	mp_digit tmpx, tmpy;
	451
	452	/* get offsets into the two bignums */
	453	ty = MIN(b->used-1, ix);
	454	tx = ix - ty;
	455
	456	/* setup temp aliases */
	457	tmpx = a->dp + tx;
	458	tmpy = b->dp + ty;
	459
	460	/* This is the number of times the loop will iterate, essentially it's
	461	while (tx++ < a->used && ty-- >= 0) { ... }
	462	*/
	463	iy = MIN(a->used-tx, ty+1);
	464
	465	/* execute loop */
	466	for (iz = 0; iz < iy; iz++) {
	467	_W += ((mp_word)tmpx++)((mp_word)*tmpy--);
	468	}
	469
	470	/* store term */
	471	W[ix] = ((mp_digit)_W) & MP_MASK;
	472
	473	/* make next carry */
	474	_W = _W >> ((mp_word)DIGIT_BIT);
	475	}
	476
	477	/* setup dest */
	478	olduse = c->used;
	479	c->used = pa;
	480
	481	{
	482	register mp_digit *tmpc;
	483
	484	tmpc = c->dp + digs;
	485	for (ix = digs; ix <= pa; ix++) {
	486	/* now extract the previous digit [below the carry] */
	487	*tmpc++ = W[ix];
	488	}
	489
	490	/* clear unused digits [that existed in the old copy of c] */
	491	for (; ix < olduse; ix++) {
	492	*tmpc++ = 0;
	493	}
	494	}
	495	mp_clamp (c);
	496	return MP_OKAY;
	497	}
	498
	499	/* fast squaring
	500	*
	501	* This is the comba method where the columns of the product
	502	* are computed first then the carries are computed. This
	503	* has the effect of making a very simple inner loop that
	504	* is executed the most
	505	*
	506	* W2 represents the outer products and W the inner.
	507	*
	508	* A further optimizations is made because the inner
	509	* products are of the form "A * B * 2". The *2 part does
	510	* not need to be computed until the end which is good
	511	* because 64-bit shifts are slow!
	512	*
	513	* Based on Algorithm 14.16 on pp.597 of HAC.
	514	*
	515	*/
	516	/* the jist of squaring...
	517
	518	you do like mult except the offset of the tmpx [one that starts closer to zero]
	519	can't equal the offset of tmpy. So basically you set up iy like before then you min it with
	520	(ty-tx) so that it never happens. You double all those you add in the inner loop
	521
	522	After that loop you do the squares and add them in.
	523
	524	Remove W2 and don't memset W
	525
	526	*/
	527
	528	static int fast_s_mp_sqr (const mp_int * a, mp_int * b)
	529	{
	530	int olduse, res, pa, ix, iz;
	531	mp_digit W[MP_WARRAY], *tmpx;
	532	mp_word W1;
	533
	534	/* grow the destination as required */
	535	pa = a->used + a->used;
	536	if (b->alloc < pa) {
	537	if ((res = mp_grow (b, pa)) != MP_OKAY) {
	538	return res;
	539	}
	540	}
	541
	542	/* number of output digits to produce */
	543	W1 = 0;
	544	for (ix = 0; ix <= pa; ix++) {
	545	int tx, ty, iy;
	546	mp_word _W;
	547	mp_digit *tmpy;
	548
	549	/* clear counter */
	550	_W = 0;
	551
	552	/* get offsets into the two bignums */
	553	ty = MIN(a->used-1, ix);
	554	tx = ix - ty;
	555
	556	/* setup temp aliases */
	557	tmpx = a->dp + tx;
	558	tmpy = a->dp + ty;
	559
	560	/* This is the number of times the loop will iterate, essentially it's
	561	while (tx++ < a->used && ty-- >= 0) { ... }
	562	*/
	563	iy = MIN(a->used-tx, ty+1);
	564
	565	/* now for squaring tx can never equal ty
	566	* we halve the distance since they approach at a rate of 2x
	567	* and we have to round because odd cases need to be executed
	568	*/
	569	iy = MIN(iy, (ty-tx+1)>>1);
	570
	571	/* execute loop */
	572	for (iz = 0; iz < iy; iz++) {
	573	_W += ((mp_word)tmpx++)((mp_word)*tmpy--);
	574	}
	575
	576	/* double the inner product and add carry */
	577	_W = _W + _W + W1;
	578
	579	/* even columns have the square term in them */
	580	if ((ix&1) == 0) {
	581	_W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
	582	}
	583
	584	/* store it */
	585	W[ix] = _W;
	586
	587	/* make next carry */
	588	W1 = _W >> ((mp_word)DIGIT_BIT);
	589	}
	590
	591	/* setup dest */
	592	olduse = b->used;
	593	b->used = a->used+a->used;
	594
	595	{
	596	mp_digit *tmpb;
	597	tmpb = b->dp;
	598	for (ix = 0; ix < pa; ix++) {
	599	*tmpb++ = W[ix] & MP_MASK;
	600	}
	601
	602	/* clear unused digits [that existed in the old copy of c] */
	603	for (; ix < olduse; ix++) {
	604	*tmpb++ = 0;
	605	}
	606	}
	607	mp_clamp (b);
	608	return MP_OKAY;
	609	}
	610
	611	/* computes a = 2**b
	612	*
	613	* Simple algorithm which zeroes the int, grows it then just sets one bit
	614	* as required.
	615	*/
	616	int
	617	mp_2expt (mp_int * a, int b)
	618	{
	619	int res;
	620
	621	/* zero a as per default */
	622	mp_zero (a);
	623
	624	/* grow a to accommodate the single bit */
	625	if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
	626	return res;
	627	}
	628
	629	/* set the used count of where the bit will go */
	630	a->used = b / DIGIT_BIT + 1;
	631
	632	/* put the single bit in its place */
	633	a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
	634
	635	return MP_OKAY;
	636	}
	637
	638	/* b = \|a\|
	639	*
	640	* Simple function copies the input and fixes the sign to positive
	641	*/
	642	int
	643	mp_abs (const mp_int * a, mp_int * b)
	644	{
	645	int res;
	646
	647	/* copy a to b */
	648	if (a != b) {
	649	if ((res = mp_copy (a, b)) != MP_OKAY) {
	650	return res;
	651	}
	652	}
	653
	654	/* force the sign of b to positive */
	655	b->sign = MP_ZPOS;
	656
	657	return MP_OKAY;
	658	}
	659
	660	/* high level addition (handles signs) */
	661	int mp_add (mp_int * a, mp_int * b, mp_int * c)
	662	{
	663	int sa, sb, res;
	664
	665	/* get sign of both inputs */
	666	sa = a->sign;
	667	sb = b->sign;
	668
	669	/* handle two cases, not four */
	670	if (sa == sb) {
	671	/* both positive or both negative */
	672	/* add their magnitudes, copy the sign */
	673	c->sign = sa;
	674	res = s_mp_add (a, b, c);
	675	} else {
	676	/* one positive, the other negative */
	677	/* subtract the one with the greater magnitude from */
	678	/* the one of the lesser magnitude. The result gets */
	679	/* the sign of the one with the greater magnitude. */
	680	if (mp_cmp_mag (a, b) == MP_LT) {
	681	c->sign = sb;
	682	res = s_mp_sub (b, a, c);
	683	} else {
	684	c->sign = sa;
	685	res = s_mp_sub (a, b, c);
	686	}
	687	}
	688	return res;
	689	}
	690
	691
	692	/* single digit addition */
	693	int
	694	mp_add_d (mp_int * a, mp_digit b, mp_int * c)
	695	{
	696	int res, ix, oldused;
	697	mp_digit tmpa, tmpc, mu;
	698
	699	/* grow c as required */
	700	if (c->alloc < a->used + 1) {
	701	if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
	702	return res;
	703	}
	704	}
	705
	706	/* if a is negative and \|a\| >= b, call c = \|a\| - b */
	707	if (a->sign == MP_NEG && (a->used > 1 \|\| a->dp[0] >= b)) {
	708	/* temporarily fix sign of a */
	709	a->sign = MP_ZPOS;
	710
	711	/* c = \|a\| - b */
	712	res = mp_sub_d(a, b, c);
	713
	714	/* fix sign */
	715	a->sign = c->sign = MP_NEG;
	716
	717	return res;
	718	}
	719
	720	/* old number of used digits in c */
	721	oldused = c->used;
	722
	723	/* sign always positive */
	724	c->sign = MP_ZPOS;
	725
	726	/* source alias */
	727	tmpa = a->dp;
	728
	729	/* destination alias */
	730	tmpc = c->dp;
	731
	732	/* if a is positive */
	733	if (a->sign == MP_ZPOS) {
	734	/* add digit, after this we're propagating
	735	* the carry.
	736	*/
	737	tmpc = tmpa++ + b;
	738	mu = *tmpc >> DIGIT_BIT;
	739	*tmpc++ &= MP_MASK;
	740
	741	/* now handle rest of the digits */
	742	for (ix = 1; ix < a->used; ix++) {
	743	tmpc = tmpa++ + mu;
	744	mu = *tmpc >> DIGIT_BIT;
	745	*tmpc++ &= MP_MASK;
	746	}
	747	/* set final carry */
	748	ix++;
	749	*tmpc++ = mu;
	750
	751	/* setup size */
	752	c->used = a->used + 1;
	753	} else {
	754	/* a was negative and \|a\| < b */
	755	c->used = 1;
	756
	757	/* the result is a single digit */
	758	if (a->used == 1) {
	759	*tmpc++ = b - a->dp[0];
	760	} else {
	761	*tmpc++ = b;
	762	}
	763
	764	/* setup count so the clearing of oldused
	765	* can fall through correctly
	766	*/
	767	ix = 1;
	768	}
	769
	770	/* now zero to oldused */
	771	while (ix++ < oldused) {
	772	*tmpc++ = 0;
	773	}
	774	mp_clamp(c);
	775
	776	return MP_OKAY;
	777	}
	778
	779	/* trim unused digits
	780	*
	781	* This is used to ensure that leading zero digits are
	782	* trimed and the leading "used" digit will be non-zero
	783	* Typically very fast. Also fixes the sign if there
	784	* are no more leading digits
	785	*/
	786	void
	787	mp_clamp (mp_int * a)
	788	{
	789	/* decrease used while the most significant digit is
	790	* zero.
	791	*/
	792	while (a->used > 0 && a->dp[a->used - 1] == 0) {
	793	--(a->used);
	794	}
	795
	796	/* reset the sign flag if used == 0 */
	797	if (a->used == 0) {
	798	a->sign = MP_ZPOS;
	799	}
	800	}
	801
	802	/* clear one (frees) */
	803	void
	804	mp_clear (mp_int * a)
	805	{
	806	int i;
	807
	808	/* only do anything if a hasn't been freed previously */
	809	if (a->dp != NULL) {
	810	/* first zero the digits */
	811	for (i = 0; i < a->used; i++) {
	812	a->dp[i] = 0;
	813	}
	814
	815	/* free ram */
	816	free(a->dp);
	817
	818	/* reset members to make debugging easier */
	819	a->dp = NULL;
	820	a->alloc = a->used = 0;
	821	a->sign = MP_ZPOS;
	822	}
	823	}
	824
	825
	826	void mp_clear_multi(mp_int *mp, ...)
	827	{
	828	mp_int* next_mp = mp;
	829	va_list args;
	830	va_start(args, mp);
	831	while (next_mp != NULL) {
	832	mp_clear(next_mp);
	833	next_mp = va_arg(args, mp_int*);
	834	}
	835	va_end(args);
	836	}
	837
	838	/* compare two ints (signed)*/
	839	int
	840	mp_cmp (const mp_int * a, const mp_int * b)
	841	{
	842	/* compare based on sign */
	843	if (a->sign != b->sign) {
	844	if (a->sign == MP_NEG) {
	845	return MP_LT;
	846	} else {
	847	return MP_GT;
	848	}
	849	}
	850
	851	/* compare digits */
	852	if (a->sign == MP_NEG) {
	853	/* if negative compare opposite direction */
	854	return mp_cmp_mag(b, a);
	855	} else {
	856	return mp_cmp_mag(a, b);
	857	}
	858	}
	859
	860	/* compare a digit */
	861	int mp_cmp_d(const mp_int * a, mp_digit b)
	862	{
	863	/* compare based on sign */
	864	if (a->sign == MP_NEG) {
	865	return MP_LT;
	866	}
	867
	868	/* compare based on magnitude */
	869	if (a->used > 1) {
	870	return MP_GT;
	871	}
	872
	873	/* compare the only digit of a to b */
	874	if (a->dp[0] > b) {
	875	return MP_GT;
	876	} else if (a->dp[0] < b) {
	877	return MP_LT;
	878	} else {
	879	return MP_EQ;
	880	}
	881	}
	882
	883	/* compare maginitude of two ints (unsigned) */
	884	int mp_cmp_mag (const mp_int * a, const mp_int * b)
	885	{
	886	int n;
	887	mp_digit tmpa, tmpb;
	888
	889	/* compare based on # of non-zero digits */
	890	if (a->used > b->used) {
	891	return MP_GT;
	892	}
	893
	894	if (a->used < b->used) {
	895	return MP_LT;
	896	}
	897
	898	/* alias for a */
	899	tmpa = a->dp + (a->used - 1);
	900
	901	/* alias for b */
	902	tmpb = b->dp + (a->used - 1);
	903
	904	/* compare based on digits */
	905	for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
	906	if (tmpa > tmpb) {
	907	return MP_GT;
	908	}
	909
	910	if (tmpa < tmpb) {
	911	return MP_LT;
	912	}
	913	}
	914	return MP_EQ;
	915	}
	916
	917	static const int lnz[16] = {
	918	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
	919	};
	920
	921	/* Counts the number of lsbs which are zero before the first zero bit */
	922	int mp_cnt_lsb(const mp_int *a)
	923	{
	924	int x;
	925	mp_digit q, qq;
	926
	927	/* easy out */
	928	if (mp_iszero(a) == 1) {
	929	return 0;
	930	}
	931
	932	/* scan lower digits until non-zero */
	933	for (x = 0; x < a->used && a->dp[x] == 0; x++);
	934	q = a->dp[x];
	935	x *= DIGIT_BIT;
	936
	937	/* now scan this digit until a 1 is found */
	938	if ((q & 1) == 0) {
	939	do {
	940	qq = q & 15;
	941	x += lnz[qq];
	942	q >>= 4;
	943	} while (qq == 0);
	944	}
	945	return x;
	946	}
	947
	948	/* copy, b = a */
	949	int
	950	mp_copy (const mp_int * a, mp_int * b)
	951	{
	952	int res, n;
	953
	954	/* if dst == src do nothing */
	955	if (a == b) {
	956	return MP_OKAY;
	957	}
	958
	959	/* grow dest */
	960	if (b->alloc < a->used) {
	961	if ((res = mp_grow (b, a->used)) != MP_OKAY) {
	962	return res;
	963	}
	964	}
	965
	966	/* zero b and copy the parameters over */
	967	{
	968	register mp_digit tmpa, tmpb;
	969
	970	/* pointer aliases */
	971
	972	/* source */
	973	tmpa = a->dp;
	974
	975	/* destination */
	976	tmpb = b->dp;
	977
	978	/* copy all the digits */
	979	for (n = 0; n < a->used; n++) {
	980	tmpb++ = tmpa++;
	981	}
	982
	983	/* clear high digits */
	984	for (; n < b->used; n++) {
	985	*tmpb++ = 0;
	986	}
	987	}
	988
	989	/* copy used count and sign */
	990	b->used = a->used;
	991	b->sign = a->sign;
	992	return MP_OKAY;
	993	}
	994
	995	/* returns the number of bits in an int */
	996	int
	997	mp_count_bits (const mp_int * a)
	998	{
	999	int r;
	1000	mp_digit q;
	1001
	1002	/* shortcut */
	1003	if (a->used == 0) {
	1004	return 0;
	1005	}
	1006
	1007	/* get number of digits and add that */
	1008	r = (a->used - 1) * DIGIT_BIT;
	1009
	1010	/* take the last digit and count the bits in it */
	1011	q = a->dp[a->used - 1];
	1012	while (q > ((mp_digit) 0)) {
	1013	++r;
	1014	q >>= ((mp_digit) 1);
	1015	}
	1016	return r;
	1017	}
	1018
	1019	/* integer signed division.
	1020	* c*b + d == a [e.g. a/b, c=quotient, d=remainder]
	1021	* HAC pp.598 Algorithm 14.20
	1022	*
	1023	* Note that the description in HAC is horribly
	1024	* incomplete. For example, it doesn't consider
	1025	* the case where digits are removed from 'x' in
	1026	* the inner loop. It also doesn't consider the
	1027	* case that y has fewer than three digits, etc..
	1028	*
	1029	* The overall algorithm is as described as
	1030	* 14.20 from HAC but fixed to treat these cases.
	1031	*/
	1032	int mp_div (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
	1033	{
	1034	mp_int q, x, y, t1, t2;
	1035	int res, n, t, i, norm, neg;
	1036
	1037	/* is divisor zero ? */
	1038	if (mp_iszero (b) == 1) {
	1039	return MP_VAL;
	1040	}
	1041
	1042	/* if a < b then q=0, r = a */
	1043	if (mp_cmp_mag (a, b) == MP_LT) {
	1044	if (d != NULL) {
	1045	res = mp_copy (a, d);
	1046	} else {
	1047	res = MP_OKAY;
	1048	}
	1049	if (c != NULL) {
	1050	mp_zero (c);
	1051	}
	1052	return res;
	1053	}
	1054
	1055	if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
	1056	return res;
	1057	}
	1058	q.used = a->used + 2;
	1059
	1060	if ((res = mp_init (&t1)) != MP_OKAY) {
	1061	goto __Q;
	1062	}
	1063
	1064	if ((res = mp_init (&t2)) != MP_OKAY) {
	1065	goto __T1;
	1066	}
	1067
	1068	if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
	1069	goto __T2;
	1070	}
	1071
	1072	if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
	1073	goto __X;
	1074	}
	1075
	1076	/* fix the sign */
	1077	neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
	1078	x.sign = y.sign = MP_ZPOS;
	1079
	1080	/* normalize both x and y, ensure that y >= b/2, [b == 2*DIGIT_BIT] /
	1081	norm = mp_count_bits(&y) % DIGIT_BIT;
	1082	if (norm < DIGIT_BIT-1) {
	1083	norm = (DIGIT_BIT-1) - norm;
	1084	if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
	1085	goto __Y;
	1086	}
	1087	if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
	1088	goto __Y;
	1089	}
	1090	} else {
	1091	norm = 0;
	1092	}
	1093
	1094	/* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
	1095	n = x.used - 1;
	1096	t = y.used - 1;
	1097
	1098	/* while (x >= ybn-t) do { q[n-t] += 1; x -= yb*{n-t} } /
	1099	if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = yb{n-t} /
	1100	goto __Y;
	1101	}
	1102
	1103	while (mp_cmp (&x, &y) != MP_LT) {
	1104	++(q.dp[n - t]);
	1105	if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
	1106	goto __Y;
	1107	}
	1108	}
	1109
	1110	/* reset y by shifting it back down */
	1111	mp_rshd (&y, n - t);
	1112
	1113	/* step 3. for i from n down to (t + 1) */
	1114	for (i = n; i >= (t + 1); i--) {
	1115	if (i > x.used) {
	1116	continue;
	1117	}
	1118
	1119	/* step 3.1 if xi == yt then set q{i-t-1} to b-1,
	1120	* otherwise set q{i-t-1} to (xib + x{i-1})/yt /
	1121	if (x.dp[i] == y.dp[t]) {
	1122	q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
	1123	} else {
	1124	mp_word tmp;
	1125	tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
	1126	tmp \|= ((mp_word) x.dp[i - 1]);
	1127	tmp /= ((mp_word) y.dp[t]);
	1128	if (tmp > (mp_word) MP_MASK)
	1129	tmp = MP_MASK;
	1130	q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
	1131	}
	1132
	1133	/* while (q{i-t-1} * (yt * b + y{t-1})) >
	1134	xi * b*2 + xi-1 b + xi-2
	1135
	1136	do q{i-t-1} -= 1;
	1137	*/
	1138	q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
	1139	do {
	1140	q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
	1141
	1142	/* find left hand */
	1143	mp_zero (&t1);
	1144	t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
	1145	t1.dp[1] = y.dp[t];
	1146	t1.used = 2;
	1147	if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
	1148	goto __Y;
	1149	}
	1150
	1151	/* find right hand */
	1152	t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
	1153	t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
	1154	t2.dp[2] = x.dp[i];
	1155	t2.used = 3;
	1156	} while (mp_cmp_mag(&t1, &t2) == MP_GT);
	1157
	1158	/* step 3.3 x = x - q{i-t-1} * y * b*{i-t-1} /
	1159	if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
	1160	goto __Y;
	1161	}
	1162
	1163	if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
	1164	goto __Y;
	1165	}
	1166
	1167	if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
	1168	goto __Y;
	1169	}
	1170
	1171	/* if x < 0 then { x = x + yb{i-t-1}; q{i-t-1} -= 1; } /
	1172	if (x.sign == MP_NEG) {
	1173	if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
	1174	goto __Y;
	1175	}
	1176	if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
	1177	goto __Y;
	1178	}
	1179	if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
	1180	goto __Y;
	1181	}
	1182
	1183	q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
	1184	}
	1185	}
	1186
	1187	/* now q is the quotient and x is the remainder
	1188	* [which we have to normalize]
	1189	*/
	1190
	1191	/* get sign before writing to c */
	1192	x.sign = x.used == 0 ? MP_ZPOS : a->sign;
	1193
	1194	if (c != NULL) {
	1195	mp_clamp (&q);
	1196	mp_exch (&q, c);
	1197	c->sign = neg;
	1198	}
	1199
	1200	if (d != NULL) {
	1201	mp_div_2d (&x, norm, &x, NULL);
	1202	mp_exch (&x, d);
	1203	}
	1204
	1205	res = MP_OKAY;
	1206
	1207	__Y:mp_clear (&y);
	1208	__X:mp_clear (&x);
	1209	__T2:mp_clear (&t2);
	1210	__T1:mp_clear (&t1);
	1211	__Q:mp_clear (&q);
	1212	return res;
	1213	}
	1214
	1215	/* b = a/2 */
	1216	int mp_div_2(const mp_int * a, mp_int * b)
	1217	{
	1218	int x, res, oldused;
	1219
	1220	/* copy */
	1221	if (b->alloc < a->used) {
	1222	if ((res = mp_grow (b, a->used)) != MP_OKAY) {
	1223	return res;
	1224	}
	1225	}
	1226
	1227	oldused = b->used;
	1228	b->used = a->used;
	1229	{
	1230	register mp_digit r, rr, tmpa, tmpb;
	1231
	1232	/* source alias */
	1233	tmpa = a->dp + b->used - 1;
	1234
	1235	/* dest alias */
	1236	tmpb = b->dp + b->used - 1;
	1237
	1238	/* carry */
	1239	r = 0;
	1240	for (x = b->used - 1; x >= 0; x--) {
	1241	/* get the carry for the next iteration */
	1242	rr = *tmpa & 1;
	1243
	1244	/* shift the current digit, add in carry and store */
	1245	tmpb-- = (tmpa-- >> 1) \| (r << (DIGIT_BIT - 1));
	1246
	1247	/* forward carry to next iteration */
	1248	r = rr;
	1249	}
	1250
	1251	/* zero excess digits */
	1252	tmpb = b->dp + b->used;
	1253	for (x = b->used; x < oldused; x++) {
	1254	*tmpb++ = 0;
	1255	}
	1256	}
	1257	b->sign = a->sign;
	1258	mp_clamp (b);
	1259	return MP_OKAY;
	1260	}
	1261
	1262	/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
	1263	int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
	1264	{
	1265	mp_digit D, r, rr;
	1266	int x, res;
	1267	mp_int t;
	1268
	1269
	1270	/* if the shift count is <= 0 then we do no work */
	1271	if (b <= 0) {
	1272	res = mp_copy (a, c);
	1273	if (d != NULL) {
	1274	mp_zero (d);
	1275	}
	1276	return res;
	1277	}
	1278
	1279	if ((res = mp_init (&t)) != MP_OKAY) {
	1280	return res;
	1281	}
	1282
	1283	/* get the remainder */
	1284	if (d != NULL) {
	1285	if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
	1286	mp_clear (&t);
	1287	return res;
	1288	}
	1289	}
	1290
	1291	/* copy */
	1292	if ((res = mp_copy (a, c)) != MP_OKAY) {
	1293	mp_clear (&t);
	1294	return res;
	1295	}
	1296
	1297	/* shift by as many digits in the bit count */
	1298	if (b >= DIGIT_BIT) {
	1299	mp_rshd (c, b / DIGIT_BIT);
	1300	}
	1301
	1302	/* shift any bit count < DIGIT_BIT */
	1303	D = (mp_digit) (b % DIGIT_BIT);
	1304	if (D != 0) {
	1305	register mp_digit *tmpc, mask, shift;
	1306
	1307	/* mask */
	1308	mask = (((mp_digit)1) << D) - 1;
	1309
	1310	/* shift for lsb */
	1311	shift = DIGIT_BIT - D;
	1312
	1313	/* alias */
	1314	tmpc = c->dp + (c->used - 1);
	1315
	1316	/* carry */
	1317	r = 0;
	1318	for (x = c->used - 1; x >= 0; x--) {
	1319	/* get the lower bits of this word in a temp */
	1320	rr = *tmpc & mask;
	1321
	1322	/* shift the current word and mix in the carry bits from the previous word */
	1323	tmpc = (tmpc >> D) \| (r << shift);
	1324	--tmpc;
	1325
	1326	/* set the carry to the carry bits of the current word found above */
	1327	r = rr;
	1328	}
	1329	}
	1330	mp_clamp (c);
	1331	if (d != NULL) {
	1332	mp_exch (&t, d);
	1333	}
	1334	mp_clear (&t);
	1335	return MP_OKAY;
	1336	}
	1337
	1338	static int s_is_power_of_two(mp_digit b, int *p)
	1339	{
	1340	int x;
	1341
	1342	for (x = 1; x < DIGIT_BIT; x++) {
	1343	if (b == (((mp_digit)1)<<x)) {
	1344	*p = x;
	1345	return 1;
	1346	}
	1347	}
	1348	return 0;
	1349	}
	1350
	1351	/* single digit division (based on routine from MPI) */
	1352	int mp_div_d (const mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
	1353	{
	1354	mp_int q;
	1355	mp_word w;
	1356	mp_digit t;
	1357	int res, ix;
	1358
	1359	/* cannot divide by zero */
	1360	if (b == 0) {
	1361	return MP_VAL;
	1362	}
	1363
	1364	/* quick outs */
	1365	if (b == 1 \|\| mp_iszero(a) == 1) {
	1366	if (d != NULL) {
	1367	*d = 0;
	1368	}
	1369	if (c != NULL) {
	1370	return mp_copy(a, c);
	1371	}
	1372	return MP_OKAY;
	1373	}
	1374
	1375	/* power of two ? */
	1376	if (s_is_power_of_two(b, &ix) == 1) {
	1377	if (d != NULL) {
	1378	*d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
	1379	}
	1380	if (c != NULL) {
	1381	return mp_div_2d(a, ix, c, NULL);
	1382	}
	1383	return MP_OKAY;
	1384	}
	1385
	1386	/* no easy answer [c'est la vie]. Just division */
	1387	if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
	1388	return res;
	1389	}
	1390
	1391	q.used = a->used;
	1392	q.sign = a->sign;
	1393	w = 0;
	1394	for (ix = a->used - 1; ix >= 0; ix--) {
	1395	w = (w << ((mp_word)DIGIT_BIT)) \| ((mp_word)a->dp[ix]);
	1396
	1397	if (w >= b) {
	1398	t = (mp_digit)(w / b);
	1399	w -= ((mp_word)t) * ((mp_word)b);
	1400	} else {
	1401	t = 0;
	1402	}
	1403	q.dp[ix] = t;
	1404	}
	1405
	1406	if (d != NULL) {
	1407	*d = (mp_digit)w;
	1408	}
	1409
	1410	if (c != NULL) {
	1411	mp_clamp(&q);
	1412	mp_exch(&q, c);
	1413	}
	1414	mp_clear(&q);
	1415
	1416	return res;
	1417	}
	1418
	1419	/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
	1420	*
	1421	* Based on algorithm from the paper
	1422	*
	1423	* "Generating Efficient Primes for Discrete Log Cryptosystems"
	1424	* Chae Hoon Lim, Pil Loong Lee,
	1425	* POSTECH Information Research Laboratories
	1426	*
	1427	* The modulus must be of a special format [see manual]
	1428	*
	1429	* Has been modified to use algorithm 7.10 from the LTM book instead
	1430	*
	1431	* Input x must be in the range 0 <= x <= (n-1)**2
	1432	*/
	1433	int
	1434	mp_dr_reduce (mp_int * x, const mp_int * n, mp_digit k)
	1435	{
	1436	int err, i, m;
	1437	mp_word r;
	1438	mp_digit mu, tmpx1, tmpx2;
	1439
	1440	/* m = digits in modulus */
	1441	m = n->used;
	1442
	1443	/* ensure that "x" has at least 2m digits */
	1444	if (x->alloc < m + m) {
	1445	if ((err = mp_grow (x, m + m)) != MP_OKAY) {
	1446	return err;
	1447	}
	1448	}
	1449
	1450	/* top of loop, this is where the code resumes if
	1451	* another reduction pass is required.
	1452	*/
	1453	top:
	1454	/* aliases for digits */
	1455	/* alias for lower half of x */
	1456	tmpx1 = x->dp;
	1457
	1458	/* alias for upper half of x, or x/B*m /
	1459	tmpx2 = x->dp + m;
	1460
	1461	/* set carry to zero */
	1462	mu = 0;
	1463
	1464	/* compute (x mod B*m) + k [x/B*m] inline and inplace /
	1465	for (i = 0; i < m; i++) {
	1466	r = ((mp_word)tmpx2++) ((mp_word)k) + *tmpx1 + mu;
	1467	*tmpx1++ = (mp_digit)(r & MP_MASK);
	1468	mu = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
	1469	}
	1470
	1471	/* set final carry */
	1472	*tmpx1++ = mu;
	1473
	1474	/* zero words above m */
	1475	for (i = m + 1; i < x->used; i++) {
	1476	*tmpx1++ = 0;
	1477	}
	1478
	1479	/* clamp, sub and return */
	1480	mp_clamp (x);
	1481
	1482	/* if x >= n then subtract and reduce again
	1483	* Each successive "recursion" makes the input smaller and smaller.
	1484	*/
	1485	if (mp_cmp_mag (x, n) != MP_LT) {
	1486	s_mp_sub(x, n, x);
	1487	goto top;
	1488	}
	1489	return MP_OKAY;
	1490	}
	1491
	1492	/* determines the setup value */
	1493	void mp_dr_setup(const mp_int a, mp_digit d)
	1494	{
	1495	/* the casts are required if DIGIT_BIT is one less than
	1496	* the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
	1497	*/
	1498	*d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
	1499	((mp_word)a->dp[0]));
	1500	}
	1501
	1502	/* swap the elements of two integers, for cases where you can't simply swap the
	1503	* mp_int pointers around
	1504	*/
	1505	void
	1506	mp_exch (mp_int * a, mp_int * b)
	1507	{
	1508	mp_int t;
	1509
	1510	t = *a;
	1511	a = b;
	1512	*b = t;
	1513	}
	1514
	1515	/* this is a shell function that calls either the normal or Montgomery
	1516	* exptmod functions. Originally the call to the montgomery code was
	1517	* embedded in the normal function but that wasted a lot of stack space
	1518	* for nothing (since 99% of the time the Montgomery code would be called)
	1519	*/
	1520	int mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
	1521	{
	1522	int dr;
	1523
	1524	/* modulus P must be positive */
	1525	if (P->sign == MP_NEG) {
	1526	return MP_VAL;
	1527	}
	1528
	1529	/* if exponent X is negative we have to recurse */
	1530	if (X->sign == MP_NEG) {
	1531	mp_int tmpG, tmpX;
	1532	int err;
	1533
	1534	/* first compute 1/G mod P */
	1535	if ((err = mp_init(&tmpG)) != MP_OKAY) {
	1536	return err;
	1537	}
	1538	if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
	1539	mp_clear(&tmpG);
	1540	return err;
	1541	}
	1542
	1543	/* now get \|X\| */
	1544	if ((err = mp_init(&tmpX)) != MP_OKAY) {
	1545	mp_clear(&tmpG);
	1546	return err;
	1547	}
	1548	if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
	1549	mp_clear_multi(&tmpG, &tmpX, NULL);
	1550	return err;
	1551	}
	1552
	1553	/* and now compute (1/G)\|X\| instead of GX [X < 0] */
	1554	err = mp_exptmod(&tmpG, &tmpX, P, Y);
	1555	mp_clear_multi(&tmpG, &tmpX, NULL);
	1556	return err;
	1557	}
	1558
	1559	dr = 0;
	1560
	1561	/* if the modulus is odd or dr != 0 use the fast method */
	1562	if (mp_isodd (P) == 1 \|\| dr != 0) {
	1563	return mp_exptmod_fast (G, X, P, Y, dr);
	1564	} else {
	1565	/* otherwise use the generic Barrett reduction technique */
	1566	return s_mp_exptmod (G, X, P, Y);
	1567	}
	1568	}
	1569
	1570	/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
	1571	*
	1572	* Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
	1573	* The value of k changes based on the size of the exponent.
	1574	*
	1575	* Uses Montgomery or Diminished Radix reduction [whichever appropriate]
	1576	*/
	1577
	1578	int
	1579	mp_exptmod_fast (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y, int redmode)
	1580	{
	1581	mp_int M[256], res;
	1582	mp_digit buf, mp;
	1583	int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
	1584
	1585	/* use a pointer to the reduction algorithm. This allows us to use
	1586	* one of many reduction algorithms without modding the guts of
	1587	* the code with if statements everywhere.
	1588	*/
	1589	int (redux)(mp_int,const mp_int*,mp_digit);
	1590
	1591	/* find window size */
	1592	x = mp_count_bits (X);
	1593	if (x <= 7) {
	1594	winsize = 2;
	1595	} else if (x <= 36) {
	1596	winsize = 3;
	1597	} else if (x <= 140) {
	1598	winsize = 4;
	1599	} else if (x <= 450) {
	1600	winsize = 5;
	1601	} else if (x <= 1303) {
	1602	winsize = 6;
	1603	} else if (x <= 3529) {
	1604	winsize = 7;
	1605	} else {
	1606	winsize = 8;
	1607	}
	1608
	1609	/* init M array */
	1610	/* init first cell */
	1611	if ((err = mp_init(&M[1])) != MP_OKAY) {
	1612	return err;
	1613	}
	1614
	1615	/* now init the second half of the array */
	1616	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
	1617	if ((err = mp_init(&M[x])) != MP_OKAY) {
	1618	for (y = 1<<(winsize-1); y < x; y++) {
	1619	mp_clear (&M[y]);
	1620	}
	1621	mp_clear(&M[1]);
	1622	return err;
	1623	}
	1624	}
	1625
	1626	/* determine and setup reduction code */
	1627	if (redmode == 0) {
	1628	/* now setup montgomery */
	1629	if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
	1630	goto __M;
	1631	}
	1632
	1633	/* automatically pick the comba one if available (saves quite a few calls/ifs) */
	1634	if (((P->used * 2 + 1) < MP_WARRAY) &&
	1635	P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
	1636	redux = fast_mp_montgomery_reduce;
	1637	} else {
	1638	/* use slower baseline Montgomery method */
	1639	redux = mp_montgomery_reduce;
	1640	}
	1641	} else if (redmode == 1) {
	1642	/* setup DR reduction for moduli of the form B*k - b /
	1643	mp_dr_setup(P, &mp);
	1644	redux = mp_dr_reduce;
	1645	} else {
	1646	/* setup DR reduction for moduli of the form 2*k - b /
	1647	if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
	1648	goto __M;
	1649	}
	1650	redux = mp_reduce_2k;
	1651	}
	1652
	1653	/* setup result */
	1654	if ((err = mp_init (&res)) != MP_OKAY) {
	1655	goto __M;
	1656	}
	1657
	1658	/* create M table
	1659	*
	1660
	1661	*
	1662	* The first half of the table is not computed though accept for M[0] and M[1]
	1663	*/
	1664
	1665	if (redmode == 0) {
	1666	/* now we need R mod m */
	1667	if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
	1668	goto __RES;
	1669	}
	1670
	1671	/* now set M[1] to G * R mod m */
	1672	if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
	1673	goto __RES;
	1674	}
	1675	} else {
	1676	mp_set(&res, 1);
	1677	if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
	1678	goto __RES;
	1679	}
	1680	}
	1681
	1682	/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
	1683	if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
	1684	goto __RES;
	1685	}
	1686
	1687	for (x = 0; x < (winsize - 1); x++) {
	1688	if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
	1689	goto __RES;
	1690	}
	1691	if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
	1692	goto __RES;
	1693	}
	1694	}
	1695
	1696	/* create upper table */
	1697	for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
	1698	if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
	1699	goto __RES;
	1700	}
	1701	if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
	1702	goto __RES;
	1703	}
	1704	}
	1705
	1706	/* set initial mode and bit cnt */
	1707	mode = 0;
	1708	bitcnt = 1;
	1709	buf = 0;
	1710	digidx = X->used - 1;
	1711	bitcpy = 0;
	1712	bitbuf = 0;
	1713
	1714	for (;;) {
	1715	/* grab next digit as required */
	1716	if (--bitcnt == 0) {
	1717	/* if digidx == -1 we are out of digits so break */
	1718	if (digidx == -1) {
	1719	break;
	1720	}
	1721	/* read next digit and reset bitcnt */
	1722	buf = X->dp[digidx--];
	1723	bitcnt = DIGIT_BIT;
	1724	}
	1725
	1726	/* grab the next msb from the exponent */
	1727	y = (buf >> (DIGIT_BIT - 1)) & 1;
	1728	buf <<= (mp_digit)1;
	1729
	1730	/* if the bit is zero and mode == 0 then we ignore it
	1731	* These represent the leading zero bits before the first 1 bit
	1732	* in the exponent. Technically this opt is not required but it
	1733	* does lower the # of trivial squaring/reductions used
	1734	*/
	1735	if (mode == 0 && y == 0) {
	1736	continue;
	1737	}
	1738
	1739	/* if the bit is zero and mode == 1 then we square */
	1740	if (mode == 1 && y == 0) {
	1741	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	1742	goto __RES;
	1743	}
	1744	if ((err = redux (&res, P, mp)) != MP_OKAY) {
	1745	goto __RES;
	1746	}
	1747	continue;
	1748	}
	1749
	1750	/* else we add it to the window */
	1751	bitbuf \|= (y << (winsize - ++bitcpy));
	1752	mode = 2;
	1753
	1754	if (bitcpy == winsize) {
	1755	/* ok window is filled so square as required and multiply */
	1756	/* square first */
	1757	for (x = 0; x < winsize; x++) {
	1758	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	1759	goto __RES;
	1760	}
	1761	if ((err = redux (&res, P, mp)) != MP_OKAY) {
	1762	goto __RES;
	1763	}
	1764	}
	1765
	1766	/* then multiply */
	1767	if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
	1768	goto __RES;
	1769	}
	1770	if ((err = redux (&res, P, mp)) != MP_OKAY) {
	1771	goto __RES;
	1772	}
	1773
	1774	/* empty window and reset */
	1775	bitcpy = 0;
	1776	bitbuf = 0;
	1777	mode = 1;
	1778	}
	1779	}
	1780
	1781	/* if bits remain then square/multiply */
	1782	if (mode == 2 && bitcpy > 0) {
	1783	/* square then multiply if the bit is set */
	1784	for (x = 0; x < bitcpy; x++) {
	1785	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	1786	goto __RES;
	1787	}
	1788	if ((err = redux (&res, P, mp)) != MP_OKAY) {
	1789	goto __RES;
	1790	}
	1791
	1792	/* get next bit of the window */
	1793	bitbuf <<= 1;
	1794	if ((bitbuf & (1 << winsize)) != 0) {
	1795	/* then multiply */
	1796	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
	1797	goto __RES;
	1798	}
	1799	if ((err = redux (&res, P, mp)) != MP_OKAY) {
	1800	goto __RES;
	1801	}
	1802	}
	1803	}
	1804	}
	1805
	1806	if (redmode == 0) {
	1807	/* fixup result if Montgomery reduction is used
	1808	* recall that any value in a Montgomery system is
	1809	* actually multiplied by R mod n. So we have
	1810	* to reduce one more time to cancel out the factor
	1811	* of R.
	1812	*/
	1813	if ((err = redux(&res, P, mp)) != MP_OKAY) {
	1814	goto __RES;
	1815	}
	1816	}
	1817
	1818	/* swap res with Y */
	1819	mp_exch (&res, Y);
	1820	err = MP_OKAY;
	1821	__RES:mp_clear (&res);
	1822	__M:
	1823	mp_clear(&M[1]);
	1824	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
	1825	mp_clear (&M[x]);
	1826	}
	1827	return err;
	1828	}
	1829
	1830	/* Greatest Common Divisor using the binary method */
	1831	int mp_gcd (const mp_int * a, const mp_int * b, mp_int * c)
	1832	{
	1833	mp_int u, v;
	1834	int k, u_lsb, v_lsb, res;
	1835
	1836	/* either zero than gcd is the largest */
	1837	if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
	1838	return mp_abs (b, c);
	1839	}
	1840	if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
	1841	return mp_abs (a, c);
	1842	}
	1843
	1844	/* optimized. At this point if a == 0 then
	1845	* b must equal zero too
	1846	*/
	1847	if (mp_iszero (a) == 1) {
	1848	mp_zero(c);
	1849	return MP_OKAY;
	1850	}
	1851
	1852	/* get copies of a and b we can modify */
	1853	if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
	1854	return res;
	1855	}
	1856
	1857	if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
	1858	goto __U;
	1859	}
	1860
	1861	/* must be positive for the remainder of the algorithm */
	1862	u.sign = v.sign = MP_ZPOS;
	1863
	1864	/* B1. Find the common power of two for u and v */
	1865	u_lsb = mp_cnt_lsb(&u);
	1866	v_lsb = mp_cnt_lsb(&v);
	1867	k = MIN(u_lsb, v_lsb);
	1868
	1869	if (k > 0) {
	1870	/* divide the power of two out */
	1871	if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
	1872	goto __V;
	1873	}
	1874
	1875	if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
	1876	goto __V;
	1877	}
	1878	}
	1879
	1880	/* divide any remaining factors of two out */
	1881	if (u_lsb != k) {
	1882	if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
	1883	goto __V;
	1884	}
	1885	}
	1886
	1887	if (v_lsb != k) {
	1888	if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
	1889	goto __V;
	1890	}
	1891	}
	1892
	1893	while (mp_iszero(&v) == 0) {
	1894	/* make sure v is the largest */
	1895	if (mp_cmp_mag(&u, &v) == MP_GT) {
	1896	/* swap u and v to make sure v is >= u */
	1897	mp_exch(&u, &v);
	1898	}
	1899
	1900	/* subtract smallest from largest */
	1901	if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
	1902	goto __V;
	1903	}
	1904
	1905	/* Divide out all factors of two */
	1906	if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
	1907	goto __V;
	1908	}
	1909	}
	1910
	1911	/* multiply by 2*k which we divided out at the beginning /
	1912	if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
	1913	goto __V;
	1914	}
	1915	c->sign = MP_ZPOS;
	1916	res = MP_OKAY;
	1917	__V:mp_clear (&u);
	1918	__U:mp_clear (&v);
	1919	return res;
	1920	}
	1921
	1922	/* get the lower 32-bits of an mp_int */
	1923	unsigned long mp_get_int(const mp_int * a)
	1924	{
	1925	int i;
	1926	unsigned long res;
	1927
	1928	if (a->used == 0) {
	1929	return 0;
	1930	}
	1931
	1932	/* get number of digits of the lsb we have to read */
	1933	i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
	1934
	1935	/* get most significant digit of result */
	1936	res = DIGIT(a,i);
	1937
	1938	while (--i >= 0) {
	1939	res = (res << DIGIT_BIT) \| DIGIT(a,i);
	1940	}
	1941
	1942	/* force result to 32-bits always so it is consistent on non 32-bit platforms */
	1943	return res & 0xFFFFFFFFUL;
	1944	}
	1945
	1946	/* grow as required */
	1947	int mp_grow (mp_int * a, int size)
	1948	{
	1949	int i;
	1950	mp_digit *tmp;
	1951
	1952	/* if the alloc size is smaller alloc more ram */
	1953	if (a->alloc < size) {
	1954	/* ensure there are always at least MP_PREC digits extra on top */
	1955	size += (MP_PREC * 2) - (size % MP_PREC);
	1956
	1957	/* reallocate the array a->dp
	1958	*
	1959	* We store the return in a temporary variable
	1960	* in case the operation failed we don't want
	1961	* to overwrite the dp member of a.
	1962	*/
	1963	tmp = realloc (a->dp, sizeof (mp_digit) * size);
	1964	if (tmp == NULL) {
	1965	/* reallocation failed but "a" is still valid [can be freed] */
	1966	return MP_MEM;
	1967	}
	1968
	1969	/* reallocation succeeded so set a->dp */
	1970	a->dp = tmp;
	1971
	1972	/* zero excess digits */
	1973	i = a->alloc;
	1974	a->alloc = size;
	1975	for (; i < a->alloc; i++) {
	1976	a->dp[i] = 0;
	1977	}
	1978	}
	1979	return MP_OKAY;
	1980	}
	1981
	1982	/* init a new mp_int */
	1983	int mp_init (mp_int * a)
	1984	{
	1985	int i;
	1986
	1987	/* allocate memory required and clear it */
	1988	a->dp = malloc (sizeof (mp_digit) * MP_PREC);
	1989	if (a->dp == NULL) {
	1990	return MP_MEM;
	1991	}
	1992
	1993	/* set the digits to zero */
	1994	for (i = 0; i < MP_PREC; i++) {
	1995	a->dp[i] = 0;
	1996	}
	1997
	1998	/* set the used to zero, allocated digits to the default precision
	1999	* and sign to positive */
	2000	a->used = 0;
	2001	a->alloc = MP_PREC;
	2002	a->sign = MP_ZPOS;
	2003
	2004	return MP_OKAY;
	2005	}
	2006
	2007	/* creates "a" then copies b into it */
	2008	int mp_init_copy (mp_int * a, const mp_int * b)
	2009	{
	2010	int res;
	2011
	2012	if ((res = mp_init (a)) != MP_OKAY) {
	2013	return res;
	2014	}
	2015	return mp_copy (b, a);
	2016	}
	2017
	2018	int mp_init_multi(mp_int *mp, ...)
	2019	{
	2020	mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
	2021	int n = 0; /* Number of ok inits */
	2022	mp_int* cur_arg = mp;
	2023	va_list args;
	2024
	2025	va_start(args, mp); /* init args to next argument from caller */
	2026	while (cur_arg != NULL) {
	2027	if (mp_init(cur_arg) != MP_OKAY) {
	2028	/* Oops - error! Back-track and mp_clear what we already
	2029	succeeded in init-ing, then return error.
	2030	*/
	2031	va_list clean_args;
	2032
	2033	/* end the current list */
	2034	va_end(args);
	2035
	2036	/* now start cleaning up */
	2037	cur_arg = mp;
	2038	va_start(clean_args, mp);
	2039	while (n--) {
	2040	mp_clear(cur_arg);
	2041	cur_arg = va_arg(clean_args, mp_int*);
	2042	}
	2043	va_end(clean_args);
	2044	res = MP_MEM;
	2045	break;
	2046	}
	2047	n++;
	2048	cur_arg = va_arg(args, mp_int*);
	2049	}
	2050	va_end(args);
	2051	return res; /* Assumed ok, if error flagged above. */
	2052	}
	2053
	2054	/* init an mp_init for a given size */
	2055	int mp_init_size (mp_int * a, int size)
	2056	{
	2057	int x;
	2058
	2059	/* pad size so there are always extra digits */
	2060	size += (MP_PREC * 2) - (size % MP_PREC);
	2061
	2062	/* alloc mem */
	2063	a->dp = malloc (sizeof (mp_digit) * size);
	2064	if (a->dp == NULL) {
	2065	return MP_MEM;
	2066	}
	2067
	2068	/* set the members */
	2069	a->used = 0;
	2070	a->alloc = size;
	2071	a->sign = MP_ZPOS;
	2072
	2073	/* zero the digits */
	2074	for (x = 0; x < size; x++) {
	2075	a->dp[x] = 0;
	2076	}
	2077
	2078	return MP_OKAY;
	2079	}
	2080
	2081	/* hac 14.61, pp608 */
	2082	int mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
	2083	{
	2084	/* b cannot be negative */
	2085	if (b->sign == MP_NEG \|\| mp_iszero(b) == 1) {
	2086	return MP_VAL;
	2087	}
	2088
	2089	/* if the modulus is odd we can use a faster routine instead */
	2090	if (mp_isodd (b) == 1) {
	2091	return fast_mp_invmod (a, b, c);
	2092	}
	2093
	2094	return mp_invmod_slow(a, b, c);
	2095	}
	2096
	2097	/* hac 14.61, pp608 */
	2098	int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c)
	2099	{
	2100	mp_int x, y, u, v, A, B, C, D;
	2101	int res;
	2102
	2103	/* b cannot be negative */
	2104	if (b->sign == MP_NEG \|\| mp_iszero(b) == 1) {
	2105	return MP_VAL;
	2106	}
	2107
	2108	/* init temps */
	2109	if ((res = mp_init_multi(&x, &y, &u, &v,
	2110	&A, &B, &C, &D, NULL)) != MP_OKAY) {
	2111	return res;
	2112	}
	2113
	2114	/* x = a, y = b */
	2115	if ((res = mp_copy (a, &x)) != MP_OKAY) {
	2116	goto __ERR;
	2117	}
	2118	if ((res = mp_copy (b, &y)) != MP_OKAY) {
	2119	goto __ERR;
	2120	}
	2121
	2122	/* 2. [modified] if x,y are both even then return an error! */
	2123	if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
	2124	res = MP_VAL;
	2125	goto __ERR;
	2126	}
	2127
	2128	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
	2129	if ((res = mp_copy (&x, &u)) != MP_OKAY) {
	2130	goto __ERR;
	2131	}
	2132	if ((res = mp_copy (&y, &v)) != MP_OKAY) {
	2133	goto __ERR;
	2134	}
	2135	mp_set (&A, 1);
	2136	mp_set (&D, 1);
	2137
	2138	top:
	2139	/* 4. while u is even do */
	2140	while (mp_iseven (&u) == 1) {
	2141	/* 4.1 u = u/2 */
	2142	if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
	2143	goto __ERR;
	2144	}
	2145	/* 4.2 if A or B is odd then */
	2146	if (mp_isodd (&A) == 1 \|\| mp_isodd (&B) == 1) {
	2147	/* A = (A+y)/2, B = (B-x)/2 */
	2148	if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
	2149	goto __ERR;
	2150	}
	2151	if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
	2152	goto __ERR;
	2153	}
	2154	}
	2155	/* A = A/2, B = B/2 */
	2156	if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
	2157	goto __ERR;
	2158	}
	2159	if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
	2160	goto __ERR;
	2161	}
	2162	}
	2163
	2164	/* 5. while v is even do */
	2165	while (mp_iseven (&v) == 1) {
	2166	/* 5.1 v = v/2 */
	2167	if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
	2168	goto __ERR;
	2169	}
	2170	/* 5.2 if C or D is odd then */
	2171	if (mp_isodd (&C) == 1 \|\| mp_isodd (&D) == 1) {
	2172	/* C = (C+y)/2, D = (D-x)/2 */
	2173	if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
	2174	goto __ERR;
	2175	}
	2176	if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
	2177	goto __ERR;
	2178	}
	2179	}
	2180	/* C = C/2, D = D/2 */
	2181	if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
	2182	goto __ERR;
	2183	}
	2184	if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
	2185	goto __ERR;
	2186	}
	2187	}
	2188
	2189	/* 6. if u >= v then */
	2190	if (mp_cmp (&u, &v) != MP_LT) {
	2191	/* u = u - v, A = A - C, B = B - D */
	2192	if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
	2193	goto __ERR;
	2194	}
	2195
	2196	if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
	2197	goto __ERR;
	2198	}
	2199
	2200	if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
	2201	goto __ERR;
	2202	}
	2203	} else {
	2204	/* v - v - u, C = C - A, D = D - B */
	2205	if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
	2206	goto __ERR;
	2207	}
	2208
	2209	if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
	2210	goto __ERR;
	2211	}
	2212
	2213	if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
	2214	goto __ERR;
	2215	}
	2216	}
	2217
	2218	/* if not zero goto step 4 */
	2219	if (mp_iszero (&u) == 0)
	2220	goto top;
	2221
	2222	/* now a = C, b = D, gcd == gv /
	2223
	2224	/* if v != 1 then there is no inverse */
	2225	if (mp_cmp_d (&v, 1) != MP_EQ) {
	2226	res = MP_VAL;
	2227	goto __ERR;
	2228	}
	2229
	2230	/* if its too low */
	2231	while (mp_cmp_d(&C, 0) == MP_LT) {
	2232	if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
	2233	goto __ERR;
	2234	}
	2235	}
	2236
	2237	/* too big */
	2238	while (mp_cmp_mag(&C, b) != MP_LT) {
	2239	if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
	2240	goto __ERR;
	2241	}
	2242	}
	2243
	2244	/* C is now the inverse */
	2245	mp_exch (&C, c);
	2246	res = MP_OKAY;
	2247	__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
	2248	return res;
	2249	}
	2250
	2251	/* c = \|a\| * \|b\| using Karatsuba Multiplication using
	2252	* three half size multiplications
	2253	*
	2254	* Let B represent the radix [e.g. 2**DIGIT_BIT] and
	2255	* let n represent half of the number of digits in
	2256	* the min(a,b)
	2257	*
	2258	* a = a1 * B**n + a0
	2259	* b = b1 * B**n + b0
	2260	*
	2261	* Then, a * b =>
	2262	a1b1 * B*2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) B + a0b0
	2263	*
	2264	* Note that a1b1 and a0b0 are used twice and only need to be
	2265	* computed once. So in total three half size (half # of
	2266	* digit) multiplications are performed, a0b0, a1b1 and
	2267	* (a1-b1)(a0-b0)
	2268	*
	2269	* Note that a multiplication of half the digits requires
	2270	* 1/4th the number of single precision multiplications so in
	2271	* total after one call 25% of the single precision multiplications
	2272	* are saved. Note also that the call to mp_mul can end up back
	2273	* in this function if the a0, a1, b0, or b1 are above the threshold.
	2274	* This is known as divide-and-conquer and leads to the famous
	2275	* O(Nlg(3)) or O(N1.584) work which is asymptotically lower than
	2276	* the standard O(N**2) that the baseline/comba methods use.
	2277	* Generally though the overhead of this method doesn't pay off
	2278	* until a certain size (N ~ 80) is reached.
	2279	*/
	2280	int mp_karatsuba_mul (const mp_int * a, const mp_int * b, mp_int * c)
	2281	{
	2282	mp_int x0, x1, y0, y1, t1, x0y0, x1y1;
	2283	int B, err;
	2284
	2285	/* default the return code to an error */
	2286	err = MP_MEM;
	2287
	2288	/* min # of digits */
	2289	B = MIN (a->used, b->used);
	2290
	2291	/* now divide in two */
	2292	B = B >> 1;
	2293
	2294	/* init copy all the temps */
	2295	if (mp_init_size (&x0, B) != MP_OKAY)
	2296	goto ERR;
	2297	if (mp_init_size (&x1, a->used - B) != MP_OKAY)
	2298	goto X0;
	2299	if (mp_init_size (&y0, B) != MP_OKAY)
	2300	goto X1;
	2301	if (mp_init_size (&y1, b->used - B) != MP_OKAY)
	2302	goto Y0;
	2303
	2304	/* init temps */
	2305	if (mp_init_size (&t1, B * 2) != MP_OKAY)
	2306	goto Y1;
	2307	if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
	2308	goto T1;
	2309	if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
	2310	goto X0Y0;
	2311
	2312	/* now shift the digits */
	2313	x0.used = y0.used = B;
	2314	x1.used = a->used - B;
	2315	y1.used = b->used - B;
	2316
	2317	{
	2318	register int x;
	2319	register mp_digit tmpa, tmpb, tmpx, tmpy;
	2320
	2321	/* we copy the digits directly instead of using higher level functions
	2322	* since we also need to shift the digits
	2323	*/
	2324	tmpa = a->dp;
	2325	tmpb = b->dp;
	2326
	2327	tmpx = x0.dp;
	2328	tmpy = y0.dp;
	2329	for (x = 0; x < B; x++) {
	2330	tmpx++ = tmpa++;
	2331	tmpy++ = tmpb++;
	2332	}
	2333
	2334	tmpx = x1.dp;
	2335	for (x = B; x < a->used; x++) {
	2336	tmpx++ = tmpa++;
	2337	}
	2338
	2339	tmpy = y1.dp;
	2340	for (x = B; x < b->used; x++) {
	2341	tmpy++ = tmpb++;
	2342	}
	2343	}
	2344
	2345	/* only need to clamp the lower words since by definition the
	2346	* upper words x1/y1 must have a known number of digits
	2347	*/
	2348	mp_clamp (&x0);
	2349	mp_clamp (&y0);
	2350
	2351	/* now calc the products x0y0 and x1y1 */
	2352	/* after this x0 is no longer required, free temp [x0==t2]! */
	2353	if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
	2354	goto X1Y1; /* x0y0 = x0y0 /
	2355	if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
	2356	goto X1Y1; /* x1y1 = x1y1 /
	2357
	2358	/* now calc x1-x0 and y1-y0 */
	2359	if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
	2360	goto X1Y1; /* t1 = x1 - x0 */
	2361	if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
	2362	goto X1Y1; /* t2 = y1 - y0 */
	2363	if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
	2364	goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
	2365
	2366	/* add x0y0 */
	2367	if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
	2368	goto X1Y1; /* t2 = x0y0 + x1y1 */
	2369	if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
	2370	goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)(y1-y0) /
	2371
	2372	/* shift by B */
	2373	if (mp_lshd (&t1, B) != MP_OKAY)
	2374	goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)(y1-y0))<<B /
	2375	if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
	2376	goto X1Y1; /* x1y1 = x1y1 << 2B /
	2377
	2378	if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
	2379	goto X1Y1; /* t1 = x0y0 + t1 */
	2380	if (mp_add (&t1, &x1y1, c) != MP_OKAY)
	2381	goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
	2382
	2383	/* Algorithm succeeded set the return code to MP_OKAY */
	2384	err = MP_OKAY;
	2385
	2386	X1Y1:mp_clear (&x1y1);
	2387	X0Y0:mp_clear (&x0y0);
	2388	T1:mp_clear (&t1);
	2389	Y1:mp_clear (&y1);
	2390	Y0:mp_clear (&y0);
	2391	X1:mp_clear (&x1);
	2392	X0:mp_clear (&x0);
	2393	ERR:
	2394	return err;
	2395	}
	2396
	2397	/* Karatsuba squaring, computes b = a*a using three
	2398	* half size squarings
	2399	*
	2400	* See comments of karatsuba_mul for details. It
	2401	* is essentially the same algorithm but merely
	2402	* tuned to perform recursive squarings.
	2403	*/
	2404	int mp_karatsuba_sqr (const mp_int * a, mp_int * b)
	2405	{
	2406	mp_int x0, x1, t1, t2, x0x0, x1x1;
	2407	int B, err;
	2408
	2409	err = MP_MEM;
	2410
	2411	/* min # of digits */
	2412	B = a->used;
	2413
	2414	/* now divide in two */
	2415	B = B >> 1;
	2416
	2417	/* init copy all the temps */
	2418	if (mp_init_size (&x0, B) != MP_OKAY)
	2419	goto ERR;
	2420	if (mp_init_size (&x1, a->used - B) != MP_OKAY)
	2421	goto X0;
	2422
	2423	/* init temps */
	2424	if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
	2425	goto X1;
	2426	if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
	2427	goto T1;
	2428	if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
	2429	goto T2;
	2430	if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
	2431	goto X0X0;
	2432
	2433	{
	2434	register int x;
	2435	register mp_digit dst, src;
	2436
	2437	src = a->dp;
	2438
	2439	/* now shift the digits */
	2440	dst = x0.dp;
	2441	for (x = 0; x < B; x++) {
	2442	dst++ = src++;
	2443	}
	2444
	2445	dst = x1.dp;
	2446	for (x = B; x < a->used; x++) {
	2447	dst++ = src++;
	2448	}
	2449	}
	2450
	2451	x0.used = B;
	2452	x1.used = a->used - B;
	2453
	2454	mp_clamp (&x0);
	2455
	2456	/* now calc the products x0x0 and x1x1 */
	2457	if (mp_sqr (&x0, &x0x0) != MP_OKAY)
	2458	goto X1X1; /* x0x0 = x0x0 /
	2459	if (mp_sqr (&x1, &x1x1) != MP_OKAY)
	2460	goto X1X1; /* x1x1 = x1x1 /
	2461
	2462	/* now calc (x1-x0)*2 /
	2463	if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
	2464	goto X1X1; /* t1 = x1 - x0 */
	2465	if (mp_sqr (&t1, &t1) != MP_OKAY)
	2466	goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */
	2467
	2468	/* add x0y0 */
	2469	if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
	2470	goto X1X1; /* t2 = x0x0 + x1x1 */
	2471	if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
	2472	goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)(x1-x0) /
	2473
	2474	/* shift by B */
	2475	if (mp_lshd (&t1, B) != MP_OKAY)
	2476	goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)(x1-x0))<<B /
	2477	if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
	2478	goto X1X1; /* x1x1 = x1x1 << 2B /
	2479
	2480	if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
	2481	goto X1X1; /* t1 = x0x0 + t1 */
	2482	if (mp_add (&t1, &x1x1, b) != MP_OKAY)
	2483	goto X1X1; /* t1 = x0x0 + t1 + x1x1 */
	2484
	2485	err = MP_OKAY;
	2486
	2487	X1X1:mp_clear (&x1x1);
	2488	X0X0:mp_clear (&x0x0);
	2489	T2:mp_clear (&t2);
	2490	T1:mp_clear (&t1);
	2491	X1:mp_clear (&x1);
	2492	X0:mp_clear (&x0);
	2493	ERR:
	2494	return err;
	2495	}
	2496
	2497	/* computes least common multiple as \|ab\|/(a, b) /
	2498	int mp_lcm (const mp_int * a, const mp_int * b, mp_int * c)
	2499	{
	2500	int res;
	2501	mp_int t1, t2;
	2502
	2503
	2504	if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
	2505	return res;
	2506	}
	2507
	2508	/* t1 = get the GCD of the two inputs */
	2509	if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
	2510	goto __T;
	2511	}
	2512
	2513	/* divide the smallest by the GCD */
	2514	if (mp_cmp_mag(a, b) == MP_LT) {
	2515	/* store quotient in t2 such that t2 * b is the LCM */
	2516	if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
	2517	goto __T;
	2518	}
	2519	res = mp_mul(b, &t2, c);
	2520	} else {
	2521	/* store quotient in t2 such that t2 * a is the LCM */
	2522	if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
	2523	goto __T;
	2524	}
	2525	res = mp_mul(a, &t2, c);
	2526	}
	2527
	2528	/* fix the sign to positive */
	2529	c->sign = MP_ZPOS;
	2530
	2531	__T:
	2532	mp_clear_multi (&t1, &t2, NULL);
	2533	return res;
	2534	}
	2535
	2536	/* shift left a certain amount of digits */
	2537	int mp_lshd (mp_int * a, int b)
	2538	{
	2539	int x, res;
	2540
	2541	/* if its less than zero return */
	2542	if (b <= 0) {
	2543	return MP_OKAY;
	2544	}
	2545
	2546	/* grow to fit the new digits */
	2547	if (a->alloc < a->used + b) {
	2548	if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
	2549	return res;
	2550	}
	2551	}
	2552
	2553	{
	2554	register mp_digit top, bottom;
	2555
	2556	/* increment the used by the shift amount then copy upwards */
	2557	a->used += b;
	2558
	2559	/* top */
	2560	top = a->dp + a->used - 1;
	2561
	2562	/* base */
	2563	bottom = a->dp + a->used - 1 - b;
	2564
	2565	/* much like mp_rshd this is implemented using a sliding window
	2566	* except the window goes the otherway around. Copying from
	2567	* the bottom to the top. see bn_mp_rshd.c for more info.
	2568	*/
	2569	for (x = a->used - 1; x >= b; x--) {
	2570	top-- = bottom--;
	2571	}
	2572
	2573	/* zero the lower digits */
	2574	top = a->dp;
	2575	for (x = 0; x < b; x++) {
	2576	*top++ = 0;
	2577	}
	2578	}
	2579	return MP_OKAY;
	2580	}
	2581
	2582	/* c = a mod b, 0 <= c < b */
	2583	int
	2584	mp_mod (const mp_int * a, mp_int * b, mp_int * c)
	2585	{
	2586	mp_int t;
	2587	int res;
	2588
	2589	if ((res = mp_init (&t)) != MP_OKAY) {
	2590	return res;
	2591	}
	2592
	2593	if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
	2594	mp_clear (&t);
	2595	return res;
	2596	}
	2597
	2598	if (t.sign != b->sign) {
	2599	res = mp_add (b, &t, c);
	2600	} else {
	2601	res = MP_OKAY;
	2602	mp_exch (&t, c);
	2603	}
	2604
	2605	mp_clear (&t);
	2606	return res;
	2607	}
	2608
	2609	/* calc a value mod 2*b /
	2610	int
	2611	mp_mod_2d (const mp_int * a, int b, mp_int * c)
	2612	{
	2613	int x, res;
	2614
	2615	/* if b is <= 0 then zero the int */
	2616	if (b <= 0) {
	2617	mp_zero (c);
	2618	return MP_OKAY;
	2619	}
	2620
	2621	/* if the modulus is larger than the value than return */
	2622	if (b > a->used * DIGIT_BIT) {
	2623	res = mp_copy (a, c);
	2624	return res;
	2625	}
	2626
	2627	/* copy */
	2628	if ((res = mp_copy (a, c)) != MP_OKAY) {
	2629	return res;
	2630	}
	2631
	2632	/* zero digits above the last digit of the modulus */
	2633	for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
	2634	c->dp[x] = 0;
	2635	}
	2636	/* clear the digit that is not completely outside/inside the modulus */
	2637	c->dp[b / DIGIT_BIT] &= (1 << ((mp_digit)b % DIGIT_BIT)) - 1;
	2638	mp_clamp (c);
	2639	return MP_OKAY;
	2640	}
	2641
	2642	int
	2643	mp_mod_d (const mp_int * a, mp_digit b, mp_digit * c)
	2644	{
	2645	return mp_div_d(a, b, NULL, c);
	2646	}
	2647
	2648	/*
	2649	* shifts with subtractions when the result is greater than b.
	2650	*
	2651	* The method is slightly modified to shift B unconditionally up to just under
	2652	* the leading bit of b. This saves a lot of multiple precision shifting.
	2653	*/
	2654	int mp_montgomery_calc_normalization (mp_int * a, const mp_int * b)
	2655	{
	2656	int x, bits, res;
	2657
	2658	/* how many bits of last digit does b use */
	2659	bits = mp_count_bits (b) % DIGIT_BIT;
	2660
	2661
	2662	if (b->used > 1) {
	2663	if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
	2664	return res;
	2665	}
	2666	} else {
	2667	mp_set(a, 1);
	2668	bits = 1;
	2669	}
	2670
	2671
	2672	/* now compute C = A * B mod b */
	2673	for (x = bits - 1; x < DIGIT_BIT; x++) {
	2674	if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
	2675	return res;
	2676	}
	2677	if (mp_cmp_mag (a, b) != MP_LT) {
	2678	if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
	2679	return res;
	2680	}
	2681	}
	2682	}
	2683
	2684	return MP_OKAY;
	2685	}
	2686
	2687	/* computes xR*-1 == x (mod N) via Montgomery Reduction /
	2688	int
	2689	mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
	2690	{
	2691	int ix, res, digs;
	2692	mp_digit mu;
	2693
	2694	/* can the fast reduction [comba] method be used?
	2695	*
	2696	* Note that unlike in mul you're safely allowed less
	2697	* than the available columns [255 per default] since carries
	2698	* are fixed up in the inner loop.
	2699	*/
	2700	digs = n->used * 2 + 1;
	2701	if ((digs < MP_WARRAY) &&
	2702	n->used <
	2703	(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
	2704	return fast_mp_montgomery_reduce (x, n, rho);
	2705	}
	2706
	2707	/* grow the input as required */
	2708	if (x->alloc < digs) {
	2709	if ((res = mp_grow (x, digs)) != MP_OKAY) {
	2710	return res;
	2711	}
	2712	}
	2713	x->used = digs;
	2714
	2715	for (ix = 0; ix < n->used; ix++) {
	2716	/* mu = ai * rho mod b
	2717	*
	2718	* The value of rho must be precalculated via
	2719	* montgomery_setup() such that
	2720	* it equals -1/n0 mod b this allows the
	2721	* following inner loop to reduce the
	2722	* input one digit at a time
	2723	*/
	2724	mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
	2725
	2726	/* a = a + mu * m * b*i /
	2727	{
	2728	register int iy;
	2729	register mp_digit tmpn, tmpx, u;
	2730	register mp_word r;
	2731
	2732	/* alias for digits of the modulus */
	2733	tmpn = n->dp;
	2734
	2735	/* alias for the digits of x [the input] */
	2736	tmpx = x->dp + ix;
	2737
	2738	/* set the carry to zero */
	2739	u = 0;
	2740
	2741	/* Multiply and add in place */
	2742	for (iy = 0; iy < n->used; iy++) {
	2743	/* compute product and sum */
	2744	r = ((mp_word)mu) * ((mp_word)*tmpn++) +
	2745	((mp_word) u) + ((mp_word) * tmpx);
	2746
	2747	/* get carry */
	2748	u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
	2749
	2750	/* fix digit */
	2751	*tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
	2752	}
	2753	/* At this point the ix'th digit of x should be zero */
	2754
	2755
	2756	/* propagate carries upwards as required*/
	2757	while (u) {
	2758	*tmpx += u;
	2759	u = *tmpx >> DIGIT_BIT;
	2760	*tmpx++ &= MP_MASK;
	2761	}
	2762	}
	2763	}
	2764
	2765	/* at this point the n.used'th least
	2766	* significant digits of x are all zero
	2767	* which means we can shift x to the
	2768	* right by n.used digits and the
	2769	* residue is unchanged.
	2770	*/
	2771
	2772	/* x = x/b*n.used /
	2773	mp_clamp(x);
	2774	mp_rshd (x, n->used);
	2775
	2776	/* if x >= n then x = x - n */
	2777	if (mp_cmp_mag (x, n) != MP_LT) {
	2778	return s_mp_sub (x, n, x);
	2779	}
	2780
	2781	return MP_OKAY;
	2782	}
	2783
	2784	/* setups the montgomery reduction stuff */
	2785	int
	2786	mp_montgomery_setup (const mp_int * n, mp_digit * rho)
	2787	{
	2788	mp_digit x, b;
	2789
	2790	/* fast inversion mod 2**k
	2791	*
	2792	* Based on the fact that
	2793	*
	2794	* XA = 1 (mod 2n) => (X(2-XA)) A = 1 (mod 22n)
	2795	* => 2XA - XXA*A = 1
	2796	* => 2*(1) - (1) = 1
	2797	*/
	2798	b = n->dp[0];
	2799
	2800	if ((b & 1) == 0) {
	2801	return MP_VAL;
	2802	}
	2803
	2804	x = (((b + 2) & 4) << 1) + b; /* here xa==1 mod 24 /
	2805	x = 2 - b x; /* here xa==1 mod 28 /
	2806	x = 2 - b x; /* here xa==1 mod 216 /
	2807	x = 2 - b x; /* here xa==1 mod 232 /
	2808
	2809	/* rho = -1/m mod b */
	2810	*rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
	2811
	2812	return MP_OKAY;
	2813	}
	2814
	2815	/* high level multiplication (handles sign) */
	2816	int mp_mul (const mp_int * a, const mp_int * b, mp_int * c)
	2817	{
	2818	int res, neg;
	2819	neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
	2820
	2821	/* use Karatsuba? */
	2822	if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
	2823	res = mp_karatsuba_mul (a, b, c);
	2824	} else
	2825	{
	2826	/* can we use the fast multiplier?
	2827	*
	2828	* The fast multiplier can be used if the output will
	2829	* have less than MP_WARRAY digits and the number of
	2830	* digits won't affect carry propagation
	2831	*/
	2832	int digs = a->used + b->used + 1;
	2833
	2834	if ((digs < MP_WARRAY) &&
	2835	MIN(a->used, b->used) <=
	2836	(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
	2837	res = fast_s_mp_mul_digs (a, b, c, digs);
	2838	} else
	2839	res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
	2840	}
	2841	c->sign = (c->used > 0) ? neg : MP_ZPOS;
	2842	return res;
	2843	}
	2844
	2845	/* b = a2 /
	2846	int mp_mul_2(const mp_int * a, mp_int * b)
	2847	{
	2848	int x, res, oldused;
	2849
	2850	/* grow to accommodate result */
	2851	if (b->alloc < a->used + 1) {
	2852	if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
	2853	return res;
	2854	}
	2855	}
	2856
	2857	oldused = b->used;
	2858	b->used = a->used;
	2859
	2860	{
	2861	register mp_digit r, rr, tmpa, tmpb;
	2862
	2863	/* alias for source */
	2864	tmpa = a->dp;
	2865
	2866	/* alias for dest */
	2867	tmpb = b->dp;
	2868
	2869	/* carry */
	2870	r = 0;
	2871	for (x = 0; x < a->used; x++) {
	2872
	2873	/* get what will be the next carry bit from the
	2874	* MSB of the current digit
	2875	*/
	2876	rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
	2877
	2878	/* now shift up this digit, add in the carry [from the previous] */
	2879	tmpb++ = ((tmpa++ << ((mp_digit)1)) \| r) & MP_MASK;
	2880
	2881	/* copy the carry that would be from the source
	2882	* digit into the next iteration
	2883	*/
	2884	r = rr;
	2885	}
	2886
	2887	/* new leading digit? */
	2888	if (r != 0) {
	2889	/* add a MSB which is always 1 at this point */
	2890	*tmpb = 1;
	2891	++(b->used);
	2892	}
	2893
	2894	/* now zero any excess digits on the destination
	2895	* that we didn't write to
	2896	*/
	2897	tmpb = b->dp + b->used;
	2898	for (x = b->used; x < oldused; x++) {
	2899	*tmpb++ = 0;
	2900	}
	2901	}
	2902	b->sign = a->sign;
	2903	return MP_OKAY;
	2904	}
	2905
	2906	/* shift left by a certain bit count */
	2907	int mp_mul_2d (const mp_int * a, int b, mp_int * c)
	2908	{
	2909	mp_digit d;
	2910	int res;
	2911
	2912	/* copy */
	2913	if (a != c) {
	2914	if ((res = mp_copy (a, c)) != MP_OKAY) {
	2915	return res;
	2916	}
	2917	}
	2918
	2919	if (c->alloc < c->used + b/DIGIT_BIT + 1) {
	2920	if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
	2921	return res;
	2922	}
	2923	}
	2924
	2925	/* shift by as many digits in the bit count */
	2926	if (b >= DIGIT_BIT) {
	2927	if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
	2928	return res;
	2929	}
	2930	}
	2931
	2932	/* shift any bit count < DIGIT_BIT */
	2933	d = (mp_digit) (b % DIGIT_BIT);
	2934	if (d != 0) {
	2935	register mp_digit *tmpc, shift, mask, r, rr;
	2936	register int x;
	2937
	2938	/* bitmask for carries */
	2939	mask = (((mp_digit)1) << d) - 1;
	2940
	2941	/* shift for msbs */
	2942	shift = DIGIT_BIT - d;
	2943
	2944	/* alias */
	2945	tmpc = c->dp;
	2946
	2947	/* carry */
	2948	r = 0;
	2949	for (x = 0; x < c->used; x++) {
	2950	/* get the higher bits of the current word */
	2951	rr = (*tmpc >> shift) & mask;
	2952
	2953	/* shift the current word and OR in the carry */
	2954	tmpc = ((tmpc << d) \| r) & MP_MASK;
	2955	++tmpc;
	2956
	2957	/* set the carry to the carry bits of the current word */
	2958	r = rr;
	2959	}
	2960
	2961	/* set final carry */
	2962	if (r != 0) {
	2963	c->dp[(c->used)++] = r;
	2964	}
	2965	}
	2966	mp_clamp (c);
	2967	return MP_OKAY;
	2968	}
	2969
	2970	/* multiply by a digit */
	2971	int
	2972	mp_mul_d (const mp_int * a, mp_digit b, mp_int * c)
	2973	{
	2974	mp_digit u, tmpa, tmpc;
	2975	mp_word r;
	2976	int ix, res, olduse;
	2977
	2978	/* make sure c is big enough to hold ab /
	2979	if (c->alloc < a->used + 1) {
	2980	if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
	2981	return res;
	2982	}
	2983	}
	2984
	2985	/* get the original destinations used count */
	2986	olduse = c->used;
	2987
	2988	/* set the sign */
	2989	c->sign = a->sign;
	2990
	2991	/* alias for a->dp [source] */
	2992	tmpa = a->dp;
	2993
	2994	/* alias for c->dp [dest] */
	2995	tmpc = c->dp;
	2996
	2997	/* zero carry */
	2998	u = 0;
	2999
	3000	/* compute columns */
	3001	for (ix = 0; ix < a->used; ix++) {
	3002	/* compute product and carry sum for this term */
	3003	r = ((mp_word) u) + ((mp_word)tmpa++) ((mp_word)b);
	3004
	3005	/* mask off higher bits to get a single digit */
	3006	*tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
	3007
	3008	/* send carry into next iteration */
	3009	u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
	3010	}
	3011
	3012	/* store final carry [if any] */
	3013	*tmpc++ = u;
	3014
	3015	/* now zero digits above the top */
	3016	while (ix++ < olduse) {
	3017	*tmpc++ = 0;
	3018	}
	3019
	3020	/* set used count */
	3021	c->used = a->used + 1;
	3022	mp_clamp(c);
	3023
	3024	return MP_OKAY;
	3025	}
	3026
	3027	/* d = a * b (mod c) */
	3028	int
	3029	mp_mulmod (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
	3030	{
	3031	int res;
	3032	mp_int t;
	3033
	3034	if ((res = mp_init (&t)) != MP_OKAY) {
	3035	return res;
	3036	}
	3037
	3038	if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
	3039	mp_clear (&t);
	3040	return res;
	3041	}
	3042	res = mp_mod (&t, c, d);
	3043	mp_clear (&t);
	3044	return res;
	3045	}
	3046
	3047	/* table of first PRIME_SIZE primes
	 *
	 * The initializer below lists 256 values in ascending order, from
	 * 0x0002 up to 0x0653 (1619 decimal), eight per row.  The table is
	 * indexed from 0 to PRIME_SIZE - 1 by the trial division and primality
	 * routines that follow it.
	 * NOTE(review): PRIME_SIZE is defined outside this file section; it is
	 * presumably 256 to match the number of entries - confirm.
	 */
	3048	static const mp_digit __prime_tab[] = {
	3049	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
	3050	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
	3051	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
	3052	0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
	3053	0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
	3054	0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
	3055	0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
	3056	0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
	3057
	3058	0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
	3059	0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
	3060	0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
	3061	0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
	3062	0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
	3063	0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
	3064	0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
	3065	0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
	3066
	3067	0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
	3068	0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
	3069	0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
	3070	0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
	3071	0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
	3072	0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
	3073	0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
	3074	0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
	3075
	3076	0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
	3077	0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
	3078	0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
	3079	0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
	3080	0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
	3081	0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
	3082	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
	3083	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
	3084	};
	3085
	3086	/* determines if an integers is divisible by one
	3087	* of the first PRIME_SIZE primes or not
	3088	*
	3089	* sets result to 0 if not, 1 if yes
	3090	*/
	3091	int mp_prime_is_divisible (const mp_int * a, int *result)
	3092	{
	3093	int err, ix;
	3094	mp_digit res;
	3095
	3096	/* default to not */
	3097	*result = MP_NO;
	3098
	3099	for (ix = 0; ix < PRIME_SIZE; ix++) {
	3100	/* what is a mod __prime_tab[ix] */
	3101	if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
	3102	return err;
	3103	}
	3104
	3105	/* is the residue zero? */
	3106	if (res == 0) {
	3107	*result = MP_YES;
	3108	return MP_OKAY;
	3109	}
	3110	}
	3111
	3112	return MP_OKAY;
	3113	}
	3114
	3115	/* performs a variable number of rounds of Miller-Rabin
	3116	*
	3117	* Probability of error after t rounds is no more than
	3118
	3119	*
	3120	* Sets result to 1 if probably prime, 0 otherwise
	3121	*/
	3122	int mp_prime_is_prime (mp_int * a, int t, int *result)
	3123	{
	3124	mp_int b;
	3125	int ix, err, res;
	3126
	3127	/* default to no */
	3128	*result = MP_NO;
	3129
	3130	/* valid value of t? */
	3131	if (t <= 0 \|\| t > PRIME_SIZE) {
	3132	return MP_VAL;
	3133	}
	3134
	3135	/* is the input equal to one of the primes in the table? */
	3136	for (ix = 0; ix < PRIME_SIZE; ix++) {
	3137	if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
	3138	*result = 1;
	3139	return MP_OKAY;
	3140	}
	3141	}
	3142
	3143	/* first perform trial division */
	3144	if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
	3145	return err;
	3146	}
	3147
	3148	/* return if it was trivially divisible */
	3149	if (res == MP_YES) {
	3150	return MP_OKAY;
	3151	}
	3152
	3153	/* now perform the miller-rabin rounds */
	3154	if ((err = mp_init (&b)) != MP_OKAY) {
	3155	return err;
	3156	}
	3157
	3158	for (ix = 0; ix < t; ix++) {
	3159	/* set the prime */
	3160	mp_set (&b, __prime_tab[ix]);
	3161
	3162	if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
	3163	goto __B;
	3164	}
	3165
	3166	if (res == MP_NO) {
	3167	goto __B;
	3168	}
	3169	}
	3170
	3171	/* passed the test */
	3172	*result = MP_YES;
	3173	__B:mp_clear (&b);
	3174	return err;
	3175	}
	3176
	3177	/* Miller-Rabin test of "a" to the base of "b" as described in
	3178	* HAC pp. 139 Algorithm 4.24
	3179	*
	3180	* Sets result to 0 if definitely composite or 1 if probably prime.
	3181	* Randomly the chance of error is no more than 1/4 and often
	3182	* very much lower.
	3183	*/
	3184	int mp_prime_miller_rabin (mp_int * a, const mp_int * b, int *result)
	3185	{
	3186	mp_int n1, y, r;
	3187	int s, j, err;
	3188
	3189	/* default */
	3190	*result = MP_NO;
	3191
	3192	/* ensure b > 1 */
	3193	if (mp_cmp_d(b, 1) != MP_GT) {
	3194	return MP_VAL;
	3195	}
	3196
	3197	/* get n1 = a - 1 */
	3198	if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
	3199	return err;
	3200	}
	3201	if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
	3202	goto __N1;
	3203	}
	3204
	3205	/* set 2*s r = n1 */
	3206	if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
	3207	goto __N1;
	3208	}
	3209
	3210	/* count the number of least significant bits
	3211	* which are zero
	3212	*/
	3213	s = mp_cnt_lsb(&r);
	3214
	3215	/* now divide n - 1 by 2*s /
	3216	if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
	3217	goto __R;
	3218	}
	3219
	3220	/* compute y = b*r mod a /
	3221	if ((err = mp_init (&y)) != MP_OKAY) {
	3222	goto __R;
	3223	}
	3224	if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
	3225	goto __Y;
	3226	}
	3227
	3228	/* if y != 1 and y != n1 do */
	3229	if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
	3230	j = 1;
	3231	/* while j <= s-1 and y != n1 */
	3232	while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
	3233	if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
	3234	goto __Y;
	3235	}
	3236
	3237	/* if y == 1 then composite */
	3238	if (mp_cmp_d (&y, 1) == MP_EQ) {
	3239	goto __Y;
	3240	}
	3241
	3242	++j;
	3243	}
	3244
	3245	/* if y != n1 then composite */
	3246	if (mp_cmp (&y, &n1) != MP_EQ) {
	3247	goto __Y;
	3248	}
	3249	}
	3250
	3251	/* probably prime now */
	3252	*result = MP_YES;
	3253	__Y:mp_clear (&y);
	3254	__R:mp_clear (&r);
	3255	__N1:mp_clear (&n1);
	3256	return err;
	3257	}
	3258
	3259	static const struct {
	3260	int k, t;
	3261	} sizes[] = {
	3262	{ 128, 28 },
	3263	{ 256, 16 },
	3264	{ 384, 10 },
	3265	{ 512, 7 },
	3266	{ 640, 6 },
	3267	{ 768, 5 },
	3268	{ 896, 4 },
	3269	{ 1024, 4 }
	3270	};
	3271
	3272	/* returns # of RM trials required for a given bit size */
	3273	int mp_prime_rabin_miller_trials(int size)
	3274	{
	3275	int x;
	3276
	3277	for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
	3278	if (sizes[x].k == size) {
	3279	return sizes[x].t;
	3280	} else if (sizes[x].k > size) {
	3281	return (x == 0) ? sizes[0].t : sizes[x - 1].t;
	3282	}
	3283	}
	3284	return sizes[x-1].t + 1;
	3285	}
	3286
	3287	/* makes a truly random prime of a given size (bits),
	3288	*
	3289	* Flags are as follows:
	3290	*
	3291	* LTM_PRIME_BBS - make prime congruent to 3 mod 4
	3292	* LTM_PRIME_SAFE - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
	3293	* LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
	3294	* LTM_PRIME_2MSB_ON - make the 2nd highest bit one
	3295	*
	3296	* You have to supply a callback which fills in a buffer with random bytes. "dat" is a parameter you can
	3297	* have passed to the callback (e.g. a state or something). This function doesn't use "dat" itself
	3298	* so it can be NULL
	3299	*
	3300	*/
	3301
	3302	/* This is possibly the mother of all prime generation functions, muahahahahaha! */
	3303	int mp_prime_random_ex(mp_int a, int t, int size, int flags, ltm_prime_callback cb, void dat)
	3304	{
	3305	unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
	3306	int res, err, bsize, maskOR_msb_offset;
	3307
	3308	/* sanity check the input */
	3309	if (size <= 1 \|\| t <= 0) {
	3310	return MP_VAL;
	3311	}
	3312
	3313	/* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
	3314	if (flags & LTM_PRIME_SAFE) {
	3315	flags \|= LTM_PRIME_BBS;
	3316	}
	3317
	3318	/* calc the byte size */
	3319	bsize = (size>>3)+((size&7)?1:0);
	3320
	3321	/* we need a buffer of bsize bytes */
	3322	tmp = malloc(bsize);
	3323	if (tmp == NULL) {
	3324	return MP_MEM;
	3325	}
	3326
	3327	/* calc the maskAND value for the MSbyte*/
	3328	maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
	3329
	3330	/* calc the maskOR_msb */
	3331	maskOR_msb = 0;
	3332	maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
	3333	if (flags & LTM_PRIME_2MSB_ON) {
	3334	maskOR_msb \|= 1 << ((size - 2) & 7);
	3335	} else if (flags & LTM_PRIME_2MSB_OFF) {
	3336	maskAND &= ~(1 << ((size - 2) & 7));
	3337	}
	3338
	3339	/* get the maskOR_lsb */
	3340	maskOR_lsb = 0;
	3341	if (flags & LTM_PRIME_BBS) {
	3342	maskOR_lsb \|= 3;
	3343	}
	3344
	3345	do {
	3346	/* read the bytes */
	3347	if (cb(tmp, bsize, dat) != bsize) {
	3348	err = MP_VAL;
	3349	goto error;
	3350	}
	3351
	3352	/* work over the MSbyte */
	3353	tmp[0] &= maskAND;
	3354	tmp[0] \|= 1 << ((size - 1) & 7);
	3355
	3356	/* mix in the maskORs */
	3357	tmp[maskOR_msb_offset] \|= maskOR_msb;
	3358	tmp[bsize-1] \|= maskOR_lsb;
	3359
	3360	/* read it in */
	3361	if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY) { goto error; }
	3362
	3363	/* is it prime? */
	3364	if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
	3365	if (res == MP_NO) {
	3366	continue;
	3367	}
	3368
	3369	if (flags & LTM_PRIME_SAFE) {
	3370	/* see if (a-1)/2 is prime */
	3371	if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) { goto error; }
	3372	if ((err = mp_div_2(a, a)) != MP_OKAY) { goto error; }
	3373
	3374	/* is it prime? */
	3375	if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
	3376	}
	3377	} while (res == MP_NO);
	3378
	3379	if (flags & LTM_PRIME_SAFE) {
	3380	/* restore a to the original value */
	3381	if ((err = mp_mul_2(a, a)) != MP_OKAY) { goto error; }
	3382	if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { goto error; }
	3383	}
	3384
	3385	err = MP_OKAY;
	3386	error:
	3387	free(tmp);
	3388	return err;
	3389	}
	3390
	3391	/* reads an unsigned char array, assumes the msb is stored first [big endian] */
	3392	int
	3393	mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
	3394	{
	3395	int res;
	3396
	3397	/* make sure there are at least two digits */
	3398	if (a->alloc < 2) {
	3399	if ((res = mp_grow(a, 2)) != MP_OKAY) {
	3400	return res;
	3401	}
	3402	}
	3403
	3404	/* zero the int */
	3405	mp_zero (a);
	3406
	3407	/* read the bytes in */
	3408	while (c-- > 0) {
	3409	if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
	3410	return res;
	3411	}
	3412
	3413	a->dp[0] \|= *b++;
	3414	a->used += 1;
	3415	}
	3416	mp_clamp (a);
	3417	return MP_OKAY;
	3418	}
	3419
	3420	/* reduces x mod m, assumes 0 < x < m**2, mu is
	3421	* precomputed via mp_reduce_setup.
	3422	* From HAC pp.604 Algorithm 14.42
	3423	*/
	3424	int
	3425	mp_reduce (mp_int * x, const mp_int * m, const mp_int * mu)
	3426	{
	3427	mp_int q;
	3428	int res, um = m->used;
	3429
	3430	/* q = x */
	3431	if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
	3432	return res;
	3433	}
	3434
	3435	/* q1 = x / b*(k-1) /
	3436	mp_rshd (&q, um - 1);
	3437
	3438	/* according to HAC this optimization is ok */
	3439	if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
	3440	if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
	3441	goto CLEANUP;
	3442	}
	3443	} else {
	3444	if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
	3445	goto CLEANUP;
	3446	}
	3447	}
	3448
	3449	/* q3 = q2 / b*(k+1) /
	3450	mp_rshd (&q, um + 1);
	3451
	3452	/* x = x mod b*(k+1), quick (no division) /
	3453	if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
	3454	goto CLEANUP;
	3455	}
	3456
	3457	/* q = q * m mod b*(k+1), quick (no division) /
	3458	if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
	3459	goto CLEANUP;
	3460	}
	3461
	3462	/* x = x - q */
	3463	if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
	3464	goto CLEANUP;
	3465	}
	3466
	3467	/* If x < 0, add b*(k+1) to it /
	3468	if (mp_cmp_d (x, 0) == MP_LT) {
	3469	mp_set (&q, 1);
	3470	if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
	3471	goto CLEANUP;
	3472	if ((res = mp_add (x, &q, x)) != MP_OKAY)
	3473	goto CLEANUP;
	3474	}
	3475
	3476	/* Back off if it's too big */
	3477	while (mp_cmp (x, m) != MP_LT) {
	3478	if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
	3479	goto CLEANUP;
	3480	}
	3481	}
	3482
	3483	CLEANUP:
	3484	mp_clear (&q);
	3485
	3486	return res;
	3487	}
	3488
	3489	/* reduces a modulo n where n is of the form 2*p - d /
	3490	int
	3491	mp_reduce_2k(mp_int a, const mp_int n, mp_digit d)
	3492	{
	3493	mp_int q;
	3494	int p, res;
	3495
	3496	if ((res = mp_init(&q)) != MP_OKAY) {
	3497	return res;
	3498	}
	3499
	3500	p = mp_count_bits(n);
	3501	top:
	3502	/* q = a/2p, a = a mod 2p */
	3503	if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
	3504	goto ERR;
	3505	}
	3506
	3507	if (d != 1) {
	3508	/* q = q * d */
	3509	if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
	3510	goto ERR;
	3511	}
	3512	}
	3513
	3514	/* a = a + q */
	3515	if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
	3516	goto ERR;
	3517	}
	3518
	3519	if (mp_cmp_mag(a, n) != MP_LT) {
	3520	s_mp_sub(a, n, a);
	3521	goto top;
	3522	}
	3523
	3524	ERR:
	3525	mp_clear(&q);
	3526	return res;
	3527	}
	3528
	3529	/* determines the setup value */
	3530	int
	3531	mp_reduce_2k_setup(const mp_int a, mp_digit d)
	3532	{
	3533	int res, p;
	3534	mp_int tmp;
	3535
	3536	if ((res = mp_init(&tmp)) != MP_OKAY) {
	3537	return res;
	3538	}
	3539
	3540	p = mp_count_bits(a);
	3541	if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
	3542	mp_clear(&tmp);
	3543	return res;
	3544	}
	3545
	3546	if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
	3547	mp_clear(&tmp);
	3548	return res;
	3549	}
	3550
	3551	*d = tmp.dp[0];
	3552	mp_clear(&tmp);
	3553	return MP_OKAY;
	3554	}
	3555
	3556	/* pre-calculate the value required for Barrett reduction
	3557	* For a given modulus "b" it calulates the value required in "a"
	3558	*/
	3559	int mp_reduce_setup (mp_int * a, const mp_int * b)
	3560	{
	3561	int res;
	3562
	3563	if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
	3564	return res;
	3565	}
	3566	return mp_div (a, b, a, NULL);
	3567	}
	3568
	3569	/* shift right a certain amount of digits */
	3570	void mp_rshd (mp_int * a, int b)
	3571	{
	3572	int x;
	3573
	3574	/* if b <= 0 then ignore it */
	3575	if (b <= 0) {
	3576	return;
	3577	}
	3578
	3579	/* if b > used then simply zero it and return */
	3580	if (a->used <= b) {
	3581	mp_zero (a);
	3582	return;
	3583	}
	3584
	3585	{
	3586	register mp_digit bottom, top;
	3587
	3588	/* shift the digits down */
	3589
	3590	/* bottom */
	3591	bottom = a->dp;
	3592
	3593	/* top [offset into digits] */
	3594	top = a->dp + b;
	3595
	3596	/* this is implemented as a sliding window where
	3597	* the window is b-digits long and digits from
	3598	* the top of the window are copied to the bottom
	3599	*
	3600	* e.g.
	3601
	3602	b-2 \| b-1 \| b0 \| b1 \| b2 \| ... \| bb \| ---->
	3603	/\ \| ---->
	3604	\-------------------/ ---->
	3605	*/
	3606	for (x = 0; x < (a->used - b); x++) {
	3607	bottom++ = top++;
	3608	}
	3609
	3610	/* zero the top digits */
	3611	for (; x < a->used; x++) {
	3612	*bottom++ = 0;
	3613	}
	3614	}
	3615
	3616	/* remove excess digits */
	3617	a->used -= b;
	3618	}
	3619
	3620	/* set to a digit */
	3621	void mp_set (mp_int * a, mp_digit b)
	3622	{
	3623	mp_zero (a);
	3624	a->dp[0] = b & MP_MASK;
	3625	a->used = (a->dp[0] != 0) ? 1 : 0;
	3626	}
	3627
	3628	/* set a 32-bit const */
	3629	int mp_set_int (mp_int * a, unsigned long b)
	3630	{
	3631	int x, res;
	3632
	3633	mp_zero (a);
	3634
	3635	/* set four bits at a time */
	3636	for (x = 0; x < 8; x++) {
	3637	/* shift the number up four bits */
	3638	if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
	3639	return res;
	3640	}
	3641
	3642	/* OR in the top four bits of the source */
	3643	a->dp[0] \|= (b >> 28) & 15;
	3644
	3645	/* shift the source up to the next four bits */
	3646	b <<= 4;
	3647
	3648	/* ensure that digits are not clamped off */
	3649	a->used += 1;
	3650	}
	3651	mp_clamp (a);
	3652	return MP_OKAY;
	3653	}
	3654
	3655	/* shrink a bignum */
	3656	int mp_shrink (mp_int * a)
	3657	{
	3658	mp_digit *tmp;
	3659	if (a->alloc != a->used && a->used > 0) {
	3660	if ((tmp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
	3661	return MP_MEM;
	3662	}
	3663	a->dp = tmp;
	3664	a->alloc = a->used;
	3665	}
	3666	return MP_OKAY;
	3667	}
	3668
	3669	/* get the size for an signed equivalent */
	3670	int mp_signed_bin_size (const mp_int * a)
	3671	{
	3672	return 1 + mp_unsigned_bin_size (a);
	3673	}
	3674
	3675	/* computes b = aa /
	3676	int
	3677	mp_sqr (const mp_int * a, mp_int * b)
	3678	{
	3679	int res;
	3680
	3681	if (a->used >= KARATSUBA_SQR_CUTOFF) {
	3682	res = mp_karatsuba_sqr (a, b);
	3683	} else
	3684	{
	3685	/* can we use the fast comba multiplier? */
	3686	if ((a->used * 2 + 1) < MP_WARRAY &&
	3687	a->used <
	3688	(1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
	3689	res = fast_s_mp_sqr (a, b);
	3690	} else
	3691	res = s_mp_sqr (a, b);
	3692	}
	3693	b->sign = MP_ZPOS;
	3694	return res;
	3695	}
	3696
	3697	/* c = a * a (mod b) */
	3698	int
	3699	mp_sqrmod (const mp_int * a, mp_int * b, mp_int * c)
	3700	{
	3701	int res;
	3702	mp_int t;
	3703
	3704	if ((res = mp_init (&t)) != MP_OKAY) {
	3705	return res;
	3706	}
	3707
	3708	if ((res = mp_sqr (a, &t)) != MP_OKAY) {
	3709	mp_clear (&t);
	3710	return res;
	3711	}
	3712	res = mp_mod (&t, b, c);
	3713	mp_clear (&t);
	3714	return res;
	3715	}
	3716
	3717	/* high level subtraction (handles signs) */
	3718	int
	3719	mp_sub (mp_int * a, mp_int * b, mp_int * c)
	3720	{
	3721	int sa, sb, res;
	3722
	3723	sa = a->sign;
	3724	sb = b->sign;
	3725
	3726	if (sa != sb) {
	3727	/* subtract a negative from a positive, OR */
	3728	/* subtract a positive from a negative. */
	3729	/* In either case, ADD their magnitudes, */
	3730	/* and use the sign of the first number. */
	3731	c->sign = sa;
	3732	res = s_mp_add (a, b, c);
	3733	} else {
	3734	/* subtract a positive from a positive, OR */
	3735	/* subtract a negative from a negative. */
	3736	/* First, take the difference between their */
	3737	/* magnitudes, then... */
	3738	if (mp_cmp_mag (a, b) != MP_LT) {
	3739	/* Copy the sign from the first */
	3740	c->sign = sa;
	3741	/* The first has a larger or equal magnitude */
	3742	res = s_mp_sub (a, b, c);
	3743	} else {
	3744	/* The result has the opposite sign from */
	3745	/* the first number. */
	3746	c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
	3747	/* The second has a larger magnitude */
	3748	res = s_mp_sub (b, a, c);
	3749	}
	3750	}
	3751	return res;
	3752	}
	3753
	3754	/* single digit subtraction */
	3755	int
	3756	mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
	3757	{
	3758	mp_digit tmpa, tmpc, mu;
	3759	int res, ix, oldused;
	3760
	3761	/* grow c as required */
	3762	if (c->alloc < a->used + 1) {
	3763	if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
	3764	return res;
	3765	}
	3766	}
	3767
	3768	/* if a is negative just do an unsigned
	3769	* addition [with fudged signs]
	3770	*/
	3771	if (a->sign == MP_NEG) {
	3772	a->sign = MP_ZPOS;
	3773	res = mp_add_d(a, b, c);
	3774	a->sign = c->sign = MP_NEG;
	3775	return res;
	3776	}
	3777
	3778	/* setup regs */
	3779	oldused = c->used;
	3780	tmpa = a->dp;
	3781	tmpc = c->dp;
	3782
	3783	/* if a <= b simply fix the single digit */
	3784	if ((a->used == 1 && a->dp[0] <= b) \|\| a->used == 0) {
	3785	if (a->used == 1) {
	3786	tmpc++ = b - tmpa;
	3787	} else {
	3788	*tmpc++ = b;
	3789	}
	3790	ix = 1;
	3791
	3792	/* negative/1digit */
	3793	c->sign = MP_NEG;
	3794	c->used = 1;
	3795	} else {
	3796	/* positive/size */
	3797	c->sign = MP_ZPOS;
	3798	c->used = a->used;
	3799
	3800	/* subtract first digit */
	3801	tmpc = tmpa++ - b;
	3802	mu = tmpc >> (sizeof(mp_digit) CHAR_BIT - 1);
	3803	*tmpc++ &= MP_MASK;
	3804
	3805	/* handle rest of the digits */
	3806	for (ix = 1; ix < a->used; ix++) {
	3807	tmpc = tmpa++ - mu;
	3808	mu = tmpc >> (sizeof(mp_digit) CHAR_BIT - 1);
	3809	*tmpc++ &= MP_MASK;
	3810	}
	3811	}
	3812
	3813	/* zero excess digits */
	3814	while (ix++ < oldused) {
	3815	*tmpc++ = 0;
	3816	}
	3817	mp_clamp(c);
	3818	return MP_OKAY;
	3819	}
	3820
	3821	/* store in unsigned [big endian] format */
	3822	int
	3823	mp_to_unsigned_bin (const mp_int * a, unsigned char *b)
	3824	{
	3825	int x, res;
	3826	mp_int t;
	3827
	3828	if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
	3829	return res;
	3830	}
	3831
	3832	x = 0;
	3833	while (mp_iszero (&t) == 0) {
	3834	b[x++] = (unsigned char) (t.dp[0] & 255);
	3835	if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
	3836	mp_clear (&t);
	3837	return res;
	3838	}
	3839	}
	3840	bn_reverse (b, x);
	3841	mp_clear (&t);
	3842	return MP_OKAY;
	3843	}
	3844
	3845	/* get the size for an unsigned equivalent */
	3846	int
	3847	mp_unsigned_bin_size (const mp_int * a)
	3848	{
	3849	int size = mp_count_bits (a);
	3850	return (size / 8 + ((size & 7) != 0 ? 1 : 0));
	3851	}
	3852
	3853	/* set to zero */
	3854	void
	3855	mp_zero (mp_int * a)
	3856	{
	3857	a->sign = MP_ZPOS;
	3858	a->used = 0;
	3859	memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
	3860	}
	3861
	/* reverse an array, used for radix code
	 *
	 * Reverses the first len bytes of s in place by swapping from both ends
	 * towards the middle.  A len of 0 or 1 leaves s untouched.
	 */
	static void
	bn_reverse (unsigned char *s, int len)
	{
	  int left, right;

	  for (left = 0, right = len - 1; left < right; left++, right--) {
	    unsigned char swap = s[left];

	    s[left] = s[right];
	    s[right] = swap;
	  }
	}
	3879
	3880	/* low level addition, based on HAC pp.594, Algorithm 14.7 */
	3881	static int
	3882	s_mp_add (mp_int * a, mp_int * b, mp_int * c)
	3883	{
	3884	mp_int *x;
	3885	int olduse, res, min, max;
	3886
	3887	/* find sizes, we let \|a\| <= \|b\| which means we have to sort
	3888	* them. "x" will point to the input with the most digits
	3889	*/
	3890	if (a->used > b->used) {
	3891	min = b->used;
	3892	max = a->used;
	3893	x = a;
	3894	} else {
	3895	min = a->used;
	3896	max = b->used;
	3897	x = b;
	3898	}
	3899
	3900	/* init result */
	3901	if (c->alloc < max + 1) {
	3902	if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
	3903	return res;
	3904	}
	3905	}
	3906
	3907	/* get old used digit count and set new one */
	3908	olduse = c->used;
	3909	c->used = max + 1;
	3910
	3911	{
	3912	register mp_digit u, tmpa, tmpb, *tmpc;
	3913	register int i;
	3914
	3915	/* alias for digit pointers */
	3916
	3917	/* first input */
	3918	tmpa = a->dp;
	3919
	3920	/* second input */
	3921	tmpb = b->dp;
	3922
	3923	/* destination */
	3924	tmpc = c->dp;
	3925
	3926	/* zero the carry */
	3927	u = 0;
	3928	for (i = 0; i < min; i++) {
	3929	/* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
	3930	tmpc = tmpa++ + *tmpb++ + u;
	3931
	3932	/* U = carry bit of T[i] */
	3933	u = *tmpc >> ((mp_digit)DIGIT_BIT);
	3934
	3935	/* take away carry bit from T[i] */
	3936	*tmpc++ &= MP_MASK;
	3937	}
	3938
	3939	/* now copy higher words if any, that is in A+B
	3940	* if A or B has more digits add those in
	3941	*/
	3942	if (min != max) {
	3943	for (; i < max; i++) {
	3944	/* T[i] = X[i] + U */
	3945	*tmpc = x->dp[i] + u;
	3946
	3947	/* U = carry bit of T[i] */
	3948	u = *tmpc >> ((mp_digit)DIGIT_BIT);
	3949
	3950	/* take away carry bit from T[i] */
	3951	*tmpc++ &= MP_MASK;
	3952	}
	3953	}
	3954
	3955	/* add carry */
	3956	*tmpc++ = u;
	3957
	3958	/* clear digits above oldused */
	3959	for (i = c->used; i < olduse; i++) {
	3960	*tmpc++ = 0;
	3961	}
	3962	}
	3963
	3964	mp_clamp (c);
	3965	return MP_OKAY;
	3966	}
	3967
	3968	static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
	3969	{
	3970	mp_int M[256], res, mu;
	3971	mp_digit buf;
	3972	int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
	3973
	3974	/* find window size */
	3975	x = mp_count_bits (X);
	3976	if (x <= 7) {
	3977	winsize = 2;
	3978	} else if (x <= 36) {
	3979	winsize = 3;
	3980	} else if (x <= 140) {
	3981	winsize = 4;
	3982	} else if (x <= 450) {
	3983	winsize = 5;
	3984	} else if (x <= 1303) {
	3985	winsize = 6;
	3986	} else if (x <= 3529) {
	3987	winsize = 7;
	3988	} else {
	3989	winsize = 8;
	3990	}
	3991
	3992	/* init M array */
	3993	/* init first cell */
	3994	if ((err = mp_init(&M[1])) != MP_OKAY) {
	3995	return err;
	3996	}
	3997
	3998	/* now init the second half of the array */
	3999	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
	4000	if ((err = mp_init(&M[x])) != MP_OKAY) {
	4001	for (y = 1<<(winsize-1); y < x; y++) {
	4002	mp_clear (&M[y]);
	4003	}
	4004	mp_clear(&M[1]);
	4005	return err;
	4006	}
	4007	}
	4008
	4009	/* create mu, used for Barrett reduction */
	4010	if ((err = mp_init (&mu)) != MP_OKAY) {
	4011	goto __M;
	4012	}
	4013	if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
	4014	goto __MU;
	4015	}
	4016
	4017	/* create M table
	4018	*
	4019	* The M table contains powers of the base,
	4020	* e.g. M[x] = G**x mod P
	4021	*
	4022	* The first half of the table is not
	4023	* computed though accept for M[0] and M[1]
	4024	*/
	4025	if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
	4026	goto __MU;
	4027	}
	4028
	4029	/* compute the value at M[1<<(winsize-1)] by squaring
	4030	* M[1] (winsize-1) times
	4031	*/
	4032	if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
	4033	goto __MU;
	4034	}
	4035
	4036	for (x = 0; x < (winsize - 1); x++) {
	4037	if ((err = mp_sqr (&M[1 << (winsize - 1)],
	4038	&M[1 << (winsize - 1)])) != MP_OKAY) {
	4039	goto __MU;
	4040	}
	4041	if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
	4042	goto __MU;
	4043	}
	4044	}
	4045
	4046	/* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
	4047	* for x = (2(winsize - 1) + 1) to (2winsize - 1)
	4048	*/
	4049	for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
	4050	if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
	4051	goto __MU;
	4052	}
	4053	if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
	4054	goto __MU;
	4055	}
	4056	}
	4057
	4058	/* setup result */
	4059	if ((err = mp_init (&res)) != MP_OKAY) {
	4060	goto __MU;
	4061	}
	4062	mp_set (&res, 1);
	4063
	4064	/* set initial mode and bit cnt */
	4065	mode = 0;
	4066	bitcnt = 1;
	4067	buf = 0;
	4068	digidx = X->used - 1;
	4069	bitcpy = 0;
	4070	bitbuf = 0;
	4071
	4072	for (;;) {
	4073	/* grab next digit as required */
	4074	if (--bitcnt == 0) {
	4075	/* if digidx == -1 we are out of digits */
	4076	if (digidx == -1) {
	4077	break;
	4078	}
	4079	/* read next digit and reset the bitcnt */
	4080	buf = X->dp[digidx--];
	4081	bitcnt = DIGIT_BIT;
	4082	}
	4083
	4084	/* grab the next msb from the exponent */
	4085	y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
	4086	buf <<= (mp_digit)1;
	4087
	4088	/* if the bit is zero and mode == 0 then we ignore it
	4089	* These represent the leading zero bits before the first 1 bit
	4090	* in the exponent. Technically this opt is not required but it
	4091	* does lower the # of trivial squaring/reductions used
	4092	*/
	4093	if (mode == 0 && y == 0) {
	4094	continue;
	4095	}
	4096
	4097	/* if the bit is zero and mode == 1 then we square */
	4098	if (mode == 1 && y == 0) {
	4099	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	4100	goto __RES;
	4101	}
	4102	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
	4103	goto __RES;
	4104	}
	4105	continue;
	4106	}
	4107
	4108	/* else we add it to the window */
	4109	bitbuf \|= (y << (winsize - ++bitcpy));
	4110	mode = 2;
	4111
	4112	if (bitcpy == winsize) {
	4113	/* ok window is filled so square as required and multiply */
	4114	/* square first */
	4115	for (x = 0; x < winsize; x++) {
	4116	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	4117	goto __RES;
	4118	}
	4119	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
	4120	goto __RES;
	4121	}
	4122	}
	4123
	4124	/* then multiply */
	4125	if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
	4126	goto __RES;
	4127	}
	4128	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
	4129	goto __RES;
	4130	}
	4131
	4132	/* empty window and reset */
	4133	bitcpy = 0;
	4134	bitbuf = 0;
	4135	mode = 1;
	4136	}
	4137	}
	4138
	4139	/* if bits remain then square/multiply */
	4140	if (mode == 2 && bitcpy > 0) {
	4141	/* square then multiply if the bit is set */
	4142	for (x = 0; x < bitcpy; x++) {
	4143	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
	4144	goto __RES;
	4145	}
	4146	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
	4147	goto __RES;
	4148	}
	4149
	4150	bitbuf <<= 1;
	4151	if ((bitbuf & (1 << winsize)) != 0) {
	4152	/* then multiply */
	4153	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
	4154	goto __RES;
	4155	}
	4156	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
	4157	goto __RES;
	4158	}
	4159	}
	4160	}
	4161	}
	4162
	4163	mp_exch (&res, Y);
	4164	err = MP_OKAY;
	4165	__RES:mp_clear (&res);
	4166	__MU:mp_clear (&mu);
	4167	__M:
	4168	mp_clear(&M[1]);
	4169	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
	4170	mp_clear (&M[x]);
	4171	}
	4172	return err;
	4173	}
	4174
	4175	/* multiplies \|a\| * \|b\| and only computes up to digs digits of result
	4176	* HAC pp. 595, Algorithm 14.12 Modified so you can control how
	4177	* many digits of output are created.
	4178	*/
	4179	static int
	4180	s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
	4181	{
	4182	mp_int t;
	4183	int res, pa, pb, ix, iy;
	4184	mp_digit u;
	4185	mp_word r;
	4186	mp_digit tmpx, tmpt, tmpy;
	4187
	4188	/* can we use the fast multiplier? */
	4189	if (((digs) < MP_WARRAY) &&
	4190	MIN (a->used, b->used) <
	4191	(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
	4192	return fast_s_mp_mul_digs (a, b, c, digs);
	4193	}
	4194
	4195	if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
	4196	return res;
	4197	}
	4198	t.used = digs;
	4199
	4200	/* compute the digits of the product directly */
	4201	pa = a->used;
	4202	for (ix = 0; ix < pa; ix++) {
	4203	/* set the carry to zero */
	4204	u = 0;
	4205
	4206	/* limit ourselves to making digs digits of output */
	4207	pb = MIN (b->used, digs - ix);
	4208
	4209	/* setup some aliases */
	4210	/* copy of the digit from a used within the nested loop */
	4211	tmpx = a->dp[ix];
	4212
	4213	/* an alias for the destination shifted ix places */
	4214	tmpt = t.dp + ix;
	4215
	4216	/* an alias for the digits of b */
	4217	tmpy = b->dp;
	4218
	4219	/* compute the columns of the output and propagate the carry */
	4220	for (iy = 0; iy < pb; iy++) {
	4221	/* compute the column as a mp_word */
	4222	r = ((mp_word)*tmpt) +
	4223	((mp_word)tmpx) * ((mp_word)*tmpy++) +
	4224	((mp_word) u);
	4225
	4226	/* the new column is the lower part of the result */
	4227	*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
	4228
	4229	/* get the carry word from the result */
	4230	u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
	4231	}
	4232	/* set carry if it is placed below digs */
	4233	if (ix + iy < digs) {
	4234	*tmpt = u;
	4235	}
	4236	}
	4237
	4238	mp_clamp (&t);
	4239	mp_exch (&t, c);
	4240
	4241	mp_clear (&t);
	4242	return MP_OKAY;
	4243	}
	4244
	4245	/* multiplies \|a\| * \|b\| and does not compute the lower digs digits
	4246	* [meant to get the higher part of the product]
	4247	*/
	4248	static int
	4249	s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
	4250	{
	4251	mp_int t;
	4252	int res, pa, pb, ix, iy;
	4253	mp_digit u;
	4254	mp_word r;
	4255	mp_digit tmpx, tmpt, tmpy;
	4256
	4257	/* can we use the fast multiplier? */
	4258	if (((a->used + b->used + 1) < MP_WARRAY)
	4259	&& MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
	4260	return fast_s_mp_mul_high_digs (a, b, c, digs);
	4261	}
	4262
	4263	if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
	4264	return res;
	4265	}
	4266	t.used = a->used + b->used + 1;
	4267
	4268	pa = a->used;
	4269	pb = b->used;
	4270	for (ix = 0; ix < pa; ix++) {
	4271	/* clear the carry */
	4272	u = 0;
	4273
	4274	/* left hand side of A[ix] * B[iy] */
	4275	tmpx = a->dp[ix];
	4276
	4277	/* alias to the address of where the digits will be stored */
	4278	tmpt = &(t.dp[digs]);
	4279
	4280	/* alias for where to read the right hand side from */
	4281	tmpy = b->dp + (digs - ix);
	4282
	4283	for (iy = digs - ix; iy < pb; iy++) {
	4284	/* calculate the double precision result */
	4285	r = ((mp_word)*tmpt) +
	4286	((mp_word)tmpx) * ((mp_word)*tmpy++) +
	4287	((mp_word) u);
	4288
	4289	/* get the lower part */
	4290	*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
	4291
	4292	/* carry the carry */
	4293	u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
	4294	}
	4295	*tmpt = u;
	4296	}
	4297	mp_clamp (&t);
	4298	mp_exch (&t, c);
	4299	mp_clear (&t);
	4300	return MP_OKAY;
	4301	}
	4302
	4303	/* low level squaring, b = aa, HAC pp.596-597, Algorithm 14.16 /
	4304	static int
	4305	s_mp_sqr (const mp_int * a, mp_int * b)
	4306	{
	4307	mp_int t;
	4308	int res, ix, iy, pa;
	4309	mp_word r;
	4310	mp_digit u, tmpx, *tmpt;
	4311
	4312	pa = a->used;
	4313	if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
	4314	return res;
	4315	}
	4316
	4317	/* default used is maximum possible size */
	4318	t.used = 2*pa + 1;
	4319
	4320	for (ix = 0; ix < pa; ix++) {
	4321	/* first calculate the digit at 2ix /
	4322	/* calculate double precision result */
	4323	r = ((mp_word) t.dp[2*ix]) +
	4324	((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
	4325
	4326	/* store lower part in result */
	4327	t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
	4328
	4329	/* get the carry */
	4330	u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
	4331
	4332	/* left hand side of A[ix] * A[iy] */
	4333	tmpx = a->dp[ix];
	4334
	4335	/* alias for where to store the results */
	4336	tmpt = t.dp + (2*ix + 1);
	4337
	4338	for (iy = ix + 1; iy < pa; iy++) {
	4339	/* first calculate the product */
	4340	r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
	4341
	4342	/* now calculate the double precision result, note we use
	4343	* addition instead of *2 since it's easier to optimize
	4344	*/
	4345	r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
	4346
	4347	/* store lower part */
	4348	*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
	4349
	4350	/* get carry */
	4351	u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
	4352	}
	4353	/* propagate upwards */
	4354	while (u != ((mp_digit) 0)) {
	4355	r = ((mp_word) *tmpt) + ((mp_word) u);
	4356	*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
	4357	u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
	4358	}
	4359	}
	4360
	4361	mp_clamp (&t);
	4362	mp_exch (&t, b);
	4363	mp_clear (&t);
	4364	return MP_OKAY;
	4365	}
	4366
	4367	/* low level subtraction (assumes \|a\| > \|b\|), HAC pp.595 Algorithm 14.9 */
	4368	int
	4369	s_mp_sub (const mp_int * a, const mp_int * b, mp_int * c)
	4370	{
	4371	int olduse, res, min, max;
	4372
	4373	/* find sizes */
	4374	min = b->used;
	4375	max = a->used;
	4376
	4377	/* init result */
	4378	if (c->alloc < max) {
	4379	if ((res = mp_grow (c, max)) != MP_OKAY) {
	4380	return res;
	4381	}
	4382	}
	4383	olduse = c->used;
	4384	c->used = max;
	4385
	4386	{
	4387	register mp_digit u, tmpa, tmpb, *tmpc;
	4388	register int i;
	4389
	4390	/* alias for digit pointers */
	4391	tmpa = a->dp;
	4392	tmpb = b->dp;
	4393	tmpc = c->dp;
	4394
	4395	/* set carry to zero */
	4396	u = 0;
	4397	for (i = 0; i < min; i++) {
	4398	/* T[i] = A[i] - B[i] - U */
	4399	tmpc = tmpa++ - *tmpb++ - u;
	4400
	4401	/* U = carry bit of T[i]
	4402	* Note this saves performing an AND operation since
	4403	* if a carry does occur it will propagate all the way to the
	4404	* MSB. As a result a single shift is enough to get the carry
	4405	*/
	4406	u = tmpc >> ((mp_digit)(CHAR_BIT sizeof (mp_digit) - 1));
	4407
	4408	/* Clear carry from T[i] */
	4409	*tmpc++ &= MP_MASK;
	4410	}
	4411
	4412	/* now copy higher words if any, e.g. if A has more digits than B */
	4413	for (; i < max; i++) {
	4414	/* T[i] = A[i] - U */
	4415	tmpc = tmpa++ - u;
	4416
	4417	/* U = carry bit of T[i] */
	4418	u = tmpc >> ((mp_digit)(CHAR_BIT sizeof (mp_digit) - 1));
	4419
	4420	/* Clear carry from T[i] */
	4421	*tmpc++ &= MP_MASK;
	4422	}
	4423
	4424	/* clear digits above used (since we may not have grown result above) */
	4425	for (i = c->used; i < olduse; i++) {
	4426	*tmpc++ = 0;
	4427	}
	4428	}
	4429
	4430	mp_clamp (c);
	4431	return MP_OKAY;
	4432	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/rsaenh/mpi.c

Download in other formats: