/* +------------------------------------------------------------------------+
   |                                                                        |
   |                           Entiers de longueur arbitraire               |
   |                                                                        |
   |                                       Carré                            |
   |                                                                        |
   +------------------------------------------------------------------------+ */

/* M. Quercia, 31/01/2001 */

#include "macros-s.h"

                     /* +-------------------------------+
                        |  b <- a^2, algorithme en n^2  |
                        +-------------------------------+ */

/* void xn(sqr_n2)(naturel a, longueur la, naturel b)
 *
 * b <- a^2 by the quadratic (schoolbook) method.
 *
 * a is read as la 32-bit digits, least significant digit at the lowest
 * address; 2*la digits are written to b (nothing is written when la = 0).
 *
 * Method, with B = 2^32:
 *
 *     a^2 = sum_i a[i]^2 * B^(2i)
 *         + sum_i a[i] * B^i * 2 * (a[i+1] B^(i+1) + ... + a[la-1] B^(la-1))
 *
 * The doubled operand 2*(a[1..la-1]) is first built in the top of b
 * (b[la+1 .. 2la-1]); the bit shifted out of its top digit is kept in a
 * saved copy of the flags.  Row i then multiplies a[i] by the digits of the
 * doubled operand above position i and accumulates into b.  Each row only
 * writes below the doubled digits it still has to read.  If the doubling
 * overflowed, the missing terms a[0..la-2] * B^la are added at the end.
 *
 * a, la and b are argument accessors and ENTER/RETURN/EXIT the
 * prologue/epilogue macros, all presumably provided by macros-s.h.
 * NOTE(review): .Lsn_sqr_n2_exit is not defined in this file; EXIT(sn_sqr_n2)
 * presumably emits it - confirm in macros-s.h.
 *
 * .Lsn_sqr_n2_aux_entry is a secondary entry point used by the other
 * routines of this file: they jump here with %ecx = la and their own,
 * identically laid out, argument frame.
 */
#ifdef have_sn_sqr_n2
ENTER(sn_sqr_n2)

/* local variables (created by the pushes below; the offsets assume that
   %esp = %ebp - 12 after ENTER - TODO confirm against macros-s.h) */
#undef l
#undef r
#undef m
#undef cr
#define l  -16(%ebp)
#define r  -20(%ebp)
#define m  -24(%ebp)
#define cr -28(%ebp)

        movl   la,%ecx
.Lsn_sqr_n2_aux_entry:
        movl   a,%esi
        movl   b,%edi
        jecxz  .Lsn_sqr_n2_exit0

	/* la = 1 ? then a single 32x32 -> 64 bit product is enough */
	decl   %ecx
	jnz    .Lsn_sqr_n2_big
	movl   (%esi),%eax
	mull   %eax
	movl   %eax,(%edi)
	movl   %edx,4(%edi)
.Lsn_sqr_n2_exit0:
	RETURN
.Lsn_sqr_n2_big:

	/* here la > 1 and ecx = la-1: compute 2*a[1..la-1] in the top of b */
	/* esi -> end of a, edi -> end of b, ecx = -(la-1) (negative index) */
	leal   4(%esi,%ecx,4),%esi
	leal   8(%edi,%ecx,8),%edi
	negl   %ecx
	pushl  %ecx                 /* l = -(la-1), outer loop counter */
	clc
	.align 4

	/* shift left by one bit, digit by digit; CF carries the bit from one
	   digit to the next (incl and jne leave CF untouched) */
.Lsn_sqr_n2_double:
	movl   (%esi,%ecx,4),%eax
	rcll   $1,%eax
	movl   %eax,(%edi,%ecx,4)
	incl   %ecx
	jne    .Lsn_sqr_n2_double
	pushf                       /* r = flags, CF = bit shifted out of 2a */
	leal   -8(%esp),%esp        /* room for m and cr */
	movl   l,%ecx

	/* multiply a by 2a */
	/* first digit: m = a[0], b[0] <- low(a[0]^2), ebx -> b[la] */
	leal   -4(%edi,%ecx,4),%ebx
	movl   -4(%esi,%ecx,4),%eax  /* multiplier */
	movl   %eax,m
	mull   %eax
	movl   %eax,-4(%ebx,%ecx,4)
	.align 4

	/* first row: b[1..la-1] <- m * 2a[1..la-1] + carry (plain store, b is
	   not yet initialised there); edx holds the running carry digit */
.Lsn_sqr_n2_loop0:
	movl   %edx,cr
	movl   (%edi,%ecx,4),%eax
	mull   m
	addl   cr,%eax
	adcl   $0,%edx
	movl   %eax,(%ebx,%ecx,4)
	incl   %ecx
	jne    .Lsn_sqr_n2_loop0
	jmp    .Lsn_sqr_n2_cont

	.align 4

	/* following digits: on entry ecx = l, m <- a[i], add a[i]^2 at b[2i] */
.Lsn_sqr_n2_loop1:
	movl   -4(%esi,%ecx,4),%eax  /* multiplier */
	movl   %eax,m
	mull   %eax
	addl   %eax,-4(%ebx,%ecx,4)
	adcl   $0,%edx
	movl   %edx,cr
	movl   (%edi,%ecx,4),%eax
	/* the low bit of this digit of 2a was shifted in from a[i], the
	   current multiplier: it does not belong to this row, drop it */
	andl   $-2,%eax
	mull   m
	addl   cr,%eax
	adcl   $0,%edx
	addl   %eax,(%ebx,%ecx,4)
	adcl   $0,%edx
	incl   %ecx
	jz     .Lsn_sqr_n2_cont

	/* rest of the row: b[..] += m * (digit of 2a) + carry */
.Lsn_sqr_n2_loop2:
	movl   %edx,cr
	movl   (%edi,%ecx,4),%eax
	mull   m
	addl   cr,%eax
	adcl   $0,%edx
	addl   %eax,(%ebx,%ecx,4)
	adcl   $0,%edx
	incl   %ecx
	jne    .Lsn_sqr_n2_loop2

	/* end of a row: store the final carry digit, move the output window
	   up by one digit and shorten the next row by one digit */
.Lsn_sqr_n2_cont:
	movl   %edx,(%ebx)
	leal   4(%ebx),%ebx
	movl   l,%ecx
	incl   %ecx
	movl   %ecx,l
	jne    .Lsn_sqr_n2_loop1	

	/* last square: a[la-1]^2 goes to b[2la-2], b[2la-1] */
	movl   -4(%esi),%eax
	mull   %eax
	addl   %eax,-4(%ebx)
	adcl   $0,%edx
	movl   %edx,(%ebx)

	/* fetch the carry of 2a: drop m and cr, reload the saved flags */
	leal   8(%esp),%esp
	popf
	jnc    .Lsn_sqr_n2_exit
	
	/* the doubling overflowed: add a[0..la-2] to b[la..2la-2] */	
	movl   la,%ecx
	decl   %ecx
	negl   %ecx
	clc
	.align 4
	
.Lsn_sqr_n2_add:
	movl   -4(%esi,%ecx,4),%eax
	adcl   %eax,(%ebx,%ecx,4)
	incl   %ecx
	jne    .Lsn_sqr_n2_add
	adcl   $0,(%ebx)            /* last carry into b[2la-1] */
	

EXIT(sn_sqr_n2)
#endif	


                  /* +-------------------------------------+
                     |  b <- a^2, algorithme de Karatsuba  |
                     +-------------------------------------+ */

/* void xn(karasqr)(naturel a, longueur la, naturel b)
 *
 * b <- a^2 by Karatsuba's method.
 *
 * Operands shorter than klim digits are handed to the quadratic routine by
 * jumping to .Lsn_sqr_n2_aux_entry (so have_sn_sqr_n2 must be enabled too).
 * Otherwise a is split as a = a0 + B^l0 * a1 with l0 = la/2 (rounded down)
 * and l1 = la - l0 digits, and
 *
 *     a^2 = a0^2 + B^l0 * (a0^2 + a1^2 - (a1-a0)^2) + B^(2*l0) * a1^2
 *
 * The three squares are computed by recursive calls.  |a1-a0| is built in
 * the low part of b and its square in a scratch area of 2*l2 digits
 * allocated on the stack, which is consumed by the final subtraction loop.
 * When a0 = a1 the routine computes a0^2 only once and combines copies.
 *
 * klim, a, la, b and the ENTER/RETURN/EXIT macros are defined outside this
 * file (presumably in macros-s.h).
 *
 * .Lsn_karasqr_aux_entry is a secondary entry point: other routines of this
 * file jump here with %ecx = la and an identically laid out argument frame.
 */
#ifdef have_sn_karasqr
ENTER(sn_karasqr)

/* local variables (created by the pushes below; the offsets assume that
   %esp = %ebp - 12 after ENTER - TODO confirm against macros-s.h) */
#undef  l0
#undef  l1
#undef  l2
#define l0 -16(%ebp)
#define l1 -20(%ebp)
#define l2 -24(%ebp)
					
	/* small squares are left to the quadratic algorithm */
        movl   la,%ecx
.Lsn_karasqr_aux_entry:
        cmpl   $klim,%ecx
        jb     .Lsn_sqr_n2_aux_entry

	/* prepare the split: l0 = la/2, l1 = l0 + (la odd), using the bit
	   shifted out by shrl (pushl and movl leave CF untouched) */
        shrl   $1,%ecx
        pushl  %ecx               /* l0 */
	movl   %ecx,%edx
        adcl   $0,%edx            /* l1 */
        pushl  %edx
        movl   a,%esi             /* esi -> a0 */
	leal   (%esi,%ecx,4),%edi /* edi -> a1 */
	
        /* b <- |a0-a1| */
	/* edx <- l1-l0 (0 or 1) */
	subl   %ecx,%edx
	je     .Lsn_karasqr_cmp_a01
	cmpl   $0,(%edi,%ecx,4)     /* l1 > l0: is the top digit of a1 zero ? */
	jne    .Lsn_karasqr_a1_bigger  /* if not, a1 > a0 */
	xorl   %edx,%edx            /* else treat a1 as an l0-digit number */

.Lsn_karasqr_cmp_a01:                  /* compare a0 and a1 from the top digit down */
	movl   -4(%esi,%ecx,4),%eax
	cmpl   -4(%edi,%ecx,4),%eax
	jb     .Lsn_karasqr_a1_bigger
	ja     .Lsn_karasqr_a1_smaller
	loop   .Lsn_karasqr_cmp_a01

	/* here a0 = a1. Compute a0^2 into b[0 .. 2*l0-1] */
	movl   b,%edi
	pushl  %edi
	pushl  l0
	pushl  %esi
	call   sn_karasqr
	addl   $12,%esp

	/* a1^2 = a0^2: copy it to b[2*l0 .. 4*l0-1] (esi -> b + 2*l0 digits) */
	movl   l0,%ecx
	addl   %ecx,%ecx
	movl   %ecx,%edx
	leal   (%edi,%ecx,4),%esi
	.align 4

.Lsn_karasqr_copy_a02:
	movl   -4(%edi,%ecx,4),%eax
	movl   %eax,-4(%esi,%ecx,4)
	loop   .Lsn_karasqr_copy_a02

	/* if la is odd, append two zero digits on top of the result */
	testl  $1,la
	jz     .Lsn_karasqr_la_pair
	movl   $0,(%esi,%edx,4)
	movl   $0,4(%esi,%edx,4)
.Lsn_karasqr_la_pair:

	/* add 2*a0^2 shifted by l0 digits: b[l0 .. 3*l0-1] += 2*b[2*l0 ..]   */
	/* edi -> b + 3*l0, esi -> b + 4*l0, ecx = -2*l0, ebx = running carry */
	/* each source digit is read before the loop modifies that position   */
	movl   l0,%ecx
	leal   (%esi,%ecx,4),%edi
	leal   (%edi,%ecx,4),%esi
	movl   %edx,%ecx
	negl   %ecx
	xorl   %ebx,%ebx
	.align 4

.Lsn_karasqr_dupp_a02:
	xorl   %edx,%edx
	movl   (%esi,%ecx,4),%eax
	addl   %eax,%eax
	adcl   $0,%edx
	addl   %eax,%ebx
	adcl   $0,%edx
	addl   %ebx,(%edi,%ecx,4)
	movl   %edx,%ebx
	adcl   $0,%ebx
	incl   %ecx
	jne    .Lsn_karasqr_dupp_a02

	/* propagate the carry into the higher digits */
	addl   %ebx,(%edi)
	jnc    .Lsn_karasqr_ret_a02_done
.Lsn_karasqr_ret_a02:
	leal   4(%edi),%edi
	adcl   $0,(%edi)
	jc     .Lsn_karasqr_ret_a02
.Lsn_karasqr_ret_a02_done:
	RETURN	
	
	/* here a0 <> a1; ecx = number of low digits still to be subtracted, */
	/* edx = 1 if a1 has a nonzero extra top digit, 0 otherwise          */
.Lsn_karasqr_a1_smaller:               /* a0 > a1: swap the two pointers */
	xchgl  %esi,%edi
.Lsn_karasqr_a1_bigger:	             /* compute (edi operand) - (esi operand) */
	pushl  %ecx                  /* l2 = length of the difference */
	movl   b,%ebx
        leal   (%esi,%ecx,4),%esi
        leal   (%edi,%ecx,4),%edi
        leal   (%ebx,%ecx,4),%ebx
	negl   %ecx
	clc
	.align 4

.Lsn_karasqr_sub_a01:                  /* subtract the common digits */
	movl   (%edi,%ecx,4),%eax
	sbbl   (%esi,%ecx,4),%eax
	movl   %eax,(%ebx,%ecx,4)
	incl   %ecx
	jne    .Lsn_karasqr_sub_a01
	decl   %edx                  /* decl keeps the borrow in CF */
	jnz    .Lsn_karasqr_sub_a01_done
	movl   (%edi),%eax           /* a1 is one digit longer: handle its top digit */
	sbbl   $0,%eax
	movl   %eax,(%ebx)
	incl   l2
.Lsn_karasqr_sub_a01_done:

	/* compute (a1-a0)^2 into a scratch area of 2*l2 digits (8*l2 bytes) */
	/* reserved on the stack; pushl %esp passes its address as the       */
	/* destination argument                                              */
	movl   l2,%eax
	movl   %eax,%ebx
	addl   %eax,%eax
	addl   %eax,%eax
	addl   %eax,%eax
	subl   %eax,%esp
	pushl  %esp
	pushl  %ebx
	movl   b,%edi
	pushl  %edi
	call   sn_karasqr
	addl   $12,%esp

	/* compute a0^2 into b[0 .. 2*l0-1] */
	movl   a,%esi
	movl   l0,%ebx
	pushl  %edi
	pushl  %ebx
	pushl  %esi
	call   sn_karasqr
	addl   $12,%esp

	/* compute a1^2 into b[2*l0 .. 2*la-1] */
	leal   (%esi,%ebx,4),%esi
	leal   (%edi,%ebx,8),%edi
	pushl  %edi
	pushl  l1
	pushl  %esi
	call   sn_karasqr
	addl   $12,%esp
	
/*  	cross addition: b holds a0^2 = alpha:beta and a1^2 = gamma:delta:eps;
	both squares are added once more, shifted by l0 digits:
  
  		l0      l0      l0      l0     2l1-2l0
  	     <------> <-----> <-----> <-----> <--->
  	     +-------+-------+-------+-------+-----+
  	     | alpha | beta  | gamma | delta | eps |
  	     +-------+-------+-------+-------+-----+
  		     | alpha | beta  |
  		     +-------+-------+-----+
  		     | gamma | delta | eps |
  		     +-------+-------+-----+
*/
	
	/* gamma <- gamma + beta, carry propagated into delta:eps */
	/* edi -> end of beta (already set)                       */
	/* esi -> end of gamma                                    */
	movl   l0,%ecx
	leal   (%edi,%ecx,4),%esi
	negl   %ecx
	clc
	.align 4

.Lsn_karasqr_gb:
	movl   (%edi,%ecx,4),%eax
	adcl   %eax,(%esi,%ecx,4)
	incl   %ecx
	jne    .Lsn_karasqr_gb
	jnc    .Lsn_karasqr_gb_done
.Lsn_karasqr_gb_ret:
	adcl   $0,(%esi,%ecx,4)
	incl   %ecx
	jc     .Lsn_karasqr_gb_ret
.Lsn_karasqr_gb_done:

	/* beta <- alpha + gamma, where gamma already includes beta */
	/* edi -> end of beta (already set)                         */
	/* esi -> end of gamma (already set)                        */
	/* ebx -> end of alpha                                      */
	movl   l0,%ecx
	negl   %ecx
	leal   (%edi,%ecx,4),%ebx
	clc
	.align 4

.Lsn_karasqr_ab:
	movl   (%esi,%ecx,4),%eax
	adcl   (%ebx,%ecx,4),%eax
	movl   %eax,(%edi,%ecx,4)
	incl   %ecx
	jne    .Lsn_karasqr_ab
	pushf			/* save the carry */

	/* gamma:delta <- gamma:delta + delta:eps, 2*l1-l0 digits         */
	/* esi -> end of the destination (b + 2*l1 + l0 digits)           */
	/* ebx -> end of delta:eps                                        */
	/* each source digit is read before its position is modified      */
	movl   l1,%ecx
	addl   %ecx,%ecx
	subl   l0,%ecx
	leal   (%esi,%ecx,4),%ebx
	leal   (%edi,%ecx,4),%esi
	negl   %ecx
	popf			/* restore the carry of the previous addition */
	.align 4

.Lsn_karasqr_gd:
	movl   (%ebx,%ecx,4),%eax
	adcl   %eax,(%esi,%ecx,4)
	incl   %ecx
	jne    .Lsn_karasqr_gd
	jnc    .Lsn_karasqr_gd_done
.Lsn_karasqr_gd_ret:
	adcl   $0,(%esi,%ecx,4)
	incl   %ecx
	jc     .Lsn_karasqr_gd_ret
.Lsn_karasqr_gd_done:

	/* subtract (a1-a0)^2 at digit position l0                      */
	/* edi -> beta                                                  */
	/* esp -> (a1-a0)^2, 2*l2 digits, popped as they are consumed   */
	/* (popl, leal and loop leave the borrow in CF untouched)       */
	movl   l0,%ecx
	negl   %ecx
	leal   (%edi,%ecx,4),%edi
	movl   l2,%ecx
	addl   %ecx,%ecx
	clc
	.align 4

.Lsn_karasqr_sub_x2:
	popl   %eax
	sbbl   %eax,(%edi)
	leal   4(%edi),%edi
	loop   .Lsn_karasqr_sub_x2
	jnb    .Lsn_karasqr_sub_x2_done
.Lsn_karasqr_sub_x2_ret:
	sbbl   $0,(%edi)	
	leal   4(%edi),%edi
	jb     .Lsn_karasqr_sub_x2_ret
.Lsn_karasqr_sub_x2_done:

	/* done */
EXIT(sn_karasqr)
#endif
	

	      /* +--------------------------+
                 |  b <- a^2, carré rapide  |
                 +--------------------------+ */

/* void xn(sqr_k)(naturel a, longueur la, naturel b)
 *
 * b <- a^2, choosing the squaring algorithm from the operand length:
 *
 *     la <  klim          quadratic algorithm  (.Lsn_sqr_n2_aux_entry)
 *     klim <= la < flim   Karatsuba            (.Lsn_karasqr_aux_entry)
 *     la >= flim          sn_sc_fftsqr(a, la, b)
 *
 * The first two cases are tail jumps into the bodies of sn_sqr_n2 and
 * sn_karasqr, which run on this routine's argument frame with %ecx = la;
 * both routines must therefore be enabled in the same build.
 * klim, flim and sn_sc_fftsqr are defined outside this file.
 */
#ifdef have_sn_sqr_k
ENTER(sn_sqr_k)
	
	/* select the squaring algorithm */
        movl   la,%ecx
        cmpl   $klim,%ecx
        jb     .Lsn_sqr_n2_aux_entry
	cmpl   $flim,%ecx
        jb     .Lsn_karasqr_aux_entry

	/* large operand: delegate to the FFT-based squaring.  The three
	   arguments are not popped here; EXIT presumably restores %esp from
	   the frame pointer - TODO confirm against macros-s.h */
	pushl  b
	pushl  la
	pushl  a
	call   sn_sc_fftsqr

	/* the EXIT name must match the ENTER name above */
EXIT(sn_sqr_k)
#endif


                        /* +----------------+
                           |  Carré dans Z  |
                           +----------------+ */

#ifdef have_sz_sqr_k
/* void xz(sqr_k)(entier *a, entier *b)
 *
 * b <- a^2 for signed integers; b must have room for 2*la digits.
 *
 * An entier is a header word followed by 32-bit digits, least significant
 * first.  The length la is obtained by masking the header with LONG_m
 * (presumably this strips the sign bit - confirm in macros-s.h).  The
 * result header is written as a bare length, i.e. without any sign bit.
 *
 * Operands of 0, 1 or 2 digits are squared inline.  Longer ones go to
 * sn_sc_fftsqr when la >= flim, to sn_karasqr otherwise; the result length
 * is then reduced by one if the top digit turns out to be zero.
 *
 * za, zb, LONG_m, flim, sn_sc_fftsqr and the ENTER/RETURN/EXIT macros are
 * defined outside this file.
 * NOTE(review): .Lsz_sqr_k_exit is not defined in this file; EXIT(sz_sqr_k)
 * presumably emits it - confirm in macros-s.h.
 */
ENTER(sz_sqr_k)

        movl   za,%esi
        movl   zb,%edi
        movl   (%esi),%ecx
        andl   $LONG_m,%ecx

        /* handle the small squares separately */
        cmpl   $2,%ecx
        ja     .Lsz_sqr_k_big_a
        je     .Lsz_sqr_k_a2
        jecxz  .Lsz_sqr_k_one_done  /* la = 0: the result has length 0 */

        /* square of a single digit: result has 1 digit, or 2 if the high
           half of the product is nonzero */
        movl   4(%esi),%eax
        mull   %eax
        movl   %eax,4(%edi)
        testl  %edx,%edx
        jz     .Lsz_sqr_k_one_done
        incl   %ecx
        movl   %edx,8(%edi)
.Lsz_sqr_k_one_done:
        movl   %ecx,(%edi)
        RETURN

        /* square of two digits: a0^2 and a1^2 are stored side by side,
           then 2*a0*a1 (a 3-digit value in ebx:edx:eax) is added one digit
           higher.  Both digits of a are loaded before b is written */
.Lsz_sqr_k_a2:
        movl   4(%esi),%eax
        movl   8(%esi),%ebx
        movl   %eax,%ecx
        mull   %eax
        movl   %eax,4(%edi)
        movl   %edx,8(%edi)
        movl   %ebx,%eax
        mull   %eax
        movl   %eax,12(%edi)
        movl   %edx,16(%edi)
        movl   %ebx,%eax
        mull   %ecx
        xorl   %ebx,%ebx
        movl   $3,%ecx
        addl   %eax,%eax
        adcl   %edx,%edx
        adcl   %ebx,%ebx
        addl   %eax,8(%edi)
        adcl   %edx,12(%edi)
        adcl   %ebx,16(%edi)
        jz     .Lsz_sqr_k_two_done  /* top digit zero: length 3, else 4 */
        incl   %ecx
.Lsz_sqr_k_two_done:
        movl   %ecx,(%edi)
        RETURN

        /* square of at least three digits */
.Lsz_sqr_k_big_a:
        /* skip the headers: esi and edi now point to the digit arrays */
        leal   4(%esi),%esi
        leal   4(%edi),%edi
        
        /* length and sign of the result: provisionally 2*la */
        leal   (,%ecx,2),%eax
        movl   %eax,-4(%edi)

        /* la >= flim -> square by fft */
        /* NOTE(review): a is not copied on this path even when a and b are
           the same object; presumably sn_sc_fftsqr accepts b = a - confirm */
	cmpl   $flim,%ecx
	jc     .Lsz_sqr_k_nofft
        pushl  %edi
        pushl  %ecx
        pushl  %esi
	call   sn_sc_fftsqr
	jmp    .Lsz_sqr_k_lb

.Lsz_sqr_k_nofft:
	/* in-place call (a and b are the same object): copy a onto the
	   stack, top digit first, since it is about to be overwritten */
	cmpl   %esi,%edi
	jnz    .Lsz_sqr_k_a_free
	movl   %ecx,%eax
	.align 4
.Lsz_sqr_k_copy_a:
	pushl  -4(%esi,%eax,4)
	decl   %eax
	jne    .Lsz_sqr_k_copy_a
	movl   %esp,%esi
.Lsz_sqr_k_a_free:	

        /* perform the squaring: sn_karasqr(esi, la, edi) */
	pushl  %edi
	pushl  %ecx
	pushl  %esi
        call   sn_karasqr

        /* check the length of the result: drop a zero top digit */
.Lsz_sqr_k_lb:	
	movl   zb,%edi
        movl   (%edi),%ecx
        movl   (%edi,%ecx,4),%edx
        testl  %edx,%edx
        jnz    .Lsz_sqr_k_exit
        decl   (%edi)

EXIT(sz_sqr_k)
#endif


