// file kernel/n/x86-64/burnikel.S: Burnikel-Ziegler division
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Division de Burnikel et Ziegler                   |
 |                                                                       |
 +-----------------------------------------------------------------------*/


                               # +------------+
                               # |  Division  |
                               # +------------+

# .Lsn_fburnidiv: Burnikel-Ziegler division, internal register-based entry.
#
# Input:
#   a = natural number of length la     rsi = &a, rdx = la-lb
#   b = natural number of length lb     rbx = &b, rcx = lb
#   c = natural number of length la-lb  rdi = &c
#
# Constraints:
#   2 <= lb < la, the most significant bit of b is set,
#   a < BASE^(la-lb)*b,
#   a, b, c do not overlap
#
# Output:
#   a <- a mod b
#   c <- floor(a/b)
#
# Registers clobbered: all
#
# Method, as implemented below:
#   b is split into b0 (low p = lb/2 digits) and b1 (high q = (lb+1)/2
#   digits).  The quotient is produced slice by slice, most significant
#   slice first: one slice of r digits (0 < r <= q), then n slices of q
#   digits, with la-lb = n*q + r.  For each slice the high part a1 of the
#   current window of a is divided by b1 (recursively), the slice quotient
#   is multiplied by b0 and subtracted from the window, and the quotient is
#   decremented / b added back while the result is negative.

#ifdef assembly_sn_burnidiv
#undef L
#define L(x) .Lsn_fburnidiv_##x
        ALIGN(32)
#ifdef debug_burnidiv
.Lsn_fburnidiv_buggy:   
#else
.Lsn_fburnidiv:
#endif
        
        # small division => hand over to fdiv_n2 (tail jump, same registers)
        cmpq   $burnidiv_lim, %rcx      # lb <= burnidiv_lim ?
        jbe    .Lsn_fdiv_n2
        cmpq   $div_small_c_lim, %rdx   # la-lb <= div_small_c_lim ?
        jbe    .Lsn_fdiv_n2

        # local variables, addressed relative to rsp:
        #   x       scratch area starting at offset 56, receives c*b0
        #   a, b, c saved pointers for the current slice
        #   n       number of slices left after the current one
        #   p, q    lb/2 and (lb+1)/2
        #   r       digit count of the current quotient slice
        #undef _a_
        #undef _b_
        #undef _c_
        #undef _n_
        #undef _p_
        #undef _q_
        #undef _r_
        #undef _x_
        #define _x_  56(%rsp)
        #define _a_  48(%rsp)
        #define _b_  40(%rsp)
        #define _c_  32(%rsp)
        #define _n_  24(%rsp)
        #define _p_  16(%rsp)
        #define _q_   8(%rsp)
        #define _r_    (%rsp)

        # split b in two halves
        leaq   56(,%rcx,8),%rax         # rax <- 8*lb + 56 bytes
        ALLOCA                          # reserve lb digits + 7 words
        shrq   $1,      %rcx            # rcx <- p = lb/2, CF <- lb mod 2
        movq   %rcx,    _p_             # movq leaves CF untouched
        adcq   $0,      %rcx            # rcx <- q = (lb+1)/2
        movq   %rcx,    _q_

        # cut a into slices of q digits
        movq   %rdx,    %rax
        movq   %rdx,    %rbp            # rbp <- la-lb
        xorq   %rdx,    %rdx            # rdx:rax <- la-lb
        divq   %rcx                     # rdx <- r, rax <- n
        testq  %rdx,    %rdx
        jnz    1f
        movq   %rcx,    %rdx            # when r = 0: r <- q, n--
        decq   %rax
1:
        movq   %rdx,    _r_
        movq   %rax,    _n_

        movq   %rbp,    %rax
        subq   %rdx,    %rax            # rax <- la-lb-r
        leaq   (%rsi,%rax,8), %rsi      # rsi <- &a[la-lb-r]
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c[la-lb-r]
        movq   %rbx,    _b_

        # loop over the slices
        # on entry here: rsi = &a, rbx = &b, rdi = &c, rdx = r
        # (a and c denote the current window / current quotient slice)
        ALIGN(8)
L(tranche):

        # compare the top q digits of a1 with b1, most significant first
        movq   %rsi,    _a_
        movq   %rdi,    _c_
        movq   _p_,     %rcx
        leaq   (%rsi,%rcx,8), %rsi      # rsi <- &a1
        leaq   (%rsi,%rdx,8), %rdi      # rdi <- &a1[r]
        leaq   (%rbx,%rcx,8), %rbx      # rbx <- &b1
        movq   _q_,     %rcx
1:
        movq -8(%rdi,%rcx,8), %rax
        cmpq -8(%rbx,%rcx,8), %rax
        loope  1b                       # go on while digits match and rcx > 0
        jne    L(a1_ok)                 # a differing digit was found

        # case top q digits of a1 = b1: c <- BASE^r - 1 and a1 <- a1 - b1*c
        xorq   %rax,    %rax
        movq   _q_,     %rcx
        cld;   rep stosq                # zero a1[r..r+q-1]: a1 <- a1 - BASE^r*b1
        movq   _c_,     %rdi
        movq   %rdx,    %rcx            # rcx <- r
        movq   $-1,     %rax
        rep    stosq                    # c <- BASE^r - 1
        movq   _q_,     %rcx            # rcx <- q
	movq   %rsi,    %rdi            # rdi <- &a1
        call   .Lsn_fadd_1              # a1 <- a1 + b1 (rbx = &b1, q digits)
        # NOTE(review): the next instruction relies on .Lsn_fadd_1 returning
        # with CF = carry out, rcx = 0 and rsi just past the added digits;
        # confirm against its definition.
        adcq   %rcx,   (%rsi)           # store the carry
        jmp    L(div_done)

        # case a1 < b1: c <- floor(a1/b1), a1 <- a1 mod b1
        ALIGN(8)
L(a1_ok):
        movq   _q_,     %rcx            # rcx <- q = length of b1
        movq   _c_,     %rdi            # rdi <- &c; rsi = &a1, rbx = &b1, rdx = r
        # recursive call; when debug_burnidiv is defined this label is the
        # trampoline to the C version defined further down in this file
        call   .Lsn_fburnidiv           # perform the division
L(div_done):

        # compute c*b0
        movq   _b_,     %rbx            # rbx <- &b0 (low p digits of b)
        movq   _p_,     %rcx
        movq   _c_,     %rsi
        movq   _r_,     %rdx
        leaq   _x_,     %rdi
        cmpq   %rcx,    %rdx            # r < p => swap the operands
        jae    1f
        # NOTE(review): presumably .Lsn_ftoommul wants the longer operand
        # in rsi/rdx; confirm against its definition.
        xchgq  %rsi,    %rbx
        xchgq  %rdx,    %rcx
1:
        call   .Lsn_ftoommul            # x <- c*b0

        # a <- a0:r1 - x
        movq   _a_,     %rsi
        movq   _p_,     %rcx
        movq   _q_,     %rdx
        leaq   _x_,     %rbx
        leaq  1(%rcx,%rdx,1), %rdx      # rdx <- lb+1
        addq   _r_,     %rcx            # rcx <- p+r
        call   .Lsn_fdec

        # correction while a < 0
        # NOTE(review): assumes .Lsn_fdec returns the final borrow in CF.
        jnb    L(next)
1:
        movq   _c_,     %rsi
2:
        subq   $1,     (%rsi)           # c--
        leaq   8(%rsi), %rsi            # next digit; leaq keeps CF from subq
        jb     2b                       # propagate the borrow
        movq   _a_,     %rsi
	movq   %rsi,    %rdi
        movq   _b_,     %rbx
        movq   _p_,     %rcx
        addq   _q_,     %rcx            # rcx <- lb
        call   .Lsn_fadd_1              # a += b
        adcq   %rcx,   (%rsi)           # last carry, into the digit above
        jnb    1b                       # no carry out: a still negative, repeat

        # next slice
L(next):
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _c_,     %rdi
        movq   _q_,     %rdx
        movq   %rdx,    _r_             # every following slice has q digits
        leaq   (,%rdx,8), %rax
        subq   %rax,    %rsi            # a -= q
        subq   %rax,    %rdi            # c -= q
        decq   _n_
        jns    L(tranche)               # loop until n drops below zero

        # done
        movq   _p_,     %rax
        addq   _q_,     %rax            # rax <- lb
        # NOTE(review): undoes 8*lb + 56 bytes, i.e. assumes ALLOCA lowered
        # rsp by exactly the value passed in rax; confirm.
        leaq  56(%rsp,%rax,8), %rsp     # release the stack frame
        ret
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        

# void xn(burnidiv)(chiffre *a, long lc, chiffre *b, long lb, chiffre *c)
#
# C-callable wrapper: moves the C arguments into the register layout
# expected by the internal division routine and calls it.
#
# Input:
# a = natural number of length lc+lb
# b = natural number of length lb
# c = natural number of length lc
#
# Constraints:
# lb >= 2, lc > 0, the most significant bit of b is set,
# a < BASE^lc*b
# a, b, c do not overlap
#
# Output:
# a <- a mod b
# c <- floor(a/b)

#ifdef debug_burnidiv
ENTER(sn_burnidiv_buggy)
#else
ENTER(sn_burnidiv)
#endif

        # arguments arrive as rdi = a, rsi = lc, rdx = b, rcx = lb, r8 = c;
        # the moves below form a chain and must stay in this order
        movq   %rdx,    %rbx            # rbx <- &b
        movq   %rsi,    %rdx            # rdx <- la-lb (= lc)
        movq   %rdi,    %rsi            # rsi <- &a
        movq   %r8,     %rdi            # rdi <- &c
                                        # rcx already holds lb
#ifdef debug_burnidiv
        call   .Lsn_fburnidiv_buggy     # perform the division
#else
        call   .Lsn_fburnidiv      
#endif
        # NOTE(review): project macro defined elsewhere; presumably restores
        # the stack pointer and registers saved by the entry macro - confirm.
        RETURN_WITH_SP
        
#endif /* assembly_sn_burnidiv */

        # case where the assembly version is disabled or being debugged:
        # sn_fburnidiv forwards to the C version.
        # Converts the internal register layout (rsi = &a, rdx = la-lb,
        # rbx = &b, rcx = lb, rdi = &c) into the C argument order; the moves
        # form a chain and must stay in this order.
        
#if !defined(assembly_sn_burnidiv) || defined(debug_burnidiv)
        ALIGN(32)
.Lsn_fburnidiv:
	movq   %rdi,  %r8               # r8  <- &c     (5th argument)
	movq   %rsi,  %rdi              # rdi <- &a     (1st argument)
	movq   %rdx,  %rsi              # rsi <- la-lb  (2nd argument)
	movq   %rbx,  %rdx              # rdx <- &b     (3rd argument)
                                        # rcx = lb is already the 4th argument
        # tail jump: the C routine returns straight to our caller
        jmp   SUBR(sn_burnidiv)
        
#endif /* !defined(assembly_sn_burnidiv) || defined(debug_burnidiv) */
