/* Copyright (C) 1996 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David Mosberger <davidm@cs.arizona.edu>, 1996.
   Based on public-domain C source by Linus Torvalds.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA. */

/* Vectorized sqrt routine for 21164
                      by Kazushige Goto <goto@statabo.rim.or.jp> */

/*
     Usage : dsqrtv(double *src, double *dst, int size);

	This routine needs only about 17 clocks per element,
        but it does NOT handle any exceptional cases.
*/

	.set noat
	.set noreorder

#ifdef __ELF__
	.section .rodata
#else
	.rdata
#endif

	.align 5        # align to cache line

sqrtdata:
.long   0x1500, 0x2ef8,   0x4d67,  0x6b02,  0x87be,  0xa395,  0xbe7a,  0xd866
.long   0xf14a, 0x1091b, 0x11fcd, 0x13552, 0x14999, 0x15c98, 0x16e34, 0x17e5f
.long  0x18d03, 0x19a01, 0x1a545, 0x1ae8a, 0x1b5c4, 0x1bb01, 0x1bfde, 0x1c28d
.long  0x1c2de, 0x1c0db, 0x1ba73, 0x1b11c, 0x1a4b5, 0x1953d, 0x18266, 0x16be0
.long  0x1683e, 0x179d8, 0x18a4d, 0x19992, 0x1a789, 0x1b445, 0x1bf61, 0x1c989
.long  0x1d16d, 0x1d77b, 0x1dddf, 0x1e2ad, 0x1e5bf, 0x1e6e8, 0x1e654, 0x1e3cd
.long  0x1df2a, 0x1d635, 0x1cb16, 0x1be2c, 0x1ae4e, 0x19bde, 0x1868e, 0x16e2e
.long  0x1527f, 0x1334a, 0x11051,  0xe951,  0xbe01,  0x8e0d,  0x5924,  0x1edd

	.text

/*
 * NAME is dsqrtv, or dsqrtv_ when FORTRAN is defined.
 *
 * Arguments, as the code below reads them:
 *   $16   address of the input doubles
 *   $17   address of the output doubles
 *   $18   element count; when FORTRAN is defined it is instead the
 *         address of a 32-bit count, fetched with ldl
 * Result: $f0 is cleared before returning.  A count <= 0 stores nothing.
 *
 * Work done for every element x (see the per-instruction comments):
 *   hi = (64-bit pattern of x) >> 33
 *   g  = 0x5fe80000 - hi                      (32-bit subl)
 *   g  = g - sqrtdata[(g >> 12) & 0xfc]       (byte offset into table)
 *   y  = the double whose bit pattern is g << 32
 *   y  = 0.5 * y * (3.0 - x*y*y)              constant in $f2
 *   y  = 0.5 * y * (c   - x*y*y)              constant in $f10, see below
 *   z  = x * y
 *   out = z + 0.5 * z * (1.0 - z*y)
 * The two y updates have the form of a Newton-Raphson step for
 * 1/sqrt(x).  No test for special operands appears anywhere in the
 * routine.
 *
 * Registers:
 *   $23   count >> 2, number of four-element groups still to start
 *   $24   count & 3, leftover elements handled one at a time
 *   $25   0x5fe80000
 *   $28   address of sqrtdata
 *   $f1 = 0.5, $f2 = 3.0, $f3 = 1.0
 *   $f10  bit pattern of 3.0 minus 0x100000 (0x4007fffffff00000),
 *         i.e. slightly below 3.0; the reason is not stated in this
 *         file -- NOTE(review): presumably an accuracy tweak, confirm.
 *   Four-wide path, one column per element:
 *     x      $f11 $f16 $f21 $f26
 *     y      $f12 $f17 $f22 $f27
 *     temps  $f13-$f15, $f18-$f20, $f23-$f25, $f28-$f30
 *     integer guess work in $1/$2, $3/$4, $5/$6, $7/$8
 *
 * Stack frame (48 bytes):
 *    0..31   scratch used to move 64-bit integer patterns into FP
 *            registers (stq followed by ldt); offsets 8, 16 and 24 first
 *            carry the constants, then all four slots carry guesses
 *   32       saved $f2
 *   40       saved $f3
 */
#ifndef FORTRAN
#define NAME	dsqrtv
#else
#define NAME	dsqrtv_
#endif

	.globl  NAME
	.align 5
	.ent  NAME
NAME:
	.frame $30, 48, $26
	lda	$30 , -48($30)			# allocate the 48-byte frame
	/* NOTE(review): ldgp follows the lda, so it is one instruction past
	   the entry point; the displacement .-NAME looks intended to account
	   for that.  Confirm the assembler in use honours it.  */
	ldgp	$29 , .-NAME($27)
	stt	$f2 , 32($30) 			# save $f2, restored at $end
	.prologue 1
	.align 4

#ifdef FORTRAN
	/* The count is passed by address here: replace the pointer in $18
	   with the 32-bit value it points to.  */
	ldl	$18, 0($18)
	unop
	unop
	unop
#endif

	stt	$f3 , 40($30) 			# save $f3, restored at $end
	ble	$18, $end			# count <= 0: nothing to do
	lda	$28 , sqrtdata			# $28 = table base
	srl	$18, 2, $23			# $23 = count / 4

	/* Build the FP constants from integer bit patterns.  */
	lda	$0, 0x3fe0
	lda	$19, 0x4008
	sll	$0, 48, $0			# 0x3fe0000000000000 = 0.5
	ldah	$25 , 0x5fe8			# $25 = 0x5fe80000

	stq	$0, 0x08($30)
	and	$18, 3, $24			# $24 = count % 4
	sll	$19, 48, $19			# 0x4008000000000000 = 3.0
	ldah	$20, -0x10($19)			# 3.0 pattern - 0x100000

	stq	$19, 0x10($30)
	stq	$20, 0x18($30)
	ldt	$f1, 0x08($30)			# $f1 = 0.5
	ldt	$f2, 0x10($30)			# $f2 = 3.0

	unop
	ldt	$f10, 0x18($30)			# $f10 = just below 3.0
	addt	$f1,  $f1,  $f3			# 0.5 + 0.5 = 1.0
	ble	$23, $sub			# fewer than 4 elements
	.align 4

# main routine

	/* Start-up for the four-wide path: form the integer first guesses
	   for the first group of four elements.  */
	ldq	$1 ,   0($16)
	ldq	$3 ,   8($16)
	ldq	$5 ,  16($16)
	ldq	$7 ,  24($16)

	srl	$1 , 33, $1			# hi = pattern >> 33
	addq	$16, 32, $16			# input pointer past this group
	srl	$3 , 33, $3
	subl	$25 , $1 , $2			# g = 0x5fe80000 - hi

	srl	$5 , 33, $5
	subl	$25 , $3 , $4
	srl	$7 , 33, $7
	subl	$25 , $5 , $6

	srl	$2,   12, $1
	subl	$25 , $7 , $8
	srl	$4,   12, $3
	and	$1, 0xfc, $1			# table byte offset, multiple of 4

	srl	$6,   12, $5
	and	$3, 0xfc, $3
	srl	$8,   12, $7
	and	$5, 0xfc, $5

	and	$7, 0xfc, $7
	addq	$1,   $28, $1
	addq	$3,   $28, $3
	ldl	$1,   0($1)			# table word for element 0

	addq	$5,   $28, $5
	ldl	$3,   0($3)
	addq	$7,   $28, $7
	ldl	$5,   0($5)

	ldl	$7,   0($7)
	subl	$2,   $1, $2			# g -= table word
	subl	$4,   $3, $4
	sll	$2,   32, $2			# g becomes the upper 32 bits

	subl	$6,   $5, $6
	sll	$4,   32, $4
	subl	$8,   $7, $8
	sll	$6,   32, $6

	stq	$2,    0($30)			# guesses go through the stack
	sll	$8,   32, $8
	stq	$4,    8($30)
	stq	$6,   16($30)

	stq	$8,   24($30)
	subq	$23, 1, $23			# one group is now in flight
	ldt	$f11, -32($16)			# x for the four elements
	ldt	$f16, -24($16)

	ldt	$f21, -16($16)
	ldt	$f26, - 8($16)
	ldt	$f12,  0($30)			# y (first guess) for each
	ldt	$f17,  8($30)

	ldt	$f22, 16($30)
	ldt	$f27, 24($30)
	nop
	ble	$23, $main_end			# only one group: skip the loop
	.align 4

/* Main loop.  On entry x and y of the current group are already in FP
   registers.  Each pass refines and stores that group while the integer
   instructions interleaved with it form the first guesses of the next
   group; x and y of the next group are reloaded into the same FP
   registers before the branch back.  */
$main_loop:
	mult	$f11, $f12, $f13		# x * y
	ldq	$1 ,   0($16)
	mult	$f16, $f17, $f18		# x * y
	ldq	$3 ,   8($16)
	mult	$f21, $f22, $f23		# x * y
	ldq	$5 ,  16($16)
	mult	$f26, $f27, $f28		# x * y
	ldq	$7 ,  24($16)

	mult	$f13, $f12, $f13		# x * y * y
	mult	$f18, $f17, $f18		# x * y * y
	mult	$f23, $f22, $f23		# x * y * y
	mult	$f28, $f27, $f28		# x * y * y

	# Cache IN(wait 8 clocks after loading)
	mult	$f1,  $f12, $f14		# 0.5 * y
	srl	$1 , 33, $1
	subt	$f2,  $f13, $f13		# 3. - x * y * y
	addq	$17, 32, $17			# output pointer past current group

	mult	$f1,  $f17, $f19		# 0.5 * y
	srl	$3 , 33, $3
	subt	$f2,  $f18, $f18		# 3. - x * y * y
	subl	$25 , $1 , $2

	mult	$f1,  $f22, $f24		# 0.5 * y
	srl	$5 , 33, $5
	subt	$f2,  $f23, $f23		# 3. - x * y * y
	subl	$25 , $3 , $4

	mult	$f1,  $f27, $f29		# 0.5 * y
	srl	$7 , 33, $7
	subt	$f2,  $f28, $f28		# 3. - x * y * y
	subl	$25 , $5 , $6
	
	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	subl	$25 , $7 , $8
	unop
	srl	$2,   12, $1

	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)
	srl	$4,   12, $3
	unop
	and	$1, 0xfc, $1

	mult	$f24, $f23, $f22		# 0.5 * y * ( 3.0 - x * y * y)
	srl	$6,   12, $5
	unop
	and	$3, 0xfc, $3

	mult	$f29, $f28, $f27		# 0.5 * y * ( 3.0 - x * y * y)
	srl	$8,   12, $7
	unop
	and	$5, 0xfc, $5

	/* Second refinement of y; the subtractions below use $f10.  */
	mult	$f11, $f12, $f13		# x * y
	and	$7, 0xfc, $7
	unop
	addq	$1,   $28, $1

	mult	$f16, $f17, $f18		# x * y
	addq	$3,   $28, $3
	unop
	addq	$5,   $28, $5

	mult	$f21, $f22, $f23		# x * y
	addq	$7,   $28, $7
	unop
	ldl	$1,   0($1)

	mult	$f26, $f27, $f28		# x * y
	ldl	$3,   0($3)
	unop
	ldl	$5,   0($5)

	mult	$f13, $f12, $f13		# x * y * y
	ldl	$7,   0($7)
	mult	$f18, $f17, $f18		# x * y * y
	addq	$16, 32, $16			# input pointer past next group

	mult	$f23, $f22, $f23		# x * y * y
	mult	$f28, $f27, $f28		# x * y * y
	mult	$f1,  $f12, $f14		# 0.5 * y
	subt	$f10, $f13, $f13		# 3. - x * y * y

	mult	$f1,  $f17, $f19		# 0.5 * y
	subt	$f10, $f18, $f18		# 3. - x * y * y
	mult	$f1,  $f22, $f24		# 0.5 * y
	subt	$f10, $f23, $f23		# 3. - x * y * y

	# Cache IN(wait 8 clocks after loading)
	mult	$f1,  $f27, $f29		# 0.5 * y
	subl	$2,   $1, $2
	subt	$f10, $f28, $f28		# 3. - x * y * y
	subl	$4,   $3, $4

	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	subl	$6,   $5, $6
	unop
	sll	$2,   32, $2

	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)
	subl	$8,   $7, $8
	unop
	sll	$4,   32, $4

	mult	$f24, $f23, $f22		# 0.5 * y * ( 3.0 - x * y * y)
	sll	$6,   32, $6
	mult	$f29, $f28, $f27		# 0.5 * y * ( 3.0 - x * y * y)
	sll	$8,   32, $8

	/* Final stage for the current group.  The x and y registers are
	   overwritten with the next group only after their last use here.  */
	mult	$f11, $f12, $f13		# z = x * y
	stq	$2,    0($30)
	mult	$f16, $f17, $f18		# z = x * y
	stq	$4,    8($30)
	mult	$f21, $f22, $f23		# z = x * y
	stq	$6,   16($30)
	mult	$f26, $f27, $f28		# z = x * y
	stq	$8,   24($30)

	mult	$f13, $f12, $f15		# z * y
	ldt	$f11, -32($16)
	mult	$f18, $f17, $f20		# z * y
	ldt	$f16, -24($16)
	mult	$f23, $f22, $f25		# z * y
	ldt	$f21, -16($16)
	mult	$f28, $f27, $f30		# z * y
	ldt	$f26, - 8($16)

	mult	$f1,  $f13, $f14		# z * 0.5
	ldt	$f12,  0($30)
	subt	$f3,  $f15, $f15		# 1.0 - z * y
	ldt	$f17,  8($30)

	mult	$f1,  $f18, $f19		# z * 0.5
	ldt	$f22, 16($30)
	subt	$f3,  $f20, $f20		# 1.0 - z * y
	ldt	$f27, 24($30)

	mult	$f1,  $f23, $f24		# z * 0.5
	subt	$f3,  $f25, $f25		# 1.0 - z * y
	mult	$f1,  $f28, $f29		# z * 0.5
	subt	$f3,  $f30, $f30		# 1.0 - z * y

	mult	$f14, $f15, $f15		# z * 0.5 *(1.0-z*y)
	ldt	$f31 ,  80($16)			# prefetch
	mult	$f19, $f20, $f20		# z * 0.5 *(1.0-z*y)
	unop

	mult	$f24, $f25, $f25		# z * 0.5 *(1.0-z*y)
	mult	$f29, $f30, $f30		# z * 0.5 *(1.0-z*y)

	addt	$f13, $f15, $f13		# z +z * 0.5 *(1.0-z*y)
	addt	$f18, $f20, $f18		# z +z * 0.5 *(1.0-z*y)
	addt	$f23, $f25, $f23		# z +z * 0.5 *(1.0-z*y)
	addt	$f28, $f30, $f28		# z +z * 0.5 *(1.0-z*y)

	stt	$f13, -32($17)			# results of the current group
	subq	$23, 1, $23
	stt	$f18, -24($17)
	stt	$f23, -16($17)
	stt	$f28, - 8($17)
	bgt	$23, $main_loop
	.align 4

/* Drain: finish the last group of four, whose x and y are already in FP
   registers.  Same arithmetic as the loop, with no further loads.  */
$main_end:
	mult	$f11, $f12, $f13		# x * y
	mult	$f16, $f17, $f18		# x * y
	mult	$f21, $f22, $f23		# x * y
	mult	$f26, $f27, $f28		# x * y

	mult	$f13, $f12, $f13		# x * y * y
	mult	$f18, $f17, $f18		# x * y * y
	mult	$f23, $f22, $f23		# x * y * y
	mult	$f28, $f27, $f28		# x * y * y

	mult	$f1,  $f12, $f14		# 0.5 * y
	subt	$f2,  $f13, $f13		# 3. - x * y * y
	mult	$f1,  $f17, $f19		# 0.5 * y
	subt	$f2,  $f18, $f18		# 3. - x * y * y

	mult	$f1,  $f22, $f24		# 0.5 * y
	subt	$f2,  $f23, $f23		# 3. - x * y * y
	mult	$f1,  $f27, $f29		# 0.5 * y
	subt	$f2,  $f28, $f28		# 3. - x * y * y

	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f24, $f23, $f22		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f29, $f28, $f27		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f11, $f12, $f13		# x * y
	mult	$f16, $f17, $f18		# x * y
	mult	$f21, $f22, $f23		# x * y
	mult	$f26, $f27, $f28		# x * y

	mult	$f13, $f12, $f13		# x * y * y
	mult	$f18, $f17, $f18		# x * y * y
	mult	$f23, $f22, $f23		# x * y * y
	mult	$f28, $f27, $f28		# x * y * y

	mult	$f1,  $f12, $f14		# 0.5 * y
	subt	$f10, $f13, $f13		# 3. - x * y * y
	mult	$f1,  $f17, $f19		# 0.5 * y
	subt	$f10, $f18, $f18		# 3. - x * y * y

	mult	$f1,  $f22, $f24		# 0.5 * y
	subt	$f10, $f23, $f23		# 3. - x * y * y
	mult	$f1,  $f27, $f29		# 0.5 * y
	subt	$f10, $f28, $f28		# 3. - x * y * y

	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f24, $f23, $f22		# 0.5 * y * ( 3.0 - x * y * y)
	mult	$f29, $f28, $f27		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f11, $f12, $f13		# z = x * y
	mult	$f16, $f17, $f18		# z = x * y
	mult	$f21, $f22, $f23		# z = x * y
	mult	$f26, $f27, $f28		# z = x * y

	mult	$f13, $f12, $f15		# z * y
	mult	$f18, $f17, $f20		# z * y
	mult	$f23, $f22, $f25		# z * y
	mult	$f28, $f27, $f30		# z * y

	mult	$f1,  $f13, $f14		# z * 0.5
	subt	$f3,  $f15, $f15		# 1.0 - z * y
	mult	$f1,  $f18, $f19		# z * 0.5
	subt	$f3,  $f20, $f20		# 1.0 - z * y

	mult	$f1,  $f23, $f24		# z * 0.5
	subt	$f3,  $f25, $f25		# 1.0 - z * y
	mult	$f1,  $f28, $f29		# z * 0.5
	subt	$f3,  $f30, $f30		# 1.0 - z * y

	mult	$f14, $f15, $f15		# z * 0.5 *(1.0-z*y)
	mult	$f19, $f20, $f20		# z * 0.5 *(1.0-z*y)
	mult	$f24, $f25, $f25		# z * 0.5 *(1.0-z*y)
	mult	$f29, $f30, $f30		# z * 0.5 *(1.0-z*y)

	addt	$f13, $f15, $f13		# z +z * 0.5 *(1.0-z*y)
	addq	$17, 32, $17			# output pointer past last group
	addt	$f18, $f20, $f18		# z +z * 0.5 *(1.0-z*y)
	addt	$f23, $f25, $f23		# z +z * 0.5 *(1.0-z*y)
	addt	$f28, $f30, $f28		# z +z * 0.5 *(1.0-z*y)

	stt	$f13, -32($17)
	stt	$f18, -24($17)
	stt	$f23, -16($17)
	stt	$f28, - 8($17)
	.align 4

/* Leftover path: the count & 3 elements that remain, one at a time.
   Reached either directly (count < 4) or by falling through from the
   four-wide path, with $16 and $17 already advanced past the groups.  */
$sub:
	ble	$24, $end			# no leftovers

	subq	$24, 1, $24			# one element is now in flight

	/* First guess for the first leftover element.  */
	ldq	$1 ,  0($16)
	srl	$1 , 33, $1
	subl	$25 , $1 , $2

	srl	$2,   12, $1
	and	$1, 0xfc, $1
	addq	$1,   $28, $1
	ldl	$1,   0($1)
	subl	$2,   $1, $2
	sll	$2,   32, $2
	stq	$2,   0x18($30)
	nop
	ldt	$f11, 0x00($16)			# x
	ldt	$f12, 0x18($30)			# y (first guess)
	ble	$24, $sub_end			# it was the only leftover
	.align 4

/* Same one-ahead arrangement as the main loop, one element wide: the
   element in $f11/$f12 is refined and stored while the first guess for
   the element at 8($16) is formed.  */
$sub_loop:
	mult	$f11, $f12, $f13		# x * y
	ldq	$1 ,   8($16)
	mult	$f1,  $f12, $f14		# 0.5 * y
	mult	$f13, $f12, $f13		# x * y * y

	subt	$f2,  $f13, $f13		# 3. - x * y * y
	srl	$1 , 33, $1
	subl	$25 , $1 , $2
	srl	$2,   12, $1

	and	$1, 0xfc, $1
	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	addq	$1,   $28, $1
	ldl	$1,   0($1)

	mult	$f11, $f12, $f13		# x * y
	mult	$f1,  $f12, $f14		# 0.5 * y
	mult	$f13, $f12, $f13		# x * y * y
	subt	$f10, $f13, $f13		# 3. - x * y * y

	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)
	subl	$2,   $1, $2
	sll	$2,   32, $2
	stq	$2,    0($30)

	mult	$f11, $f12, $f13		# z = x * y
	ldt	$f11,  8($16)			# x of the next element
	addq	$16, 8, $16
	addq	$17, 8, $17

	mult	$f13, $f12, $f15		# z * y
	subq	$24, 1, $24
	mult	$f1,  $f13, $f14		# z * 0.5
	ldt	$f12,  0($30)			# y of the next element

	subt	$f3,  $f15, $f15		# 1.0 - z * y
	mult	$f14, $f15, $f15		# z * 0.5 *(1.0-z*y)
	addt	$f13, $f15, $f13		# z +z * 0.5 *(1.0-z*y)
	unop

	stt	$f13, -8($17)
	bgt	$24, $sub_loop
	.align 4

/* Drain for the leftover path: finish the final element.  */
$sub_end:
	mult	$f11, $f12, $f13		# x * y
	mult	$f1,  $f12, $f14		# 0.5 * y
	mult	$f13, $f12, $f13		# x * y * y
	subt	$f2,  $f13, $f13		# 3. - x * y * y
	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f11, $f12, $f13		# x * y
	mult	$f1,  $f12, $f14		# 0.5 * y
	mult	$f13, $f12, $f13		# x * y * y
	subt	$f10, $f13, $f13		# 3. - x * y * y
	mult	$f14, $f13, $f12		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f11, $f12, $f13		# z = x * y

	mult	$f13, $f12, $f15		# z * y
	mult	$f1,  $f13, $f14		# z * 0.5
	subt	$f3,  $f15, $f15		# 1.0 - z * y
	mult	$f14, $f15, $f15		# z * 0.5 *(1.0-z*y)
	addt	$f13, $f15, $f13		# z +z * 0.5 *(1.0-z*y)

	stt	$f13, 0($17)
	.align 4

/* Common exit: restore the two saved FP registers, clear $f0, release
   the frame and return.  */
$end:
	ldt	$f2 , 32($30) 
	ldt	$f3 , 40($30) 
	fclr	$f0
	addq	$30 , 48, $30 			# e0    :
	ret

	.end  NAME

