/* dsp.c */
/* asserv - Position & speed motor control on a ATmega128. {{{
 *
 * Copyright (C) 2004 Nicolas Schodet
 *
 * Robot APB Team/Efrei 2005.
 *        Web: http://assos.efrei.fr/robot/
 *      Email: robot AT efrei DOT fr
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * }}} */
#include "dsp.h"

/* +AutoDec */
/* -AutoDec */

/** Numbers notation:
 * [u]{i|f}x[.y]
 * u: unsigned
 * i: integer
 * f: fixed point
 * x: integral part size in bits
 * y: fractionnal part size in bits
 *
 * Ex: i16: signed 16 bit word, uf8.8: unsigned fixed 8.8.
 *
 * Angles are mapped from [0, 2pi) to [0,1). */

/** Add two signed words (i16) and saturate. UNUSED and UNTESTED. */
extern inline int16_t
dsp_add_sat_i16i16 (int16_t a, int16_t b)
{
    asm ("add %A0, %A1"		"\n\t"
	 "adc %B0, %B1"		"\n\t"
	 /* Branch if not V. */
	 "brvc 1f"		"\n\t"
	 /* Load Max. */
	 "ldi %A0, 0xff"	"\n\t"
	 "ldi %B0, 0x7f"	"\n\t"
	 /* Branch if not S. */
	 "brlt 1f"		"\n\t"
	 /* Load Min. */
	 "ldi %A0, 0x00"	"\n\t"
	 "ldi %B0, 0x80"	"\n\t"
    "1:" ""			"\n\t"
	 : "=r" (a) : "0" (a), "r" (b));
    return a;
}

/** Multiply i16 by uf8.8, return i16. */
extern inline int16_t
dsp_mul_i16f88 (int16_t a, uint16_t b)
{
    int16_t r;
    asm ("mulsu %B1, %B2"	"\n\t"
	 "mov %B0, r0"		"\n\t"
	 "mul %A1, %A2"		"\n\t"
	 "mov %A0, r1"		"\n\t"
	 "mul %A1, %B2"		"\n\t"
	 "add %A0, r0"		"\n\t"
	 "adc %B0, r1"		"\n\t"
	 "mulsu %B1, %A2"	"\n\t"
	 "add %A0, r0"		"\n\t"
	 "adc %B0, r1"		"\n\t"
	 "clr r1"		"\n\t"
	 : "=&r" (r) : "a" (a), "a" (b) : "r0");
    return r;
}

/** Multiply f8.24 by f8.24, return f8.24. */
int32_t
dsp_mul_f824 (int32_t a, int32_t b)
{
    register int32_t ar asm ("r16"), br asm ("r20");
    ar = a;
    br = b;
    int32_t r;
    int8_t z;
    asm (""
	 "clr %1"		"\n\t"
	 /* Low dword (>> 8, with 8 guards). */
	 "mul %A2, %B3"		"\n\t"
	  "movw %A0, r0"	"\n\t"
	  "clr %C0"		"\n\t"
	  "clr %D0"		"\n\t"
	 "mul %A2, %A3"		"\n\t"
	  "add %A0, r1"		"\n\t"
	  "adc %B0, %1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	 "mul %B2, %A3"		"\n\t"
	  "add %A0, r0"		"\n\t"
	  "adc %B0, r1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	 "mul %A2, %C3"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mul %B2, %B3"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mul %C2, %A3"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 /* Shift. */
	 "movw %A0, %C0"	"\n\t"
	 /* Upper word. */
	 "mulsu %D3, %C2"	"\n\t"
	  "movw %C0, r0"	"\n\t"
	 "mulsu %D3, %A2"	"\n\t"
	  "sbc %C0, %1"		"\n\t"
	  "sbc %D0, %1"		"\n\t"
	  "add %A0, r0"		"\n\t"
	  "adc %B0, r1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mul %B2, %C3"		"\n\t"
	  "add %A0, r0"		"\n\t"
	  "adc %B0, r1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mul %C2, %B3"		"\n\t"
	  "add %A0, r0"		"\n\t"
	  "adc %B0, r1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mulsu %D2, %A3"	"\n\t"
	  "sbc %C0, %1"		"\n\t"
	  "sbc %D0, %1"		"\n\t"
	  "add %A0, r0"		"\n\t"
	  "adc %B0, r1"		"\n\t"
	  "adc %C0, %1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mulsu %D3, %B2"	"\n\t"
	  "sbc %D0, %1"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mul %C2, %C3"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mulsu %D2, %B3"	"\n\t"
	  "sbc %D0, %1"		"\n\t"
	  "add %B0, r0"		"\n\t"
	  "adc %C0, r1"		"\n\t"
	  "adc %D0, %1"		"\n\t"
	 "mulsu %D2, %C3"	"\n\t"
	  "add %C0, r0"		"\n\t"
	  "adc %D0, r1"		"\n\t"
	 "muls %D2, %D3"	"\n\t"
	  "add %D0, r0"		"\n\t"
	 "clr r1"		"\n\t"
	 : "=&r" (r), "=&r" (z) : "a" (ar), "a" (br) : "r0");
    return r;
}

/** Divide f8.24 by f8.24, result f8.24. */
int32_t
dsp_div_f824 (int32_t dd, int32_t dv)
{
    int32_t rem;
    int8_t cnt;
    asm (""
	 /* Store sign. */
	 "mov __tmp_reg__, %D1"	"\n\t"
	 "eor __tmp_reg__, %D3"	"\n\t"
	 /* Change sign. */
	 "sbrs %D1, 7"		"\n\t"
	 "rjmp 1f"		"\n\t"
	 "com %A1"		"\n\t"
	 "com %B1"		"\n\t"
	 "com %C1"		"\n\t"
	 "com %D1"		"\n\t"
	 "subi %A1, 0xff"	"\n\t"
	 "sbci %B1, 0xff"	"\n\t"
	 "sbci %C1, 0xff"	"\n\t"
	 "sbci %D1, 0xff"	"\n\t"
    "1:" "sbrs %D3, 7"		"\n\t"
	 "rjmp 2f"		"\n\t"
	 "com %A3"		"\n\t"
	 "com %B3"		"\n\t"
	 "com %C3"		"\n\t"
	 "com %D3"		"\n\t"
	 "subi %A3, 0xff"	"\n\t"
	 "sbci %B3, 0xff"	"\n\t"
	 "sbci %C3, 0xff"	"\n\t"
	 "sbci %D3, 0xff"	"\n\t"
	 /* Clear rem. */
    "2:" "clr %A0"		"\n\t"
	 "clr %B0"		"\n\t"
	 "movw %C0, %A0"	"\n\t"
	 /* First loop, dropped bits. */
	 "ldi %2, 24"		"\n\t"
    "1:" //"lsl %A1"		"\n\t"	// shift out dd
	 "lsl %B1"		"\n\t"	// do not touch A1
	 "rol %C1"		"\n\t"
	 "rol %D1"		"\n\t"
	 "rol %A0"		"\n\t"	// shift in rem
	 "rol %B0"		"\n\t"	// 24 bits only
	 "rol %C0"		"\n\t"
	 //"rol %D0"		"\n\t"
	 "sub %A0, %A3"		"\n\t"	// rem -= dv
	 "sbc %B0, %B3"		"\n\t"
	 "sbc %C0, %C3"		"\n\t"
	 "sbc %D0, %D3"		"\n\t"
	 "brcc 2f"		"\n\t"	// if negative, restore rem
	 "add %A0, %A3"		"\n\t"
	 "adc %B0, %B3"		"\n\t"
	 "adc %C0, %C3"		"\n\t"
	 "adc %D0, %D3"		"\n\t"
    "2:" "dec %2"		"\n\t"	// test for loop
	 "brne 1b"		"\n\t"
	 /* Second loop, stored bits. */
	 "ldi %2, 8"		"\n\t"
    "3:" "rol %A1"		"\n\t"	// shift out dd, shift in result
	 "rol %A0"		"\n\t"	// shift in rem
	 "rol %B0"		"\n\t"
	 "rol %C0"		"\n\t"
	 "rol %D0"		"\n\t"
	 "sub %A0, %A3"		"\n\t"	// rem -= dv
	 "sbc %B0, %B3"		"\n\t"
	 "sbc %C0, %C3"		"\n\t"
	 "sbc %D0, %D3"		"\n\t"
	 "brcc 4f"		"\n\t"	// if negative, restore rem
	 "add %A0, %A3"		"\n\t"
	 "adc %B0, %B3"		"\n\t"
	 "adc %C0, %C3"		"\n\t"
	 "adc %D0, %D3"		"\n\t"
	 "clc"			"\n\t"	// result bit 0
	 "sbrc __zero_reg__, 0" "\n\t"
    "4:" "sec"			"\n\t"	// result bit 1
	 "dec %2"		"\n\t"	// test for loop
	 "brne 3b"		"\n\t"
	 /* Last loop, stored bits, dd padding bits. */
	 "ldi %2, 24"		"\n\t"
    "5:" "rol %A1"		"\n\t"	// shift out dd, shift in result
	 "rol %B1"		"\n\t"	// 0s come from the first loop
	 "rol %C1"		"\n\t"
	 "rol %D1"		"\n\t"
	 "rol %A0"		"\n\t"	// shift in rem
	 "rol %B0"		"\n\t"
	 "rol %C0"		"\n\t"
	 "rol %D0"		"\n\t"
	 "sub %A0, %A3"		"\n\t"	// rem -= dv
	 "sbc %B0, %B3"		"\n\t"
	 "sbc %C0, %C3"		"\n\t"
	 "sbc %D0, %D3"		"\n\t"
	 "brcc 6f"		"\n\t"	// if negative, restore rem
	 "add %A0, %A3"		"\n\t"
	 "adc %B0, %B3"		"\n\t"
	 "adc %C0, %C3"		"\n\t"
	 "adc %D0, %D3"		"\n\t"
	 "clc"			"\n\t"	// result bit 0
	 "sbrc __zero_reg__, 0" "\n\t"
    "6:" "sec"			"\n\t"	// result bit 1
	 "dec %2"		"\n\t"	// test for loop
	 "brne 5b"		"\n\t"
	 /* Store last bit. */
	 "rol %A1"		"\n\t"	// shift in result
	 "rol %B1"		"\n\t"
	 "rol %C1"		"\n\t"
	 "rol %D1"		"\n\t"
	 /* Restore sign. */
	 "sbrs __tmp_reg__, 7"	"\n\t"
	 "rjmp 7f"		"\n\t"
	 "com %A1"		"\n\t"
	 "com %B1"		"\n\t"
	 "com %C1"		"\n\t"
	 "com %D1"		"\n\t"
	 "subi %A1, 0xff"	"\n\t"
	 "sbci %B1, 0xff"	"\n\t"
	 "sbci %C1, 0xff"	"\n\t"
	 "sbci %D1, 0xff"	"\n\t"
    "7:" ""			"\n\t"
	 : "=&r" (rem), "=d" (dd), "=&d" (cnt) : "d" (dv), "1" (dd));
    return dd;
}

/** Compute cosinus for angles between [0,pi/2]. */
int32_t
dsp_cos_dli (int32_t a)
{
    static const int32_t f[] = {
	(1L << 24) * -26.42625678337439745096,
	(1L << 24) * 60.24464137187666035919,
	(1L << 24) * -85.45681720669372773226,
	(1L << 24) * 64.93939402266829148905,
	(1L << 24) * -19.73920880217871723738,
	(1L << 24) * 1
    };
    int32_t r;
    int32_t a2;
    uint8_t i;
    a2 = dsp_mul_f824 (a, a);
    r = f[0];
    for (i = 1; i < sizeof (f) / sizeof (f[0]); i++)
	r = dsp_mul_f824 (r, a2) + f[i];
    return r;
}

/** Compute cosinus, angle f8.24, result f8.24. */
int32_t
dsp_cos_f824 (int32_t a)
{
    a &= (1L << 24) - 1;
    uint8_t z = ((uint32_t) a >> 16) & 0xc0;
    if (z == 0)
	return dsp_cos_dli (a);
    else if (z == 1 << 6)
	return -dsp_cos_dli ((1L << 23) - a);
    else if (z == 2 << 6)
	return -dsp_cos_dli (a & 0xff7fffff);
    else
	return dsp_cos_dli ((1L << 24) - a);
}

/** Compute sinus, angle f8.24, result f8.24. */
int32_t
dsp_sin_f824 (int32_t a)
{
    a &= (1L << 24) - 1;
    uint8_t z = ((uint32_t) a >> 16) & 0xc0;
    if (z == 0)
	return dsp_cos_dli ((1L << 22) - a);
    else if (z == 1 << 6)
	return dsp_cos_dli (a - (1L << 22));
    else if (z == 2 << 6)
	return -dsp_cos_dli ((1L << 23) + (1L << 22) - a);
    else
	return -dsp_cos_dli (a - (1L << 23) - (1L << 22));
}

/** Compute square root, uf24.8. */
uint32_t
dsp_sqrt_f248 (uint32_t x)
{
    uint32_t root, bit, test;
    root = 0;
    bit = 1L << 30;
    do
      {
	test = root + bit;
	//printf ("test = 0x%x, root = 0x%x, bit = 0x%x\n", test, root, bit);
	if (x >= test)
	  {
	    x -= test;
	    root = test + bit;
	    //printf ("x = 0x%x, root = 0x%x\n", x, root);
	  }
	root >>= 1;
	bit >>= 2;
      } while (bit);
    return root << 4;
}

/** Compute square root, ui32 -> ui16. */
uint16_t
dsp_sqrt_ui32 (uint32_t x)
{
    uint32_t root, bit, test;
    root = 0;
    bit = 1L << 30;
    do
      {
	test = root + bit;
	//printf ("test = 0x%x, root = 0x%x, bit = 0x%x\n", test, root, bit);
	if (x >= test)
	  {
	    x -= test;
	    root = test + bit;
	    //printf ("x = 0x%x, root = 0x%x\n", x, root);
	  }
	root >>= 1;
	bit >>= 2;
      } while (bit);
    return root;
}

/** Compute hypothenuse, f24.8 -> ui16. Input limited to +-65535 */
uint16_t
dsp_hypot (int32_t dx, int32_t dy)
{
    dx >>= 8;
    dy >>= 8;
    return dsp_sqrt_ui32 (dx * dx + dy * dy);
}