#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "audiohw.h"
#include "printf.h"
#include "boot.h"
#include "heap.h"
#include "irqs.h"
#include "ral.h"
#include "cpu.h"
#define MY_LIB_ID					0x00
#define DMA_LIB_SLOT_IN_GLOBALS		0x18
#include "../dma_driver/DmaDriver.h"

//total number of 16-bit samples in the circular DMA buffer. It is refilled
// one half (AUDIO_BUF_SZ / 2 samples) at a time while DMA plays the other half
#define AUDIO_BUF_SZ				2048

#define LOWER_QUALITY			//uses lower sampling rate and thus less CPU

//AUDIO_MAX_VALID_VAL is the PWM period in timer counts (it is programmed into
// ARR as AUDIO_MAX_VALID_VAL - 1), and so also the largest valid duty value.
//
//AUDIO_DIV_MULTIPLICATIVE_INVERSE and AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT let
// the sample converter divide by AUDIO_DIV_VAL without a divide instruction:
//     x / AUDIO_DIV_VAL == (uint32_t)(((uint64_t)x * INVERSE) >> 32) >> SHIFT
// 0xFA04A397 is ceil(2^43 / 2097) which is also ceil(2^44 / 4194), hence the
// same constant with a shift of 11 for AUDIO_DIV_VAL == 2097 (8000 steps) and
// a shift of 12 for AUDIO_DIV_VAL == 4194 (4000 steps).
// If AUDIO_MAX_VALID_VAL is changed, both values must be recomputed.
#ifdef LOWER_QUALITY
	#define AUDIO_MAX_VALID_VAL					8000
	#define AUDIO_DIV_MULTIPLICATIVE_INVERSE	0xFA04A397
	#define AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT	11
#else
	#define AUDIO_MAX_VALID_VAL					4000
	#define AUDIO_DIV_MULTIPLICATIVE_INVERSE	0xFA04A397
	#define AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT	12
#endif


//duty value that corresponds to silence (50% duty cycle)
#define AUDIO_MIDDLE_VALUE						(AUDIO_MAX_VALID_VAL / 2)
//divisor that scales an unsigned 24-bit sample (0..0x00FFFFFF) down to 0..AUDIO_MAX_VALID_VAL
#define AUDIO_DIV_VAL 							(0x01000000 / AUDIO_MAX_VALID_VAL)




//callback stored by audioOutHwInit(); called with true to get the next block of source samples, then with false once they have been converted
static AudioOutHwReadyForMoreSamplesF mReadyForSamplesF;
//last state requested via audioOutHwSetState(), used to ignore redundant requests
static bool mCurOnState = false;
//DMA stream reserved in audioOutHwInit() that feeds TIM8->CCR1
static DmaStream mDmaStream;
//circular DMA buffer of AUDIO_BUF_SZ PWM duty values, allocated in audioOutHwInit()
static uint16_t *mDmaMem;

//Converts "num" signed 32-bit mixer samples at "src" into 16-bit PWM duty values at "dst".
//
//this func assumes that num is nonzero and divisible by 8 (the loop below consumes
// 8 samples per pass and only exits when the counter reaches exactly zero), else things will break!
//it also assumes that destination is 4(!!!)-byte aligned on v7E
//
//The function is naked: it relies on the arguments arriving in r0 (dst), r1 (src) and
// r2 (num), and it saves/restores r4-r11 and returns by itself (push/pop below).
//NOTE(review): GCC documents that only basic asm is supported inside naked functions;
// this one uses extended asm with immediate operands - confirm with the toolchain in use.
static void __attribute__((naked)) audioHwConvertSamples(int16_t *dst, const int32_t *src, uint32_t num)
{
	/*
		basically:
	
			while(num--) {
				
				int32_t t = *src++;
				
				//saturate to signed 24-bit value
				if (t >= 0x800000)
					t = 0x7fffff;
				else if (t < -0x800000)
					t = -0x800000;
				
				//convert to unsigned
				t += 0x00800000;
				
				//scale to our range (unsigned div, it is faster)
				t = (uint32_t)(t + AUDIO_DIV_VAL / 2) / AUDIO_DIV_VAL;
				
				*dst++ = t;
			}

		except that it is unrolled to handle 8 samples per pass, and the division
		is done as a multiply by AUDIO_DIV_MULTIPLICATIVE_INVERSE (keeping the high
		32 bits of the product) followed by a right shift of
		AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT.
	*/
	
	asm volatile(
		"	push    {r4-r11, lr}			\n\t"
		//r9 = unsigned bias plus rounding term, r10 = reciprocal of AUDIO_DIV_VAL
		"	ldr     r9, =%0					\n\t"
		"	ldr     r10, =%1				\n\t"
		"1:									\n\t"
		//load 8 source samples at once
		"	ldmia   r1!, {r3-r8,r12,lr}		\n\t"
		//per sample: saturate to 24 bits, bias+round, multiply by reciprocal
		// (high word is kept in place, low word goes to r11 and is discarded)
		"	ssat    r3,  #24, r3			\n\t"
		"	add     r3,  r9					\n\t"
		"	umull   r11, r3, r10, r3		\n\t"
		"	ssat    r4,  #24, r4			\n\t"
		"	add     r4,  r9					\n\t"
		"	umull   r11, r4, r10, r4		\n\t"
		"	ssat    r5,  #24, r5			\n\t"
		"	add     r5,  r9					\n\t"
		"	umull   r11, r5, r10, r5		\n\t"
		"	ssat    r6,  #24, r6			\n\t"
		"	add     r6,  r9					\n\t"
		"	umull   r11, r6, r10, r6		\n\t"
		"	ssat    r7,  #24, r7			\n\t"
		"	add     r7,  r9					\n\t"
		"	umull   r11, r7, r10, r7		\n\t"
		"	ssat    r8,  #24, r8			\n\t"
		"	add     r8,  r9					\n\t"
		"	umull   r11, r8, r10, r8		\n\t"
		"	ssat    r12, #24, r12			\n\t"
		"	add     r12, r9					\n\t"
		"	umull   r11, r12, r10, r12		\n\t"
		"	ssat    lr,  #24, lr			\n\t"
		"	add     lr,  r9					\n\t"
		"	umull   r11, lr, r10, lr		\n\t"
			
	#ifdef HAVE_v7E_SUPPORT
		//only the even-indexed samples get the final right shift. The odd-indexed
		// ones are shifted LEFT by (16 - shift) inside pkhbt instead, which lands
		// the same result bits in the top halfword that pkhbt keeps
		"	lsrs    r3,  %2					\n\t"
		"	lsrs    r5,  %2					\n\t"
		"	lsrs    r7,  %2					\n\t"
		"	lsrs    r12, %2					\n\t"
		"	pkhbt   r3,  r3,  r4, lsl %3	\n\t"
		"	pkhbt   r5,  r5,  r6, lsl %3	\n\t"
		"	pkhbt   r7,  r7,  r8, lsl %3	\n\t"
		"	pkhbt   r12, r12, lr, lsl %3	\n\t"
	#else
		//finish the division for all 8 samples, then pack pairs into words by hand:
		// even-indexed sample in the low halfword, odd-indexed one in the high halfword
		"	lsrs    r3,  %2					\n\t"
		"	lsrs    r4,  %2					\n\t"
		"	lsrs    r5,  %2					\n\t"
		"	lsrs    r6,  %2					\n\t"
		"	lsrs    r7,  %2					\n\t"
		"	lsrs    r8,  %2					\n\t"
		"	lsrs    r12, %2					\n\t"
		"	lsrs    lr,  %2					\n\t"
		"	uxth    r3,  r3					\n\t"
		"	add     r3,  r3,  r4, lsl #16	\n\t"
		"	uxth    r5,  r5					\n\t"
		"	add     r5,  r5,  r6, lsl #16	\n\t"
		"	uxth    r7,  r7					\n\t"
		"	add     r7,  r7,  r8, lsl #16	\n\t"
		"	uxth    r12, r12				\n\t"
		"	add     r12, r12, lr, lsl #16	\n\t"
	#endif
		
		//store 4 words (8 packed output samples), count down by 8, loop until zero
		"	stmia   r0!, {r3, r5, r7, r12}	\n\t"
		"	subs    r2, #8					\n\t"
		"	bne     1b						\n\t"
		"	pop     {r4-r11, pc}			\n\t"
		".ltorg								\n\t"
		:
		//%0 = bias + rounding term, %1 = reciprocal, %2 = final right shift, %3 = pkhbt left shift
		:"i"(0x00800000 + AUDIO_DIV_VAL / 2), "i"(AUDIO_DIV_MULTIPLICATIVE_INVERSE), "I"(AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT), "I"(16 - AUDIO_DIV_MULTIPLICATIVE_INV_SHIFT)
		:"memory","cc"
	);
}

//Refills one half of the DMA buffer: asks the client for its next block of
// source samples, converts them into PWM duty values in place in the chosen
// half, then tells the client that its block has been consumed.
//The whole thing runs between ralSetSafeR9() and ralRestoreR9().
static void audioHwPrvReqData(bool secondHalf)
{
	const uint32_t samplesPerHalf = AUDIO_BUF_SZ / 2;
	uint32_t savedR9 = ralSetSafeR9();
	int16_t *dstHalf = (int16_t*)mDmaMem;

	if (secondHalf)
		dstHalf += samplesPerHalf;

	audioHwConvertSamples(dstHalf, mReadyForSamplesF(true), samplesPerHalf);
	mReadyForSamplesF(false);

	ralRestoreR9(savedR9);
}

//DMA stream interrupt handler. A half-transfer event means the first half of
// the buffer is free to be refilled; a transfer-done event means the second
// half is. If both flags are set only the half-transfer one is acted upon.
static void audioHwPrvDmaIrq(void* userData, uint32_t strmSta)
{
	(void)userData;

	if (strmSta & DMA_STRM_IRQ_HALF) {
		audioHwPrvReqData(false);
		return;
	}

	if (strmSta & DMA_STRM_IRQ_DONE)
		audioHwPrvReqData(true);
}

//Sets up sampled audio output: TIM8 channel 1 generates PWM and a circular DMA
// stream rewrites TIM8->CCR1 with the next duty value on every timer update.
//
//value in CCR is how many cycles out of AUDIO_MAX_VALID_VAL are to be high, so
// accepted values are 0..AUDIO_MAX_VALID_VAL
//
// readyForSamplesF   - callback later used to fetch source samples (stored, not called here)
// numSamplesPerBufP  - out: how many samples the client must supply per request
// nativeRateP        - out: sample rate the hardware runs at
// nativeStereoP      - out: always false, output is mono
//
//Returns true on success. On failure returns false, releases the DMA stream if
// one had been reserved, and leaves the three out-parameters untouched. The
// timer is configured and started before any DMA setup and is not stopped on failure.
bool audioOutHwInit(AudioOutHwReadyForMoreSamplesF readyForSamplesF, uint32_t *numSamplesPerBufP, enum AudioSampleRate* nativeRateP, bool *nativeStereoP)
{
	static const struct DmaStreamUserCfg audioDmaCfg = {
		.magic = CFG_STRUCT_MAGIX,
		.chan = 7,
		.circBuf = 1,
		.prio = 2,
		.perSz = __builtin_ctz(sizeof(*mDmaMem)),	//log2 of item size: 16-bit transfers
		.memSz = __builtin_ctz(sizeof(*mDmaMem)),
		.memIncr = true,
		.toMem = 0,									//memory -> peripheral
		.numItems = AUDIO_BUF_SZ,
	};
	
	
	//if you want to disable sampled sound out and use simple sound, return false here
	
	
	
	
	TIM8->CR1 = TIM_CR1_ARPE | TIM_CR1_URS; 			//upcount, edge mode, dma req only on overflow
	TIM8->CR2 = TIM_CR2_CCDS;							//dma req when update event occurs
	TIM8->SMCR = 0;
	TIM8->CCMR1 = TIM_CCMR1_OC1M_2 | TIM_CCMR1_OC1M_1;	//PWM mode with high for as many cycles as CCR says
	TIM8->CCER = TIM_CCER_CC1NE;
	
	TIM8->RCR = 0;										//do a dma request every 1 PWM cycle
	
	TIM8->CNT = 0;
	TIM8->ARR = AUDIO_MAX_VALID_VAL - 1;				//count is inclusive so range is 0..AUDIO_MAX_VALID_VAL - 1
	TIM8->CCR1 = AUDIO_MIDDLE_VALUE;					//duty cycle is not inclusive
	
	TIM8->CR1 |= TIM_CR1_CEN;							//turn timer on
	TIM8->BDTR = TIM_BDTR_MOE;							//pwm output on

	//the sample converter works on 8 samples per loop pass
	if ((AUDIO_BUF_SZ / 2) & 7)
		fatal("we REQUIRE hardware buffer to be a multiple of 8 samples in size for speed\n");
	
	mReadyForSamplesF = readyForSamplesF;
	
	mDmaStream = DmaLibStreamReserve(2, 1);
	if (!mDmaStream) {
		logw("Audio failed to grab out DMA stream\n");
		goto out_err;
	}
	
	if (!DmaLibStreamConfigure(mDmaStream, &audioDmaCfg)) {
		logw("Audio failed to configure our stream\n");
		goto out_free_dma;
	}

	if (!DmaLibStreamSetIrqHandler(mDmaStream, audioHwPrvDmaIrq, NULL)) {
		logw("Audio failed to configure irq handler\n");
		goto out_free_dma;
	}

	if (!DmaLibStreamSetPeriphAddr(mDmaStream, (uintptr_t)&TIM8->CCR1)) {
		logw("Audio failed to configure irq DADDR\n");
		goto out_free_dma;
	}

	//contents are left uninitialized here; audioOutHwSetState(true) fills the buffer with silence
	mDmaMem = kheapAllocEx(AUDIO_BUF_SZ * sizeof(*mDmaMem), MEM_USABLE_FOR_DMA);
	if (!mDmaMem) {
		logw("Audio failed to get buffer memory\n");
		goto out_free_dma;
	}
	
	*numSamplesPerBufP = AUDIO_BUF_SZ / 2;	//client writes half at a time (while dma plays the other half)
	#ifdef LOWER_QUALITY
		*nativeRateP = AudioRate24000;
	#else
		*nativeRateP = AudioRate48000;
	#endif
	*nativeStereoP = false;
	
	return true;

out_free_dma:
	if (!DmaLibStreamRelease(mDmaStream))
		logw("Audio failed to release our dma stream\n");

out_err:
	return false;
}

//Starts or stops sampled audio playback. Requests that do not change the
// current state are ignored.
//
//Turning off: stops timer DMA requests, parks the PWM duty at the middle
// (silent) level, and disables the DMA stream and its interrupts.
//Turning on: rewinds the DMA stream to the start of the buffer, fills the
// buffer with silence, then enables the stream, its half/done interrupts, and
// finally the timer's DMA requests.
//
//Failures of the DMA library calls are logged and otherwise ignored.
void audioOutHwSetState(bool on)
{
	uint32_t i;
	bool changed = false;
	irq_state_t sta;
	
	//compare and update the state with interrupts off so that the pair is atomic
	sta = irqsAllOff();
	if (!mCurOnState != !on)
		changed = true;
	mCurOnState = on;
	irqsRestoreState(sta);

	if (!changed)
		return;
	
	if (!on) {
		
		TIM8->DIER = 0;						//turn off dma reqs
		//the output is on channel 1, so CCR1 is the register to park. This
		// used to write CCR3, which belongs to a channel that is never configured
		TIM8->CCR1 = AUDIO_MIDDLE_VALUE;	//set to middle level

		if (!DmaLibStreamSetIrqState(mDmaStream, 0))
			logw("Audio failed to disable dma irqs\n");
		
		TIM8->CCR1 = AUDIO_MIDDLE_VALUE;	//set to middle level (in case we caught dma mid-transfer)
		
		if (!DmaLibStreamSetEnabled(mDmaStream, false))
			logw("Audio failed to disable dma\n");
	}
	else {
		
		//every turn-on we do this to make sure it starts from start of buffer
		if (!DmaLibStreamSetMemAddr(mDmaStream, 0, (uintptr_t)mDmaMem))
			logw("Audio failed to configure irq SADDR\n");
		
		//start with silence, let audio mixer do its thing for a bit
		for (i = 0; i < AUDIO_BUF_SZ; i++)
			mDmaMem[i] = AUDIO_MIDDLE_VALUE;
		
		if (!DmaLibStreamSetEnabled(mDmaStream, true))
			logw("Audio failed to enable dma\n");

		if (!DmaLibStreamSetIrqState(mDmaStream, DMA_STRM_IRQ_HALF | DMA_STRM_IRQ_DONE))
			logw("Audio failed to enable dma irqs\n");
		
		//enable DMA requests
		TIM8->DIER = TIM_DIER_UDE;
	}
}

//Audio input is not implemented on this board: logs that fact and reports
// failure. Neither argument is used.
bool audioInHwInit(AudioInHwSamplesReadyF acceptSamplesF, uint32_t *numSamplesPerBufP)
{
	(void)acceptSamplesF;
	(void)numSamplesPerBufP;

	logi("this board has no audio in for now\n");

	return false;
}

//Audio input is not implemented on this board, so every request fails.
bool audioInHwSetState(bool on, enum AudioSampleRate rate)
{
	(void)on;
	(void)rate;

	return false;
}

//Configures TIM8 channel 1 for plain PWM tone output (no DMA). The timer is
// not started here; audioOnlySimpleTone() sets the period and duty and starts it.
//this will only be called if sampled audio is not available, which you'll need to disable above to test
//Always returns true.
bool audioOnlySimpleOutInit(void)
{
	TIM8->CR1 = TIM_CR1_ARPE; 							//upcount, edge mode
	TIM8->CR2 = 0;
	TIM8->SMCR = 0;
	TIM8->CCMR1 = TIM_CCMR1_OC1M_2 | TIM_CCMR1_OC1M_1;	//PWM mode with high for as many cycles as CCR says
	TIM8->CCER = TIM_CCER_CC1NE;						//enable the complementary output of channel 1
	
	TIM8->BDTR = TIM_BDTR_MOE;							//pwm output on
	
	return true;
}

//Plays a continuous PWM tone of the given frequency and amplitude, or goes
// silent when either of them is zero.
//
// freq - requested tone frequency; 0 means "stop"
// amp  - amplitude; 0 means "stop". The duty cycle is amp / (2 * sndMaxAmp) of the period
//
//The timer is always stopped first and only restarted if a tone was requested.
void audioOnlySimpleTone(uint32_t freq, uint32_t amp)
{
	uint32_t prescale, ticks;

	TIM8->CR1 &=~ TIM_CR1_CEN;							//turn timer off

	if (!amp || !freq)
		return;

	//rough but fine: for low frequencies keep doubling both the working
	// frequency and the prescaler (which leaves the real output frequency
	// alone) until the period computed below is no more than roughly 65000 counts
	for (prescale = 1; freq <= CPU_CLOCK_RATE / 65000; prescale *= 2)
		freq *= 2;
	TIM8->PSC = prescale - 1;

	//period in timer counts, rounded to nearest and never zero
	ticks = (CPU_CLOCK_RATE + freq / 2) / freq;
	if (ticks == 0)
		ticks = 1;

	TIM8->CNT = 0;
	TIM8->ARR = ticks - 1;								//count is inclusive so range is 0..ticks - 1
	TIM8->CCR1 = amp * ticks / (2 * sndMaxAmp);			//duty cycle is not inclusive

	TIM8->CR1 |= TIM_CR1_CEN;							//turn timer on
}


//No ADC inputs are supported on this board yet, so every index reads as
// DAL_ADC_VAL_INVALID.
int32_t adcGetValue(enum AdcValueIdx which)
{
	(void)which;

	return DAL_ADC_VAL_INVALID;
}





//stm32f429 discovery notes:
//we do mono output (stereo is doable but we do mono)
// we do need a fast timer for fast sample rates, so we need one on APB2 (fast) or to set the TIMPRE bit.
// for self-dma it needs to be capable of dma triggering, and for output at least one channel must connect to an
// unused available GPIO. keep in mind that timers that are up-clockable using the TIMPRE bit still have a slower
// iface clock and thus are less preferable (though by a very very very small amount)
//
//always fast timers: 1, 8, 9, 10, 11
//timers we can clock up using TIMPRE: 2, 3, 4, 5, 6, 7, 12, 13, 14
//
//let's remove all timers that cannot do PWM out
//
//always fast timers: 1, 8, 9, 10, 11
//timers we can clock up using TIMPRE: 2, 3, 4, 5, 12, 13, 14
//
//let's remove all timers that cannot trigger dma
//
//always fast timers: 1, 8
//timers we can clock up using TIMPRE: 2, 3, 4, 5
//
//now we'll annotate the pins and dma channels each can use (~ = inverted)
// keep in mind some of these do not exist on our package (annotated as "*")
//
//	TIMER		OUT PINS							DMA CHs
//	1			~A7,~B0,~B1,~B13, ~E8,E9			2.1,2.2,2.3,2.4,2.6
//				~E10,E11,~E12,E13,E14
//
//	2			A0,A1,A2,A3,A5,A15,B3,B10,B11		1.1,1.5,1.6,1.7
//
//	3			A6,A7,B0,B1,B4,B5,C6,C7,C8,C9		1.2,1.4,1.5,1.7
//
//	4			B6,B7,B8,B9,D12,D13,D14,D15			1.0,1.3,1.7
//
//	5			A0,A1,A2,A3,*H10,*H11,*H12,*I0		1.0,1.1,1.2,1.3,1.4
//
//	8			~A5,~A7,~B0,~B1,~B14,~B15,C6,		2.2,2.3,2.4,2.7
//				C7,C8,C9,*~H13,*~H14,*~H15
//				*I2,*I5,*I6,*I7
//
//now let's remove all pins that we cannot use (do not exist or are used on the discovery board)
// any timer that runs out of pins or DMAs will be removed from the list
//
//	TIMER		OUT PINS							DMA CHs
//	1			~A7									2.1,2.2,2.3,2.4,2.6
//
//	2			A5,B3								1.1,1.5,1.6,1.7
//
//	3			A7,B4,C8							1.2,1.4,1.5,1.7
//
//	4			B7									1.0,1.3,1.7
//
//	8			~A5,~A7,C8							2.2,2.3,2.4,2.7
//A7 is a maybe (i2c ext rst - not used on the board, but brought out to a special i2c connector)
//
//there are now few enough options to enumerate the pins,channels,etc they'd use
//
//	TIMER		OUT PINS							DMA CHs
//	1			~A7									2.1,2.2,2.3,2.4,2.6
//
//	2			A5,B3								1.1,1.5,1.6,1.7
//
//	3			A7,B4,C8							1.2,1.4,1.5,1.7
//
//	4			B7									1.0,1.3,1.7
//
//	8			~A5,~A7,C8							2.2,2.3,2.4,2.7
//
//	BUT, remember that SD card uses some pins: C8,C9,C10,C11,C12,D2,G2
//			also, IR/UART uses pins: A9, B7
//
//	so remove them:
//
//	TIMER		OUT PINS							DMA CHs
//	1			~A7									2.1,2.2,2.3,2.4,2.6
//
//	2			A5,B3								1.1,1.5,1.6,1.7
//
//	3			A7,B4								1.2,1.4,1.5,1.7
//
//	8			~A5,~A7								2.2,2.3,2.4,2.7
//
//Let's do mono and use TIM8 channel 1_inverted on pin A5 (which we'll configure to output the non-inverted waveform) and DMA 2 stream 1 (channel 7)
