#include "stm32h7b0xx.h"
#include "printf.h"
#include "msio.h"
#include "cpu.h"


//number of receive buffers; completed TPCs are queued in these until msioReleaseBuf() is called
#define NUM_RX_BUFS			3

//receive buffers: msioPrvConfigForRx() points DMA1_Stream4 at mRxBuffers[mRxBufIdxW]
static volatile uint8_t mRxBuffers[NUM_RX_BUFS][512] __attribute__((section(".uncached")));
static volatile uint8_t mRegs[MS_NUM_REGS + 4] __attribute__((section(".uncached")));	//reg window is at the end
//buffers handed out to callers by msioGetOutBuffer()
static uint8_t mOutBuffer[MSIO_OUT_BUFFER_CT][MSIO_OUT_BUFER_SZ] __attribute__((section(".uncached")));

//mRxBufIdxW: next buffer to receive into (advanced in msioPrvTpcSuccess())
//mRxBufIdxR: oldest unreleased buffer (advanced in msioReleaseBuf())
//mRxBufNumUsed: number of buffers filled and not yet released
//mDoneTpc[i]: the TPC that was recorded for buffer i
static volatile uint8_t mRxBufIdxW, mRxBufIdxR, mRxBufNumUsed, mDoneTpc[NUM_RX_BUFS];
//mBytesRxd[i]: byte count recorded for buffer i by the write-TPC path of EXTI0_IRQHandler()
static volatile uint16_t mBytesRxd[NUM_RX_BUFS];

//the 4 bytes after the registers, in order: read start, read length, write start, write length
#define mRegWindow			(mRegs + MS_NUM_REGS)

static volatile uint8_t mTpcInProgress, mTpc;			//0x00 while none, 0xff if invalid
//last category value accepted via msioCategoryChanged() (see msioPoll())
static uint8_t mCurCategory;


//describes a block of data queued to be sent to the host by a data-read TPC
struct ReadDataInfo {
	const void *data;	//NULL when nothing is queued
	uint32_t len;		//512 is really max, but having this as u32 makes asm easier
};

//data queued for MS_RD_LDATA and MS_RD_SDATA respectively; consumed in EXTI0_IRQHandler()
static struct ReadDataInfo mLongReadDataInfo;
static struct ReadDataInfo mShortReadDataInfo;



/*
 * Debug trace helper: emit one byte on UART4.
 * Busy-waits until USART_ISR_TXFT is set in UART4->ISR, then writes the byte to UART4->TDR.
 */
void __attribute__((section(".ramcode"))) dtx(uint_fast8_t val)
{
	for (;;) {
		if (UART4->ISR & USART_ISR_TXFT)
			break;
	}
	
	UART4->TDR = val;
}

/*
 * Put all SPI/DMA/CRC/EXTI hardware back into the "waiting for the next TPC" state.
 * Called from msioEnable() and, via msioPrvTpcSuccess(), after every TPC that finished or failed.
 * Also clears mTpcInProgress and mTpc.
 */
static void __attribute__((section(".ramcode"))) msioPrvConfigForRx(void)
{
	//we often do this after a reg/data read and before userspace has a chance to cancel irqs so we might spuriously signal an irq when it should be cleared
	//this is considered ok (for now?)
	
	dtx('c');
	//stop every DMA stream we use and clear all of DMA1's flags
	DMA1_Stream0->CR = 0;
	DMA1_Stream1->CR = 0;
	DMA1_Stream4->CR = 0;
	DMA1_Stream2->CR = 0;
	DMA1_Stream3->CR = 0;
	DMA1->LIFCR = -1;
	DMA1->HIFCR = -1;
	
	//stream 0: one transfer; msioInit() set it up to copy mSpiEnableCR1 into SPI1->CR1
	while (DMA1_Stream0->CR & DMA_SxCR_EN);
	DMA1_Stream0->NDTR = 1;
	DMA1_Stream0->CR = DMA_SxCR_PL | DMA_SxCR_MSIZE_1 | DMA_SxCR_PSIZE_1 | DMA_SxCR_DIR_0 | DMA_SxCR_EN;
	
	//stream 1: one transfer; msioInit() set it up to copy mGpiocModerVal into GPIOC->MODER
	while (DMA1_Stream1->CR & DMA_SxCR_EN);
	DMA1_Stream1->NDTR = 1;
	DMA1_Stream1->CR = DMA_SxCR_PL | DMA_SxCR_MSIZE_1 | DMA_SxCR_PSIZE_1 | DMA_SxCR_DIR_0 | DMA_SxCR_EN;
	
	//SPI1 is left disabled (stream 0 enables it); DSIZE of 3 presumably selects 4-bit frames, matching the nibble-wise TPC reads in EXTI0_IRQHandler()
	SPI1->CR1 = 0;
	SPI1->IFCR = -1;
	SPI1->CR2 = 0;
	SPI1->CFG1 = 3 << SPI_CFG1_DSIZE_Pos;
	
	//SPI2: disabled, DSIZE of 7, RX via DMA (stream 4, into the current rx buffer)
	SPI2->CR1 = 0;
	SPI2->CR2 = 0xfffefffe;
	SPI2->CFG1 = (7 << SPI_CFG1_DSIZE_Pos) | SPI_CFG1_RXDMAEN;	
	SPI2->IFCR = -1;
	
	//SPI3: same setup as SPI2, RX via DMA (stream 2, into the CRC unit)
	SPI3->CR1 = 0;
	SPI3->CR2 = 0xfffefffe;
	SPI3->CFG1 = (7 << SPI_CFG1_DSIZE_Pos) | SPI_CFG1_RXDMAEN;	
	SPI3->IFCR = -1;
	
	//stream 4: up to 512 transfers into the buffer at the current write index
	while (DMA1_Stream4->CR & DMA_SxCR_EN);
	DMA1_Stream4->NDTR = 512;
	DMA1_Stream4->M0AR = (uintptr_t)mRxBuffers[mRxBufIdxW];
	DMA1_Stream4->CR = DMA_SxCR_PL | DMA_SxCR_MINC | DMA_SxCR_EN;
	
	//stream 2: up to 514 transfers, all written to CRC->DR (no memory increment)
	while (DMA1_Stream2->CR & DMA_SxCR_EN);
	DMA1_Stream2->NDTR = 514;
	DMA1_Stream2->M0AR = (uintptr_t)&CRC->DR;
	DMA1_Stream2->CR = DMA_SxCR_PL | DMA_SxCR_EN;
	
	//restart the CRC calculation
	CRC->CR = CRC_CR_POLYSIZE_0 | CRC_CR_RESET;
	
	//stream 3 (SPI1 TX, started by EXTI0_IRQHandler() for reads) must have really stopped
	while (DMA1_Stream3->CR & DMA_SxCR_EN);
	
	//clear any pending EXTI0 and leave EXTI0 as the only unmasked line (this also masks EXTI10)
	EXTI->PR1 = 1;
	(void)EXTI->PR1;
	EXTI->IMR1 = 1;
	
	GPIOC->MODER = (GPIOC->MODER &~ (3 << (2 * 3))) | (1 << (2 * 3));	//int signal as needed
	
	mTpcInProgress = 0;
	mTpc = 0;
	
	dtx('d');
}

/* make sure this does not get inlined into fastcode - we dont have the space */
/*
 * Finish handling of a TPC and re-arm the hardware for the next one.
 * success: true  - record mTpc for the current rx buffer, advance the write index, count the
 *                  buffer as used and pend MSIO_IRQn so that the consumer gets to run
 *          false - nothing is recorded
 * In both cases msioPrvConfigForRx() is called at the end.
 */
static void __attribute__((section(".ramcode"))) msioPrvTpcSuccess(bool success)
{
	dtx(0xab);
	dtx(success ? 0xaa: 0xff);
	if (success) {
		
		dtx(mRxBufIdxW);
		//remember which TPC filled this buffer, then move on to the next buffer (with wraparound)
		mDoneTpc[mRxBufIdxW] = mTpc;
		if (++mRxBufIdxW == NUM_RX_BUFS)
			mRxBufIdxW = 0;
		//the count is compared before being incremented: this fires when all buffers were already in use
		if (mRxBufNumUsed++ == NUM_RX_BUFS)
			fatal("too many buffers unserviced\n");
		
		dtx(mRxBufNumUsed);
		asm volatile("dsb sy":::"memory");	//DO. NOT. ASK.
		asm volatile("isb sy":::"memory");
		
		//tell users
		NVIC_SetPendingIRQ(MSIO_IRQn);
	}
	msioPrvConfigForRx();
}

/*
 * Drive the interrupt signal on GPIOC pin 3 through the set/reset register:
 * requestingInt true sets the pin, false resets it.
 */
void __attribute__((used, section(".ramcode"))) msioSignalIrq(bool requestingInt)		//only do this while not in command
{
	if (requestingInt)
		GPIOC->BSRR = 1 << 3;			//set bit for pin 3
	else
		GPIOC->BSRR = 1 << (16 + 3);	//reset bit for pin 3
}

/*
 * EXTI lines 10..15 interrupt; only line 10 is handled.
 * Line 10 is unmasked by EXTI0_IRQHandler() once it has started sending data for a read TPC, and
 * msioInit() configures it to trigger on a falling edge. When it fires, the SPI units are shut
 * down, the TPC that was in progress is committed to mTpc and reported as successful (which also
 * re-arms reception).
 * a..d are unused; onSp is only used by the disabled debug code at the end.
 */
void __attribute__((used, section(".ramcode"))) EXTI15_10_IRQHandler(int a, int b, int c, int d, uint32_t onSp)
{
	//do not ask
	if (EXTI->PR1 & (1 << 10)) {
		
		//mask line 10 again and clear its pending bit (the read-back makes sure the write landed)
		EXTI->IMR1 &=~ (1 << 10);
		EXTI->PR1 = 1 << 10;
		(void)EXTI->PR1;
	
		dtx('j');
		SPI2->CR1 = 0;
		SPI3->CR1 = 0;
		SPI1->CR1 = 0;
		mTpc = mTpcInProgress;
		dtx('k');
		dtx(DMA1_Stream3->NDTR);
		
		msioPrvTpcSuccess(true);
		
		//disabled debug aid: dumps two words of what is presumably the stacked exception frame (TODO confirm the &onSp trick)
		if(0){
			uint32_t *v;
			
			//0xFFFFFFFD is the exception return value for "return to thread mode using PSP"
			if (__builtin_return_address(0) == (void*)0xFFFFFFFD)
				asm("mrs %0, PSP":"=r"(v));
			else
				v = &onSp;
			
			dtx('$');
			dtx(v[7]);
			dtx(v[6] >> 24);
			dtx(v[6] >> 16);
			dtx(v[6] >> 8);
			dtx(v[6] >> 0);
		}
	}
}

/*
void __attribute__((used, naked, section(".ramcode"))) EXTI0_IRQHandler_old(void)
{
	#define STRINGIFY2(x)		#x
	#define STRINGIFY(x)		STRINGIFY2(x)
	
	asm volatile(
	//we arrive as BS went up, by now SPI1 might have received the first nibble (TPC), or it might now so we wait for it
	
		"	push   {r4, r5}																		\n\t"
	
	//SPI2->CR1 = SPI_CR1_SPE
	//SPI3->CR1 = SPI_CR1_SPE
	//SPI2->TXDR = 0xAAAAAAAA
		"	movs   r0, %[val_SPI_CR1_SPE]														\n\t"
		"	movw   r1, #0xffff & %[val_SPI2_BASE]												\n\t"
		"	mov    r2, #0xaaaaaaaa																\n\t"
		"	movt   r1, #0 + (%[val_SPI2_BASE]) >> 16											\n\t"
		"	str    r0, [r1, #0 + %[val_OFST_SPI_CR1]]											\n\t"	
		"	str    r0, [r1, #0 + %[val_OFST_SPI_CR1] + %[val_SPI3_BASE] - %[val_SPI2_BASE]]		\n\t"
		"	str    r2, [r1, #0 + %[val_OFST_SPI_TXDR]]											\n\t"
	
	//while (!(SPI1->SR & SPI_SR_RXP));		//wait for a TPC type (first nibble)
		"	movw   r0, #0xffff & %[val_SPI1_BASE]												\n\t"
		"	movt   r0, #0 + (%[val_SPI1_BASE]) >> 16											\n\t"
		"1:																						\n\t"
		"	ldr    r2, [r0, #0 + %[val_OFST_SPI_SR]]											\n\t"
		"	lsrs   r2, r2, #1 + %[val_SPI_SR_RXP_Pos]											\n\t"
		"	bcc    1b																			\n\t"
	
	//tpc = *(volatile uint8_t*)&SPI1->RXDR;
		"	ldrb   r5, [r0, #0 + %[val_OFST_SPI_RXDR]]											\n\t"
		
	//if (tpc & 0x08) -> write_tpc
		"	lsls   r2, r5, #29																	\n\t"
		"	bcs    is_write_tpc																	\n\t"
		"is_read_tpc:																			\n\t"
		
	//switch (tpc) { //7 cases possible
		"	tbb    pc, r5																		\n\t"
		"1:																						\n\t"
		"	.byte  (tpc_rd_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_rd_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_0x02_rd_ldata - 1b) / 2													\n\t"
		"	.byte  (tpc_0x03_rd_sdata - 1b) / 2													\n\t"
		"	.byte  (tpc_0x04_rd_regs - 1b) / 2													\n\t"
		"	.byte  (tpc_rd_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_rd_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_rd_int - 1b) / 2														\n\t"
		
	//case MS_RD_LDATA (0x02):
		"tpc_0x02_rd_ldata:																		\n\t"
		"	ldr    r2, =mLongReadDataInfo														\n\t"
		"	ldmia  r2, {r2, r3}																	\n\t"
		"	cbz    r2, tpc_rd_invalid															\n\t"
		"	b      tpc_rd_do																	\n\t"
	
	//case MS_RD_SDATA (0x03):
		"tpc_0x02_rd_ldata:																		\n\t"
		"	ldr    r2, =mShortReadDataInfo														\n\t"
		"	ldmia  r2, {r2, r3}																	\n\t"
		"	cbz    r2, tpc_rd_invalid															\n\t"
		"	b      tpc_rd_do																	\n\t"
	
	//case MS_RD_REG (0x04):
		"	ldr    r2, =mRegs																	\n\t"
		"	ldrb   r3, [r2, #1 + %[val_MS_NUM_REGS]]											\n\t"
		"	ldrb   r4, [r2, #0 + %[val_MS_NUM_REGS]]											\n\t"
		"	add    r2, r4																		\n\t"
		"	b      tpc_rd_do																	\n\t"
	
	//case MS_GET_INT (0x07):
		"	ldr    r2, =mRegs																	\n\t"
		"	movs   r3, #1																		\n\t"
		"	adds   r2, #0 + %[val_MS_REG_NO_INT]												\n\t"
		"	b      tpc_rd_do																	\n\t"
	
	//tpc_rd_invalid:	//r2 is currently 0, r1 is "SPI2"
	//SPI2->CR1 = 0;	//XXX: was SPI1, but should be spi2
		"	str    r2, [r1, #0 + %[val_OFST_SPI_CR1]]											\n\t"	
		"	b      out_resetup_cur_failed														\n\t"
	
	//tpc_rd_do:	//r0 = SPI1, r1 = SPI2, r2 = rdPtr, r3 = rdLen
		"tpc_rd_do:																				\n\t"
	//SPI2->CFG1 = (3 << SPI_CFG1_DSIZE_Pos);
		"	movs   r4, #3 << %[val_SPI_CFG1_DSIZE_Pos]											\n\t"
		"	str    r4, [r1, #0 + %[val_OFST_SPI_CFG1]											\n\t"
	//SPI2->CR1 = SPI_CR1_SPE | SPI_CR1_HDDIR;	//send ACK! (after this we have enough time to finish the setup of the rest of this)
		"	movs   r4, #0 + %[val_SPI_CR1_SPE] + %[val_SPI_CR1_HDDIR]							\n\t"
		"	str    r4, [r1, #0 + %[val_OFST_SPI_CR1]											\n\t"
	//	while (!(SPI1->SR & SPI_SR_RXP));		//finally RX second nibble of the TPC (check value)
		"1:																						\n\t"
		"	ldr    r4, [r0, #0 + %[val_OFST_SPI_SR]]											\n\t"
		"	lsrs   r4, r4, #1 + %[val_SPI_SR_RXP_Pos]											\n\t"
		"	bcc    1b																			\n\t"
	//tpcCheck = *(volatile uint8_t*)&SPI1->RXDR;
		"	ldrb   r4, [r0, #0 + %[val_OFST_SPI_RXDR]]											\n\t"
	//XXX: we used to clear SPI1->CR1 but i do not see why we should. Let's not
	//if (tpcCheck != (tpc ^ 0x0f))
		"	eors   r4, r5																		\n\t"
		"	subs   r4, #0x0f																	\n\t"
		"	itt    ne																			\n\t"
	//SPI2->CR1 = 0;
		"	strne  r4, [r1, #0 + %[val_OFST_SPI_CR1]											\n\t"
		"	bne    out_resetup_cur_failed														\n\t"
	//we are here because TPC check value matches. we are probably already or soon will be sending the ACK. we need to enqueue the data quickly

	//DMA1_Stream3->NDTR = readLen;
	//DMA1_Stream3->M0AR = (uintptr_t)readData;
	//DMA1_Stream3->CR = DMA_SxCR_PL | DMA_SxCR_MINC | DMA_SxCR_DIR_0 | DMA_SxCR_EN | DMA_SxCR_TCIE;
		"	movw   r1, #0xffff & %[val_DMA1_Stream3_BASE]										\n\t"
		"	movt   r1, #0 + (%[val_DMA1_Stream3_BASE]) >> 16									\n\t"
		"	str    r3, [r1, #0 + %[val_OFST_DMA_STRM_NDTR]]										\n\t"
		"	str    r2, [r1, #0 + %[val_OFST_DMA_STRM_M0AR]]										\n\t"
		"	movw   r3, #0xffff & (%[val_RX_DMA_CR])												\n\t"
		"	movt   r3, #0 + (%[val_RX_DMA_CR]) >> 16											\n\t"
		"	str    r3, [r1, #0 + %[val_OFST_DMA_STRM_CR]]										\n\t"
	//SPI1->CR2 = readLen;
	//SPI1->CFG1 = (7 << SPI_CFG1_DSIZE_Pos) | (15 << SPI_CFG1_CRCSIZE_Pos) | SPI_CFG1_CRCEN | SPI_CFG1_TXDMAEN;
	//SPI1->CR1 = SPI_CR1_HDDIR | SPI_CR1_SPE;
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CR2]]											\n\t"
		"	movw   r1, #0xffff & %[val_RX_SPI_CFG1]												\n\t"
		"	movt   r1, #0 + (%[val_RX_SPI_CFG1]) >> 16											\n\t"
		"	movs   r2, #0 + %[val_SPI_CR1_SPE] + %[val_SPI_CR1_HDDIR]							\n\t"
		"	str    r1, [r0, #0 + %[val_OFST_SPI_CFG1]]											\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CR1]]											\n\t"
	//EXTI->IMR1 = 1 << 10;
	//EXTI->PR1 = 1 | (1 << 10);
	//(void)EXTI->PR1;
		"	mov    r1, #1 << 10																	\n\t"
		"	mov    r2, #0 + %[val_EXTI_BASE]													\n\t"
		"	adds   r3, r1, #1																	\n\t"
		"	str    r1, [r2, #0 + %[val_OFST_EXTI_IMR1]]											\n\t"
		"	str    r3, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
		"	ldr    r3, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
	//mTpcInProgress = tpc;
		"	ldr    r0, =mTpcInProgress															\n\t"
		"	strb   r5, [r0]																		\n\t"
		"	pop    {r4, r5}																		\n\t"
		"	bx     lr																			\n\t"
	
	//out_resetup_cur_failed:
		"out_resetup_cur_failed:																\n\t"
	//EXTI->IMR1 = 0;
	//EXTI->PR1 = 1;
	//(void)EXTI->PR1;
		"	mov    r2, #0 + %[val_EXTI_BASE]													\n\t"
		"	movs   r0, #0																		\n\t"
		"	movs   r1, #1																		\n\t"
		"	str    r0, [r2, #0 + %[val_OFST_EXTI_IMR1]]											\n\t"
		"	str    r1, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
		"	ldr    r1, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
		"	pop    {r4, r5}																		\n\t"
	//msioPrvTpcSuccess(false)
		"	movs   r0, #0																		\n\t"
		"	b      msioPrvTpcSuccess															\n\t"
	
	//is_write_tpc:
	//r2 = tpc & 7
		"is_write_tpc:																			\n\t"
	//	while (!(SPI1->SR & SPI_SR_RXP));		//RX second nibble of the TPC (check value)
		"1:																						\n\t"
		"	ldr    r4, [r0, #0 + %[val_OFST_SPI_SR]]											\n\t"
		"	lsrs   r4, r4, #1 + %[val_SPI_SR_RXP_Pos]											\n\t"
		"	bcc    1b																			\n\t"
	//tpcCheck = *(volatile uint8_t*)&SPI1->RXDR;
		"	ldrb   r4, [r0, #0 + %[val_OFST_SPI_RXDR]]											\n\t"
	//SPI1->CR1 = 0;
		"	movs   r2, #0																		\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CR1]]											\n\t"
	//if (tpcCheck != (tpc ^ 0x0f))
		"	eors   r4, r5																		\n\t"
		"	subs   r4, #0x0f																	\n\t"
		"	bne    out_resetup_cur_failed														\n\t"
	//for various reasons we need to go to 16 bit mode here...do not ask...
	//SPI1->CFG1 = (15 << SPI_CFG1_DSIZE_Pos);
	//SPI1->CR1 = SPI_CR1_HDDIR | SPI_CR1_SPE;
		"	movs   r2, #0 + 15 << %[val_SPI_CFG1_DSIZE_Pos]										\n\t"
		"	movs   r3, #0 + %[val_SPI_CR1_SPE] + %[val_SPI_CR1_HDDIR]							\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CFG1]]											\n\t"
		"	str    r3, [r0, #0 + %[val_OFST_SPI_CR1]]											\n\t"
	// *(volatile uint16_t*)&SPI1->TXDR = 0x0055;	//give us max time to pause, we can only write one thing into the buffer at this point
	// *(volatile uint16_t*)&SPI1->TXDR = 0x5555;	//we can now write another
		"	movs   r2, #0x55																	\n\t"
		"	mov    r3, #0x5555																	\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_TXDR]]											\n\t"
		"	str    r3, [r0, #0 + %[val_OFST_SPI_TXDR]]											\n\t"
	//now we wait for BS2 to end, we're RXing using DMA and CRCing using DMA as well
	//while (!(GPIOC->IDR & (1 << 10)));
		"	movw   r2, #0xffff & %[val_GPIOC_BASE]												\n\t"
		"	movt   r2, #0 + (%[val_GPIOC_BASE]) >> 16											\n\t"
		"1:																						\n\t"
		"	ldr    r3, [r2, #0 + %[val_OFST_GPIO_IDR]]											\n\t"
		"	lsrs   r3, #1 + 10																	\n\t"
		"	bcc    1b																			\n\t"
	//crc is ready. we should check it, and decide if we want to ACK or NAK
	//we also need to disable SPI2 and 3 quickly. do that first
	//SPI2->CR1 = 0;
	//SPI3->CR1 = 0;
		"	movs   r2, #0																		\n\t"
		"	str    r2, [r1, #0 + %[val_OFST_SPI_CR1]]											\n\t"
		"	str    r2, [r1, #0 + %[val_OFST_SPI_CR1] + %[val_SPI3_BASE] - %[val_SPI2_BASE]]		\n\t"
	//now check crc
		"	movw   r3, #0xffff & %[val_CRC_BASE]												\n\t"
		"	movt   r3, #0 + (%[val_CRC_BASE]) >> 16												\n\t"
		"	ldr    r1, [r3, #0 + %[val_OFST_CRC_DR]]											\n\t"
		"	cbz    r1, wr_tpc_crc_pass															\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CR1]]											\n\t"	
		"	b      out_resetup_cur_failed														\n\t"
	
	//data crc passes
	//nRxed = 514 - DMA1_Stream2->NDTR;
	//mTpc = tpc;
		"wr_tpc_crc_pass:																		\n\t"
		"	movw   r2, #0xffff & %[val_DMA1_Stream2_BASE]										\n\t"
		"	movt   r2, #0 + (%[val_DMA1_Stream2_BASE]) >> 16									\n\t"
		"	ldr    r4, [r2, #0 + %[val_OFST_DMA_STRM_NDTR]]										\n\t"
		"	ldr    r3, =mTpc																	\n\t"
		"	rsb    r4, r4, #514																	\n\t"
		"	strb   r5, [r3]																		\n\t"
	
	//wait for ack/nak to be accepted
	//while (GPIOC->IDR & (1 << 10));
		"	movw   r2, #0xffff & %[val_GPIOC_BASE]												\n\t"
		"	movt   r2, #0 + (%[val_GPIOC_BASE]) >> 16											\n\t"
		"1:																						\n\t"
		"	ldr    r3, [r2, #0 + %[val_OFST_GPIO_IDR]]											\n\t"
		"	lsrs   r3, #1 + 10																	\n\t"
		"	bcs    1b																			\n\t"
	//SPI1->CR1 = 0;
		"	movs   r2, #0																		\n\t"
		"	str    r2, [r0, #0 + %[val_OFST_SPI_CR1]]											\n\t"
	
	//mBytesRxd[mRxBufIdxW] = nRxed;
	//rxb = mRxBuffers[mRxBufIdxW];
		"	ldr    r2, =mRxBufIdxW																\n\t"
		"	ldr    r3, =mBytesRxd																\n\t"
		"	strh   r4, [r3, r2, lsl #1]															\n\t"
		"	ldr    r3, =mRxBuffers																\n\t"
		"	add    r2, r3, r2, lsl #9															\n\t"
	//switch (tpc)
		"	tbb    pc, r5																		\n\t"
		"1:																						\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_invalid - 1b) / 2													\n\t"
		"	.byte  (tpc_wr_set_rw_reg_adrs - 1b) / 2											\n\t"
		"	.byte  (tpc_wr_done - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_done - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_reg - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_done - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_done - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_cmd - 1b) / 2														\n\t"
		"	.byte  (tpc_wr_done - 1b) / 2														\n\t"
	//tpc_wr_set_rw_reg_adrs:
		"tpc_wr_set_rw_reg_adrs:																\n\t"
		"	cmp    r4, #6																		\n\t"
		"	ittt   ge																			\n\t"
		"	ldrge  r3, [r2]																		\n\t"
		"	ldrge  r2, =mRegs																	\n\t"
		"	strge  r3, [r2, #0 + %[val_MS_NUM_REGS]]											\n\t"
		"	b      tpc_wr_done																	\n\t"
	//tpc_wr_reg:
		"tpc_wr_reg:																			\n\t"
		"	ldr    r3, =mRegs																	\n\t"
		"	ldrb   r1, [r3, #3 + %[val_MS_NUM_REGS]]											\n\t"
		"	ldrb   r0, [r3, #2 + %[val_MS_NUM_REGS]]											\n\t"
		"	subs   r4, #2																		\n\t"
		"	ble    tpc_wr_done																	\n\t"
		"	cmp    r4, r1																		\n\t"
		"	bne    tpc_wr_done																	\n\t"
		"	adds   r3, r0																		\n\t"
		"1:																						\n\t"
		"	subs   r1, #1																		\n\t"
		"	ldrb   [r2, r1]																		\n\t"
		"	strb   [r3, r1]																		\n\t"
		"	bne    1b																			\n\t"
		"	b      tpc_wr_done																	\n\t"
	//tpc_wr_cmd
		"tpc_wr_cmd:																			\n\t"
		"	ldr    r3, =mRegs																	\n\t"
		"	cmp    r4, #3																		\n\t"
		"	ldrb   r1, [r2]																		\n\t"
		"	itte   eq																			\n\t"
		"	ldrbeq r0, [r3, #0 + %[val_MS_REG_NO_INT]]											\n\t"
		"	biceq  r0, #0 + %[val_INT_VAL_CMD_DONE]												\n\t"
		"	movne  r0, #0 + " STRINGIFY(INT_VAL_INVAL_CMD) 	"									\n\t"
		"	strb   r0, [r3, #0 + %[val_MS_REG_NO_INT]]											\n\t"
		"	b      tpc_wr_done																	\n\t"
	//tpc_wr_invalid
		"tpc_wr_invalid:																		\n\t"
		"	b      out_resetup_cur_failed														\n\t"
	//tpc_wr_done:
		"tpc_wr_done:																			\n\t"
	//EXTI->IMR1 = 0;
	//EXTI->PR1 = 1;
	//(void)EXTI->PR1;
		"	mov    r2, #0 + %[val_EXTI_BASE]													\n\t"
		"	movs   r0, #0																		\n\t"
		"	movs   r1, #1																		\n\t"
		"	str    r0, [r2, #0 + %[val_OFST_EXTI_IMR1]]											\n\t"
		"	str    r1, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
		"	ldr    r1, [r2, #0 + %[val_OFST_EXTI_PR1]]											\n\t"
		"	pop    {r4, r5}																		\n\t"
	//msioPrvConfigForRx()
		"	movs   r0, #1																		\n\t"
		"	b      msioPrvTpcSuccess															\n\t"
	
		".ltorg																					\n\t"
	:
	:
		//bases
		[val_CRC_BASE] "i" (CRC_BASE),
		[val_DMA1_Stream2_BASE] "i" (DMA1_Stream2_BASE),
		[val_DMA1_Stream3_BASE] "i" (DMA1_Stream3_BASE),
		[val_EXTI_BASE] "i" (EXTI_BASE),
		[val_GPIOC_BASE] "i" (GPIOC_BASE),
		[val_SPI1_BASE] "i" (SPI1_BASE),
		[val_SPI2_BASE] "n" (SPI2_BASE),
		[val_SPI3_BASE] "i" (SPI3_BASE),
		
		//values
		[val_SPI_CR1_SPE] "i" (SPI_CR1_SPE),
		[val_INT_VAL_CMD_DONE] "i" (INT_VAL_CMD_DONE),
		[val_MS_NUM_REGS] "i" (MS_NUM_REGS),
		[val_MS_REG_NO_INT] "i" (MS_REG_NO_INT),
		[val_RX_DMA_CR] "i" (DMA_SxCR_PL | DMA_SxCR_MINC | DMA_SxCR_DIR_0 | DMA_SxCR_EN | DMA_SxCR_TCIE),
		[val_RX_SPI_CFG1] "i" ((7 << SPI_CFG1_DSIZE_Pos) | (15 << SPI_CFG1_CRCSIZE_Pos) | SPI_CFG1_CRCEN | SPI_CFG1_TXDMAEN),
		[val_SPI_CFG1_DSIZE_Pos] "i" (SPI_CFG1_DSIZE_Pos),
		[val_SPI_CR1_HDDIR] "i" (SPI_CR1_HDDIR),
		[val_SPI_SR_RXP_Pos] "i" (SPI_SR_RXP_Pos),
		
		//offsets
		[val_OFST_CRC_DR] "i" (offsetof(CRC_TypeDef, DR)),
		[val_OFST_GPIO_IDR] "i" (offsetof(GPIO_TypeDef, IDR)),
		[val_OFST_DMA_STRM_CR] "i" (offsetof(DMA_Stream_TypeDef, CR)),
		[val_OFST_DMA_STRM_M0AR] "i" (offsetof(DMA_Stream_TypeDef, M0AR)),
		[val_OFST_DMA_STRM_NDTR] "i" (offsetof(DMA_Stream_TypeDef, NDTR)),
		[val_OFST_EXTI_IMR1] "i" (offsetof(EXTI_TypeDef, IMR1)),
		[val_OFST_EXTI_PR1] "i" (offsetof(EXTI_TypeDef, PR1)),
		[val_OFST_SPI_CFG1] "i" (offsetof(SPI_TypeDef, CFG1)),
		[val_OFST_SPI_CR1] "i" (offsetof(SPI_TypeDef, CR1)),
		[val_OFST_SPI_CR2] "i" (offsetof(SPI_TypeDef, CR2)),
		[val_OFST_SPI_RXDR] "i" (offsetof(SPI_TypeDef, RXDR)),
		[val_OFST_SPI_SR] "i" (offsetof(SPI_TypeDef, SR)),
		[val_OFST_SPI_TXDR] "i" (offsetof(SPI_TypeDef, TXDR))
		
		
	:"memory","cc");
}
*/

void __attribute__((used, section(".ramcode"))) EXTI0_IRQHandler(int a, int b, int c, int d, uint32_t onSp)
{
	//we get here on BS going up. by the time we are running here, BS.d2 has already gone up, spi is recieving the TPC, if all is well
	bool doResetup = false, success = false;
	uint_fast8_t tpc, tpcCheck;
	uint32_t sanityCheck;
	
	//dtx('n');
	
	SPI2->CR1 = SPI_CR1_SPE;	//early enough to do this
	SPI3->CR1 = SPI_CR1_SPE;	//early enough to do this
	
	SPI2->TXDR = 0xAAAAAAAA;		//get it ready in case of read
	while (!(SPI1->SR & SPI_SR_RXP));		//wait for a TPC type (first nibble)

	//TPC RXed
	tpc = *(volatile uint8_t*)&SPI1->RXDR;
	
	dtx(tpc);
	
	if (tpc & 0x08) {		//write tpc
		
		volatile uint8_t *rxb;
		uint32_t nRxed;
		
		dtx(0x33);
		
		while (!(SPI1->SR & SPI_SR_RXP));
		tpcCheck = *(volatile uint8_t*)&SPI1->RXDR;
		dtx(tpcCheck);
		if (tpcCheck != (tpc ^ 0x0f)) {
			doResetup = true;
			goto tpc_check_fail;
		}
		
		//for various reasons we need to go to 16 bit mode here...do not ask...
		SPI1->CR1 = 0;
		SPI1->CFG1 = (15 << SPI_CFG1_DSIZE_Pos);
		SPI1->CR1 = SPI_CR1_HDDIR | SPI_CR1_SPE;
		*(volatile uint16_t*)&SPI1->TXDR = 0x0055;	//give us max time to pause, we can only write one thing into the buffer at this point
		*(volatile uint16_t*)&SPI1->TXDR = 0x5555;	//we can now write another
		
		dtx(0x34);
		while (!(GPIOC->IDR & (1 << 10)));
		dtx(0x35);
		SPI2->CR1 = 0;
		SPI3->CR1 = 0;
		
		//CRC is already READY (thanks to spi3)
		//having another spi unit available would help as we could DMA from there to crc directly and thus not need to wait here...
		if (CRC->DR) {
			
			//fail - reconfigure all
			SPI1->CR1 = 0;
			dtx(0xde);
			doResetup = true;
			goto tpc_check_fail;	
		}
		
		nRxed = 514 - DMA1_Stream2->NDTR;
		dtx(nRxed);
		
		mTpc = tpc;
		
		//wait for ack/nak to be accepted
		while (GPIOC->IDR & (1 << 10));
		SPI1->CR1 &=~ SPI_CR1_HDDIR;
		dtx(0x56);
		
		mBytesRxd[mRxBufIdxW] = nRxed;
		rxb = mRxBuffers[mRxBufIdxW];
		
		switch (tpc) {
			
			case MS_SET_RW_REG_ADRS:
				if (nRxed >= 6) {
					
					uint_fast8_t rS = rxb[0], rL = rxb[1], wS = rxb[2], wL = rxb[3];
					
					if (rL && wL && rS < MS_NUM_REGS && MS_NUM_REGS - rS >= rL && wS < MS_NUM_REGS && MS_NUM_REGS - wS >= wL) {
						
						mRegWindow[0] = rS;
						mRegWindow[1] = rL;
						mRegWindow[2] = wS;
						mRegWindow[3] = wL;
					}
				}
				break;
			
			case MS_WR_REG: {
				
				volatile uint8_t *dst = mRegs + mRegWindow[2], *end = dst + mRegWindow[3];
				volatile uint8_t *src = rxb;
				
				while (dst != end)
					*dst++ = *src++;
				
				break;
			}
			
			case MS_SET_CMD:
				dtx(0x44);
				dtx(nRxed);
				dtx(rxb[0]);
				if (nRxed != 3)
					mRegs[MS_REG_NO_INT] = INT_VAL_INVAL_CMD;
				else
					mRegs[MS_REG_NO_INT] &=~ INT_VAL_CMD_DONE;
				break;
		}
		
		success = true;
		doResetup = true;
	}
	else {					//read tpc
		
		uint_fast16_t readLen;
		const void *readData;
		
		switch (tpc) {
			
			case MS_RD_REG:
				readData = (uint8_t*)(mRegs + mRegWindow[0]);
				readLen = mRegWindow[1];
				break;
			
			case MS_GET_INT:
				readData = (uint8_t*)(mRegs + MS_REG_NO_INT);
				readLen = 1;
				break;
			
			case MS_RD_LDATA:
			
				if (mLongReadDataInfo.data) {
					
					readData = mLongReadDataInfo.data;
					mLongReadDataInfo.data = NULL;
					readLen = mLongReadDataInfo.len;
				}
				else {
					SPI1->CR1 &=~ SPI_CR1_SPE;
					doResetup = true;
					goto tpc_check_fail;
				}
				break;
			
			case MS_RD_SDATA:
			
				if (mShortReadDataInfo.data) {
					
					readData = mShortReadDataInfo.data;
					mShortReadDataInfo.data = NULL;
					readLen = mShortReadDataInfo.len;
				}
				else {
					
					SPI1->CR1 &=~ SPI_CR1_SPE;
					doResetup = true;
					goto tpc_check_fail;
				}
				break;
			
			default:
				doResetup = true;
				SPI2->CR1 = 0;
				goto tpc_check_fail;
		}
		
		SPI2->CFG1 = (3 << SPI_CFG1_DSIZE_Pos);
		SPI2->CR1 = SPI_CR1_SPE | SPI_CR1_HDDIR;	//send ACK! (after this we have enough time to finish the setup of the rest of this)
		
		while (!(SPI1->SR & SPI_SR_RXP));
		tpcCheck = *(volatile uint8_t*)&SPI1->RXDR;
		SPI1->CR1 = 0;
		if (tpcCheck != (tpc ^ 0x0f)) {
			//cancel the ACK we were about to send or are already sending
			SPI2->CR1 = 0;
			goto tpc_check_fail;
		}
		
		dtx(tpcCheck);
		dtx(readLen);
		
		//reset/enable crc?
		DMA1_Stream3->NDTR = readLen;
		DMA1_Stream3->M0AR = (uintptr_t)readData;
		DMA1_Stream3->CR = DMA_SxCR_PL | DMA_SxCR_MINC | DMA_SxCR_DIR_0 | DMA_SxCR_EN | DMA_SxCR_TCIE;
		
		SPI1->CR2 = readLen;
		SPI1->CFG1 = (7 << SPI_CFG1_DSIZE_Pos) | (15 << SPI_CFG1_CRCSIZE_Pos) | SPI_CFG1_CRCEN | SPI_CFG1_TXDMAEN;
		SPI1->CR1 = SPI_CR1_HDDIR | SPI_CR1_SPE;
		
		EXTI->PR1 = 1 << 10;
		(void)EXTI->PR1;
		EXTI->IMR1 = 1 << 10;
		
		mTpcInProgress = tpc;
	}

tpc_check_fail:
	
	EXTI->IMR1 &=~ 1;
	EXTI->PR1 = 1;
	(void)EXTI->PR1;
	dtx(0x59);
	asm volatile("dsb sy");
	
	if (doResetup)
		msioPrvTpcSuccess(success);
	
	
	
	if(0){
		uint32_t *v;
		
		if (__builtin_return_address(0) == (void*)0xFFFFFFFD)
			asm("mrs %0, PSP":"=r"(v));
		else
			v = &onSp;
		
		dtx('@');
		dtx(v[7]);
		dtx(v[6] >> 24);
		dtx(v[6] >> 16);
		dtx(v[6] >> 8);
		dtx(v[6] >> 0);
	}
}

/*
 * One-time setup of the memory stick interface: default register contents and register window,
 * EXTI routing and edges, SPI1/2/3 modes, CRC polynomial, and the DMA/DMAMUX plumbing.
 * Reception is not armed here; call msioEnable() for that.
 */
void msioInit(void)
{
	//values that DMA streams 0 and 1 copy into SPI1->CR1 and GPIOC->MODER (see msioPrvConfigForRx())
	static volatile uint32_t  __attribute__((section(".uncached"))) mSpiEnableCR1, __attribute__((section(".uncached"))) mGpiocModerVal;

	//default register window: read start 0x00, read length 0x1f, write start 0x10, write length 0x0f
	//the read start is set explicitly so that the whole window is in a known state after (re)init
	mRegWindow[0] = 0x00;
	mRegWindow[1] = 0x1f;
	mRegWindow[2] = 0x10;
	mRegWindow[3] = 0x0f;

	mSpiEnableCR1 = SPI_CR1_SPE;

	SYSCFG->EXTICR[0] = (SYSCFG->EXTICR[0] &~ SYSCFG_EXTICR1_EXTI0_Msk) | SYSCFG_EXTICR1_EXTI0_PA;		//EXTI0 is PA0
	SYSCFG->EXTICR[2] = (SYSCFG->EXTICR[2] &~ SYSCFG_EXTICR3_EXTI10_Msk) | SYSCFG_EXTICR3_EXTI10_PC;	//EXTI10 is PC10
	
	mRegs[MS_REG_NO_STA0] = 0x11;	//write-locked and a flag that all MSs set (0x10)
	mRegs[MS_REG_NO_TYPE] = 0xff;
	mRegs[5] = 0xff;				//as per real card
	mRegs[MS_REG_NO_CATEGORY] = mCurCategory = MS_CATEGORY_STORAGE;
	mRegs[MS_REG_NO_CLASS] = 0xff;
	mRegs[MS_REG_NO_SYSCFG] = 0x80;
	
	NVIC_EnableIRQ(EXTI0_IRQn);
	
	NVIC_EnableIRQ(EXTI15_10_IRQn);
	
	EXTI->RTSR1 |= EXTI_RTSR1_TR0;	//trigger on EXTI0 rising edge
	EXTI->FTSR1 |= EXTI_FTSR1_TR10;	//trigger on EXTI10 falling edge
	
	//SPI1 will accept TPC with nCS active high
	//SPI2 will TX/RX data with nCS low   (see UDRCFG)
	//SPI1 will send data in BS3 with nCS active high
	// SPI3 shadows SPI2 for data RX to provide CRC unit with data while SPI2 writes it to RAM
	
	//on BS going up:
	// enable SPI1, clear buffers
	
	SPI1->CR1 = 0;
	SPI1->CRCPOLY = 0x18005;
	SPI1->CFG2 = SPI_CFG2_SSIOP | SPI_CFG2_COMM_1 | SPI_CFG2_COMM_0;
	
	SPI2->CR1 = 0;
	SPI2->CRCPOLY = 0x18005;
	SPI2->CFG2 = SPI_CFG2_COMM_1 | SPI_CFG2_COMM_0;
	
	SPI3->CR1 = 0;
	SPI3->CFG2 = SPI_CFG2_COMM_1 | SPI_CFG2_COMM_0;
	
	//CRC unit: same polynomial as the SPI units use, initial value 0
	CRC->INIT = 0;
	CRC->POL = 0x8005;
	
	//streams 0 and 1 are fed by the DMAMUX request generator rather than by a peripheral
	//NOTE(review): generator 1 is configured, yet both channels below select the same request (1) - confirm this is intended
	DMAMUX1_RequestGenerator0->RGCR = DMAMUX_RGxCR_GPOL_0 | DMAMUX_RGxCR_GE | 6;
	DMAMUX1_RequestGenerator1->RGCR = DMAMUX_RGxCR_GPOL_0 | DMAMUX_RGxCR_GE | 6;
	DMAMUX1_Channel0->CCR = 1;	//request source is generator output 0
	DMAMUX1_Channel1->CCR = 1;	//request source is generator output 0
	
	//triggering two DMA from one trigger does not reliably work...
	
	DMAMUX1_Channel3->CCR = 38;		//spi1 tx
	DMAMUX1_Channel4->CCR = 39;		//spi2 rx
	DMAMUX1_Channel2->CCR = 61;		//spi3 rx
	
	//stream 0: writes the "SPI enabled" value into SPI1->CR1
	DMA1_Stream0->PAR = (uintptr_t)&SPI1->CR1;
	DMA1_Stream0->M0AR = (uintptr_t)&mSpiEnableCR1;
	
	//stream 1: writes the GPIOC->MODER value captured right now back into GPIOC->MODER
	mGpiocModerVal = GPIOC->MODER;
	DMA1_Stream1->PAR = (uintptr_t)&GPIOC->MODER;
	DMA1_Stream1->M0AR = (uintptr_t)&mGpiocModerVal;
		
	//stream 3: read data out to SPI1
	DMA1_Stream3->PAR = (uintptr_t)&SPI1->TXDR;
	
	//stream 4: SPI2 rx into RAM; stream 2: SPI3 rx into the CRC unit
	DMA1_Stream4->PAR = (uintptr_t)&SPI2->RXDR;
	DMA1_Stream2->PAR = (uintptr_t)&SPI3->RXDR;
	
	msioSignalIrq(false);
}

/*
 * Start listening for TPCs: arms the SPI/DMA/EXTI hardware via msioPrvConfigForRx().
 */
void msioEnable(void)
{
	msioPrvConfigForRx();
}

/*
 * Get a pointer to the first byte of the register file (mRegs).
 */
volatile uint8_t* __attribute__((section(".ramcode")))  msioGetRegs(void)
{
	return &mRegs[0];
}

/*
 * Release the oldest rx buffer: decrement mRxBufNumUsed and advance the read index (with
 * wraparound). No check is made that a buffer is actually in use.
 */
void __attribute__((section(".ramcode"))) msioReleaseBuf(void)
{
	uint32_t newNumBufsUsed, fail;
	
	dtx('p');
	
	//decrement using exclusive load/store; retried for as long as the exclusive store reports failure
	do {
		asm volatile(
			"1:							\n\t"
			"	ldrexb	%0, [%2]		\n\t"
			"	subs	%0, #1			\n\t"
			"	strexb	%1, %0, [%2]	\n\t"
			:"=&r"(newNumBufsUsed), "=&r"(fail)
			:"r"(&mRxBufNumUsed)
			:"cc", "memory"
		);
	} while (fail);
	
	dtx(newNumBufsUsed);
	
	if (++mRxBufIdxR == NUM_RX_BUFS)
		mRxBufIdxR = 0;
}

bool __attribute__((section(".ramcode"))) msioPoll(uint8_t *tpcP, const void **dataP, uint16_t *lenP)
{
	uint32_t dummy, tpc;
	
	dtx('q');
	dtx(mRxBufNumUsed);
	
	if (!mRxBufNumUsed)
		return false;
	
	tpc = mDoneTpc[mRxBufIdxR];
	if (tpc == MS_WR_REG) {		//this might want to be done in interrupt context?
		
		if (mRegs[MS_REG_NO_CATEGORY] != mCurCategory) {
			
			if (msioCategoryChanged(mRegs[MS_REG_NO_CATEGORY]))
				mCurCategory = mRegs[MS_REG_NO_CATEGORY];
			else
				mRegs[MS_REG_NO_CATEGORY] = mCurCategory;
		}
	}
	
	if (tpcP)
		*tpcP = tpc;
	
	if (dataP)
		*dataP = (uint8_t*)mRxBuffers[mRxBufIdxR];
	
	if (lenP)
		*lenP = mBytesRxd[mRxBufIdxR];
	
	return !!tpc;
}

/*
 * Returns true while long read data is queued (mLongReadDataInfo.data is not NULL).
 */
bool __attribute__((used, section(".ramcode"))) msioHaveLongDataToTx(void)
{
	return mLongReadDataInfo.data ? true : false;
}

/*
 * Returns true while short read data is queued (mShortReadDataInfo.data is not NULL).
 */
bool __attribute__((used, section(".ramcode"))) msioHaveShortDataToTx(void)
{
	return mShortReadDataInfo.data ? true : false;
}

/*
 * Queue data to be sent in response to MS_RD_LDATA.
 *
 * data: the bytes to send; used directly (not copied), so they must stay valid until consumed
 * len:  number of bytes
 *
 * Returns true if nothing was queued before. If something was queued, it is replaced anyway and
 * false is returned.
 */
bool __attribute__((used, section(".ramcode"))) msioProvideLongReadData(const void *data, uint_fast16_t len)
{
	bool hadPrevData = !!mLongReadDataInfo.data;
	
	mLongReadDataInfo.len = len;
	
	//EXTI0_IRQHandler() treats a non-NULL .data as "ready" and then reads .len, so the length must
	//be stored first. The struct is not volatile, so stop the compiler from reordering the stores.
	__asm__ __volatile__("":::"memory");
	
	mLongReadDataInfo.data = data;
	
	return !hadPrevData;
}

/*
 * Queue data to be sent in response to MS_RD_SDATA.
 *
 * data: the bytes to send; used directly (not copied), so they must stay valid until consumed
 * len:  number of bytes
 *
 * Returns true if nothing was queued before. If something was queued, it is replaced anyway and
 * false is returned.
 */
bool __attribute__((used, section(".ramcode"))) msioProvideShortReadData(const void *data, uint_fast16_t len)
{
	bool hadPrevData = !!mShortReadDataInfo.data;
	
	mShortReadDataInfo.len = len;
	
	//EXTI0_IRQHandler() treats a non-NULL .data as "ready" and then reads .len, so the length must
	//be stored first. The struct is not volatile, so stop the compiler from reordering the stores.
	__asm__ __volatile__("":::"memory");
	
	mShortReadDataInfo.data = data;
	
	return !hadPrevData;
}

void* __attribute__((used, section(".ramcode"))) msioGetOutBuffer(uint_fast8_t bufIdx)
{
	return mOutBuffer[bufIdx];
}