#define __VTOR_PRESENT	1	//rp2040.h forgot this
#include <string.h>
#include <stdarg.h>
#include "rp2040.h"


//optimize the functions defined after this point for size
#pragma GCC optimize ("Os")
//functions tagged with this are never inlined and are emitted into the ".text.loader2" section
#define __loader2			__attribute__((noinline, section(".text.loader2")))


#define COPY_AMOUNT			(2 << 20)			//total bytes copied from flash to the RAM chip: 2M
#define BUFFER_SIZE			(256 << 10)			//bytes staged per pass of the copy loop; must be multiple of CHUNK_SIZE
#define CHUNK_SIZE			(1 << 10)			//bytes moved by one flashRead()/ramWrite() call; must be multiple of 4
#define BUFFER_ADDR			(0x21000000)		//where the staging buffer lives (unstriped address)

#define OUR_SPEED			100000000			//system clock the loader sets up for itself: 100MHz
#define OUR_SSI_DIV			2					//SSI clock divider used while copying: 50MHz
#define FINAL_SSI_DIV		4					//div4 allows our main code to run when it clocks up to 220MHz; this is our last chance to set that

//Build-time choice of how the RAM/nROM chip-select line is driven.
//Exactly one of PIN_RAMnROM / XPIN_RAMnROM ends up defined; the rest of the file keys off that.
#if 0				//simple GPIO is RAM/nROM
	
	#define PIN_RAMnROM				19			//GPIO number driving RAM/nROM directly

#else				//i2c expander used

	#define XPIN_RAMnROM			5			//expander pin (bit index in its registers) driving RAM/nROM
	
	#define PIN_SDA					22			//GPIO used as I2C1 SDA
	#define PIN_SCL					23			//GPIO used as I2C1 SCL
	
	#define IO_EXPANDER_ADDR		0x20		//7-bit addr
	#define REG_INPUT				0x00
	#define REG_OUTPUT				0x01
	#define REG_POL_INVERT			0x02		//for input only
	#define REG_DIRECTION			0x03		//1 is in, 0 is out
	
	#define OUR_I2C_HZ				100000		//I2C bus clock used to derive the SCL timing counts
	
#endif


//useFlashChip() / useRamChip(): route the shared QSPI bus to the flash chip (RAM/nROM low) or the RAM chip (RAM/nROM high)

#ifdef PIN_RAMnROM
	
	//direct GPIO variant: a single SIO set/clear write
	#define useFlashChip()	do { sio_hw->gpio_clr = (1 << PIN_RAMnROM); } while(0)
	#define useRamChip()	do { sio_hw->gpio_set = (1 << PIN_RAMnROM); } while(0)

#endif

#ifdef XPIN_RAMnROM

	//expander variant: the whole output register is rewritten, and only when the cached mode says the line must change
	//these assume only one pin is output
	#define useFlashChip()	do { if (mCurMode != 0) {i2cRegWrite(REG_OUTPUT, 0 << XPIN_RAMnROM); } mCurMode = 0; } while(0)
	#define useRamChip()	do { if (mCurMode == 0) {i2cRegWrite(REG_OUTPUT, 1 << XPIN_RAMnROM); } mCurMode = 1; } while(0)
	
	//cached state of the expander output: 0 = flash selected, nonzero = RAM selected
	static uint8_t mCurMode;

#endif


//Write one byte to an IO expander register over I2C1 and block until the bus transaction is over.
//  reg - register index, sent as the first byte
//  val - value, sent as the second byte with a STOP requested after it
static void __loader2 i2cRegWrite(uint8_t reg, uint8_t val)
{
	const uint32_t lastByteCmd = I2C_IC_DATA_CMD_STOP_BITS + val;
	uint32_t sta;

	//queue both bytes
	i2c1_hw->data_cmd = reg;
	i2c1_hw->data_cmd = lastByteCmd;

	//first the TX FIFO has to drain...
	do {
		sta = i2c1_hw->status;
	} while (!(sta & I2C_IC_STATUS_TFE_BITS));

	//...then the master has to go idle
	do {
		sta = i2c1_hw->status;
	} while (sta & I2C_IC_STATUS_MST_ACTIVITY_BITS);

	asm volatile("dsb sy");
}

//Bring up I2C1 as a bus master addressed at the IO expander, then make the RAM/nROM
//expander pin an output driven low (ROM selected) and record that in mCurMode.
static void __loader2 i2cExpanderInit(void)
{
	//pad setup shared by SDA and SCL.
	//DRIVE_VALUE_12MA has to be shifted into the DRIVE field, same as for every other pad configured in this file.
	//OR-ing it in unshifted would land on bits 1:0 (SCHMITT/SLEWFAST) and leave the drive strength at its lowest setting.
	//SCHMITT and SLEWFAST are set explicitly so the input hysteresis and edge behaviour these pads have had so far are kept.
	const uint32_t padCfg = PADS_BANK0_GPIO0_IE_BITS | PADS_BANK0_GPIO0_SCHMITT_BITS | PADS_BANK0_GPIO0_SLEWFAST_BITS | (PADS_BANK0_GPIO0_DRIVE_VALUE_12MA << PADS_BANK0_GPIO0_DRIVE_LSB);

	//i2c pins
	iobank0_hw->io[PIN_SDA].ctrl = (iobank0_hw->io[PIN_SDA].ctrl &~ IO_BANK0_GPIO0_CTRL_FUNCSEL_BITS) | (IO_BANK0_GPIO22_CTRL_FUNCSEL_VALUE_I2C1_SDA << IO_BANK0_GPIO0_CTRL_FUNCSEL_LSB);
	iobank0_hw->io[PIN_SCL].ctrl = (iobank0_hw->io[PIN_SCL].ctrl &~ IO_BANK0_GPIO0_CTRL_FUNCSEL_BITS) | (IO_BANK0_GPIO23_CTRL_FUNCSEL_VALUE_I2C1_SCL<< IO_BANK0_GPIO0_CTRL_FUNCSEL_LSB);
	padsbank0_hw->io[PIN_SDA] = padCfg;
	padsbank0_hw->io[PIN_SCL] = padCfg;
	
	//disable (the controller is reconfigured only while it is off)
	i2c1_hw->enable &=~ (I2C_IC_ENABLE_TX_CMD_BLOCK_BITS | I2C_IC_ENABLE_ABORT_BITS | I2C_IC_ENABLE_ENABLE_BITS);

	//configure: master only, 7-bit addressing, restart enabled, "fast" speed slot with SCL counts derived from OUR_SPEED / OUR_I2C_HZ
	i2c1_hw->con = (i2c1_hw->con &~ (I2C_IC_CON_IC_10BITADDR_MASTER_BITS | I2C_IC_CON_SPEED_BITS)) | I2C_IC_CON_RX_FIFO_FULL_HLD_CTRL_BITS | I2C_IC_CON_IC_SLAVE_DISABLE_BITS | I2C_IC_CON_IC_RESTART_EN_BITS | (I2C_IC_CON_SPEED_VALUE_FAST << I2C_IC_CON_SPEED_LSB) | I2C_IC_CON_MASTER_MODE_BITS;
	i2c1_hw->tar = (i2c1_hw->tar &~ (I2C_IC_TAR_SPECIAL_BITS | I2C_IC_TAR_GC_OR_START_BITS | I2C_IC_TAR_IC_TAR_BITS)) | (IO_EXPANDER_ADDR << I2C_IC_TAR_IC_TAR_LSB);
	i2c1_hw->fs_scl_hcnt = (i2c1_hw->fs_scl_hcnt &~ I2C_IC_FS_SCL_HCNT_IC_FS_SCL_HCNT_BITS) | ((OUR_SPEED / OUR_I2C_HZ * 3 / 5) << I2C_IC_FS_SCL_HCNT_IC_FS_SCL_HCNT_LSB);		//3/5 of the bit period high
	i2c1_hw->fs_scl_lcnt = (i2c1_hw->fs_scl_lcnt &~ I2C_IC_FS_SCL_LCNT_IC_FS_SCL_LCNT_BITS) | ((OUR_SPEED / OUR_I2C_HZ * 2 / 5) << I2C_IC_FS_SCL_LCNT_IC_FS_SCL_LCNT_LSB);		//2/5 of the bit period low
	i2c1_hw->fs_spklen = (i2c1_hw->fs_spklen &~ I2C_IC_FS_SPKLEN_IC_FS_SPKLEN_BITS) | ((OUR_SPEED / OUR_I2C_HZ / 20) << I2C_IC_FS_SPKLEN_IC_FS_SPKLEN_LSB);
	i2c1_hw->intr_mask &=~ I2C_IC_INTR_MASK_BITS;			//no interrupts, everything here is polled
	i2c1_hw->enable |= I2C_IC_ENABLE_ENABLE_BITS;
	
	asm volatile("dsb sy");
	
	i2cRegWrite(REG_DIRECTION, ~(1 << XPIN_RAMnROM));		//proper direction: our pin is out, all others are in
	
	i2cRegWrite(REG_OUTPUT, 0 << XPIN_RAMnROM);				//rom mode
	mCurMode = 0;											//remember it
}

//Select the RAM chip, send it the single command byte 0x35 ("enter quad mode") as plain 8-bit SPI data,
//then hand the bus back to the flash chip. Sending it to a chip that is already in quad mode is safe.
static void __loader2 ramQuadEnter(void)
{
	uint32_t cfg;

	useRamChip();

	//SSI must be disabled while it is reconfigured: standard SPI, 8-bit frames, transmit-and-receive
	ssi_hw->ssienr = 0;
	cfg = ssi_hw->ctrlr0;
	cfg &= ~(SSI_CTRLR0_SPI_FRF_BITS | SSI_CTRLR0_CFS_BITS | SSI_CTRLR0_TMOD_BITS | SSI_CTRLR0_DFS_32_BITS);
	cfg |= (SSI_CTRLR0_SPI_FRF_VALUE_STD << SSI_CTRLR0_SPI_FRF_LSB) | (7 << SSI_CTRLR0_DFS_32_LSB) | (SSI_CTRLR0_TMOD_VALUE_TX_AND_RX << SSI_CTRLR0_TMOD_LSB);
	ssi_hw->ctrlr0 = cfg;
	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;

	//the command itself
	ssi_hw->dr0 = 0x35;	//enter quad mode

	//one byte out means one byte in: wait for it and throw it away
	for (;;) {
		if (ssi_hw->sr & SSI_SR_RFNE_BITS)
			break;
	}
	(void)ssi_hw->dr0;

	//let the transfer finish completely before touching chip select
	for (;;) {
		if (!(ssi_hw->sr & SSI_SR_BUSY_BITS))
			break;
	}
	asm volatile("dsb sy");

	useFlashChip();
}

//Select the RAM chip for good and set the SSI + XIP hardware up so that memory-mapped (XIP) reads are served from it:
//quad frames, 32-bit data, command 0x0b with a 24-bit address and 4 wait cycles, clock divider FINAL_SSI_DIV.
static void __loader2 ramXipOn(void)
{
	uint32_t v;

	useRamChip();

	//XIP off and SSI off while both get reconfigured
	v = xip_ctrl_hw->ctrl;
	xip_ctrl_hw->ctrl = v & ~XIP_CTRL_EN_BITS;
	ssi_hw->ssienr = 0;

	//frame format: quad, 32-bit data frames, read mode
	v = ssi_hw->ctrlr0;
	v &= ~(SSI_CTRLR0_SPI_FRF_BITS | SSI_CTRLR0_DFS_32_BITS | SSI_CTRLR0_CFS_BITS | SSI_CTRLR0_TMOD_BITS);
	v |= (SSI_CTRLR0_SPI_FRF_VALUE_QUAD << SSI_CTRLR0_SPI_FRF_LSB) | (31 << SSI_CTRLR0_DFS_32_LSB) | (7 << SSI_CTRLR0_CFS_LSB) | (SSI_CTRLR0_TMOD_VALUE_EEPROM_READ << SSI_CTRLR0_TMOD_LSB);
	ssi_hw->ctrlr0 = v;

	//command phase: 8-bit instruction 0x0b, address length field 6, command and address both sent in the frame format, 4 wait cycles
	v = ssi_hw->spi_ctrlr0;
	v &= ~(SSI_SPI_CTRLR0_XIP_CMD_BITS | SSI_SPI_CTRLR0_WAIT_CYCLES_BITS | SSI_SPI_CTRLR0_INST_L_BITS | SSI_SPI_CTRLR0_ADDR_L_BITS | SSI_SPI_CTRLR0_TRANS_TYPE_BITS | SSI_SPI_CTRLR0_INST_DDR_EN_BITS | SSI_SPI_CTRLR0_SPI_DDR_EN_BITS);
	v |= (SSI_SPI_CTRLR0_INST_L_VALUE_8B << SSI_SPI_CTRLR0_INST_L_LSB) | (6 << SSI_SPI_CTRLR0_ADDR_L_LSB) | (SSI_SPI_CTRLR0_TRANS_TYPE_VALUE_2C2A << SSI_SPI_CTRLR0_TRANS_TYPE_LSB) | (4 << SSI_SPI_CTRLR0_WAIT_CYCLES_LSB) | (0x0b << SSI_SPI_CTRLR0_XIP_CMD_LSB);
	ssi_hw->spi_ctrlr0 = v;

	//1 word as that is all that's acceptable
	v = ssi_hw->ctrlr1;
	v &= ~SSI_CTRLR1_NDF_BITS;
	v |= (0 << SSI_CTRLR1_NDF_LSB);
	ssi_hw->ctrlr1 = v;

	//final clock divider, this is what the main code will inherit
	v = ssi_hw->baudr;
	v &= ~SSI_BAUDR_SCKDV_BITS;
	v |= (FINAL_SSI_DIV << SSI_BAUDR_SCKDV_LSB);
	ssi_hw->baudr = v;

	//sample incoming data one step late
	v = ssi_hw->rx_sample_dly;
	v &= ~SSI_RX_SAMPLE_DLY_RSD_BITS;
	v |= (1 << SSI_RX_SAMPLE_DLY_RSD_LSB);
	ssi_hw->rx_sample_dly = v;

	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;

	//XIP back on (and powered), then drop whatever the cache still holds
	v = xip_ctrl_hw->ctrl;
	v &= ~XIP_CTRL_POWER_DOWN_BITS;
	v |= XIP_CTRL_EN_BITS;
	xip_ctrl_hw->ctrl = v;
	xip_ctrl_hw->flush = XIP_FLUSH_BITS;
	asm volatile("dsb sy");

	//one throw-away read through the 0x13000000 window (presumably an uncached XIP alias - TODO confirm), then flush once more
	(void)*(volatile uint32_t*)0x13000000;
	asm volatile("dsb sy");

	xip_ctrl_hw->flush = XIP_FLUSH_BITS;
}

//Read from the flash chip into memory using command 0xeb with quad frames.
//  dst      - destination; filled one 32-bit word at a time
//  addr     - flash byte address to start at; should be KB-aligned
//  numBytes - amount to read; rounded down to whole 32-bit words. Less than one word is a no-op.
static void __loader2 flashRead(uint32_t *dst, uint32_t addr, uint32_t numBytes)		//addr should be KB-aligned
{
	uint32_t count = numBytes / sizeof(uint32_t), dummy;

	//nothing to transfer. we must bail out here: with count == 0 both the (count - 1) frame count below and the
	//count-down in the asm loop would wrap around and the loop would keep storing far past dst
	if (!count)
		return;

	useFlashChip();

	//SSI off while reconfiguring: quad frames, 32-bit data, read mode, frame count = number of words wanted
	ssi_hw->ssienr = 0;
	ssi_hw->ctrlr0 = (ssi_hw->ctrlr0 &~ (SSI_CTRLR0_SPI_FRF_BITS | SSI_CTRLR0_DFS_32_BITS | SSI_CTRLR0_CFS_BITS | SSI_CTRLR0_TMOD_BITS)) | (SSI_CTRLR0_SPI_FRF_VALUE_QUAD << SSI_CTRLR0_SPI_FRF_LSB) | (31 << SSI_CTRLR0_DFS_32_LSB) | (7 << SSI_CTRLR0_CFS_LSB) | (SSI_CTRLR0_TMOD_VALUE_EEPROM_READ << SSI_CTRLR0_TMOD_LSB);
	ssi_hw->ctrlr1 = (ssi_hw->ctrlr1 &~ SSI_CTRLR1_NDF_BITS) | ((count - 1) << SSI_CTRLR1_NDF_LSB);
	ssi_hw->spi_ctrlr0 = (ssi_hw->spi_ctrlr0 &~ (SSI_SPI_CTRLR0_WAIT_CYCLES_BITS | SSI_SPI_CTRLR0_INST_L_BITS | SSI_SPI_CTRLR0_ADDR_L_BITS | SSI_SPI_CTRLR0_TRANS_TYPE_BITS | SSI_SPI_CTRLR0_INST_DDR_EN_BITS | SSI_SPI_CTRLR0_SPI_DDR_EN_BITS)) | (SSI_SPI_CTRLR0_INST_L_VALUE_8B << SSI_SPI_CTRLR0_INST_L_LSB) | (7 << SSI_SPI_CTRLR0_ADDR_L_LSB) | (SSI_SPI_CTRLR0_TRANS_TYPE_VALUE_1C2A << SSI_SPI_CTRLR0_TRANS_TYPE_LSB) | (5 << SSI_SPI_CTRLR0_WAIT_CYCLES_LSB);
	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;

	//command, then the address shifted up by 4 so the lowest 4 bits sent after it are zero
	ssi_hw->dr0 = 0xeb;
	ssi_hw->dr0 = addr << 4;
	
	//receive loop, once per word: poll the word at offset 0x28 from ssi_hw until its bit 3 is set (lsrs #4 moves that bit into carry),
	//then load one word from offset 0x60 and store it through dst. those offsets are presumably SR and DR0, the registers the C code here uses
	//GCC compiles this code very bad to the point where it is suboptimal and sometimes broken
	asm volatile(
		".syntax unified				\n\t"
		"1:								\n\t"
		"	ldr		%0, [%3, #0x28]		\n\t"
		"	lsrs	%0, %0, #4			\n\t"
		"	bcc		1b					\n\t"
		"	ldr		%0, [%3, #0x60]		\n\t"
		"	stmia	%1!, {%0}			\n\t"
		"	subs	%2, #1				\n\t"
		"	bne		1b					\n\t"
		:"=&l"(dummy), "+l"(dst), "+l"(count)
		:"l"(ssi_hw), "1"(dst), "2"(count)
		:"memory", "cc"
	);
	
	//let the SSI go idle before anybody reconfigures it
	while (ssi_hw->sr & SSI_SR_BUSY_BITS);
	asm volatile("dsb sy");
}

//Write memory to the RAM chip using command 0x38 with quad frames.
//  src      - source; consumed one 32-bit word at a time
//  addr     - RAM chip byte address to start at; should be KB-aligned
//  numBytes - amount to write; rounded down to whole 32-bit words. Less than one word is a no-op.
static void __loader2 ramWrite(const uint32_t *src, uint32_t addr, uint32_t numBytes)		//addr should be KB-aligned
{
	uint32_t count = numBytes / sizeof(uint32_t), dummy1, dummy2;

	//nothing to transfer. we must bail out here: with count == 0 the count-down in the asm loop would wrap around
	//and the loop would keep sending words from far past src
	if (!count)
		return;

	useRamChip();
	
	//SSI off while reconfiguring: quad frames, 32-bit data, transmit only, command and address both sent in the frame format, no wait cycles
	ssi_hw->ssienr = 0;
	ssi_hw->ctrlr0 = (ssi_hw->ctrlr0 &~ (SSI_CTRLR0_SPI_FRF_BITS | SSI_CTRLR0_DFS_32_BITS | SSI_CTRLR0_CFS_BITS | SSI_CTRLR0_TMOD_BITS)) | (SSI_CTRLR0_SPI_FRF_VALUE_QUAD << SSI_CTRLR0_SPI_FRF_LSB) | (31 << SSI_CTRLR0_DFS_32_LSB) | (7 << SSI_CTRLR0_CFS_LSB) | (SSI_CTRLR0_TMOD_VALUE_TX_ONLY << SSI_CTRLR0_TMOD_LSB);
	ssi_hw->ctrlr1 = (ssi_hw->ctrlr1 &~ SSI_CTRLR1_NDF_BITS) | ((count - 1) << SSI_CTRLR1_NDF_LSB);
	ssi_hw->spi_ctrlr0 = (ssi_hw->spi_ctrlr0 &~ (SSI_SPI_CTRLR0_WAIT_CYCLES_BITS | SSI_SPI_CTRLR0_INST_L_BITS | SSI_SPI_CTRLR0_ADDR_L_BITS | SSI_SPI_CTRLR0_TRANS_TYPE_BITS | SSI_SPI_CTRLR0_INST_DDR_EN_BITS | SSI_SPI_CTRLR0_SPI_DDR_EN_BITS)) | (SSI_SPI_CTRLR0_INST_L_VALUE_8B << SSI_SPI_CTRLR0_INST_L_LSB) | (6 << SSI_SPI_CTRLR0_ADDR_L_LSB) | (SSI_SPI_CTRLR0_TRANS_TYPE_VALUE_2C2A << SSI_SPI_CTRLR0_TRANS_TYPE_LSB);
	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;
	
	//transmit loop: first the command 0x38 and the address are stored to offset 0x60 from ssi_hw, then once per word:
	//load the next word from src, poll the word at offset 0x28 until its bit 1 is set (lsrs #2 moves that bit into carry), store the word to offset 0x60.
	//those offsets are presumably SR and DR0, the registers the C code here uses
	//GCC compiles this code very bad to the point where it is suboptimal and sometimes broken
	//SSI unit in write only mode ignores ctrlr1 and will end a write if we do not feed it fast enough. at CLK=Fosc/2 this is rather fast. GCC is not good enough
	asm volatile(
		".syntax unified				\n\t"
		"	movs	%1, #0x38			\n\t"
		"	str		%1, [%4, #0x60]		\n\t"
		"	str		%5, [%4, #0x60]		\n\t"
		
		"2:								\n\t"
		"	ldmia	%2!, {%1}			\n\t"
		"1:								\n\t"
		"	ldr		%0, [%4, #0x28]		\n\t"
		"	lsrs	%0, %0, #2			\n\t"
		"	bcc		1b					\n\t"
		"	str		%1, [%4, #0x60]		\n\t"
		"	subs	%3, #1				\n\t"
		"	bne		2b					\n\t"
		:"=&l"(dummy1), "=&l"(dummy2), "+l"(src), "+l"(count)
		:"l"(ssi_hw), "l"(addr),  "2"(src), "3"(count)
		:"memory", "cc"
	);
	
	//let the SSI go idle before anybody reconfigures it
	while (ssi_hw->sr & SSI_SR_BUSY_BITS);
	asm volatile("dsb sy");
}


//Make sure the flash chip's "QE" (quad enable) bit is set.
//Identifies the chip with RDID (0x9f), reads the status register that holds QE on that chip, and only if QE is
//clear does a write-enable + status write and then polls the normal status register (0x05) until bit 0 clears.
//Hangs forever on a chip it does not recognize.
//NOTE(review): unlike its neighbours this function is not tagged __loader2 - confirm that is intended
static void flashQuadEnable(void)
{
	uint_fast8_t curSta, chipManufacturer, cmdReadStaRegWithQeBit, cmdSetWelForQeWrite, cmdQeWrite, cmdQeWriteZeroBytesBeforeData, qeBitMask;
	
	useFlashChip();

	//SSI off while reconfiguring: standard SPI, 8-bit frames, transmit-and-receive (every byte sent yields one byte received)
	ssi_hw->ssienr = 0;
	ssi_hw->ctrlr0 = (ssi_hw->ctrlr0 &~ (SSI_CTRLR0_SPI_FRF_BITS | SSI_CTRLR0_DFS_32_BITS | SSI_CTRLR0_CFS_BITS | SSI_CTRLR0_TMOD_BITS)) | (SSI_CTRLR0_SPI_FRF_VALUE_STD << SSI_CTRLR0_SPI_FRF_LSB) | (7 << SSI_CTRLR0_DFS_32_LSB)| (SSI_CTRLR0_TMOD_VALUE_TX_AND_RX << SSI_CTRLR0_TMOD_LSB);
	ssi_hw->ctrlr1 = (ssi_hw->ctrlr1 &~ SSI_CTRLR1_NDF_BITS);
	ssi_hw->spi_ctrlr0 = (ssi_hw->spi_ctrlr0 &~ (SSI_SPI_CTRLR0_WAIT_CYCLES_BITS | SSI_SPI_CTRLR0_INST_L_BITS | SSI_SPI_CTRLR0_ADDR_L_BITS | SSI_SPI_CTRLR0_TRANS_TYPE_BITS | SSI_SPI_CTRLR0_INST_DDR_EN_BITS | SSI_SPI_CTRLR0_SPI_DDR_EN_BITS)) | (SSI_SPI_CTRLR0_TRANS_TYPE_VALUE_1C1A << SSI_SPI_CTRLR0_TRANS_TYPE_LSB);
	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;
	
	
	//different flash chips store "QE" bit in different places so we need to sort out which chip we have. we'll use RDID
	//command byte + one dummy byte out; the byte received during the command is discarded, the next one is taken as the manufacturer id
	ssi_hw->dr0 = 0x9f;
	ssi_hw->dr0 = 0x00;
	while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
	(void)ssi_hw->dr0;
	while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
	chipManufacturer = ssi_hw->dr0;
	while (ssi_hw->sr & SSI_SR_BUSY_BITS);
	
	//per-chip recipe: which command reads the register holding QE, which bit QE is, and how to write it back
	switch (chipManufacturer) {
		case 0x15:						//ISSI
			//NOTE(review): ISSI's JEDEC manufacturer id is usually listed as 0x9D - confirm 0x15 against the actual part
			
			cmdReadStaRegWithQeBit = 0x05;
			qeBitMask = 0x40;
			cmdSetWelForQeWrite = 0x06;
			cmdQeWriteZeroBytesBeforeData = 0;
			cmdQeWrite = 0x01;
			break;
			
		case 0x85:						//PUYA
		
			cmdReadStaRegWithQeBit = 0x35;
			qeBitMask = 0x02;
			cmdSetWelForQeWrite = 0x06;
			cmdQeWriteZeroBytesBeforeData = 1;	//one zero byte goes out ahead of the byte carrying QE
			cmdQeWrite = 0x01;
			break;
			
		
		default:
			//unknown chip: nothing sensible we can do, stop here
			while(1);
	}
	
	
	//read status
	ssi_hw->dr0 = cmdReadStaRegWithQeBit;
	ssi_hw->dr0 = 0x00;
	while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
	(void)ssi_hw->dr0;
	while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
	curSta = ssi_hw->dr0;
	while (ssi_hw->sr & SSI_SR_BUSY_BITS);
	asm volatile("dsb sy");
	
	if (!(curSta & qeBitMask)) {		//need to set it
		
		uint_fast8_t i;
		
		//WEL
		ssi_hw->dr0 = cmdSetWelForQeWrite;
		while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
		(void)ssi_hw->dr0;
		while (ssi_hw->sr & SSI_SR_BUSY_BITS);
		asm volatile("dsb sy");
		
	
		//write status reg
		ssi_hw->dr0 = cmdQeWrite;
		for (i = 0; i < cmdQeWriteZeroBytesBeforeData; i++)
			ssi_hw->dr0 = 0;
		ssi_hw->dr0 = curSta | qeBitMask;
		//drain what came back: one byte per byte sent (command + zero bytes + data)
		for (i = 0; i < cmdQeWriteZeroBytesBeforeData + 2; i++)	{
		
			while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
			(void)ssi_hw->dr0;
		}
		while (ssi_hw->sr & SSI_SR_BUSY_BITS);
		asm volatile("dsb sy");
	
		//wait for write complete (always using normal status register)
		do {
			ssi_hw->dr0 = 0x05;
			ssi_hw->dr0 = 0x00;
			while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
			(void)ssi_hw->dr0;
			while (!(ssi_hw->sr & SSI_SR_RFNE_BITS));
			curSta = ssi_hw->dr0;
			while (ssi_hw->sr & SSI_SR_BUSY_BITS);
			asm volatile("dsb sy");
		} while (curSta & 0x01);		//bit 0 still set: keep polling
	}
}

//Move the system onto the PLL at OUR_SPEED: park everything on ROSC via clk_ref, (re)start XOSC, feed clk_ref from it,
//reset and program PLL_SYS, switch clk_sys over to the PLL, and finally set the SSI clock divider to OUR_SSI_DIV.
//NOTE(review): unlike its neighbours this function is not tagged __loader2 - confirm that is intended
static void speedUp(void)
{
	//tell refclock to use ROSC
	clocks_hw->clk[clk_ref].ctrl = (clocks_hw->clk[clk_ref].ctrl &~ CLOCKS_CLK_REF_CTRL_SRC_BITS) | (CLOCKS_CLK_REF_CTRL_SRC_VALUE_ROSC_CLKSRC_PH << CLOCKS_CLK_REF_CTRL_SRC_LSB);
	
	//use ref clock for cpu, use sys clock for periphs
	//(clk_peri is disabled around the clk_sys source change and re-enabled with clk_sys as its source)
	clocks_hw->clk[clk_peri].ctrl &=~ CLOCKS_CLK_PERI_CTRL_ENABLE_BITS;
	clocks_hw->clk[clk_sys].ctrl = (clocks_hw->clk[clk_sys].ctrl &~ CLOCKS_CLK_SYS_CTRL_SRC_BITS)| (CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLK_REF << CLOCKS_CLK_SYS_CTRL_SRC_LSB);
	clocks_hw->clk[clk_peri].ctrl = (clocks_hw->clk[clk_peri].ctrl &~ (CLOCKS_CLK_PERI_CTRL_KILL_BITS | CLOCKS_CLK_PERI_CTRL_AUXSRC_BITS)) | CLOCKS_CLK_PERI_CTRL_ENABLE_BITS | (CLOCKS_CLK_PERI_CTRL_AUXSRC_VALUE_CLK_SYS << CLOCKS_CLK_PERI_CTRL_AUXSRC_LSB);
	
	//start XOSC (by stopping it first...)
	xosc_hw->ctrl = (xosc_hw->ctrl &~ XOSC_CTRL_ENABLE_BITS) | (XOSC_CTRL_ENABLE_VALUE_DISABLE << XOSC_CTRL_ENABLE_LSB);
	while (xosc_hw->status & XOSC_STATUS_ENABLED_BITS);
	xosc_hw->startup = (xosc_hw->startup &~ XOSC_STARTUP_DELAY_BITS) | (8191 << XOSC_STARTUP_DELAY_LSB);
	xosc_hw->ctrl = (xosc_hw->ctrl &~ (XOSC_CTRL_FREQ_RANGE_BITS | XOSC_CTRL_ENABLE_BITS)) | (XOSC_CTRL_ENABLE_VALUE_ENABLE << XOSC_CTRL_ENABLE_LSB) | (XOSC_CTRL_FREQ_RANGE_VALUE_1_15MHZ << XOSC_CTRL_FREQ_RANGE_LSB);
	//wait until it reports both enabled and stable
	while ((xosc_hw->status & (XOSC_STATUS_STABLE_BITS | XOSC_STATUS_ENABLED_BITS)) != (XOSC_STATUS_STABLE_BITS | XOSC_STATUS_ENABLED_BITS));
	
	//tell refclock to use XOSC
	clocks_hw->clk[clk_ref].ctrl = (clocks_hw->clk[clk_ref].ctrl &~ CLOCKS_CLK_REF_CTRL_SRC_BITS) | (CLOCKS_CLK_REF_CTRL_SRC_VALUE_XOSC_CLKSRC << CLOCKS_CLK_REF_CTRL_SRC_LSB);
	
	//reset pll (assert, release, wait for the reset controller to report it done)
	resets_hw->reset |= RESETS_RESET_PLL_SYS_BITS;
	asm volatile("dsb sy");
	resets_hw->reset &=~ RESETS_RESET_PLL_SYS_BITS;
	asm volatile("dsb sy");
	while (!(resets_hw->reset_done & RESETS_RESET_PLL_SYS_BITS));
		
	//pll up and wait for it
	pll_sys_hw->pwr |= (PLL_PWR_VCOPD_BITS | PLL_PWR_POSTDIVPD_BITS | PLL_PWR_PD_BITS);		//everything powered down while the dividers are programmed
	//feedback divider OUR_SPEED/1MHz/2 with post dividers 6 and 1
	//NOTE(review): this presumably assumes a 12MHz crystal and a reference divider of 1 (12MHz * 50 / 6 / 1 = 100MHz) - confirm
	pll_sys_hw->fbdiv_int = (pll_sys_hw->fbdiv_int &~ PLL_FBDIV_INT_BITS) | ((OUR_SPEED / 1000000 / 2) << PLL_FBDIV_INT_LSB);
	pll_sys_hw->prim = (pll_sys_hw->prim &~ (PLL_PRIM_POSTDIV1_BITS | PLL_PRIM_POSTDIV2_BITS)) | (6 << PLL_PRIM_POSTDIV1_LSB) | (1 << PLL_PRIM_POSTDIV2_LSB);
	pll_sys_hw->pwr &=~ (PLL_PWR_VCOPD_BITS | PLL_PWR_POSTDIVPD_BITS | PLL_PWR_PD_BITS);		//dividers on
	while (!(pll_sys_hw->cs & PLL_CS_LOCK_BITS));
	pll_sys_hw->cs &=~PLL_CS_BYPASS_BITS;
	
	//switch sys.AUX to pll
	clocks_hw->clk[clk_sys].ctrl = (clocks_hw->clk[clk_sys].ctrl &~ CLOCKS_CLK_SYS_CTRL_AUXSRC_BITS) | (CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_CLKSRC_PLL_SYS << CLOCKS_CLK_SYS_CTRL_AUXSRC_LSB);
		
	//switch sys to AUX and wait for it
	clocks_hw->clk[clk_sys].ctrl = (clocks_hw->clk[clk_sys].ctrl &~ CLOCKS_CLK_SYS_CTRL_SRC_BITS) | (CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX << CLOCKS_CLK_SYS_CTRL_SRC_LSB);
	//NOTE(review): the CLK_REF_SELECTED mask/shift names are applied to the clk_sys SELECTED register here - presumably the layouts match; confirm
	while (((clocks_hw->clk[clk_sys].selected & CLOCKS_CLK_REF_SELECTED_BITS) >> CLOCKS_CLK_REF_SELECTED_LSB) != (1 << CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX));
	
	//set baud rate now (SSI has to be disabled while the divider is changed)
	ssi_hw->ssienr = 0;
	ssi_hw->baudr = (ssi_hw->baudr &~ SSI_BAUDR_SCKDV_BITS) | (OUR_SSI_DIV << SSI_BAUDR_SCKDV_LSB);
	ssi_hw->ssienr = SSI_SSIENR_SSI_EN_BITS;
}


//Hard fault handler installed by hwSetup(): copies both stack pointers into r0 (MSP) and r1 (PSP)
//and then spins forever. Never returns; naked, so the body is exactly the instructions below.
static void __loader2 __attribute__((naked)) HardFault_Handler(void)
{
	asm volatile(
		"	mrs r0, msp		\n\t"
		"	mrs	r1, psp		\n\t"
		"1:					\n\t"
		"	b	1b			\n\t"
	);
}

//One-time hardware bring-up: install a minimal vector table, release the needed blocks from reset,
//tune the QSPI pads, switch XIP off, raise the clocks and prepare whatever drives the RAM/nROM line.
static void __loader2 hwSetup(void)
{
	//minimal vector table: only the hard fault slot (4th entry) is populated
	static __attribute__ ((section(".vectors.loader2"), aligned (256))) void (* const __ISR_VECTORS[])  (void) =
	{
		0,							// initial sp (unused)
		0,							// reset handler (unused)
		0,							// NMI handler
		HardFault_Handler,			// hard fault handler
	};
	uint32_t unitsToRelease, padIdx, padCfg;

	SCB->VTOR = (uint32_t)__ISR_VECTORS;

	//blocks we always need out of reset
	unitsToRelease = RESETS_RESET_PADS_BANK0_BITS | RESETS_RESET_IO_BANK0_BITS | RESETS_RESET_PLL_SYS_BITS | RESETS_RESET_PADS_QSPI_BITS | RESETS_RESET_IO_QSPI_BITS;

	//enable USB to enable USB ram (which is useful to have)
	//NOTE(review): this is a WDSEL-register bit name applied to the RESET register - presumably the same bit position; confirm
	unitsToRelease |= RESETS_WDSEL_USBCTRL_BITS;

	#ifdef XPIN_RAMnROM	//i2c expander -> need i2c1 unit

		unitsToRelease |= RESETS_RESET_I2C1_BITS;

	#endif

	resets_hw->reset &=~ unitsToRelease;

	//disable qspi schmitt trigger for speed, enable harder drive strength
	for (padIdx = 0; padIdx < 6; padIdx++) {

		padCfg = pads_qspi_hw->io[padIdx];
		padCfg &= ~(PADS_BANK0_GPIO0_SCHMITT_BITS | PADS_BANK0_GPIO0_DRIVE_BITS);
		padCfg |= PADS_BANK0_GPIO0_SLEWFAST_BITS | (PADS_BANK0_GPIO0_DRIVE_VALUE_12MA << PADS_BANK0_GPIO0_DRIVE_LSB);
		pads_qspi_hw->io[padIdx] = padCfg;
	}

	//xip off
	xip_ctrl_hw->ctrl &=~ XIP_CTRL_EN_BITS;

	//clocks up
	speedUp();

	//set up our control GPIO
	#ifdef PIN_RAMnROM

		sio_hw->gpio_clr = (1 << PIN_RAMnROM);		//prepare to be low
		sio_hw->gpio_oe_set = (1 << PIN_RAMnROM);	//make it an output

		//hand the pin to SIO, then give it a fast-slew 8mA pad
		padCfg = iobank0_hw->io[PIN_RAMnROM].ctrl;
		padCfg &= ~IO_BANK0_GPIO0_CTRL_FUNCSEL_BITS;
		padCfg |= (IO_BANK0_GPIO0_CTRL_FUNCSEL_VALUE_SIO_0 << IO_BANK0_GPIO0_CTRL_FUNCSEL_LSB);
		iobank0_hw->io[PIN_RAMnROM].ctrl = padCfg;
		padsbank0_hw->io[PIN_RAMnROM] = (PADS_BANK0_GPIO0_DRIVE_VALUE_8MA << PADS_BANK0_GPIO0_DRIVE_LSB) | PADS_BANK0_GPIO0_SLEWFAST_BITS;

	#endif

	#ifdef XPIN_RAMnROM

		i2cExpanderInit();

	#endif
}

//Loader entry point: bring the hardware up, copy COPY_AMOUNT bytes from the flash chip to the RAM chip
//(staged through the buffer at BUFFER_ADDR, BUFFER_SIZE bytes per pass, CHUNK_SIZE bytes per transfer),
//map the RAM chip for XIP and jump into the image found there. Does not return.
void __attribute__((used)) __loader2 loader2(void)
{
	uint32_t *const staging = (uint32_t*)BUFFER_ADDR;
	uint32_t base, off;

	hwSetup();

	//ram should enter quad mode now
	ramQuadEnter();

	//our flash might not have QE bit on. we need it on. it is nonvolatile so writing it pointlessly is also not good. we'll read and write if needed
	flashQuadEnable();

	base = 0;
	while (base < COPY_AMOUNT) {

		//fill the whole staging buffer from flash first...
		for (off = 0; off < BUFFER_SIZE; off += CHUNK_SIZE)
			flashRead(&staging[off / sizeof(uint32_t)], base + off, CHUNK_SIZE);

		//...then push all of it out to the ram chip at the same offsets
		for (off = 0; off < BUFFER_SIZE; off += CHUNK_SIZE)
			ramWrite(&staging[off / sizeof(uint32_t)], base + off, CHUNK_SIZE);

		base += BUFFER_SIZE;
	}

	//now enable xip in ram and go...
	ramXipOn();

	//go: fetch two words from 0x10001000, the first becomes the stack pointer, the second is branched to
	asm volatile(
		"	mov   r0, %0		\n\t"
		"	ldmia r0, {r0, r1}	\n\t"
		"	mov   sp, r0		\n\t"
		"	bx    r1			\n\t"
		::"r"(0x10001000)
	);
}