//Base address of the on-chip register block. The boot code loads it into a0 once
//and reaches every register below as OFFSET(%a0).
#define BASE			(0xfffff000)

//Chip-select group A registers (offsets from BASE)
#define GRPBASEA		0x100		//16 bit, group A base address / enable
#define GRPMASKA		0x108		//16 bit, group A address mask
#define CSA0			0x110		//32-bit, chip select A0 config (address range, bus width, wait states)
#define CSA1			0x114		//32-bit, chip select A1 config
#define CSA2			0x118		//32-bit, chip select A2 config

//Watchdog control register (offset from BASE); cleared by the second stage to disable the watchdog
#define WCR				0x618		//16-bit



//First-stage boot code.
//It sets up chip-select decoding, hand-assembles a tiny word-copy loader into RAM at 0x00007f00,
//and jumps to it so that the second stage (__load_start..__load_end) is copied to RAM at 0x00007000 and run.
//Register state handed on to the second stage: a0 = BASE (register block).
//NOTE(review): instruction order and encoding matter here (the code runs before address decoding is
//set up and depends on prefetch behaviour, see below) - do not reorder or "tidy" without testing on hardware.
.globl __vectors
__vectors:

	//spacing and vector addrs
	//eight 2-byte instructions = 16 bytes of filler that only ever clear d0
	//NOTE(review): presumably these words also serve as the reset vector contents (initial SP / PC) - confirm
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0
	moveq #0, %d0

	//we are running at *SOME* address, but we do not know which, since CSA0 maps the entire address space.
	//let's jump to an address we control, so that when we enable chip-select decoding we're not stuck in a place which does not decode to our chip select
	//NOTE(review): the instructions after this jump are still expected to execute, which implies the boot device
	//supplies the instruction stream sequentially regardless of the address fetched (the loader below likewise
	//reads every word from the single address 0x00800000) - confirm against the hardware
	jmp	(0x00800000)
	
	//a0 = register block base. ".w" selects absolute-short addressing (16-bit address, sign-extended to BASE)
	//NOTE(review): the leading "0 +" presumably stops the assembler from parsing the macro's parentheses as an addressing mode - confirm
	lea  (0 + BASE).w, %a0
	
	move.l #0x80010006, CSA0(%a0)	//CSA0 is 0x00800000..0x0080ffff, 16-wide, 6 wait states
	move.l #0x00000002, CSA1(%a0)	//CSA1 is 0x00000000..0x0001ffff, 8-wide, 2 wait states
	move.l #0x81010006, CSA2(%a0)	//CSA2 is 0x00810000..0x0081ffff, 16-wide, 6 wait states
	
	move.w #0x00ff, GRPMASKA(%a0)	//top 8 bits determine when we're using CSA
	move.w #0x0001, GRPBASEA(%a0)	//enable CSA at 0x00000000..0x00ffffff
	
	
	//ram is at 0x00000000 now
	//we'll put our first stage downloader near the end of the area we use (at 0x00007f00)
	//it expects: d0 = number of words to download minus one, a5 = download destination, a4 = entry point of the downloaded code
	//problem is that 68k seems to sometimes(!) prefetch the next instr while executing the current one, so we are not sure where our download
	//will begin. we thus pad with zeroes and read until we see nonzero
	//the loader is:
	//	1:	//skip zeroes and first nonzero word
	//		tst.w  (0x00800000)
	//		beq.b  1b
	//	1:	//download the proper number of words
	//		move.w (0x00800000), (%a5)+
	//		dbra   %d0, 1b
	//		jmp    (%a4)
	//in words this is:
	//	4A79 0080 0000
	//	67F8
	//	3AF9 0080 0000
	//	51C8 FFF8
	//	4ED4
	
	//a2 = write cursor for the loader words, a3 = start of the loader (kept for the jmp below)
	lea 	(0x00007f00), %a2
	movea.l %a2, %a3
	
	//store the ten loader words listed above, in order
	move.w  #0x4A79, (%a2)+
	move.w  #0x0080, (%a2)+
	move.w  #0x0000, (%a2)+
	move.w  #0x67F8, (%a2)+
	move.w  #0x3AF9, (%a2)+
	move.w  #0x0080, (%a2)+
	move.w  #0x0000, (%a2)+
	move.w  #0x51C8, (%a2)+
	move.w  #0xFFF8, (%a2)+
	move.w  #0x4ED4, (%a2)+

	//now prepare the data for the loader (where to download to, how much)
	//d0 = (size of second stage in bytes - 2) / 2 = word count minus one, which is what the loader's dbra loop needs
	moveq	#0 + (__load_end - __load_start - 2) / 2, %d0		//-2 allows us to not have to subtract 2 from the difference later
	movea.l #0x00007000, %a5	//a5 = download destination
	movea.l %a5, %a4			//a4 = entry point: start of the downloaded code
	jmp    (%a3)				//run the loader we just wrote to RAM
	
	//now come the zeroes to make sure we start where we should:
	//the loader's first loop discards zero words and then the first nonzero word, so however many of these
	//zeroes were already consumed by prefetch, the 0xffff below is the last word dropped and the copy starts at __load_start
	.word 0
	.word 0
	.word 0
	.word 0xffff
	
	
//what follows should be downloaded and then run from RAM at 0x00007000
//Second-stage loader. On entry a0 still holds BASE (set by the first stage); every other register is set up here.
//It sends one request to the device at 0x00800000, waits for a reply, copies the reply payload to the start
//of RAM, and then starts the payload: first longword is loaded into a7 (stack pointer), second is jumped to.
//The region __load_start..__load_end is what the first stage copies, so everything it needs must stay inside it.
__load_start:

	//disable irqs (set the interrupt mask bits in SR)
	ori.w	#0x700,%sr
	
	//disable double map (byte write to the register at BASE + 0)
	move.b  #0xF8, (%a0)
	
	//disable watchdog
	clr.w	WCR(%a0)

	//we now need to make the first valid request of our protocol.
	//register roles: a2 = start of RAM (kept for the final SP/PC load), a3 = write cursor,
	//a1 = device data address, d1 = constant zero

	movea.l #0x00000000, %a2	//load to start of RAM (we are at end)
	movea.l	%a2, %a3
	movea.l	#0x00800000, %a1
	moveq.l #0, %d1


	move.w	#0xa005, (%a1)		//command.
	move.w	%d1, (%a1)			//data len (zero). why not a clr.w ? that seems to issue a read before the write. this is documented(ish), and we do not want that extra read here
	move.w  %d1, (0x00810000)	//trigger: a write to the second device window starts the request.
reply_wait:
	//poll the device until it returns the magic word 0xFACE that marks the start of the reply
	cmpi.w	#0xFACE, (%a1)
	bne.b	reply_wait
	//magic number seen; the next word read is the payload length
	
	move.w	(%a1), %d0		//num words enclosed in reply
	
	//enter the loop at the dbra so the body runs exactly d0 times (zero times if d0 == 0)
	bra.b	get_words_loopcheck
get_more_words:
	move.w	(%a1), (%a3)+	//copy one payload word from the device to RAM
get_words_loopcheck:
	dbra	%d0, get_more_words
	
	//we got it - set its sp and pc and go there
	//a2 still points at the start of RAM: longword 0 is the stack pointer, longword 1 is the entry address
	movea.l	(%a2)+, %a7
	movea.l	(%a2), %a2
	jmp		(%a2)


__load_end: