#include <windows.h>
#include "stdint.h"

//Boolean constants used throughout this file (no <stdbool.h> is included here)
#define true		(1)
#define false		(0)


//Flag bits passed to CacheSync() and CacheRangeFlush() below; they may be OR-ed together
#define CACHE_SYNC_DISCARD      0x001   /* write back & discard all cached data */
#define CACHE_SYNC_INSTRUCTIONS 0x002   /* discard all cached instructions */
#define CACHE_SYNC_WRITEBACK    0x004   /* write back but don't discard data cache*/
#define CACHE_SYNC_FLUSH_I_TLB  0x008   /* flush I-TLB */
#define CACHE_SYNC_FLUSH_D_TLB  0x010   /* flush D-TLB */
#define CACHE_SYNC_FLUSH_TLB    (CACHE_SYNC_FLUSH_I_TLB|CACHE_SYNC_FLUSH_D_TLB)    /* flush all TLB */
#define CACHE_SYNC_L2_WRITEBACK 0x020   /* write-back L2 Cache */
#define CACHE_SYNC_L2_DISCARD   0x040   /* discard L2 Cache */

//Prototypes written out by hand instead of coming from a header.
//NOTE(review): presumably these are OS exports that the SDK headers in use do not declare - confirm.
//VirtualCopy is used below to map physical RAM into this process, the other two to flush caches/TLBs.
BOOL WINAPI VirtualCopy( LPVOID lpvDest, LPVOID lpvSrc, DWORD cbSize, DWORD fdwProtect );
void WINAPI CacheRangeFlush (LPVOID pAddr, DWORD dwLength, DWORD dwFlags);
void WINAPI CacheSync(int flags);


//Target selection: uncomment to build for the emulator, leave commented out for the Axim
//#define EMULATOR

//DEVICE_RAM_BASE: physical address at which device RAM starts (used as the source of the physical mapping)
//TTBR_OFFSET: byte offset from the start of RAM to the first-level translation table that gets walked/patched
#ifdef EMULATOR		//emulator

	#define DEVICE_RAM_BASE				0x30000000
	#define TTBR_OFFSET					0x220000

#else		//Axim

	#define DEVICE_RAM_BASE				0xa0000000
	#define TTBR_OFFSET					0x080000

#endif

//NOTE(review): 04000 is an OCTAL literal (2048 decimal, 0x800). If 0x4000 was intended this value is wrong.
//The macro is not referenced anywhere in the code visible here, so it is left untouched - confirm before use.
#define VECTOR_PAGE_OFFST			(TTBR_OFFSET + 04000)
//Amount of RAM, in megabytes, that is mapped and considered valid for page tables
#define DEVICE_RAM_MEGABYTES		64

//Physical base address of the FFUART register block
#define PXA_BASE_FFUART		0x40100000

/*
 * Register layout of a PXA UART. Every register occupies one DWORD; the
 * offsets in the comments assume a 4-byte DWORD and no padding.
 * Registers that share an address are overlaid using anonymous unions.
 */
struct PxaUart {
	union {						//0x00
		volatile DWORD RBR;		//RO
		volatile DWORD THR;		//WO
		volatile DWORD DLL;		//when DLAB is on
	};
	union {						//0x04
		volatile DWORD IER;
		volatile DWORD DLH;		//when DLAB is on
	};
	union {						//0x08
		volatile DWORD IIR;		//RO
		volatile DWORD FCR;		//WO
	};
	volatile DWORD LCR;			//0x0c
	volatile DWORD MCR;			//0x10
	volatile DWORD LSR;			//0x14
	volatile DWORD MSR;			//0x18
	volatile DWORD SPR;			//0x1c
	volatile DWORD ISR;			//0x20

	//HWUART only:
	volatile DWORD FOR;			//0x24
	volatile DWORD ABR;			//0x28
	volatile DWORD ACR;			//0x2c
};

//Physical base address of the DMA controller register block
#define PXA_BASE_DMA		0x40000000

/*
 * Register layout of the PXA DMA controller. The rfuN arrays are padding
 * that keeps the following registers at their offsets; the offsets in the
 * comments assume a 4-byte DWORD and no padding.
 */
struct PxaDMA {
	volatile DWORD DCSR[16];		//0x000, indexed by channel
	DWORD rfu1[44];					//0x040, gap
	volatile DWORD DINT;			//0x0f0
	DWORD rfu2[3];					//0x0f4, gap
	volatile DWORD DRCMR[40];		//0x100
	DWORD rfu3[24];					//0x1a0, gap
	struct PxaDMAch {				//0x200, four registers per channel
		volatile DWORD DDADR;
		volatile DWORD DSADR;
		volatile DWORD DTADR;
		volatile DWORD DCMD;
	} ch[16];
};

//Physical base address of the OS timer register block
#define PXA_BASE_OSTMR		0x40a00000

/*
 * Register layout of the PXA OS timer.
 * Offsets in the comments assume a 4-byte DWORD and no padding.
 */
struct PxaOsTimer {
	volatile DWORD OSMR[4];		//0x00
	volatile DWORD OSCR;		//0x10
	volatile DWORD OSSR;		//0x14
	volatile DWORD OWER;		//0x18
	volatile DWORD OIER;		//0x1c
};

//Physical base address of the interrupt controller register block
#define PXA_BASE_IC			0x40d00000

/*
 * Register layout of the PXA interrupt controller.
 * Offsets in the comments assume a 4-byte DWORD and no padding.
 */
struct PxaIc {
	volatile DWORD ICIP;		//0x00
	volatile DWORD ICMR;		//0x04
	volatile DWORD ICLR;		//0x08
	volatile DWORD ICFP;		//0x0c
	volatile DWORD ICPR;		//0x10
	volatile DWORD ICCR;		//0x14
};

//Physical base address of the GPIO register block
#define PXA_BASE_GPIO		0x40e00000
/*
 * Register layout of the PXA GPIO block. Each register group is an array of
 * DWORDs; offsets in the comments assume a 4-byte DWORD and no padding.
 * WinMain indexes these as [pin / 32] (and GAFR as 2 bits per pin, 16 pins per DWORD).
 */
struct PxaGpio {
	volatile DWORD GPLR[3];	//0x00
	volatile DWORD GPDR[3];	//0x0c
	volatile DWORD GPSR[3];	//0x18
	volatile DWORD GPCR[3];	//0x24
	volatile DWORD GRER[3];	//0x30
	volatile DWORD GFER[3];	//0x3c
	volatile DWORD GEDR[3];	//0x48
	volatile DWORD GAFR[6];	//0x54
};

//Physical base address of the clock manager register block
#define PXA_BASE_CLK		0x41300000

/*
 * Register layout of the PXA clock manager.
 * Offsets in the comments assume a 4-byte DWORD and no padding.
 */
struct PxaClockMgr {
	volatile DWORD CCCR;		//0x00
	volatile DWORD CKEN;		//0x04
	volatile DWORD OSCC;		//0x08
};



/*
 * Show a printf-style formatted message in an informational message box.
 *
 * fmt: wide-character format string; the remaining arguments are its values.
 *
 * The text is built in a static buffer of 1024 wide characters. Output that
 * does not fit is truncated instead of running past the end of the buffer.
 * Because the buffer is static, the function is not reentrant.
 */
static void pr(const wchar_t *fmt, ...)
{
	static wchar_t x[1024];
	va_list vl;

	va_start(vl, fmt);
	//bounded formatting; one element is held back because _vsnwprintf does not terminate on truncation
	_vsnwprintf(x, sizeof(x) / sizeof(*x) - 1, fmt, vl);
	va_end(vl);
	x[sizeof(x) / sizeof(*x) - 1] = 0;

	MessageBoxW(0, x, L"rePalm Loader", MB_OK | MB_ICONINFORMATION);
}

static void err_(const wchar_t *str)
{
	DWORD err = GetLastError();
	static wchar_t x[1024];
	LPVOID lpMsgBuf;

	FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |  FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, 0, (LPTSTR) &lpMsgBuf, 0, NULL);

	swprintf(x, L"%s, err=0x%08x (%s)", str, err, lpMsgBuf);
	LocalFree(lpMsgBuf);
	MessageBoxW(0, x, L"rePalm Loader", MB_OK | MB_ICONERROR);
}

#define err(_str)	err_(L ## _str)

/*
 * Translate a virtual address into a physical address by walking two-level
 * page tables that live inside a mapped view of device RAM.
 *
 * ram:      pointer to the mapped view; its first byte corresponds to
 *           physical address DEVICE_RAM_BASE
 * ttbrOFst: byte offset of the first-level table within that view
 * v:        virtual address to translate
 * pP:       receives the physical address on success
 *
 * Returns true and stores the result in *pP when a mapping was found.
 * Returns false when a descriptor is marked invalid (low two bits zero) or
 * when the second-level table does not lie inside the
 * DEVICE_RAM_MEGABYTES of RAM covered by the view.
 *
 * The low two bits of a first-level entry select how it is decoded:
 *   0 - no mapping
 *   1 - second-level table, 1KB aligned, indexed by address bits 19..12
 *   2 - 1MB mapping, resolved directly
 *   3 - second-level table, 4KB aligned, indexed by address bits 19..10
 * The low two bits of a second-level entry select the page size:
 *   0 - no mapping
 *   1 - 64KB page
 *   2 - 4KB page
 *   3 - 1KB page when reached through a type-3 first-level entry,
 *       otherwise decoded as a 4KB page
 */
static BOOL v2p(void *ram, DWORD ttbrOFst, DWORD v, DWORD* pP)
{
	const char *ramBytes = (const char*)ram;
	const DWORD *firstLevel = (const DWORD*)(ramBytes + ttbrOFst);
	const DWORD *secondLevel;
	DWORD desc = firstLevel[v >> 20];
	DWORD kind = desc & 3;
	DWORD tablePa, tableIdx;
	BOOL viaFineTable;

	//pr(L"v2p(0x%08x) 1. using offset 0x%08x, entry idx 0x%03x val is 0x%08x\n", v, ttbrOFst, v >> 20, desc);

	if (kind == 0)
		return false;

	if (kind == 2) {
		//section: the entry itself carries the top 12 address bits
		*pP = (desc & 0xfff00000) | (v & 0x000fffff);
		return true;
	}

	viaFineTable = (kind == 3);
	if (viaFineTable) {
		tablePa = desc & 0xfffff000;
		tableIdx = (v & 0x000ffc00) >> 10;
	}
	else {
		tablePa = desc & 0xfffffc00;
		tableIdx = (v & 0x000ff000) >> 12;
	}

	//the second-level table can only be read if it sits inside the RAM view
	if (tablePa < DEVICE_RAM_BASE || tablePa - DEVICE_RAM_BASE >= (DEVICE_RAM_MEGABYTES << 20))
		return false;

	secondLevel = (const DWORD*)(ramBytes + (tablePa - DEVICE_RAM_BASE));
	desc = secondLevel[tableIdx];
	kind = desc & 3;

	//pr(L"v2p(0x%08x) 2. using offset 0x%08x, entry idx 0x%03x val is 0x%08x\n", v, tablePa - DEVICE_RAM_BASE, tableIdx, desc);

	if (kind == 0)
		return false;

	if (kind == 1)
		*pP = (desc & 0xffff0000) | (v & 0x0000ffff);
	else if (kind == 3 && viaFineTable)
		*pP = (desc & 0xfffffc00) | (v & 0x000003ff);
	else
		*pP = (desc & 0xfffff000) | (v & 0x00000fff);

	return true;
}

//Binary blob whose bytes come from the included file.
//WinMain copies it byte by byte to address 0xfff00024 and then calls it there as a function.
static const unsigned char asm_bin[] = {
	#include "asm/asm.inc"
};


/*
 * Loader entry point. The parameters are the standard WinMain ones and are
 * not used.
 *
 * Steps, in order:
 *  1. Load "rom.bin" (at most 4MB) from the executable's directory into
 *     uncached, committed memory.
 *  2. Map DEVICE_RAM_MEGABYTES of physical RAM into this process so that the
 *     page tables at TTBR_OFFSET can be read and patched.
 *  3. Build a one-page list holding the physical address of every 4KB page
 *     of the ROM image, and find the physical address of that list.
 *  4. Patch the page tables: open up access permissions on mappings in the
 *     upper 2GB and alias the page mapped at 0xffff0000 to 0xfff00000.
 *  5. Copy asm_bin and the page list address through that alias, set up the
 *     UART pin/clock/baud rate, stop all DMA channels, send "XYZ" and jump
 *     to the copied code.
 *
 * Returns 0 in every case. On failure a message box is shown and the function
 * returns at once; anything still allocated or open is left for process exit.
 */
int WINAPI WinMain(HINSTANCE hInstance,
                   HINSTANCE hPrevInstance,
                   LPTSTR    lpCmdLine,
                   int       nCmdShow)
{
	//peripheral registers are accessed at 0xac000000 + (physical address - 0x40000000)
	//NOTE(review): presumably the OS keeps an uncached mapping of the peripheral space there - confirm
	struct PxaClockMgr *clk = (struct PxaClockMgr*)(0xac000000 + PXA_BASE_CLK - 0x40000000);
	struct PxaUart *uart = (struct PxaUart*)(0xac000000 + PXA_BASE_FFUART - 0x40000000);
	struct PxaGpio *gpio = (struct PxaGpio*)(0xac000000 + PXA_BASE_GPIO - 0x40000000);
	struct PxaDMA *dma = (struct PxaDMA*)(0xac000000 + PXA_BASE_DMA - 0x40000000);
	DWORD romSz, romSzHi = 0, gotSz = 0, i, j, *pagemap, pa;
	unsigned char *ramView, *romImg;
	void *ramViewOrig;
	wchar_t path[1024];
	int idx, last = -1;
	HANDLE romF;

	//build the path of rom.bin: take our own path and replace everything after the last backslash
	if (!GetModuleFileName(NULL, path, sizeof(path) / sizeof(*path))) {
		err("Cannot figure out the path");
		return 0;
	}
	path[sizeof(path) / sizeof(*path) - 1] = 0;	//in case a truncated result was left unterminated
	//the bound is the element count, not the byte size, of the buffer
	for (idx = 0; idx < (int)(sizeof(path) / sizeof(*path)) && path[idx]; idx++)
		if (path[idx] == '\\')
			last = idx;
	//the file name, including its terminator, must fit behind the last backslash
	if (last == -1 || last + 1 + sizeof(L"rom.bin") / sizeof(wchar_t) > sizeof(path) / sizeof(*path)) {
		err("Cannot figure out the path");
		return 0;
	}
	wcscpy(path + last + 1, L"rom.bin");

	romF = CreateFile(path, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
	if (romF == INVALID_HANDLE_VALUE) {
		err("Cannot open rom file");
		return 0;
	}

	ramViewOrig = VirtualAlloc(0, ((DEVICE_RAM_MEGABYTES + 1 /* for rounding */) << 20) - 1, MEM_RESERVE, PAGE_NOACCESS);
	if (!ramViewOrig) {
		err("Cannot allocate VA for RAM view image");
		return 0;
	}
	//round up to MB
	ramView = (unsigned char*)((((DWORD)ramViewOrig) + (1 << 20) - 1) &~ ((1 << 20) - 1));

	romSz = GetFileSize(romF, &romSzHi);
	if (romSzHi || romSz > (4 << 20)) {		//we can only support 4M: the page list below is a single page
		err("Rom file size invalid");
		return 0;
	}

	//alloc a large chunk of VA (>= 32MB) to make sure it is not in the FCSE area
	romImg = VirtualAlloc(0, 32 << 20, MEM_RESERVE, PAGE_NOACCESS);
	if (!romImg) {
		err("Cannot allocate VA for ROM image");
		return 0;
	}
	romImg = VirtualAlloc(romImg, 0x401000 /* 4M for rom and 4K for page list */, MEM_COMMIT, PAGE_EXECUTE_READWRITE | PAGE_NOCACHE);
	if (!romImg) {
		err("Cannot commit pages for ROM image");
		return 0;
	}

	//a short read would silently boot a truncated image, so treat it as a failure too
	if (!ReadFile(romF, romImg, romSz, &gotSz, NULL) || gotSz != romSz) {
		err("Cannot read ROM image");
		return 0;
	}
	CloseHandle(romF);

	//we map RAM 1MB at a time to avoid weird runtime mapping restrictions ("no crossing 32MB VA boundary")
	//with PAGE_PHYSICAL the source argument is the physical address shifted right by 8
	for (i = 0; i < DEVICE_RAM_MEGABYTES; i++) {
		if (!VirtualCopy(ramView + (i << 20), (void*)((DEVICE_RAM_BASE + (i << 20)) >> 8), 1 << 20, PAGE_EXECUTE_READWRITE | PAGE_NOCACHE | PAGE_PHYSICAL)) {
			err("Cannot map RAM");
			return 0;
		}
	}

	//pagemap is precisely one page (4096 bytes = 1024 entries, each pointing to one 4096 byte page), thus it describes 4M
	pagemap = (DWORD*)(romImg + 0x400000);
	for (i = 0; i < 0x400000; i += 4096) {

		//try a few times
		for (j = 0, pa = 0; j < 128 && !pa; j++) {
			//page it in
			(void)*(volatile DWORD*)(romImg + i);

			if (!v2p(ramView, TTBR_OFFSET, (DWORD)(romImg + i), &pa))
				pa = 0;
		}
		if (!pa) {
			pr(L"Cannot get phys page for a rom image page at offset 0x%08x", i);
			return 0;
		}
		*pagemap++ = pa;
	}

	//get the pa of the pagemap (step back over the 1024 entries just written, then touch it to page it in)
	pagemap -= 1024;
	(void)*(volatile DWORD*)pagemap;
	if (!v2p(ramView, TTBR_OFFSET, (DWORD)pagemap, &pa)) {
		err("cannot find PA of pagemap\n");
		return 0;
	}
//	pr(L"pagemap at 0x%08x", pa);

	//remap all high mappings to have AP = 3
	//the first-level table has 4096 entries (one per MB), so the last valid index is 4095
	//NOTE(review): the test on the low 4 bits only matches section entries with bits 2 and 3 clear - confirm that is intended
	for (j = 0, i = 2048; i < 4096; i++) {
		volatile DWORD *pte = ((volatile DWORD*)(ramView + TTBR_OFFSET)) + i;

		if ((pte[0] & 0x0f) == 2) {
			pte[0] |= 0xc00;
			j++;
		}
	}
	//a syscall to be sure
	CacheSync(CACHE_SYNC_FLUSH_TLB);

//	pr(L"adjusted %u mappings\n", j);

	//find second level pagetable for 0xfff00000
	i = ((volatile DWORD*)(ramView + TTBR_OFFSET))[0xfff];
	if ((i & 3) != 1) {
		err("Second level for 0xfff00000 not as expected");
		return 0;
	}
	i &= 0xfffffc00;
	//the table must be inside the RAM we mapped, or we could not touch it through ramView
	if (i < DEVICE_RAM_BASE || i - DEVICE_RAM_BASE >= (DEVICE_RAM_MEGABYTES << 20)) {
		err("Second level for 0xfff00000 not as expected");
		return 0;
	}
	i -= DEVICE_RAM_BASE;

	//verify first page mapping is empty
	j = ((volatile DWORD*)(ramView + i))[0];
	if (j & 3) {
		err("Second level for 0xfff00000 not as expected");
		return 0;
	}

	//verify 240th page (0xffff0000) is mapped as a small page
	j = ((volatile DWORD*)(ramView + i))[0xf0];
	if ((j & 3) != 2) {
		err("Second level for 0xffff0000 not as expected");
		return 0;
	}

	//map that same page at 0xfff00000
	j |= 0xff0;		//accessible by all
	j &=~ 0x00c;	//not cached or buffered
	((volatile DWORD*)(ramView + i))[0] = j;

	//flush the TLB
	CacheSync(CACHE_SYNC_FLUSH_TLB);

	//copy our code to there
	for (i = 0; i < sizeof(asm_bin); i++)
		*(volatile char*)(0xfff00024 + i) = asm_bin[i];
	//write page map pointer
	*(volatile DWORD*)0xfff00020 = pa;

	//flush the cache so that vector page also sees the write
	CacheRangeFlush((void*)0xfff00000, 0x00100000, CACHE_SYNC_INSTRUCTIONS | CACHE_SYNC_DISCARD);

	//prepare
//	pr(L"preparing hardware");
	//GPIO 39: output, alternate function 2
	gpio->GPDR[1] |= 1 << (39 - 32);
	gpio->GAFR[2] &=~ (3 << ((39 - 32) * 2));
	gpio->GAFR[2] |= 2 << ((39 - 32) * 2);

	clk->CKEN |= 1<< 6;

	uart->LCR = 0;
	uart->IER = 0;
	uart->FCR = 0;
	uart->LCR = 0x80;	//DLAB
	uart->DLL = 24;		//38400 (assuming the 14.7456MHz UART clock)
	uart->DLH = 0;
	uart->LCR = 0x07;	//DLAB off, 8n2
	uart->IER = 0x40;	//on

//	pr(L"killing dmas\n");
	for (i = 0; i < 16; i++) {
		dma->DCSR[i] = 0;
		//wait for THIS channel to report bit 3 (STOPSTATE in the PXA25x manual)
		while(!(dma->DCSR[i] & 8));
	}

//	pr(L"testing uart\n");
	uart->THR = 'X';
	while (!(uart->LSR & 0x40));
	uart->THR = 'Y';
	while (!(uart->LSR & 0x40));
	uart->THR = 'Z';
	while (!(uart->LSR & 0x40));

//verify we can see our code and run it
//	pr(L"Booting now");
	i = ((DWORD (*)(DWORD))0xfff00024)(pa);
	pr(L"You should not see this\n", i);

	return 0;
}
