#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include "boot.h"
#include "emuJitInternal.h"
#include "memmap.h"
#include "kernel.h"
#include "printf.h"
#include "disasm.h"
#include "emit.h"
#include "emu.h"
#include "dal.h"
#include "ral.h"
#include "kal.h"

//fast plz
#pragma GCC push_options
#pragma GCC optimize("Ofast")


///XXX: TODO: pattern match fpu code

#define TC_SIZE_INITIAL					1024	//in halfwords
#define TC_SIZE_MAX						65536	//in halfwords

#define TC_HASH_ENTRY_SZ_LOG			9		//log2 of the number of hash buckets (JitTls.hashes has 1 << this many entries)

#define JUMP_TO_EXTANT_XLATIONS					//if while translating a TU we meet an address we've already translated, jump to that xlation. likely saves TC space at a perf cost
#define SUPPORT_ICACHE_FLUSH					//costs us TC space and speed

#define MAX_SWITCH_SIZE					32 		//switches of this many cases or fewer will be optimized. Do NOT set over 256 ever (breaks decoding). Also anything over 72 will break our way to encode TBB

#ifndef FLUSH_GROW_TICKS
	#define FLUSH_GROW_TICKS			500		//if fewer than this many ticks pass between a flush and the next flush, TC will grow
#endif


//translate settings
#ifndef MAX_LITERALS_BEFORE_SPILL
	#define MAX_LITERALS_BEFORE_SPILL	32		//if we accumulate this many literals, spill them now
#endif

#define SHORT_LOOP_OPT_SZ				32		//we handle backwards jumps for short loops of this size internally by tracking addrs - helps with hot loops
#define FORWARD_JUMP_CACHE_SZ			16		//number of entries in JitTls.fwdJumps
#define MAX_LITERALS_RANGE				256		//max halfwords from the first literal load to now before we spill
#define MAX_IN_TU_FWD_JUMP_LEN			128		//if a forward jump is over this many instrs forward, do not assume it might be in the same TU
#define SHORT_LOOP_OFST_INVALID			0xFFFF	//marks an offset we cannot jump to (like the middle of a peephole-produced sequence)

//encodings of the narrow (16-bit) UDF and the two halfwords of the wide (32-bit) UDF; the UDF_VAL_* values below are presumably carried in the UDF immediate - TODO confirm against the emitter
#define INSTR_UDF_N						0xde00
#define INSTR_UDF_W_1					0xf7f0
#define INSTR_UDF_W_2					0xa000

#define UDF_VAL_UNTRANSLATEABLE	0x0080	//set if we cannot at all translate an instr
#define UDF_VAL_DELETED_TU		0x0081	//first instr of a deleted TU. if encountered, translate the TU's addr anew, then replace this instr with a jump to the new TU (at least 2 halfwords in a TU are guaranteed)
#define UDF_VAL_LITERAL_LOAD	0x0082	//literal load placeholder - we should never hit this for real

//two-level stringification so that macro arguments are expanded before being turned into a string
#define STRINGIFY2(x)	#x
#define STRINGIFY(x)	STRINGIFY2(x)

//HASH_ADDRESS(dst, src): asm text that computes the hash-bucket index for address "src" into "dst".
//Both variants produce bits [2 .. 2 + TC_HASH_ENTRY_SZ_LOG - 1] of (src ^ (src >> 12)):
// - Thumb-1: shift, xor, then isolate the bit field with a left/right shift pair
// - otherwise: one shifted-operand EORS followed by UBFX
//"dst" and "src" must be different registers (dst is written before src is last read).
//Both variants use flag-setting instructions, so any asm statement using this must treat the condition flags as clobbered.
#ifdef BUILD_FOR_THUMB_1

	#define HASH_ADDRESS(dst, src)																	\
			" lsrs " #dst ", " #src ", #12											\n\t"			\
			" eors " #dst ", " #src "												\n\t"			\
			" lsls " #dst ", " #dst ", #30 - " STRINGIFY(TC_HASH_ENTRY_SZ_LOG) "	\n\t"			\
			" lsrs " #dst ", " #dst ", #32 - " STRINGIFY(TC_HASH_ENTRY_SZ_LOG) "	\n\t"
#else

	#define HASH_ADDRESS(dst, src)																	\
			" eors " #dst ", " #src ", " #src ", LSR #12							\n\t"			\
			" ubfx " #dst ", " #dst ", #2, " STRINGIFY(TC_HASH_ENTRY_SZ_LOG) "		\n\t"

#endif

//one entry of the forward-jump cache (JitTls.fwdJumps, FORWARD_JUMP_CACHE_SZ entries)
//presumably describes a forward branch whose target had not been translated yet when the branch was emitted - TODO confirm against the translator
struct FwdJumpInfo {
	uint16_t loc;	//ofst in halfwords from start of tu
	uint16_t dest;	//ofst in words from addr where tu started translating
	uint16_t curLen	:12;
	uint16_t cc		:4;	//to save space we stuff cc in here (shares a halfword with curLen)
};

//one pending literal-pool entry (JitTls.litPool, MAX_LITERALS_BEFORE_SPILL entries)
struct LiteralPoolEntry {	//we do not expect literals to be reused so no point allowing multiple places to refer to one literal
	union {
		uint32_t value;						//the literal itself (when "identical" is clear)
		uint32_t idxOfIdenticalLiteral;		//index of the entry that carries the same value (when "identical" is set)
	};
	uint32_t loc		: 27;		//ofst in halfwords from start of tu
	uint32_t identical	: 1;		//if set, idxOfIdenticalLiteral is relevant and no word needs to be emitted
	uint32_t reg		: 4;		//reg
};

//logic in jitPrvClassifyAddressRange assumes these go in ascending order of difficulty
//these values are stored in the 2-bit TU.srcType field, so they must stay in the range 0..3
#define TU_SOURCE_ROM		0
#define TU_SOURCE_STORAGE	1
#define TU_SOURCE_RAM		2
#define TU_SOURCE_UNKNOWN	3

//header of one translated unit in the TC; the generated code follows the header directly
struct TU {	//must be an even number of bytes in size
	//first 2 members MUST be in this order - asm code in "emuCustomUsageFaultHandler" uses it
	uint32_t baseAddr;			//address this TU was translated from; may not be aligned (jitTuHashSource masks off the low two bits)
	struct TU* next;			//presumably the next TU in the same hash bucket - TODO confirm

#ifdef SUPPORT_ICACHE_FLUSH
	uint32_t srcHash;			//presumably the value jitTuHashSource() produced for the source words - TODO confirm
	union {
		struct {
			uint16_t srcType : 2;			//TU_SOURCE_*, we assume the compiler will put this into lower bits
			uint16_t srcLen	 : 14;			//IN WORDS!
		};
		uint16_t srcLenWord;				//both bitfields as one halfword; jitTuHashSource loads this and shifts right by 2 to get srcLen
	};
#endif

	//the offset to this member is also used in asm code in "emuCustomUsageFaultHandler" but it is parametrized and thus moves if we change the above
	uint16_t code[];			//generated code
};

//per-thread JIT state, stored in TLS slot TLS_SLOT_JIT_STATE. one allocation holds this header and the translation cache (tc[]) behind it
struct JitTls {
	struct TU *hashes[1 << TC_HASH_ENTRY_SZ_LOG];		//must be first (asm assumes it). presumably indexed by the HASH_ADDRESS bucket index - TODO confirm
	
	//special state
	struct JitBackendRuntimeData backendRuntimeData;	//set up by jitStateBackendInit()
	
	uint32_t tcLevel, tcSize;	//in halfwords. tcSize is the capacity of tc[] (set by jitStateInit); tcLevel is presumably the amount in use - TODO confirm
	uint32_t lastFlushTicks;	//jitGetTicks() value recorded by jitStateInit; compare FLUSH_GROW_TICKS
		
	#ifdef SUPPORT_ICACHE_FLUSH
		uint64_t lastFlushCount;			//copy of mCurFlushCount taken by jitStateInit
		uint64_t lastStorageTaintCount;		//compared against mStorageTaintCount in emuJitInstrCacheClearAll
	#endif
	
	//flags
	uint16_t allocingNewOneCurrently	:1;		//set before a call to system's allocation funcs to avoid recursive calls to them
	uint16_t permanent					:1;		//never free or resize this one!
	
	//translate state (better here than on stack)
	uint16_t literalIdx					: 14;
	struct LiteralPoolEntry litPool[MAX_LITERALS_BEFORE_SPILL];
	struct FwdJumpInfo fwdJumps[FORWARD_JUMP_CACHE_SZ];
	uint16_t shortLoopOfsts[SHORT_LOOP_OPT_SZ];				//we assume that all offsets fit into 16 bits
	
	//the TC (must be 4-byte aligned)
	uint16_t tc[];
};


#ifdef SUPPORT_ICACHE_FLUSH
	//global (not per-thread) bookkeeping for instruction-cache flush support
	static uint64_t mCurFlushCount = 0;			//each JitTls snapshots this into lastFlushCount when initialized
	static uint64_t mStorageTaintCount = 0;		//emuJitInstrCacheClearAll compares this against a state's lastStorageTaintCount
	static uint8_t mStorageWriteable = false;	//when nonzero, emuJitInstrCacheClearAll always validates storage-sourced code
#endif

static enum EmitStatus jitPrvSpillLiterals(struct JitTls* state, struct EmitBuf *dest, bool jumpOver);

//map a source address to its hash-bucket index (0 .. (1 << TC_HASH_ENTRY_SZ_LOG) - 1)
//this uses the same HASH_ADDRESS asm text as the rest of the JIT so that the C side and the
//asm side can never disagree about which bucket an address belongs to
//  addr - address to hash
//returns bits [2 .. 2 + TC_HASH_ENTRY_SZ_LOG - 1] of (addr ^ (addr >> 12))
static __attribute__((pure)) uint32_t jitTuHashAddress(uint32_t addr)
{
	uint32_t hashIdx;
	
	asm (
		HASH_ADDRESS(%0, %1)
		: "=&l"(hashIdx)		//early-clobber: the output is written before the input is last read
		: "l"(addr)
		: "cc"					//every HASH_ADDRESS variant uses flag-setting instrs (lsrs/eors/lsls)
	);
	
	return hashIdx;
}

#ifdef SUPPORT_ICACHE_FLUSH
	
	//base addr may not be aligned (see "arm v4 BX PC quirk")
	
	#ifdef BUILD_FOR_THUMB_1
		
		//hash the source words a TU was translated from (Thumb-1 flavour)
		//  tu - TU whose baseAddr and srcLenWord describe the source range
		//returns (in r0) the hash: start with 0x177, then for every 32-bit source word do
		//  hash = (hash ^ word) * 0x01000193
		//the source pointer is baseAddr with the low two bits cleared; the word count is srcLenWord >> 2,
		//which relies on srcType occupying the two low bits of that halfword
		//the function is naked: the asm saves/restores r4-r7 and returns by itself
		//NOTE(review): GCC documents that only basic asm is supported inside naked functions; this body
		//uses extended asm with operands plus a C "return" - it evidently works with the toolchain in use, but verify when upgrading compilers
		static uint32_t __attribute__((naked)) jitTuHashSource(struct TU *tu)
		{
			asm volatile(
				".syntax unified					\n\t"
				"	push    {r4-r7}					\n\t"
				"	ldr     r1, [r0, %0]			\n\t"	//get base address
				"	lsrs    r1, #2					\n\t"	//clear bottom two bits
				"	lsls    r1, #2					\n\t"
				"	ldrh    r2, [r0, %1]			\n\t"	//get length (in words)
				"	lsrs    r2, #2					\n\t"	//remove "source" from the length word
			
			#ifdef HAVE_v8M_BASE
				"	movw    r0, #0x177				\n\t"	//prepare hash value
				"	movw    r3, #0x0193				\n\t"	//get multiplicand
				"	movt    r3, #0x0100				\n\t"	//get multiplicand (pt 2)
			#else
				"	ldr     r0, =0x177				\n\t"	//prepare hash value
				"	ldr     r3, =0x01000193			\n\t"	//get multiplicand
			#endif
				"1:									\n\t"	//big loop
				"	subs    r2, #4					\n\t"	//do we have 4+ words left?
				"	bmi     2f						\n\t"	//no? go to short loop
				"	ldmia   r1!, {r4-r7}			\n\t"	//get 4 words
				"	eors    r0, r4					\n\t"	//hash in word 0
				"	muls    r0, r3					\n\t"
				"	eors    r0, r5					\n\t"	//hash in word 1
				"	muls    r0, r3					\n\t"
				"	eors    r0, r6					\n\t"	//hash in word 2
				"	muls    r0, r3					\n\t"
				"	eors    r0, r7					\n\t"	//hash in word 3
				"	muls    r0, r3					\n\t"
				"	b       1b						\n\t"	//next loop!
				
				"2:									\n\t"	//finish up
				"	adds    r2, #4					\n\t"	//r2 is 0..3
				"	lsls    r2, #4					\n\t"	//dispatch based on how many we have left to do (8 instrs per option)
				"	add     pc, r2					\n\t"	//computed jump: pc reads 4 bytes ahead of this instr, hence the single nop of padding before the table
				"	nop								\n\t"
				"left_0_units:						\n\t"	//each table entry below is padded with nops to exactly 8 halfword instrs (the last one needs no padding)
				"	pop     {r4-r7}					\n\t"
				"	bx      lr						\n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"left_1_unit:						\n\t"
				"	ldr     r4, [r1]				\n\t"
				"	eors    r0, r4					\n\t"
				"	muls    r0, r3					\n\t"
				"	pop     {r4-r7}					\n\t"
				"	bx      lr						\n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"	nop                             \n\t"
				"left_2_units:						\n\t"
				"	ldmia   r1!, {r4-r5}			\n\t"	//get 2 words
				"	eors    r0, r4					\n\t"
				"	muls    r0, r3					\n\t"
				"	eors    r0, r5					\n\t"
				"	muls    r0, r3					\n\t"
				"	pop     {r4-r7}					\n\t"
				"	bx      lr						\n\t"
				"	nop                             \n\t"
				"left_3_units:						\n\t"
				"	ldmia   r1!, {r4-r6}			\n\t"	//get 3 words
				"	eors    r0, r4					\n\t"
				"	muls    r0, r3					\n\t"
				"	eors    r0, r5					\n\t"
				"	muls    r0, r3					\n\t"
				"	eors    r0, r6					\n\t"
				"	muls    r0, r3					\n\t"
				"	pop     {r4-r7}					\n\t"
				"	bx      lr						\n\t"
				".ltorg								\n\t"	//literal pool for the "ldr rX, =..." loads above
			:
			:"I"(offsetof(struct TU, baseAddr)), "I"(offsetof(struct TU, srcLenWord))
			:"cc","r0","r1","r2","r3","lr"
			);
			
			return 0;	//shut gcc up
		}
		
	#else
	
		//hash the source words a TU was translated from (non-Thumb-1 flavour)
		//  tu - TU whose baseAddr and srcLenWord describe the source range
		//returns (in r0) the hash: start with 0x177, then for every 32-bit source word do
		//  hash = (hash ^ word) * 0x01000193
		//the source pointer is baseAddr with the low two bits cleared; the word count is srcLenWord >> 2,
		//which relies on srcType occupying the two low bits of that halfword
		//the function is naked: the asm saves/restores r4-r7 and returns by itself
		//NOTE(review): GCC documents that only basic asm is supported inside naked functions; this body
		//uses extended asm with operands plus a C "return" - it evidently works with the toolchain in use, but verify when upgrading compilers
		static uint32_t __attribute__((naked)) jitTuHashSource(struct TU *tu)
		{
			asm volatile(
				".syntax unified					\n\t"
				"	push    {r4-r7}					\n\t"
				"	ldr     r1, [r0, %0]			\n\t"	//get base address
				"	bic     r1, #3					\n\t"	//clear bottom two bits
				"	ldrh    r2, [r0, %1]			\n\t"	//get length (in words)
				"	lsrs    r2, #2					\n\t"	//remove "source" from the length word
				"	mov     r0, #0x177				\n\t"	//prepare hash value
				"	ldr     r3, =0x01000193			\n\t"	//get multiplicand
				
				"1:									\n\t"	//big loop
				"	subs    r2, #4					\n\t"	//do we have 4+ words left?
				"	bmi     2f						\n\t"	//no? go to short loop
				"	ldmia   r1!, {r4-r7}			\n\t"	//get 4 words
				"	eors    r0, r4					\n\t"	//hash in word 0
				"	muls    r0, r3					\n\t"
				"	eors    r0, r5					\n\t"	//hash in word 1
				"	muls    r0, r3					\n\t"
				"	eors    r0, r6					\n\t"	//hash in word 2
				"	muls    r0, r3					\n\t"
				"	eors    r0, r7					\n\t"	//hash in word 3
				"	muls    r0, r3					\n\t"
				"	b       1b						\n\t"	//next loop!
				
				"2:									\n\t"	//finish up
				"	lsls    r2, #31					\n\t"	//low two bits of r2 are the leftover word count: bit 1 goes to C, bit 0 goes to N
				"	bcc     3f						\n\t"	//C clear? no pair of words to do
				"	ldmia   r1!, {r4-r5}			\n\t"
				"	itttt   cs						\n\t"	//faster than using long instrs next, and we need flags preserved. itttt al would work too, but gcc chokes on it
				"	eorcs   r0, r4					\n\t"
				"	mulcs   r0, r3					\n\t"
				"	eorcs   r0, r5					\n\t"
				"	mulcs   r0, r3					\n\t"
				
				"3:									\n\t"
				"	ittt    mi						\n\t"	//N set? one last word to do
				"	ldrmi   r4, [r1]				\n\t"
				"	eormi   r0, r4					\n\t"
				"	mulmi   r0, r3					\n\t"
				
				"	pop     {r4-r7}					\n\t"
				"	bx      lr						\n\t"
				".ltorg								\n\t"	//literal pool for the "ldr r3, =..." load above
			:
			:"I"(offsetof(struct TU, baseAddr)), "I"(offsetof(struct TU, srcLenWord))
			:"cc","r0","r1","r2","r3","lr"
			);
			
			return 0;	//shut gcc up
		}
	#endif
#endif

//a safe way to get current ticks
//current system time in ticks, safe to use from inside the JIT
//we go to the DAL directly: any call into the system proper could itself trigger more
//translation and thus loop back into us
static uint32_t jitGetTicks(void)
{
	const uint32_t ticksNow = HALTimeGetSystemTime();
	
	return ticksNow;
}

//initialize a freshly allocated JIT state that occupies "allocSz" bytes: zero all of it, record how
//many halfwords of TC follow the header, and stamp it with the current ticks (and, with
//SUPPORT_ICACHE_FLUSH, with the current global flush count)
//  state   - memory to initialize, at least allocSz bytes
//  allocSz - total size of the allocation in bytes, header included
//calls fatal() if allocSz cannot even hold the header
static void jitStateInit(struct JitTls* state, uint32_t allocSz)
{
	uint32_t tcSize;
	
	//both operands of the subtraction below are unsigned, so a too-small allocSz would wrap around to a
	//huge positive size instead of going negative. the check thus has to happen BEFORE we subtract
	if (allocSz < sizeof(struct JitTls))
		fatal("JIT state too small\n");
	
	tcSize = (allocSz - sizeof(struct JitTls)) / sizeof(uint16_t);
	
	memset(state, 0, allocSz);
	
	state->tcSize = tcSize;
	state->lastFlushTicks = jitGetTicks();
	#ifdef SUPPORT_ICACHE_FLUSH
		state->lastFlushCount = mCurFlushCount;
	#endif
	
	logjt("Inited a TC with %u instr slots\n", (unsigned)tcSize);

	
/*
// this is a convenient test code piece for emitHLloadImmToReg
	{
		int32_t total = 0, startAt = 0;
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}

			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n 0x%08x\n", seed);
			
			now = emitHLloadImmToReg(&buf, 5, seed, true, true, false);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for LDRD/STRD.reg
	{
		static const uint8_t nRegs[] = {0, 1, 2, 12, 13, 14, 15};
		static const uint8_t mRegs[] = {0, 1, 2, 12, 13, 14,};
		static const uint8_t tRegs[] = {0,2,10,12};
		uint32_t n, m, t, u, a, rn, rm, rt;
		uint32_t instrVal = 0xffffffff, instrAddr = (uint32_t)&instrVal;
		int32_t total = 0, startAt = 0;
		bool testStore = true;
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			n = seed % sizeof(nRegs);		seed /= sizeof(nRegs);
			m = seed % sizeof(mRegs);		seed /= sizeof(mRegs);
			t = seed % sizeof(tRegs);		seed /= sizeof(tRegs);
			u = seed % 2;					seed /= 2;
			a = seed % 3;					seed /= 3;
			
			rn = nRegs[n];
			rm = mRegs[m];
			rt = tRegs[t];
			
			//loads with wbak and reg in list arent allowed
			if (!testStore && a != EmitAdrModeIndex && (rt == rn || rt + 1 == rn))
				continue;
			//m cannot be same as dest regs
			if (!testStore && (rt == rm || rt + 1 == rm))
				continue;
			//no wbak to pc
			if (rn == 15 && a != EmitAdrModeIndex)
				continue;

			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n [%6d] 0x%08x: %sRD R%d, [R%d", total, instrAddr, testStore ? "ST" : "LD", rt, rn);
			if (a != EmitAdrModePostindex)
				pr(", %cR%d]%s\n", u ? '+' : '-', rm, (a == EmitAdrModeIndexWbak) ? "!" : "");
			else	
				pr("], %cR%d\n", u ? '+' : '-', rm);
			
			now = jitEmitRegRegMemStrd(&buf, instrAddr, rt, rn, u, rm, a);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for LDRD/STRD.imm
	{
		static const uint8_t baseRegs[] = {0, 1, 2, 12, 13, 14, 15};
		static const uint8_t tRegs[] = {0,2,10,12};
		static const uint8_t ofsts[] = {0, 0x03, 0x80, 0xff};
		uint32_t n, t, i, u, m, rn, rt;
		uint32_t instrVal = 0xffffffff, instrAddr = (uint32_t)&instrVal;
		int32_t total = 0, startAt = 0;
		bool testStore = false;
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			n = seed % sizeof(baseRegs);	seed /= sizeof(baseRegs);
			t = seed % sizeof(tRegs);		seed /= sizeof(tRegs);
			i = seed % sizeof(ofsts);		seed /= sizeof(ofsts);
			u = seed % 2;					seed /= 2;
			m = seed % 3;					seed /= 3;
			
			rn = baseRegs[n];
			rt = tRegs[t];
			i = ofsts[i];
			if (!u)
				i = -i;
			
			//loads with wbak and reg in list arent allowed
			if (!testStore && m != EmitAdrModeIndex && (rt == rn || rt + 1 == rn))
				continue;
			//sp ofsts need to be word aligned
			if (rn == EMIT_REG_NO_SP && (i & 3))
				continue;

			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n [%6d] 0x%08x: %sRD R%d, [R%d", total, instrAddr, testStore ? "ST" : "LD", rt, rn);
			if (m != EmitAdrModePostindex)
				pr(", #%d]%s\n", i, (m == EmitAdrModeIndexWbak) ? "!" : "");
			else	
				pr("], #%d\n", i);
			
			now = jitEmitImmMemLdrd(&buf, instrAddr, rt, rn, i, m);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for LDM/STM. kept here for future use
	{
		static const uint8_t baseRegs[] = {0,2,11,12,13,14,};
		uint32_t i, j, k, l, m, n, w, rn, mask;
		static const char *funcSuffix = "DA";
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		bool testStore = false;
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(baseRegs);	seed /= sizeof(baseRegs);
			j = seed % 2;	seed /= 2;
			k = seed % 2;	seed /= 2;
			l = seed % 2;	seed /= 2;
			m = seed % 8;	seed /= 8;
			n = seed % 0x2000;	seed /= 0x2000;
			w = seed % 2;	seed /= 2;
			
			rn = baseRegs[i];
			mask = 0;
			
			if (j)	mask |= 0x8000;	//PC?
			if (k)	mask |= 0x4000;	//LR?
			if (l)	mask |= 0x2000;	//SP?
			
			switch (m) {
				case 0:				//no other regs
					break;
				case 1:				//ALL Regs
					mask |= 0x1fff;
					break;
				default:			//ramdon regs
					mask |= n;		//other regs
					break;
			}
			
			//loads with wbak and reg in list arent allowed
			if (!testStore && w && (mask & (1 << rn)))
				continue;
			//stores with reg in list and not lowest not allowed
			if (testStore && w && (mask & (1 << rn)) && (mask & ((1 << rn) - 1)))
				continue;
			//mask cannot be empty
			if (!mask)
				continue;

			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n[%6d] 0x%08x: %sM%s R%d%s, {", total, instrAddr, testStore ? "ST" : "LD", funcSuffix, rn, w ? "!" : "");
			for (i = 0; i < 16; i++) {
				if (mask & (1 << i))
					pr("R%d, ", i);
			}
			pr("}\n");
			
			now = jitEmitLdmda(&buf, instrAddr, rn, mask, w);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for DR reg shifted by reg xlations. kept here for future use
	{
		static const char* names[] = {"AND","EOR","SUB","RSB","ADD","ADC","SBC","RSC","TST","TEQ","CMP","CMN","ORR","MOV","BIC","MVN"};
		static const char *shiftNames[] = {"LSL", "LSR", "ASR", "ROR"};
		static uint16_t hasRd = 0b1111000011111111, hasRn = 0b0101111111111111;
		uint8_t regs[] = {0,1,2,10,11,12,13,};
		uint32_t i, j, k, l, n, r, s;
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//for (i = 0; i < sizeof(regs); i++) {								//Rd
		//	for (j = 0; j < sizeof(regs); j++) {							//Rn
		//		for (k = 0; k < sizeof(regs); k++) {						//Rm
		//			for (l = 0; l < sizeof(regs); l++) {					//Rs
		//				for (n = 0; n < 4; n++) {							//shiftType
		//					for (r = 0; r < 16; r++) {						//op
		//						for (s = 0; r < 2; s++) {					//s

		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(regs);	seed /= sizeof(regs);
			j = seed % sizeof(regs);	seed /= sizeof(regs);
			k = seed % sizeof(regs);	seed /= sizeof(regs);
			l = seed % sizeof(regs);	seed /= sizeof(regs);
			n = seed % 4;	seed /= 4;
			r = seed % 16;	seed /= 16;
			s = seed % 2;	seed /= 2;
			
			uint32_t rd = regs[i];
			uint32_t rn = regs[j];
			uint32_t rm = regs[k];
			uint32_t rs = regs[l];
			
			//skip non-S compares and S-ops to PC
			if (s && rd == 15)
				continue;
			if (!s && !(hasRd & (1 << r)))
				continue;
			
			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n[%6d] 0x%08x: %s%s ", total, instrAddr, names[r], (s && (hasRd & (1 << r))) ? "S": "");
			if (hasRd & (1 << r))
				pr("R%d, ", rd);
			if (hasRn & (1 << r))
				pr("R%d, ", rn);
			pr(" R%d, %s R%d\n", rm, shiftNames[n], rs);
			
			now = jitEmitAluOpRegShiftReg(&buf, EmitCcAl, r, instrAddr, rd, rn, rm, rs, n, s);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for DR reg shifted by imm xlations. kept here for future use
	{
		static const char* names[] = {"AND","EOR","SUB","RSB","ADD","ADC","SBC","RSC","TST","TEQ","CMP","CMN","ORR","MOV","BIC","MVN"};
		static const char *shiftNames[] = {" LSL #%d", " LSR #%d", " ASR #%d", " ROR #%d"};
		static uint16_t hasRd = 0b1111000011111111, hasRn = 0b0101111111111111;
		uint8_t regs[] = {0,1,2,10,11,12,13,15,};
		uint32_t i, j, k, n, p, r, s;
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		uint8_t shifts[] = {0, 3, 5};
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//for (i = 0; i < sizeof(regs); i++) {								//Rd
		//	for (j = 0; j < sizeof(regs); j++) {							//Rn
		//		for (k = 0; k < sizeof(regs); k++) {						//Rm
		//			for (n = 0; n < 4; n++) {								//shiftType
		//				for (p = 0; p < sizeof(shifts); p++) {				//shift amt
		//					for (r = 0; r < 16; r++) {						//op
		//						for (s = 0; r < 2; s++) {					//s

		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(regs);	seed /= sizeof(regs);
			j = seed % sizeof(regs);	seed /= sizeof(regs);
			k = seed % sizeof(regs);	seed /= sizeof(regs);
			n = seed % 4;	seed /= 4;
			p = seed % sizeof(shifts);	seed /= sizeof(shifts);
			r = seed % 16;	seed /= 16;
			s = seed % 2;	seed /= 2;
			
			uint32_t rd = regs[i];
			uint32_t rn = regs[j];
			uint32_t rm = regs[k];
			uint32_t shiftAmt = shifts[p];
			const char *shiftName = shiftNames[n];

			if (!shiftAmt) {										//skip illegal shifts
				if (n && n != 3)
					continue;
				if (n == 3)
					shiftName = " RRX";
				else
					shiftName = NULL;
			}
			
			//skip non-S compares and S-ops to PC
			if (s && rd == 15)
				continue;
			if (!s && !(hasRd & (1 << r)))
				continue;
			
			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			
			pr("\n[%6d] 0x%08x: %s%s ", total, instrAddr, names[r], (s && (hasRd & (1 << r))) ? "S": "");
			if (hasRd & (1 << r))
				pr("R%d, ", rd);
			if (hasRn & (1 << r))
				pr("R%d, ", rn);
			pr(" R%d", rm);
			if (shiftName)
				pr(shiftName, shiftAmt);
			pr("\n");
			
			now = jitEmitAluOpRegShiftImm(&buf, EmitCcAl, r, instrAddr, rd, rn, rm, n, shiftAmt, s);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for DP imm ops xlations. kept here for future use
	{
		static const char* names[] = {"AND","EOR","SUB","RSB","ADD","ADC","SBC","RSC","TST","TEQ","CMP","CMN","ORR","MOV","BIC","MVN"};
		static uint16_t hasRd = 0b1111000011111111, hasRn = 0b0101111111111111;
		static uint8_t imms[] = {0x00, 0x01, 0x40, 0x80, 0xff, };
		static uint8_t rots[] = {0, 1, 4, 8, 16, 31, };
		uint32_t seed, i, j, k, l, m, n, imm;
		uint8_t regs[] = {0,1,10,12,13,15};
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		struct EmitBuf buf;
		
		//for (i = 0; i < sizeof(regs); i++) {								//Rd
		//	for (j = 0; j < sizeof(regs); j++) {							//Rn
		//		for (k = 0; k < sizeof(imms); k++) {						//valIn
		//			for (l = 0; l < sizeof(rots); l++) {					//rotBy
		//				for (m = 0; m < 2; m++) {							//s
		//					for (n = 0; n < 16; n++) {						//op
	
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
		
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(regs);	seed /= sizeof(regs);
			j = seed % sizeof(regs);	seed /= sizeof(regs);
			k = seed % sizeof(imms);	seed /= sizeof(imms);
			l = seed % sizeof(rots);	seed /= sizeof(rots);
			m = seed % 2;	seed /= 2;
			n = seed % 16;	seed /= 16;
			
			uint32_t rd = regs[i];
			uint32_t rn = regs[j];
			uint32_t val = imms[k];
			
			//S required for instrs with no Rd
			if (!m && !(hasRd & (1 << n)))
				continue;
			
			//we do not support SPSR
			if (m && rd == 15)
				continue;
			
			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			if (rots[l])
				val = (val >> rots[l]) | (val << (32 - rots[l]));
			
			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n[%6d] 0x%08x: %s%s ", total, instrAddr, names[n], (m && (hasRd & (1 << n))) ? "S": "");
			if (hasRd & (1 << n))
				pr("R%d, ", rd);
			if (hasRn & (1 << n))
				pr("R%d, ", rn);
			pr(" #0x%02X ROR %u // == 0x%08x\n", imms[k], rots[l], val);
			
			
			if (EmitErrNone != jitEmitAluOpImm(&buf, EmitCcAl, n, instrAddr, rd, rn, imms[k], rots[l], m))
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for SWP xlations. kept here for future use
	{
		uint8_t regs[] = {0,1,2,10,11,12,13,};
		uint32_t i, j, k, l, m, n, p, q, r;
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		struct EmitBuf buf;
		uint32_t seed = 1;

		//for (i = 0; i < sizeof(regs); i++) {								//Rd
		//	for (j = 0; j < sizeof(regs); j++) {							//Rn
		//		for (k = 0; k < sizeof(regs); k++) {						//Rm
		//				for (l = 0; l < 3; l++) {							//size
		
		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
		
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(regs);	seed /= sizeof(regs);
			j = seed % sizeof(regs);	seed /= sizeof(regs);
			k = seed % sizeof(regs);	seed /= sizeof(regs);
			l = seed % 3;	seed /= 3;
			
			if (l == EmitSzHalfword)
				continue;
			
			if (j == i || j == k)
				continue;
			
			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			uint32_t rd = regs[i];
			uint32_t rn = regs[j];
			uint32_t rm = regs[k];
			
			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n[%6d] 0x%08x: SWP%s R%d, R%d, [R%d]\n", total, instrAddr, l == EmitSzByte ? "B" : "", rd, rm, rn);
			
			if (EmitErrNone != jitEmitSwap(&buf, instrAddr, rd, rn, rm, l))
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
/*
// this is a convenient test code piece for complex reg-reg mem access xlations. kept here for future use
	{
		static const char *shiftNames[] = {" LSL #%d", " LSR #%d", " ASR #%d", " ROR #%d"};
		uint8_t regs[] = {0,1,2,10,11,12,13,15,};
		uint32_t i, j, k, l, m, n, p, q, r;
		uint32_t instrAddr = 0xd0c0cafe;
		int32_t total = 0, startAt = 0;
		uint8_t shifts[] = {0, 3, 5};
		enum EmitStatus now;
		struct EmitBuf buf;
		uint32_t seed = 1;
		
		//for (i = 0; i < sizeof(regs); i++) {								//Rt
		//	for (j = 0; j < sizeof(regs); j++) {							//Rn
		//		for (k = 0; k < sizeof(regs); k++) {						//Rm
		//			for (l = 0; l < 3; l++) {								//adrMode
		//				for (m = 0; m < 2; m++) {							//isAdd
		//					for (n = 0; n < 4; n++) {						//shiftType
		//						for (p = 0; p < sizeof(shifts); p++) {		//shift amt
		//							for (q = 0; q < 3; q++) {				//size
		//								for (r = 0; r < 2; r++) {			//sext

		//necessery for literal code to be happy
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)state;

		while (1) {
			
			bool storeTest = true;
			
			//xorshift128+
			{
				static uint64_t s[2] = {1, 1};
				
			    uint64_t s1 = s[0];
			    const uint64_t s0 = s[1];
			    s[0] = s0;
			    s1 ^= s1 << 23; // a
			    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
			    seed = (s[1] + s0) ^ ((s[1] + s0) >> 32);
			}
			
			i = seed % sizeof(regs);	seed /= sizeof(regs);
			j = seed % sizeof(regs);	seed /= sizeof(regs);
			k = seed % sizeof(regs);	seed /= sizeof(regs);
			l = seed % 3;	seed /= 3;
			m = seed % 2;	seed /= 2;
			n = seed % 4;	seed /= 4;
			p = seed % sizeof(shifts);	seed /= sizeof(shifts);
			q = seed % 3;	seed /= 3;
			r = seed % 2;	seed /= 2;
			
			uint32_t rt = regs[i];
			uint32_t rn = regs[j];
			uint32_t rm = regs[k];
			uint32_t shiftAmt = shifts[p];
			const char *shiftName = shiftNames[n];

			if (r && q == 2)										//do not sign extended word loads
				continue;

			if ((rn == rt || rn == 15) && l != EmitAdrModeIndex)	//no writeback when Rn == Rt or Rn == 15
				continue;

			if (!shiftAmt) {										//skip illegal shifts
				if (n && n != 3)
					continue;
				if (n == 3)
					shiftName = " RRX";
				else
					shiftName = NULL;
			}
			
			if (n && p > 1)											//non LSL shifts only need one variant tested
				continue;
			
			if (rm == 15)											//rm == 15 is not allowed
				continue;
			
			if (++total < startAt)									//skip the first "startAt" tests
				continue;

			emitBufferInit(&buf, state->tc, 128);
			
			pr("\n[%6d] 0x%08x: %sR%s%s R%d, [R%d", total, instrAddr, storeTest ? "ST" : "LD", (r && !storeTest) ? "S" : "", q ? ((q == 1) ? "H" : "") : "B", rt, rn);
			pr(l == EmitAdrModePostindex ? "], %cR%d" : ", %cR%d", m ? '+' : '-', rm);
			
			if (shiftName)
				pr(shiftName, shiftAmt);
			
			if (l == EmitAdrModeIndex)
				pr("]: \n");
			else if (l == EmitAdrModeIndexWbak)
				pr("]!: \n");
			else
				pr("\n");

			if (storeTest)
				now = jitEmitRegRegMemStr(&buf, instrAddr, rt, rn, m, rm, n, shiftAmt, l, q);
			else
				now = jitEmitRegRegMemLdr(&buf, instrAddr, r, rt, rn, m, rm, n, shiftAmt, l, q);
			
			if (now != EmitErrNone)
				loge("FAIL\n");
			else if (EmitErrNone != jitPrvSpillLiterals(state, &buf, true))
				loge("LIT SPILL FAIL\n");
			else {
				
				char disasmStr[64];
				
				while (buf.bufStart < buf.buf) {
					buf.bufStart += disasm(disasmStr, buf.bufStart, true, true) / sizeof(uint16_t);
					pr("\t%s\n", disasmStr);
				}
			}
		}
		while(1);
	}
//*/
}

//Allocate a JIT state with a translation cache of "tcSize" halfwords from fast, executable heap,
//initialize it, and install it in this thread's TLS slot. If the requested size cannot be had, one
//retry is made with TC_SIZE_INITIAL halfwords; failure of that retry is reported via fatal().
static struct JitTls* jitStateAllocate(uint32_t tcSize)
{
	uint32_t allocSz = sizeof(struct JitTls) + tcSize * sizeof(uint16_t);
	struct JitTls *fresh = (struct JitTls*)kheapAllocEx(allocSz, MEM_FAST | MEM_USABLE_FOR_EXEC);
	uint32_t curTid;
	
	if (!fresh) {
		
		//second chance: the smallest TC we are willing to run with
		allocSz = sizeof(struct JitTls) + TC_SIZE_INITIAL * sizeof(uint16_t);
		fresh = (struct JitTls*)kheapAllocEx(allocSz, MEM_FAST | MEM_USABLE_FOR_EXEC);
		
		if (fresh) {
			logw("Reallocated TC to minimum size\n");
		}
		else {
			fatal("Canot allocate JIT TLS & TC (sz %u)\n", allocSz);
		}
	}
	
	//init gets the byte size of whichever allocation succeeded
	jitStateInit(fresh, allocSz);
	jitStateBackendInit(&fresh->backendRuntimeData, false);
	TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)fresh;
	
	KALTaskGetCurrentID(&curTid);
	logjt(" *JS* alloced state at 0x%08x with size %lu for thread tid_%d\n", fresh, fresh->tcSize, curTid);
	
	return fresh;
}

//Log and release a JIT state back to the kernel heap. The TLS slot is NOT touched here; callers
//that need it updated do so themselves.
static void jitStateFree(struct JitTls* state)
{
	uint32_t curTid;
	
	//the log line reads the state, so it must precede the free
	KALTaskGetCurrentID(&curTid);
	logjt(" *JS* freeing state at 0x%08x with size %lu for thread tid_%d\n", state, state->tcSize, curTid);
	kheapFree(state);
}

//Fetch this thread's JIT state from TLS. When none is recorded, either create one with the initial
//TC size (allocIfNone) or return NULL. No locking is needed: only the owning thread touches its slot.
static struct JitTls* jitGetState(bool allocIfNone)
{
	struct JitTls *cur = (struct JitTls*)TLS.slot[TLS_SLOT_JIT_STATE];
	
	if (!cur && allocIfNone)
		cur = jitStateAllocate(TC_SIZE_INITIAL);
	
	return cur;
}

//Called in the context of a thread that is about to be destroyed: frees that thread's JIT state (if
//it ever got one) and overwrites the TLS slot with a recognizable poison value.
void __attribute__((used)) emuJitThreadExit(void)
{
	struct JitTls *dying = jitGetState(false);	//do not create a state just to destroy it
	uint32_t curTid;
	
	KALTaskGetCurrentID(&curTid);
	logjt(" *JS* thread exit tid_%d\n", curTid);
	
	if (dying)
		jitStateFree(dying);
	
	TLS.slot[TLS_SLOT_JIT_STATE] = 0xfefefefe;
}

//Invalidate translations in the calling thread's TC after an instruction cache flush request.
//Only does anything when SUPPORT_ICACHE_FLUSH is defined and the thread already has a JIT state.
//Every hash bucket is walked: TUs whose source is ROM are always kept, TUs whose source is storage are
//kept unless storage needs re-validation, and every other TU is unlinked from its bucket and has a
//UDF_VAL_DELETED_TU marker written at the spot other TUs jump to. Finally the global flush counter is
//bumped and recorded in the state.
static void emuJitInstrCacheClearAll(void)
{
	#ifdef SUPPORT_ICACHE_FLUSH
		struct JitTls *s = jitGetState(false);
		bool validateStorage = false;
		irq_state_t sta;
		uint32_t i;
		
		//a thread with no state has no translations to invalidate
		if (!s)
			return;
	
		//storage TUs are suspect if storage is writeable right now, or if the taint counter moved since
		//this state last looked at it (the new count is remembered so that we react only once)
		if (mStorageWriteable)
			validateStorage = true;
		else if (mStorageTaintCount != s->lastStorageTaintCount) {
			s->lastStorageTaintCount = mStorageTaintCount;
			validateStorage = true;
		}
	
		for (i = 0; i < (1 << TC_HASH_ENTRY_SZ_LOG); i++) {
			
			//"prev" trails "cur" so that a TU can be unlinked from the singly-linked bucket chain
			struct TU *prev = NULL, *cur = s->hashes[i], *t;
			
			while (cur) {
				
				//We know ROM will never jump to RAM directly in a way we'd link it in
				//in ram invalidate all things that changed (hash mismatch), for storage see if it has been unlocked
				//NOTE(review): no source-hash comparison is made in this function - every TU that is not kept
				//by the test below is invalidated unconditionally
				
				uint_fast8_t codeClass = cur->srcType;
				
				if (codeClass == TU_SOURCE_ROM || (codeClass == TU_SOURCE_STORAGE && !validateStorage)) {
					
					//keep this TU: just advance
					prev = cur;
					cur = cur->next;
				}
				else {
					struct EmitBuf dest = {};
					enum EmitStatus now;
					uint16_t *where;
					
					//this TU is now stale, but others may jump to it (only to its start). leave a special UDF here to mark this
					//we'll replace this with a jump to proper translation. remove this TU from the hash bucket as well
					
					//unlink "t"; "prev" is deliberately left unchanged since it still precedes the new "cur"
					t = cur;
					cur = cur->next;
					if (!prev)
						s->hashes[i] = cur;
					else
						prev->next = cur;
					
					//the UDF shall be where code starts, not prologue (if any) since jumps may go there.
					now = jitEmitWhereToJumpFromAnotherTu(t->code, &where);
					if (now != EmitErrNone)
						fatal("Cannot %s start of deleted TU\n", "find");

					emitBufferInit(&dest, where, sizeof(uint16_t) * 2 /* every TU has space for at least 2 halfwords, as ensured by jitTuInsert() */);
					
					now = emitLLudf(&dest, UDF_VAL_DELETED_TU, false);
					if (now != EmitErrNone)
						fatal("Cannot %s start of deleted TU\n", "replace");
					
					//the patched halfwords are code, so they go through the same coherency step as fresh translations
					emitBufCacheCoherencyManage(&dest);
				}
			}
		}
		//no LDREXD on v7M. this is not going to be accessed by another core, it is per-core, so irq off is good enough
		sta = irqsAllOff();
		s->lastFlushCount = ++mCurFlushCount;
		irqsRestoreState(sta);
		
	#endif
}

//Range-based instruction cache clear request. The range is not consulted: any request is serviced
//by invalidating everything via emuJitInstrCacheClearAll().
void emuJitInstrCacheClear(uintptr_t addr, int32_t sz)
{
	(void)addr;
	(void)sz;
	
	emuJitInstrCacheClearAll();
}

//Notification that the writeability of storage changed. The new setting is always recorded; when
//writes become disallowed the storage taint counter is also advanced (with IRQs off).
//Does nothing unless SUPPORT_ICACHE_FLUSH is defined.
void emuJitStorageTaintNotif(bool writesToStorageAllowed)
{
	#ifdef SUPPORT_ICACHE_FLUSH
		irq_state_t irqSta;
		
		mStorageWriteable = writesToStorageAllowed;
		if (writesToStorageAllowed)
			return;
		
		//no LDREXD on v7M. this is not going to be accessed by another core, it is per-core, so irq off is good enough
		irqSta = irqsAllOff();
		mStorageTaintCount++;
		irqsRestoreState(irqSta);
	#endif
}

//Throw away every translation of the calling thread and possibly move to a bigger TC.
//Returns the state to use from now on, which may be a different allocation than before the call.
static struct JitTls* jitTcFlush(void)
{
	struct JitTls *cur = jitGetState(true), *bigger;
	bool grow = false;
	uint32_t biggerSz;
	
	//growing is considered only for non-permanent states that were flushed at most FLUSH_GROW_TICKS ticks ago,
	//and never while a growth started by an outer call is still in progress
	if (!cur->permanent && (jitGetTicks() - cur->lastFlushTicks <= FLUSH_GROW_TICKS)) {
		
		if (cur->allocingNewOneCurrently) {
			logjt("Not expanding TC in recursive call to jitTcFlush()\n");
		}
		else {
			grow = true;
		}
	}
	
	//empty the current TC unconditionally (this way if our attempt at allocation hits ARM code, we have space to translate it)
	memset(cur->hashes, 0, sizeof(cur->hashes));
	cur->tcLevel = 0;
	cur->lastFlushTicks = jitGetTicks();
	
	if (!grow || cur->tcSize == TC_SIZE_MAX)
		return cur;
	
	//add half of the largest power of two that does not exceed the current size, then clamp and make even
	biggerSz = cur->tcSize + (1 << (30 - __builtin_clz(cur->tcSize)));
	if (biggerSz > TC_SIZE_MAX)
		biggerSz = TC_SIZE_MAX;
	if (biggerSz & 1)
		biggerSz++;
	
	//flag the old state so that a nested flush during the allocation does not try to grow again
	cur->allocingNewOneCurrently = 1;
	bigger = jitStateAllocate(biggerSz);
	bigger->backendRuntimeData = cur->backendRuntimeData;	//must persist
	jitStateFree(cur);
	
	return bigger;
}

//One-time init, called once and only from the ui thread. When the platform defines
//CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE, the ui thread's JIT state is placed at that fixed location,
//marked permanent and installed in TLS; otherwise this does nothing.
void emuCoreInit(void)
{
	#ifdef CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE
		
		struct JitTls* hardwired;
		struct JitTls* stale = jitGetState(false);
		
		//no state is expected this early; if one exists anyway, complain and drop it
		if (stale) {
			loge("UI thread somehow already has state?\n");
			jitStateFree(stale);
		}
		
		hardwired = (struct JitTls*)CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE;
		jitStateInit(hardwired, CPU_HARDWIRED_UI_THREAD_JIT_TC_SIZE);
		jitStateBackendInit(&hardwired->backendRuntimeData, true);
		hardwired->permanent = 1;
		TLS.slot[TLS_SLOT_JIT_STATE] = (uintptr_t)hardwired;
		
		loge("ui thread jit state inited\n");
		
	#endif
}

//Look up the TU whose base (source) address is exactly "addr" in the calling thread's TC.
//Returns NULL when there is no such TU.
static struct TU* jitTuFindByExactAddr(uint32_t addr)
{
	struct JitTls* state = jitGetState(true);
	struct TU* cand = state->hashes[jitTuHashAddress(addr)];
	
	//walk the bucket's chain until a match or the end of the chain
	while (cand && cand->baseAddr != addr)
		cand = cand->next;
	
	return cand;
}

//Hand out the TU header located at the current TC fill level. If not even a header plus one halfword
//fits, the TC is flushed first (and *tcFlushedP, when given, is set to true; it is never set to false here).
//*maxInstrsP receives the number of halfwords available for code after the header.
//Nothing is reserved: the TC level moves only when the TU is later passed to jitTuInsert().
static struct TU* jitTuAllocate(uint32_t *maxInstrsP, bool* tcFlushedP)
{
	struct JitTls* state = jitGetState(true);
	const uint32_t hdrUnits = (sizeof(struct TU) + sizeof(*state->tc) - 1) / sizeof(*state->tc);
	struct TU* fresh;
	
	logjt("now alloc, tclevel is %u\n", state->tcLevel);
	logjt("state->tcSize - state->tcLevel = %d, tuHdrSizeInTcUnits=%u\n", state->tcSize - state->tcLevel,  hdrUnits);
	
	if (state->literalIdx)
		fatal("Allocatig a TU with unspilled literals is undoubtedly bad\n");
	
	if (state->tcSize - state->tcLevel <= hdrUnits) {
		
		//the flush may replace the state, so keep using only what it returns
		state = jitTcFlush();
		
		if (tcFlushedP)
			*tcFlushedP = true;
	}
	
	fresh = (struct TU*)(char*)(state->tc + state->tcLevel);
	
	#ifdef SUPPORT_ICACHE_FLUSH
		fresh->srcLen = 0;	//jitTuInsert() refuses TUs whose source length was never filled in
	#endif
	
	*maxInstrsP = state->tcSize - state->tcLevel - hdrUnits;
	
	return fresh;
}

//Report which memory region "addr" falls into: storage RAM, ROM, dynamic RAM (tested in that order),
//or TU_SOURCE_UNKNOWN when it is in none of them.
static uint_fast8_t jitPrvClassifyAddress(uint32_t addr)
{
	uint_fast8_t cls;
	
	if (addr >= CPU_STORAGE_RAM_BASE && addr - CPU_STORAGE_RAM_BASE < CPU_STORAGE_RAM_SIZE)
		cls = TU_SOURCE_STORAGE;
	else if (addr >= CPU_ROM_BASE && addr - CPU_ROM_BASE < CPU_ROM_SIZE)
		cls = TU_SOURCE_ROM;
	else if (addr >= CPU_DYN_RAM_BASE && addr - CPU_DYN_RAM_BASE < CPU_DYN_RAM_SIZE)
		cls = TU_SOURCE_RAM;
	else
		cls = TU_SOURCE_UNKNOWN;
	
	return cls;
}

//Classify a byte range by its first and last byte and return the numerically larger of the two
//classes. "size" is expected to be nonzero since the last byte is taken as addr + size - 1.
static uint_fast8_t jitPrvClassifyAddressRange(uint32_t addr, uint32_t size)
{
	const uint_fast8_t firstCls = jitPrvClassifyAddress(addr);
	const uint_fast8_t lastCls = jitPrvClassifyAddress(addr + size - 1);
	
	if (firstCls > lastCls)
		return firstCls;
	
	return lastCls;
}

//Commit a freshly translated TU: make its code coherent, record source info used by the flush logic,
//link it at the head of its hash bucket and advance the TC fill level past the header and code.
//"dest" is the emit buffer the TU's code was produced into.
static void jitTuInsert(struct TU* tu, struct EmitBuf *dest)
{
	struct JitTls* state = jitGetState(true);
	const int32_t tuHdrSizeInTcUnits = (sizeof(struct TU) + sizeof(*state->tc) - 1) / sizeof(*state->tc);
	const uint32_t bucket = jitTuHashAddress(tu->baseAddr);
	uint32_t usedHws = (uint16_t*)dest->buf - (uint16_t*)dest->bufStart;
	
	#ifdef SUPPORT_ICACHE_FLUSH
		//for flushing logic to work each TU needs to be at least 2 halfwords long
		if (usedHws < 2)
			usedHws = 2;
	#endif
	
	emitBufCacheCoherencyManage(dest);
	
	//pad by one halfword when needed so that header + code together span a whole number of words
	if ((usedHws & 1) != (tuHdrSizeInTcUnits & 1))	//compiles to best code for this
		usedHws++;

	#ifdef SUPPORT_ICACHE_FLUSH
		if (!tu->srcLen)
			fatal("Inserting TU with no source length (src addr 0x%08x)\n", tu->baseAddr);
		
		//addr may not be aligned (see "arm v4 BX PC quirk")
		tu->srcHash = jitTuHashSource(tu);
		
		//remember what kind of memory the source instructions live in
		tu->srcType = jitPrvClassifyAddressRange(tu->baseAddr, sizeof(uint32_t) * tu->srcLen);
	#endif
	
	//push onto the front of the bucket's chain
	tu->next = state->hashes[bucket];
	state->hashes[bucket] = tu;
	
	//account for the space now in use
	state->tcLevel += usedHws + tuHdrSizeInTcUnits;
	
	if (state->tcLevel > state->tcSize)
		fatal("OOOPS %u > %u! (added %u hws)\n", state->tcLevel, state->tcSize, usedHws);
}

//Compute the destination of the ARM B/BL-format instruction stored at "branch": the instruction's own
//address plus two words plus the sign-extended 24-bit word offset. Calls fatal() if the word at
//"branch" does not have the branch encoding.
static uint32_t jitWorkOutArmBranchTarget(const uint32_t* branch)
{
	const uint32_t insn = *branch;
	int32_t wordOfst;
	
	if ((insn & 0x0e000000) != 0x0a000000)
		fatal("not a branch\n");
	
	//shift the 24-bit field up to the top, then arithmetic-shift back down to sign extend it
	wordOfst = ((int32_t)(insn << 8)) >> 8;
	
	return (uintptr_t)(branch + 2 + wordOfst);
}

//Emit a (possibly conditional) jump to the translation of ARM address "dstArmAddr".
//If a TU for exactly that address exists, a direct jump to it is emitted. Otherwise a placeholder is
//emitted instead: a long UDF carrying the top 16 bits of the address followed by a raw halfword with
//the low 16 bits, preceded (when conditional) by a skip-over branch taken on the inverted condition.
//*tuWasFoundP, when given, reports which of the two happened.
enum EmitStatus jitEmitJumpToArm(struct EmitBuf *dest, enum EmitCc cc, uint32_t dstArmAddr, bool* tuWasFoundP)
{
	struct TU* const target = jitTuFindByExactAddr(dstArmAddr);	//do we have the address translated already?
	const bool conditional = cc != EmitCcAl;
	struct EmitBuf jumpOver;
	enum EmitStatus now;
	
	if (tuWasFoundP)
		*tuWasFoundP = !!target;
	
	//easy case: the destination exists, jump straight to it
	if (target)
		return jitEmitJumpToAnotherTu(dest, target->code, cc);
	
	//hard case: leave something that can be changed later, once the destination gets translated
	
	if (conditional) {
		
		uint32_t nHalfwordsToSave;
		
		//reserve room for the skip-over branch; it is filled in once we know where "after" is
		now = jitEmitNumHalfwordsNeededForConditionalSkipover(cc, &nHalfwordsToSave);
		if (now != EmitErrNone)
			return now;
		
		EMIT(SaveSpace, &jumpOver, nHalfwordsToSave);
	}
	
	//use long UDF just for this. due to addr length this is actually no longer than short UDF would have been - i promise
	EMIT(LLudf, dstArmAddr >> 16, true);
	
	//second half of addr
	EMIT(LLrawHalfword, dstArmAddr);
	
	if (conditional)
		return jitEmitIntraTuBranch(&jumpOver, emitGetPtrToJumpHere(dest), emitCcInvert(cc));
	
	return EmitErrNone;
}

//Translate an instruction from the "NV" condition space. Two things are accepted: BLX with an
//immediate (LR is loaded with instrAddr + 4, then a jump to the Thumb destination is emitted and the
//TU is terminated) and PLD (valid, emits nothing). Everything else is EmitErrInvalidInput.
static enum EmitStatus jitTranslateNvInstr(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, bool *terminateP)
{
	enum EmitStatus now;
	uint32_t destAddr;
	
	if ((instr >> 25) != 0x7D) {					//not BLX imm
		
		if ((instr & 0xFD70F000) == 0xF550F000)		//PLD emits no code but is valid
			return EmitErrNone;
		
		return EmitErrInvalidInput;					//unknown instr
	}
	
	//the target is worked out like for a normal branch; bit 24 then contributes an extra halfword
	destAddr = jitWorkOutArmBranchTarget((uint32_t*)instrAddr);
	if (instr & 0x01000000)
		destAddr += 2;
	
	//return address goes into LR first, then we leave for the thumb address
	now = jitEmitLoadImmToReg(dest, EMIT_REG_NO_LR, instrAddr + 4, false, false, false);
	if (now == EmitErrNone)
		now = jitEmitJumpToAbsThumbAddrNotInTu(dest, destAddr);
	if (now != EmitErrNone)
		return now;
	
	//called code will likely want to return, and that needs a new basic block at the return address
	//anyway, so do not bother translating past this point
	*terminateP = true;
	
	return EmitErrNone;
}

//Translate an ARM addressing-mode-2 load/store (word or unsigned byte; immediate or shifted-register
//offset). The combination "P clear, W set" and the register form with bit 4 set are rejected.
//*terminateP is set for an unconditional load into PC.
static enum EmitStatus jitHandleArmMode2(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	const bool isAdd = !!(instr & 0x00800000), isByte = !!(instr & 0x00400000), isLoad = !!(instr & 0x00100000);
	const uint32_t rnNo = (instr >> 16) & 0x0F, rdNo = (instr >> 12) & 0x0F;
	enum EmitAddrMode adrMode;
	
	//addressing mode comes from the P (bit 24) and W (bit 21) bits together
	switch (instr & 0x01200000) {
		case 0x01200000:
			adrMode = EmitAdrModeIndexWbak;
			break;
		case 0x01000000:
			adrMode = EmitAdrModeIndex;
			break;
		case 0x00000000:
			adrMode = EmitAdrModePostindex;
			break;
		default:	//W without P
			return EmitErrInvalidInput;
	}
	
	if (isLoad && cc == EmitCcAl && rdNo == EMIT_REG_NO_PC)
		*terminateP = true;
	
	if (!(instr & 0x02000000)) {		//immediate offset
		
		int32_t imm = instr & 0xfff;
		
		if (!isAdd)
			imm = -imm;
		
		return isLoad ?
			jitEmitImmMemLdr(dest, cc, instrAddr, false, rdNo, rnNo, imm, adrMode, isByte ? EmitSzByte : EmitSzWord) :
			jitEmitImmMemStr(dest, cc, instrAddr, rdNo, rnNo, imm, adrMode, isByte ? EmitSzByte : EmitSzWord);
	}
	else {								//register offset, shifted by an immediate
		
		const enum EmitShiftType shiftType = (enum EmitShiftType)((instr >> 5) & 3);
		const uint32_t shiftAmt = (instr >> 7) & 0x1f, rmNo = instr & 0x0f;
		
		if (instr & 0x10)
			return EmitErrInvalidInput;
		
		return isLoad ?
			jitEmitRegRegMemLdr(dest, cc, instrAddr, false, rdNo, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, isByte ? EmitSzByte : EmitSzWord) :
			jitEmitRegRegMemStr(dest, cc, instrAddr, rdNo, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, isByte ? EmitSzByte : EmitSzWord);
	}
}

//Translate MUL, MLA and the long multiplies. Register field names follow ARMv5 (not ARMv7M):
//Rm = bits 3..0, Rs = bits 11..8, Rn/RdLo = bits 15..12, Rd/RdHi = bits 19..16.
//A short multiply with bit 22 set is rejected. instrAddr and terminateP are not used here.
static enum EmitStatus jitHandleMultiplies(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	const uint32_t rmNo = instr & 0x0F, rsNo = (instr >> 8) & 0x0F, rnNo = (instr >> 12) & 0x0F, rdNo = (instr >> 16) & 0x0F;
	const bool setFlags = !!(instr & 0x00100000), accumulate = !!(instr & 0x00200000), bit22 = !!(instr & 0x00400000);
	
	//ARM says: no PC anywhere, d != m for short instrs, dHi != dLo, and dHi != m && dLo != m for long ones
	
	if (instr & 0x00800000)		//long multiplies: bit 22 clear means unsigned; RdLo shares Rn's field, RdHi shares Rd's
		return jitEmitLongMul(dest, cc, rnNo, rdNo, rmNo, rsNo, !bit22, accumulate, setFlags);
	
	if (bit22)					//short multiplies are always encoded with bit 22 clear
		return EmitErrInvalidInput;
	
	if (accumulate)				//MLA
		return jitEmitMla(dest, cc, rdNo, rmNo, rsNo, rnNo, setFlags);
	
	return jitEmitMul(dest, cc, rdNo, rmNo, rsNo, setFlags);	//MUL
}

static enum EmitStatus jitHandleTable32(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	enum EmitStatus now;

	if ((instr >> 5) & 3) {				//loads/stores mode 3
		
		uint32_t imm = (instr & 0x0F) + ((instr >> 4) & 0xF0), rmNo = instr & 0x0F, rnNo = (instr >> 16) & 0x0F, rdNo = (instr >> 12) & 0x0F;
		bool isImm = !!(instr & 0x00400000), doAdd = !!(instr & 0x00800000);
		enum EmitAddrMode adrMode;

		if (instr & 0x01000000)
			adrMode = (instr & 0x00200000) ? EmitAdrModeIndexWbak : EmitAdrModeIndex;
		else if (instr & 0x00200000)
			return EmitErrInvalidInput;
		else
			adrMode = EmitAdrModePostindex;

		if (instr & 0x00100000) {		//loads
			switch ((instr >> 5) & 3) {
				case 0b00:	//unsigned byte - not encoded this way
					return EmitErrInvalidInput;
				case 0b01:	//LDRH
					if (cc == EmitCcAl && rdNo == EMIT_REG_NO_PC)
						*terminateP = true;
					if (isImm)
						return jitEmitImmMemLdr(dest, cc, instrAddr, false, rdNo, rnNo, doAdd ? imm : -imm, adrMode, EmitSzHalfword);
					else
						return jitEmitRegRegMemLdr(dest, cc, instrAddr, false, rdNo, rnNo, doAdd, rmNo, EmitShiftLsl, 0, adrMode, EmitSzHalfword);
					break;
				case 0b10:	//LDRSB
					if (cc == EmitCcAl && rdNo == EMIT_REG_NO_PC)
						*terminateP = true;
					if (isImm)
						return jitEmitImmMemLdr(dest, cc, instrAddr, true, rdNo, rnNo, doAdd ? imm : -imm, adrMode, EmitSzByte);
					else
						return jitEmitRegRegMemLdr(dest, cc, instrAddr, true, rdNo, rnNo, doAdd, rmNo, EmitShiftLsl, 0, adrMode, EmitSzByte);
					break;
				case 0b11:	//LDRSH:
					if (cc == EmitCcAl && rdNo == EMIT_REG_NO_PC)
						*terminateP = true;
					if (isImm)
						return jitEmitImmMemLdr(dest, cc, instrAddr, true, rdNo, rnNo, doAdd ? imm : -imm, adrMode, EmitSzHalfword);
					else
						return jitEmitRegRegMemLdr(dest, cc, instrAddr, true, rdNo, rnNo, doAdd, rmNo, EmitShiftLsl, 0, adrMode, EmitSzHalfword);
					break;
				
				default:
					return EmitErrInvalidInput;
			}
		}
		else {							//stores and LDRD
			switch ((instr >> 5) & 3) {
				case 0b00:	//unsigned byte - not encoded this way
					return EmitErrInvalidInput;
				case 0b01:	//STRH
					if (isImm)
						return jitEmitImmMemStr(dest, cc, instrAddr, rdNo, rnNo, doAdd ? imm : -imm, adrMode, EmitSzHalfword);
					else
						return jitEmitRegRegMemStr(dest, cc, instrAddr, rdNo, rnNo, doAdd, rmNo, EmitShiftLsl, 0, adrMode, EmitSzHalfword);
					break;
				case 0b10:	//LDRD
					if (isImm)
						return jitEmitImmMemLdrd(dest, cc, instrAddr, rdNo, rnNo, doAdd ? imm : -imm, adrMode);
					else
						return jitEmitRegRegMemLdrd(dest, cc, instrAddr, rdNo, rnNo, doAdd, rmNo, adrMode);
				case 0b11:	//STRD
					if (isImm)
						return jitEmitImmMemStrd(dest, cc, instrAddr, rdNo, rnNo, doAdd ? imm : -imm, adrMode);
					else
						return jitEmitRegRegMemStrd(dest, cc, instrAddr, rdNo, rnNo, doAdd, rmNo, adrMode);
				
				default:
					return EmitErrInvalidInput;
			}
		}
	}
	else if (instr & 0x01000000) {		//swp/swpb
		
		uint32_t rdNo = (instr >> 12) & 0x0F, rmNo = instr & 0x0F, rnNo = (instr >> 16) & 0x0F;
		bool isByte = !!(instr & 0x00400000);
		
		return jitEmitSwap(dest, cc, instrAddr, rdNo, rnNo, rmNo, isByte ? EmitSzByte : EmitSzWord);
	}
	else
		return jitHandleMultiplies(dest, instr, instrAddr, cc, terminateP);
}

//Translate one ARM "miscellaneous" instruction, selected by bits 7..4: MRS / MSR (register), BX, CLZ,
//BLX (register), the saturating adds/subs (QADD, QSUB, QDADD, QDSUB) and the enhanced DSP multiplies.
//Local field names: rmNo = bits 3..0, rsNo = bits 11..8, rdNo = bits 15..12, rnNo = bits 19..16.
//Returns EmitErrInvalidInput for anything not recognized (including SPSR access and breakpoint).
//*terminateP is set for an unconditional BX and for any successfully emitted BLX.
static enum EmitStatus jitHandleTable33(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	uint32_t rdNo = (instr >> 12) & 0x0F, rmNo = instr & 0x0F, rnNo = (instr >> 16) & 0x0F, rsNo = (instr >> 8) & 0x0F;
	enum EmitStatus now;
	
	switch ((instr >> 4) & 0x0F) {
		
		case 0b0000:	//could be MRS/MSR
			if ((instr & 0x00bf0f0f) == 0x000f0000) { 	//MRS
				
				if (instr & 0x00400000) {
					loge("SPSR access not supported\n");
					return EmitErrInvalidInput;
				}
				
				return jitEmitMrsReg(dest, cc, rdNo); 
			}
			else if ((instr & 0x00b0fff0) == 0x0020f000) { 	//MSR
				
				if (instr & 0x00400000) {
					loge("SPSR access not supported\n");
					return EmitErrInvalidInput;
				}
				
				if (!(instr & 0x00080000))	//in user mode only flags can be set, if it is not requested, instr executes as a NOP
					return EmitErrNone;
			
				return jitEmitMsrReg(dest, cc, instrAddr, rmNo);
			}
			return EmitErrInvalidInput;
			
		case 0b0001:	//BX and CLZ
			if ((instr & 0x006f0f00) == 0x006f0f00) {		//clz
				
				return jitEmitClz(dest, cc, rdNo, rmNo);
			}
			else if ((instr & 0x006fff00) == 0x002fff00) {	//bx
				
				if (cc == EmitCcAl)
					*terminateP = true;
				
				return jitEmitBxReg(dest, cc, instrAddr, rmNo);
			}
			return EmitErrInvalidInput;
			
		case 0b0011:	//BLX
			if ((instr & 0x006fff00) == 0x002fff00) {	//blx
				
				now = jitEmitBlxReg(dest, cc, instrAddr, rmNo);
				if (now != EmitErrNone)
					return now;

				*terminateP = true;	//need to translate after this anyways
				return EmitErrNone;
			}
			return EmitErrInvalidInput;

		case 0b0101:	//enhanced DSP adds/subs, bits 22..21 pick the operation
		
			switch ((instr >> 21) & 0x03) {
				case 0b00:
					return jitEmitQadd(dest, cc, instrAddr, rdNo, rnNo, rmNo);
				case 0b01:
					return jitEmitQsub(dest, cc, instrAddr, rdNo, rnNo, rmNo);
				case 0b10:
					return jitEmitQdadd(dest, cc, instrAddr, rdNo, rnNo, rmNo);
				default:	//0b11
					return jitEmitQdsub(dest, cc, instrAddr, rdNo, rnNo, rmNo);
			}
			
		case 0b1000:	//enhanced DSP multiplies are encoded as 0b1yx0 in bits 7..4, so all four
		case 0b1010:	//combinations of the x (bit 5) and y (bit 6) half-selects must land here
		case 0b1100:
		case 0b1110:
			
			//this is CORRECT, these encodings shuffle rgs around a lot
			switch ((instr >> 21) & 0x03) {
				case 0b00:				//SMLAxy
					//bits 15..12 (rdNo here) hold the accumulator for this instr, so any value is legal
					//NOTE(review): accumulator precedes Rs here, while the jitEmitSmlawy() call below passes
					//Rs first - confirm the order against jitEmitSmlaxy()'s prototype
					now = jitEmitSmlaxy(dest, cc, instrAddr, rnNo, rmNo, rdNo, rsNo, !!(instr & 0x20), !!(instr & 0x40));
					if (now != EmitErrNone)
						return now;
					break;
				
				case 0b01:				//SMLAWy,SMULWy
					if (instr & 0x20) {	//SMULWy
						
						if (rdNo)		//no accumulator, so bits 15..12 must be zero
							return EmitErrInvalidInput;
						
						now = jitEmitSmulwy(dest, cc, instrAddr, rnNo, rmNo, rsNo, !!(instr & 0x40));
						if (now != EmitErrNone)
							return now;
					}
					else {				//SMLAWy
						
						now = jitEmitSmlawy(dest, cc, instrAddr, rnNo, rmNo, rsNo, rdNo, !!(instr & 0x40));
						if (now != EmitErrNone)
							return now;
					}
					break;
				
				case 0b10:				//SMLALxy
					now = jitEmitSmlalxy(dest, cc, instrAddr, rdNo, rnNo, rmNo, rsNo, !!(instr & 0x20), !!(instr & 0x40));
					if (now != EmitErrNone)
						return now;
					break;
				
				case 0b11:				//SMULxy
					if (rdNo)			//no accumulator, so bits 15..12 must be zero
						return EmitErrInvalidInput;
					
					//Rs lives in bits 11..8 just like in every other multiply
					now = jitEmitSmulxy(dest, cc, instrAddr, rnNo, rmNo, rsNo, !!(instr & 0x20), !!(instr & 0x40));
					if (now != EmitErrNone)
						return now;
					break;
				
			}
			return EmitErrNone;

		case 0b0111:	//breakpoint
			return EmitErrInvalidInput;
		
		default: 
			return EmitErrInvalidInput;
	}
}

//Translate MSR with an immediate operand. SPSR access is rejected. If the flags field (bit 19) is
//not selected the instruction is accepted but emits nothing; otherwise the rotated 8-bit immediate
//is handed to jitEmitMsrImm().
static enum EmitStatus jitHandleMsrImm(struct EmitBuf *dest, uint32_t instr, enum EmitCc cc)
{
	const uint32_t rotBy = (instr >> 7) & 0x1e;		//4-bit rotate field, already doubled
	uint32_t imm = instr & 0xff;
	
	if (instr & 0x00400000) {
		loge("SPSR access not supported\n");
		return EmitErrInvalidInput;
	}
	
	//in user mode only flags can be set, if it is not requested, instr executes as a NOP
	if (!(instr & 0x00080000))
		return EmitErrNone;
	
	//rotate right; a zero amount is skipped since a shift by 32 would not be valid C
	if (rotBy)
		imm = (imm >> rotBy) | (imm << (32 - rotBy));
	
	return jitEmitMsrImm(dest, cc, imm);
}

//Route an LDM/STM to the emitter matching its direction and addressing variant:
//u (up) and p (pre) together select IB, u alone IA, p alone DB, neither DA. "instr" is not used.
static enum EmitStatus jitEmitLdmStm(struct EmitBuf *dest, uint32_t instrAddr, uint32_t instr, uint32_t rnNo, uint32_t regsMask, bool load, bool u, bool p, bool w, enum EmitCc cc)
{
	if (load) {
		
		if (u)
			return p ? jitEmitLdmib(dest, cc, instrAddr, rnNo, regsMask, w) : jitEmitLdmia(dest, cc, instrAddr, rnNo, regsMask, w);
		
		return p ? jitEmitLdmdb(dest, cc, instrAddr, rnNo, regsMask, w) : jitEmitLdmda(dest, cc, instrAddr, rnNo, regsMask, w);
	}
	
	if (u)
		return p ? jitEmitStmib(dest, cc, instrAddr, rnNo, regsMask, w) : jitEmitStmia(dest, cc, instrAddr, rnNo, regsMask, w);
	
	return p ? jitEmitStmdb(dest, cc, instrAddr, rnNo, regsMask, w) : jitEmitStmda(dest, cc, instrAddr, rnNo, regsMask, w);
}

//Translate an ARM addressing-mode-4 instruction (LDM/STM).
//Rejected with EmitErrInvalidInput: forms with the S bit (bit 22) set, writeback with the base
//register in the register list, and an empty register list. When ARM_EMU_DEFINE_THE_UNDEFINED is
//defined, a load with the S bit set AND PC in the list is instead treated as if the S bit were clear.
//*terminateP is set for an unconditional load whose list includes PC.
static enum EmitStatus jitHandleArmMode4(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	bool load = !!(instr & 0x00100000), s = !!(instr & 0x00400000), p = !!(instr & 0x01000000), u = !!(instr & 0x00800000), w = !!(instr & 0x00200000);
	uint32_t rnNo = (instr >> 16) & 0x0f, regsMask = instr & 0xffff;
	
	#ifdef ARM_EMU_DEFINE_THE_UNDEFINED
		if (load && s && (regsMask & 0x8000))
			s = false;
	#endif
	
	if (load && (regsMask & (1 << EMIT_REG_NO_PC)) && cc == EmitCcAl)
		*terminateP = true;
	
	//the emitters have no notion of the S bit, so an S-bit form that was not explicitly tolerated
	//above must not be silently translated as a plain LDM/STM
	if (s)
		return EmitErrInvalidInput;
	
	//specifiying writeback with base reg in reg set produces unpredictable values (in reg or in mem) so we treat those as invalid instrs
	if (w && (regsMask & (1 << rnNo)))
		return EmitErrInvalidInput;
	
	//empty reg set is also invalid
	if (!regsMask)
		return EmitErrInvalidInput;
	
	return jitEmitLdmStm(dest, instrAddr, instr, rnNo, regsMask, load, u, p, w, cc);
}

//Translate SWI. Only number 0x123456 (semihosting) is supported; any other number is
//EmitErrInvalidInput. terminateP is deliberately left alone.
static enum EmitStatus jitHandleSwi(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	const uint32_t swiNo = instr & 0x00ffffff;
	
	logjt("SWI 0x%06x at 0x%08x seen\n", swiNo, instrAddr);
	
	if (swiNo != 0x123456)
		return EmitErrInvalidInput;
	
	//*terminateP MUST be false - we'll return in thumb mode to callsite!
	return jitEmitSemihostingCall(dest, instrAddr, cc);
}

//Decide whether a data-processing instruction ends the TU: it does when it is unconditional and one
//of the result-writing operations targets PC (in which case *terminateP is set to true; it is never
//cleared here). The four compare/test operations never terminate.
//Returns EmitErrNotEncodeable for an operation value outside the known set, EmitErrNone otherwise.
static enum EmitStatus jitDataProcessingIsTerminal(enum JitArmDpOp op, uint32_t rdNo, bool unconditional, bool *terminateP)
{
	switch (op) {
		
		//these only produce flags
		case ArmDpOpTst:
		case ArmDpOpTeq:
		case ArmDpOpCmp:
		case ArmDpOpCmn:
			break;
		
		//these write Rd
		case ArmDpOpAnd:
		case ArmDpOpEor:
		case ArmDpOpSub:
		case ArmDpOpRsb:
		case ArmDpOpAdd:
		case ArmDpOpAdc:
		case ArmDpOpSbc:
		case ArmDpOpRsc:
		case ArmDpOpOrr:
		case ArmDpOpMov:
		case ArmDpOpBic:
		case ArmDpOpMvn:
			if (unconditional && rdNo == EMIT_REG_NO_PC)
				*terminateP = true;
			break;
		
		default:
			return EmitErrNotEncodeable;
	}
	
	return EmitErrNone;
}

//Translate a data-processing instruction whose second operand is a register, shifted either by an
//immediate or by another register. Encodings belonging to "table 3.3" (bits 24..23 = 0b10 with bit 20
//clear) or "table 3.2" (bits 7 and 4 both set) are refused here with EmitErrInvalidInput.
static enum EmitStatus jitHandleDataProcessingRegReg(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	const uint32_t rmNo = instr & 0x0F, rdNo = (instr >> 12) & 0x0F, rnNo = (instr >> 16) & 0x0F;
	const enum EmitShiftType shiftType = (enum EmitShiftType)((instr >> 5) & 3);
	const enum JitArmDpOp op = (enum JitArmDpOp)((instr >> 21) & 0x0F);
	const bool setFlags = !!((instr >> 20) & 1);
	enum EmitStatus now;
	
	//table 3.3 or table 3.2 ?
	if ((instr & 0x01900000) == 0x01000000 || (instr & 0x00000090) == 0x00000090)
		return EmitErrInvalidInput;
	
	now = jitDataProcessingIsTerminal(op, rdNo, cc == EmitCcAl, terminateP);
	if (now != EmitErrNone)
		return now;
	
	if (!(instr & 0x10)) {	//reg shift by imm
		
		//shift type & amt match between arm and thumb2 so we do not need to parse things like RRX out
		return jitEmitAluOpRegShiftImm(dest, cc, op, instrAddr, rdNo, rnNo, rmNo, shiftType, (instr >> 7) & 0x1f, setFlags);
	}
	
	//reg shift by reg
	if (instr & 0x80)
		return EmitErrInvalidInput;
	
	return jitEmitAluOpRegShiftReg(dest, cc, op, instrAddr, rdNo, rnNo, rmNo, (instr >> 8) & 0x0F, shiftType, setFlags);
}

//Translate a data-processing instruction with an immediate second operand. The raw 8-bit value and
//its (already doubled) rotation are passed to the emitter as-is. Encodings with bits 24..23 = 0b10
//and bit 20 clear (undef and MSR imm) are refused here with EmitErrInvalidInput.
static enum EmitStatus jitHandleDataProcessingImm(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	const uint32_t immValRaw = instr & 0xFF, immShift = (instr >> 7) & 0x1e;
	const uint32_t rdNo = (instr >> 12) & 0x0F, rnNo = (instr >> 16) & 0x0F;
	const uint32_t setFlags = (instr >> 20) & 1;
	const enum JitArmDpOp op = (enum JitArmDpOp)((instr >> 21) & 0x0F);
	enum EmitStatus now;
	
	//undef and MSR imm
	if ((instr & 0x01900000) == 0x01000000)
		return EmitErrInvalidInput;
	
	now = jitDataProcessingIsTerminal(op, rdNo, cc == EmitCcAl, terminateP);
	
	return (now != EmitErrNone) ? now : jitEmitAluOpImm(dest, cc, op, instrAddr, rdNo, rnNo, immValRaw, immShift, setFlags);
}

//Decode one ARM instr and report which registers it reads and which it writes.
//	instr			- the ARM instr to analyze
//	regsReadP		- out: bitmask of regs the instr reads (bit N set = rN)
//	regsWrittenP	- out: bitmask of regs the instr writes (bit N set = rN)
// Returns true if we should stop and process no further instrs: the instr is a branch, is
// undefined, is a coprocessor instr (not decoded here), or writes PC. Both masks are stored
// whether or not the instr is terminal.
static bool jitPrvAnalyzeRegsUseOneInstr(uint32_t instr, uint32_t *regsReadP, uint32_t *regsWrittenP)
{
	//one-bit masks for the four 4-bit register fields of an ARM instr. What each field means depends on the instr
	const uint32_t regAt0 = 1u << ((instr >> 0) & 0x0f);
	const uint32_t regAt8 = 1u << ((instr >> 8) & 0x0f);
	const uint32_t regAt12 = 1u << ((instr >> 12) & 0x0f);
	const uint32_t regAt16 = 1u << ((instr >> 16) & 0x0f);
	uint32_t regsRead = 0, regsWritten = 0;
	bool terminal = false, ignoreRd = false;
	enum JitArmDpOp dpOpCode;
	
	
	if ((instr & 0xFD70F000) == 0xf550f000) {										//PLD
		
		instr &=~ 0x10000000;														//change cond from NV to AL so it decodes below as a load...
		ignoreRd = true;															//...that does not write Rd
	}
	
	if ((instr >> 25) == 0x7d) {													//BLX.imm clobbers LR
		
		regsWritten = 1 << EMIT_REG_NO_LR;
		terminal = true;
	}
	else if ((instr >> 28) == EmitCcNv)											//UDF or BLX - either way terminal
		terminal = true;
	else switch ((instr >> 25) & 7) {
	
		case 0b000:																	//DP reg, table 3.2 and table 3.3
		
			if ((instr & 0x90) == 0x90) {											//table 3.2
				
				if ((instr & 0x01c00060) == 0x00000000) {							//MUL/MLA
					
					if (instr & 0x00200000)											//MLA
						regsRead |= regAt12;										//Rn
					else if (instr & 0x0000f000) {									//UDF
						terminal = true;
						break;
					}
					regsWritten |= regAt16;											//Rd
					regsRead |= regAt0;												//Rm
					regsRead |= regAt8;												//Rs
				}
				else if ((instr & 0x01800060) == 0x00800000) {						//long multiplies
					
					regsWritten |= regAt16;											//RdHi
					regsWritten |= regAt12;											//RdLo
					
					if (instr & 0x00200000) {										//accumulate means RdHi & RdLo are also read
						
						regsRead |= regAt16;										//RdHi
						regsRead |= regAt12;										//RdLo
					}
					
					regsRead |= regAt0;												//Rm
					regsRead |= regAt8;												//Rs
				}
				else if ((instr & 0x01b00f60) == 0x01000000) {						//SWP/SWPB
					
					regsWritten |= regAt12;											//Rd
					regsRead |= regAt16;											//Rn
					regsRead |= regAt0;												//Rm
				}
				else if (!(instr & 0x00000060))										//UDF
					terminal = true;
				else if (!(instr & 0x00400000) && (instr & 0x00000f00))				//UDF (SBZ not zero for reg-reg ops)
					terminal = true;
				else if ((instr & 0x00101040) == 0x00001040)						//UDF (LDRD/STRD to odd-numbered reg)
					terminal = true;
				else {																//adr mode 3 (all valid cases)
					
					//with L (bit 20) clear, bits 6:5 pick: 01 = STRH, 10 = LDRD, 11 = STRD. With L set, all three are loads (LDRH, LDRSB, LDRSH)
					const bool isDouble = (instr & 0x00100040) == 0x00000040;		//LDRD/STRD move Rd and Rd + 1
					const bool isLoad = (instr & 0x00100000) || ((instr & 0x00000060) == 0x00000040);
					const uint32_t rdMask = (isDouble ? 3u : 1u) << ((instr >> 12) & 0x0f);
					
					regsRead |= regAt16;											//Rn is always read
					if (instr & 0x00200000)											//wbak? if so Rn is written
						regsWritten |= regAt16;										//Rn
					if (!(instr & 0x00400000))										//if reg mode, Rm is also read
						regsRead |= regAt0;											//Rm
					
					if (isLoad)														//loads (including LDRD) write Rd
						regsWritten |= rdMask;
					else															//stores read Rd
						regsRead |= rdMask;
				}
			}
			else if ((instr & 0x01900000) == 0x01000000) {							//table 3.3
				
				switch ((instr >> 4) & 0x0f) {
					
					case 0b0000:
						if ((instr & 0x002f0f0f) == 0x000f0000) {					//MRS
							if (((instr >> 12) & 0x0f) != EMIT_REG_NO_PC)
								regsWritten |= regAt12;								//Rd if not PC (else CPSR is written and PC left alone)
						}
						else if ((instr & 0x0020ff00) == 0x0020f000)				//MSR reg
							regsRead |= regAt0;										//Rm
						else														//UDF
							terminal = true;
						break;
					
					case 0b0001:
						if ((instr & 0x00ffff00) == 0x002fff00) {					//BX
							regsRead |= regAt0;										//Rm
							terminal = true;
						}
						else if ((instr & 0x00ff0f00) == 0x006f0f00) {				//CLZ
							regsWritten |= regAt12;									//Rd
							regsRead |= regAt0;										//Rm
						}
						else														//UDF
							terminal = true;
						break;
					
					case 0b0011:
						if ((instr & 0x00ffff00) == 0x002fff00) {					//BLX
							regsWritten |= 1 << EMIT_REG_NO_LR;						//LR clobbered
							regsRead |= regAt0;										//Rm
							terminal = true;
						}
						else														//UDF
							terminal = true;
						break;
					
					case 0b0101:													//EDSP add/sub
						if (instr & 0x00000f00)										//UDF
							terminal = true;
						else {
							regsWritten |= regAt12;									//Rd
							regsRead |= regAt16;									//Rn
							regsRead |= regAt0;										//Rm
						}
						break;
					
					case 0b1000:													//EDSP multiplies require a bit more discrimination
					case 0b1010:
					case 0b1100:
					case 0b1110:
						switch ((instr >> 21) & 3) {
							case 0b00:												//SMLAxy
								regsWritten |= regAt16;								//Rd
								regsRead |= regAt12;								//Rn
								regsRead |= regAt0;									//Rm
								regsRead |= regAt8;									//Rs
								break;
							
							case 0b01:												//SMULWy/SMLAWy
								if (!(instr & 0x20)) 								//SMLAWy
									regsRead |= regAt12;							//Rn
								else if (instr & 0x0000f000) {						//UDF
									terminal = true;
									break;
								}
								regsWritten |= regAt16;								//Rd
								regsRead |= regAt0;									//Rm
								regsRead |= regAt8;									//Rs
								break;
							
							case 0b10:												//SMLALxy: 64-bit accumulator is both read and written
								regsWritten |= regAt16;								//RdHi
								regsWritten |= regAt12;								//RdLo
								regsRead |= regAt16;								//RdHi
								regsRead |= regAt12;								//RdLo
								regsRead |= regAt0;									//Rm
								regsRead |= regAt8;									//Rs
								break;
							
							case 0b11:												//SMULxy
								if (instr & 0x0000f000)								//UDF
									terminal = true;
								else {
									regsWritten |= regAt16;							//Rd
									regsRead |= regAt0;								//Rm
									regsRead |= regAt8;								//Rs
								}
								break;
						}
						break;
					
					default:														//UDF
						terminal = true;
						break;
				}
			}
			else																	//actual DP.reg
				goto dp_common;
			break;
		
		case 0b001:																	//DP imm, MSR imm, UDF
			if ((instr & 0x01b00000) == 0x01000000)									//UDF
				terminal = true;
			else if ((instr & 0x01b00000) != 0x01200000) {							//NOT MSR imm (msr imm uses no regs and isnt terminal)
	
	dp_common:																		//jumped to from above to avoid code duplication
				dpOpCode = (enum JitArmDpOp)((instr >> 21) & 0x0f);
				
				if (dpOpCode == ArmDpOpTst || dpOpCode == ArmDpOpTeq || dpOpCode == ArmDpOpCmp || dpOpCode == ArmDpOpCmn) {
					
					if (instr & 0x0000f000) {										//SBZ not zero?
						terminal = true;
						break;
					}
					ignoreRd = true;												//compare ops produce no Rd
				}
				
				if (dpOpCode != ArmDpOpMov && dpOpCode != ArmDpOpMvn)
					regsRead |= regAt16;											//Rn
				else if (instr & 0x000f0000) {										//SBZ not zero?
					terminal = true;
					break;
				}
				if (!ignoreRd)
					regsWritten |= regAt12;											//Rd
				
				if (!(instr & 0x02000000)) {										//reg mode DP? need Rm and possibly Rs
				
					regsRead |= regAt0;												//Rm
					if (instr & 0x10)
						regsRead |= regAt8;											//Rs
				}
			}
			break;
		
		case 0b011:																	//adr mode 2.reg load/store
			if (instr & 0x10) {														//UDF
				terminal = true;
				break;
			}
			regsRead |= regAt0;														//Rm
			//fallthrough
			
		case 0b010:																	//adr mode 2.imm load/store
			regsRead |= regAt16;													//Rn is always read
			if (instr & 0x00200000)													//wbak? if so Rn is written
				regsWritten |= regAt16;												//Rn
			if (ignoreRd)															//preload sets this
				{}																	//nothing
			else if (instr & 0x00100000)											//if load, Rd is written
				regsWritten |= regAt12;												//Rd
			else																	//if store, Rd is read
				regsRead |= regAt12;												//Rd
			break;
		
		case 0b100:																	//LDM/STM
			regsRead |= regAt16;													//Rn is always read
			if (instr & 0x00200000)													//wbak? if so Rn is written
				regsWritten |= regAt16;												//Rn
			if (instr & 0x00100000)													//if load, reg set is written
				regsWritten |= instr & 0xffff;
			else																	//if store, reg set is read
				regsRead |= instr & 0xffff;
			break;
		
		case 0b101:																	//B and BL
			if (instr & 0x01000000)													//BL clobbers LR so we can pre-clobber it
				regsWritten |= 1 << EMIT_REG_NO_LR;
			terminal = true;
			break;
		
		case 0b110:																	//CP instrs
		case 0b111:																	//CP instrs and UDF
			terminal = true;
			break;
	}
	
	
	if (regsWritten & (1 << EMIT_REG_NO_PC))										//any PC write is for sure terminal
		terminal = true;
	
	*regsReadP = regsRead;
	*regsWrittenP = regsWritten;
	
	return terminal;
}

//Find regs that translated code may clobber at the instr pointed to by curInstrP.
// The current instr and up to 15 instrs after it are analyzed. A reg is clobberable if a later
// unconditional instr overwrites it before any analyzed instr reads it. PC, SP and every reg the
// current instr itself reads or writes are never returned. Analysis stops early at the first
// instr that jitPrvAnalyzeRegsUseOneInstr() reports as terminal.
uint32_t jitPrvFindClobberables(const uint32_t *curInstrP)					//returns a bitmask of clobberables
{
	uint32_t offLimits = (1 << EMIT_REG_NO_PC) | (1 << EMIT_REG_NO_SP);		//regs that may no longer be clobbered
	uint32_t clobberable = 0;
	uint32_t nthInstr = 0;
	
	while (nthInstr < 16) {
		
		const uint32_t instr = curInstrP[nthInstr];
		const uint32_t condField = instr >> 28;
		uint32_t readMask = 0, writeMask = 0;
		const bool isLast = jitPrvAnalyzeRegsUseOneInstr(instr, &readMask, &writeMask);
		
		//whatever an instr consumes must stay intact from here on
		offLimits |= readMask;
		
		if (nthInstr == 0) {
			
			//the "current" instr: its outputs are off limits too
			offLimits |= writeMask;
		}
		else if (condField == EmitCcAl || condField == EmitCcNv) {
			
			//conditional instrs may not execute and thus may not clobber, so only unconditional writes count.
			// a write only helps if the reg was not consumed (or otherwise forbidden) earlier
			clobberable |= writeMask &~ offLimits;
		}
		
		//past a terminal instr further analysis is pointless
		if (isLast)
			break;
		
		nthInstr++;
	}
	
	return clobberable;
}

static enum EmitStatus jitTranslateInstr(struct EmitBuf *dest, uint32_t instr, uint32_t instrAddr, enum EmitCc cc, bool *terminateP)
{
	enum EmitStatus now;
	uint32_t tmp;
	
	switch ((instr >> 24) & 0x0F) {

		case 0x00:
		case 0x01:
			now = jitHandleDataProcessingRegReg(dest, instr, instrAddr, cc, terminateP);
			if (now != EmitErrInvalidInput)
				return now;
			
			if ((instr & 0x90) == 0x90)
				return jitHandleTable32(dest, instr, instrAddr, cc, terminateP);
			
			return jitHandleTable33(dest, instr, instrAddr, cc, terminateP);
		
		case 0x02:
		case 0x03:
			now = jitHandleDataProcessingImm(dest, instr, instrAddr, cc, terminateP);
			if (now != EmitErrInvalidInput)
				return now;
			
			//could be MSR_imm
			if ((instr & 0x00b0f000) == 0x0020f000)
				return jitHandleMsrImm(dest, instr, cc);
			return EmitErrInvalidInput;

		case 0x04:
		case 0x05:
		case 0x06:
		case 0x07:
			return jitHandleArmMode2(dest, instr, instrAddr, cc, terminateP);

		case 0x08:
		case 0x09:
			return jitHandleArmMode4(dest, instr, instrAddr, cc, terminateP);

		case 0x0B:	//BL
			*terminateP = true;	//need to translate after this anyways
			return jitEmitBlToArm(dest, cc, instrAddr, jitWorkOutArmBranchTarget((uint32_t*)instrAddr));
		
		case 0x0A:	//B
			fatal("we do not expect B to ever make it here...\n");
		
		case 0x0C:
		case 0x0D:
			//coprocessor loads/store/double reg xfers
			return EmitErrInvalidInput;
		
		case 0x0E:
			//coproc data processing, coproc reg xfers
			return EmitErrInvalidInput;
			
		case 0x0F:
			return jitHandleSwi(dest, instr, instrAddr, cc, terminateP);

		default:
			return EmitErrInvalidInput;
	}
}

static enum EmitStatus jitPrvSpillLiterals(struct JitTls* state, struct EmitBuf *dest, bool jumpOver)
{
	struct LiteralPoolEntry *entries = state->litPool, *entry = entries;
	uint32_t i, numLits, *words, curEmittedWordIdx = 0;
	struct EmitBuf jumpOverBuf, loadBuf;
	enum EmitStatus now;
	
	//bail if nothing to spill
	if (!state->literalIdx)
		return EmitErrNone;
	
	//get and clear state (in case we error out, we do not want to leave literals in the pool)
	numLits = state->literalIdx;
	state->literalIdx = 0;
	
	if (!numLits)
		return EmitErrNone;
	
	//create the jump over if needed
	if (jumpOver)
		EMIT(SaveSpace, &jumpOverBuf, 1);
	
	//align to word
	if (((uintptr_t)dest->buf) & 2)
		EMIT(LLnop, false);
	
	words = (uint32_t*)dest->buf;
	
	for (i = 0; i < numLits; i++, entry++) {
		
		uint32_t wordIdx = entry->identical ? entries[entry->idxOfIdenticalLiteral].value : curEmittedWordIdx;
		uint32_t instrPos = (uintptr_t)(((uint16_t*)dest->bufStart) + entry->loc);
		uint32_t wordPos = (uintptr_t)(words + wordIdx);
		uint32_t instrPc = (instrPos &~ 3) + 4;
		uint32_t ofst = wordPos - instrPc;		//GUARANTEED possible due to encodings
				
		emitBufferInit(&loadBuf, (void*)instrPos, sizeof(uint16_t) * (EMIT_IS_LOREG(entry->reg) ? 1 : 2));
		
		EMIT_TO(LLloadImm, &loadBuf, entry->reg, EMIT_REG_NO_PC, ofst, EmitSzWord, false, EmitAdrModeIndex);
		
		if (!entry->identical) {	//this entry HAS a word, we need to emit it
			
			//emit the word
			EMIT(LLrawHalfword, entry->value & 0xffff);
			EMIT(LLrawHalfword, entry->value >> 16);
		
			//remember where it was (in case there are idential entried pointing to this one)
			entry->value = curEmittedWordIdx;
		
			//account for it
			curEmittedWordIdx++;
		}
	}
	
	//jump over if needed
	if (jumpOver) {
		now = jitEmitIntraTuBranch(&jumpOverBuf, emitGetPtrToJumpHere(dest), EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Check whether "code" is a call-through-linker-stub sequence: one of two known prologues, a
// "LDR R2, [R2, #imm]" whose immediate encodes the module ID, a fixed middle part, a BL to the
// sys linker stub, and a fixed epilogue.
//return module ID of caller if success, else negative
static int32_t jitPatternMatchLinkerStubInvocation(const uint32_t *code)
{
	//this format violates ABI but is often used:			 	STMFD SP!, {R0-R3,LR}; SUB R2, SP, #4; ADR R1, . + 4; STR PC, [R2]; LDR R2, [R2]; ...
	static const uint32_t prologueNonAbi[] = {0xE92D400F, 0xE24D2004, 0xE24F1004, 0xE582F000, 0xE5922000, 0xE0422001, 0xE2822014, 0xE59D1018, 0xE0411002, 0xE58D1018, 0xE5910008, 0xE3A03D40, 0xE2433001, 0xE0000003, 0xE5992000};
	//this format sticks with the ABI but is not used often:	STMFD SP!, {R0-R3,LR}; ADR R1, . + 4; STMFD SP!, {PC}; LDMFD SP!, {R2}; ...
	static const uint32_t prologueAbi[] = {0xE92D400F, 0xE24F1004, 0xE92D8000, 0xE8BD0004, 0xE0422001, 0xE2822014, 0xE59D1018, 0xE0411002, 0xE58D1018, 0xE5910008, 0xE3A03D40, 0xE2433001, 0xE0000003, 0xE5992000};
	static const uint32_t middlePart[] = {0xE0822000, 0xE5910004, 0xE0001003, 0xE1A01121, 0xE59D0014};
	static const uint32_t epilogue[] = {0xE8BD400F, 0xE8BD1000, 0xE8BD1000, 0xE1A0F00C};
	const uint32_t *cursor = code, *callee;
	uint32_t word, moduleId;
	
	//either prologue is acceptable
	if (memcmp(cursor, prologueNonAbi, sizeof(prologueNonAbi)) == 0)
		cursor += sizeof(prologueNonAbi) / sizeof(prologueNonAbi[0]);
	else if (memcmp(cursor, prologueAbi, sizeof(prologueAbi)) == 0)
		cursor += sizeof(prologueAbi) / sizeof(prologueAbi[0]);
	else
		return -1;
	
	//the load whose 12-bit offset is the module ID times 4
	word = *cursor++;
	if ((word & 0xfffff000) != 0xe5922000)
		return -1;
	moduleId = (word & 0xfff) >> 2;
	
	if (memcmp(cursor, middlePart, sizeof(middlePart)) != 0)
		return -1;
	cursor += sizeof(middlePart) / sizeof(middlePart[0]);
	
	//must be an unconditional BL
	word = *cursor++;
	if ((word & 0xff000000) != 0xeb000000)
		return -1;
	
	//verify that we are calling sys linker stub. cursor is one word past the BL, ARM PC is two words past it
	callee = cursor + 1 + (((int32_t)(word << 8)) >> 8);
	if (callee[0] != 0xe519c008 || callee[1] != 0xe59cf87c)
		return -1;
	
	if (memcmp(cursor, epilogue, sizeof(epilogue)) != 0)
		return -1;
	
	return moduleId;
}

//Naked assembly trampoline that calls SysLinkerStub on behalf of a translated linker-stub invocation.
// Stack on entry, as this code reads it after its own push of {r0-r3, lr}:
//	[sp, #0x10]	the lr we just pushed (restored into lr before leaving)
//	[sp, #0x14]	passed to SysLinkerStub in r0 ("module descriptor" per the comment below)
//	[sp, #0x18]	address of the stub func; two halfwords are read from it (at +4 and +8) and it is
//				the word that the final "pop {pc}" loads (or that is left on top of the stack for
//				jitPrvPopPcCallout)
// r9 is dereferenced to find the table, so it must be valid on entry.
// NOTE(review): the stack words at 0x14/0x18 are presumably pushed by the emitted callout code - confirm against the emitter
static void __attribute__((used, naked)) LinkerStubCallout(void) {
	//some of this can be written better using thumb2, but this is cleaner and this is not a bottleneck, so leave it be!
	asm volatile(
		".syntax unified					\n\t"
		"	push   {r0-r3, lr}				\n\t"
		
		//our callout generation code adjusts this such that the address pushed is already func start. easy peasy
		"	ldr    r0, [sp, #0x18]			\n\t"	//get stashed addr of stub func
		"	ldrh   r1, [r0, #0x04]			\n\t"	//get current module id (normal stub has it hardcoded, but we cannot afford that)
		"	ldrh   r2, [r0, #0x08]			\n\t"	//get the point to where the table goes	
		
		//isolate module ID and globals offset for the table: clear the top 4 bits of each halfword, keeping the low 12
		"	movs   r3, #0xf					\n\t"
		"	lsls   r3, #12					\n\t"
		"	bics   r1, r3					\n\t"
		"	bics   r2, r3					\n\t"
		
		//get pointer to table: r3 = *(*r9 + r1), then add the offset from r2
		"	mov    r3, r9					\n\t"
		"	ldr    r3, [r3]					\n\t"
		"	ldr    r3, [r3, r1]				\n\t"
		"	adds   r2, r3					\n\t"
			
		//call SysLinkerStub
		"	ldr    r0, [sp, #0x14]			\n\t"	//module descriptor
		"	lsrs   r1, #2					\n\t"	//client ID
		"	adds   r2, #1					\n\t"	//tell linker stub that we can handle thumb instructions in the table (our emitted stubs can)
		"	bl     SysLinkerStub			\n\t"
			
		//fix things up and return: restore lr and r0-r3, then drop the saved-lr word and the word that was at [sp, #0x14]
		"	ldr    r0, [sp, #0x10]			\n\t"
		"	mov    lr, r0					\n\t"
		"	pop    {r0-r3}					\n\t"
		"	add    sp, #8					\n\t"
	#ifdef BUILD_FOR_THUMB_1			//we have no such callout here and it would be hard to make one - just do it
		"	pop    {pc}						\n\t"
	#else
		"	b      jitPrvPopPcCallout		\n\t"
	#endif
	
	:
	:
	:"cc", "memory"
	);
}

//Expand a 12-bit ARM "shifter operand" immediate: the low 8 bits rotated right by twice the
// value of the 4-bit field above them. A zero rotate field returns val unchanged.
static uint32_t armShifterImmDecode(uint32_t val)
{
	const uint32_t imm8 = val & 0xFF;
	const uint32_t rotateBy = (val >> 8) * 2;
	
	if (!rotateBy)
		return val;
	
	//rotate right; the two halves never overlap for an 8-bit value, so OR equals the sum
	return (imm8 >> rotateBy) | (imm8 << (32 - rotateBy));
}

static bool jitParseArmMovToRegFomRegShiftImm(uint32_t instr, uint8_t* rdNoP, uint8_t* rmNoP, enum EmitShiftType *shiftTypeP, uint8_t *shiftAmtP, enum EmitCc *ccP)
{
	enum EmitShiftType shiftType = (enum EmitShiftType)((instr >> 5) & 0x03);
	uint32_t shiftAmt = (instr >> 7) & 0x1F;
	
	if ((instr & 0x0fff0010) != 0x01a00000)
		return false;
	
	switch (shiftType) {			//we do not care about RRX here
		case EmitShiftLsr:
		case EmitShiftAsr:
			if (!shiftAmt)
				shiftAmt = 32;
			break;
		default:
			break;
	}
	
	if (rdNoP)
		*rdNoP = (instr >> 12) & 0x0F;
	
	if (rmNoP)
		*rmNoP = instr & 0x0F;
	
	if (shiftTypeP)
		*shiftTypeP = shiftType;
	
	if (shiftAmtP)
		*shiftAmtP = shiftAmt;
	
	if (ccP)
		*ccP = (enum EmitCc)(instr >> 28);
	
	return true;
}

#ifdef BUILD_FOR_THUMB_1
	#include "emuJitPatMatchAndPeepholeReader_m0.inc.h"
#else
	#include "emuJitPatMatchAndPeepholeReader_m3.inc.h"
#endif

//Resolve recorded forward jumps that target the ARM instr we are about to translate (instrAddr).
// Each matching jump is re-emitted in place as an intra-TU branch to the current output position,
// and the rest of the space it occupied is nop-filled. Resolved entries are removed from fwdJumps;
// *fwdJumpIdxP is the number of used entries and is updated. The cc parameter is not used - every
// recorded jump carries its own condition code.
//return false on any error, else true. if we fail, we may have corrupted the old place, so do not continue with this TU
static bool jitMatchPastFutureBranches(struct EmitBuf* dest, uint32_t instrAddrInitial, uint32_t instrAddr, enum EmitCc cc, struct FwdJumpInfo *fwdJumps, uint16_t *fwdJumpIdxP)
{
	uint32_t destOfst = (instrAddr - instrAddrInitial) / 4;		//jump targets are stored as instr counts from TU start
	uint32_t len, i = 0;
	enum EmitStatus now;
	
	while (i < *fwdJumpIdxP) {
		
		struct EmitBuf tbuf;
		
		if (fwdJumps[i].dest != destOfst) {
			i++;
			continue;
		}
		
		//match
		len = fwdJumps[i].curLen;
		logjt("Forward jump at 0x%08x (ofst %u) matched. It was %u halfwords and had cc 0x%02x. Sending it to 0x%08x (ofst %u)\n",
			((uint16_t*)dest->bufStart) + fwdJumps[i].loc, fwdJumps[i].loc, len,
			fwdJumps[i].cc, dest->buf, (uint16_t*)dest->buf - (uint16_t*)dest->bufStart);
		
		//re-emit over the old jump, limited to the space it took up
		emitBufferInit(&tbuf, ((uint16_t*)dest->bufStart) + fwdJumps[i].loc, sizeof(uint16_t) * len);
		
		now = jitEmitIntraTuBranch(&tbuf, emitGetPtrToJumpHere(dest), (enum EmitCc)fwdJumps[i].cc);
		if (now != EmitErrNone) {
			loge("adjusting past future branch failed: %d\n", now);
			return false;
		}
		logjt("Adjusted past future branch. Appending %u nops\n", (uint16_t*)tbuf.bufEnd - (uint16_t*)tbuf.buf);
		now = jitEmitNopFill(&tbuf);
		if (now != EmitErrNone) {
			//must be "false": returning the (non-zero) status from a bool function would report success
			loge("nop-filling past future branch failed: %d\n", now);
			return false;
		}
		
		//remove this entry from the list by replacing it with last and decrementing count.
		// i is not advanced so that the entry moved into this slot gets checked too
		fwdJumps[i] = fwdJumps[--(*fwdJumpIdxP)];
	}
	
	return true;
}

//Trace-log one translation step: the ARM instrs consumed and the thumb code produced for them.
// Does nothing unless LOG_JIT_TRACE is set. Failing to disassemble an ARM instr is fatal.
static void jitDebugPrintXlation(uint32_t armAddr, uint32_t armWords, const uint16_t *thumbCode, uint32_t thumbHalfwords)
{
	const uint16_t *const thumbEnd = thumbCode + thumbHalfwords;
	uint32_t instrsLeft;
	char text[64];
	
	if (!(LOG_JIT_TRACE))
		return;
	
	logjt("Xltd ARM (%u instrs):\n", armWords);
	
	//ARM side: every instr is exactly 4 bytes
	for (instrsLeft = armWords; instrsLeft; instrsLeft--, armAddr += sizeof(uint32_t)) {
		
		if (disasm(text, (const void*)armAddr, false, false) != 4)
			fatal("cannot disassemble ARM instr: [0x%08x] = 0x%08x", armAddr, *(uint32_t*)armAddr);
		
		logjt("\t%s\n", text);
	}
	
	if (!thumbHalfwords) {
		logjt("to thumb (0 halfwords) : {}\n");
		return;
	}
	
	//thumb side: disasm() reports how many bytes each instr took
	logjt("to thumb (%u halfwords): {\n", thumbHalfwords);
	while (thumbCode < thumbEnd) {
		thumbCode += disasm(text, thumbCode, true, true) / sizeof(uint16_t);
		logjt("\t%s\n", text);
	}
	logjt("}\n");
}

//Translate the ARM code at the requested address into a new translation unit (TU) and insert it into the translation cache (TC).
//for various reasons, desired addr is passed in TLS.slot[TLS_SLOT_JIT_ADDR]
//	tcFlushedP	- optional (may be NULL). Set to true if the TC was flushed while translating; never set to false here
// Returns the new TU. If translation runs out of TC space, the TC is flushed and translation is retried
// unless the attempt already began with an empty TC. Every other failure ends in fatal().
// Translation first offers the code to the whole-function pattern matcher. If that declines, instrs are
// translated one by one (peephole reader, branch handling, generic translation) until a terminating instr.
struct TU* __attribute__((used)) jitTranslateNewTu(bool* tcFlushedP)
{
	uint32_t instr = 0, instrAddrInitial, instrAddr, maxCodeWords, shortLoopEntryWriteLoc, r9, tuAddr;
	struct EmitBuf dest = {};
	bool startedWithEmptyTc;
	enum EmitStatus now;
	
	r9 = ralSetSafeR9();	//we're running in who the hell knows what context - this is a good safe thing to do
	
	//do this before anything else to make sure re-entrancy is resolved
	tuAddr = instrAddrInitial = (uintptr_t)TLS.slot[TLS_SLOT_JIT_ADDR];
	asm volatile("":::"memory");	//verify this is done before all after this
	
	
	if (instrAddrInitial < 0x100000)
		fatal("Refusing to translate below 1M (requested 0x%08x) - there should be no ARM code at NULL!\n", instrAddrInitial);
	
	if (instrAddrInitial & 3) {
		//codewarrior likes to assume that thumb "BX PC" will round PC DOWN to multiple of 4. v7M does not do this
		
		if ((instrAddrInitial & 3) == 2 && *(uint16_t*)(instrAddrInitial - 4) == 0x4778) {
		
			//the infamous arm v4 BX PC quirk. note that the TU is still registered under the unrounded address (tuAddr)
			logw("Trying to JIT 0x%08x, assuming ARMv4 \"BX PC\" quirk\n", instrAddrInitial);
			instrAddrInitial &=~ 3;
		}
		else
			fatal("UNALIGNED ARM ADDR PASSED TO JIT: 0x%08x\n", instrAddrInitial);
	}
	
	do {
		bool terminate = false, tcFlushed, forceTerminated = false;
		uint32_t maxInstrs, nHalfwordsToSave;
		enum EmitCc cc = EmitCcAl;
		uint16_t fwdJumpIdx = 0;						//number of entries in fwd jump list that is used. 0 .. FORWARD_JUMP_CACHE_SZ - 1
		struct TU *tu, *extant;
		struct JitTls* state;
		
		state = jitGetState(true);
		startedWithEmptyTc = !state->tcLevel;
		state->literalIdx = 0;
		
		maxInstrs = state->tcSize / 4;	//a neat-enough heurstic
		
		#ifdef SUPPORT_ICACHE_FLUSH
			uint64_t curFlushCt;
			
			//re-read until two reads agree, so that a 64-bit counter updated under us is not seen half-written
			do {
				curFlushCt = mCurFlushCount;
				asm volatile("":::"memory");
			} while (curFlushCt != mCurFlushCount);
			
			if (state->lastFlushCount != curFlushCt) {		//flush icache.
				
				//why only in xlate? old code was already running and invalidating it would be undefined behaviour anyways
				// assume no one does it
				emuJitInstrCacheClearAll();
				state->lastFlushCount = curFlushCt;
			}
		#endif
	
		shortLoopEntryWriteLoc = 0;
		instrAddr = instrAddrInitial;
		tcFlushed = false;
		tu = jitTuAllocate(&maxCodeWords, &tcFlushed);
		if (tcFlushed) {
			state = jitGetState(true);
			if (tcFlushedP)
				*tcFlushedP = true;
		}
		
		tu->baseAddr = tuAddr;
		emitBufferInit(&dest, tu->code, maxCodeWords * sizeof(*tu->code));
		logjt("Translating ARM @ 0x%08X to tu code at 0x%08x\n", instrAddr, tu->code);
		
		now = jitEmitTuPrologue(&dest, tuAddr);
		if (now != EmitErrNone){
			if (now != EmitErrNoSpace)
				logw("prologue failed");
			goto xlateFail;
		}
		
		//the matcher reports a flush via our local flag (never via the possibly-NULL tcFlushedP) so that
		// the state pointer gets re-fetched below, same as after jitTuAllocate()
		tcFlushed = false;
		now = jitPatternMatcherMatch(&tu, &dest, (const uint32_t*)instrAddr, &tcFlushed);
		if (tcFlushed) {
			state = jitGetState(true);
			if (tcFlushedP)
				*tcFlushedP = true;
		}
		if (now == EmitErrNone) {
			logjt("patern matcher matched!\n");
		}
		else if (now == EmitErrNoSpace) {
			logjt("pattern matcher result ran us out of space\n");
			goto xlateFail;
		}
		else if (now != EmitErrInvalidInput)
			fatal("unexpected error from pattern matcher: %d. src was 0x%08x\n", now, instrAddr);
		else {
			
			//no pattern matched: translate instr by instr
			do {
				uint32_t jumpTo = 0, peepholeStart;
				uint16_t *outputStart = dest.buf;
				bool tuWasFound = false;
				
				instr = *(uint32_t*)instrAddr;
				cc = (enum EmitCc)(instr >> 28);
				
				//see if we need to spill some literals due to range
				if (state->literalIdx && ((uint16_t*)dest.buf - (uint16_t*)dest.bufStart) - state->litPool[0].loc >= MAX_LITERALS_RANGE) {
					
					now = jitPrvSpillLiterals(state, &dest, true);
					if (now != EmitErrNone)
						goto xlateFail;
				}
				
				//earlier forward jumps to this very instr can now be pointed at it
				if (!jitMatchPastFutureBranches(&dest, instrAddrInitial, instrAddr, cc, state->fwdJumps, &fwdJumpIdx)) {
					now = EmitErrInternalErr;
					goto xlateFail;
				}
			
				#ifdef JUMP_TO_EXTANT_XLATIONS
					//if this addr has been translated already, just go there and call it done
					extant = jitTuFindByExactAddr(instrAddr);
					if (extant) {
						logjt("Already have translation of 0x%08x (at 0x%08x) -> going there\n", instrAddr, extant->code);

						now = jitEmitJumpToAnotherTu(&dest, extant->code, EmitCcAl);
						if (now != EmitErrNone)
							goto xlateFail;

						terminate = true;
						continue;
					}
				#endif
				
				//peephole reader
				peepholeStart = instrAddr;
				now = jitPeepholeCodeReader(&dest, (uint32_t**)&instrAddr, &terminate);
				if (now != EmitErrInvalidInput) {
					
					uint32_t nInstrsConsumed = (instrAddr - peepholeStart) / sizeof(uint32_t);
					
					if (now != EmitErrNone)
						goto xlateFail;
					
					//log it
					jitDebugPrintXlation(peepholeStart, nInstrsConsumed, outputStart, (uint16_t*)dest.buf - outputStart);
					
					//the initial instr is jumpable
					state->shortLoopOfsts[shortLoopEntryWriteLoc] = outputStart - (uint16_t*)dest.bufStart;
					if (++shortLoopEntryWriteLoc == SHORT_LOOP_OPT_SZ)
						shortLoopEntryWriteLoc = 0;
					
					//all others arent: the emitted code has no entry point matching them, so mark their slots as
					// invalid to make the short-loop code below refuse to branch there. written as a countdown
					// to 1 so that a zero count cannot wrap around
					for (; nInstrsConsumed > 1; nInstrsConsumed--) {
						state->shortLoopOfsts[shortLoopEntryWriteLoc] = SHORT_LOOP_OFST_INVALID;
						if (++shortLoopEntryWriteLoc == SHORT_LOOP_OPT_SZ)
							shortLoopEntryWriteLoc = 0;
					}

					continue;
				}
				
				//jumps handled here since there is a lot of special handling to do
				// calculate dest and then handle
				if ((instr & 0x0f000000) == 0x0a000000 && (cc != EmitCcNv)) {
					
					uint16_t *jumpStart = dest.buf;
					
					jumpTo = jitWorkOutArmBranchTarget((uint32_t*)instrAddr);
					
					logjt("jump to 0x%08x from 0x%08x (%s)\n", jumpTo, instrAddr, (jumpTo == instrAddr) ? "SELF" : ((jumpTo > instrAddr) ? "FWD" : "BAK"));
					
					if (jumpTo == instrAddr) {
						
						if (cc != EmitCcAl)
							logjt("conditional (0x%02x) infinite loop detected at 0x%08x\n", cc, instrAddr);
						else {
							logjt("unconditional infinite loop detected at 0x%08x\n", instrAddr);
							terminate = true;
						}
							
						now = jitEmitIntraTuBranch(&dest, emitGetPtrToJumpHere(&dest), cc);
						if (now != EmitErrNone)
							goto xlateFail;
						goto smth_emitted;
					}
					//backwards jumps may be to this same TU - check range and try	to emit a short backwards jump
					if (jumpTo < instrAddr && jumpTo >= instrAddrInitial) {
						
						uint32_t backwardsInstrs = (instrAddr - jumpTo) / 4;
						
						logjt("backwards %u instrs\n", backwardsInstrs);
						
						if (backwardsInstrs < SHORT_LOOP_OPT_SZ) {
							
							int32_t dstOfst, shortLoopArrIdx = -backwardsInstrs;
							
							//shortLoopOfsts is a ring: the slot for the instr N back is N slots before the write position
							shortLoopArrIdx += shortLoopEntryWriteLoc;
							while (shortLoopArrIdx < 0)
								shortLoopArrIdx += SHORT_LOOP_OPT_SZ;
							
							if (state->shortLoopOfsts[shortLoopArrIdx] != SHORT_LOOP_OFST_INVALID) {	//if we CAN jump there (cannot into peephole-produced blocks)
							
								dstOfst = (uint16_t*)dest.buf - (uint16_t*)dest.bufStart - state->shortLoopOfsts[shortLoopArrIdx];	//how many words backwards to jump
								dstOfst = -2 - dstOfst;									//jump target calculation
								
								logjt("short loop opt possible for branching from 0x%08x (ofst %u) to 0x%08x (ofst %u) [instr was 0x%08x, calculated jump offset is %d]\n",
									instrAddr, (uint16_t*)dest.buf - (uint16_t*)dest.bufStart, jumpTo, state->shortLoopOfsts[shortLoopArrIdx], instr, dstOfst);
								
								now = jitEmitIntraTuBranch(&dest, (uintptr_t)(((uint16_t*)dest.bufStart) + state->shortLoopOfsts[shortLoopArrIdx]), cc);
								if (now != EmitErrNone)
									goto xlateFail;
								
								if (cc == EmitCcAl)
									terminate = true;
								goto smth_emitted;
							}
						}
					}
					
					//emit proper jump and record whether we found a TU to jump to
					now = jitEmitJumpToArm(&dest, cc, jumpTo, &tuWasFound);
					if (now != EmitErrNone)
						goto xlateFail;
					terminate = (cc == EmitCcAl);
					
					//jumps forward might be to this same TU. Record them in case we do come to it
					if (jumpTo > instrAddr && (jumpTo - instrAddr) / 4 <= MAX_IN_TU_FWD_JUMP_LEN && !tuWasFound) {
					
						uint32_t wordFromStart = (jumpTo - instrAddrInitial) / 4;
						
						if (wordFromStart < 0x10000) {
							
							//list full? forget the oldest entry
							if (fwdJumpIdx == FORWARD_JUMP_CACHE_SZ) {
								
								memmove(state->fwdJumps, state->fwdJumps + 1, sizeof(state->fwdJumps) - sizeof(*state->fwdJumps));
								fwdJumpIdx--;
							}
							state->fwdJumps[fwdJumpIdx].loc = jumpStart - (uint16_t*)dest.bufStart;
							state->fwdJumps[fwdJumpIdx].dest = wordFromStart;
							state->fwdJumps[fwdJumpIdx].cc = cc;
							state->fwdJumps[fwdJumpIdx].curLen = (uint16_t*)dest.buf - jumpStart;
							fwdJumpIdx++;
						}
					}
					goto smth_emitted;
				}
				
				if (cc == EmitCcNv) {
					
					now = jitTranslateNvInstr(&dest, instr, instrAddr, &terminate);
					if (now != EmitErrNone && now != EmitErrInvalidInput)
						goto xlateFail;
					cc = EmitCcAl;		//NV-space instrs are unconditional
				}
				else {
					
					now = jitTranslateInstr(&dest, instr, instrAddr, cc, &terminate);
					if (now != EmitErrNone && now != EmitErrInvalidInput)
						goto xlateFail;
				}
			
				if (now == EmitErrInvalidInput) {
					logw("Failed to translate instruction 0x%08x at 0x%08x. BB start was at 0x%08x. Emitted %u halfwords so far. Terminating TU translation and adding a UDF.N UNTR\n",
						instr, instrAddr, instrAddrInitial, (uint16_t*)dest.buf - (uint16_t*)dest.bufStart);
					
					now = emitLLudf(&dest, UDF_VAL_UNTRANSLATEABLE, false);
					terminate = true;
				}
			
			smth_emitted:
			
				if (now != EmitErrNone)
					goto xlateFail;
				
				jitDebugPrintXlation(instrAddr, 1, outputStart, (uint16_t*)dest.buf - outputStart);

				state->shortLoopOfsts[shortLoopEntryWriteLoc] = outputStart - (uint16_t*)dest.bufStart;
				if (++shortLoopEntryWriteLoc == SHORT_LOOP_OPT_SZ)
					shortLoopEntryWriteLoc = 0;
			
				instrAddr += 4;
				
				if ((instrAddr - instrAddrInitial) / sizeof(uint32_t) > maxInstrs) {
					
					loge("TU too long - truncating (%u [0x%08x..0x%08x] > %u (max %u))\n", 
						(instrAddr - instrAddrInitial) / sizeof(uint32_t), instrAddrInitial, instrAddr, maxInstrs, state->tcSize);
					forceTerminated = true;
				}
				
			} while (!terminate && !forceTerminated);		//a forced termination must actually end the TU
			
			//if we had a terminating instruction but not with an AL cond code (or were force terminated), we need to generate an unconditional jump to next instr
			if (cc != EmitCcAl || forceTerminated) {
				now = jitEmitJumpToArm(&dest, EmitCcAl, instrAddr, NULL);
				if (now != EmitErrNone)
					goto xlateFail;
			}
			#ifdef SUPPORT_ICACHE_FLUSH
				tu->srcLen = (instrAddr - instrAddrInitial) / sizeof(uint32_t);
			#endif
		}
		
		//spill any literals we might have before inserting the TU
		now = jitPrvSpillLiterals(state, &dest, false);
		if (now != EmitErrNone)
			goto xlateFail;
		
		if (LOG_TRACE) {
			static uint32_t inWords = 0, outHalfwords = 0;
			
			#if LOG_TRACE
				#warning "LOG_TRACE is incompatible with hardware which needs to always handle some IRQs" 
			#endif
			
			//interrupts are off while the running totals are updated and logged
			asm volatile("cpsid i\n":::"memory");
			
			inWords += (instrAddr - instrAddrInitial) / 4;
			outHalfwords += (uint16_t*)dest.buf - (uint16_t*)dest.bufStart;
			
			if (inWords) {
				logt("TC stats so far %u instrs in (%u bytes), %u HWs out (%u bytes): %u ppm of original size\n",
					inWords, inWords * 4 , outHalfwords, outHalfwords * 2,
					(uint32_t)((500000ULL * outHalfwords + inWords / 2) / inWords));
			}
			
			asm volatile("cpsie i":::"memory");
		}
		
		jitTuInsert(tu, &dest);
		ralRestoreR9(r9);
		return tu;
	
	xlateFail:
		if (now == EmitErrNoSpace) {
			
			logjt("failed to make space for translation\n");
			
			jitTcFlush();					//reset TC
			if (tcFlushedP)
				*tcFlushedP = true;
		}
		else
			break;
		
	} while (!startedWithEmptyTc);			//retrying is pointless if even an empty TC could not fit this TU
	
	static const char* codes[] = {
		[EmitErrNone] = "none",
		[EmitErrNoSpace] = "out of space",
		[EmitErrNotEncodeable] = "not encodeable",
		[EmitErrInvalidInput] = "unknown instr",
		[EmitErrInternalErr] = "internal error",
	};
	
	fatal("Failed to translate instruction 0x%08x at 0x%08x with error code %d (%s). BB start was at 0x%08x. Emitted %u halfwords so far\n",
		instr, instrAddr, now, (now < sizeof(codes) / sizeof(*codes) && codes[now]) ? codes[now] : "???",
		instrAddrInitial, (uint16_t*)dest.buf - (uint16_t*)dest.bufStart);

	ralRestoreR9(r9);
	return NULL;
}

//Make sure that "slotsNeeded" literal loads can be emitted back-to-back without a literal spill happening between them.
// dest        - emit buffer that a spill (if one turns out to be needed) is written into
// slotsNeeded - how many literal pool slots the caller is about to consume
//Returns EmitErrNone if there already is (or now is) enough room, else the error the spill produced.
enum EmitStatus jitPrvLiteralLoadsFlush(struct EmitBuf *dest, uint32_t slotsNeeded)
{
	struct JitTls* state = jitGetState(true);
	
	//a request this big is rejected outright, no matter how empty the pool is
	if (slotsNeeded >= MAX_LITERALS_BEFORE_SPILL) {
		
		fatal("Can never make this many slots\n");
		return EmitErrNone;
	}
	
	//the slots still free before the spill threshold suffice - nothing to do
	if (!(MAX_LITERALS_BEFORE_SPILL - state->literalIdx < (int32_t)slotsNeeded))
		return EmitErrNone;
	
	//not enough room: spill what we have accumulated and report how that went
	return jitPrvSpillLiterals(state, dest, true);
}

enum EmitStatus jitPrvLiteralLoad(struct EmitBuf *dest, uint32_t regNo, uint32_t val)
{
	struct JitTls* state = jitGetState(true);
	struct LiteralPoolEntry *entry;
	uint32_t identicalIdx, ofst;
	bool hasIdentical = false;
	enum EmitStatus now;
	
	
	//if there is no space, spill now
	if (state->literalIdx == MAX_LITERALS_BEFORE_SPILL) {
		
		now = jitPrvSpillLiterals(state, dest, true);
		if (now != EmitErrNone)
			return now;
	}
	
	//check if we already have this same literal and if so, just use it
	for (identicalIdx = 0; identicalIdx < state->literalIdx; identicalIdx++) {
		
		if (state->litPool[identicalIdx].value == val && !state->litPool[identicalIdx].identical) {
			hasIdentical = true;
			break;
		}
	}
	ofst = (uint16_t*)dest->buf - (uint16_t*)dest->bufStart;
	
	//literal load placeholder (short UDF - 1 halfword)
	EMIT(LLudf, UDF_VAL_LITERAL_LOAD, false);
	
	//hi regs need 2 words so take up another with a nop
	if (!EMIT_IS_LOREG(regNo))
		EMIT(LLnop, false);
	
	entry = &state->litPool[state->literalIdx++];
	entry->reg = regNo;
	entry->loc = ofst;
	
	if (hasIdentical){ 
		
		entry->idxOfIdenticalLiteral = identicalIdx;
		entry->identical = true;
	}
	else {
		
		entry->value = val;
		entry->identical = false;
	}
	
	return EmitErrNone;
}

static void jitPatchJumpPerformPatch(uint16_t* pc, const uint16_t *to)
{
	struct EmitBuf tbuf;
	enum EmitStatus now;
	
	emitBufferInit(&tbuf, (void*)pc, sizeof(uint16_t) * 3 /* we only have that many halfwords */);
	
	now = jitEmitJumpToAnotherTu(&tbuf, to, EmitCcAl);
	if (now != EmitErrNone)
		fatal("Cannot %s! Need jump from 0x%08x to 0x%08x\n", "replace temporary jump with proper one", pc, to);
	
	now = jitEmitNopFill(&tbuf);
	if (now != EmitErrNone)
		fatal("Cannot %s! Need jump from 0x%08x to 0x%08x\n", "NOP pad replaced jump", pc, to);
	
	emitBufCacheCoherencyManage(&tbuf);
}

//C half of jitTranslateAndPatchJumpUserspaceHandler.
//Inputs come from TLS: TLS_SLOT_JIT_JUMP_AT holds the location of the jump that needs patching and
//TLS_SLOT_JIT_ADDR the address it wants to reach. A new TU is translated; unless the translation cache got
//flushed while doing so, the jump is patched to go to the new TU directly.
//Returns the address (thumb bit set) that the asm trampoline should continue execution at.
void* __attribute__((used)) jitTranslateAndPatchJumpUserspaceHandlerC(void)
{
	uint16_t *patchSite, *resumeAt;
	bool flushed = false;
	struct TU *freshTu;
	uint32_t wantedAddr;
	
	(void)jitGetState(true);
	patchSite = (uint16_t*)TLS.slot[TLS_SLOT_JIT_JUMP_AT];
	wantedAddr = (uintptr_t)TLS.slot[TLS_SLOT_JIT_ADDR];
	
	logjt("did not find tu for jump to 0x%08x at  0x%08x - translating\n", wantedAddr, patchSite);
	
	freshTu = jitTranslateNewTu(&flushed);
	if (!freshTu)
		fatal("Cannot translate at 0x%08x\n", wantedAddr);
	
	//after a flush the jump site is not patched
	if (!flushed)
		jitPatchJumpPerformPatch(patchSite, freshTu->code);
	
	//sort out what to return
	if (jitEmitWhereToJumpFromAnotherTu(freshTu->code, &resumeAt) != EmitErrNone)
		fatal("Cannot %s! Need jump from 0x%08x to 0x%08x\n", "find where to go to", patchSite, freshTu->code);
	
	return ((char*)resumeAt) + 1;
}


//C half of jitOnlyPatchJumpUserspaceHandler.
//The destination TU already exists: TLS_SLOT_JIT_ADDR holds a pointer to it and TLS_SLOT_JIT_JUMP_AT the
//location of the jump to patch. Patches the jump, then returns the address (thumb bit set) to continue at.
void* __attribute__((used)) jitOnlyPatchJumpUserspaceHandlerC(void)
{
	uint16_t *patchSite = (uint16_t*)TLS.slot[TLS_SLOT_JIT_JUMP_AT];
	struct TU* target = (struct TU*)TLS.slot[TLS_SLOT_JIT_ADDR];
	uint16_t *resumeAt;
	
	jitPatchJumpPerformPatch(patchSite, target->code);
	
	//sort out what to return
	if (jitEmitWhereToJumpFromAnotherTu(target->code, &resumeAt) != EmitErrNone)
		fatal("Cannot %s! Need jump from 0x%08x to 0x%08x\n", "find where to go to", patchSite, target->code);
	
	return ((char*)resumeAt) + 1;
}

//Asm trampoline that jitPatchJump() redirects the exception-return PC to when the jump target has no translation yet.
//It stashes r0-r3, r12, lr and the APSR flags (r4, and r5 in the thumb-1 build, are pushed too since they are used
//as temporaries here), calls jitTranslateAndPatchJumpUserspaceHandlerC(), reloads lr from its stack slot and then
//overwrites that very slot with the returned address, so that the final pop restores everything and loads the
//returned address into pc.
static void __attribute__((naked)) jitTranslateAndPatchJumpUserspaceHandler(void)
{
	#ifdef BUILD_FOR_THUMB_1
	
		//thumb-1 flavour: hi regs cannot be pushed/popped directly, so r12 travels via r5 and lr via r1
		asm volatile(
			"	push   {r0-r5, lr}									\n\t"	//stash regs
			"	mov    r5, r12										\n\t"
			"	mrs    r4, apsr										\n\t"
			"	bl     jitTranslateAndPatchJumpUserspaceHandlerC	\n\t"	//so go translate
			"	ldr    r1, [sp, #4 * 6]								\n\t"	//reload lr
			"	mov    lr, r1										\n\t"
			"	str    r0, [sp, #4 * 6]								\n\t"	//store ret addr
			"	msr    apsr_nzcvq, r4								\n\t"	//start popping things
			"	mov    r12, r5										\n\t"
			"	pop    {r0-r5, pc}									\n\t"	//load regs
		);
	
	#else
	
		asm volatile(
			"	push   {r0-r4, r12, lr}								\n\t"	//stash regs
			"	mrs    r4, apsr										\n\t"
			"	bl     jitTranslateAndPatchJumpUserspaceHandlerC	\n\t"	//so go translate
			"	ldr    lr, [sp, #4 * 6]								\n\t"	//reload lr
			"	str    r0, [sp, #4 * 6]								\n\t"	//store ret addr
			"	msr    apsr_nzcvq, r4								\n\t"	//start popping things
			"	pop    {r0-r4, r12, pc}								\n\t"	//load regs
		);
	
	#endif
}

//Asm trampoline that jitPatchJump() redirects the exception-return PC to (JIT_NO_EXC_MODE_WRITES builds) when the
//target TU exists and only the jump needs patching. Same save/restore scheme as
//jitTranslateAndPatchJumpUserspaceHandler, but calls jitOnlyPatchJumpUserspaceHandlerC(). Only instructions that
//are also available in thumb-1 are used, so there is a single flavour of it.
static void __attribute__((naked)) jitOnlyPatchJumpUserspaceHandler(void)
{
	asm volatile(
			"	push   {r0-r5, lr}									\n\t"	//stash regs
			"	mov    r5, r12										\n\t"
			"	mrs    r4, apsr										\n\t"
			"	bl     jitOnlyPatchJumpUserspaceHandlerC			\n\t"	//so go translate
			"	ldr    r1, [sp, #4 * 6]								\n\t"	//reload lr
			"	mov    lr, r1										\n\t"
			"	str    r0, [sp, #4 * 6]								\n\t"	//store ret addr (the slot lr was pushed in)
			"	msr    apsr_nzcvq, r4								\n\t"	//start popping things
			"	mov    r12, r5										\n\t"
			"	pop    {r0-r5, pc}									\n\t"	//load regs
		);
}

//CALLED in EXC mode! This means that we cannot write to ROMRAM region here (will silently fail)
//Handles a to-be-patched jump located at exc->pc whose destination is "addr".
//Nothing is returned - the outcome is delivered by rewriting exc->pc:
// * no translation for "addr" yet: the request is parked in TLS and we resume in jitTranslateAndPatchJumpUserspaceHandler
// * translation exists, JIT_NO_EXC_MODE_WRITES: the request is parked in TLS and we resume in jitOnlyPatchJumpUserspaceHandler
// * translation exists, otherwise: the jump is patched right here and we resume in the translated code
void __attribute__((used)) jitPatchJump(struct CortexExcFrame* exc, uint32_t addr)
{
	uint16_t *patchSite = (uint16_t*)exc->pc;
	struct TU* target;

	logjt("need to patch arm jump to 0x%08x at 0x%08x\n", addr, patchSite);
	
	//do a quick lookup in exception mode for speed
	target = jitTuFindByExactAddr(addr);
	if (!target) {
		
		//nothing translated there yet - translate and patch in userspace
		(void)jitGetState(true);
		
		TLS.slot[TLS_SLOT_JIT_ADDR] = addr;
		TLS.slot[TLS_SLOT_JIT_JUMP_AT] = (uintptr_t)patchSite;

		exc->pc = ((uintptr_t)&jitTranslateAndPatchJumpUserspaceHandler) &~ 1;
		return;
	}
	
	#ifdef JIT_NO_EXC_MODE_WRITES
		
		//we cannot write to the state in exc mode - we'll do the patching in userspace, but we need to preserve the state
		TLS.slot[TLS_SLOT_JIT_ADDR] = (uintptr_t)target;
		TLS.slot[TLS_SLOT_JIT_JUMP_AT] = (uintptr_t)patchSite;
		exc->pc = ((uintptr_t)&jitOnlyPatchJumpUserspaceHandler) &~ 1;
	
	#else
	{
		uint16_t *resumeAt;
		
		jitPatchJumpPerformPatch(patchSite, target->code);
		
		//sort out where to continue
		if (jitEmitWhereToJumpFromAnotherTu(target->code, &resumeAt) != EmitErrNone)
			fatal("Cannot %s! Need jump from 0x%08x to 0x%08x\n", "find where to go to", patchSite, target->code);
		
		exc->pc = (uintptr_t)resumeAt;
	}
	#endif
}

struct CortexExcFrame* __attribute__((used)) emuJitHandleCallToDeletedTu(struct CortexExcFrame *ctx)
{
	struct TU *dst, *src = (struct TU*)(((uintptr_t)ctx->pc) - offsetof(struct TU, code));
	uint32_t srcAddr = src->baseAddr;
	
	dst = jitTuFindByExactAddr(srcAddr);

	fatal("deleted TUs untested on m0\n");

	if (dst) {	//jumps between TUs should fit into two instrs
		
		struct EmitBuf dest;
		enum EmitStatus now;
		
		emitBufferInit(&dest, src->code, sizeof(uint16_t) * 2 /* every TU has space for at least 2 halfwords, as ensured by jitTuInsert() */);
		
		now = jitEmitJumpToAnotherTu(&dest, dst->code, EmitCcAl);
		if (now != EmitErrNone)
			fatal("Failed to emit replacement jump from 0x%08x to 0x%08x (src addr 0x%08x)\n", src->code, dst->code, srcAddr);
		
		emitBufCacheCoherencyManage(&dest);
	}
	else {		//we have no translation, creating a special path here to it is a pain, let the normal path take it
		
		ctx->sr &=~ CORTEX_SR_FLAG_T;
		ctx->pc = srcAddr;
	}
	return ctx;
}

struct JitBackendRuntimeData *jitPrvGetRuntimeDataPtr(void)
{
	struct JitTls* state = jitGetState(true);
	
	return &state->backendRuntimeData;
}

#ifdef BUILD_FOR_THUMB_1
	
	/*
	 * Asm fragment (thumb-1): continue execution at the address in r2 by way of the TU hash.
	 * Expects r0 = exception frame and r2 = wanted address; r1 and r3 are used as scratch.
	 * On a hash hit the stacked PC (frame offset 0x18) is pointed at the TU's code; on a miss (or if there
	 * is no hash table at all) the wanted address is stored to the TLS slot given by %2 and the stacked PC
	 * is pointed at jitUsermodeEntry. Both ways end in "bx lr".
	 * Asm operands (see EXTRA_ASM_INPUTS): %0 = TLS offset of the JIT state, %1 = offsetof(struct TU, code),
	 * %2 = TLS offset of the JIT address slot.
	 */
	#define CODE_TO_GO_TO_ARM																												\
		/* now we need to lookup in the hash */																								\
		HASH_ADDRESS(r3, r2)								/* hash the addr */																\
																																			\
		"	ldr    r1, =TLS							\n\t"	/* get pointer to hash array (if any) */										\
		"	ldr    r1, [r1, %0]						\n\t"																					\
		"	cmp    r1, #0							\n\t"	/* and there could be nothing there - handle that! */							\
		"	beq    6f								\n\t"																					\
		"	lsls   r3, #2							\n\t"	/* get pointer to hash chain root ptr */										\
		"	ldr    r1, [r1, r3]						\n\t"																					\
																																			\
		"7:											\n\t"	/* hash_search_loop: lookup loop */												\
		"	cmp    r1, #0							\n\t"	/* if NULL, translate */														\
		"	beq    6f								\n\t"																					\
		"	ldr    r3, [r1, #0]						\n\t"	/* grab the base addr */														\
		"	cmp    r3, r2							\n\t"																					\
		"	bne    8f								\n\t"	/* miss -> try next bucket */													\
		"	adds   r1, %1							\n\t"	/* point r1 to jump address */													\
		"	str    r1, [r0, #0x18]					\n\t"	/* save it on stack where we can pop it */										\
		"	bx     lr								\n\t"	/* return directly to translated code */										\
		"8:											\n\t"	/* not_this_one: try next link in this bucket */								\
		"	ldr    r1, [r1, #4]						\n\t"	/* load next link */															\
		"	b      7b								\n\t"	/* and go on */																	\
																																			\
		"6:											\n\t"	/* hash_lookup_fail: need to translate  */										\
		"	ldr    r1, =TLS							\n\t"																					\
		"	str    r2, [r1, %2]						\n\t"	/* store desired address */														\
		"	ldr    r2, =jitUsermodeEntry			\n\t"	/* get pointer to code we'll run in user mode in thread */						\
		"	str    r2, [r0, #0x18]					\n\t"	/* point PC there */															\
		"	bx     lr								\n\t"	/* and bail */
	
	/*
	 * Asm fragment (thumb-1): like CODE_TO_GO_TO_ARM, but the wanted address is first loaded into r2 from
	 * the stacked PC of the exception frame in r0 (frame offset 0x18).
	 */
	#define CODE_GOTO_ARM_PC_IN_EXC_VALID																									\
																																			\
		"	ldr    r2, [r0, #0x18]					\n\t"	/* load the ARM pc */															\
		CODE_TO_GO_TO_ARM									/* same as before :)  */
	
	#ifdef HAVE_v8M_BASE
		#define LOAD(reg, val)	"	movw   " reg ", " val
	#else
		#define LOAD(reg, val)	"	ldr   " reg ", =" val
	#endif
	
	/*
	 * Asm fragment (thumb-1): if r1 equals operand %3 (the UDF.N encoding carrying UDF_VAL_DELETED_TU, see
	 * EXTRA_ASM_INPUTS), tail-jump to emuJitHandleCallToDeletedTu; otherwise fall through at label 5. Clobbers r3.
	 * Presumably r1 holds the faulting 16-bit instruction when the dispatcher runs this - confirm in the included dispatcher header.
	 */
	#define CODE_FOR_POSSIBLY_UNDEF_16BIT_INSTR																								\
																																			\
		LOAD("r3", "%3")" 							\n\t"	/* maybe UDF_VAL_DELETED_TU ? */												\
		"	cmp   r1, r3							\n\t"																					\
		"	bne   5f								\n\t"																					\
		"	ldr   r3, =emuJitHandleCallToDeletedTu	\n\t"																					\
		"	bx    r3								\n\t"																					\
		"5:											\n\t"
	
	/*
	 * Asm fragment (thumb-1): recognize the UDF.W that stands in for a not-yet-patched jump.
	 * r1 and r2 are matched against INSTR_UDF_W_1 / INSTR_UDF_W_2 (operands %4 and %5); presumably they hold
	 * the first and second halfword of the faulting instruction - confirm in the included dispatcher header.
	 * On a match the 32-bit destination is assembled into r1 from the low 4 bits of r1 (-> bits 31..28),
	 * the low 12 bits of r2 (-> bits 27..16) and the halfword at stacked PC + 4 (-> bits 15..0), and we
	 * tail-jump to jitPatchJump with r0 still being the exception frame. No match: fall through at label 5.
	 * Clobbers r3 and r12.
	 */
	#define CODE_FOR_UNDEF_32BIT_INSTR																										\
																																			\
		LOAD("r3", "%4")"							\n\t"	/* maybe this is a udf.w ? */													\
		"	mov   r12, r3							\n\t"																					\
		"	lsrs  r3, r1, #4						\n\t"																					\
		"	cmp   r3, r12							\n\t"																					\
		"	bne   5f								\n\t"																					\
		"	lsrs  r3, r2, #12						\n\t"																					\
		"	cmp   r3, %5							\n\t"																					\
		"	bne   5f								\n\t"																					\
		"is_udf_w:									\n\t"	/* we need to assemble the entire address (hi in udf, lo in next halfword) */	\
		"	lsls  r1, #28							\n\t"																					\
		"	lsls  r2, #20							\n\t"																					\
		"	lsrs  r2, #4							\n\t"																					\
		"	ldr   r3, [r0, #0x18]					\n\t"	/* re-grab PC */																\
		"	ldrh  r3, [r3, #4]						\n\t"	/* get the halfword with lo bits of addr */										\
		"	adds  r1, r3							\n\t"																					\
		"	adds  r1, r2							\n\t"	/* the entire address is now calculated */										\
		"	ldr   r2, =jitPatchJump					\n\t"																					\
		"	bx    r2								\n\t"																					\
		"5:											\n\t"	/* not a udf.w */
	
	/*
	 * Asm fragment (thumb-1) that defines the global label jitUsermodeEntry - the code CODE_TO_GO_TO_ARM points
	 * the stacked PC at when a translation is missing. It stashes r0-r3, r12 (via r5), lr and the APSR flags
	 * (via r4), calls jitTranslateNewTu(NULL), turns the returned TU pointer into its code address with the
	 * thumb bit set, restores lr, stores that address into the stack slot lr was pushed in, and pops
	 * everything so that execution continues in the translated code.
	 */
	#define CODE_AT_END_EXTRA																												\
																																			\
		/* common code for userspace */																										\
		".balign 4									\n\t"																					\
		".globl jitUsermodeEntry					\n\t"																					\
		"jitUsermodeEntry:							\n\t"																					\
		"	push   {r0-r5, lr}						\n\t"	/* stash regs */																\
		"	mov    r5, r12							\n\t"																					\
		"	mrs    r4, apsr							\n\t"																					\
		"	movs   r0, #0							\n\t"	/* bool* tcFlushedP */															\
		"	ldr    r1, =jitTranslateNewTu			\n\t"	/* so go translate. this handler is likely in ram so this is better than BL */	\
		"	blx    r1								\n\t"																					\
		"	adds   r0, %1 + 1						\n\t"	/* point r0 to jump address (with thumb bit) */									\
		"	ldr    r1, [sp, #4 * 6]					\n\t"	/* restore lr */																\
		"	mov    lr, r1							\n\t"																					\
		"	str    r0, [sp, #4 * 6]					\n\t"	/* save it on stack where we can pop it */										\
		"	msr    apsr_nzcvq, r4					\n\t"	/* start popping things */														\
		"	mov    r12, r5							\n\t"																					\
		"	pop    {r0-r5, pc}						\n\t"	/* jump to TC */
		
	
	/* Asm input operands %0..%5 that the CODE_* fragments above refer to; the leading comments give each operand's number */
	#define EXTRA_ASM_INPUTS		/* 0 */ "I"(TLS_SLOT_JIT_STATE * 4),																	\
									/* 1 */ "I"(offsetof(struct TU, code)),																	\
									/* 2 */ "I"(TLS_SLOT_JIT_ADDR * 4),																		\
									/* 3 */ "i"(UDF_VAL_DELETED_TU | INSTR_UDF_N),															\
									/* 4 */ "i"(INSTR_UDF_W_1 >> 4),																		\
									/* 5 */ "I"(INSTR_UDF_W_2 >> 12)

	#ifdef CUSTOM_RP2040_FAULT_DISPATCHER
		#include "m0FaultDispatch_rp2040_ROMRAM.h"
	#else
		#include "m0FaultDispatch_release.h"
	#endif
		
#else
	
	//UsageFault handler (non-thumb-1 build), placed in the ".ramcode" section.
	//Picks the exception frame (MSP or PSP, by bit 2 of lr) into r0, reads UFSR and dispatches:
	// * UNDEFINSTR + BLX.T encoding: sets stacked LR to the return address, then either returns straight to a thumb
	//   OsCall target (fast path) or goes to the ARM destination via try_jump_to_arm_addr
	// * UNDEFINSTR + UDF.W: assembles the destination address and jumps to jitPatchJump(frame, addr)
	// * UNDEFINSTR + UDF.N with code UDF_VAL_DELETED_TU: jumps to emuJitHandleCallToDeletedTu(frame)
	// * INVSTATE: sets the T flag in the stacked status register and treats the stacked PC as an ARM address to go to
	// * NOCP: jumps to schedHandleUsageFaultNoCp
	// * anything else: jumps to faultHandlerWithExcFrame
	//try_jump_to_arm_addr looks the address up in the TU hash; on a hit the stacked PC is pointed at the TU's code,
	//on a miss the address is stored to the TLS_SLOT_JIT_ADDR slot and the stacked PC is pointed at jitUsermodeEntry.
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = CORTEX_SR_FLAG_T,
	//%3 = UDF_VAL_DELETED_TU, %4 = TLS offset of the JIT address slot.
	void __attribute__((naked, used, section(".ramcode"))) UsageFault_Handler(void)
	{
		//the reasons we get here, ranked from most to least common
		// * BX to an arm address (likely a return from thumb code)
		// * BLX.T in thumb code
		// * UDF.W for branch patching
		// * all else
		//we should optimize accordingly
	
		asm volatile(
			".syntax unified								\n\t"
			"	tst    lr, #4								\n\t"	//see which stack fault was on
			"	ite    eq									\n\t"
			"	mrseq  r0, msp								\n\t"	//grab the appropriate SP
			"	mrsne  r0, psp								\n\t"
			"	ldr    r1, =0xE000ED2A						\n\t"	//UFSR address
			"	ldrh   r2, [r1]								\n\t"	//get UFSR
			
			//handle UsageFault if T is set.
			"	lsls   r3, r2, #31							\n\t"	//test for UFSR.UNDEFINSTR (into N) and UFSR.INVSTATE (into C)
			"	bpl    not_undefinstr						\n\t"
			
			//undef instr handler (BLX.T is VERY likely)
			"	movs   r3, #1								\n\t"	//clear UFSR.UNDEFINSTR
			"	strh   r3, [r1]								\n\t"
			"	ldr    r3, [r0, #0x18]						\n\t"	//load PC
			"	ldrh   r2, [r3, #0]							\n\t"	//load first half of instr
			"	lsrs   r1, r2, #11							\n\t"	//verify it is proper for BLX (top 5 bits matter)
			"	cmp    r1, #0x1e							\n\t"
			"	bne    not_blx_t_halfword_read				\n\t"	//not BLX.T
			"	ldrh   r3, [r3, #2]							\n\t"	//load second half of instr
			"	lsrs   r1, r3, #12							\n\t"	//verify it is proper for BLX (top 4 bits, of which bit 13 is a don't-care)
			"	ands   r1, #0x0D							\n\t"
			"	cmp    r1, #0x0C							\n\t"
			"	bne    not_blx_t_word_read					\n\t"	//not BLX.T
			
			//BLX.T for sure. the halfwords are in r2 and r3
			//we calc ofst >> 1
			"is_blx_t:										\n\t"
			
			"	asrs   r3, #1								\n\t"	//place imm10L into place
			"	sbfx   r2, r2, #0, #11						\n\t"	//get imm11 into place (sign-extended)
			"	bfi    r3, r2, #10, #22						\n\t"	//insert into place
			
			//we now have in r3 the offset divided by 4 - calc the destination. we do this this way since we need to round PC to multiple of 4 anyways
			"	ldr    r1, [r0, #0x18]						\n\t"	//load PC
			"	adds   r1, #5								\n\t"	//adjust for expected offset (and also make it a valid thumb ret addr, low bit will go away anyways)
			"	add    r2, r3, r1, lsr #2					\n\t"	//calc destination div 4
			"	lsls   r2, #2								\n\t"	//calc destination addr
			"	str    r1, [r0, #0x14]						\n\t"	//store return "LR" value as BLX does

			//often BLX is used to simply call OsCalls  we have a fast path for that here
			"blx_oscall_fast_path:							\n\t"
			"	ldr    r3, [r2]								\n\t"	//first word at the destination
			"	ldr    r1, =0xE519C000						\n\t"
			"	subs   r3, r1								\n\t"
			"	ands   r12, r3, #0x0C						\n\t"	//table offset in r12
			"	beq    try_jump_to_arm_addr					\n\t"	//[r9, #-0]	is NOT a valid OsCall!
			"	cmp    r3, r12								\n\t"
			"	bne    try_jump_to_arm_addr					\n\t"
			
			"	ldr    r3, [r2, #4]							\n\t"	//second word at the destination
			"	ldr    r1, =0xE59CF000						\n\t"
			"	subs   r3, r1								\n\t"
			"	ubfx   r1, r3, #0, #12						\n\t"	//get call offset to r1
			"	bfc    r3, #2, #10							\n\t"	//bic r3, #0xffc
			"	cbnz   r3, try_jump_to_arm_addr				\n\t"	//verify valid ofst
			
			"oscall_confirmed:								\n\t"
			"	sub    r3, r9, r12							\n\t"
			"	ldr    r3, [r3]								\n\t"
			"	ldr    r2, [r3, r1]							\n\t"
		
			"oscall_addr_calced:							\n\t"
			"	lsrs   r1, r2, #1							\n\t"	//see if thumb addr
			"	ittt   cs									\n\t"	//if so, simply return to it
			"	lslcs  r1, #1								\n\t"	//with the bottom bit clear
			"	strcs  r1, [r0, #0x18]						\n\t"	//set pc as is expected
			"	bxcs   lr									\n\t"
			//else fallthrough and translate it

			"try_jump_to_arm_addr:							\n\t"	//common to this and INVSTATE path, expects exc frame in r0, and target pc in r2
			//now we need to lookup in the hash
			HASH_ADDRESS(r3, r2)							//hash the addr
				
			"	ldr    r1, =TLS								\n\t"	//get pointer to hash array (if any)
			"	ldr    r1, [r1, %0]							\n\t"
			"	cbz    r1, hash_lookup_fail					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r3, LSL #2]					\n\t"	//get pointer to hash chain root ptr
			
			"hash_search_loop:								\n\t"	//lookup loop
			"	cbz    r1, hash_lookup_fail					\n\t"	//if NULL, translate
			"	ldr    r3, [r1, #0]							\n\t"	//grab the base addr
			"	cmp    r3, r2								\n\t"
			"	ittt   eq									\n\t"	//HIT?
			"	addeq  r1, %1								\n\t"	//point r1 to jump address
			"	streq  r1, [r0, #0x18]						\n\t"	//save it on stack where we can pop it
			"	bxeq   lr									\n\t"	//return directly to translated code
			"	ldr    r1, [r1, #4]							\n\t"	//load next link
			"	b      hash_search_loop						\n\t"	//and go on
			
			"hash_lookup_fail:								\n\t"	//need to translate - sort out a way to call the C code to do the translation. r2 has desired address, r0 has exc frame
			"	ldr    r1, =TLS								\n\t"
			"	str    r2, [r1, %4]							\n\t"	//store desired address
			"	ldr    r2, =jitUsermodeEntry				\n\t"	//get pointer to code we'll run in user mode in thread
			"	bic    r2, #1								\n\t"	//we need low bit clear!
			"	str    r2, [r0, #0x18]						\n\t"	//point PC there
			"	bx     lr									\n\t"	//and bail
			
			"not_blx_t_word_read:							\n\t"	//undefined instrs that are not BLX.T end up here. r2 definitely has first halfword, r3 has second. we get here if first starts with 0b11110
			"	lsrs   r1, r2, #4							\n\t"
			"	sub    r1, #0xf7f							\n\t"
			"	cbnz   r1, generic_handler					\n\t"	//we know of no other undefined long instr
			"	lsrs   r1, r3, #12							\n\t"
			"	cmp    r1, #0x0a							\n\t"
			"	bne    generic_handler						\n\t"	//we know of no other undefined long instr
			
			"is_udf_w:										\n\t"
			"	ldr    r1, [r0, #0x18]						\n\t"	//load PC
			"	ldrh   r1, [r1, #0x4]						\n\t"	//get the low halfword of addr
			"	bfi    r3, r2, #12, #4						\n\t"	//assemble high 16 bits of addr in r3
			"	add    r1, r1, r3, lsl #16					\n\t"	//assemble the entire address in r1
			"	ldr    pc, =jitPatchJump + 1				\n\t"	//patch it
			
			"not_blx_t_halfword_read:						\n\t"	//undefined instrs that are not BLX.T end up here. r2 definitely has first halfword and top bits are NOT 0b11110
			"	lsrs   r3, r2, #8							\n\t"	//maybe a UDF.N
			"	cmp    r3, 0xde								\n\t"
			"	bne    generic_handler						\n\t"
			
			"is_udf_n:										\n\t"
			"	uxtb   r2, r2								\n\t"	//get code
			"	cmp    r2, %3								\n\t"	//compare with UDF_VAL_DELETED_TU
			"	it     eq									\n\t"
			"	ldreq  pc, =emuJitHandleCallToDeletedTu + 1	\n\t"
			"	b      generic_handler						\n\t"
			
			"not_undefinstr:								\n\t"	//APSR.C still has UFSR.INVSTATE
			"	bcc    not_inv_state_either					\n\t"
			"	movs   r3, #2								\n\t"	//clear UFSR.INVSTATE
			"	strh   r3, [r1]								\n\t"
			"	ldr    r3, [r0, #0x1c]						\n\t"	//get stashed CPSR
			"	orr    r3, %2								\n\t"	//set "T" flag
			"	str    r3, [r0, #0x1c]						\n\t"	//set stashed CPSR
			"	ldr    r2, [r0, #0x18]						\n\t"	//load PC
			"	b      try_jump_to_arm_addr					\n\t"
			
			"not_inv_state_either:							\n\t"
			"	lsrs   r3, r2, #4							\n\t"	//UFSR.NOCP -> APSR.C
			"	it     cs									\n\t"
			"	ldrcs  pc, =schedHandleUsageFaultNoCp + 1	\n\t"	//if it is NOCP, call the scheduler to cope with a coprocessor access
			
			//fallthrough to here
			"generic_handler:								\n\t"
			"	ldr    pc, =faultHandlerWithExcFrame + 1	\n\t"	//else call error handler for all other usage faults
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(CORTEX_SR_FLAG_T), "I"(UDF_VAL_DELETED_TU), "I"(TLS_SLOT_JIT_ADDR * 4)
			:"cc", "memory"
		);
	}
	
	//Thread-mode landing pad used when a wanted address has no translation yet: UsageFault_Handler and the callouts
	//below store the address in the TLS_SLOT_JIT_ADDR slot and then direct execution here.
	//Stashes r0-r3, r12, lr and the APSR flags (r4 is pushed too as it carries the flags), calls
	//jitTranslateNewTu(NULL), converts the returned TU pointer into its code address with the thumb bit set,
	//restores lr, stores that address into the stack slot lr was pushed in and pops everything, thereby jumping
	//into the translated code with all registers and flags as they were.
	//Asm operand %0 = offsetof(struct TU, code).
	void __attribute__((naked, used)) jitUsermodeEntry(void)
	{
		asm volatile(
				//common code for userspace
			"	push   {r0-r4, r12, lr}				\n\t"	//stash regs
			"	mrs    r4, apsr						\n\t"
			"	movs   r0, #0						\n\t"	//bool* tcFlushedP
			"	bl     jitTranslateNewTu			\n\t"	//so go translate
			"	adds   r0, %0 + 1					\n\t"	//point r0 to jump address (with thumb bit)
			"	ldr    lr, [sp, #4 * 6]				\n\t"	//restore lr
			"	str    r0, [sp, #4 * 6]				\n\t"	//save it on stack where we can pop it
			"	msr    apsr_nzcvq, r4				\n\t"	//start popping things
			"	pop    {r0-r4, r12, pc}				\n\t"	//jump to TC

			".ltorg									\n\t"
			:
			:"I"(offsetof(struct TU, code))
			:"cc", "memory"
		);
	}
#endif



//these callouts are used by backend to optimize various return paths

#ifdef BUILD_FOR_THUMB_1

	//Thumb-1 callout: load a full register context and jump. r0 = ctx, r1 = status register value, r2 = destination.
	//Three entry points share this code:
	// * jitPrvPopCtxAndJumpCalloutInterwork: destinations with the low bit set are treated as thumb addresses and
	//   jumped to as-is, all others take the Noninterwork path
	// * jitPrvPopCtxAndJumpCalloutNoninterwork: the destination is looked up in the TU hash. On a hit we jump to the
	//   TU's code; on a miss the address is stored to the TLS_SLOT_JIT_ADDR slot, a complete r0-r6 + pc frame is
	//   built on the stack out of ctx, jitTranslateNewTu(NULL) is called and the frame is popped to enter the new TU
	// * jitPrvPopCtxAndJumpCalloutThumbOnly: restores flags from r1, r12 and lr from ctx, r0-r5 from the start of
	//   ctx (via ldmia) and jumps to r2
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = TLS offset of the JIT address
	//slot, %3/%4 = offsets of r12/lr in ctx, %5/%6 = offsets of regs[4]/regs[5] in ctx.
	void __attribute__((naked)) jitPrvPopCtxAndJumpCalloutInterwork(struct M0backendRegState *ctx, uint32_t sr, uint32_t pc)
	{
		asm volatile(
			".syntax unified												\n\t"
			"	lsrs   r3, r2, #1											\n\t"	//check for thumb
			"	bcs    1f													\n\t"
			
			".globl jitPrvPopCtxAndJumpCalloutNoninterwork					\n\t"
			"jitPrvPopCtxAndJumpCalloutNoninterwork:						\n\t"
			".type jitPrvPopCtxAndJumpCalloutNoninterwork, %%function		\n\t"
			
			"	push   {r0, r1}												\n\t"	//save our state
			/* now we need to lookup in the hash */
			HASH_ADDRESS(r3, r2)													// hash the addr
			
			"	ldr    r1, =TLS												\n\t"	// get pointer to hash array (if any)
			"	ldr    r1, [r1, %0]											\n\t"
			"	cmp    r1, #0												\n\t"	// and there could be nothing there - handle that!
			"	beq    6f													\n\t"
			"	lsls   r3, #2												\n\t"	// get pointer to hash chain root ptr
			"	ldr    r1, [r1, r3]											\n\t"
			
			"7:																\n\t"	// hash_search_loop: lookup loop
			"	cmp    r1, #0												\n\t"	// if NULL, translate
			"	beq    6f													\n\t"
			"	ldr    r3, [r1, #0]											\n\t"	// grab the base addr
			"	cmp    r3, r2												\n\t"
			"	bne    8f													\n\t"	// miss -> try next bucket
			"	adds   r1, %1 + 1											\n\t"	// point r1 to jump address plus thumb bit
			"	mov    r2, r1												\n\t"	// we like addresses in r2
			"	pop    {r0, r1}												\n\t"	// restore state
			"	b      1f													\n\t"	// and jump there
			"8:																\n\t"	// not_this_one: try next link in this bucket
			"	ldr    r1, [r1, #4]											\n\t"	// load next link
			"	b      7b													\n\t"	// and go on

			"6:																\n\t"	// hash_lookup_fail: need to translate 
			"	ldr    r0, =TLS												\n\t"
			"	str    r2, [r0, %2]											\n\t"	// store desired address
			"	pop    {r0, r1}												\n\t"	// restore r0 and r1
			"	ldr    r4, [r0, %5]											\n\t"	// get r4
			"	ldr    r5, [r0, %6]											\n\t"	// get r5
			"	push   {r4-r6, lr}											\n\t"	// push r4, r5, the current r6 (r6 is borrowed for lr below) and make space for what will become pc
			"	mov    r4, r1												\n\t"	// save cpsr in r4
			"	ldr    r5, [r0, %3]											\n\t"	// get r12
			"	ldr    r6, [r0, %4]											\n\t"	// get lr
			"	ldr    r3, [r0, #0x0c]										\n\t"	// get pushed r3
			"	ldr    r1, [r0, #0x08]										\n\t"	// get pushed r2
			"	push   {r1, r3}												\n\t"	// push 'em
			"	ldr    r3, [r0, #0x04]										\n\t"	// get pushed r1
			"	ldr    r1, [r0, #0x00]										\n\t"	// get pushed r0
			"	push   {r1, r3}												\n\t"	// push 'em
			"	movs   r0, #0												\n\t"	// param
			"	bl     jitTranslateNewTu									\n\t"	// go translate
			"	adds   r0, %1 + 1											\n\t"	// point r0 to jump address (with thumb bit)
			"	str    r0, [sp, #0x1c]										\n\t"	// save it in proper place
			"	msr    APSR_nzcvq, r4										\n\t"	// restore flags
			"	mov    r12, r5												\n\t"
			"	mov    lr, r6												\n\t"
			"	pop    {r0-r6, pc}											\n\t"	// go!
			
			"1:																\n\t"	// jump to proper address with context load
			".globl jitPrvPopCtxAndJumpCalloutThumbOnly						\n\t"
			"jitPrvPopCtxAndJumpCalloutThumbOnly:							\n\t"
			".type jitPrvPopCtxAndJumpCalloutThumbOnly, %%function			\n\t"
			
			"	msr    APSR_nzcvq, r1										\n\t"
			"	ldr    r1, [r0, %3]											\n\t"	//get r12
			"	ldr    r3, [r0, %4]											\n\t"	//get lr
			"	mov    r12, r1												\n\t"
			"	mov    lr, r3												\n\t"
			"	push   {r2}													\n\t"	//destination goes on the stack so it can be popped into pc
			"	ldmia  r0, {r0-r5}											\n\t"
			"	pop    {pc}													\n\t"
			"	.ltorg														\n\t"
			
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4),
				"I"(offsetof(struct M0backendRegState, r12)), "I"(offsetof(struct M0backendRegState, lr)),
				"I"(offsetof(struct M0backendRegState, regs[4])), "I"(offsetof(struct M0backendRegState, regs[5]))
			:
		);
	}
	
#else

	//Callout standing in for "pop {pc}" when the popped value is to be treated as an untranslated (ARM) address.
	//On entry the destination address is the word at the top of the stack.
	//The address is looked up in the TU hash. Hit: the stack word is replaced with the TU's code address (thumb bit
	//set) and popped into pc. Miss (or no hash table): the address is stored to the TLS_SLOT_JIT_ADDR slot, the
	//stack word is dropped and we continue in jitUsermodeEntry.
	//r0, r1, r3, r4 and the APSR flags are preserved across the callout.
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = TLS offset of the JIT address slot.
	void __attribute__((naked, section(".ramcode"))) jitPrvPopPcArmOnlyCallout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	push   {r0, r1, r3, r4}			\n\t"
			"	ldr    r4, [sp, #0x10]			\n\t"	//the address to go to (it sits just above what we pushed)
			"	mrs    r3, apsr					\n\t"	//flags are kept in r3 while we work
			
			HASH_ADDRESS(r0, r4)						//hash addr
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, r4					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump
			"	str    r1, [sp, #0x10]			\n\t"	//save it on stack where we'll pop it & return to translated code
			"	msr    apsr_nzcvq, r3			\n\t"
			"	pop    {r0,r1,r3,r4,pc}			\n\t"
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r3			\n\t"
			"	ldr    r0, =TLS					\n\t"
			"	str    r4, [r0, %2]				\n\t"	//store desired address
			"	pop    {r0,r1,r3,r4}			\n\t"
			"	add    sp, sp, #4				\n\t"	//we "popped" the address
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"
			".ltorg								\n\t"
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}
	
	//Callout standing in for "pop {pc}" when the popped value may be either kind of address.
	//On entry the destination address is the word at the top of the stack.
	//If its low bit is set it is popped into pc directly (thumb return). Otherwise it is looked up in the TU hash:
	//on a hit the stack word is replaced with the TU's code address (thumb bit set) and popped into pc; on a miss
	//(or no hash table) the address is stored to the TLS_SLOT_JIT_ADDR slot, the stack word is dropped and we
	//continue in jitUsermodeEntry.
	//r0, r1, r3, r4 and the APSR flags are preserved across the callout.
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = TLS offset of the JIT address slot.
	void __attribute__((naked, section(".ramcode"))) jitPrvPopPcCallout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	push   {r3, r4}					\n\t"
			"	ldr    r4, [sp, #8]				\n\t"	//the address to go to
			"	and    r3, r4, #1				\n\t"	//low bit set -> thumb address (flags are not touched by this)
			"	cbnz   r3, 9f					\n\t"
			"	mrs    r3, apsr					\n\t"	//flags are kept in r3 while we work
			"	push   {r0, r1}					\n\t"
			
			HASH_ADDRESS(r0, r4)						//hash addr
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, r4					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump
			"	str    r1, [sp, #0x10]			\n\t"	//save it on stack where we'll pop it & return to translated code
			"	msr    apsr_nzcvq, r3			\n\t"
			"	pop    {r0,r1,r3,r4,pc}			\n\t"
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r3			\n\t"
			"	ldr    r0, =TLS					\n\t"
			"	str    r4, [r0, %2]				\n\t"	//store desired address
			"	pop    {r0,r1,r3,r4}			\n\t"
			"	add    sp, sp, #4				\n\t"	//we "popped" the address
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"
			"									\n\t"
			"9:									\n\t"	//thumb return
			"	pop    {r3,r4,pc}				\n\t"
			".ltorg								\n\t"
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}
	
	//Variant of jitPrvPopPcCallout for a stack that, on entry, holds: [sp] = a free slot, [sp + 4] = the destination
	//address, [sp + 8] = a word that is to be discarded. The first four instructions rearrange this to look as if
	//{r3, r4, pc} had just been pushed, after which the logic is that of jitPrvPopPcCallout: thumb addresses (low
	//bit set) are popped into pc directly, others go through the TU hash, falling back to storing the address in
	//the TLS_SLOT_JIT_ADDR slot and continuing in jitUsermodeEntry.
	//Here r3 carries the address and r4 the saved APSR flags; r0, r1, r3, r4 and the flags are preserved.
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = TLS offset of the JIT address slot.
	void __attribute__((naked, section(".ramcode"))) jitPrvPopPcAndAdvanceSpCallout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	str    r3, [sp]					\n\t"	//stack space already exists
			"	ldr    r3, [sp, #4]				\n\t"	//get the pc
			"	str    r3, [sp, #8]				\n\t"	//store it in place of trash
			"	str    r4, [sp, #4]				\n\t"	//store r4, so that it looks like we just pushed {r3, r4, pc}
			"	and    r4, r3, #1				\n\t"	//low bit set -> thumb address (flags are not touched by this)
			"	cbnz   r4, 9f					\n\t"
			"	mrs    r4, apsr					\n\t"	//flags are kept in r4 while we work
			"	push   {r0, r1}					\n\t"
			
			HASH_ADDRESS(r0, r3)						//hash addr
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, r3					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump
			"	str    r1, [sp, #0x10]			\n\t"	//save it on stack where we'll pop it & return to translated code
			"	msr    apsr_nzcvq, r4			\n\t"
			"	pop    {r0,r1,r3,r4,pc}			\n\t"
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r4			\n\t"
			"	ldr    r0, =TLS					\n\t"
			"	str    r3, [r0, %2]				\n\t"	//store desired address
			"	pop    {r0,r1,r3,r4}			\n\t"
			"	add    sp, sp, #4				\n\t"	//we "popped" the address
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"
			"									\n\t"
			"9:									\n\t"	//thumb return
			"	pop    {r3,r4,pc}				\n\t"
			".ltorg								\n\t"
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}
	
	//Callout standing in for "bx lr" when lr is to be treated as an untranslated (ARM) address.
	//lr is looked up in the TU hash. Hit: we jump to the TU's code (thumb bit set). Miss (or no hash table): lr is
	//stored to the TLS_SLOT_JIT_ADDR slot and we continue in jitUsermodeEntry.
	//r0, r1, r3, r4 and the APSR flags are preserved; r4 is never modified here - on the hit path the stack slot
	//it was pushed into is reused to hold the jump address that gets popped into pc.
	//Asm operands: %0 = TLS offset of JIT state, %1 = offsetof(struct TU, code), %2 = TLS offset of the JIT address slot.
	void __attribute__((naked, section(".ramcode"))) jitPrvBxLrArmOnlyCallout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	push   {r0, r1, r3, r4}			\n\t"
			"	mrs    r3, apsr					\n\t"	//flags are kept in r3 while we work
			
			HASH_ADDRESS(r0, lr)						//hash addr
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, lr					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump
			"	str    r1, [sp, #0x0c]			\n\t"	//save it on stack where we'll pop it & return to translated code
			"	msr    apsr_nzcvq, r3			\n\t"
			"	pop    {r0,r1,r3,pc}			\n\t"
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r3			\n\t"
			"	ldr    r0, =TLS					\n\t"
			"	str    lr, [r0, %2]				\n\t"	//store desired address
			"	pop    {r0,r1,r3,r4}			\n\t"
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"
			".ltorg								\n\t"
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}
	
	// Dispatch trampoline: branch to the address held in LR, going through the translation
	// cache when needed.
	//
	// Flow:
	//  - If bit 0 of LR is set, the saved registers are restored and a plain "bx lr" is
	//    executed (presumably the destination is native Thumb code that needs no
	//    translation - confirm against the emitter).
	//  - Otherwise LR is hashed (HASH_ADDRESS is defined elsewhere in this file) and looked
	//    up in the hash array whose pointer lives in TLS slot TLS_SLOT_JIT_STATE. Each chain
	//    node has the address it matches at offset 0 and the next link at offset 4.
	//      Hit:  jump to the node's `code` member (struct TU).
	//      Miss (or no hash array at all): store LR into TLS slot TLS_SLOT_JIT_ADDR and
	//      tail-jump to jitUsermodeEntry.
	//
	// Naked function: there is no compiler prologue/epilogue, so every register and stack
	// slot is managed by hand below. All registers and NZCVQ flags are restored on every exit.
	void __attribute__((naked, section(".ramcode"))) jitPrvBxLrCallout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	push   {r3, r4}					\n\t"	//r3 is our scratch; the r4 slot doubles as the slot we later pop into pc
			"	and    r3, lr, #1				\n\t"	//isolate bit 0 of the target (non-flag-setting form, so caller's flags are untouched)
			"	cbz    r3, 9f					\n\t"	//bit 0 clear -> do the lookup
			"	pop    {r3, r4}					\n\t"	//bit 0 set -> undo our push and branch there directly
			"	bx     lr						\n\t"
			"9:									\n\t"	//lookup path
			"	mrs    r3, apsr					\n\t"	//keep caller's flags in r3 for the duration of the lookup
			"	push   {r0, r1}					\n\t"	//frame is now: r0@0x00, r1@0x04, r3@0x08, r4@0x0c
			
			HASH_ADDRESS(r0, lr)						//hash addr (macro defined elsewhere; result is used in r0 as the array index)
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"	//%0 = TLS_SLOT_JIT_STATE * 4
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr (array of word-sized entries indexed by hash)
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, lr					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump (%1 = offsetof(struct TU, code))
			"	str    r1, [sp, #0x0c]			\n\t"	//save it on stack where we'll pop it & return to translated code (overwrites the saved-r4 slot)
			"	msr    apsr_nzcvq, r3			\n\t"	//restore caller's flags (cmp/adds above changed them)
			"	pop    {r0,r1,r3,pc}			\n\t"	//restore r0, r1, r3 and jump; r4 is not reloaded - no instruction visible here writes it
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r3			\n\t"	//restore caller's flags before we give up r3
			"	ldr    r0, =TLS					\n\t"
			"	str    lr, [r0, %2]				\n\t"	//store desired address (%2 = TLS_SLOT_JIT_ADDR * 4)
			"	pop    {r0,r1,r3,r4}			\n\t"	//restore everything we pushed
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"	//tail-jump; the "+ 1" sets bit 0 of the loaded address
			".ltorg								\n\t"	//emit the literal pool for the "ldr rX, =..." pseudo-instructions above
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}
	
	// Dispatch trampoline: branch to the address held in R12, going through the translation
	// cache when needed. Same structure as jitPrvBxLrCallout, with r12 as the target register.
	//
	// Flow:
	//  - If bit 0 of R12 is set, the saved registers are restored and a plain "bx r12" is
	//    executed (presumably the destination is native Thumb code that needs no
	//    translation - confirm against the emitter).
	//  - Otherwise R12 is hashed (HASH_ADDRESS is defined elsewhere in this file) and looked
	//    up in the hash array whose pointer lives in TLS slot TLS_SLOT_JIT_STATE. Each chain
	//    node has the address it matches at offset 0 and the next link at offset 4.
	//      Hit:  jump to the node's `code` member (struct TU).
	//      Miss (or no hash array at all): store R12 into TLS slot TLS_SLOT_JIT_ADDR and
	//      tail-jump to jitUsermodeEntry.
	//
	// Naked function: there is no compiler prologue/epilogue, so every register and stack
	// slot is managed by hand below. All registers and NZCVQ flags are restored on every exit.
	void __attribute__((naked, section(".ramcode"))) jitPrvBxR12Callout(void)
	{
		asm volatile(
			".syntax unified					\n\t"
			"	push   {r3, r4}					\n\t"	//r3 is our scratch; the r4 slot doubles as the slot we later pop into pc
			"	and    r3, r12, #1				\n\t"	//isolate bit 0 of the target (non-flag-setting form, so caller's flags are untouched)
			"	cbz    r3, 9f					\n\t"	//bit 0 clear -> do the lookup
			"	pop    {r3, r4}					\n\t"	//bit 0 set -> undo our push and branch there directly
			"	bx     r12						\n\t"
			"9:									\n\t"	//lookup path
			"	mrs    r3, apsr					\n\t"	//keep caller's flags in r3 for the duration of the lookup
			"	push   {r0, r1}					\n\t"	//frame is now: r0@0x00, r1@0x04, r3@0x08, r4@0x0c
			
			HASH_ADDRESS(r0, r12)						//hash addr (macro defined elsewhere; result is used in r0 as the array index)
	
			"	ldr    r1, =TLS					\n\t"	//get pointer to hash array
			"	ldr    r1, [r1, %0]				\n\t"	//%0 = TLS_SLOT_JIT_STATE * 4
			"	cbz    r1, 3f					\n\t"	//and there could be nothing there - handle that!
			"	ldr    r1, [r1, r0, LSL #2]		\n\t"	//get pointer to hash chain root ptr (array of word-sized entries indexed by hash)
			"									\n\t"
			"1:									\n\t"	//lookup loop
			"	cbz    r1, 3f					\n\t"	//if NULL, we need to xlate
			"	ldr    r0, [r1, #0]				\n\t"	//grab the base addr
			"	cmp    r0, r12					\n\t"
			"	beq    2f						\n\t"	//HIT
			"	ldr    r1, [r1, #4]				\n\t"	//load next link
			"	b      1b						\n\t"	//and go on
			"									\n\t"
			"2:									\n\t"	//we get here with the matching TU pointer in r1
			"	adds   r1, %1 + 1				\n\t"	//point r1 to jump address and the low bit for jump (%1 = offsetof(struct TU, code))
			"	str    r1, [sp, #0x0c]			\n\t"	//save it on stack where we'll pop it & return to translated code (overwrites the saved-r4 slot)
			"	msr    apsr_nzcvq, r3			\n\t"	//restore caller's flags (cmp/adds above changed them)
			"	pop    {r0,r1,r3,pc}			\n\t"	//restore r0, r1, r3 and jump; r4 is not reloaded - no instruction visible here writes it
			"									\n\t"
			"3:									\n\t"	//call direct into xlation func
			"	msr    apsr_nzcvq, r3			\n\t"	//restore caller's flags before we give up r3
			"	ldr    r0, =TLS					\n\t"
			"	str    r12, [r0, %2]			\n\t"	//store desired address (%2 = TLS_SLOT_JIT_ADDR * 4)
			"	pop    {r0,r1,r3,r4}			\n\t"	//restore everything we pushed
			"	ldr    pc, =jitUsermodeEntry + 1\n\t"	//tail-jump; the "+ 1" sets bit 0 of the loaded address
			".ltorg								\n\t"	//emit the literal pool for the "ldr rX, =..." pseudo-instructions above
			:
			:"I"(TLS_SLOT_JIT_STATE * 4), "I"(offsetof(struct TU, code)), "I"(TLS_SLOT_JIT_ADDR * 4)
			:
		);
	}

#endif


