#include "emuJitBackend_m0.h"
#include "emuJitInternal.h"
#include "memmap.h"
#include "kernel.h"
#include "printf.h"

/* future opt ideas



J Xltd ARM (1 instrs):
J       0x70f07688: e8930060  LDMIA    R3, {R5, R6}
J to thumb (4 halfwords): {
J       0x7000cb20: 68e8      LDR      R0, [R5, #0x0c]
J       0x7000cb22: c806      LDMIA.N  R0!, {R1, R2}
J       0x7000cb24: 6169      STR      R1, [R5, #0x14]
J       0x7000cb26: 4616      MOVhi.N  R6, R2


J Xltd ARM (1 instrs):
J       0x70f076b0: e1866004  ORR      R6, R6, R4
J to thumb (4 halfwords): {
J       0x7000cbcc: 6928      LDR      R0, [R5, #0x10]
J       0x7000cbce: 4631      MOVhi.N  R1, R6
J       0x7000cbd0: 4301      ORR.N    R1, R0
J       0x7000cbd2: 460e      MOVhi.N  R6, R1




J Xltd ARM (1 instrs):
J       0x70f076c0: e1833606  ORR      R3, R3, R6, LSL #12
J to thumb (5 halfwords): {
J       0x7000cbe8: 4630      MOVhi.N  R0, R6
J       0x7000cbea: 0300      LSL.N    R0, R0, #12
J       0x7000cbec: 68e9      LDR      R1, [R5, #0x0c]
J       0x7000cbee: 4301      ORR.N    R1, R0
J       0x7000cbf0: 60e9      STR      R1, [R5, #0x0c]



J Xltd ARM (1 instrs):
J       0x70f076dc: e0073003  AND      R3, R7, R3
J to thumb (4 halfwords): {
J       0x7000cc20: 68e8      LDR      R0, [R5, #0x0c]
J       0x7000cc22: 4639      MOVhi.N  R1, R7
J       0x7000cc24: 4001      AND.N    R1, R0
J       0x7000cc26: 60e9      STR      R1, [R5, #0x0c]
J }


J Xltd ARM (1 instrs):
J       0x70dd0810: e1a03427  MOV      R3, R7, LSR #8
J to thumb (3 halfwords): {
J       0x70009848: 4638      MOVhi.N  R0, R7
J       0x7000984a: 0a00      LSR.N    R0, R0, #8
J       0x7000984c: 60e8      STR      R0, [R5, #0x0c]



*/


#define NUM_HALFWORDS_PROLOGUE			3	//size (in halfwords) of the per-TU prologue emitted by jitEmitTuPrologue()

//hardware registers reserved by this backend
#define REG_NO_CONTEXT					5	//we're kind of hardwired for this being the highest indirect reg
#define REG_NO_SR						4	//holds the emulated status register (set from the entry CPSR in the prologue)
#define REG_NO_DST_EPILOGUE_REG			2	//jump destination is handed to the epilogues in this register

#define REG_NO_TMP_1					0	//these can be changed, but REG_NO_TMP_3 == REG_NO_DST_EPILOGUE_REG helps the exit code a little
#define REG_NO_TMP_2					1	//their order should also match regs order (else ldm/stm/ldrd/strd will break)
#define REG_NO_TMP_3					2
#define REG_NO_TMP_4					3
#define REG_NO_TMP_HI					12	//hireg tmp


//status register flag bits (N, Z, C, V, Q occupy bits 31..27)
#define ARM_SR_BIT_N					0x80000000UL
#define ARM_SR_BIT_Z					0x40000000UL
#define ARM_SR_BIT_C					0x20000000UL
#define ARM_SR_BIT_V					0x10000000UL
#define ARM_SR_BIT_Q					0x08000000UL
#define ARM_SR_BITS_APP					(ARM_SR_BIT_N | ARM_SR_BIT_Z | ARM_SR_BIT_C | ARM_SR_BIT_V | ARM_SR_BIT_Q)	//all five flag bits together

#define ARM_V5_SR_USER_MODE_BITS		0x00000010UL	//NOTE(review): presumably the SR mode-field value for user mode - confirm against users



//lookup table: mPopcntTab[b] is the number of set bits in the 8-bit value b (256 entries, 32 per row)
const uint8_t mPopcntTab[] = {
		0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, 
	};


static enum EmitStatus jitPrvConditionalBranchToThumb(struct EmitBuf *dest, uintptr_t to, enum EmitCc cc);


//rotate a 32-bit value right by "by" bits. The rotate count is taken modulo 32, so
//counts of 32 or more are well defined (shifting a 32-bit value by >= 32 is undefined in C)
static uint32_t __attribute__((const)) jitPrvRor(uint32_t val, uint32_t by)
{
	by &= 31;

	//a zero count must be special-cased: "val << 32" would be undefined
	if (by)
		val = (val >> by) | (val << (32 - by));

	return val;
}

//returns true when the given virtual register number is NOT one of the hardware registers
//this backend reserves for itself (context pointer, SR, the four lo temporaries, the hi
//temporary, LR and PC)
static bool __attribute__((const)) jitPrvIsVregArealReg(uint_fast8_t regNo)
{
	const uint32_t reservedMask =
		(1 << REG_NO_CONTEXT) |
		(1 << REG_NO_SR) |
		(1 << REG_NO_TMP_1) |
		(1 << REG_NO_TMP_2) |
		(1 << REG_NO_TMP_3) |
		(1 << REG_NO_TMP_4) |
		(1 << REG_NO_TMP_HI) |
		(1 << EMIT_REG_NO_LR) |
		(1 << EMIT_REG_NO_PC);
	const uint32_t thisRegBit = 1 << regNo;

	return (reservedMask & thisRegBit) == 0;
}

//to hwReg
//Emits code that places the constant "val" into hardware lo register "rdNo".
// dest         - buffer to emit into
// rdNo         - destination register; must satisfy EMIT_IS_LOREG() or EmitErrNotEncodeable is returned
// val          - the 32-bit constant to produce
// canCorruptNZ - caller allows the emitted code to change N and Z
// canCorruptC  - caller allows the emitted code to change C
//Short mov/mvn/shift/add sequences are tried first (only as far as the flag permissions
//allow); when none fits, jitPrvLiteralLoad() is used and its status is returned.
enum EmitStatus jitPrvEmitLoadImmToLoreg(struct EmitBuf *dest, uint32_t rdNo, uint32_t val, bool canCorruptNZ, bool canCorruptC)
{
	if (!EMIT_IS_LOREG(rdNo))
		return EmitErrNotEncodeable;
	
	if (canCorruptNZ && !(val >> 8)) {
		
		//we can use a MOV.N only if we can clobber N & Z
		EMIT(LLmovImm, rdNo, val, 0, EmitFlagsDoNotCare, false);
		return EmitErrNone;
	}
	
	if (canCorruptNZ && !((~val) >> 8)) {
		
		//we can use a MOV.N and a MVN.N if we can clobber N & Z
		EMIT(LLmovImm, rdNo, ~val, 0, EmitFlagsDoNotCare, false);
		EMIT(LLmvnReg, rdNo, rdNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		return EmitErrNone;
	}
	
	if (canCorruptNZ && canCorruptC) {
		
		//neither val nor ~val fits in 8 bits here (handled above), so both are nonzero
		//and the clz/ctz builtins below are well defined
		const uint32_t inv = ~val;
		const uint32_t lz = __builtin_clz(val), tz = __builtin_ctz(val);
		const uint32_t invLz = __builtin_clz(inv), invTz = __builtin_ctz(inv);
		uint32_t topShift, lowPart;
		
		if (val <= 0x1fe) {		//0xff plus an 8-bit addend
			
			EMIT(LLmovImm, rdNo, 0xff, 0, EmitFlagsDoNotCare, false);
			EMIT(LLaddImm, rdNo, rdNo, val - 0xff, EmitFlagsDoNotCare, false);
			return EmitErrNone;
		}
		
		if (lz + tz >= 24) {	//move and shift left
			
			//we can move and shift
			EMIT(LLmovImm, rdNo, val >> tz, 0, EmitFlagsDoNotCare, false);
			EMIT(LLmov, rdNo, rdNo, EmitShiftLsl, tz, EmitFlagsDoNotCare, false);
			return EmitErrNone;
		}
		
		if (invLz + invTz >= 24) {	//move, shift left, mvn
			
			EMIT(LLmovImm, rdNo, inv >> invTz, 0, EmitFlagsDoNotCare, false);
			EMIT(LLmov, rdNo, rdNo, EmitShiftLsl, invTz, EmitFlagsDoNotCare, false);
			EMIT(LLmvnReg, rdNo, rdNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			
			//not directly tested
			return EmitErrNone;
		}
		
		//by the time we get here we KNOW that the value is not representable as u8 shifted left by any amount! We assume that in the logic below
		
		//move, shift, add? (top 8 significant bits, then an 8-bit addend for the low part)
		//the mask is built in unsigned arithmetic: "0xff << 24" overflows a signed int
		topShift = 24 - lz;
		lowPart = val &~ ((uint32_t)0xff << topShift);
		if (lowPart < 0x100) {
			
			EMIT(LLmovImm, rdNo, val >> topShift, 0, EmitFlagsDoNotCare, false);
			EMIT(LLmov, rdNo, rdNo, EmitShiftLsl, topShift, EmitFlagsDoNotCare, false);
			EMIT(LLaddImm, rdNo, rdNo, lowPart, EmitFlagsDoNotCare, false);
			
			//not directly tested
			return EmitErrNone;
		}
	}

	//else we'll use a literal load
	return jitPrvLiteralLoad(dest, rdNo, val);
}

//SHALL NEVER clobber real hw flags!
//Writes hardware register "hwReg" into virtual register "vReg".
//Virtual registers up to REG_NO_CONTEXT, r12 and LR are kept in struct M0backendRegState
//(addressed through REG_NO_CONTEXT) and get a word store; every other register is written
//with a flag-preserving register move. PC is never a valid target, and SP is only accepted
//when "spAllowed" is set.
static enum EmitStatus jitPrvStoreVreg(struct EmitBuf *dest, uint32_t vReg, uint32_t hwReg, bool spAllowed)
{
	uint32_t slotOfst;
	
	if (vReg <= REG_NO_CONTEXT) {
		
		slotOfst = offsetof(struct M0backendRegState, regs[vReg]);
	}
	else if (vReg == 12) {
		
		slotOfst = offsetof(struct M0backendRegState, r12);
	}
	else if (vReg == EMIT_REG_NO_PC || (vReg == EMIT_REG_NO_SP && !spAllowed)) {
		
		return EmitErrNotEncodeable;
	}
	else if (vReg == EMIT_REG_NO_LR) {
		
		slotOfst = offsetof(struct M0backendRegState, lr);
	}
	else {
		
		//register is held directly in hardware: mov vReg, hwReg
		EMIT(LLmov, vReg, hwReg, EmitShiftLsl, 0, EmitLeaveFlags, false);
		return EmitErrNone;
	}
	
	//register is held in the context structure: str hwReg, [Rctx, #slot]
	EMIT(LLstoreImm, hwReg, REG_NO_CONTEXT, slotOfst, EmitSzWord, EmitAdrModeIndex);

	return EmitErrNone;
}

//Reads virtual register "vReg" into hardware register "hwReg".
//Virtual registers up to REG_NO_CONTEXT, r12 and LR are fetched from struct M0backendRegState
//(addressed through REG_NO_CONTEXT); every other register is copied with a flag-preserving
//register move. PC is not supported here and yields EmitErrNotEncodeable.
static enum EmitStatus jitPrvLoadVregNotPc(struct EmitBuf *dest, uint32_t hwReg, uint32_t vReg)
{
	uint32_t slotOfst;
	
	if (vReg == EMIT_REG_NO_PC && vReg > REG_NO_CONTEXT && vReg != 12 && vReg != EMIT_REG_NO_LR)
		return EmitErrNotEncodeable;
	
	if (vReg <= REG_NO_CONTEXT) {
		
		slotOfst = offsetof(struct M0backendRegState, regs[vReg]);
	}
	else if (vReg == 12) {
		
		slotOfst = offsetof(struct M0backendRegState, r12);
	}
	else if (vReg == EMIT_REG_NO_LR) {
		
		slotOfst = offsetof(struct M0backendRegState, lr);
	}
	else {
		
		//register is held directly in hardware: mov hwReg, vReg
		EMIT(LLmov, hwReg, vReg, EmitShiftLsl, 0, EmitLeaveFlags, false);
		return EmitErrNone;
	}

	//register is held in the context structure: ldr hwReg, [Rctx, #slot]
	EMIT(LLloadImm, hwReg, REG_NO_CONTEXT, slotOfst, EmitSzWord, false, EmitAdrModeIndex);
	
	return EmitErrNone;
}

//Reads virtual register "vReg" into hardware register "hwReg". Unlike jitPrvLoadVregNotPc()
//this also accepts PC, which is materialized as the constant instrAddr + 8 (the load of that
//constant is allowed to change N, Z and C).
static enum EmitStatus jitPrvLoadVreg(struct EmitBuf *dest, uint32_t hwReg, uint32_t vReg, uint32_t instrAddr)
{
	if (vReg != EMIT_REG_NO_PC)
		return jitPrvLoadVregNotPc(dest, hwReg, vReg);
	
	return jitPrvEmitLoadImmToLoreg(dest, hwReg, instrAddr + 8, true, true);
}

//do not accidntally use this! use jitPrvEmitLoadImmToLoreg() this is for jit frontend
enum EmitStatus jitEmitLoadImmToReg(struct EmitBuf *dest, uint32_t rdNo, uint32_t val, bool canCorruptNZ, bool canCorruptC, bool isInIt)
{
	enum EmitStatus now;
	
	now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, val, true, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_1, false);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Fills "halfwords" halfwords of padding. Fewer than three are filled with NOPs; for three or
//more a single branch to the halfword that is "halfwords" past dest->buf is emitted instead.
enum EmitStatus jitEmitNopFillLen(struct EmitBuf *dest, uint32_t halfwords)
{
	if (halfwords < 3) {
		
		for (; halfwords; halfwords--) {
			
			EMIT(LLnop, false);
		}
		
		return EmitErrNone;
	}
	
	//jump forward is faster
	EMIT(LLbranch, (uintptr_t)(((uint16_t*)dest->buf) + halfwords), EmitCcAl);
	
	return EmitErrNone;
}

//Pads everything between dest->buf and dest->bufEnd (measured in halfwords)
enum EmitStatus jitEmitNopFill(struct EmitBuf *dest)
{
	uint16_t *padStart = (uint16_t*)dest->buf;
	uint16_t *padEnd = (uint16_t*)dest->bufEnd;
	
	return jitEmitNopFillLen(dest, padEnd - padStart);
}

//Selects the register-state storage a runtime-data block will use. By default that is the
//storage embedded in the block itself; when CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE is defined
//and the caller asks for the UI thread's preallocated state, a single static instance is
//used instead.
void jitStateBackendInit(struct JitBackendRuntimeData *rtd, bool isUiThreadPreallocatedState)
{
	#ifdef CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE
		
		static struct M0backendRegState uiThreadState;	//in real SRAM
	#endif
	
	//default: the storage embedded in the runtime data
	rtd->regStatePtr = &rtd->regStateStorage;
	
	#ifdef CPU_HARDWIRED_UI_THREAD_JIT_TC_BASE
		
		if (isUiThreadPreallocatedState)
			rtd->regStatePtr = &uiThreadState;
	#endif
}

static enum EmitStatus jitPrvEmitActualPrologueAndEpilogue(struct JitBackendRuntimeData *rtd)
{
	struct EmitBuf destBuf, *dest = &destBuf;
	uintptr_t pcGetAddr, ptrAddr;
	enum EmitStatus now;
	uint32_t i;
	
	if (rtd->prologue[0])		//nonzeor means we've done it already
		return EmitErrNone;
	
	//emit the prologue, it is entered with arm pc pushed, see jitEmitTuPrologue(). We must use BL for the reach
	
	emitBufferInit(dest, rtd->prologue, sizeof(rtd->prologue));

	//push {Rctx, R7}	(get some working space)
	EMIT(HLpush, 0x0080 | (1 << REG_NO_CONTEXT));
	
	//MRS r7, CPSR
	EMIT(LLmrs, 7, EMIT_SYSM_APSR);
	
	pcGetAddr = (((uintptr_t)emitGetPtrToJumpHere(dest)) + 4) / 4 * 4;	//where read PC would be
	ptrAddr = (uintptr_t)&rtd->regStatePtr;
	if (ptrAddr < pcGetAddr || (ptrAddr - pcGetAddr) % 4)
		fatal("state rtd misfigured: %08xh references %08xh\n", pcGetAddr , ptrAddr);
	
	//ldr Rctx, [PC (actual PC inside this buffer, yes), #properOffset]  //to load rtd->regStatePtr
	EMIT(LLloadImm, REG_NO_CONTEXT, EMIT_REG_NO_PC, ptrAddr - pcGetAddr, EmitSzWord, false, EmitAdrModeIndex);

	//stmia Rctx!, {r0-R{ctx - 1}}	
	EMIT(LLstmia, REG_NO_CONTEXT, (1 << REG_NO_CONTEXT) - 1, true);

	//pop {r0,r1,r3}	//Rctx, r7, LR
	EMIT(HLpop, 0x000b);
	
	//mov r2, r12
	EMIT(LLmov, 2, 12, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//stmia Rctx!, {r0,r2,r3}		//store Rctx, R12, LR
	EMIT(LLstmia, REG_NO_CONTEXT, 0x000d, true);
	
	//mov Rsr, r7	(Rsr = what CPSR was on entry)
	EMIT(LLmov, REG_NO_SR, 7, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	
	//mov R7, r1	(restore r7 to what it was)
	EMIT(LLmov, 7, 1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	
	//sub Rctx, sizeof(struct M0backendRegState)
	EMIT(LLsubImm, REG_NO_CONTEXT, REG_NO_CONTEXT, sizeof(struct M0backendRegState), EmitFlagsDoNotCare, false);

	//bx lr (prologue is done)
	EMIT(LLbx, EMIT_REG_NO_LR);

	//emit the epilogues

	emitBufferInit(dest, rtd->epiloguePopPc, sizeof(rtd->epiloguePopPc));
	EMIT(HLldmia, EMIT_REG_NO_SP, 1 << REG_NO_DST_EPILOGUE_REG, true);
	EMIT(LLbranch, (uintptr_t)rtd->epilogueInterworking, EmitCcAl);
	
	emitBufferInit(dest, rtd->epilogueBxLr, sizeof(rtd->epilogueBxLr));///aaa
	now = jitPrvLoadVreg(dest, REG_NO_DST_EPILOGUE_REG, EMIT_REG_NO_LR, 0);
	if (now != EmitErrNone)
		return now;
	EMIT(LLnop, false);
	
	emitBufferInit(dest, rtd->epilogueInterworking, sizeof(rtd->epilogueInterworking));

	//mov r0, Rctx
	EMIT(LLmov, 0, REG_NO_CONTEXT, EmitShiftLsl, 0, EmitLeaveFlags, false);

	//mov r1, Rsr
	EMIT(LLmov, 1, REG_NO_SR, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//ldr r3, =callout
	EMIT(LLloadImm, 3, EMIT_REG_NO_PC, 0, EmitSzWord, false, EmitAdrModeIndex);
	
	//bx r3
	EMIT(LLbx, 3);

	//.word callout
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutInterwork) & 0xffff);
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutInterwork) >> 16);


	emitBufferInit(dest, rtd->epilogueNoninterworking, sizeof(rtd->epilogueNoninterworking));

	//mov r0, Rctx
	EMIT(LLmov, 0, REG_NO_CONTEXT, EmitShiftLsl, 0, EmitLeaveFlags, false);

	//mov r1, Rsr
	EMIT(LLmov, 1, REG_NO_SR, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//ldr r3, =callout
	EMIT(LLloadImm, 3, EMIT_REG_NO_PC, 0, EmitSzWord, false, EmitAdrModeIndex);
	
	//bx r3
	EMIT(LLbx, 3);

	//.word callout
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutNoninterwork) & 0xffff);
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutNoninterwork) >> 16);


	emitBufferInit(dest, rtd->epilogueThumbOnly, sizeof(rtd->epilogueThumbOnly));

	//mov r0, Rctx
	EMIT(LLmov, 0, REG_NO_CONTEXT, EmitShiftLsl, 0, EmitLeaveFlags, false);

	//mov r1, Rsr
	EMIT(LLmov, 1, REG_NO_SR, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//ldr r3, =callout
	EMIT(LLloadImm, 3, EMIT_REG_NO_PC, 0, EmitSzWord, false, EmitAdrModeIndex);
	
	//bx r3
	EMIT(LLbx, 3);

	//.word callout
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutThumbOnly) & 0xffff);
	EMIT(LLrawHalfword, ((uintptr_t)&jitPrvPopCtxAndJumpCalloutThumbOnly) >> 16);


	return EmitErrNone;
}

enum EmitStatus jitEmitTuPrologue(struct EmitBuf *dest, uint32_t sourceStartAddr)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	enum EmitStatus now;
	
	now = jitPrvEmitActualPrologueAndEpilogue(rtd);
	if (now != EmitErrNone)
		return now;
	
	//per-tu prologue is 3 halfwords
	
	//push {LR}
	EMIT(HLpush, 1 << EMIT_REG_NO_LR);
	
	//bl rtd->prologue
	EMIT(LLbl, (uintptr_t)rtd->prologue);
	
	return EmitErrNone;
}

//Reports where another translation unit should jump to in order to enter this one:
//the first halfword after the NUM_HALFWORDS_PROLOGUE-halfword prologue. Always succeeds.
enum EmitStatus jitEmitWhereToJumpFromAnotherTu(uint16_t *startOfCodeTu, uint16_t **jumpToP)
{
	uint16_t *firstAfterPrologue = &startOfCodeTu[NUM_HALFWORDS_PROLOGUE];
	
	*jumpToP = firstAfterPrologue;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitJumpToAbsThumbAddrNotInTu(struct EmitBuf *dest, uintptr_t dstAddr)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	enum EmitStatus now;
	
	//ldr RepilogueDst, =dstAddr_with_interworking
	now = jitPrvLiteralLoad(dest, REG_NO_DST_EPILOGUE_REG, dstAddr | 1);
	if (now != EmitErrNone)
		return now;
	
	//bl rtd->epilogueThumbOnly
	EMIT(LLbl, (uintptr_t)rtd->epilogueThumbOnly);
	
	return EmitErrNone;
}

//Emits code that adds the signed constant "imm" to hardware register "rdNo".
// dest   - buffer to emit into
// rdNo   - register to adjust in place
// regTmp - scratch register; only used when the constant is first loaded into a register
// imm    - signed amount to add (nothing is emitted for 0 when rdNo is a lo register)
//Small adjustments to lo registers become up to three ADD/SUB-immediate instructions of at
//most 0xFC each; anything else loads the constant into regTmp and adds that.
static enum EmitStatus jitPrvAddImmToReg(struct EmitBuf *dest, uint32_t rdNo, uint32_t regTmp, int32_t imm)
{
	//magnitude is computed in unsigned arithmetic so that INT32_MIN does not overflow
	uint32_t immAbs = (imm < 0) ? 0 - (uint32_t)imm : (uint32_t)imm;
	enum EmitStatus now;
	
	if (immAbs <= 3 * 0xFC && rdNo < 8) {		//up to 3 ADD/SUB instrs make more sense than literal load
		
		while (immAbs) {
			
			uint32_t step = immAbs > 0xFC ? 0xFC : immAbs;
		
			if (imm < 0) {
				
				EMIT(LLsubImm, rdNo, rdNo, step, EmitFlagsDoNotCare, false);
			}
			else {
				
				EMIT(LLaddImm, rdNo, rdNo, step, EmitFlagsDoNotCare, false);
			}
			
			immAbs -= step;
		}
		
		return EmitErrNone;
	}
	
	//literal load makes more sense
	now = jitPrvEmitLoadImmToLoreg(dest, regTmp, imm, true, true);
	if (now != EmitErrNone)
		return now;
	
	EMIT(LLaddReg, rdNo, rdNo, regTmp, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

	return EmitErrNone;
}

//Applies an ARM-style shift to hardware register "rdNo" in place.
// tmpReg    - scratch register, used only for ROR and RRX
// shiftType - kind of shift; EmitShiftRor with shiftAmt == 0 means RRX
// shiftAmt  - shift amount; "LSL #0" emits nothing
// flagPrefs - flag-setting preference passed to the instruction that performs the shift
static enum EmitStatus jitPrvShiftReg(struct EmitBuf *dest, uint32_t rdNo, uint32_t tmpReg, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitFlagSettingPrefs flagPrefs)
{
	//LSL #0 is "no shift"
	if (shiftType == EmitShiftLsl && !shiftAmt)
		return EmitErrNone;

	//everything except rotates is a single shift-by-immediate
	if (shiftType != EmitShiftRor) {
		
		EMIT(LLmov, rdNo, rdNo, shiftType, shiftAmt, flagPrefs, false);
		return EmitErrNone;
	}
	
	//ROR is hard: the amount has to go through a register
	if (shiftAmt) {
		
		EMIT(LLmovImm, tmpReg, shiftAmt, 0, EmitFlagsDoNotCare, false);
		EMIT(LLshiftByReg, rdNo, rdNo, tmpReg, EmitShiftRor, flagPrefs, false);
		return EmitErrNone;
	}
	
	//RRX is harder yet: the emulated carry (bit 29 of SR) must become bit 31 of the result
	
	//move SR's C into bit 0
	EMIT(LLmov, tmpReg, REG_NO_SR, EmitShiftLsr, 29, EmitFlagsDoNotCare, false);

	//move SR's C into bit 31 (this also drops the bits that were above it)
	EMIT(LLmov, tmpReg, tmpReg, EmitShiftLsl, 31, EmitFlagsDoNotCare, false);

	//lsrs Rd, #1								//leaves flags set properly for RRX
	EMIT(LLmov, rdNo, rdNo, EmitShiftLsr, 1, flagPrefs, false);
	
	//add Rd, Rtmp	(place C bit into place)	//does not touch flags thus leaving them where they need to be
	EMIT(LLaddReg, rdNo, rdNo, tmpReg, EmitShiftLsl, 0, EmitLeaveFlags, false);

	return EmitErrNone;
}

static enum EmitStatus jitPrvHandleCcStart(struct EmitBuf *dest, struct EmitBuf *ccSkip, enum EmitCc cc)
{
	enum EmitStatus now;
	uint32_t nWords;
	
	if (cc == EmitCcNv)
		return EmitErrNotEncodeable;
	
	if (cc == EmitCcAl)
		return EmitErrNone;
	
	now = jitEmitNumHalfwordsNeededForConditionalSkipover(emitCcInvert(cc), &nWords);
	if (now != EmitErrNone)
		return now;
	
	EMIT(SaveSpace, ccSkip, nWords);

	return EmitErrNone;	
}

//Closes a conditionally-executed region opened by jitPrvHandleCcStart(): the space described
//by "ccSkip" receives a branch, taken on the inverted condition, to the current position in
//"dest". Nothing is emitted for EmitCcAl; EmitCcNv is rejected.
static enum EmitStatus jitPrvHandleCcEnd(struct EmitBuf *dest, struct EmitBuf *ccSkip, enum EmitCc cc)
{
	if (cc == EmitCcAl)
		return EmitErrNone;
	
	if (cc != EmitCcNv)
		return jitPrvConditionalBranchToThumb(ccSkip, emitGetPtrToJumpHere(dest), emitCcInvert(cc));
	
	return EmitErrNotEncodeable;
}

//Translates a load with immediate offset: Rt <- [Rn (+ imm)], in any addressing mode.
// dest      - buffer to emit into
// cc        - condition of the source instruction (EmitCcNv is rejected)
// instrAddr - address of the source instruction (PC reads as instrAddr + 8)
// sext      - sign-extend the loaded byte/halfword (not valid with EmitSzWord)
// rtNo      - destination virtual register; PC turns the load into a jump via the interworking epilogue
// rnNo      - base virtual register; PC is only valid in EmitAdrModeIndex
// imm       - signed offset; must be nonzero in the writeback modes
// adrMode   - indexed, indexed with writeback, or postindexed
// size      - access size
//Returns EmitErrNone, EmitErrNotEncodeable for combinations that are not supported, or
//whatever error a helper reported.
enum EmitStatus jitEmitImmMemLdr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, bool sext, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	//registers that live directly in a lo hardware register are used in place, others go through temporaries
	uint_fast8_t baseReg = (rnNo < 8 && jitPrvIsVregArealReg(rnNo)) ? rnNo : REG_NO_TMP_1;
	uint_fast8_t dstReg = (rtNo < 8 && jitPrvIsVregArealReg(rtNo)) ? rtNo : REG_NO_TMP_3;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//a sanity check
	if (size == EmitSzWord && sext)
		return EmitErrNotEncodeable;
	if (adrMode != EmitAdrModeIndex && rtNo == rnNo)
		return EmitErrNotEncodeable;
	if (adrMode != EmitAdrModeIndex && !imm)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//some SP special handling
	if (rnNo == EMIT_REG_NO_SP && rtNo != EMIT_REG_NO_PC && adrMode == EmitAdrModeIndex && size == EmitSzWord && imm >= 0 && !(imm & 3) && imm < 1024) {
		
		EMIT(LLloadImm, dstReg, EMIT_REG_NO_SP, imm, EmitSzWord, false, EmitAdrModeIndex);
		
		goto write_loaded_val;
	}
	
	//more special SP handling: (ldr Rx, [SP], #4 aka pop)
	if (rnNo == EMIT_REG_NO_SP && rtNo != EMIT_REG_NO_PC && adrMode == EmitAdrModePostindex && size == EmitSzWord && imm == 4) {
		
		EMIT(HLpop, 1 << dstReg);
		
		goto write_loaded_val;
	}
	
	if (rnNo == EMIT_REG_NO_PC) {
		
		uint32_t ea = instrAddr + 8 + imm, val;
		uint_fast8_t instrImm = 0;
		
		if (adrMode != EmitAdrModeIndex)	//pc-base requires no writeback
			return EmitErrNotEncodeable;
		
		if (ea >= CPU_ROM_BASE && ea - CPU_ROM_BASE < CPU_ROM_SIZE) {		//rom is not expected to change
			
			//read the constant now (and only now - other addresses are not touched at
			//translation time), honoring the requested sign extension
			switch (size) {
				case EmitSzByte:
					val = sext ? (uint32_t)(int32_t)*(int8_t*)ea : *(uint8_t*)ea;
					break;
				
				case EmitSzHalfword:
					val = sext ? (uint32_t)(int32_t)*(int16_t*)ea : *(uint16_t*)ea;
					break;
				
				case EmitSzWord:
					val = *(uint32_t*)ea;
					break;
				
				default:
					return EmitErrNotEncodeable;
			}
			
			now = jitPrvEmitLoadImmToLoreg(dest, dstReg, val, true, true);
			if (now != EmitErrNone)
				return now;
		}
		else if (sext) {
			
			//sign-extending loads are done in reg-reg form everywhere in this file, so
			//get the full address and a zero index, then use that form here too
			now = jitPrvEmitLoadImmToLoreg(dest, dstReg, ea, true, true);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_2, 0, true, true);
			if (now != EmitErrNone)
				return now;
			
			//LDRSx Rdst, [Rdst, Rtmp2]
			EMIT(LLloadRegReg, dstReg, dstReg, REG_NO_TMP_2, 0, size, sext);
		}
		else {
			
			//in case we can pool immediates, use load to record the lowest bits.
			//this will not always help, but it will sometimes
			switch (size) {
				case EmitSzByte:
					instrImm = ea & 0x1f;
					break;
				
				case EmitSzHalfword:
					instrImm = ea & 0x3e;
					break;
				
				case EmitSzWord:
					instrImm = ea & 0x7c;
					break;
			}
			
			//get addr
			now = jitPrvEmitLoadImmToLoreg(dest, dstReg, ea - instrImm, true, true);
			if (now != EmitErrNone)
				return now;
			
			//load from it
			EMIT(LLloadImm, dstReg, dstReg, instrImm, size, false, EmitAdrModeIndex);
		}
		goto write_loaded_val;
	}
	
	if (adrMode == EmitAdrModeIndex) {
		
		bool canUseDirect = false;
		
		//small negative immediate is suboptimal for what comes next, so handle it specially
		if (!sext && imm < 0 && imm > -0x100) {
			
			//get the base TO a corruptible reg!
			now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
			if (now != EmitErrNone)
				return now;
			
			//subtract imm
			EMIT(LLsubImm, REG_NO_TMP_1, REG_NO_TMP_1, -imm, EmitFlagsDoNotCare, false);
			
			//do the load
			EMIT(LLloadImm, dstReg, REG_NO_TMP_1, 0, size, sext, EmitAdrModeIndex);

			goto write_loaded_val;
		}
		
		//does the offset fit the immediate form of the load for this size?
		if (imm >= 0 && !sext) switch (size) {
			case EmitSzByte:
				if (imm < 32)
					canUseDirect = true;
				break;
			
			case EmitSzHalfword:
				if (imm < 64  && !(imm & 1))
					canUseDirect = true;
				break;
			
			case EmitSzWord:
				if (imm < 128 && !(imm & 3))
					canUseDirect = true;
				break;
		}
		
		//get the base
		if (baseReg == REG_NO_TMP_1) {
			
			now = jitPrvLoadVreg(dest, baseReg, rnNo, instrAddr);
			if (now != EmitErrNone)
				return now;
		}
		
		//if we can do a direct load, do it
		if (canUseDirect) {
			
			//perform the load
			EMIT(LLloadImm, dstReg, baseReg, imm, size, sext, EmitAdrModeIndex);
		}
		else {		//else load the imm and use a reg-reg load
			
			//get imm
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_2, imm, true, true);
			if (now != EmitErrNone)
				return now;
			
			//do the load
			EMIT(LLloadRegReg, dstReg, baseReg, REG_NO_TMP_2, 0, size, sext);
		}
		goto write_loaded_val;
	}
	
	//indexed mode is handled, both the remaining modes (postindexed and index+wbak) have writeback and they all need the base reg
	if (baseReg == REG_NO_TMP_1) {
		
		now = jitPrvLoadVreg(dest, baseReg, rnNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	
	if (adrMode == EmitAdrModePostindex) {
	
		if (size == EmitSzWord && imm == 4) {		//special case where LDM works
			
			EMIT(LLldmia, baseReg, 1 << dstReg, true);
			goto wbak_base_and_write_loaded_val;
		}
		
		if (sext) {							//for sext-load with postincrement, we need a zero register for the actual load
			
			//get imm of zero
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_2, 0, true, true);
			if (now != EmitErrNone)
				return now;
			
			//LDRSx Rdst, [Rbase, Rtmp2]
			EMIT(LLloadRegReg, dstReg, baseReg, REG_NO_TMP_2, 0, size, sext);
		}
		else {										//normal load from base
			
			EMIT(LLloadImm, dstReg, baseReg, 0, size, false, EmitAdrModeIndex);
		}

		//postincrement the base (a failure here must not be silently dropped)
		now = jitPrvAddImmToReg(dest, baseReg, REG_NO_TMP_2, imm);
		if (now != EmitErrNone)
			return now;
		goto wbak_base_and_write_loaded_val;
	}
	
	//writeback mode with sext is best done in a special way
	if (sext) {
		
		//get imm
		now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_2, imm, true, true);
		if (now != EmitErrNone)
			return now;
		
		//do the load
		EMIT(LLloadRegReg, dstReg, baseReg, REG_NO_TMP_2, 0, size, sext);
		
		//not directly tested
		goto add_tmp2_to_base;
	}
	
	//preincrement the base, then load from it
	now = jitPrvAddImmToReg(dest, baseReg, REG_NO_TMP_2, imm);
	if (now != EmitErrNone)
		return now;
	
	EMIT(LLloadImm, dstReg, baseReg, 0, size, false, EmitAdrModeIndex);
	goto wbak_base_and_write_loaded_val;

add_tmp2_to_base:
	
	//a base that lives in a hardware register is updated in place (Rtmp1 only held a copy)
	if (jitPrvIsVregArealReg(rnNo)) {
		
		EMIT(LLaddReg, rnNo, rnNo, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		goto write_loaded_val;
	}
	EMIT(LLaddReg, baseReg, baseReg, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

wbak_base_and_write_loaded_val:
	
	//a base that was worked on in Rtmp1 has to be written back to its virtual register
	if (baseReg == REG_NO_TMP_1) {
		
		now = jitPrvStoreVreg(dest, rnNo, baseReg, true);
		if (now != EmitErrNone)
			return now;
	}

write_loaded_val:

	if (rtNo == EMIT_REG_NO_PC) {
		
		struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	
		if (REG_NO_DST_EPILOGUE_REG != dstReg) {
			
			//mov RepilogueDst, Rtmp3
			EMIT(LLmov, REG_NO_DST_EPILOGUE_REG, dstReg, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		
		//bl rtd->epilogueInterworking
		EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
	}
	else if (dstReg == REG_NO_TMP_3) {
		
		//value was loaded into a temporary: move it to where Rt really lives
		now = jitPrvStoreVreg(dest, rtNo, dstReg, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Translates a store with immediate offset: [Rn (+ imm)] <- Rt, in any addressing mode.
// dest      - buffer to emit into
// cc        - condition of the source instruction (EmitCcNv is rejected)
// instrAddr - address of the source instruction (PC reads as instrAddr + 8)
// rtNo      - virtual register whose value is stored
// rnNo      - base virtual register; PC is only valid in EmitAdrModeIndex
// imm       - signed offset
// adrMode   - indexed, indexed with writeback, or postindexed
// size      - access size
//Returns EmitErrNone, EmitErrNotEncodeable for combinations that are not supported, or
//whatever error a helper reported.
enum EmitStatus jitEmitImmMemStr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	//a negative value here means "not directly usable, go through a temporary"
	int_fast8_t baseReg = (rnNo < 8 && jitPrvIsVregArealReg(rnNo)) ? (int_fast8_t)(uint_fast8_t)rnNo : -1;
	int_fast8_t srcReg = (rtNo < 8 && jitPrvIsVregArealReg(rtNo)) ? (int_fast8_t)(uint_fast8_t)rtNo : -1;
	uint32_t idxOfstToUse = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//a sanity check
	if (adrMode != EmitAdrModeIndex && rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//some SP special handling (str Rx, [SP, #])
	if (rnNo == EMIT_REG_NO_SP && adrMode == EmitAdrModeIndex && size == EmitSzWord && imm >= 0 && !(imm & 3) && imm < 1024) {
		
		if (srcReg >= 0) {
			
			EMIT(LLstoreImm, srcReg, EMIT_REG_NO_SP, imm, EmitSzWord, EmitAdrModeIndex);
		}
		else {
		
			//get Rt
			now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rtNo, instrAddr);
			if (now != EmitErrNone)
				return now;
			
			//perform store
			EMIT(LLstoreImm, REG_NO_TMP_1, EMIT_REG_NO_SP, imm, EmitSzWord, EmitAdrModeIndex);
		}
	}
	//more special SP handling: (str Rx, [SP, #-4]! aka push)
	else if (rnNo == EMIT_REG_NO_SP && adrMode == EmitAdrModeIndexWbak && size == EmitSzWord && imm == -4) {
		
		if (srcReg >= 0) {
			
			EMIT(HLpush, 1 << srcReg);
		}
		else {
		
			//get Rt
			now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rtNo, instrAddr);
			if (now != EmitErrNone)
				return now;
			
			//perform store using push
			EMIT(HLpush, 1 << REG_NO_TMP_1);
		}
	}
	else {	//generic code
		if (rnNo == EMIT_REG_NO_PC) {	//precalc the position
			
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, instrAddr + 8 + imm, true, true);
			if (now != EmitErrNone)
				return now;
			
			imm = 0;
		}
		else {
			
			//in indexed mode we might be able to use instrs that support this:
			//as much of the offset as possible goes into the store instruction itself
			if (imm > 0 && adrMode == EmitAdrModeIndex) {
				
				switch (size) {
					case EmitSzByte:
						if (imm < 32)
							idxOfstToUse = imm;
						else
							idxOfstToUse = 31;
						break;
					
					case EmitSzHalfword:
						if (imm < 64 && !(imm & 1))
							idxOfstToUse = imm;
						else
							idxOfstToUse = 62;
						break;
					
					case EmitSzWord:
						if (imm < 128 && !(imm & 3))
							idxOfstToUse = imm;
						else
							idxOfstToUse = 124;
						break;
				}
				imm -= idxOfstToUse;
			}
			
			//we can only use the actual reg if we'll not change it, or if it is ok to change it
			if (adrMode == EmitAdrModeIndex && imm != 0)
				baseReg = -1;
			
			if (baseReg < 0) {
				
				//get base reg into Rtmp1
				now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
				if (now != EmitErrNone)
					return now;
			}
		}
		
		//get Rt into Rtmp3
		//(not for a PC base: Rtmp1 then holds the precomputed address, not the value PC reads as)
		if (rtNo == rnNo && rnNo != EMIT_REG_NO_PC) {	//if same reg, use the fact we have it already
			
			//mov Rtmp3, Rtmp1
			EMIT(LLmov, REG_NO_TMP_3, baseReg >= 0 ? baseReg : REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			srcReg = REG_NO_TMP_3;
		}
		else if (srcReg < 0) {
			
			now = jitPrvLoadVreg(dest, REG_NO_TMP_3, rtNo, instrAddr);
			if (now != EmitErrNone)
				return now;
			srcReg = REG_NO_TMP_3;
		}
		
		//if we need to preincrement, do so
		if (adrMode != EmitAdrModePostindex) {
			
			now = jitPrvAddImmToReg(dest, baseReg >= 0 ? baseReg : REG_NO_TMP_1, REG_NO_TMP_2, imm);
			if (now != EmitErrNone)
				return now;
		}
		
		//some postincrements are handleable better
		if (adrMode == EmitAdrModePostindex && idxOfstToUse == 0 && size == EmitSzWord && imm == 4) {
			
			//perform store from Rdst
			EMIT(LLstoreImm, srcReg, baseReg >= 0 ? baseReg : REG_NO_TMP_1, 4, size, EmitAdrModePostindex);
		}
		else {
			
			//perform store from Rdst
			EMIT(LLstoreImm, srcReg, baseReg >= 0 ? baseReg : REG_NO_TMP_1, idxOfstToUse, size, EmitAdrModeIndex);
			
			//if we need to postincrement, do so
			if (adrMode == EmitAdrModePostindex) {
				
				now = jitPrvAddImmToReg(dest, baseReg >= 0 ? baseReg : REG_NO_TMP_1, REG_NO_TMP_2, imm);
				if (now != EmitErrNone)
					return now;
			}
		}
		
		//if we need to store base reg back, do so (we know it will not be PC)
		if (adrMode != EmitAdrModeIndex && baseReg < 0) {
			
			now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
			if (now != EmitErrNone)
				return now;
		}
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitRegRegMemLdr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, bool sext, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//a sanity check
	if (size == EmitSzWord && sext)
		return EmitErrNotEncodeable;
	else if (adrMode != EmitAdrModeIndex && rtNo == rnNo)
		return EmitErrNotEncodeable;
	else if (adrMode != EmitAdrModeIndex && rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	else if (adrMode != EmitAdrModeIndex && rnNo == rmNo)
		return EmitErrNotEncodeable;
	else if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get base reg into Rtmp1
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	//get Rm into Rtmp2
	if (rmNo == rnNo) {	//if same reg, use the fact we have it already
		
		//mov Rtmp2, Rtmp1
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {
		
		now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rmNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	
	//shift Rm if we need to
	now = jitPrvShiftReg(dest, REG_NO_TMP_2, REG_NO_TMP_3, shiftType, shiftAmt, EmitFlagsDoNotCare);
	if (now != EmitErrNone)
		return now;

	//we have reg-reg loads which work well, but also some (signed loads) loads require reg-reg loads. lots of thinking to do here
	if (sext || (adrMode == EmitAdrModeIndex && isAdd)) {
		
		uint32_t reg2 = REG_NO_TMP_2;
		
		if (adrMode == EmitAdrModePostindex) {
			
			reg2 = REG_NO_TMP_4;
			
			EMIT(LLmovImm, REG_NO_TMP_4, 0, 0,  EmitFlagsDoNotCare, false);
		}
		else if (!isAdd) {
			
			EMIT(LLrsbImm, REG_NO_TMP_2, REG_NO_TMP_2, 0, EmitFlagsDoNotCare, false);
			isAdd = true;
		}
		
		//perform load into Rtmp3
		EMIT(LLloadRegReg, REG_NO_TMP_3, REG_NO_TMP_1, reg2, 0, size, sext);
		
		//if we need to wbak, do it
		if (adrMode == EmitAdrModeIndexWbak) {
			
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
	}
	else {
		
		//if we need to preincrement, do so
		if (adrMode != EmitAdrModePostindex) {
			
			if (isAdd)
				EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			else
				EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		
		//perform load into Rtmp3
		EMIT(LLloadImm, REG_NO_TMP_3, REG_NO_TMP_1, 0, size, sext, EmitAdrModeIndex);
	}
	
	//if we need to postincrement, do so
	if (adrMode == EmitAdrModePostindex) {
		
		if (isAdd)
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}

	//if we need to store base reg back, do so (we know it will not be PC)
	if (adrMode != EmitAdrModeIndex) {
		
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	//store Rt (if pc, do the jump)
	if (rtNo == EMIT_REG_NO_PC) {
		
		struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	
		if (REG_NO_DST_EPILOGUE_REG != REG_NO_TMP_3) {
			
			//mov RepilogueDst, Rtmp3
			EMIT(LLmov, REG_NO_DST_EPILOGUE_REG, REG_NO_TMP_3, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		
		//bl rtd->epilogueInterworking
		EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
	}
	else {
		
		now = jitPrvStoreVreg(dest, rtNo, REG_NO_TMP_3, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for an ARM store with a (possibly shifted) register offset.
 *
 *	dest       - buffer the code is emitted into
 *	cc         - condition the ARM instruction executes under
 *	instrAddr  - address of the ARM instruction (handed to jitPrvLoadVreg)
 *	rtNo       - vreg whose value is stored
 *	rnNo       - base address vreg
 *	isAdd      - true to add the offset to the base, false to subtract it
 *	rmNo       - offset vreg
 *	shiftType, shiftAmt - shift applied to the offset before use
 *	adrMode    - plain index, index with writeback, or postindex
 *	size       - width of the memory access
 *
 * Working registers: Rtmp1 = base, Rtmp2 = offset, Rtmp3 = value; Rtmp4 is
 * handed to jitPrvShiftReg.
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable for register
 * combinations that are refused, or the error reported by a helper.
 */
enum EmitStatus jitEmitRegRegMemStr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	const bool noWbak = (adrMode == EmitAdrModeIndex);
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//when the base gets written back it may be neither PC nor the stored reg
	if (!noWbak && (rnNo == EMIT_REG_NO_PC || rnNo == rtNo))
		return EmitErrNotEncodeable;

	//the offset reg may never be PC
	if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//Rtmp1 <- Rn
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;

	//Rtmp2 <- Rm (a plain copy of Rtmp1 if it is the same vreg)
	if (rmNo != rnNo) {

		now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rmNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	else {

		//mov Rtmp2, Rtmp1
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}

	//Rtmp3 <- Rt (reusing an already loaded copy where possible)
	if (rtNo == rnNo) {

		//mov Rtmp3, Rtmp1
		EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else if (rtNo == rmNo) {

		//mov Rtmp3, Rtmp2 (taken before the offset gets shifted)
		EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {

		now = jitPrvLoadVreg(dest, REG_NO_TMP_3, rtNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}

	//apply the requested shift to the offset
	now = jitPrvShiftReg(dest, REG_NO_TMP_2, REG_NO_TMP_4, shiftType, shiftAmt, EmitFlagsDoNotCare);
	if (now != EmitErrNone)
		return now;

	if (noWbak) {

		//plain index: negate the offset if subtracting, then use a reg-reg store
		if (!isAdd) {
			EMIT(LLrsbImm, REG_NO_TMP_2, REG_NO_TMP_2, 0, EmitFlagsDoNotCare, false);
		}

		EMIT(LLstoreRegReg, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, 0, size);

		return jitPrvHandleCcEnd(dest, &ccSkip, cc);
	}

	//writeback modes: adjust the base before the store (preindex) ...
	if (adrMode == EmitAdrModeIndexWbak) {

		if (isAdd) {
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else {
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
	}

	EMIT(LLstoreImm, REG_NO_TMP_3, REG_NO_TMP_1, 0, size, EmitAdrModeIndex);

	//... or after it (postindex)
	if (adrMode == EmitAdrModePostindex) {

		if (isAdd) {
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else {
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
	}

	//write the updated base back to Rn
	now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
	if (now != EmitErrNone)
		return now;

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

/*
 * Emit code for ARM MUL (raNoReq < 0) or MLA (raNoReq is the accumulator vreg).
 *
 *	dest     - buffer the code is emitted into
 *	cc       - condition the ARM instruction executes under
 *	rdNo     - destination vreg
 *	rnNo     - first multiplicand vreg
 *	rmNo     - second multiplicand vreg
 *	raNoReq  - accumulator vreg for MLA, negative for a plain MUL
 *	s        - true if the emulated N and Z flags in Rsr are to be updated
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable if any register is PC,
 * or the error reported by a helper.
 */
static enum EmitStatus jitPrvEmitMulOrMla(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, int32_t raNoReq, bool s)
{
	uint32_t raNo = (raNoReq < 0) ? 31 : raNoReq;	//some nonexistent reg so it does not accidentally match anything, but < 32
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//v5 forbids Rm == Rd. It does not help us any to enforce that so we do not. we do enforce no use of PC
	//shifts are done on unsigned values: raNo may be 31 and "1 << 31" on a signed int is undefined
	if (((1UL << rdNo) | (1UL << rnNo) | (1UL << rmNo) | (1UL << raNo)) & (1UL << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get Rn reg into Rtmp1
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	//get Rm into Rtmp2
	if (rmNo == rnNo) {	//if same reg, use the fact we have it already
		
		//mov Rtmp2, Rtmp1
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {
		
		now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
		if (now != EmitErrNone)
			return now;
	}
	
	//if we have Ra, get it into Rtmp3 (for plain MUL raNo is 31 and matches nothing)
	if (raNo == rnNo) {			//if same reg, use the fact we have it already
		
		//mov Rtmp3, Rtmp1
		EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else if (raNo == rmNo) {	//if same reg, use the fact we have it already
		
		//mov Rtmp3, Rtmp2
		EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else if (raNoReq >= 0) {
		
		now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_3, raNo);
		if (now != EmitErrNone)
			return now;
	}
	
	//multiply into REG_NO_TMP_1
	//NOTE(review): for a flag-setting MUL without accumulate the flags read below come from this
	//	multiply, which is requested with EmitFlagsDoNotCare - confirm the emitter always picks the flag-setting form
	EMIT(LLmulReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitFlagsDoNotCare, false);
	
	if (raNoReq >= 0) {
	
		//add Rtmp1, Rtmp3	//if we had Ra, do the accumulate
		EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_3, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
	}

	//store
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_1, true);
	if (now != EmitErrNone)
		return now;
	
	//if we need to set flags, do it: only N and Z are copied from the real flags into Rsr
	if (s) {
		
		//MRS Rtmp1, CPSR
		EMIT(LLmrs, REG_NO_TMP_1, EMIT_SYSM_APSR);
		
		//ldr Rtmp3, =ARM_SR_BIT_N | ARM_SR_BIT_Z
		now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_3, ARM_SR_BIT_N | ARM_SR_BIT_Z, true, true);
		if (now != EmitErrNone)
			return now;
		
		//bic Rsr, Rtmp3	//drop old N and Z
		EMIT(LLbicReg, REG_NO_SR, REG_NO_SR, REG_NO_TMP_3, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

		//and Rtmp1, Rtmp3	//keep only new N and Z
		EMIT(LLandReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_3, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		
		//add Rsr, Rtmp1	//combine (bits do not overlap)
		EMIT(LLaddReg, REG_NO_SR, REG_NO_SR, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit code for ARM MUL: the shared MUL/MLA emitter is called with a negative accumulator to mean "no accumulate"
enum EmitStatus jitEmitMul(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, bool s)
{
	const int32_t noAccumulator = -1;

	return jitPrvEmitMulOrMla(dest, cc, rdNo, rnNo, rmNo, noAccumulator, s);
}

enum EmitStatus jitEmitMla(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t raNo, bool s)
{
	return jitPrvEmitMulOrMla(dest, cc, rdNo, rnNo, rmNo, raNo, s);
}

//Count leading zeroes of val; a zero input yields 32 (the builtin is never called with 0)
static uint32_t jitPrvCalloutClz(uint32_t val)
{
	if (!val)
		return 32;

	return __builtin_clz(val);
}

/*
 * Emit code for ARM CLZ Rd, Rm by calling out to jitPrvCalloutClz.
 *
 *	dest  - buffer the code is emitted into
 *	cc    - condition the ARM instruction executes under
 *	rdNo  - destination vreg
 *	rmNo  - source vreg
 *
 * The emitted code loads Rm into r0, the callout address into r1, does
 * "blx r1" and stores r0 to Rd.
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable if either register is
 * PC, or the error reported by a helper.
 */
enum EmitStatus jitEmitClz(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//v5 forbids PC
	if (((1 << rdNo) | (1 << rmNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get Rm reg into r0
	now = jitPrvLoadVregNotPc(dest, 0, rmNo);
	if (now != EmitErrNone)
		return now;
	
	//get pointer to CLZ into r1
	now = jitPrvLiteralLoad(dest, 1, (uintptr_t)&jitPrvCalloutClz);
	if (now != EmitErrNone)
		return now;
	
	//blx r1
	EMIT(LLblx, 1);

	//store the callout's result (r0) into Rd
	now = jitPrvStoreVreg(dest, rdNo, 0, true);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//all multiply callouts shall leave N flag set if top bit of top word is one!

/*
 * UMULL: Thumb assembly fragment computing the unsigned 64-bit product of r0 and r1
 * from four 16x16 partial products.
 *	in:       r0 = A, r1 = B
 *	out:      r0 = low word, r1 = high word; the final ADCS leaves N = top bit of the high word
 *	clobbers: r2, r3, r4 (users of this macro save and restore r4 themselves)
 *
 * The sum of the two cross products must end up in r3: the following instructions
 * split r3 into the part added to the high word (r3 >> 16, plus the carry in r4)
 * and the part added to the low word (r3 << 16).
 */
#define UMULL																							\
		"	lsrs r2, r0, #16	\n\t"	/* A.hi */														\
		"	lsrs r3, r1, #16	\n\t"	/* B.hi */														\
		"	uxth r0, r0			\n\t"	/* A.lo */														\
		"	uxth r1, r1			\n\t"	/* B.lo */														\
		"	movs r4, r0			\n\t"	/* A.lo */														\
		"	muls r0, r1			\n\t"	/* A.lo * B.lo */												\
		"	muls r1, r2			\n\t"	/* A.hi * B.lo */												\
		"	muls r2, r3			\n\t"	/* A.hi * B.hi */												\
		"	muls r3, r4			\n\t"	/* A.lo * B.hi */												\
		"	movs r4, #0			\n\t"																	\
		"	adds r3, r1			\n\t"	/* LO32(A.hi * B.lo + A.lo * B.hi) */							\
		"	adcs r4, r4			\n\t"	/* HI32(A.hi * B.lo + A.lo * B.hi) */							\
		"	lsls r4, r4, #16	\n\t"																	\
		"	lsrs r1, r3, #16	\n\t"																	\
		"	adds r1, r4			\n\t"	/* HI32((A.hi * B.lo + A.lo * B.hi) << 16) */					\
		"	lsls r3, r3, #16	\n\t"	/* LO32((A.hi * B.lo + A.lo * B.hi) << 16) */					\
		"	adds r0, r3			\n\t"	/* LO32((A.hi * B.lo + A.lo * B.hi) << 16) + A.lo * B.lo */		\
		"	adcs r1, r2			\n\t"	/* HI32((A.hi * B.lo + A.lo * B.hi) << 16) + A.hi * B.hi */

/*
 * Callout for unsigned long multiply. Naked: the body is only the assembly below.
 * The assembly takes the two operands in r0 and r1 and leaves the 64-bit product
 * in r0 (low word) and r1 (high word). r4 is used as scratch by UMULL and is
 * saved/restored around it; push/pop/bx do not touch the flags set by UMULL.
 */
static uint64_t __attribute__((naked)) jitPrvCalloutUmull(uint32_t n, uint32_t m)
{
	asm volatile(
		"	.syntax unified		\n\t"
		"	push {r4}			\n\t"
		UMULL
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
	);
	
	//shut up gcc
	return 0;
}

/*
 * Callout for signed long multiply. Naked: the body is only the assembly below.
 * Operands arrive in r0 and r1, the product is left in r0 (low) and r1 (high).
 * Negative operands are negated first, the magnitudes are multiplied with UMULL,
 * and the 64-bit result is negated again if exactly one operand was negative.
 * r4 is saved/restored around the whole sequence.
 */
static int64_t __attribute__((naked)) jitPrvCalloutSmull(int32_t n, int32_t m)
{
	asm volatile(
		"	.syntax unified		\n\t"
		"	push {r4}			\n\t"
		"	tst  r0, r0			\n\t"
		"	bmi  1f				\n\t"
		"	tst  r1, r1			\n\t"
		"	bmi  2f				\n\t"
		//both positive
		UMULL
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
		//A is positive, B is negative
		"2:						\n\t"
		"	negs r1, r1			\n\t"
		UMULL
		//go to result negation
		"	b    9f				\n\t"
		//A is negative
		"1:						\n\t"
		"	negs r0, r0			\n\t"
		"	tst  r1, r1			\n\t"
		"	bmi  3f				\n\t"
		//A is negative, B is positive
		UMULL
		//negate results (64-bit: 0 - r1:r0; SBCS sets the flags from the new high word, the plain MOV keeps them)
		"9:						\n\t"
		"	movs r2, #0			\n\t"
		"	negs r0, r0			\n\t"
		"	sbcs r2, r1			\n\t"
		"	mov  r1, r2			\n\t"
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
		//both negative
		"3:						\n\t"
		"	negs r1, r1			\n\t"
		UMULL
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
	);
	
	//shut up gcc
	return 0;
}

/*
 * Callout for unsigned long multiply-accumulate. Naked: the body is only the assembly below.
 * Operands arrive in r0 and r1, the accumulator in r2 (low) and r3 (high).
 * The accumulator and r4 are pushed because UMULL clobbers r2-r4; after the
 * multiply the accumulator is popped back into r2/r3 and added to the product
 * in r0 (low) / r1 (high). The final ADCS sets the flags from the high word.
 */
static uint64_t __attribute__((naked)) jitPrvCalloutUmlal(uint32_t n, uint32_t m, uint64_t acc)
{
	asm volatile(
		"	.syntax unified		\n\t"
		"	push {r2-r4}		\n\t"
		UMULL
		"	pop  {r2, r3}		\n\t"
		"	adds r0, r2			\n\t"
		"	adcs r1, r3			\n\t"
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
	);
	
	//shut up gcc
	return 0;
}

/*
 * Callout for signed long multiply-accumulate. Naked: the body is only the assembly below.
 * Operands arrive in r0 and r1, the accumulator in r2 (low) and r3 (high).
 * The accumulator and r4 are pushed up front. The signed product is formed the
 * same way as in jitPrvCalloutSmull (negate negative inputs, UMULL, negate the
 * result if the signs differed); every path then reaches label 8 where the
 * accumulator is popped into r2/r3 and added to the product in r0/r1.
 */
static int64_t __attribute__((naked)) jitPrvCalloutSmlal(int32_t n, int32_t m, int64_t acc)
{
	asm volatile(
		"	.syntax unified		\n\t"
		"	push {r2-r4}		\n\t"
		"	tst  r0, r0			\n\t"
		"	bmi  1f				\n\t"
		"	tst  r1, r1			\n\t"
		"	bmi  2f				\n\t"
		//both positive
		UMULL
		"	b 8f				\n\t"
		//A is positive, B is negative
		"2:						\n\t"
		"	negs r1, r1			\n\t"
		UMULL
		//go to result negation
		"	b    9f				\n\t"
		//A is negative
		"1:						\n\t"
		"	negs r0, r0			\n\t"
		"	tst  r1, r1			\n\t"
		"	bmi  3f				\n\t"
		//A is negative, B is positive
		UMULL
		//negate results (64-bit: 0 - r1:r0)
		"9:						\n\t"
		"	movs r2, #0			\n\t"
		"	negs r0, r0			\n\t"
		"	sbcs r2, r1			\n\t"
		"	mov  r1, r2			\n\t"
		"	b 8f				\n\t"
		//both negative
		"3:						\n\t"
		"	negs r1, r1			\n\t"
		UMULL
		
		//accumulate (the "both negative" path falls through to here)
		"8:						\n\t"
		"	pop  {r2, r3}		\n\t"
		"	adds r0, r2			\n\t"
		"	adcs r1, r3			\n\t"
		"	pop  {r4}			\n\t"
		"	bx   lr				\n\t"
	);
	
	//shut up gcc
	return 0;
}

/*
 * Emit code for the ARM long multiplies (UMULL, SMULL, UMLAL, SMLAL) by calling
 * out to one of the jitPrvCallout{U,S}{mull,mlal} routines.
 *
 *	dest    - buffer the code is emitted into
 *	cc      - condition the ARM instruction executes under
 *	rdLoNo  - vreg receiving the low result word (and supplying the low accumulator word)
 *	rdHiNo  - vreg receiving the high result word (and supplying the high accumulator word)
 *	rnNo    - first multiplicand vreg
 *	rmNo    - second multiplicand vreg
 *	unsign  - true for the unsigned variants
 *	accum   - true for the accumulating variants
 *	s       - true if the emulated N and Z flags in Rsr are to be updated
 *
 * The emitted code passes Rn in r0, Rm in r1 and, when accumulating, RdLo in r2
 * and RdHi in r3; the result comes back in r0 (low) and r1 (high).
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable for PC or RdLo == RdHi,
 * or the error reported by a helper.
 */
enum EmitStatus jitEmitLongMul(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdLoNo, uint32_t rdHiNo, uint32_t rnNo, uint32_t rmNo, bool unsign, bool accum, bool s)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	void *calloutPtr;
	
	//no PC is allowed, dest regs must be distinct
	if (((1 << rdLoNo) | (1 << rdHiNo) | (1 << rnNo) | (1 << rmNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	if (rdLoNo == rdHiNo)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get Rn reg into r0
	now = jitPrvLoadVregNotPc(dest, 0, rnNo);
	if (now != EmitErrNone)
		return now;
	
	//get Rm into r1
	if (rmNo == rnNo) {	//if same reg, use the fact we have it already
		
		//mov r1, r0
		EMIT(LLmov, 1, 0, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {
		
		now = jitPrvLoadVregNotPc(dest, 1, rmNo);
		if (now != EmitErrNone)
			return now;
	}
	
	//if we're accumulating (get rdLo & rdHi)
	if (accum) {
		
		//get RdLo into r2
		if (rdLoNo == rnNo) {	//if same reg, use the fact we have it already
			
			//mov r2, r0
			EMIT(LLmov, 2, 0, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else if (rdLoNo == rmNo) {	//if same reg, use the fact we have it already
			
			//mov r2, r1
			EMIT(LLmov, 2, 1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else {
			
			now = jitPrvLoadVregNotPc(dest, 2, rdLoNo);
			if (now != EmitErrNone)
				return now;
		}
		
		//get RdHi into r3
		if (rdHiNo == rnNo) {	//if same reg, use the fact we have it already
			
			//mov r3, r0
			EMIT(LLmov, 3, 0, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else if (rdHiNo == rmNo) {	//if same reg, use the fact we have it already
			
			//mov r3, r1
			EMIT(LLmov, 3, 1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		else {
			
			now = jitPrvLoadVregNotPc(dest, 3, rdHiNo);
			if (now != EmitErrNone)
				return now;
		}
		
		calloutPtr = unsign ? (void*)&jitPrvCalloutUmlal : (void*)&jitPrvCalloutSmlal;
	}
	else {
		
		calloutPtr = unsign ? (void*)&jitPrvCalloutUmull : (void*)&jitPrvCalloutSmull;
	}
	
	//get pointer to callout into r4
	//NOTE(review): register 4 is also REG_NO_SR in this backend, so this load appears to overwrite the
	//	emulated status word (the callouts preserve r4, so it still holds the pointer afterwards) - confirm
	//	whether Rsr is saved/restored elsewhere
	now = jitPrvLiteralLoad(dest, 4, (uintptr_t)calloutPtr);
	if (now != EmitErrNone)
		return now;
	
	//blx r4
	EMIT(LLblx, 4);
	
	//store
	now = jitPrvStoreVreg(dest, rdLoNo, 0, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdHiNo, 1, true);
	if (now != EmitErrNone)
		return now;

	//flags: N comes from the callout, Z is computed here; both results are already stored so r0-r2 are free
	if (s) {

		//MRS r2, CPSR		//N is currently correct, just grab it into r2
		EMIT(LLmrs, 2, EMIT_SYSM_APSR);
		
		//lsr r2, #31 (cut off all but the N bit
		EMIT(LLmov, 2, 2, EmitShiftLsr, 31, EmitFlagsDoNotCare, false);
		
		//orrs r0, r1		//calculate Z by orring words
		EMIT(LLorrReg, 0, 0, 1, EmitShiftLsl, 0, EmitSetFlags, false);

		//MRS r0, CPSR		//read APSR with proper Z into r0 (must not touch r2: it holds N)
		EMIT(LLmrs, 0, EMIT_SYSM_APSR);
		
		//lsr r0, #31 (shift read APSR's Z bit into APSR.C)
		EMIT(LLmov, 0, 0, EmitShiftLsr, 31, EmitSetFlags, false);
		
		//adcs r2, r2	//now r2 has proper N in bit 1 and proper Z in bit 0
		EMIT(LLadcReg, 2, 2, 2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

		//lsl Rsr, #2 (cut off N & Z)
		EMIT(LLmov, REG_NO_SR, REG_NO_SR, EmitShiftLsl, 2, EmitFlagsDoNotCare, false);
		
		//lsr Rsr, #2 (put positions back into place)
		EMIT(LLmov, REG_NO_SR, REG_NO_SR, EmitShiftLsr, 2, EmitFlagsDoNotCare, false);
		
		//lsl r2, #30 (place new NZ into proper place)
		EMIT(LLmov, 2, 2, EmitShiftLsl, 30, EmitFlagsDoNotCare, false);
		
		//add Rsr, r2 (combine)
		EMIT(LLaddReg, REG_NO_SR, REG_NO_SR, 2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for a POP (load multiple from SP with writeback).
 *
 *	dest      - buffer the code is emitted into
 *	cc        - condition the ARM instruction executes under
 *	regsMask  - bitmask of vregs to pop; SP itself is refused
 *
 * The vregs are popped in batches. Each batch is one load-multiple from SP into
 * real registers, followed by stores of those values into the vregs that do not
 * live in a real register. A batch ends when no further register can be added
 * without breaking the ascending register order the load-multiple needs.
 * A popped PC ends up in REG_NO_DST_EPILOGUE_REG followed by a call to
 * rtd->epilogueInterworking; if PC is the only register left, rtd->epiloguePopPc
 * is called instead.
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable if SP is in the list, or
 * the error reported by a helper.
 */
enum EmitStatus jitPrvPop(struct EmitBuf *dest, enum EmitCc cc, uint32_t regsMask)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (regsMask & (1 << EMIT_REG_NO_SP))	//writeback AND load of the reg not allowed
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//one iteration per batch
	while (regsMask) {
		
		//nextFreeRealReg: next of real regs 0..3 usable as a temporary; usedRealRegs: entries in indirectPoppingMap
		uint_fast8_t i, nextFreeRealReg = 0, numFreeRealRegs = 4, usedRealRegs = 0;
		uint8_t indirectPoppingMap[4][2];	//[n][0] = real reg the value is popped into, [n][1] = vreg it belongs to
		uint_fast16_t popMask = 0;			//real regs loaded by this batch
		
		if (regsMask == (1 << EMIT_REG_NO_PC)) {	//will be last
			
			//bl rtd->epiloguePopPc
			EMIT(LLbl, (uintptr_t)rtd->epiloguePopPc);		//only second word is used
			break;
		}
		
		//collect as many vregs (lowest first) as fit into this batch
		while (regsMask) {
			
			uint_fast8_t realReg, nextVregToPop = __builtin_ctz(regsMask);
						
			if (nextVregToPop < 8 && jitPrvIsVregArealReg(nextVregToPop)) {
				
				realReg = nextVregToPop;	//vreg maps to real reg
			}
			else if (nextVregToPop == EMIT_REG_NO_PC) {	//must be last, needs special handling
				
				if (popMask >> REG_NO_DST_EPILOGUE_REG)	//verify that this is safe to do (order)
					break;
				
				realReg = REG_NO_DST_EPILOGUE_REG;
				
				//record the mapping
				nextFreeRealReg = realReg + 1;
				indirectPoppingMap[usedRealRegs][0] = realReg;
				indirectPoppingMap[usedRealRegs][1] = nextVregToPop;
				usedRealRegs++;
			}
			else {							//we need a real reg to load this vreg
				
				if (nextFreeRealReg >= numFreeRealRegs)	//no more available -> bail
					break;
				
				realReg = nextFreeRealReg;
				
				//but this will only work if we have not yet loaded any higher reg in our mask
				if (popMask >> realReg)
					break;
				
				//record the mapping
				nextFreeRealReg++;
				indirectPoppingMap[usedRealRegs][0] = realReg;
				indirectPoppingMap[usedRealRegs][1] = nextVregToPop;
				usedRealRegs++;
			}
			//the vreg is now part of this batch
			popMask |= 1 << realReg;
			regsMask &=~ (1 << nextVregToPop);
		}
		
		//do the pop
		EMIT(HLldmia, EMIT_REG_NO_SP, popMask, true);
		
		//move any regs that need it
		for (i = 0; i < usedRealRegs; i++) {
			
			uint_fast8_t rReg = indirectPoppingMap[i][0], vReg = indirectPoppingMap[i][1];
			
			if (vReg == EMIT_REG_NO_PC) {
				
				//this must be the last reg of the last batch, luckily the code above assures this
				if (rReg != REG_NO_DST_EPILOGUE_REG)
					fatal("unexpected PC reg\n");
				
				//bl rtd->epilogueInterworking
				EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
			}
			else {
				
				now = jitPrvStoreVreg(dest, vReg, rReg, true);	//ok to wbak because if SP, we decreased it
				if (now != EmitErrNone)
					return now;
			}
		}
	}
	
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Common emitter for all ARM load/store multiple forms.
 *
 *	dest       - buffer the code is emitted into
 *	cc         - condition the ARM instruction executes under
 *	instrAddr  - address of the ARM instruction (handed to jitPrvLoadVreg)
 *	rnNo       - base vreg (PC is refused)
 *	regsMask   - bitmask of vregs to transfer (must not be empty)
 *	wbak       - true if the base is written back
 *	add        - true for the increment forms (IA/IB), false for decrement (DA/DB)
 *	before     - true for the "before" forms (IB/DB), false for "after" (IA/DA)
 *	load       - true for LDM, false for STM
 *
 * Strategy: decrementing forms are converted into incrementing ones by first
 * computing the lowest address (doing the writeback up front when requested).
 * The transfer is then done either with load/store-multiple in groups of up to
 * three temporaries, or (IB with fewer than three regs) one register at a time
 * with immediate offsets. A "PUSH" (STMDB SP! without SP/PC in the list) has a
 * dedicated path that uses four temporaries.
 * A loaded SP is parked in REG_NO_TMP_HI and a loaded PC is kept in its
 * temporary until everything else is done.
 *
 * Returns EmitErrNone on success, EmitErrNotEncodeable for refused register
 * combinations, or the error reported by a helper.
 */
static enum EmitStatus jitPrvLdmStm(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, const uint32_t regsMask, bool wbak, bool add, bool before, bool load)
{
	const uint8_t dstsNormal[] = {REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_3, REG_NO_TMP_4}, *dsts = dstsNormal + 1;
	uint_fast8_t baseReg = REG_NO_TMP_1, maxRegsAtOnce = 3;
	int32_t pcReg = -1, popCnt = jitPrvPopcount16(regsMask);
	bool skipFirstLoad = false;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t i;
	
	//a sanity check
	if (!regsMask || rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	if (load){ 
		//a sanity check: loading the base while also writing it back is refused
		if ((regsMask & (1 << rnNo)) && wbak)
			return EmitErrNotEncodeable;
	}
	else {
		
		//a sanity check: storing the base with writeback is only accepted if it is the lowest reg in the list
		if ((regsMask & (1 << rnNo)) && wbak && (regsMask & ((1 << rnNo) - 1)))
			return EmitErrNotEncodeable;
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//some cases with SP base allow better encoding
	if (!load && !add && before && wbak && rnNo == EMIT_REG_NO_SP && !(regsMask & ((1 << EMIT_REG_NO_SP) | (1 << EMIT_REG_NO_PC)))) {			//PUSH without SP or PC gets a special path
		
		uint_fast16_t regsLeft = regsMask;
		
		//the base is the real SP here, so Rtmp1 is free to carry data too
		maxRegsAtOnce++;
		dsts--;
		
		//this can be improved for cases like "push {r6, r7}", but they are uncommon
		while (regsLeft) {
			
			int8_t curRegIdx = maxRegsAtOnce - 1;
			uint_fast16_t realMask = 0;
			
			//highest vregs first, into the highest temporaries, so each group is pushed in the right order
			while (regsLeft) {
				
				uint_fast8_t regNo = 31 - __builtin_clz(regsLeft);
				
				if (curRegIdx < 0)
					break;
				
				regsLeft -= 1 << regNo;
				now = jitPrvLoadVregNotPc(dest, dsts[curRegIdx], regNo);
				if (now != EmitErrNone)
					return now;
				realMask |= 1 << dsts[curRegIdx];
				curRegIdx--;
			}
			
			EMIT(LLstmdb, EMIT_REG_NO_SP, realMask, true);
		}
		
		//close the conditional section opened above before leaving
		return jitPrvHandleCcEnd(dest, &ccSkip, cc);
	}
	else {
		
		//get Rn reg into Rtmp1
		now = jitPrvLoadVregNotPc(dest, baseReg, rnNo);
		if (now != EmitErrNone)
			return now;
		
		if (!add) {
			
			if (!wbak) {
			
				//get new base !! for IA mode !!
				EMIT(LLsubImm, baseReg, baseReg, sizeof(uint32_t) * (popCnt - (before ? 0 : 1)), EmitFlagsDoNotCare, false);
				
				before = false;
			}
			else {
				if (!load && (regsMask & ((2U << rnNo) - 1)) == (1U << rnNo)){	//base reg is set, lowest in set, wbak, store. need to store original value of base reg
				
					//move base reg to dsts[0]
					EMIT(LLmov, dsts[0], REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					
					//remember it
					skipFirstLoad = true;
				}
				
				//get value for wbak
				EMIT(LLsubImm, baseReg, baseReg, sizeof(uint32_t) * popCnt, EmitFlagsDoNotCare, false);
				
				//do wbak
				now = jitPrvStoreVreg(dest, rnNo, baseReg, true);	//ok to wbak because if SP, we decreased it
				if (now != EmitErrNone)
					return now;
				
				//DB becomes IA and DA becomes IB from the new (lower) base; wbak is already done
				before = !before;
				wbak = false;
			}
			
			add = true;
		}
	}
	
	if (!before || popCnt >= 3) {		//three or four at a time. IA or (IB and 3+ regs)
		
		uint32_t regsLeft = regsMask;
		
		//IB: bump the base once so that IA-style transfers can be used
		if (before)
			EMIT(LLaddImm, baseReg, baseReg, sizeof(uint32_t), EmitFlagsDoNotCare, false);
			
		while (regsLeft) {
			uint32_t i, nowMask = 0, nregs;
			uint8_t reg[4];
			
			for (i = 0; i < maxRegsAtOnce && regsLeft; i++) {
				
				reg[i] = __builtin_ctz(regsLeft);
				regsLeft &= regsLeft - 1;
				nowMask |= 1 << dsts[i];
				
				//for stores fetch the value to store, except for the very first reg when
				//	dsts[0] already holds the original base value
				if (!load && !skipFirstLoad) {
					
					now = jitPrvLoadVreg(dest, dsts[i], reg[i], instrAddr);
					if (now != EmitErrNone)
						return now;
				}
				skipFirstLoad = false;
			}
			
			if (load)
				EMIT(LLldmia, baseReg, nowMask, true);
			else
				EMIT(LLstmia, baseReg, nowMask, true);
			
			if (load) {
				
				for (nregs = i, i = 0; i < nregs; i++) {
					
					//store if not pc or sp. for pc we'll keep the value in its current reg till later (we know it is the last reg to be loaded so this is ok),
					//	for sp, in REG_NO_TMP_HI
					if (reg[i] == EMIT_REG_NO_PC)
						pcReg = dsts[i];
					else if (reg[i] == EMIT_REG_NO_SP)
						EMIT(LLmov, REG_NO_TMP_HI, dsts[i], EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					else {
						now = jitPrvStoreVreg(dest, reg[i], dsts[i], false);
						if (now != EmitErrNone)
							return now;
					}
				}
			}
		}
		
		//wbak if needed
		if (wbak) {
			
			//we over incremented (by the initial IB bump) - undo
			if (before)
				EMIT(LLsubImm, baseReg, baseReg, sizeof(uint32_t), EmitFlagsDoNotCare, false);
			
			now = jitPrvStoreVreg(dest, rnNo, baseReg, true);
			if (now != EmitErrNone)
				return now;
		}
	}
	else {	//IB mode only
		
		uint32_t ofst = 1;
		
		for (i = 0; i < 16; i++) {
			
			if (!(regsMask & (1 << i)))
				continue;
			
			if (load) {
				
				//load reg into Rtmp3
				EMIT(LLloadImm, REG_NO_TMP_3, baseReg, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
				
				//store if not pc. for pc we'll keep the value in Rtmp3 till later
				if (i == EMIT_REG_NO_PC)
					pcReg = REG_NO_TMP_3;
				else if (i == EMIT_REG_NO_SP)
					EMIT(LLmov, REG_NO_TMP_HI, REG_NO_TMP_3, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
				else {
					now = jitPrvStoreVreg(dest, i, REG_NO_TMP_3, false);
					if (now != EmitErrNone)
						return now;
				}
			}
			else {
				
				uint_fast8_t srcReg = REG_NO_TMP_3;
				
				if (skipFirstLoad) {
					
					//first (lowest) reg is the base: its original value was parked in dsts[0] before the wbak
					srcReg = dsts[0];
					skipFirstLoad = false;
				}
				else {
					
					now = jitPrvLoadVreg(dest, REG_NO_TMP_3, i, instrAddr);
					if (now != EmitErrNone)
						return now;
				}
				
				EMIT(LLstoreImm, srcReg, baseReg, sizeof(uint32_t) * ofst, EmitSzWord, EmitAdrModeIndex);
			}
			
			ofst++;
		}
		
		if (wbak) {
			
			EMIT(LLaddImm, baseReg, baseReg, sizeof(uint32_t) * (ofst - 1), EmitFlagsDoNotCare, false);
			
			now = jitPrvStoreVreg(dest, rnNo, baseReg, true);
			if (now != EmitErrNone)
				return now;
		}
	}
	
	//if we loaded sp, set sp
	if (load && (regsMask & (1 << EMIT_REG_NO_SP)))
		EMIT(LLmov, EMIT_REG_NO_SP, REG_NO_TMP_HI, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	
	//if we loaded pc, branch
	if (pcReg >= 0) {
	
		struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	
		if (REG_NO_DST_EPILOGUE_REG != pcReg) {
			
			//mov RepilogueDst, pcReg
			EMIT(LLmov, REG_NO_DST_EPILOGUE_REG, pcReg, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		}
		
		//bl rtd->epilogueInterworking
		EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//STMIA: store multiple, incrementing, base used before each increment
enum EmitStatus jitEmitStmia(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = true, adjustFirst = false, isLoad = false;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//STMIB: store multiple, incrementing, base adjusted before each access
enum EmitStatus jitEmitStmib(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = true, adjustFirst = true, isLoad = false;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//STMDA: store multiple, decrementing, base used before each decrement
enum EmitStatus jitEmitStmda(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = false, adjustFirst = false, isLoad = false;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//STMDB: store multiple, decrementing, base adjusted before each access
enum EmitStatus jitEmitStmdb(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = false, adjustFirst = true, isLoad = false;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//LDMIA: load multiple, incrementing, base used before each increment. With SP as base and writeback it is a POP and goes to the dedicated emitter
enum EmitStatus jitEmitLdmia(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool isPop = wbak && rnNo == EMIT_REG_NO_SP;

	if (!isPop)
		return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, true, false, true);

	return jitPrvPop(dest, cc, regsMask);
}

//LDMIB: load multiple, incrementing, base adjusted before each access
enum EmitStatus jitEmitLdmib(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = true, adjustFirst = true, isLoad = true;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//LDMDA: load multiple, decrementing, base used before each decrement
enum EmitStatus jitEmitLdmda(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = false, adjustFirst = false, isLoad = true;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

//LDMDB: load multiple, decrementing, base adjusted before each access
enum EmitStatus jitEmitLdmdb(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const bool increment = false, adjustFirst = true, isLoad = true;

	return jitPrvLdmStm(dest, cc, instrAddr, rnNo, regsMask, wbak, increment, adjustFirst, isLoad);
}

enum EmitStatus jitEmitSwap(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitMemOpSz size)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//none of the regs is allowed to be PC
	if (((1 << rdNo) | (1 << rnNo) | (1 << rmNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	else if (rnNo == rmNo || rnNo == rdNo)	//Rn must be distinct from Rm and Rd
		return EmitErrNotEncodeable;
	else if (size == EmitSzHalfword)				//there is no SWPH
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get Rn reg into Rtmp1
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	//get Rm into Rtmp1
	if (rmNo == rnNo) {	//if same reg, use the fact we have it already
		
		//mov Rtmp2, Rtmp1
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {
		
		now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rmNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	
	//disabled for only a little, so this is ok even on hardware where ints must stay on
	
	//CPSID i		//ints off
	EMIT(LLcps, true, false, false);

	//ldr[b] Rtmp3, [Rn]
	EMIT(LLloadImm, REG_NO_TMP_3, REG_NO_TMP_1, 0, size, false, EmitAdrModeIndex);
	
	//str[b] Rtmp2, [Rn]
	EMIT(LLstoreImm, REG_NO_TMP_2, REG_NO_TMP_1, 0, size, EmitAdrModeIndex);
	
	//CPSIE i		//ints off
	EMIT(LLcps, true, false, true);
	
	//write Rd from Rtmp3
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for ARM BX Rm (withLink == false) or BLX Rm (withLink == true).
 *
 *	dest       - buffer the code is emitted into
 *	cc         - condition the ARM instruction executes under
 *	instrAddr  - address of the ARM instruction; the link value is instrAddr + 4
 *	rmNo       - vreg holding the branch target
 *	withLink   - true to also set the LR vreg to the return address
 *
 * The target is read into REG_NO_DST_EPILOGUE_REG BEFORE the LR vreg is
 * overwritten, so that "BLX LR" branches to the old LR value rather than to
 * the freshly written return address.
 *
 * Returns EmitErrNone on success or the error reported by a helper.
 */
static enum EmitStatus jitPrvBxBlxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo, bool withLink)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//get the target first: Rm may be LR, which the link step below replaces
	now = jitPrvLoadVreg(dest, REG_NO_DST_EPILOGUE_REG, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (withLink) {
		
		//NOTE(review): relies on the two helpers below not clobbering REG_NO_DST_EPILOGUE_REG - confirm
		now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, instrAddr + 4, true, true);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvStoreVreg(dest, EMIT_REG_NO_LR, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	//bl rtd->epilogueInterworking
	EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for ARM BX Rm. "BX LR" is the common case and is emitted as a
 * single call to rtd->epilogueBxLr; every other register goes through the
 * generic BX/BLX emitter.
 */
enum EmitStatus jitEmitBxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	struct JitBackendRuntimeData *rtd;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//generic path for anything that is not "bx lr"
	if (rmNo != EMIT_REG_NO_LR)
		return jitPrvBxBlxReg(dest, cc, instrAddr, rmNo, false);

	rtd = jitPrvGetRuntimeDataPtr();

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//bl rtd->epilogueBxLr
	EMIT(LLbl, (uintptr_t)rtd->epilogueBxLr);		//only second word is used

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

//Emit code for ARM BLX Rm: the generic register-branch emitter with linking enabled
enum EmitStatus jitEmitBlxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	const bool withLink = true;

	return jitPrvBxBlxReg(dest, cc, instrAddr, rmNo, withLink);
}

enum EmitStatus jitEmitBlToArm(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t dstAddr)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//set LR
	now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, instrAddr + 4, true, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, EMIT_REG_NO_LR, REG_NO_TMP_1, true);
	if (now != EmitErrNone)
		return now;
	
	//jump
	now = jitEmitJumpToArm(dest, EmitCcAl, dstAddr, NULL);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitMsrReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//load desired reg directly into Rsr
	now = jitPrvLoadVreg(dest, REG_NO_SR, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitMsrImm(struct EmitBuf *dest, enum EmitCc cc, uint32_t val)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//mask off the bits we should never set from user mode
	val &= 0xf8000000;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//set it
	now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_SR, val, true, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for ARM MRS Rd, CPSR.
 * Rsr is first groomed in place: an LSR #27 / LSL #27 pair keeps only its top
 * five bits (down to and including Q), then ARM_V5_SR_USER_MODE_BITS is added so
 * the value reads as ARM state at user privilege. The groomed Rsr is then stored
 * to Rd. Note that Rsr itself keeps the groomed value.
 */
enum EmitStatus jitEmitMrsReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//lazy grooming of Rsr happens right here: clear every bit below Q ...
	EMIT(LLmov, REG_NO_SR, REG_NO_SR, EmitShiftLsr, 27, EmitFlagsDoNotCare, false);
	EMIT(LLmov, REG_NO_SR, REG_NO_SR, EmitShiftLsl, 27, EmitFlagsDoNotCare, false);

	//... and claim ARM mode, user privilege level
	EMIT(LLaddImm, REG_NO_SR, REG_NO_SR, ARM_V5_SR_USER_MODE_BITS, EmitFlagsDoNotCare, false);

	//Rd <- Rsr
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_SR, true);
	if (now != EmitErrNone)
		return now;

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

//shifter_op is to end up in REG_NO_TMP_1

//True for the data-processing ops listed here (MOV, MVN, AND, EOR, ORR, BIC, TST, TEQ): they need the real carry-out of the shifter
static bool jitPrvDpInstrNeedsRealShifterCarryOut(enum JitArmDpOp op)
{
	switch (op) {
		case ArmDpOpMov:
		case ArmDpOpMvn:
		case ArmDpOpAnd:
		case ArmDpOpEor:
		case ArmDpOpOrr:
		case ArmDpOpBic:
		case ArmDpOpTst:
		case ArmDpOpTeq:
			return true;
		
		default:
			return false;
	}
}

//False for the compare/test ops (TST, TEQ, CMP, CMN), which write no destination register; true for everything else
static bool jitPrvDpInstrHasRd(enum JitArmDpOp op)
{
	switch (op) {
		case ArmDpOpTst:
		case ArmDpOpTeq:
		case ArmDpOpCmp:
		case ArmDpOpCmn:
			return false;
		
		default:
			return true;
	}
}

//false only for MOV and MVN, which take no first operand
static bool jitPrvDpInstrHasRn(enum JitArmDpOp op)
{
	const uint16_t opsWithoutRn = (1UL << ArmDpOpMov) | (1UL << ArmDpOpMvn);

	return ((opsWithoutRn >> op) & 1) == 0;
}

//true for ADC, SBC and RSC: they ignore the shifter's carry out but DO need the incoming carry intact
static bool jitPrvDpInstrConsumesCarry(enum JitArmDpOp op)
{
	const uint16_t opsReadingCarry = (1UL << ArmDpOpAdc) | (1UL << ArmDpOpSbc) | (1UL << ArmDpOpRsc);

	return ((opsReadingCarry >> op) & 1) != 0;
}

static enum EmitStatus jitPrvAluOp(struct EmitBuf *dest, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, bool s)
{
	uint32_t dstRegNo = REG_NO_TMP_2;
	enum EmitStatus now;
	
	if ((rdNo == EMIT_REG_NO_PC) && s)
		return EmitErrNotEncodeable;
	
	//if we need carry in from flags, do so now
	//all of the instrs that need it will ONLY read carry, so instead of MSR, we can shift
	if (jitPrvDpInstrConsumesCarry(op)) {
		
		//shift our SR's "C" into real SR's "C"
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_C), EmitSetFlags, false);
	}
	
	//if we need Rn, get it into REG_NO_TMP_2
	if (jitPrvDpInstrHasRn(op)) {
	
		now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rnNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}

	switch (op) {
		case ArmDpOpAnd:
		case ArmDpOpTst:
			EMIT(LLandReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpEor:
		case ArmDpOpTeq:
			EMIT(LLeorReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpSub:
		case ArmDpOpCmp:
			EMIT(LLsubReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpRsb:
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			dstRegNo = REG_NO_TMP_1;
			break;
		
		case ArmDpOpAdd:
		case ArmDpOpCmn:
			EMIT(LLaddReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpAdc:
			EMIT(LLadcReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpSbc:
			EMIT(LLsbcReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpRsc:
			EMIT(LLsbcReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			dstRegNo = REG_NO_TMP_1;
			break;
		
		case ArmDpOpOrr:
			EMIT(LLorrReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpMov:
			//prep work already did all we need for this
			dstRegNo = REG_NO_TMP_1;
			break;
		
		case ArmDpOpBic:
			EMIT(LLbicReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
		
		case ArmDpOpMvn:
			EMIT(LLmvnReg, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
			break;
	}
	
	//do we need to output flags?
	if (s) {
		
		//MRS Rsr, CPSR
		EMIT(LLmrs, REG_NO_SR, EMIT_SYSM_APSR);
	}
	
	//produce output
	if (jitPrvDpInstrHasRd(op)) {
		if (rdNo != EMIT_REG_NO_PC) {
			now = jitPrvStoreVreg(dest, rdNo, dstRegNo, true);
			if (now != EmitErrNone)
				return now;
		}
		else {
		
			struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
		
			if (REG_NO_DST_EPILOGUE_REG != dstRegNo) {
				
				//mov RepilogueDst, dstRegNo
				EMIT(LLmov, REG_NO_DST_EPILOGUE_REG, dstRegNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			}
			
			//bl rtd->epilogueNoninterworking
			EMIT(LLbl, (uintptr_t)rtd->epilogueNoninterworking);
		}
	}
	
	return EmitErrNone;
}

/*
 * Constant-fold "pcVal <op> val" at translation time.
 * Supported ops: AND, EOR, SUB, RSB, ADD, ORR, BIC. For those the result is
 * written to *valOut (when valOut is not NULL) and true is returned.
 * Any other op returns false and leaves *valOut alone.
 */
static bool jitPrvAluOpImmEvalPc(uint32_t pcVal, uint32_t val, enum JitArmDpOp op, uint32_t *valOut)
{
	uint32_t folded;

	if (op == ArmDpOpAnd)
		folded = pcVal & val;
	else if (op == ArmDpOpEor)
		folded = pcVal ^ val;
	else if (op == ArmDpOpSub)
		folded = pcVal - val;
	else if (op == ArmDpOpRsb)
		folded = val - pcVal;
	else if (op == ArmDpOpAdd)
		folded = pcVal + val;
	else if (op == ArmDpOpOrr)
		folded = pcVal | val;
	else if (op == ArmDpOpBic)
		folded = pcVal &~ val;
	else
		return false;

	if (valOut)
		*valOut = folded;

	return true;
}
/*
 * Emit code for an ARM data-processing instruction whose shifter operand is an
 * immediate ("imm" rotated right by "rotBy").
 *
 *  dest      - buffer to emit into
 *  cc        - ARM condition of the instruction
 *  op        - data-processing opcode
 *  instrAddr - address of the ARM instruction (PC reads as instrAddr + 8)
 *  rdNo/rnNo - destination / first operand virtual register numbers
 *  s         - true if the instruction updates the flags
 *
 * A number of cheaper special cases are tried in order before falling back to
 * "materialize the immediate in REG_NO_TMP_1 and call jitPrvAluOp()":
 *   - non-flag-setting EOR/SUB/ADD/ORR/BIC with a zero "imm" is re-emitted as a MOV
 *   - non-flag-setting ops on PC that can be folded at translation time
 *   - non-flag-setting AND with 0xff (emitted as an extend)
 *   - small word-aligned ADD/SUB on SP
 *   - ADD/SUB/CMP/CMN with an immediate that fits 8 bits
 *   - non-flag-setting AND with one contiguous run of ones (emitted as shifts)
 *
 * Returns EmitErrNone or the first error reported by a helper.
 */
enum EmitStatus jitEmitAluOpImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t imm, uint32_t rotBy, bool s)
{
	uint32_t eImm = jitPrvRor(imm, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//zero imm is a mov. let jitEmitAluOpRegShiftImm() care about conditionalness 
	if (!s && !imm && (op == ArmDpOpEor || op == ArmDpOpSub || op == ArmDpOpAdd || op == ArmDpOpOrr || op == ArmDpOpBic)) {
		
		return jitEmitAluOpRegShiftImm(dest, cc, ArmDpOpMov, instrAddr, rdNo, 0, rnNo, EmitShiftLsl, 0, s);
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//pc with imm might be evaluatable immediately
	if (!s && rnNo == EMIT_REG_NO_PC && jitPrvAluOpImmEvalPc(instrAddr + 8, eImm, op, &eImm)) {
		
		if (rdNo != EMIT_REG_NO_PC) {
			
			now = jitEmitLoadImmToReg(dest, rdNo, eImm, true, true, false);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
			
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_DST_EPILOGUE_REG, eImm, true, true);
			if (now != EmitErrNone)
				return now;
				
			//bl rtd->epilogueNoninterworking
			EMIT(LLbl, (uintptr_t)rtd->epilogueNoninterworking);
		}
	}
	//and with 0xff is an extend
	else if (!s && op == ArmDpOpAnd && eImm == 0xff) {
		
		now = jitEmitExtend(dest, rdNo, rnNo, 0, true, true);
		if (now != EmitErrNone)
			return now;
	}
	//some SP ops can be handled better: SP = SP +/- small word-aligned imm
	else if (rnNo == EMIT_REG_NO_SP && !s && !(eImm & 3) && rdNo == EMIT_REG_NO_SP && (op == ArmDpOpSub || op == ArmDpOpAdd) && eImm < 512) {
			
		if (op == ArmDpOpAdd)
			EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, eImm, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, eImm, EmitFlagsDoNotCare, false);
	}
	//some SP ops can be handled better: Rd = SP + small word-aligned imm
	else if (rnNo == EMIT_REG_NO_SP && !s && !(eImm & 3) && rdNo != EMIT_REG_NO_PC /* just to avoid dealing with that */ && op == ArmDpOpAdd && eImm < 1024) {
			
		EMIT(LLaddImm, REG_NO_TMP_1, EMIT_REG_NO_SP, eImm, EmitFlagsDoNotCare, false);
		
		now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	//things we have imm8 ops for we can handle specially and more efficiently here. these all do not need shifter carry out
	else if (!(eImm >> 8) && (op == ArmDpOpSub || op == ArmDpOpCmp || op == ArmDpOpAdd || op == ArmDpOpCmn)) {
		
		//if we need Rn, get it into REG_NO_TMP_2
		if (jitPrvDpInstrHasRn(op)) {
		
			now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rnNo, instrAddr);
			if (now != EmitErrNone)
				return now;
		}

		if (op == ArmDpOpAdd || op == ArmDpOpCmn)
			EMIT(LLaddImm, REG_NO_TMP_2, REG_NO_TMP_2, eImm, s ? EmitSetFlags : EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubImm, REG_NO_TMP_2, REG_NO_TMP_2, eImm, s ? EmitSetFlags : EmitFlagsDoNotCare, false);

		//do we need to output flags?
		if (s) {
			
			//MRS Rsr, CPSR
			EMIT(LLmrs, REG_NO_SR, EMIT_SYSM_APSR);
		}
		
		//produce output, if needed
		if (jitPrvDpInstrHasRd(op)) {
			if (rdNo != EMIT_REG_NO_PC) {
				now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_2, true);
				if (now != EmitErrNone)
					return now;
			}
			else {
			
				struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
			
				if (REG_NO_DST_EPILOGUE_REG != REG_NO_TMP_2) {
					
					//mov RepilogueDst, Rtmp2
					EMIT(LLmov, REG_NO_DST_EPILOGUE_REG, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
				}
				
				//bl rtd->epilogueNoninterworking
				EMIT(LLbl, (uintptr_t)rtd->epilogueNoninterworking);
			}
		}
	}
	//we may be obligated to belt out the moves and maybe even the RORs
	else {
		
		//eImm with its trailing zeroes shifted out. __builtin_ctz(0) is undefined (as is the
		//shift that would follow), and a zero immediate does get here (e.g. MOV Rd, #0), so guard it
		uint32_t eImmNormalized = eImm ? (eImm >> __builtin_ctz(eImm)) : 0;
		
		//non-flag-setting AND with a mask that looks like 00..00001111..11110...00 for some number of 1s and zeroes
		//(eImm must be nonzero here: the __builtin_clz/__builtin_ctz calls below are undefined for zero)
		if (!s && op == ArmDpOpAnd && eImm && !((eImmNormalized + 1) & eImmNormalized) && (eImm + 1) && rdNo != EMIT_REG_NO_PC && rnNo != EMIT_REG_NO_PC) {
			
			uint_fast8_t zeroesOnLeft = __builtin_clz(eImm), zeroesOnRight = __builtin_ctz(eImm);
			uint_fast8_t srcReg = (rnNo < 8 && jitPrvIsVregArealReg(rnNo)) ? rnNo : REG_NO_TMP_1;
			uint_fast8_t dstReg = (rdNo < 8 && jitPrvIsVregArealReg(rdNo)) ? rdNo : REG_NO_TMP_1;
			
			//this is all only worth it if there are zeroes only one end (compiles to 2 instrs) or the imm is wider than 8 bits
			
			if (zeroesOnLeft + zeroesOnRight < 24 || !zeroesOnRight || !zeroesOnLeft) {
				
				if (srcReg == REG_NO_TMP_1) {
				
					now = jitPrvLoadVreg(dest, srcReg, rnNo, instrAddr);
					if (now != EmitErrNone)
						return now;
				}
				
				if (zeroesOnLeft && zeroesOnRight) {
					
					//shift the unwanted top bits out, then the unwanted bottom bits, then put what is left back in place
					EMIT(LLmov, dstReg, srcReg, EmitShiftLsl, zeroesOnLeft, EmitFlagsDoNotCare, false);
					EMIT(LLmov, dstReg, dstReg, EmitShiftLsr, zeroesOnRight + zeroesOnLeft, EmitFlagsDoNotCare, false);
					EMIT(LLmov, dstReg, dstReg, EmitShiftLsl, zeroesOnRight, EmitFlagsDoNotCare, false);
				}
				else if (zeroesOnLeft) {
					
					EMIT(LLmov, dstReg, srcReg, EmitShiftLsl, zeroesOnLeft, EmitFlagsDoNotCare, false);
					EMIT(LLmov, dstReg, dstReg, EmitShiftLsr, zeroesOnLeft, EmitFlagsDoNotCare, false);
				}
				else if (zeroesOnRight) {
					
					EMIT(LLmov, dstReg, srcReg, EmitShiftLsr, zeroesOnRight, EmitFlagsDoNotCare, false);
					EMIT(LLmov, dstReg, dstReg, EmitShiftLsl, zeroesOnRight, EmitFlagsDoNotCare, false);
				}
						
				if (dstReg == REG_NO_TMP_1) {
					
					now = jitPrvStoreVreg(dest, rdNo, dstReg, true);
					if (now != EmitErrNone)
						return now;
				}
				goto handled;
			}
		}

		if (s && jitPrvDpInstrNeedsRealShifterCarryOut(op)) {
		
			//MSR APSR_nzcvq, Rsr
			EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, REG_NO_SR);
			
			//form imm
			EMIT(LLmovImm, REG_NO_TMP_1, imm, 0, EmitFlagsDoNotCare, false);
			
			if (rotBy) {
				//get imm into reg (ror is only by reg)
				EMIT(LLmovImm, REG_NO_TMP_3, rotBy, 0, EmitFlagsDoNotCare, false);
				
				//do the ror ("s" is known to be set in this branch, so this always sets flags)
				EMIT(LLshiftByReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_3, EmitShiftRor, EmitSetFlags, false);
			}
		}
		else {
			
			now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, eImm, true, true);
			if (now != EmitErrNone)
				return now;
		}
	
		now = jitPrvAluOp(dest, op, instrAddr, rdNo, rnNo, s);
		if (now != EmitErrNone)
			return now;
	}

handled:
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit code for an ARM data-processing instruction whose shifter operand is
 * register Rm shifted by a constant amount.
 *
 *  dest               - buffer to emit into
 *  cc                 - ARM condition of the instruction
 *  op                 - data-processing opcode
 *  instrAddr          - address of the ARM instruction
 *  rdNo/rnNo/rmNo     - destination / first operand / shifted operand vregs
 *  shiftType/shiftAmt - the constant shift applied to Rm
 *  s                  - true if the instruction updates the flags
 *
 * Unshifted (LSL #0) forms get special treatment: non-flag-setting "MOV PC, PC"
 * becomes a jump to instrAddr + 8, other non-flag-setting moves to PC go through
 * the noninterworking epilogue, and PC-free MOV / CMP / two-operand ADD try to
 * operate directly on real registers. Everything else loads Rm into
 * REG_NO_TMP_1, shifts it, and lets jitPrvAluOp() do the rest.
 *
 * Returns EmitErrNone or the first error reported by a helper.
 */
enum EmitStatus jitEmitAluOpRegShiftImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	
	if (op == ArmDpOpMov && shiftType == EmitShiftLsl && !shiftAmt && rdNo == EMIT_REG_NO_PC && rmNo == EMIT_REG_NO_PC && !s) {	//"MOV PC, PC" is weird, but valid, skips next inst
				
		return jitEmitJumpToArm(dest, cc, instrAddr + 8, NULL);
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//some special cases that we have special handling for
	if (shiftType == EmitShiftLsl && !shiftAmt) {
		
		uint32_t regs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo);	//every register the instruction names (taken before rdNo may get repurposed below)
		int_fast8_t rhs = -1;										//the "other" operand if this is effectively a two-operand op, else negative
		
		if (op == ArmDpOpMov && rdNo == EMIT_REG_NO_PC && !s) {	//move to PC (likely from LR)
			
			struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
			
			now = jitPrvLoadVreg(dest, REG_NO_DST_EPILOGUE_REG, rmNo, instrAddr);
			if (now != EmitErrNone)
				return now;
			
			//bl rtd->epilogueNoninterworking
			EMIT(LLbl, (uintptr_t)rtd->epilogueNoninterworking);
			goto handled;
		}
		
		if (op == ArmDpOpCmp) {
			
			//CMP has no destination: treat Rn as the left-hand side from here on.
			//note that rdNo stays overwritten even if we end up on the generic path below
			rdNo = rnNo;
			rhs = rmNo;
		}
		else if (op == ArmDpOpMov) {
			
			rhs = rmNo;
		}
		else if (rnNo == rdNo)
			rhs = rmNo;
		else if (rmNo == rdNo)
			rhs = rnNo;
		
		if (rhs >= 0 && !(regs & (1 << EMIT_REG_NO_PC))) {	//two-op ALU operation with no shift and no PC involvement
			
			if (op == ArmDpOpMov && !s) {	//use MOV (3) 
				
				//there are 9 cases
				//regTO	regFROM:	r0-r3,r12,lr	r6-r7 	r8-r11,sp
				//r0-r3,r12,lr		LDR+STR			STR		MOV+STR					groupA = !jitPrvIsVregArealReg()
				//r6-r7				LDR				MOV		MOV						groupB = jitPrvIsVregArealReg() && r < 8
				//r8-r11,sp			LDR+MOV			MOV		MOV						groupC = jitPrvIsVregArealReg() && r >= 8
				
				if (jitPrvIsVregArealReg(rdNo) && jitPrvIsVregArealReg(rmNo)) {
					
					EMIT(LLmov, rdNo, rmNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					goto handled;
				}
				
				if (jitPrvIsVregArealReg(rdNo) && rdNo < 8) {
					
					//destination is a real low register: load straight into it
					now = jitPrvLoadVreg(dest, rdNo, rmNo, instrAddr);
					if (now != EmitErrNone)
						return now;
					goto handled;
				}
				
				if (jitPrvIsVregArealReg(rmNo) && rmNo < 8) {
					
					//source is a real low register: store straight from it
					now = jitPrvStoreVreg(dest, rdNo, rmNo, true);
					if (now != EmitErrNone)
						return now;
					goto handled;
				}
				
				//everything else bounces through Rtmp1
				now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rmNo, instrAddr);
				if (now != EmitErrNone)
					return now;
				
				now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_1, true);
				if (now != EmitErrNone)
					return now;
				goto handled;
				
			}
			else if (op == ArmDpOpCmp || (op == ArmDpOpAdd && !s)) {					//can use ADD(4) or CMP(3) maybe
				
				bool dIsReal = jitPrvIsVregArealReg(rdNo);
				bool nIsReal = jitPrvIsVregArealReg(rhs);
				
				if (dIsReal && nIsReal) {
					
					if (op == ArmDpOpCmp) {
						
						EMIT(LLcmpReg, rdNo, rhs, EmitShiftLsl, 0);
						EMIT(LLmrs, REG_NO_SR, EMIT_SYSM_APSR);
					}
					else {
						
						EMIT(LLaddReg, rdNo, rdNo, rhs, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					}
					goto handled;
				}
				if (dIsReal) {
					
					now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rhs, instrAddr);
					if (now != EmitErrNone)
						return now;
					
					if (op == ArmDpOpCmp) {
						
						EMIT(LLcmpReg, rdNo, REG_NO_TMP_1, EmitShiftLsl, 0);
						EMIT(LLmrs, REG_NO_SR, EMIT_SYSM_APSR);
					}
					else {
						
						EMIT(LLaddReg, rdNo, rdNo, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					}
					goto handled;
				}
				if (nIsReal) {
					
					now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rdNo, instrAddr);
					if (now != EmitErrNone)
						return now;
					
					if (op == ArmDpOpCmp) {
						
						EMIT(LLcmpReg, REG_NO_TMP_1, rhs, EmitShiftLsl, 0);
						EMIT(LLmrs, REG_NO_SR, EMIT_SYSM_APSR);
					}
					else {
						
						EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, rhs, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
					
						now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_1, true);
						if (now != EmitErrNone)
							return now;
					}
					goto handled;
				}
				
				//neither operand is a real register: fall through to the generic path
			}
		}
	}
	
	//generic path: Rtmp1 = Rm, shifted as requested, then the common ALU tail
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;

	if (s && jitPrvDpInstrNeedsRealShifterCarryOut(op)) {
		
		//MSR APSR_nzcvq, Rsr
		EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, REG_NO_SR);
	}
	
	now = jitPrvShiftReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, shiftType, shiftAmt, s ? EmitSetFlags : EmitFlagsDoNotCare);
	if (now != EmitErrNone)
		return now;
	
	if (op == ArmDpOpMov && s && shiftType == EmitShiftLsl && shiftAmt == 0)	//we need a MOVS since the above did nothing
		EMIT(LLmov, REG_NO_TMP_1, REG_NO_TMP_1, EmitShiftLsl, 0, EmitSetFlags, false);
	
	now = jitPrvAluOp(dest, op, instrAddr, rdNo, rnNo, s);
	if (now != EmitErrNone)
		return now;

handled:
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitAluOpRegShiftReg(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (rsNo == rmNo) {
		
		//mov Rtmp2, Rtmp1
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	else {
		
		now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rsNo, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	
	if (s && jitPrvDpInstrNeedsRealShifterCarryOut(op)) {
		
		//MSR APSR_nzcvq, Rsr
		EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, REG_NO_SR);
	}
	
	EMIT(LLshiftByReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, shiftType, s ? EmitSetFlags : EmitFlagsDoNotCare, false);

	now = jitPrvAluOp(dest, op, instrAddr, rdNo, rnNo, s);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit the flag-setting preamble needed to test ARM condition "cc" against the
 * emulated flags kept in REG_NO_SR, and tell the caller which host branch
 * condition(s) to use afterwards.
 *
 *  dest          - buffer to emit into
 *  cc            - ARM condition to test (EmitCcAl and EmitCcNv are not expected here)
 *  jumpOnMatch   - true: the returned branch condition(s) fire when "cc" holds,
 *                  false: they fire when "cc" does NOT hold
 *  firstJumpCcP  - receives the condition for the first branch
 *  secondJumpCcP - receives the condition for a second branch to the same
 *                  place, or EmitCcNv if one branch is enough
 *
 * All paths except the MSR one write REG_NO_TMP_1. Only this preamble is
 * emitted here, the branches themselves are up to the caller.
 */
static enum EmitStatus jitPrvEmitConditionCheck(struct EmitBuf *dest, enum EmitCc cc, bool jumpOnMatch, enum EmitCc *restrict firstJumpCcP, enum EmitCc *restrict secondJumpCcP)
{
	*secondJumpCcP = EmitCcNv;
	*firstJumpCcP = EmitCcNv;
	
	//"not HI" and "LS" are the same test (C clear OR Z set), and can be had with one shift and two branches
	if ((cc == EmitCcHi && !jumpOnMatch) || (cc == EmitCcLs && jumpOnMatch)) {	//special faster case, when possible
		
		EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_Z), EmitSetFlags, false);		//arm.Z->sr.C, arm.C->sr.N
		*firstJumpCcP = EmitCcPl;
		*secondJumpCcP = EmitCcCs;
	}
	//single-flag conditions: a left shift by (1 + clz(flag bit)) pushes that flag out into the host carry
	else switch (cc) {
		case EmitCcEq:	// Z set
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_Z), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCs : EmitCcCc;
			break;
		
		case EmitCcNe:	// Z clear
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_Z), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCc : EmitCcCs;
			break;
		
		case EmitCcCs:	// C set
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_C), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCs : EmitCcCc;
			break;
		
		case EmitCcCc:	// C clear
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_C), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCc : EmitCcCs;
			break;
		
		case EmitCcMi:	// N set
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_N), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCs : EmitCcCc;
			break;
		
		case EmitCcPl:	// N clear
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_N), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCc : EmitCcCs;
			break;
		
		case EmitCcVs:	// V set
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_V), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCs : EmitCcCc;
			break;
		
		case EmitCcVc:	// V clear
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 1 + __builtin_clz(ARM_SR_BIT_V), EmitSetFlags, false);
			*firstJumpCcP = jumpOnMatch ? EmitCcCc : EmitCcCs;
			break;
		
		//GE and LT compare N with V: line V up with N, XOR, and test the sign of the result
		case EmitCcGe:
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, __builtin_clz(ARM_SR_BIT_V) - __builtin_clz(ARM_SR_BIT_N), EmitSetFlags, false);	//V in top bit of Rtmp1
			EMIT(LLeorReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 0, EmitSetFlags, false);										// now N is set if arm.N was not equal To arm.V
			*firstJumpCcP = jumpOnMatch ? EmitCcPl : EmitCcMi;
			break;
		
		case EmitCcLt:
			EMIT(LLmov, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, __builtin_clz(ARM_SR_BIT_V) - __builtin_clz(ARM_SR_BIT_N), EmitSetFlags, false);	//V in top bit of Rtmp1
			EMIT(LLeorReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_SR, EmitShiftLsl, 0, EmitSetFlags, false);										// now N is set if arm.N was not equal To arm.V
			*firstJumpCcP = jumpOnMatch ? EmitCcMi : EmitCcPl;
			break;
		
		//the rest get no trick: load the emulated flags into the real ones and branch on the (possibly inverted) condition itself
		case EmitCcHi:
		case EmitCcLs:
		case EmitCcGt:
		case EmitCcLe:
			//MSR APSR_nzcvq, Rsr
			EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, REG_NO_SR);
			*firstJumpCcP = jumpOnMatch ? cc : emitCcInvert(cc);
			break;
		
		//callers are expected to have dealt with "always" and "never" themselves
		case EmitCcAl:
		case EmitCcNv:
			__builtin_unreachable();
			return EmitErrNotEncodeable;
	}
	
	return EmitErrNone;
}

/*
 * Emit a branch to translated (Thumb) code at address "to", taken when ARM
 * condition "cc" holds on the emulated flags.
 *
 *  EmitCcNv - refused with EmitErrNotEncodeable
 *  EmitCcAl - a plain branch if it can be encoded, else a BL
 *  others   - first a condition check followed by direct conditional
 *             branch(es) is tried; if those cannot be encoded, the attempt is
 *             undone and replaced by an inverted check that branches over an
 *             unconditional branch (or BL) to the target
 *
 * Returns EmitErrNone or the first error that could not be worked around.
 */
static enum EmitStatus jitPrvConditionalBranchToThumb(struct EmitBuf *dest, uintptr_t to, enum EmitCc cc)
{
	enum EmitStatus now;
	void *bkp;
	
	//handle unconditional things first
	if (cc == EmitCcNv)
		return EmitErrNotEncodeable;
	else if (cc == EmitCcAl) {
		
		bkp = emitBufferBackup(dest);
		
		//success and any error other than "cannot encode" are both final
		now = emitLLbranch(dest, to, EmitCcAl);
		if (now != EmitErrNotEncodeable)
			return now;
		
		//no? then use a BL (and fail if no joy)
		emitBufferRestore(dest, bkp);
		EMIT(LLbl, to);
		
		return EmitErrNone;
	}
	else {		//all conditional branches handled here
		
		enum EmitCc jmpCC1, jmpCC2;
		struct EmitBuf space;
		
		//let's try a normal conditional branch
		bkp = emitBufferBackup(dest);
		now = jitPrvEmitConditionCheck(dest, cc, true, &jmpCC1, &jmpCC2);
		if (now != EmitErrNone)
			return now;
		
		now = emitLLbranch(dest, to, jmpCC1);
		if (now == EmitErrNone) {
			
			//some conditions need a second branch to the same target
			if (jmpCC2 != EmitCcNv)
				now = emitLLbranch(dest, to, jmpCC2);
			
			if (now == EmitErrNone)
				return EmitErrNone;
		}
		
		//all other cases require an inverted jump-over
		//(whatever the failed attempt emitted is thrown away first)
		emitBufferRestore(dest, bkp);
		now = jitPrvEmitConditionCheck(dest, cc, false, &jmpCC1, &jmpCC2);
		if (now != EmitErrNone)
			return now;
		
		//space for actual conditional branch(es)
		EMIT(SaveSpace, &space, jmpCC2 == EmitCcNv ? 1 : 2);
		
		//try a short branch, lacking that, emit a BL
		bkp = emitBufferBackup(dest);
		now = emitLLbranch(dest, to, EmitCcAl);
		if (now == EmitErrNotEncodeable) {
			emitBufferRestore(dest, bkp);
			now = emitLLbl(dest, to);
		}
		
		if (now != EmitErrNone)
			return now;
		
		//now complete the branch-over that we saved space for
		//(its target is the current end of the buffer, i.e. just past the jump emitted above)
		EMIT_TO(LLbranch, &space, emitGetPtrToJumpHere(dest), jmpCC1);
		if (jmpCC2 != EmitCcNv)
			EMIT_TO(LLbranch, &space, emitGetPtrToJumpHere(dest), jmpCC2);
		
		return EmitErrNone;
	}
}

enum EmitStatus jitEmitIntraTuBranch(struct EmitBuf *dest, uintptr_t to, enum EmitCc cc)
{
	return jitPrvConditionalBranchToThumb(dest, to, cc);
}

//conditional branch into another TU, landing NUM_HALFWORDS_PROLOGUE halfwords past the start of its code
enum EmitStatus jitEmitJumpToAnotherTu(struct EmitBuf *dest, const uint16_t* startOfCodeInOtherTu, enum EmitCc cc)
{
	const uint16_t *landingSpot = &startOfCodeInOtherTu[NUM_HALFWORDS_PROLOGUE];

	return jitPrvConditionalBranchToThumb(dest, (uintptr_t)landingSpot, cc);
}

/*
 * Report, via *nHalfwordsP, how many halfwords a simple short forward
 * conditional skip-over costs for condition "cc":
 *   1 for EmitCcAl, 2 for EQ/NE/CS/CC/MI/PL/VS/VC, 3 for HI/LS/GE/LT/GT/LE.
 * EmitCcNv yields EmitErrNotEncodeable and leaves *nHalfwordsP untouched.
 */
enum EmitStatus jitEmitNumHalfwordsNeededForConditionalSkipover(enum EmitCc cc, uint32_t *nHalfwordsP)		//for simple short forward jumps
{
	switch (cc) {
		case EmitCcNv:
			return EmitErrNotEncodeable;

		case EmitCcAl:
			*nHalfwordsP = 1;
			return EmitErrNone;

		case EmitCcEq:
		case EmitCcNe:
		case EmitCcCs:
		case EmitCcCc:
		case EmitCcMi:
		case EmitCcPl:
		case EmitCcVs:
		case EmitCcVc:
			*nHalfwordsP = 2;
			return EmitErrNone;

		case EmitCcHi:
		case EmitCcLs:
		case EmitCcGe:
		case EmitCcLt:
		case EmitCcGt:
		case EmitCcLe:
			*nHalfwordsP = 3;
			return EmitErrNone;

		default:
			__builtin_unreachable();
	}
}

//used by our pattern matcher and peephole reader
/*
 * Emit a call from translated code to the native function "func".
 *
 *  dest       - buffer to emit into
 *  func       - function to call; bit 0 is forced on (it is assumed to be Thumb code)
 *  inRegsMap  - NULL, or a list of (vreg, real register) pairs ended by a
 *               negative entry: each vreg is loaded into the paired real
 *               register before the call
 *  outRegsMap - NULL, or a list of (real register, vreg) pairs ended by a
 *               negative entry: each real register is stored to the paired
 *               vreg after the call
 *
 * The function address starts out in r3. If an input needs r3 too, the address
 * is moved to LR first and the call is made through LR instead.
 *
 * Returns EmitErrNone or the first error reported by a helper.
 */
enum EmitStatus jitPrvCalloutM0(struct EmitBuf *dest, const void *func, const int8_t *inRegsMap, const int8_t* outRegsMap)
{
	int32_t from, to, callReg = 3;
	enum EmitStatus now;
	
	now = jitPrvEmitLoadImmToLoreg(dest, callReg, ((uintptr_t)func) | 1, true, true);	//assume thumb func
	if (now != EmitErrNone)
		return now;

	//in regs
	if (inRegsMap) while ((from = *inRegsMap++) >= 0) {
		
		to = *inRegsMap++;
		
		if (to == callReg) {
			
			//about to overwrite the call address: get it out of the way
			EMIT(LLmov, EMIT_REG_NO_LR, callReg, EmitShiftLsl, 0, EmitLeaveFlags, false);
			callReg = EMIT_REG_NO_LR;
		}
		
		now = jitPrvLoadVregNotPc(dest, to, from);
		if (now != EmitErrNone)
			return now;
	}

	
	//call callout
	EMIT(LLblx, callReg);
	
	//out regs
	if (outRegsMap) while ((from = *outRegsMap++) >= 0) {
		
		to = *outRegsMap++;
		
		now = jitPrvStoreVreg(dest, to, from, false /* no sp output from callouts please */);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

enum EmitStatus jitEmitExtend(struct EmitBuf *dest, uint32_t rdNo, uint32_t rmNo, uint32_t rotateBy, bool byte, bool unsign)
{
	uint8_t srcReg = (jitPrvIsVregArealReg(rmNo) && rmNo < 8) ? rmNo : REG_NO_TMP_1;
	uint8_t dstReg = (jitPrvIsVregArealReg(rdNo) && rdNo < 8) ? rdNo : REG_NO_TMP_1;
	enum EmitStatus now;
	
	if (rdNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	if (srcReg == REG_NO_TMP_1) {
		
		now = jitPrvLoadVregNotPc(dest, srcReg, rmNo);
		if (now != EmitErrNone)
			return now;
	}
	
	EMIT(LLextend, dstReg , srcReg, rotateBy, byte, unsign);
	
	if (dstReg == REG_NO_TMP_1) {
		
		now = jitPrvStoreVreg(dest, rdNo, dstReg, true);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Emit a computed jump ("switch") on the value of vreg rmNo into a table of
 * "numCases" code slots of "halfwordsPerCase" halfwords each, and reserve that
 * table. Values above numCases - 1 branch past the table.
 *
 *  dest             - buffer to emit into
 *  state            - receives the table location and geometry, for later use
 *                     by jitPrvSwitchBranchWriteCaseM0()
 *  rmNo             - vreg holding the case index
 *  numCases         - number of slots, 1..256
 *  halfwordsPerCase - slot size; must be nonzero and under 128 halfwords
 *
 * The slots are filled with UDF instructions until the caller overwrites them.
 * Returns EmitErrNotEncodeable for out-of-range geometry, else EmitErrNone or
 * the first error reported by a helper.
 */
enum EmitStatus jitPrvSwitchBranchM0(struct EmitBuf *dest, struct SwitchBranchStateM0 *state, uint32_t rmNo, uint32_t numCases, uint32_t halfwordsPerCase)
{
	uint32_t i, bytesPerCase = halfwordsPerCase * sizeof(uint16_t), bytesTotal, jumpOfst;
	struct EmitBuf space;
	enum EmitStatus now;
	
	//an empty table makes no sense, and would make "numCases - 1" below wrap around
	if (!numCases || numCases > 256)
		return EmitErrNotEncodeable;
	
	//zero-sized cases are just as meaningless, and would hand a zero to __builtin_ctz(), which is undefined
	if (!bytesPerCase || bytesPerCase >= 256)
		return EmitErrNotEncodeable;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rmNo);
	if (now != EmitErrNone)
		return now;
	
	//Rtmp2 = index * bytesPerCase
	if (bytesPerCase & (bytesPerCase - 1)) {	//use mul
		
		EMIT(LLmovImm, REG_NO_TMP_2, bytesPerCase, 0, EmitFlagsDoNotCare, false);
	
		EMIT(LLmulReg, REG_NO_TMP_2, REG_NO_TMP_2, REG_NO_TMP_1, EmitFlagsDoNotCare, false);
	}
	else {										//use lsl
		
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_1, EmitShiftLsl, __builtin_ctz(bytesPerCase), EmitFlagsDoNotCare, false);
	}
	
	//range check; the "hi" branch that acts on it is filled in at the very end
	EMIT(LLcmpImm, REG_NO_TMP_1, numCases - 1);
	
	//precalc how far our jump forward will be so we can estimate how many halfwords to save for it
	//NOTE(review): the saved branch sits before the ADD and the NOP below, so its real displacement
	//looks like it is bytesTotal + 2 rather than bytesTotal - 2 - confirm that the 254 threshold
	//agrees with what emitLLbranch() can encode in one halfword
	bytesTotal = bytesPerCase * numCases;
	jumpOfst = bytesTotal - sizeof(uint16_t);
	
	EMIT(SaveSpace, &space, (jumpOfst >= 254) ? 2 : 1);	//long conditional jump will need 2 words, else we can use one

	//branch
	EMIT(LLaddReg, EMIT_REG_NO_PC, EMIT_REG_NO_PC, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

	//space needed
	EMIT(LLnop, false);

	//save loc
	state->cases = dest->buf;
	state->nCases = numCases;
	state->nHalfwordsPerCase = halfwordsPerCase;
	
	//take up all places with udfs
	for (i = 0; i < numCases * halfwordsPerCase; i++)
		EMIT(LLudf, 0, false);
	
	//emit branch-over
	EMIT_TO(LLbranch, &space, emitGetPtrToJumpHere(dest), EmitCcHi);

	return EmitErrNone;
}

/*
 * Set "caseSpace" up as an emit buffer covering exactly slot number "caseIdx"
 * of the table described by "state".
 * An index at or past state->nCases is refused with EmitErrNotEncodeable.
 */
enum EmitStatus jitPrvSwitchBranchWriteCaseM0(struct SwitchBranchStateM0 *state, uint32_t caseIdx, struct EmitBuf *caseSpace)
{
	if (caseIdx < state->nCases) {

		emitBufferInit(caseSpace, state->cases + caseIdx * state->nHalfwordsPerCase, sizeof(uint16_t) * state->nHalfwordsPerCase);

		return EmitErrNone;
	}

	return EmitErrNotEncodeable;
}

//if we fail to jump, we need to push "armFuncPtr" else all regs are ours
/*
 * Emit a stub that calls function number "funcNo" of a library through a chain
 * of table lookups, with a fallback for when the last table pointer is NULL.
 *
 * Emitted lookup chain (all through REG_NO_TMP_1):
 *   t = [vreg r9]
 *   t = [t + moduleId * 4]
 *   t = [t + globalsOfst]
 *   if t is zero, go to the fallback
 *   REG_NO_DST_EPILOGUE_REG = [t + funcNo * 4], then BL rtd->epilogueInterworking
 * Fallback: push "armFuncPtr" and jump to the translated code at "libStubCode".
 *
 * Each offset is split in two: the part covered by mask 0x7C goes into the
 * load's immediate, the rest is added to the base register beforehand.
 *
 * Returns EmitErrNone or the first error reported by a helper.
 */
enum EmitStatus jitPrvDynamicLibStubM0(struct EmitBuf *dest, uint32_t moduleId, uint32_t globalsOfst, uint32_t funcNo, uint32_t armFuncPtr, uint16_t *libStubCode)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	struct EmitBuf space;
	enum EmitStatus now;
	
	//mov Rtmp1, r9
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, 9);
	if (now != EmitErrNone)
		return now;
	
	//ldr Rtmp1, [Rtmp1]
	EMIT(LLloadImm, REG_NO_TMP_1, REG_NO_TMP_1, 0, EmitSzWord, false, EmitAdrModeIndex);

	//Rtmp1 += (moduleId * 4) &~ 0x7C
	//NOTE(review): the third argument here is REG_NO_TMP_1 while the two other jitPrvAddImmToReg()
	//calls in this function pass REG_NO_TMP_2 - check against that helper's signature which one is meant
	now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_1, (moduleId * 4) &~ 0x7C);
	if (now != EmitErrNone)
		return now;

	//ldr Rtmp1, [Rtmp1, (moduleId * 4) & 0x7C]
	EMIT(LLloadImm, REG_NO_TMP_1, REG_NO_TMP_1, (moduleId * 4) & 0x7C, EmitSzWord, false, EmitAdrModeIndex);

	//Rtmp1 += globalsOfst &~ 0x7C
	now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, globalsOfst &~ 0x7C);
	if (now != EmitErrNone)
		return now;

	//ldr Rtmp1, [Rtmp1, globalsOfst & 0x7C]
	EMIT(LLloadImm, REG_NO_TMP_1, REG_NO_TMP_1, globalsOfst & 0x7C, EmitSzWord, false, EmitAdrModeIndex);
	
	//a zero pointer here sends us to the fallback ("do_load") further down;
	//the branch itself can only be emitted once we know where that is
	#ifdef HAVE_v8M_BASE
	
		//space for "cbz Rtmp1, do_load"
		EMIT(SaveSpace, &space, 1);
		
	#else
	
		//cmp Rtmp1, #0
		EMIT(LLcmpImm, REG_NO_TMP_1, 0);
	
		//space for "beq do_load"
		EMIT(SaveSpace, &space, 1);
		
	#endif
	
	//Rtmp1 += (funcNo * 4) &~ 0x7C
	now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, (funcNo * 4) &~ 0x7C);
	if (now != EmitErrNone)
		return now;

	//ldr Repilogue, [Rtmp1, (funcNo * 4) & 0x7C]
	EMIT(LLloadImm, REG_NO_DST_EPILOGUE_REG, REG_NO_TMP_1, (funcNo * 4) & 0x7C, EmitSzWord, false, EmitAdrModeIndex);

	//bl rtd->epilogueInterworking
	EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
	
	#ifdef HAVE_v8M_BASE
	
		//do_load is here, emit the "cbz Rtmp1, do_load" above
		EMIT_TO(LLcbz, &space, REG_NO_TMP_1, emitGetPtrToJumpHere(dest));
		
	#else
	
		//do_load is here, emit the "beq do_load" above
		EMIT_TO(LLbranch, &space, emitGetPtrToJumpHere(dest), EmitCcEq);
		
	#endif
	
	//ldr Rtmp1, = ARM_STUB_INVOCATION_FUNC
	now = jitPrvEmitLoadImmToLoreg(dest, REG_NO_TMP_1, armFuncPtr, true, true);
	if (now != EmitErrNone)
		return now;
	
	//push {Rtmp1}
	EMIT(HLpush, 1 << REG_NO_TMP_1);
	
	//B libStubCode
	now = jitEmitJumpToAnotherTu(dest, libStubCode, EmitCcAl);
	if (now != EmitErrNone)
		return now;

	return EmitErrNone;
}

////LDRD and EDSP support we'll get to later

/*
 * Emit the saturation fix-up that follows a flag-setting add or subtract
 * (see jitPrvSatAdd() and jitPrvSatSub(), which emit that instruction and then
 * call this). The emitted code does the following at run time:
 *   - if V is clear (no overflow), nothing
 *   - else Rd is replaced by 0x80000000 if C is set, or by 0x7fffffff if C is
 *     clear, and ARM_SR_BIT_Q is ORed into REG_NO_SR
 *
 *  rdNo   - register holding the result to fix up (a register number handed to
 *           the emitters as-is, not a vreg)
 *  tmpReg - scratch register, used to hold ARM_SR_BIT_Q
 *
 * Returns EmitErrNone or the first error reported by an emitter.
 */
static enum EmitStatus jitPrvSat(struct EmitBuf *dest, uint32_t rdNo, uint32_t tmpReg)
{
	struct EmitBuf skipSat, isNeg;
	
	//space for "bvc done"
	EMIT(SaveSpace, &skipSat, 1);

	//mov Rd, #0x80	//do not corrupt CPSR.C
	//(C is still needed by the "bcs" below)
	EMIT(HLloadImmToReg, rdNo, 0x80, true, false, false);

	//rev Rd, Rd	// make Rd 0x80000000
	EMIT(LLrev, rdNo, rdNo);

	//space for "bcs isNeg"
	EMIT(SaveSpace, &isNeg, 1);
	
	//sub Rd, #1	// make Rd 0x7fffffff
	//(flags are free to be clobbered from here on, both branches that read them come earlier)
	EMIT(LLsubImm, rdNo, rdNo, 1, EmitFlagsDoNotCare, false);
	
	//this is "isNeg:", emit that bcs to skip making result positive
	EMIT_TO(LLbranch, &isNeg, emitGetPtrToJumpHere(dest), EmitCcCs);
	
	//set Q flag
	
	//mov tmpReg, #ARM_SR_BIT_Q
	EMIT(HLloadImmToReg, tmpReg, ARM_SR_BIT_Q, true, true, false);
	
	//orr Rsr, tmpReg
	EMIT(LLorrReg, REG_NO_SR, REG_NO_SR, tmpReg, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

	//this is "done:", emit that bvc to skip saturation
	EMIT_TO(LLbranch, &skipSat, emitGetPtrToJumpHere(dest), EmitCcVc);
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvSatAdd(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t tmpReg)
{
	//adds Rd, Rn, Rm
	EMIT(LLaddReg, rdNo, rnNo, rmNo, EmitShiftLsl, 0, EmitSetFlags, false);
	
	return jitPrvSat(dest, rdNo, tmpReg);
}

static enum EmitStatus jitPrvSatSub(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t tmpReg)
{
	//subs Rd, Rn, Rm
	EMIT(LLsubReg, rdNo, rnNo, rmNo, EmitShiftLsl, 0, EmitSetFlags, false);
	
	return jitPrvSat(dest, rdNo, tmpReg);
}

enum EmitStatus jitEmitQadd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatAdd(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;

	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitQsub(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatSub(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitQdadd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatAdd(dest, REG_NO_TMP_2, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rmNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatAdd(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitQdsub(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatAdd(dest, REG_NO_TMP_2, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rmNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvSatSub(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvSmulSmlaXY(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, int32_t raNo, bool nTop, bool mTop)
{
	uint32_t regDst = REG_NO_TMP_1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo) | (raNo >= 0 ? (1 << raNo) : 0)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
	if (now != EmitErrNone)
		return now;
	
	if (nTop)	//ASR 16
		EMIT(LLmov, REG_NO_TMP_1, REG_NO_TMP_1, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	else		//SXTH
		EMIT(LLextend, REG_NO_TMP_1, REG_NO_TMP_1, 0, false, false);
	
	if (mTop)	//ASR 16
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_2, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	else		//SXTH
		EMIT(LLextend, REG_NO_TMP_2, REG_NO_TMP_2, 0, false, false);

	//mul Rtmp1, Rtmp2	
	EMIT(LLmulReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitFlagsDoNotCare, false);
	
	if (raNo >= 0) {
		
		now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, raNo);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvSatAdd(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
		if (now != EmitErrNone)
			return now;
		
		regDst = REG_NO_TMP_3;
	}
	
	now = jitPrvStoreVreg(dest, rdNo, regDst, true);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Shared emitter for the 32x16 signed multiplies that keep the top 32 bits of
 * the 48-bit product, with or without accumulate.
 *
 * mTop picks which signed halfword of vRm is used. Since there is no wide
 * multiply available here, vRn is split into an unsigned low half and a signed
 * high half and the result is assembled as
 *     hi * m + ((lo * m) >> 16)
 * The partial product lo * m is a SIGNED quantity (unsigned 16 bit times
 * signed 16 bit always fits in 32 bits), so the shift must be arithmetic;
 * a logical shift would yield a result that is 0x10000 too large whenever
 * lo * m is negative.
 *
 * When raNo is non-negative, vRa is then loaded and combined with the product
 * through jitPrvSatAdd() (result in Rtmp3, Rtmp4 scratch); a negative raNo
 * selects the plain multiply.
 *
 * NOTE(review): the accumulate step uses the same helper as QADD. The
 * architectural SMLAW<y> only sets Q on overflow and does not saturate the
 * result - confirm that going through jitPrvSat() is intended here.
 *
 * Returns EmitErrNotEncodeable if any register involved is PC, otherwise the
 * first error reported by a helper, or EmitErrNone.
 */
static enum EmitStatus jitPrvSmulwSmlawY(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, int32_t raNo, bool mTop)
{
	uint32_t regDst = REG_NO_TMP_1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdNo) | (1 << rmNo) | (1 << rnNo) | (raNo >= 0 ? (1 << raNo) : 0)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
	if (now != EmitErrNone)
		return now;
	
	if (mTop)	//ASR 16
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_2, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	else		//SXTH
		EMIT(LLextend, REG_NO_TMP_2, REG_NO_TMP_2, 0, false, false);
	
	//uxth Rtmp3, Rtmp1				//lo
	EMIT(LLextend, REG_NO_TMP_3, REG_NO_TMP_1, 0, false, true);
	
	//asr Rtmp1, #16					//hi
	EMIT(LLmov, REG_NO_TMP_1, REG_NO_TMP_1, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	
	//mul Rtmp3, Rtmp2				//lo (signed result: unsigned lo times signed m)
	EMIT(LLmulReg, REG_NO_TMP_3, REG_NO_TMP_3, REG_NO_TMP_2, EmitFlagsDoNotCare, false);
	
	//mul Rtmp1, Rtmp2				//hi
	EMIT(LLmulReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitFlagsDoNotCare, false);
	
	//asr Rtmp3, #16					//arithmetic shift: the low partial product may be negative
	EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_3, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	
	//add Rtmp1, Rtmp3				//cannot overflow
	EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_3, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	
	if (raNo >= 0) {
		
		now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, raNo);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvSatAdd(dest, REG_NO_TMP_3, REG_NO_TMP_1, REG_NO_TMP_2, REG_NO_TMP_4);
		if (now != EmitErrNone)
			return now;
		
		regDst = REG_NO_TMP_3;
	}
	
	now = jitPrvStoreVreg(dest, rdNo, regDst, true);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * SMUL<x><y>: 16x16 signed multiply without accumulate. Thin wrapper around
 * jitPrvSmulSmlaXY(); instrAddr is not used.
 */
enum EmitStatus jitEmitSmulxy(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, bool nTop, bool mTop)
{
	const int32_t noAccumulator = -1;	//a negative raNo makes the helper skip the accumulate step

	return jitPrvSmulSmlaXY(dest, cc, rdNo, rnNo, rmNo, noAccumulator, nTop, mTop);
}

/*
 * SMLA<x><y>: 16x16 signed multiply with vRa added to the product. Thin
 * wrapper around jitPrvSmulSmlaXY(); instrAddr is not used.
 */
enum EmitStatus jitEmitSmlaxy(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t raNo, bool nTop, bool mTop)
{
	const int32_t accumulatorReg = (int32_t)raNo;	//helper takes a signed register number

	return jitPrvSmulSmlaXY(dest, cc, rdNo, rnNo, rmNo, accumulatorReg, nTop, mTop);
}

/*
 * SMULW<y>: 32x16 signed multiply without accumulate. Thin wrapper around
 * jitPrvSmulwSmlawY(); instrAddr is not used.
 */
enum EmitStatus jitEmitSmulwy(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, bool mTop)
{
	const int32_t noAccumulator = -1;	//a negative raNo makes the helper skip the accumulate step

	return jitPrvSmulwSmlawY(dest, cc, rdNo, rnNo, rmNo, noAccumulator, mTop);
}

/*
 * SMLAW<y>: 32x16 signed multiply with vRa added to the product. Thin wrapper
 * around jitPrvSmulwSmlawY(); instrAddr is not used.
 */
enum EmitStatus jitEmitSmlawy(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t raNo, bool mTop)
{
	const int32_t accumulatorReg = (int32_t)raNo;	//helper takes a signed register number

	return jitPrvSmulwSmlawY(dest, cc, rdNo, rnNo, rmNo, accumulatorReg, mTop);
}

enum EmitStatus jitEmitSmlalxy(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdLoNo, uint32_t rdHiNo, uint32_t rnNo, uint32_t rmNo, bool nTop, bool mTop)
{
	uint32_t regDst = REG_NO_TMP_1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//no PC
	if (((1 << rdLoNo) | (1 << rdHiNo) | (1 << rnNo) | (1 << rmNo)) & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	//output regs must differ
	if (rdHiNo == rdLoNo)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_1, rnNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rmNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_3, rdLoNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_4, rdHiNo);
	if (now != EmitErrNone)
		return now;
	
	if (nTop)	//ASR 16
		EMIT(LLmov, REG_NO_TMP_1, REG_NO_TMP_1, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	else		//SXTH
		EMIT(LLextend, REG_NO_TMP_1, REG_NO_TMP_1, 0, false, false);
	
	if (mTop)	//ASR 16
		EMIT(LLmov, REG_NO_TMP_2, REG_NO_TMP_2, EmitShiftAsr, 16, EmitFlagsDoNotCare, false);
	else		//SXTH
		EMIT(LLextend, REG_NO_TMP_2, REG_NO_TMP_2, 0, false, false);
	
	//mul Rtmp1, Rtmp2
	EMIT(LLmulReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitFlagsDoNotCare, false);
	
	//ldr Rtmp2, =0
	EMIT(HLloadImmToReg, REG_NO_TMP_2, 0, true, true, false);

	//adds Rtmp3, Rtmp1
	EMIT(LLaddReg, REG_NO_TMP_3, REG_NO_TMP_3, REG_NO_TMP_1, EmitShiftLsl, 0, EmitSetFlags, false);
	
	//adcs Rtmp4, Rtmp2	//capture the carry
	EMIT(LLadcReg, REG_NO_TMP_4, REG_NO_TMP_4, REG_NO_TMP_2, EmitShiftLsl, 0, EmitSetFlags, false);
	
	//we are writing two regs. one might be SP. clukily this is ok since we have no more memory accesses to do now
	now = jitPrvStoreVreg(dest, rdLoNo, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rdHiNo, REG_NO_TMP_4, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitImmMemStrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR || (adrMode != EmitAdrModeIndex && (rtNo == rnNo || rtNo + 1 == rnNo)))
		return EmitErrNotEncodeable;
	
	//pc is never allowed with wbak
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
		
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode != EmitAdrModePostindex) {
		
		now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, imm);
		if (now != EmitErrNone)
			return now;
	}
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_2, rtNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_3, rtNo + 1);
	if (now != EmitErrNone)
		return now;
	
	//store them with wbak
	EMIT(LLstmia, REG_NO_TMP_1, (1 << REG_NO_TMP_2) | (1 << REG_NO_TMP_3), true);

	//handle any postindexing we need to do
	if (adrMode == EmitAdrModePostindex) {
		
		now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, imm - 2 * sizeof(uint32_t) /* account for the wbak we did already */);
		if (now != EmitErrNone)
			return now;
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitImmMemLdrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR || (adrMode != EmitAdrModeIndex && (rtNo == rnNo || rtNo + 1 == rnNo)))
		return EmitErrNotEncodeable;
	
	//pc is never allowed with wbak
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode != EmitAdrModePostindex) {
		
		now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, imm);
		if (now != EmitErrNone)
			return now;
	}
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	//load them with wbak
	EMIT(LLldmia, REG_NO_TMP_1, (1 << REG_NO_TMP_2) | (1 << REG_NO_TMP_3), true);

	//ok for sp since we're done with memory access
	now = jitPrvStoreVreg(dest, rtNo, REG_NO_TMP_2, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rtNo + 1, REG_NO_TMP_3, true);
	if (now != EmitErrNone)
		return now;
	
	//handle any postindexing we need to do
	if (adrMode == EmitAdrModePostindex) {
		
		now = jitPrvAddImmToReg(dest, REG_NO_TMP_1, REG_NO_TMP_2, imm - 2 * sizeof(uint32_t) /* account for the wbak we did already */);
		if (now != EmitErrNone)
			return now;
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitRegRegMemStrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitAddrMode adrMode)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR || (adrMode != EmitAdrModeIndex && (rtNo == rnNo || rtNo + 1 == rnNo)))
		return EmitErrNotEncodeable;
	
	//pc is never allowed with wbak
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode != EmitAdrModePostindex) {
		
		if (isAdd)
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_3, rtNo);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVregNotPc(dest, REG_NO_TMP_4, rtNo + 1);
	if (now != EmitErrNone)
		return now;
	
	//store them with wbak
	EMIT(LLstmia, REG_NO_TMP_1, (1 << REG_NO_TMP_3) | (1 << REG_NO_TMP_4), true);

	//handle any postindexing we need to do
	if (adrMode == EmitAdrModePostindex) {
		
		//undo wbak from stmia
		EMIT(LLsubImm, REG_NO_TMP_1, REG_NO_TMP_1, 8, EmitFlagsDoNotCare, false);

		if (isAdd)
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitRegRegMemLdrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitAddrMode adrMode)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR || (adrMode != EmitAdrModeIndex && (rtNo == rnNo || rtNo + 1 == rnNo)) || rtNo == rmNo || rtNo + 1 == rmNo)
		return EmitErrNotEncodeable;
	
	//pc is never allowed with wbak
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_1, rnNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvLoadVreg(dest, REG_NO_TMP_2, rmNo, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode != EmitAdrModePostindex) {
		
		if (isAdd)
			EMIT(LLaddReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubReg, REG_NO_TMP_1, REG_NO_TMP_1, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
	}
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_1, true);
		if (now != EmitErrNone)
			return now;
	}
	
	//for post index mode, save original base reg value in Rtmp3
	if (adrMode == EmitAdrModePostindex)
		EMIT(LLmov, REG_NO_TMP_3, REG_NO_TMP_1, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);

	//load them with no wbak
	EMIT(LLldmia, REG_NO_TMP_1, (1 << REG_NO_TMP_1) | (1 << REG_NO_TMP_4), false);

	//ok for sp since we're done with memory access
	now = jitPrvStoreVreg(dest, rtNo, REG_NO_TMP_1, true);
	if (now != EmitErrNone)
		return now;
	
	now = jitPrvStoreVreg(dest, rtNo + 1, REG_NO_TMP_4, true);
	if (now != EmitErrNone)
		return now;

	//handle any postindexing we need to do
	if (adrMode == EmitAdrModePostindex) {
		
		if (isAdd)
			EMIT(LLaddReg, REG_NO_TMP_3, REG_NO_TMP_3, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		else
			EMIT(LLsubReg, REG_NO_TMP_3, REG_NO_TMP_3, REG_NO_TMP_2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		
		//ok even for sp
		now = jitPrvStoreVreg(dest, rnNo, REG_NO_TMP_3, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitSemihostingCall(struct EmitBuf *dest, uint32_t instrAddr, enum EmitCc cc)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t i, nWords;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//semihosting swi might need pc, but is unlikely to need r4. we agree (with our kernel)
	// to pass ARM pc as if after SWI (that is addr of instr + 4) in r4
	// we only promise to swi to have r0-r3, and r12 in regs, which is nice since those are
	// our temp regs, freely clobberable
	
	//in case we change temp regs later, catch it here, we NEED them all to be temp and clobberable
	if ((((1 << REG_NO_TMP_1) | (1 << REG_NO_TMP_2) | (1 << REG_NO_TMP_3) | (1 << REG_NO_TMP_4) | (1 << REG_NO_TMP_HI)) & 0x100f) != 0x100f)
		fatal("cannot translate semihosting swi due to not enough correct temp regs\n");
	
	//get vR12 into r0
	now = jitPrvLoadVreg(dest, 0, 12, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	//mov r12, r0	//get it into r12
	EMIT(LLmov, 12, 0, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//get vLR into r0
	now = jitPrvLoadVreg(dest, 0, EMIT_REG_NO_LR, instrAddr);
	if (now != EmitErrNone)
		return now;
	
	//mov lr, r0	//get it into lr
	EMIT(LLmov, EMIT_REG_NO_LR, 0, EmitShiftLsl, 0, EmitLeaveFlags, false);
	
	//get vR0..vR3 into r0..r3
	for (i = 0; i <= 3; i++) {
		now = jitPrvLoadVreg(dest, i, i, instrAddr);
		if (now != EmitErrNone)
			return now;
	}
	
	//push {r4}	//which has vSR
	EMIT(HLpush, 1 << 4);
	
	//ldr r4, =effective_pc
	now = jitPrvEmitLoadImmToLoreg(dest, 4, instrAddr + 4, true, true);
	if (now != EmitErrNone)
		return now;
	
	//svc #KERNEL_ARM_SEMIHOSTING_SWI
	EMIT(LLsvc, KERNEL_ARM_SEMIHOSTING_SWI);

	//pop {r4}
	EMIT(HLpop, 1 << 4);
	
	//get r0..r3 into vR0..vR3
	for (i = 0; i <= 3; i++) {
		now = jitPrvStoreVreg(dest, i, i, false);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}
