#include "emuJitInternal.h"
#include "kernel.h"
#include "printf.h"




//rotate the 32-bit value "val" right by "by" bits and return the result.
//only the low 5 bits of "by" are significant, so rotating by 32 (or any multiple of it) returns "val" unchanged
static uint32_t jitPrvRor(uint32_t val, uint32_t by)
{
	//reduce the count first: shifting a 32-bit value by 32 or more is undefined behaviour in C
	by &= 31;

	//by == 0 must be skipped as well, since (32 - by) would then be a shift by 32
	if (by)
		val = (val >> by) | (val << (32 - by));

	return val;
}

//Try to express "val" as a Thumb modified immediate.
// val         - the 32-bit constant to encode
// forceRotate - when true, only the rotated form is tried (plain bytes and the byte-replication forms are skipped).
//               forcing rotate makes sure carry out is not carry in but instead val[31]
//returns the encoding with its fields placed at bit 26, bits 14..12 and bits 7..0, or a negative value
// when no encoding exists (no valid encoding has the top bit set, so negative is unambiguous)
static int32_t jitPrvEncodeThumbImm(uint32_t val, bool forceRotate)
{
	uint32_t lowByte = val & 0xFF, leadZeros, rot, low7;

	if (!forceRotate) {

		//plain 8-bit value: it is its own encoding
		if (val <= 0xFF)
			return val;

		//every replicated form repeats the same halfword twice
		if ((val >> 16) == (val & 0xFFFF)) {

			uint32_t secondByte = (val >> 8) & 0xFF;

			if ((val & 0xFF00FF00) == 0)		//0x00XY00XY
				return 0x1000 + lowByte;

			if ((val & 0x00FF00FF) == 0)		//0xXY00XY00
				return 0x2000 + secondByte;

			if (secondByte == lowByte)			//0xXYXYXYXY
				return 0x3000 + lowByte;
		}
	}

	//a rotated form cannot describe anything that fits in the low byte
	if (val <= 0xFF)
		return -1;

	//val is nonzero here, so counting leading zeroes is well defined
	leadZeros = __builtin_clz(val);

	//once normalized so the top set bit sits at bit 31, nothing may remain below the top 8 bits
	if ((val << leadZeros) << 8)
		return -1;

	//the leading one is implied by the encoding; keep only the seven bits that follow it
	low7 = ((val << leadZeros) << 1) >> 25;
	rot = leadZeros + 8;

	//scatter the 5-bit rotation: bit 4 -> bit 26, bits 3..1 -> bits 14..12, bit 0 -> bit 7
	return ((rot & 0x10) << 22) + ((rot & 0x0e) << 11) + ((rot & 0x01) << 7) + low7;
}

//Pick scratch registers for the code that replaces the instruction at "instrAddr".
// instrAddr     - address of the instruction being translated (handed to jitPrvFindClobberables())
// unallowedRegs - mask of registers that must not be handed out. SP and PC are always added to it
// pushRegsP     - if not NULL, receives the mask the caller must push
// popRegsP      - if not NULL, receives the mask the caller must pop
// needPcSpacer  - if set, one extra stack slot is reserved: LR is added to the push mask and PC to the pop mask
// ...           - NULL-terminated list of uint32_t pointers, each receives one register number
//Registers are handed out from lowest to highest. The spacer register is never handed out, it only shows up
// in the push/pop masks.
//Returns the number of stack slots the push will occupy (registers that had to be saved, plus the spacer)
//NOTE(review): the original comment says LR is never returned, but only SP and PC are masked off here -
// presumably jitPrvFindClobberables() never reports LR; confirm
uint32_t jitPrvFindTempRegs(uint32_t instrAddr, uint32_t unallowedRegs, uint32_t *pushRegsP, uint32_t *popRegsP, bool needPcSpacer, ...)
{
	uint32_t wanted = 0, toPush = 0, toPop, nStackSlots = 0, candidates, nFree, *outP;
	va_list args;

	//SP and PC are never usable as temporaries
	unallowedRegs |= (1 << EMIT_REG_NO_SP) | (1 << EMIT_REG_NO_PC);

	//first pass over the list: just count the requests
	va_start(args, needPcSpacer);
	while (va_arg(args, uint32_t*) != NULL)
		wanted++;
	va_end(args);

	//registers the higher layer says are free to clobber (these are likely to be hiregs)
	candidates = jitPrvFindClobberables((const uint32_t*)instrAddr) &~ unallowedRegs;
	nFree = jitPrvPopcount16(candidates);

	//not enough free ones? take the lowest remaining registers and remember that they need saving
	while (nFree + nStackSlots < wanted) {

		uint32_t reg = jitUtilPickLowestClearBit(unallowedRegs | candidates);

		//only low-numbered registers are expected to still be available at this point
		if (reg >= EMIT_REG_NO_SP)
			fatal("Unexpectedly high reg suggested!\n");

		toPush |= 1 << reg;
		candidates |= 1 << reg;
		nStackSlots++;
	}

	toPop = toPush;
	if (needPcSpacer) {

		//LR is the spacer, so it must not already be in the push mask
		if (toPush & (1 << EMIT_REG_NO_LR))
			fatal("LR aready unexpectedly pushed\n");

		//the slot is written by pushing LR and consumed by popping PC
		toPop |= 1 << EMIT_REG_NO_PC;
		toPush |= 1 << EMIT_REG_NO_LR;
		nStackSlots++;
	}

	//second pass over the list: hand out the candidates, lowest register number first
	va_start(args, needPcSpacer);
	for (outP = va_arg(args, uint32_t*); outP != NULL; outP = va_arg(args, uint32_t*)) {

		*outP = __builtin_ctz(candidates);
		candidates &= candidates - 1;		//drop the register just handed out
	}
	va_end(args);

	if (pushRegsP)
		*pushRegsP = toPush;

	if (popRegsP)
		*popRegsP = toPop;

	return nStackSlots;
}

static enum EmitStatus jitPrvEmitJumpOneInstr(struct EmitBuf *dest, uintptr_t to)			//guaranteed to be one instr. only to thumb
{
	void *positionBackup;
	enum EmitStatus now;
	
	positionBackup = emitBufferBackup(dest);
	now = emitLLbranch(dest, to, EmitCcAl);
	if (now != EmitErrNotEncodeable)
		return now;
	emitBufferRestore(dest, positionBackup);
	
	return jitPrvLiteralLoad(dest, EMIT_REG_NO_PC, to | 1);
}

//guaranteed to be one instr, if in IT
static enum EmitStatus jitPrvEmitLoadValueOneInstrIfInIt(struct EmitBuf *dest, uint32_t regNo, uint32_t val, bool isInIt)
{
	enum EmitStatus now;
	
	if (isInIt) {

		void *positionBackup;
	
		positionBackup = emitBufferBackup(dest);
		now = emitLLmovImm(dest, regNo, val, 0, EmitLeaveFlags, isInIt);
		if (now != EmitErrNotEncodeable)
			return now;
		emitBufferRestore(dest, positionBackup);
		
		now = jitPrvLiteralLoad(dest, regNo, val);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitEmitLoadImmToReg(dest, regNo, val, false, false, isInIt);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Emit code that pops the registers in "regsMask", optionally under condition "cc".
// regsMask     - registers to pop. PC is allowed, SP is not (EmitErrNotEncodeable is returned for it)
// interworking - selects which callout the PC pop jumps to (jitPrvPopPcCallout if set, else jitPrvPopPcArmOnlyCallout)
//The non-PC registers are popped directly; popping PC is emitted as a one-instruction jump to the callout.
//Returns EmitErrNone on success or the first emitter error
static enum EmitStatus jitPrvEmitPopWithOpts(struct EmitBuf *dest, enum EmitCc cc, uint32_t regsMask, bool interworking)
{
	const uint32_t pcBit = 1 << EMIT_REG_NO_PC, plainRegs = regsMask &~ pcBit;
	const bool popsPc = (regsMask & pcBit) != 0;
	enum EmitStatus now;

	if (regsMask & (1 << EMIT_REG_NO_SP))
		return EmitErrNotEncodeable;

	//get pending literals out now, so that we do not flush in the middle of "IT"
	now = jitPrvLiteralLoadsFlush(dest, 1);
	if (now != EmitErrNone)
		return now;

	//a conditional pop needs an IT block sized to the number of pieces that follow
	if (cc != EmitCcAl) {

		if (plainRegs && popsPc) {

			EMIT(LLitt, cc);
		}
		else if (plainRegs || popsPc) {

			EMIT(LLit, cc);
		}
		else {

			fatal("impossible pop\n");
		}
	}

	if (plainRegs) {

		EMIT(HLpop, plainRegs);
	}

	if (!popsPc)
		return EmitErrNone;

	return jitPrvEmitJumpOneInstr(dest, (uintptr_t)(interworking ? &jitPrvPopPcCallout : &jitPrvPopPcArmOnlyCallout));
}

//Open a conditionally-executed sequence: for any condition other than "always", one slot is reserved in "dest"
// and recorded in "ccSkip" so that jitPrvHandleCcEnd() can later place a skip-over branch there.
//Condition "never" is rejected with EmitErrNotEncodeable; "always" emits nothing
static enum EmitStatus jitPrvHandleCcStart(struct EmitBuf *dest, struct EmitBuf *ccSkip, enum EmitCc cc)
{
	enum EmitStatus now;

	if (cc == EmitCcNv)
		return EmitErrNotEncodeable;

	if (cc != EmitCcAl) {

		EMIT(SaveSpace, ccSkip, 1);
	}

	return EmitErrNone;
}

//Close a sequence opened with jitPrvHandleCcStart(): the slot recorded in "ccSkip" is filled with a branch
// to the current position in "dest", taken when the inverse of "cc" holds.
//Condition "never" is rejected with EmitErrNotEncodeable; "always" emits nothing
static enum EmitStatus jitPrvHandleCcEnd(struct EmitBuf *dest, struct EmitBuf *ccSkip, enum EmitCc cc)
{
	if (cc == EmitCcNv)
		return EmitErrNotEncodeable;

	if (cc != EmitCcAl) {

		EMIT_TO(LLbranch, ccSkip, emitGetPtrToJumpHere(dest), emitCcInvert(cc));
	}

	return EmitErrNone;
}

//do not EVER call this. Use jitEmitImmMemLdr()
//Emits the translation of a word load with an immediate offset whose destination is PC.
// dest    - buffer receiving the emitted code
// cc      - condition of the original instruction
// pcVal   - value the original instruction sees when it reads PC (only used when Rn is PC)
// rnNo    - base register number
// imm     - signed offset
// adrMode - indexed, indexed with writeback, or postindexed
//Returns EmitErrNone on success, else the first error an emitter reported.
//Where PC cannot be loaded directly, two scratch registers are pushed, the loaded target is written over the
// upper stack slot, and the lower slot plus PC are then popped via jitPrvEmitPopWithOpts(). Those paths are
// bracketed by jitPrvHandleCcStart()/jitPrvHandleCcEnd() instead of an IT block.
static enum EmitStatus jitPrvEmitImmMemLdrToPc(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	int32_t regToPopWithPc = -1;	//negative: PC was loaded directly, no pop sequence needed
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t ea;
	
	if (rnNo == EMIT_REG_NO_PC) {	//Rn is PC too, which means no writeback is possible
		
		//the effective address is a translation-time constant
		ea = pcVal + imm;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {R0, R1}
		EMIT(HLpush, 0x0003);
		
		//LDR R0, =ea
		EMIT(HLloadImmToReg, 0, ea, false, false, false);
		
		//LDR R0, [R0]
		EMIT(LLloadImm, 0, 0, 0, EmitSzWord, false, EmitAdrModeIndex);
		
		//STR R0, [SP, #4]		//replace the pushed R1 with the jump target
		EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, 4, EmitSzWord, EmitAdrModeIndex);
		
		regToPopWithPc = 0;
	}
	else if ((adrMode == EmitAdrModeIndex && imm >=0 && imm < 0x1000) || (imm > -0x100 && imm < 0x100)) {	//load into pc, with base reg != PC, encodeable in v7M
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, imm, EmitSzWord, false, adrMode);
	}
	else {																									//load into PC with base reg != PC, not encodeable in v7M
		
		if (adrMode == EmitAdrModeIndexWbak) {
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			if (imm >= 0)
				//add Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//sub Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);

			//XXX: we might have a cool way to optimize this by duplicating the "bx lr" path for every reg
		
			//LDR PC, [Rn]
			EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, 0, EmitSzWord, false, adrMode);
		}
		else if (adrMode == EmitAdrModeIndex) {	//at this point the imm can only be negative
			
			if (rnNo == EMIT_REG_NO_SP)
				loge("load to PC from below SP is very likely wrong!\n");
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {R0, R1}
			EMIT(HLpush, 0x0003);
		
			//SUB R0, Rn, #-imm - (Rn == SP ? 8 : 0)
			//the push above lowered SP by 8, so when Rn is SP the distance down to the wanted address is 8 bytes shorter
			EMIT(LLsubImm, 0, rnNo, -imm - ((rnNo == EMIT_REG_NO_SP) ? 8 : 0), EmitLeaveFlags, false);
			
			//LDR R0, [R0]
			EMIT(LLloadImm, 0, 0, 0, EmitSzWord, false, adrMode);
			
			//STR R0, [SP, #4]		//replace the pushed R1 with the jump target
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, 4, EmitSzWord, EmitAdrModeIndex);
			
			regToPopWithPc = 0;
		}
		else if (rnNo != EMIT_REG_NO_SP) {			//postindex mode and Rn is not SP
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//the scratch reg that receives the target must not be Rn, whose writeback has to survive the pop
			if (rnNo == 0) {
				
				//PUSH {R1, R2}
				EMIT(HLpush, 0x0006);
				
				//x = 1
				regToPopWithPc = 1;
			}
			else {
			
				//PUSH {R0, R1}
				EMIT(HLpush, 0x0003);
				
				//x = 0
				regToPopWithPc = 0;
			}
			
			//LDR Rx, [Rn]
			EMIT(LLloadImm, regToPopWithPc, rnNo, 0, EmitSzWord, false, adrMode);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, false);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, false);
			
			//STR Rx, [SP, #4]		//replace the upper pushed reg with the jump target
			EMIT(LLstoreImm, regToPopWithPc, EMIT_REG_NO_SP, 4, EmitSzWord, EmitAdrModeIndex);
		}
		else if (imm < 0) {								//postindex mode, Rn is SP, postindex value is negative
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//SUB SP, SP, #-imm		//SP now holds its final, written-back value
			EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, -imm, EmitLeaveFlags, cc != EmitCcAl);
			
			//LDR PC, [SP, #-imm]	//the word to load sits at the old SP, which is now -imm above the new one.
			//this must be a plain indexed access: a postindexed one would read at the new SP and then move SP back
			EMIT(LLloadImm, EMIT_REG_NO_PC, EMIT_REG_NO_SP, -imm, EmitSzWord, false, EmitAdrModeIndex);
		}
		else if (imm & 3) {								//postindex mode, Rn is SP, postindex value is positive, not multiple of 4
			
			loge("adjusting SP by non-multiple of 4 isnt easily supported\n");
			
			return EmitErrNotEncodeable;
		}
		else {											//postindex mode, Rn is SP, postindex value is positive, multiple of 4, >= 0x100
			
			//SP must end up at SP + imm. we can move it to SP + imm - 8, so we can then pop a reg and PC off
			//but first we need to store r0 there. this is safe since this is above the stack and thus protected
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//STR R0, [SP, #imm - 8]
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, imm - 8, EmitSzWord, EmitAdrModeIndex);
			
			//LDR R0, [Rn]
			EMIT(LLloadImm, 0, rnNo, 0, EmitSzWord, false, adrMode);
			
			//STR R0, [SP, #imm - 4]		//store desired PC
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, imm - 4, EmitSzWord, EmitAdrModeIndex);
			
			//ADD SP, SP, #imm - 8
			EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, imm - 8, EmitLeaveFlags, false);
			
			regToPopWithPc = 0;
		}
	}
	
	//paths that staged the target on the stack finish by popping the scratch reg together with PC
	if (regToPopWithPc >= 0) {
		
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, (1 << EMIT_REG_NO_PC) + (1 << regToPopWithPc), true);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//emit code to properly swap SP. please note: doing any loads after this may be a bad idea since once SP moves, anything you are loading,
// no matter the base reg, may end up below SP.
// dest          - buffer receiving the emitted code
// regWithNewVal - register that holds the value SP is to be set to
// regTmp        - scratch register the emitted code may overwrite (used to stash flags and as a copy temporary)
// pushedRegs    - mask of registers currently sitting on the stack that must be restored (may include PC)
// numPushedRegs - how many stack words those pushed registers occupy; zero means nothing was pushed
//returns EmitErrNone on success, else the first error an emitter reported
enum EmitStatus jitPrvEmitSwapSp(struct EmitBuf *dest, uint32_t regWithNewVal, uint32_t regTmp, uint32_t pushedRegs, uint32_t numPushedRegs)
{
	struct EmitBuf jumpOverEasyCase, jumpOverComplexCase;
	enum EmitStatus now;
	uint32_t i;
	
	
	//if there are no pushed regs, we can just set sp now
	if (!numPushedRegs) {
		
		//mov SP, regWithNewVal
		EMIT(LLmov, EMIT_REG_NO_SP, regWithNewVal, EmitShiftLsl, 0, EmitLeaveFlags, false);
	}
	else {
	
		//at this point the regs mask "pushedRegs" has the regs we need to pop and the reg number containing new SP is in "regWithNewVal"
		//we must not corrupt the flags, yet the compare below has to, so the flags get stashed in regTmp and put back on both paths
		//if SP is going DOWN or staying, we can move SP there, then load our pushed regs back
		
		//MRS regTmp, APSR
		EMIT(LLmrs, regTmp, EMIT_SYSM_APSR);
	
		//CMP regWithNewVal, SP
		EMIT(LLcmpReg, regWithNewVal, EMIT_REG_NO_SP, EmitShiftLsl, 0);
		
		//save space for a branch over the easy case (filled in below, once the start of the complex case is known)
		EMIT(SaveSpace, &jumpOverEasyCase, 1);
	
		//easy case: new sp is lower than (or equal to) old
		
		//MSR APSR_nzcvq, regTmp
		EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, regTmp);
		
		//MOV regTmp, SP		//remember where the pushed regs live
		EMIT(LLmov, regTmp, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//MOV SP, regWithNewVal
		EMIT(LLmov, EMIT_REG_NO_SP, regWithNewVal, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//LDMIA regTmp, {pushedRegs}	//guaranteed to be ABOVE new stack val and thus safe
		EMIT(LLldmia, regTmp, pushedRegs, false);
	
		//save space for a branch over the hard case (only if we did not just load PC)
		if (!(pushedRegs & (1 << EMIT_REG_NO_PC)))
			EMIT(SaveSpace, &jumpOverComplexCase, 1);
		
		//this is the complex case. fill in the branch above to here (taken when regWithNewVal is unsigned-higher than SP)
		EMIT_TO(LLbranch, &jumpOverEasyCase, emitGetPtrToJumpHere(dest), EmitCcHi);
		
		//MSR APSR_nzcvq, regTmp
		EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, regTmp);
		
		//hard case. new SP is higher than old, if we switch to it, all our pushed regs might go bye-bye
		//on the other hand, it might not be much higher, so if we try to store below it, we might overrun our current stack
		//we'll be very careful with ordering then
		
		//SUB regWithNewVal, #4 * numPushedRegs	//space for our pushed regs so we can pop them when we switch
		EMIT(LLsubImm, regWithNewVal, regWithNewVal, sizeof(uint32_t) * numPushedRegs, EmitLeaveFlags, false);
		
		//copy the pushed regs onto the future new stack from highest to lowest
		//NOTE(review): highest-first is the safe order when the destination lies above the source. if the new SP is less than
		// 4 * numPushedRegs above the old one, the destination starts BELOW the source and this order appears to overwrite
		// source words before they are read - confirm whether callers can hit that case
		for (i = 0; i < numPushedRegs; i++) {
		
			uint32_t ofst = sizeof(uint32_t) * (numPushedRegs - 1 - i);
		
			//LDR regTmp, [SP, #4 * (num regs - 1 - i)]					//load pushed reg
			EMIT(LLloadImm, regTmp, EMIT_REG_NO_SP, ofst, EmitSzWord, false, EmitAdrModeIndex);
			
			//STR regTmp, [regWithNewVal, #4 * (num regs - 1 - i)]		//store it at its new home
			EMIT(LLstoreImm, regTmp, regWithNewVal, ofst, EmitSzWord, EmitAdrModeIndex);
		}
		
		//MOV SP, regWithNewVal
		EMIT(LLmov, EMIT_REG_NO_SP, regWithNewVal, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//POP {pushedRegs}		//this leaves SP at the requested value
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, pushedRegs, true);
		if (now != EmitErrNone)
			return now;
		
		//fill in the branch over the complex case we made above (if PC was not loaded)
		if (!(pushedRegs & (1 << EMIT_REG_NO_PC)))
			EMIT_TO(LLbranch, &jumpOverComplexCase, emitGetPtrToJumpHere(dest), EmitCcAl);
	}
	
	return EmitErrNone;
}

//Prepare for emitting an instruction that may use SP as an input and/or output: every SP operand is replaced with an
// ordinary register, and the code needed to set that register up is emitted. jitPrvHandleSpEnd() undoes this afterwards.
// dest       - buffer receiving the emitted code
// instrAddr  - address of the instruction being translated (handed to jitPrvFindTempRegs())
// sta        - out: state that jitPrvHandleSpEnd() needs. pushRegs and srcReg are always written;
//              nPushedRegs and tmpReg are written only when SP is an output and registers had to be pushed
// regNoOut1P - in/out: first output register number (always dereferenced, must not be NULL). rewritten if it was SP
// regNoOut2P - in/out: second output register number, or NULL if there is only one output. rewritten if it was SP
// ...        - NULL-terminated list of pointers to input register numbers. each one that was SP is rewritten
//returns EmitErrNone on success, else the first error an emitter reported
enum EmitStatus jitPrvHandleSpStart(struct EmitBuf *dest, uint32_t instrAddr, struct JitSimpleSpStatus* sta, uint32_t *regNoOut1P, uint32_t *regNoOut2P, ... /* in regs pointers, NULL terminated */)
{
	uint32_t *regNoP, nPushedRegs = 0, spReplace, swapSpTmp, pushRegs = 0, regNoOut1 = *regNoOut1P, regNoOut2 = regNoOut2P ? *regNoOut2P : regNoOut1, usedRegs = (1 << regNoOut1) | (1 << regNoOut2);
	bool spIn = false, spOut = (regNoOut1 == EMIT_REG_NO_SP) || (regNoOut2 == EMIT_REG_NO_SP), canClobberOutput1 = true, canClobberOutput2 = true;
	va_list vl;
	
	//see if we have any SP inputs. also note which outputs double as inputs (those cannot be borrowed early)
	//and collect every register the instruction touches so none of them gets picked as a temporary
	va_start(vl, regNoOut2P);
	while ((regNoP = va_arg(vl, uint32_t*)) != NULL) {

		uint32_t regNo = *regNoP;

		if (regNo == EMIT_REG_NO_SP)
			spIn = true;
		
		if (regNoOut1 == regNo)
			canClobberOutput1 = false;
		
		if (regNoOut2 == regNo)
			canClobberOutput2 = false;
		
		usedRegs |= 1 << regNo;
	}
	va_end(vl);
	
	if (spOut) {									//SP output means we might need swapSp (if we end up having to push)
		
		//first ask for just the SP stand-in. if even that needs a push, ask again for a second register,
		//which jitPrvHandleSpEnd() passes to jitPrvEmitSwapSp() as its scratch
		nPushedRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &spReplace, NULL);
		if (nPushedRegs) {
			sta->nPushedRegs = nPushedRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &spReplace, &swapSpTmp, NULL);
			sta->tmpReg = swapSpTmp;
		}
		
		sta->pushRegs = pushRegs;
		sta->srcReg = spReplace;		//non-negative: tells jitPrvHandleSpEnd() which reg holds the new SP
		if (regNoOut1P && *regNoOut1P == EMIT_REG_NO_SP)
			*regNoOut1P = spReplace;
		if (regNoOut2P && *regNoOut2P == EMIT_REG_NO_SP)
			*regNoOut2P = spReplace;
	}
	else if (!spIn) {								//no SP at all? easy
		
		sta->pushRegs = 0;
		sta->srcReg = -1;
		
		return EmitErrNone;
	}
	else if (canClobberOutput1 || canClobberOutput2) {					//dest reg is not SP, dest reg is not an input reg
		
		//an output that is not also an input is about to be overwritten anyway, so it can carry the SP value for free
		sta->pushRegs = 0;
		sta->srcReg = -1;
		spReplace = canClobberOutput1 ? regNoOut1 : regNoOut2;
	}
	else  {											//SP is in-only. maybe we have a clobberable reg to use. or else we'll push
		
		nPushedRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &spReplace, NULL);

		sta->pushRegs = pushRegs;
		sta->srcReg = -1;
	}
	
	//handle SP inputs: point each of them at the stand-in register
	va_start(vl, regNoOut2P);
	while ((regNoP = va_arg(vl, uint32_t*)) != NULL) {

		if (*regNoP == EMIT_REG_NO_SP)
			*regNoP = spReplace;
	}
	va_end(vl);

	//PUSH {..pushRegs..}
	EMIT(HLpush, pushRegs);

	//ADD spReplace, SP, #4 * nPushedRegs		//the stand-in gets the value SP had before the push above
	EMIT(LLaddImm, spReplace, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);
	
	return EmitErrNone;
}

//Finish what jitPrvHandleSpStart() began, once the instruction itself has been emitted.
// dest - buffer receiving the emitted code
// sta  - state filled in by jitPrvHandleSpStart()
//Three cases are told apart:
// - srcReg is negative: SP was not an output. whatever was pushed is simply popped
// - srcReg is valid and nothing was pushed: SP is set straight from srcReg
// - srcReg is valid and registers were pushed: jitPrvEmitSwapSp() sets SP and restores the pushed registers
//returns EmitErrNone on success, else the first error an emitter reported
enum EmitStatus jitPrvHandleSpEnd(struct EmitBuf *dest, const struct JitSimpleSpStatus* sta)
{
	uint32_t pushRegs = sta->pushRegs;
	int32_t srcReg = sta->srcReg;
	enum EmitStatus now;
	
	if (srcReg < 0) {			//just pop regs - no outputs to handle
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else if (!pushRegs) {		//nothing was pushed, so we need to just set SP
		
		//this test must look at pushRegs: testing "srcReg >= 0" here would make the swapSp branch below unreachable.
		//sta->nPushedRegs cannot be used for it either, since jitPrvHandleSpStart() only writes it when something was pushed
		
		//MOV SP, srcReg
		EMIT(LLmov, EMIT_REG_NO_SP, srcReg, EmitShiftLsl, 0, EmitLeaveFlags, false);
	}
	else {						//we need to swapSp (have pushed regs and need to set SP)
		
		now = jitPrvEmitSwapSp(dest, srcReg, sta->tmpReg, sta->pushRegs, sta->nPushedRegs);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

//do not EVER call this. Use jitEmitImmMemLdr()
//Emits the translation of a load with an immediate offset whose destination is SP.
// dest    - buffer receiving the emitted code
// cc      - condition of the original instruction
// pcVal   - value the original instruction sees when it reads PC (only used when Rn is PC)
// rnNo    - base register number
// imm     - signed offset
// adrMode - indexed, indexed with writeback, or postindexed
// size    - access size
// sext    - whether the loaded value is sign extended
//Returns EmitErrNone on success, else the first error an emitter reported.
//Word loads are emitted as a direct load into SP, if need be with the address arithmetic split off under an IT block.
//All other cases push two scratch registers, load the value into one of them and let jitPrvEmitSwapSp() install
// it as SP while restoring the scratch registers. Those paths are bracketed by jitPrvHandleCcStart()/jitPrvHandleCcEnd().
static enum EmitStatus jitPrvEmitImmMemLdrToSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size, bool sext)
{
	//defaults describe the usual "PUSH {R0, R1}; value in R0; R1 is scratch" arrangement
	uint32_t ea, pushedRegs = 0x0003, regWithNewVal = 0, regTmp = 1;
	bool needSwapSp = false;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	int32_t roundedSp;
	
	if (rnNo != EMIT_REG_NO_PC && size == EmitSzWord && adrMode == EmitAdrModeIndex && imm > -0x100 && imm < 0x1000) {	//can be encoded in v7M?
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, imm, size, sext, adrMode);
	}
	//things not representable in v7M. this is either cause the load is not word sized, or the imm is not representable in requested addressing mode
	else if (rnNo == EMIT_REG_NO_PC) {		//Rn is PC, indexed mode guaranteed
		
		//the effective address is a translation-time constant
		ea = pcVal + imm;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {R0, R1}
		EMIT(HLpush, 0x0003);
		
		//LDR R0, =ea
		EMIT(HLloadImmToReg, 0, ea, false, false, false);
		
		//proper_sized_and_extended_load R0, [R0]
		EMIT(LLloadImm, 0, 0, 0, size, sext, EmitAdrModeIndex);
		
		needSwapSp = true;
	}
	else if (rnNo == EMIT_REG_NO_SP) {	//since Rn == Rt, we know the addressing mode is "index"
		
		//since we're about to overwrite SP anyways, we can adjust it at will. do so such that the value we want is above SP
		roundedSp = imm &~ 3;	//round down to nearest multiple of 4. on the negative since this might push us to -0x1000, that is still encodeable!
		
		if (size == EmitSzWord){
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			if (roundedSp >= 0)
				//ADD SP, SP, #roundedSp
				EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, roundedSp, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB SP, SP, #-roundedSp
				EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, -roundedSp, EmitLeaveFlags, cc != EmitCcAl);
			
			//only the sub-word remainder (0..3) of the offset is left to apply
			imm &= 3;
			
			//LDR SP, [SP, #imm]
			EMIT(LLloadImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, imm, EmitSzWord, false, EmitAdrModeIndex);
		}
		else {
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		
			if (roundedSp >= 0)
				//ADD SP, SP, #roundedSp
				EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, roundedSp, EmitLeaveFlags, false);
			else
				//SUB SP, SP, #-roundedSp
				EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, -roundedSp, EmitLeaveFlags, false);
			
			//only the sub-word remainder (0..3) of the offset is left to apply
			imm &= 3;
			
			//PUSH {R0, R1}
			EMIT(HLpush, 0x0003);
			
			//proper_sized_and_extended_load R0, [SP, #imm + 8]
			//the push just lowered SP by 8. without the +8 this would read back the R0 we just pushed
			EMIT(LLloadImm, 0, EMIT_REG_NO_SP, imm + 8, size, sext, EmitAdrModeIndex);
			
			needSwapSp = true;
		}
	}
	else if (adrMode == EmitAdrModeIndexWbak) {
		
		if (size == EmitSzWord) {
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
			
			//LDR SP, [Rn, #0]
			EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, EmitSzWord, false, EmitAdrModeIndex);
		}
		else {
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		
			//Rn is written back BEFORE the push, so even if Rn is R0 or R1 the restored value is the written-back one
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, false);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, false);

			//PUSH {R0, R1}
			EMIT(HLpush, 0x0003);
			
			//proper_sized_and_extended_load R0, [Rn]
			EMIT(LLloadImm, 0, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			needSwapSp = true;
		}
	}
	else if (adrMode == EmitAdrModePostindex) {
		
		if (size == EmitSzWord) {
		
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//LDR SP, [Rn, #0]
			EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, EmitSzWord, false, EmitAdrModeIndex);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
		}
		else {
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		
			//Rn is written back AFTER the push here, so it must not be one of the pushed regs:
			//the restore at the end would throw the writeback away
			if (rnNo == 0) {
			
				pushedRegs = 0x0006;
				regWithNewVal = 1;
				regTmp = 2;
				
				//PUSH {R1, R2}
				EMIT(HLpush, 0x0006);
			}
			else if (rnNo == 1) {
				
				pushedRegs = 0x0005;
				regWithNewVal = 0;
				regTmp = 2;
				
				//PUSH {R0, R2}
				EMIT(HLpush, 0x0005);
			}
			else {
				
				//PUSH {R0, R1}
				EMIT(HLpush, 0x0003);
			}
			
			//proper_sized_and_extended_load regWithNewVal, [Rn]
			EMIT(LLloadImm, regWithNewVal, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, false);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, false);
			
			needSwapSp = true;
		}
	}
	else if (imm > -0x100) {		//indexed mode with imm we can express in v7M
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	
		//PUSH {R0, R1}
		EMIT(HLpush, 0x0003);
		
		//proper_sized_and_extended_load R0, [Rn, #imm]
		EMIT(LLloadImm, 0, rnNo, imm, size, sext, EmitAdrModeIndex);
		
		needSwapSp = true;
	}
	else {							//indexed mode with negative imm not expressable in v7M
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	
		//PUSH {R0, R1}
		EMIT(HLpush, 0x0003);
		
		if (imm >= 0)
			//ADD R0, Rn, #imm
			EMIT(LLaddImm, 0, rnNo, imm, EmitLeaveFlags, false);
		else
			//SUB R0, Rn, #-imm
			EMIT(LLsubImm, 0, rnNo, -imm, EmitLeaveFlags, false);
		
		//proper_sized_and_extended_load R0, [R0]
		//R0 already holds the complete address, so the offset here must be zero (not imm again)
		EMIT(LLloadImm, 0, 0, 0, size, sext, EmitAdrModeIndex);
		
		needSwapSp = true;
	}

	//every path that staged the value in a scratch reg has exactly two regs on the stack
	if (needSwapSp) {
	
		now = jitPrvEmitSwapSp(dest, regWithNewVal, regTmp, pushedRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

enum EmitStatus jitEmitImmMemLdr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, bool sext, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t pcVal = instrAddr + 8, ea;
	enum EmitStatus now;
	
	//we only bother with immediates that ARMv5 could have produced
	if (imm >= 0x1000 || imm <= -0x1000)
		return EmitErrNotEncodeable;
	
	//cannot sign extend words
	if (size == EmitSzWord && sext)
		return EmitErrNotEncodeable;
	
	//PC cannot be used for Rn if writeback is enabled
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//writeback is not possible if destination is Rn
	if (rtNo == rnNo && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//load into PC
	if (rtNo == EMIT_REG_NO_PC) {
		
		if (size != EmitSzWord)
			return EmitErrNotEncodeable;
	
		//pop {pc} has special handling in some cases
		if (rnNo == EMIT_REG_NO_SP && imm == 4 && adrMode == EmitAdrModePostindex) {
			
			now = jitPrvEmitPopWithOpts(dest, cc, 1 << EMIT_REG_NO_PC, true);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			now = jitPrvEmitImmMemLdrToPc(dest, cc, pcVal, rnNo, imm, adrMode);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (rtNo == EMIT_REG_NO_SP) {
	
		now = jitPrvEmitImmMemLdrToSp(dest, cc, pcVal, rnNo, imm, adrMode, size, sext);
		if (now != EmitErrNone)
			return now;
	}
	else {	//load to a non-SP, non-PC reg
		
		if (rnNo != EMIT_REG_NO_PC && ((adrMode == EmitAdrModeIndex && imm >=0 && imm < 0x1000) || (imm > -0x100 && imm < 0x100))) {	//encodeable in v7M
			
			if (cc != EmitCcAl)
				EMIT(LLit, cc);
			
			EMIT(LLloadImm, rtNo, rnNo, imm, size, sext, adrMode);
		}
		else if (rnNo == EMIT_REG_NO_PC) {		//Rn is PC, indexed mode guaranteed. dest is not SP or PC, thus we can clobber dest all we want
			
			ea = pcVal + imm;

			now = jitPrvLiteralLoadsFlush(dest, 1);
			if (now != EmitErrNone)
				return now;

			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//LDR Rt, =ea
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rtNo, ea, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
			
			//proper_sized_and_extended_load Rt, [Rt]
			EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, EmitAdrModeIndex);
		}
		else if (adrMode == EmitAdrModeIndexWbak) {
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);

			//proper_sized_and_extended_load Rt, [Rn]
			EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, adrMode);
		}
		else if (adrMode == EmitAdrModePostindex) {	//we know Rt != Rn, so this is easy

			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//proper_sized_and_extended_load Rt, [Rn]
			EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, adrMode);
			
			if (imm >= 0)
				//ADD Rn, Rn, #imm
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB Rn, Rn, #-imm
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
		}
		else  {					//indexed mode
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			if (imm >= 0)
				//ADD Rt, Rn, #imm
				EMIT(LLaddImm, rtNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else
				//SUB Rt, Rn, #-imm
				EMIT(LLsubImm, rtNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
			
			//proper_sized_and_extended_load Rt, [Rt]
			EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, adrMode);
		}
	}

	return EmitErrNone;
}

//DO NOT USE THIS. Use jitEmitImmMemStr()
//Emits the translation of a store of register Rt with an immediate offset. This func assumes Rt != PC.
// dest    - buffer receiving the emitted code
// cc      - condition of the original instruction
// rtNo    - register being stored
// rnNo    - base register number
// imm     - signed offset
// adrMode - indexed, indexed with writeback, or postindexed
// size    - access size
//Returns EmitErrNone on success, EmitErrNotEncodeable when the store is refused, or the first emitter error
static enum EmitStatus jitPrvEmitSimpleRegImmMemStr(struct EmitBuf *dest, enum EmitCc cc, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	const bool conditional = cc != EmitCcAl;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t scratch;

	//offset fits the store instruction itself
	if (imm > -0x100 && ((adrMode == EmitAdrModeIndex && imm < 0x1000) || imm < 0x100)) {

		if (conditional) {

			EMIT(LLit, cc);
		}

		EMIT(LLstoreImm, rtNo, rnNo, imm, size, adrMode);

		return EmitErrNone;
	}

	//both writeback modes: a zero-offset store plus an update of Rn, only the order differs
	if (adrMode == EmitAdrModeIndexWbak || adrMode == EmitAdrModePostindex) {

		if (conditional) {

			EMIT(LLitt, cc);
		}

		//postindex stores to the old address first
		if (adrMode == EmitAdrModePostindex) {

			//proper_sized_store rtNo, [rnNo, #0]
			EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);
		}

		if (imm >= 0) {

			//ADD Rn, Rn, #imm
			EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, conditional);
		}
		else {

			//SUB Rn, Rn, #-imm
			EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, conditional);
		}

		//pre-index with writeback stores to the new address
		if (adrMode == EmitAdrModeIndexWbak) {

			//proper_sized_store rtNo, [rnNo, #0]
			EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);
		}

		return EmitErrNone;
	}

	//everything below is index mode with an imm we cannot easily handle

	//Rt != Rn: move Rn to the address, store, then move it back
	if (rtNo != rnNo) {

		if (conditional) {

			EMIT(LLittt, cc);
		}

		if (imm >= 0) {

			//ADD Rn, Rn, #imm
			EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, conditional);
		}
		else {

			//SUB Rn, Rn, #-imm
			EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, conditional);
		}

		//proper_sized_store rtNo, [rnNo, #0]
		EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);

		if (imm >= 0) {

			//SUB Rn, Rn, #imm
			EMIT(LLsubImm, rnNo, rnNo, imm, EmitLeaveFlags, conditional);
		}
		else {

			//ADD Rn, Rn, #-imm
			EMIT(LLaddImm, rnNo, rnNo, -imm, EmitLeaveFlags, conditional);
		}

		return EmitErrNone;
	}

	//Rt == Rn != SP: the address has to be built in a borrowed register, since Rn holds the value to store
	if (rtNo != EMIT_REG_NO_SP) {

		scratch = rnNo == 0 ? 1 : 0;

		if (conditional) {

			EMIT(LLitttt, cc);
		}

		//PUSH {scratch}
		EMIT(HLpush, 1 << scratch);

		if (imm >= 0) {

			//ADD scratch, Rn, #imm
			EMIT(LLaddImm, scratch, rnNo, imm, EmitLeaveFlags, conditional);
		}
		else {

			//SUB scratch, Rn, #-imm
			EMIT(LLsubImm, scratch, rnNo, -imm, EmitLeaveFlags, conditional);
		}

		//proper_sized_store rtNo, [scratch, #0]
		EMIT(LLstoreImm, rtNo, scratch, 0, size, EmitAdrModeIndex);

		//POP {scratch}
		EMIT(HLpop, 1 << scratch);

		return EmitErrNone;
	}

	//Rt == Rn == SP
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	if (imm < 0) {

		loge("Refusing to store below SP\n");

		return EmitErrNotEncodeable;
	}

	//PUSH {R0, R1}
	EMIT(HLpush, 0x0003);

	//ADD R0, SP, #8	//R0 = the value SP had before the push, which is also the value to store
	EMIT(LLaddImm, 0, EMIT_REG_NO_SP, 8, EmitLeaveFlags, false);

	if (imm >= 0) {

		//ADD R1, R0, #imm
		EMIT(LLaddImm, 1, 0, imm, EmitLeaveFlags, false);
	}
	else {

		//SUB R1, R0, #-imm
		EMIT(LLsubImm, 1, 0, -imm, EmitLeaveFlags, false);
	}

	//proper_sized_store R0, [R1, #0]
	EMIT(LLstoreImm, 0, 1, 0, size, EmitAdrModeIndex);

	//POP {R0, R1}
	EMIT(HLpop, 0x0003);

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

enum EmitStatus jitEmitImmMemStr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t pcVal = instrAddr + 8, ea, t, tmpReg, tmpReg2, pushRegs, nPushedRegs;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	int32_t eImm;
	
	//NOTE: if writeback is specified and Rt == Rn, it is undefined what value is stored
	
	//we only bother with immediates that ARMv5 could have produced
	if (imm >= 0x1000 || imm <= -0x1000)
		return EmitErrNotEncodeable;
	
	//PC cannot be used for Rn if writeback is enabled
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	if (rtNo == EMIT_REG_NO_PC) {	//Rt == PC		(storing PC)
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		t = pcVal;
		
		if (rnNo == EMIT_REG_NO_PC) {	//Rt == PC, Rn == PC, mode must be indexed
			
			ea = pcVal + imm;
			
			if (imm >- 0x100) {		//guaranteed encodeable with a single access
				
				jitPrvFindTempRegs(instrAddr, 1 << rnNo, &pushRegs, NULL, false, &tmpReg, NULL);		
				
				//PUSH {..pushRegs..}
				EMIT(HLpush, pushRegs);
				
				//LDR tmpReg, =PC_VAL
				EMIT(HLloadImmToReg, tmpReg, t, false, false, false);
				
				//proper_sized_store tmpReg, [tmpReg, #imm]		//indeed this might overlap SP but there is nothing we can do, sorry
				EMIT(LLstoreImm, tmpReg, tmpReg, imm, size, EmitAdrModeIndex);

				//POP {..pushRegs..}
				EMIT(HLpop, pushRegs);
			}
			else {

				jitPrvFindTempRegs(instrAddr, 1 << rnNo, &pushRegs, NULL, false, &tmpReg, &tmpReg2, NULL);		
				
				//PUSH {..pushRegs..}
				EMIT(HLpush, pushRegs);
				
				//LDR tmpReg2, =ea
				EMIT(HLloadImmToReg, tmpReg2, ea, false, false, false);
				
				//calculate the value to store into "tmpReg"(PC_VAL) from "tmpReg2"(ea)
				if (imm >= 0)
					//SUB tmpReg, tmpReg2, #imm
					EMIT(LLsubImm, tmpReg, tmpReg2, imm, EmitLeaveFlags, false);
				else
					//ADD tmpReg, tmpReg2, #imm
					EMIT(LLaddImm, tmpReg, tmpReg2, -imm, EmitLeaveFlags, false);
				
				//proper_sized_store tmpReg, [tmpReg2]		//indeed this might overlap SP but there is nothing we can do, sorry
				EMIT(LLstoreImm, tmpReg, tmpReg2, 0, size, EmitAdrModeIndex);

				//POP {..pushRegs..}
				EMIT(HLpop, pushRegs);
			}
		}
		else if (rnNo == EMIT_REG_NO_SP) {			//Rt == PC, Rn == SP	(storing PC to stack)
			
			int32_t storeAt;
			
			if (adrMode != EmitAdrModeIndex) {
				
				//perform the indexing for both postindex and index_wbak modes. sp will end up where it does so this si safe
				if (imm >= 0)
					//ADD SP, SP, #imm
					EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, imm, EmitLeaveFlags, false);
				else
					//SUM SP, SP, #imm
					EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, -imm, EmitLeaveFlags, false);
			}
			
			switch (adrMode) {
				case EmitAdrModeIndex:		storeAt = imm;	break;	//we still need to index
				case EmitAdrModePostindex:	storeAt = -imm;	break;	//we did the indexing but needed to store at the old value, so do that
				case EmitAdrModeIndexWbak:	storeAt = 0;	break;	//we indexed so now addr is zero
				default:	__builtin_unreachable();
			}
			
			if (storeAt < 0) {
				loge("Refusing to store below SP\n");
				return EmitErrNotEncodeable;
			}
			
			storeAt += 4;					//we'll push a reg so our imm needs to be 4 mode
			
			if (storeAt < 0x1000) {			//if it is within normal range, do the simple thing
				//PUSH {R0}
				EMIT(HLpush, 0x0001);
				
				//LDR R0, =PC_VAL (shortcut if storing less length)
				t = pcVal;
				switch (size) {
					case EmitSzByte:		t &= 0xff;		break;
					case EmitSzHalfword:	t &= 0xffff;	break;
					case EmitSzWord:						break;
				}
				EMIT(HLloadImmToReg, 0, t, false, false, false);
				
				//proper_sized_store R0, [SP, #storeAt]
				now = jitPrvEmitSimpleRegImmMemStr(dest, EmitCcAl, 0, EMIT_REG_NO_SP, storeAt, EmitAdrModeIndex, size);
				if (now != EmitErrNone)
					return now;
				
				//POP {R0}
				EMIT(HLpop, 0x0001);
			}
			else {							//it is >= 0x1000, this gets a bit harder
				
				storeAt += 4;				//we are pushign one MORE thing thatn we planned, so account for that
				
				//PUSH {R0, R1}
				EMIT(HLpush, 0x0003);
				
				//LDR R0, =PC_VAL (shortcut if storing less length)
				t = pcVal;
				switch (size) {
					case EmitSzByte:		t &= 0xff;		break;
					case EmitSzHalfword:	t &= 0xffff;	break;
					case EmitSzWord:						break;
				}
				EMIT(HLloadImmToReg, 0, t, false, false, false);
				
				//ADD R1, SP, #0x1000
				EMIT(LLaddImm, 1, EMIT_REG_NO_SP, 0x1000, EmitLeaveFlags, false);

				//proper_sized_store R0, [R1, #storeAt - 0x1000]
				now = jitPrvEmitSimpleRegImmMemStr(dest, EmitCcAl, 0, 1, storeAt - 0x1000, EmitAdrModeIndex, size);
				if (now != EmitErrNone)
					return now;
				
				//POP {R0, R1}
				EMIT(HLpop, 0x0003);
			}
		}
		else {										//Rt == PC, Rn != PC, Rn != SP
			
			jitPrvFindTempRegs(instrAddr, 1 << rnNo, &pushRegs, NULL, false, &tmpReg, NULL);		
				
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//LDR tmpReg, =PC_VAL (shortcut if storing less length)
			t = pcVal;
			switch (size) {
				case EmitSzByte:		t &= 0xff;		break;
				case EmitSzHalfword:	t &= 0xffff;	break;
				case EmitSzWord:						break;
			}
			EMIT(HLloadImmToReg, tmpReg, t, false, false, false);
			
			//store it with proper addressing mode
			now = jitPrvEmitSimpleRegImmMemStr(dest, EmitCcAl, tmpReg, rnNo, imm, adrMode, size);
			if (now != EmitErrNone)
				return now;
			
			//POP {..pushRegs..}
			EMIT(HLpop, pushRegs);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (rtNo == EMIT_REG_NO_SP && size != EmitSzWord) {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		
		if (adrMode == EmitAdrModeIndex && imm <= -0x100) {		//need two regs
			
			nPushedRegs = jitPrvFindTempRegs(instrAddr, 1 << rnNo, &pushRegs, NULL, false, &tmpReg, &tmpReg2, NULL);		
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//SUB tmpReg, Rn, #-imm
			EMIT(LLsubImm, tmpReg, rnNo, -imm, EmitLeaveFlags, false);
			
			//ADD tmpReg2, sp, #4 * nRegsPushed
			EMIT(LLsubImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);

			//proper_sized_store tmpReg2, [tmpReg]
			EMIT(LLstoreImm, tmpReg2, tmpReg, 0, size, EmitAdrModeIndex);
			
			//POP {..pushRegs..}
			EMIT(HLpop, pushRegs);
		}
		else {													//need one reg
			
			nPushedRegs = jitPrvFindTempRegs(instrAddr, 1 << rnNo, &pushRegs, NULL, false, &tmpReg, NULL);		
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//ADD tmpReg, sp, #4 * nRegsPushed
			EMIT(LLsubImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);

			switch (adrMode) {
				case EmitAdrModeIndex:
					eImm = imm;
					break;
				
				case EmitAdrModePostindex:
					eImm = 0;
					break;
				
				case EmitAdrModeIndexWbak:
					eImm = 0;
					
					if (imm < 0)
						EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, false);
					else
						EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, false);
					break;
				
				default:
					__builtin_unreachable();
			}

			//proper_sized_store tmpReg, [Rn, eImm]
			EMIT(LLstoreImm, tmpReg, rnNo, eImm, size, EmitAdrModeIndex);

			if (adrMode == EmitAdrModePostindex) {
				if (imm < 0)
					EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, false);
				else
					EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, false);
			}
			
			//POP {..pushRegs..}
			EMIT(HLpop, pushRegs);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
	
		//Rt != PC? call jitPrvEmitSimpleRegImmMemStr directly
		now = jitPrvEmitSimpleRegImmMemStr(dest, cc, rtNo, rnNo, imm, adrMode, size);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//DO NOT USE THIS. Use jitEmitRegRegMemLdr
static enum EmitStatus jitPrvEmitRegRegMemLdrSimple(struct EmitBuf *dest, enum EmitCc cc, bool sext, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	//none of this will handle Rn == PC properly
	if (rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	//Rm cannot be PC or SP
	if (rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	if (shiftType == EmitShiftLsl && shiftAmt < 4 && isAdd && adrMode == EmitAdrModeIndex) {
		
		//safe to load to PC in this path
		//safe to load to SP in this path IFF word sized
		if (rtNo == EMIT_REG_NO_SP && size != EmitSzWord)
			return EmitErrNotEncodeable;
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//do the load
		EMIT(LLloadRegReg, rtNo, rnNo, rmNo, shiftAmt, size, sext);
	}
	else if (adrMode == EmitAdrModeIndexWbak) {
		
		//safe to load to PC in this path
		//safe to load to SP in this path IFF word sized
		if (rtNo == EMIT_REG_NO_SP && size != EmitSzWord)
			return EmitErrNotEncodeable;
		
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		//modify Rn as requested
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		
		//do the load
		EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, EmitAdrModeIndex);
	}
	else if (adrMode == EmitAdrModePostindex) {
			
		//NOT safe to load to PC in this path
		if (rtNo == EMIT_REG_NO_PC)
			return EmitErrNotEncodeable;
		
		//safe to load to SP in this path IFF word sized
		if (rtNo == EMIT_REG_NO_SP && size != EmitSzWord)
			return EmitErrNotEncodeable;
		
		//we'd clobber Rm
		if (rtNo == rmNo)
			return EmitErrNotEncodeable;
		
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		//do the load
		EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, EmitAdrModeIndex);
		
		//modify Rn as requested (safe to do as we know Rt != Rn)
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
	}
	else {	//index mode
		
		//NOT safe to load to PC in this path
		if (rtNo == EMIT_REG_NO_PC)
			return EmitErrNotEncodeable;
		
		//NOT safe to load to SP in this path
		if (rtNo == EMIT_REG_NO_SP)
			return EmitErrNotEncodeable;
		
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		//use Rt as clobber since we know it is not a reg we cannot clobber
		if (isAdd)
			EMIT(LLaddReg, rtNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, rtNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			
		//do the load
		EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, EmitAdrModeIndex);
	}
	
	return EmitErrNone;
}

//Emit code for a register-offset load whose destination is PC, for the cases the simple emitter refused
// dest      - buffer receiving the emitted code
// cc        - condition of the original instruction (whole sequence is bracketed by jitPrvHandleCcStart/jitPrvHandleCcEnd)
// pcVal     - PC value the original instruction observes; pcVal - 8 is passed to jitPrvFindTempRegs as the instruction address
// sext      - sign-extend the loaded value
// rnNo      - base register
// isAdd     - offset is added (true) or subtracted (false)
// rmNo      - offset register, shifted per shiftType/shiftAmt
// adrMode   - index, index with writeback, or postindex
// size      - byte, halfword or word
//returns EmitErrNone on success or the first error reported by a helper
//Most paths request temp regs with a "pc spacer", write the loaded value into stack slot (nPushRegs - 1) and then
//leave via jitPrvEmitPopWithOpts or jitPrvEmitSwapSp (presumably popping that slot into PC - those helpers are defined elsewhere)
static enum EmitStatus jitEmitRegRegMemLdrIntoPc(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, bool sext, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << EMIT_REG_NO_PC) | (1 << rnNo) | (1 << rmNo), pushRegs, popRegs, nPushRegs;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode == EmitAdrModeIndexWbak) {			//Rn cannot be PC, Rm cannot be PC
		
		if (rnNo == EMIT_REG_NO_SP) {				//we'll need to writeback to SP too
			
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, true, &tmpReg1, &tmpReg2, NULL);
		
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//get SP orig val into tmpReg1
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			
			//calculate the proper value into tmpReg1
			if (isAdd)
				EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//do the load into tmpReg2
			EMIT(LLloadImm, tmpReg2, tmpReg1, 0, size, sext, EmitAdrModeIndex);
			
			//store into the proper stack slot
			EMIT(LLstoreImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
			
			//swap into new stack and new PC
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, popRegs, nPushRegs);
			if (now != EmitErrNone)
				return now;
		}
		else if (size != EmitSzWord) {
			
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, true, &tmpReg1, NULL);
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//if Rm is SP, get a copy into a more useful reg
			if (rmNo == EMIT_REG_NO_SP) {
				EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
				rmNo = tmpReg1;
			}
			
			//calculate the proper value in Rn
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//do the load into tmpReg1
			EMIT(LLloadImm, tmpReg1, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			//store it where we'll pop it off into PC
			EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
			
			//pop {..popRegs..}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, true);
			if (now != EmitErrNone)
				return now;
		}
		else if (rmNo == EMIT_REG_NO_SP) {
			
			//word-sized: the final load can target PC directly, so no pc spacer is requested
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, false, &tmpReg1, NULL);
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//get SP orig val into tmpReg1
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg1;
			
			//calculate the proper value in rnNo
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);

			//POP {..popRegs..}
			EMIT(HLpop, popRegs);
		
			//do the load
			EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, 0, size, sext, EmitAdrModeIndex);
		}
		else {
			
			//calculate the proper value in rnNo
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
			//do the load
			EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, 0, size, sext, EmitAdrModeIndex);
		}
	}
	else if (adrMode == EmitAdrModeIndex) {			//Rn CAN be PC or SP, Rm can be SP but cannot be PC
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, true, &tmpReg1, &tmpReg2, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP) {				//VERY unlikely, but allowed
			
			//calculate proper SP value into tmpReg1
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			//replace Rm with tmpReg1
			rmNo = tmpReg1;
		}
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//materialize the PC value the original instruction would have seen and use it as the base
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			rnNo = tmpReg2;
		}
		
		//calculate the proper value in tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//load the value into tmpReg1. Note that if Rn is SP, we'll need to load at index 4 * nPushRegs and not 0 to account for our push
		EMIT(LLloadImm, tmpReg1, tmpReg1, (rnNo == EMIT_REG_NO_SP) ? (sizeof(uint32_t) * nPushRegs) : 0, size, sext, EmitAdrModeIndex);
		
		//store it into the proper stack slot
		EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);

		//POP {..popRegs..}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, true);
		if (now != EmitErrNone)
			return now;
	}
	else {	// postindex mode
		
		//a second temp is only needed when SP itself has to be replaced at the end
		if (rnNo == EMIT_REG_NO_SP)
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, true, &tmpReg1, &tmpReg2, NULL);
		else
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedBits, &pushRegs, &popRegs, true, &tmpReg1, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
			
		//load the value into tmpReg1. Note that if Rn is SP, we'll need to load at index 4 * nPushRegs and not 0 to account for our push
		EMIT(LLloadImm, tmpReg1, rnNo, rnNo == EMIT_REG_NO_SP ? sizeof(uint32_t) * nPushRegs : 0, size, sext, EmitAdrModeIndex);
		
		//store it into the proper stack slot
		EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);

		if (rmNo == EMIT_REG_NO_SP) {				//VERY unlikely, but allowed
			
			//calculate proper SP value into tmpReg1
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			//replace Rm with tmpReg1
			rmNo = tmpReg1;
		}
		
		//if Rn != SP, we can just write it and bail
		if (rnNo != EMIT_REG_NO_SP) {
			
			//adjust Rn
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);

			//POP {..popRegs..}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, true);
			if (now != EmitErrNone)
				return now;
		}
		else {		//Rn is SP, this gets hard (again)
			
			//get SP orig val into tmpReg1: the writeback must be relative to SP as it was before our PUSH,
			//same as the index-with-writeback path above does before calling jitPrvEmitSwapSp
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			//calculate expected Rn value in tmpReg1
			if (isAdd)
				EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//swap SP and return
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, popRegs, nPushRegs);
			if (now != EmitErrNone)
				return now;
		}
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit code for a register-offset load whose destination is SP
// dest      - buffer receiving the emitted code
// cc        - condition of the original instruction
// pcVal     - PC value the original instruction observes (only used when Rn is PC in index mode)
// sext      - sign-extend the loaded value
// rnNo      - base register
// isAdd     - offset is added (true) or subtracted (false)
// rmNo      - offset register, shifted per shiftType/shiftAmt
// adrMode   - index, index with writeback, or postindex
// size      - byte, halfword or word
//returns EmitErrNone on success or the first error reported by a helper
//Word-sized writeback forms with Rm != SP load SP directly inside an IT block; the remaining forms stage the new
//SP value in a temp reg and hand it to jitPrvEmitSwapSp (defined elsewhere; presumably pops the saved regs and installs the new SP)
static enum EmitStatus jitEmitRegRegMemLdrIntoSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, bool sext, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << EMIT_REG_NO_SP) | (1 << rnNo) | (1 << rmNo), pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//pick two distinct temp regs that are none of SP, Rn, Rm; pushRegs becomes the mask of both
	tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg2;
	
	if (adrMode == EmitAdrModeIndexWbak) {	//Rn cannot be PC (no wbak to pc) or SP (no wbak when Rt == Rn). Rm cannot be PC (never allowed) but can be SP
		
		//we can only direct load into SP if size is word
		if (size == EmitSzWord && rmNo != EMIT_REG_NO_SP) {			
			
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//calculate the proper value in Rn
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			
			//do the load
			EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, size, sext, EmitAdrModeIndex);
		}
		else if (size == EmitSzWord) {		//word sized, Rm == SP
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {tmpReg1}
			EMIT(HLpush, 1 << tmpReg1);
			
			//get SP orig val into tmpReg1 (one reg was pushed, hence + 4)
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg1;
			
			//calculate the proper value in rnNo
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//POP {tmpReg1}
			EMIT(HLpop, 1 << tmpReg1);
		
			//do the load
			EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
		else {								//sub-word load: go via a temp reg and a stack swap
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			if (rmNo == EMIT_REG_NO_SP) {
				//get SP orig val into tmpReg1 (two regs were pushed, hence + 8)
				EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * 2, EmitLeaveFlags, false);
				rmNo = tmpReg1;
			}
			
			//calculate the proper value in rnNo
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//do the load
			EMIT(LLloadImm, tmpReg1, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			//swap into new SP
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, 2);
			if (now != EmitErrNone)
				return now;
				
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (adrMode == EmitAdrModePostindex) {	//Rn cannot be PC (no wbak to pc) or SP (no wbak when Rt == Rn). Rm cannot be PC (never allowed) but can be SP
		
		//we can only direct load into SP if size is word
		if (size == EmitSzWord && rmNo != EMIT_REG_NO_SP) {
		
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//do the load
			EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, size, sext, EmitAdrModeIndex);
		
			//calculate the proper value in Rn
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		}
		else if (size == EmitSzWord) {		// LDR SP, [Rn], +/- SP, Rn != SP
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//get SP orig val into tmpReg1 (two regs were pushed, hence + 8)
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg1;
			
			//do the load into tmpReg2
			EMIT(LLloadImm, tmpReg2, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			//calculate the proper new value for Rn into Rn (which we know is not SP)
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//swap sp (tmpReg2 holds the new SP value, tmpReg1 is free to be used as scratch)
			now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, 2);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
		else {								//sub-word load: go via a temp reg and a stack swap
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//do the load
			EMIT(LLloadImm, tmpReg2, rnNo, 0, size, sext, EmitAdrModeIndex);
			
			if (rmNo == EMIT_REG_NO_SP) {
				//get SP orig val into tmpReg1 (two regs were pushed, hence + 8)
				EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
				rmNo = tmpReg1;
			}
			
			//calculate the proper value in rnNo
			if (isAdd)
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//swap into new SP
			now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, 2);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else  {		//index mode, Rn can be PC or SP, Rm can be SP but cannot be PC
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_SP) {
			
			//calc effective SP into tmpReg1 and set Rn & Rm as needed
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * 2, EmitLeaveFlags, false);
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
		}
		if (rnNo == EMIT_REG_NO_PC) {
			
			//get effective PC into tmpReg2 & set Rn
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			rnNo = tmpReg2;
		}
		
		//calculate proper address into tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//perform the load into tmpReg1
		EMIT(LLloadImm, tmpReg1, tmpReg1, 0, size, sext, EmitAdrModeIndex);
		
		//tmpReg1 now holds the value SP must take
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

//Emit code for an ARM register-offset load: Rt = [Rn +/- (Rm shifted)], in index, index-with-writeback or postindex mode
// dest      - buffer receiving the emitted code
// cc        - condition of the original instruction
// instrAddr - address of the original instruction (the PC value it observes is taken to be instrAddr + 8)
// sext      - sign-extend the loaded value (refused for word loads)
// rtNo      - destination register
// rnNo      - base register
// isAdd     - offset is added (true) or subtracted (false)
// rmNo      - offset register
// shiftType, shiftAmt - shift applied to Rm
// adrMode   - index, index with writeback, or postindex
// size      - byte, halfword or word
//returns EmitErrNone on success, EmitErrNotEncodeable for forms refused here, else whatever error a helper reported
//The simple emitter is tried first; if it reports EmitErrNotEncodeable the buffer is rolled back and a slower sequence is chosen
enum EmitStatus jitEmitRegRegMemLdr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, bool sext, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, usedBits = (1 << rtNo) | (1 << rnNo) | (1 << rmNo), pcVal = instrAddr + 8;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	void *bufBkp;
	
	//cannot sign extend word loads
	if (size == EmitSzWord && sext)
		return EmitErrNotEncodeable;
	
	//writeback with Rt == Rn are undefined
	if (rtNo == rnNo && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//writeback with Rn == PC is undefined
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//Rm == PC is undefined
	if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	//Rn == Rm is not allowed with wbak
	if (rnNo == rmNo && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//jitPrvEmitRegRegMemLdrSimple enforces ALL its limits besides the ones we already enforced above, so try it directly first
	bufBkp = emitBufferBackup(dest);
	now = jitPrvEmitRegRegMemLdrSimple(dest, cc, sext, rtNo, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);
	if (now != EmitErrNotEncodeable)
		return now;
	emitBufferRestore(dest, bufBkp);
	
	//we need a slow path for every possible case
	if (rtNo == EMIT_REG_NO_PC) {						//loading into PC
		
		now = jitEmitRegRegMemLdrIntoPc(dest, cc, pcVal, sext, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);
		if (now != EmitErrNone)
			return now;
	}
	else if (rtNo == EMIT_REG_NO_SP) {				//loading into SP
		
		now = jitEmitRegRegMemLdrIntoSp(dest, cc, pcVal, sext, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);
		if (now != EmitErrNone)
			return now;
	}
	else if (rnNo == EMIT_REG_NO_PC) {				//pc-rel load. index mode only
		
		if (rtNo == rmNo) {							//we cannot clobber Rt, but we DO know that Rm is not SP (since it equals Rt)
			
			tmpReg1 = jitUtilPickLowestClearBit(usedBits);
		
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {tmpReg1}
			EMIT(HLpush, 1 << tmpReg1);
			
			//LDR tmpReg1, =PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
			
			//calculate proper address into tmpReg1
			if (isAdd)
				EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//perform the load
			EMIT(LLloadImm, rtNo, tmpReg1, 0, size, sext, EmitAdrModeIndex);
			
			//POP {tmpReg1}
			EMIT(HLpop, 1 << tmpReg1);
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
		else if (rmNo != EMIT_REG_NO_SP) {				//can we get away with just clobberring Rt?
			
			now = jitPrvLiteralLoadsFlush(dest, 1);
			if (now != EmitErrNone)
				return now;
			
			//three instrs follow: value load, address calc, memory load
			if (cc != EmitCcAl)
				EMIT(LLittt, cc);
			
			//LDR Rt, =PC_VAL
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rtNo, pcVal, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
			
			//calculate proper address into Rt
			if (isAdd)
				EMIT(LLaddReg, rtNo, rtNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			else
				EMIT(LLsubReg, rtNo, rtNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			
			//perform the load
			EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, EmitAdrModeIndex);
		}
		else {											//Rn is PC, Rm is SP, Rt is neither of those
			
			tmpReg1 = jitUtilPickLowestClearBit(usedBits);
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {tmpReg1}
			EMIT(HLpush, 1 << tmpReg1);
			
			//LDR rtNo, =PC_VAL
			EMIT(HLloadImmToReg, rtNo, pcVal, false, false, false);
			
			//tmpReg1 = SP_VAL (before push)
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
			
			//calculate proper address into Rt
			if (isAdd)
				EMIT(LLaddReg, rtNo, rtNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLsubReg, rtNo, rtNo, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//POP {tmpReg1}
			EMIT(HLpop, 1 << tmpReg1);
			
			//perform the load
			EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, EmitAdrModeIndex);
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if ((adrMode == EmitAdrModeIndexWbak || (adrMode == EmitAdrModePostindex && rmNo != rtNo)) && rmNo != EMIT_REG_NO_SP && (rnNo != EMIT_REG_NO_SP || (shiftType == EmitShiftLsl && shiftAmt < 4))) {
		
		//we can directly modify base as needed (base is not PC, Rm is not PC, Rm is not SP, (and if postindex, Rm != Rt) shift is applicable to SP if base is SP)
		
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		//load if needed
		if (adrMode == EmitAdrModePostindex)
			EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, EmitAdrModeIndex);
		
		//adjust base
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		
		//load if needed
		if (adrMode == EmitAdrModeIndexWbak)
			EMIT(LLloadImm, rtNo, rnNo, 0, size, sext, EmitAdrModeIndex);
	}
	else if (adrMode == EmitAdrModeIndexWbak || adrMode == EmitAdrModeIndex) {	//can clobber Rt
		
		uint32_t origRn = rnNo, calcDest = (adrMode == EmitAdrModeIndex) ? rtNo : rnNo;
		bool spInvolved = rmNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_SP;
		bool needWbakMov;
		
		//the address is never calculated directly into SP: use Rt instead and copy it back afterwards.
		//this must be decided BEFORE the IT instr is emitted since the copy-back adds one more instr to the block
		if (calcDest == EMIT_REG_NO_SP)
			calcDest = rtNo;
		needWbakMov = calcDest != origRn && adrMode == EmitAdrModeIndexWbak;
		
		if (cc != EmitCcAl) {
			
			//address calc + load, plus the SP copy if SP is involved, plus the writeback copy if needed
			uint32_t nInstrs = 2 + (spInvolved ? 1 : 0) + (needWbakMov ? 1 : 0);
			
			if (nInstrs == 4)
				EMIT(LLitttt, cc);
			else if (nInstrs == 3)
				EMIT(LLittt, cc);
			else
				EMIT(LLitt, cc);
		}
		
		if (spInvolved) {
			
			//move SP to Rt
			EMIT(LLmov, rtNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
			
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = rtNo;
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = rtNo;
		}
		
		//calculate new base
		if (isAdd)
			EMIT(LLaddReg, calcDest, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, calcDest, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		
		//writeback if needed
		if (needWbakMov)
			EMIT(LLmov, origRn, calcDest, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
		
		//perform the load
		EMIT(LLloadImm, rtNo, calcDest, 0, size, sext, EmitAdrModeIndex);
	}
	else if (rnNo != EMIT_REG_NO_SP) {		//postindex mode. Rn != SP, Rn != Rt (wbak forbids that)
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (rmNo == EMIT_REG_NO_SP || rmNo == rnNo) {
			//move Rm to Rt if needed (if we'll clobber it or it is SP)
			EMIT(LLmov, rtNo, rmNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			rmNo = rtNo;
		}

		//calc the new base into Rn
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		if (!isAdd && shiftType == EmitShiftLsl && shiftAmt < 4) {	//we can do it simpler
			
			//Rn was lowered by the offset, so [Rn + offset] is the original address
			now = jitPrvEmitRegRegMemLdrSimple(dest, EmitCcAl, sext, rtNo, rnNo, true, rmNo, shiftType, shiftAmt, EmitAdrModeIndex, size);
			if (now != EmitErrNone)
				return now;
		}
		else {
		
			//calc original base into Rt by undoing the adjustment
			if (isAdd)
				EMIT(LLsubReg, rtNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			else
				EMIT(LLaddReg, rtNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//perform the load
			EMIT(LLloadImm, rtNo, rtNo, 0, size, sext, EmitAdrModeIndex);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {		// postindex mode with Rn == SP
		uint32_t tmpReg2, pushRegs = 0;
		
		//pick two distinct temp regs that are none of Rt, Rn, Rm
		tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
		pushRegs += 1 << tmpReg1;
		tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
		pushRegs += 1 << tmpReg2;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP || rmNo == rtNo) {
			//move Rm to tmpReg1 in case we're about to clobber it or it is SP
			EMIT(LLmov, tmpReg1, rmNo, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			rmNo = tmpReg1;
		}
		
		//perform the load (Rn is SP, which our push of two regs lowered by 8, so adjust for that)
		EMIT(LLloadImm, rtNo, rnNo, sizeof(uint32_t) * 2, size, sext, EmitAdrModeIndex);
		
		//get SP orig val into tmpReg2: the writeback must be relative to SP as it was before our push
		EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * 2, EmitLeaveFlags, false);
		
		//calc the new base into tmpReg2
		if (isAdd)
			EMIT(LLaddReg, tmpReg2, tmpReg2, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg2, tmpReg2, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//go to new stack
		now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Emit the translation of a register-offset store whose stored register (Rt) is PC.
 *
 * pcVal is the value that gets stored; it is materialized into a scratch register with
 * HLloadImmToReg and stored from there. Scratch registers are pushed before use, so any
 * read of SP after the push is rebased by the number of bytes pushed.
 *
 * Parameters:
 *   dest                - emit buffer the code is appended to
 *   cc                  - condition of the translated instruction
 *   pcVal               - value to store in place of PC
 *   rnNo                - base register (may be PC or SP)
 *   isAdd               - true: address/base = Rn + offset, false: Rn - offset
 *   rmNo                - offset register (may be SP)
 *   shiftType, shiftAmt - shift applied to Rm
 *   adrMode             - EmitAdrModeIndex, EmitAdrModeIndexWbak or EmitAdrModePostindex
 *   size                - width of the store
 *
 * Returns EmitErrNone, or the failing status reported by jitPrvHandleCcStart,
 * jitPrvEmitSwapSp or jitPrvHandleCcEnd.
 */
static enum EmitStatus jitEmitRegRegMemStrPc(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << EMIT_REG_NO_PC) | (1 << rnNo) | (1 << rmNo), pushRegs = 0;
	uint32_t pcStoreVal = pcVal;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//pick two scratch regs that are neither PC, Rn nor Rm; pushRegs ends up as the mask of both
	tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg2;
	
	//trim the stored constant to the store width. Skipped when Rn is PC, since then the same
	//scratch reg (holding pcStoreVal) also stands in for the base and must keep the full value
	if (rnNo != EMIT_REG_NO_PC) switch (size) {
		case EmitSzByte:		pcStoreVal &= 0xff;		break;
		case EmitSzHalfword:	pcStoreVal &= 0xffff;	break;
		case EmitSzWord:								break;
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (shiftType == EmitShiftLsl && shiftAmt < 4 && isAdd && adrMode == EmitAdrModeIndex && rnNo != EMIT_REG_NO_SP && rnNo != EMIT_REG_NO_PC && rmNo != EMIT_REG_NO_SP) {
		//one temp will do: plain indexed store, no SP or PC involved in the address
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//LDR tmpReg1, =PC_VAL_TO_STORE
		EMIT(HLloadImmToReg, tmpReg1, pcStoreVal, false, false, false);
		
		//never true here: the branch condition above already excludes Rn == PC
		if (rnNo == EMIT_REG_NO_PC)
			rnNo = tmpReg1;
		
		//STORE
		EMIT(LLstoreRegReg, tmpReg1, rnNo, rmNo, shiftAmt, size);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else if (adrMode == EmitAdrModeIndex) {			//Rn might be PC
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//LDR tmpReg1, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg1, pcStoreVal, false, false, false);
		
		//a PC base reads the same value we are about to store
		if (rnNo == EMIT_REG_NO_PC)
			rnNo = tmpReg1;
		
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
			
			//tmpReg2 = SP value as it was before our two-register push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg2;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg2;
		}
		
		//form the address in tmpReg2
		if (isAdd)
			EMIT(LLaddReg, tmpReg2, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg2, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//STORE
		EMIT(LLstoreImm, tmpReg1, tmpReg2, 0, size, EmitAdrModeIndex);

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else if (rnNo != EMIT_REG_NO_SP) {			//when not writing back to SP, things are simpler, we also know Rn is not PC, so that helps (no wbak to PC allowed)
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//LDR tmpReg1, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg1, pcStoreVal, false, false, false);
		
		//postindex: store to the old base before it is adjusted
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, rnNo, 0, size, EmitAdrModeIndex);
		}
		
		//adjust base
		
		if (rmNo == EMIT_REG_NO_SP) {
			
			//tmpReg2 = SP value as it was before our two-register push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg2;
		}
		
		//form the address in Rn
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//preindex with writeback: store to the already-updated base
		if (adrMode == EmitAdrModeIndexWbak) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, rnNo, 0, size, EmitAdrModeIndex);
		}

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else {										//wbak to SP
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//LDR tmpReg1, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg1, pcStoreVal, false, false, false);
		
		//tmpReg2 = SP value as it was before our two-register push
		EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg2;
		
		//postindex: store to the old SP before the new one is computed
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, tmpReg2, 0, size, EmitAdrModeIndex);
		}
		
		//form the address (which is also the new SP) in tmpReg2
		if (isAdd)
			EMIT(LLaddReg, tmpReg2, tmpReg2, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg2, tmpReg2, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//preindex with writeback: store to the new SP value
		if (adrMode == EmitAdrModeIndexWbak) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, tmpReg2, 0, size, EmitAdrModeIndex);
		}
		
		//swap to new SP (tmpReg2 holds it; the helper is given the pushed mask and count instead of a POP being emitted here)
		now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit the translation of a register-offset store whose stored register (Rt) is SP.
 *
 * Except for the one case that maps onto a single register-register store, the sequence
 * pushes two scratch registers, rebuilds the pre-push SP value in tmpReg1 (SP + 8) and
 * stores that register, so the value written is the SP the translated code would observe.
 *
 * Parameters:
 *   dest                - emit buffer the code is appended to
 *   cc                  - condition of the translated instruction
 *   pcVal               - value to use when the base register is PC
 *   rnNo                - base register (may be PC or SP)
 *   isAdd               - true: Rn + offset, false: Rn - offset
 *   rmNo                - offset register (may be SP)
 *   shiftType, shiftAmt - shift applied to Rm
 *   adrMode             - addressing mode
 *   size                - width of the store
 *
 * Returns EmitErrNone, or the failing status reported by jitPrvHandleCcStart / jitPrvHandleCcEnd.
 */
static enum EmitStatus jitEmitRegRegMemStrSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << EMIT_REG_NO_SP) | (1 << rnNo) | (1 << rmNo), pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//in some cases, we could do a little better by using the reg-reg store instr, but i figure storing SP is rare enough that it is not worth it
	
	//pick two scratch regs that are neither SP, Rn nor Rm; pushRegs ends up as the mask of both
	tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg2;
	
	if (adrMode == EmitAdrModeIndex && isAdd && size == EmitSzWord && shiftType == EmitShiftLsl && shiftAmt < 4 && rmNo != EMIT_REG_NO_SP && rnNo != EMIT_REG_NO_PC) {
		
		//single instruction: a one-slot IT is enough to make it conditional
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(LLstoreRegReg, EMIT_REG_NO_SP, rnNo, rmNo, shiftAmt, EmitSzWord);
	}
	else if (rnNo == EMIT_REG_NO_PC || rnNo == EMIT_REG_NO_SP) {	//must be indexed mode
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//tmpReg1 = SP value as it was before our two-register push (this is also the value stored)
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = tmpReg1;
		
		if (rnNo == EMIT_REG_NO_PC) {
			//LDR tmpReg2, =PC_VAL
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			rnNo = tmpReg2;
		}
		
		//form the address in tmpReg2
		if (isAdd)
			EMIT(LLaddReg, tmpReg2, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg2, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//STORE
		EMIT(LLstoreImm, tmpReg1, tmpReg2, 0, size, EmitAdrModeIndex);

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		//plain indexed mode must leave Rn untouched, so the address goes to a scratch reg;
		//the writeback modes compute it straight into Rn
		uint32_t dstReg = adrMode == EmitAdrModeIndex ? tmpReg2 : rnNo;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
			
		//tmpReg1 = SP value as it was before our two-register push (this is also the value stored)
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		
		//postindex: store to the old base before it is adjusted
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, rnNo, 0, size, EmitAdrModeIndex);
		}
		
		//form the address in dstReg (which is Rn except in index mode)
		if (isAdd)
			EMIT(LLaddReg, dstReg, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, dstReg, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//both non-postindex modes store to the freshly computed address
		if (adrMode != EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, tmpReg1, dstReg, 0, size, EmitAdrModeIndex);
		}
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

static enum EmitStatus jitEmitRegRegMemStrPcBased(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rtNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << rtNo) | (1 << EMIT_REG_NO_PC) | (1 << rmNo), pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//indexed mode is guaranteed since PC cannot be used with wbak
	//Rt is not SP or PC, Rm is not PC
	//in some cases, we could do a little better by using the reg-reg store instr, but i figure PC-based stores are uncommon
	
	tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg2;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rmNo != EMIT_REG_NO_SP) {
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//LDR tmpReg1, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		
		//form the address in tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//STORE
		EMIT(LLstoreImm, rtNo, tmpReg1, 0, size, EmitAdrModeIndex);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else {
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//LDR tmpReg1, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		
		//LDR tmpReg2, =SP_VAL
		EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
		
		//form the address in tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, tmpReg1, tmpReg2, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, tmpReg1, tmpReg2, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//STORE
		EMIT(LLstoreImm, rtNo, tmpReg1, 0, size, EmitAdrModeIndex);
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

static enum EmitStatus jitEmitRegRegMemStrSpBased(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rtNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, usedBits = (1 << rtNo) | (1 << EMIT_REG_NO_SP) | (1 << rmNo), pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//Rt is not SP or PC, Rm is not PC
	tmpReg1 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedBits + pushRegs);
	pushRegs += 1 << tmpReg2;
	
	if (adrMode == EmitAdrModeIndex && isAdd && shiftType == EmitShiftLsl && shiftAmt < 4 && rmNo != EMIT_REG_NO_SP) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(LLstoreRegReg, rtNo, EMIT_REG_NO_SP, rmNo, shiftAmt, size);
	}
	else if (adrMode == EmitAdrModeIndex) {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//LDR tmpReg1, =SP_VAL
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		
		//form the address in tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//STORE
		EMIT(LLstoreImm, rtNo, tmpReg1, 0, size, EmitAdrModeIndex);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//LDR tmpReg1, =SP_VAL
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		
		//store if needed
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, rtNo, tmpReg1, 0, size, EmitAdrModeIndex);
		}
		
		//form new base reg val in tmpReg1
		if (isAdd)
			EMIT(LLaddReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, tmpReg1, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//store if needed
		if (adrMode == EmitAdrModeIndexWbak) {
			//STORE
			EMIT(LLstoreImm, rtNo, tmpReg1, 0, size, EmitAdrModeIndex);
		}
		
		//swap to new SP
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

static enum EmitStatus jitPrvEmitRegRegMemStrSimple(struct EmitBuf *dest, enum EmitCc cc, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//Rn and Rt are both not SP or PC. Rm is not PC
	
	if (adrMode == EmitAdrModeIndex && isAdd && shiftType == EmitShiftLsl && shiftAmt < 4 && rmNo != EMIT_REG_NO_SP) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(LLstoreRegReg, rtNo, rnNo, rmNo, shiftAmt, size);
	}
	else if (rmNo != EMIT_REG_NO_SP && (adrMode != EmitAdrModeIndex || (rnNo != rtNo && rnNo != rmNo))) {
		
		if (cc != EmitCcAl) {
			if (adrMode == EmitAdrModeIndex)
				EMIT(LLitttt, cc);
			else
				EMIT(LLittt, cc);
		}
		
		//store if needed
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);
		}
		
		//form new base reg val in Rn
		if (isAdd)
			EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		else
			EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		
		//store if needed
		if (adrMode != EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);
		}
		
		//restore base reg if needed
		if (adrMode == EmitAdrModeIndex) {
			if (isAdd)
				EMIT(LLsubReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
			else
				EMIT(LLaddReg, rnNo, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, cc != EmitCcAl);
		}
	}
	else  {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		uint32_t tmpReg = jitUtilPickLowestClearBit((1 << rtNo) | (1 << rnNo) | (1 << rmNo));
		uint32_t dstReg = adrMode == EmitAdrModeIndex ? tmpReg : rnNo;
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg);
		
		//store if needed
		if (adrMode == EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, rtNo, rnNo, 0, size, EmitAdrModeIndex);
		}
		
		if (rmNo == EMIT_REG_NO_SP) {
			//LDR tmpReg, =SP_VAL
			EMIT(LLaddImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg;
		}
		
		//form the address in dstReg
		if (isAdd)
			EMIT(LLaddReg, dstReg, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, dstReg, rnNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//store if needed
		if (adrMode != EmitAdrModePostindex) {
			//STORE
			EMIT(LLstoreImm, rtNo, dstReg, 0, size, EmitAdrModeIndex);
		}
		
		//POP {tmpReg}
		EMIT(HLpop, 1 << tmpReg);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Entry point for translating a register-offset store: rejects the forms treated as
 * undefined and hands the rest to the specialised emitter for the registers involved.
 *
 * Rejected with EmitErrNotEncodeable:
 *   - any writeback mode with Rn == PC or Rn == Rt
 *   - Rm == PC
 *
 * Dispatch priority: Rt == PC, Rt == SP, Rn == PC, Rn == SP, then the plain case.
 * The PC value handed down is instrAddr + 8.
 */
enum EmitStatus jitEmitRegRegMemStr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitAddrMode adrMode, enum EmitMemOpSz size)
{
	const uint32_t pcVal = instrAddr + 8;
	const bool writesBack = (adrMode != EmitAdrModeIndex);

	//writeback with Rn == PC or Rn == Rt is undefined
	if (writesBack && (rnNo == EMIT_REG_NO_PC || rnNo == rtNo))
		return EmitErrNotEncodeable;

	//Rm == PC is undefined
	if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	//special stored register takes priority over special base register
	if (rtNo == EMIT_REG_NO_PC)
		return jitEmitRegRegMemStrPc(dest, cc, pcVal, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);

	if (rtNo == EMIT_REG_NO_SP)
		return jitEmitRegRegMemStrSp(dest, cc, pcVal, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);

	if (rnNo == EMIT_REG_NO_PC)
		return jitEmitRegRegMemStrPcBased(dest, cc, pcVal, rtNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);

	if (rnNo == EMIT_REG_NO_SP)
		return jitEmitRegRegMemStrSpBased(dest, cc, pcVal, rtNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);

	return jitPrvEmitRegRegMemStrSimple(dest, cc, rtNo, rnNo, isAdd, rmNo, shiftType, shiftAmt, adrMode, size);
}

//a perfectly valid request with no weird regs and no regs the same
static enum EmitStatus jitPrvEmitSimpleSwap(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t regTmp, enum EmitMemOpSz size)
{
	struct EmitBuf cbzSpot;
	uintptr_t start;
	
	//start:
	start = emitGetPtrToJumpHere(dest);

	//LDREX Rd, [Rn]
	EMIT(LLldrex, rdNo, rnNo, 0, size);
	
	//STREX regTmp, Rm, [Rn]
	EMIT(LLstrex, regTmp, rmNo, rnNo, 0, size);
	
	//we need to loop back without using flags. get clever
	
	//save space for "CBZ $+4"
	EMIT(SaveSpace, &cbzSpot, 1);
	
	//B start
	EMIT(LLbranch, start, EmitCcAl);
	
	//cbz goes here. emit it now
	EMIT_TO(LLcbz, &cbzSpot, regTmp, emitGetPtrToJumpHere(dest));

	return EmitErrNone;
}

/*
 * Emit the translation of a swap: Rd = [Rn], [Rn] = Rm, performed by
 * jitPrvEmitSimpleSwap as an exclusive load/store loop.
 *
 * Rejected with EmitErrNotEncodeable: any operand being PC, Rn equal to Rm or Rd,
 * and halfword size.
 *
 * Scratch registers come from jitPrvFindTempRegs, which decides how many of them need
 * pushing. Its return value (nPushRegs) is therefore what every rebasing of SP has to
 * use, never a fixed count.
 *
 * NOTE(review): Rn == SP is not rebased after the push in any branch, so such a swap
 * would address the pushed scratch area - confirm callers never pass it.
 *
 * Returns EmitErrNone, or the failing status of a helper.
 */
enum EmitStatus jitEmitSwap(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitMemOpSz size)
{
	uint32_t tmpReg1, tmpReg2, tmpReg3, usedRegs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo), pushRegs = 0, nPushRegs;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//none of the regs is allowed to be PC
	if (usedRegs & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	//Rn must be distinct from Rm and Rd
	if (rnNo == rmNo || rnNo == rdNo)
		return EmitErrNotEncodeable;
	
	//there is no SWPH
	if (size == EmitSzHalfword)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdNo == EMIT_REG_NO_SP) {
		
		nPushRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &tmpReg1, &tmpReg2, &tmpReg3, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP) {
			//ADD tmpReg1, SP, #4 * nPushRegs	//point to where SP would be
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
			rmNo = tmpReg1;
		}
		
		//SWP tmpReg3, Rm, [Rn] @ with tmpReg2 as temp value
		now = jitPrvEmitSimpleSwap(dest, tmpReg3, rnNo, rmNo, tmpReg2, size);
		if (now != EmitErrNone)
			return now;
		
		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg3, tmpReg2, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (rdNo != rmNo && rmNo != EMIT_REG_NO_SP) { 	//we only need one temp reg
		
		nPushRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//SWP Rd, Rm, [Rn] @ with tmpReg1 as temp value
		now = jitPrvEmitSimpleSwap(dest, rdNo, rnNo, rmNo, tmpReg1, size);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else {	//Rd and Rm might be the same reg or Rm might be SP
		
		nPushRegs = jitPrvFindTempRegs(instrAddr, usedRegs, &pushRegs, NULL, false, &tmpReg1, &tmpReg2, NULL);
	
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP) {
			//ADD tmpReg1, SP, #4 * nPushRegs	//point to where SP would be
			//use the actual pushed count: a hard-coded 2 is wrong whenever
			//jitPrvFindTempRegs pushed a different number of registers
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
		}
		else {
		
			//MOV tmpReg1, rmNo	//private copy, so that reloading Rd on a retry cannot change the stored value
			EMIT(LLmov, tmpReg1, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, false);
		}
		
		//SWP Rd, tmpReg1, [Rn] @ with tmpReg2 as temp value
		now = jitPrvEmitSimpleSwap(dest, rdNo, rnNo, tmpReg1, tmpReg2, size);
		if (now != EmitErrNone)
			return now;
			
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit CLZ Rd, Rm. Rd must be neither SP nor PC; Rm must not be PC.
 * An SP source is first copied into Rd (MOV Rd, SP) and counted from there, so this
 * emits two instructions in that case and one otherwise.
 */
static enum EmitStatus jitPrvClz(struct EmitBuf *dest, uint32_t rdNo, uint32_t rmNo)
{
	uint32_t srcReg = rmNo;

	if (srcReg == EMIT_REG_NO_SP) {

		//MOV Rd, SP
		EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		srcReg = rdNo;
	}

	//CLZ Rd, src
	EMIT(LLclz, rdNo, srcReg);

	return EmitErrNone;
}

enum EmitStatus jitEmitClz(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rmNo)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rmNo), pushRegs = 0;
	enum EmitStatus now;
	
	//no PC allowed
	if (rdNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	if (rdNo == EMIT_REG_NO_SP) {
		
		struct EmitBuf ccSkip;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg1;
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		
		//push {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//CLZ tmpReg1, Rm
		now = jitPrvClz(dest, tmpReg1, rmNo);
		if (now != EmitErrNone)
			return now;
		
		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//CLZ Rd, Rm
		now = jitPrvClz(dest, rdNo, rmNo);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

/*
 * Shared emitter for BX Rm (withLink == false) and BLX Rm (withLink == true).
 *
 * When linking, LR is loaded with instrAddr + 4 before the jump. The target register
 * selects the sequence:
 *   - Rm == LR: jumps through jitPrvBxLrCallout; for the linking form the old LR is
 *     pushed first and the jump goes through jitPrvPopPcCallout
 *   - Rm == r12: jumps through jitPrvBxR12Callout
 *     (NOTE(review): presumably because the jump helper itself uses r12 - confirm)
 *   - Rm == PC: the target is the known value instrAddr + 8, handled by jitEmitJumpToArm
 *   - anything else: BX Rm
 *
 * Conditional execution uses IT blocks sized to the instructions that follow, except
 * for the Rm == PC case which uses jitPrvHandleCcStart / jitPrvHandleCcEnd.
 *
 * Returns EmitErrNone, or the failing status of a helper.
 */
static enum EmitStatus jitPrvBxBlxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo, bool withLink)
{
	enum EmitStatus now;
	
	now = jitPrvLiteralLoadsFlush(dest, withLink ? 2 : 1);	//so we do not flush in the middle of "IT"
	if (now != EmitErrNone)
		return now;
	
	if(rmNo == EMIT_REG_NO_LR) {
		
		if (withLink) {	//"blx lr" is a bit hard
			
			//three guarded instructions follow: push, LR load, jump
			if (cc != EmitCcAl)
				EMIT(LLittt, cc);
			
			//push {lr}	//the jump target, saved before LR is overwritten with the return address
			EMIT(HLpush, (1 << EMIT_REG_NO_LR));
			
			//ldr lr, =proper_lr_val
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, EMIT_REG_NO_LR, instrAddr + 4, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
			
			//pop {pc}
			now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvPopPcCallout);
			if (now != EmitErrNone)
				return now;
		}
		else {
		
			//one guarded instruction follows: the jump
			if (cc != EmitCcAl)
				EMIT(LLit, cc);
			
			now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvBxLrCallout);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (rmNo == 12) {
	
		//guard the jump, plus the LR load when linking
		if (cc != EmitCcAl) {
			if (withLink)
				EMIT(LLitt, cc);
			else 
				EMIT(LLit, cc);
		}
		
		if (withLink) {
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, EMIT_REG_NO_LR, instrAddr + 4, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
		}
		
		now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvBxR12Callout);
		if (now != EmitErrNone)
			return now;
	}
	else if (rmNo == EMIT_REG_NO_PC) {
		
		struct EmitBuf ccSkip;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//ldr lr, =proper_lr_val
		if (withLink)
			EMIT(HLloadImmToReg, EMIT_REG_NO_LR, instrAddr + 4, false, false, false);
		
		//the jump itself is unconditional: the condition is already handled by the skip around it
		now = jitEmitJumpToArm(dest, EmitCcAl, instrAddr + 8, NULL);
		if (now != EmitErrNone)
			return now;

		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		//guard the BX, plus the LR load when linking
		if (cc != EmitCcAl) {
			if (withLink)
				EMIT(LLitt, cc);
			else 
				EMIT(LLit, cc);
		}
		
		if (withLink) {
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, EMIT_REG_NO_LR, instrAddr + 4, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
		}
		
		//bx Rm
		EMIT(LLbx, rmNo);
	}

	return EmitErrNone;
}

/* Emit the translation of BLX Rm: the shared BX/BLX emitter with linking enabled. */
enum EmitStatus jitEmitBlxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	const bool withLink = true;

	return jitPrvBxBlxReg(dest, cc, instrAddr, rmNo, withLink);
}

/* Emit the translation of BX Rm: the shared BX/BLX emitter with linking disabled. */
enum EmitStatus jitEmitBxReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	const bool withLink = false;

	return jitPrvBxBlxReg(dest, cc, instrAddr, rmNo, withLink);
}

enum EmitStatus jitEmitBlToArm(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t dstAddr)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	now = jitEmitLoadImmToReg(dest, EMIT_REG_NO_LR, instrAddr + 4, false, false, false);
	if (now != EmitErrNone)
		return now;
	
	now = jitEmitJumpToArm(dest, EmitCcAl, dstAddr, NULL);
		if (now != EmitErrNone)
			return now;
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit MUL Rd, Rn, Rm, setting flags when s is true.
 *
 * The flag-setting multiply is emitted directly only when all three registers are low
 * registers and Rd doubles as one of the operands; any other flag-setting request is
 * emitted as a plain MUL followed by TST Rd, Rd.
 */
static enum EmitStatus jitPrvEmitSimpleMul(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, bool s)
{
	if (!s) {

		//MUL Rd, Rn, Rm
		EMIT(LLmulReg, rdNo, rnNo, rmNo, EmitLeaveFlags, false);
		return EmitErrNone;
	}

	if (EMIT_IS_LOREG(rdNo) && EMIT_IS_LOREG(rnNo) && EMIT_IS_LOREG(rmNo) && (rdNo == rnNo || rdNo == rmNo)) {

		//MULS Rd, Rn, Rm
		EMIT(LLmulReg, rdNo, rnNo, rmNo, EmitSetFlags, false);
		return EmitErrNone;
	}

	//MULS not encodeable as a short instr: multiply, then derive the flags from the result

	//MUL Rd, Rn, Rm
	EMIT(LLmulReg, rdNo, rnNo, rmNo, EmitLeaveFlags, false);

	//TST Rd, Rd
	EMIT(LLtstReg, rdNo, rdNo, EmitShiftLsl, 0);

	return EmitErrNone;
}

enum EmitStatus jitEmitMul(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo), pushRegs = 0, nPushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//v5 forbids Rm == Rd. It does not help us any to enforce that so we do not. we do enforce no use of PC
	if (usedRegs & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdNo == EMIT_REG_NO_SP) {			//MUL into SP
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		nPushRegs++;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs	//point to where SP would be
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
		}
		
		//MUL[s] tmpReg2, Rn, Rm
		now = jitPrvEmitSimpleMul(dest, tmpReg2, rnNo, rmNo, s);
		if (now != EmitErrNone)
			return now;
		
		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if ((usedRegs & (1 << EMIT_REG_NO_SP)) && rdNo != rnNo && rdNo != rmNo) {	//MUL using SP, Rd can be clobbered
		
		//MOV Rd, SP
		EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = rdNo;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = rdNo;
		
		//MUL[s] Rd, Rn, Rm
		now = jitPrvEmitSimpleMul(dest, rdNo, rnNo, rmNo, s);
		if (now != EmitErrNone)
			return now;
	}
	else if (usedRegs & (1 << EMIT_REG_NO_SP)) {	//MUL using SP, Rd cannot be clobbered
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = tmpReg1;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		
		//MUL[s] Rd, Rn, Rm
		now = jitPrvEmitSimpleMul(dest, rdNo, rnNo, rmNo, s);
		if (now != EmitErrNone)
			return now;
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else {
		
		now = jitPrvEmitSimpleMul(dest, rdNo, rnNo, rmNo, s);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emit MLA Rd, Rn, Rm, Ra. When s is true the flags are derived afterwards from the
 * result with TST Rd, Rd.
 */
static enum EmitStatus jitPrvEmitSimpleMla(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t raNo, bool s)
{
	//MLA Rd, Rn, Rm, Ra
	EMIT(LLmlaReg, rdNo, rnNo, rmNo, raNo);

	if (!s)
		return EmitErrNone;

	//TST Rd, Rd
	EMIT(LLtstReg, rdNo, rdNo, EmitShiftLsl, 0);

	return EmitErrNone;
}

enum EmitStatus jitEmitMla(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t raNo, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo) | (1 << raNo), pushRegs = 0, nPushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//v5 forbids Rm == Rd. It does not help us any to enforce that so we do not. we do enforce no use of PC
	if (usedRegs & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdNo == EMIT_REG_NO_SP) {			//MLA into SP
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		nPushRegs++;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_SP || raNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs	//point to where SP would be
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (raNo == EMIT_REG_NO_SP)
				raNo = tmpReg1;
		}
		
		//MLA[s] tmpReg2, Rn, Rm, Ra
		now = jitPrvEmitSimpleMla(dest, tmpReg2, rnNo, rmNo, raNo, s);
		if (now != EmitErrNone)
			return now;
		
		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg2, tmpReg1, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if ((usedRegs & (1 << EMIT_REG_NO_SP)) && rdNo != rnNo && rdNo != rmNo && rdNo != raNo) {	//MLA using SP, Rd can be clobbered
		
		//MOV Rd, SP
		EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = rdNo;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = rdNo;
		if (raNo == EMIT_REG_NO_SP)
			raNo = rdNo;
		
		//MLA[s] Rd, Rn, Rm, Ra
		now = jitPrvEmitSimpleMla(dest, rdNo, rnNo, rmNo, raNo, s);
		if (now != EmitErrNone)
			return now;
	}
	else if (usedRegs & (1 << EMIT_REG_NO_SP)) {	//MLA using SP, Rd cannot be clobbered
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = tmpReg1;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		if (raNo == EMIT_REG_NO_SP)
			raNo = tmpReg1;
		
		//MLA[s] Rd, Rn, Rm, Ra
		now = jitPrvEmitSimpleMla(dest, rdNo, rnNo, rmNo, raNo, s);
		if (now != EmitErrNone)
			return now;
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else {
		
		now = jitPrvEmitSimpleMla(dest, rdNo, rnNo, rmNo, raNo, s);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emits one long multiply (via emitLLlongMul) with the given regs and, if "s" is set, a follow-up
// sequence that sets N and Z from the 64-bit result RdHi:RdLo.
//	unsign & accum are passed straight through to emitLLlongMul
//	no register substitution is done here - callers are expected to pass regs that are directly usable
//returns EmitErrNone, or the first error produced by an emit call (EMIT returns early on error)
static enum EmitStatus jitPrvEmitSimpleLongMul(struct EmitBuf *dest, uint32_t rdLoNo, uint32_t rdHiNo, uint32_t rnNo, uint32_t rmNo, bool unsign, bool accum, bool s)
{
	EMIT(LLlongMul, rdLoNo, rdHiNo, rnNo, rmNo, unsign, accum);

	if (s) {
		//this gets very hard since we need to set N based on RdHi[31] and Z based on the 64-bit value RdHi:RdLo
		//  without clobbering C or V flags, our result regs, or any other regs
		//we break this into cases:
		//	if RdHi[31] is set:
		//		N should be set and Z cleared (TST RdHi, RdHi will do this)
		//	else if (RdHi)
		//		N should be cleared and Z cleared (TST RdHi, RdHi will do this too)
		//	NOTE BOTH OF THE ABOVE CASES CAN BE SUMMARIZED AS: "TST RdHi, RdHi"
		//	else (RdHi is zero), we need N cleared, Z set based on RdLo
		//		if (RdLo[31] is clear)
		//			N = 0, Z = (RdLo == 0)	(TST RdLo, RdLo will do this)
		//		else we need N = 0, Z = 0 somehow, we know top bit is set
		//			if (!RdLo[0])
		//				ROR RdLo, #1
		//				TST RdLo, RdLo // top bit will be zero so this produces proper Z & N
		//				ROR RdLo, #31
		//			else	(we know at least one more bit is set - the bottom one)
		//				TST RdLo, #1 will already do this
		//
		//	thus we arrive at this monstrosity:
		//		TST RdHi, RdHi
		//		BNE done			//RdHi[0..31] had at least one bit set -> all our flags are set
		//		TST RdLo, RdLo
		//		BPL done			//RdLo evaluated and flags set properly
		//		TST RdLo, #1
		//		BNE done			//RdLo has bottom bit set -> we're done
		//		ROR RdLo, #1		//guarantees top bit will be clear, will set Z properly
		//		TST RdLo, RdLo
		//		ROR RdLo, #31		//undo the rotate so RdLo holds the real result again
		
		//placeholders for the three forward branches; they are filled in once "done" is known
		struct EmitBuf firstBne, bpl, secondBne;
		
		//TST RdHi, RdHi
		EMIT(LLtstReg, rdHiNo, rdHiNo, EmitShiftLsl, 0);
		
		//space for "BNE done"
		EMIT(SaveSpace, &firstBne, 1);
		
		//TST RdLo, RdLo
		EMIT(LLtstReg, rdLoNo, rdLoNo, EmitShiftLsl, 0);
		
		//space for "BPL done"
		EMIT(SaveSpace, &bpl, 1);
		
		//TST RdLo, #1
		EMIT(LLtstImm, rdLoNo, 1, 0);
		
		//space for "BNE done"
		EMIT(SaveSpace, &secondBne, 1);
		
		//ROR RdLo, #1
		EMIT(LLmov, rdLoNo, rdLoNo, EmitShiftRor, 1, EmitLeaveFlags, false);
		
		//TST RdLo, RdLo
		EMIT(LLtstReg, rdLoNo, rdLoNo, EmitShiftLsl, 0);
		
		//ROR RdLo, #31		//rotating by 1 and then by 31 restores the original value
		EMIT(LLmov, rdLoNo, rdLoNo, EmitShiftRor, 31, EmitLeaveFlags, false);
		
		//done label is here, fill in our jumps
		EMIT_TO(LLbranch, &firstBne, emitGetPtrToJumpHere(dest), EmitCcNe);
		EMIT_TO(LLbranch, &bpl, emitGetPtrToJumpHere(dest), EmitCcPl);
		EMIT_TO(LLbranch, &secondBne, emitGetPtrToJumpHere(dest), EmitCcNe);
	}

	return EmitErrNone;
}

//Emits a long multiply (optionally accumulating and/or flag-setting) for arbitrary register choices.
//	PC in any position, or RdLo == RdHi, is rejected with EmitErrNotEncodeable
//	SP operands are never handed to jitPrvEmitSimpleLongMul directly; they are replaced as follows:
//	 - SP is a destination: two temps are pushed, the result is produced in tmpReg1 and then
//	   moved into SP by jitPrvEmitSwapSp
//	 - SP is only a source (Rn and/or Rm): the SP value is copied into a destination reg that is not
//	   also a source (only when not accumulating), else into a pushed temp that is popped afterwards
//	 - no SP involved: the multiply is emitted as is
//	the whole sequence is bracketed by jitPrvHandleCcStart/jitPrvHandleCcEnd for condition "cc"
enum EmitStatus jitEmitLongMul(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdLoNo, uint32_t rdHiNo, uint32_t rnNo, uint32_t rmNo, bool unsign, bool accum, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdLoNo) | (1 << rdHiNo) | (1 << rnNo) | (1 << rmNo), pushRegs = 0, nPushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//no PC is allowed, dest regs must be distinct
	if (usedRegs & (1 << EMIT_REG_NO_PC))
		return EmitErrNotEncodeable;
	if (rdLoNo == rdHiNo)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdLoNo == EMIT_REG_NO_SP || rdHiNo == EMIT_REG_NO_SP) {
		
		//two temps: tmpReg1 stands in for SP, tmpReg2 is handed to jitPrvEmitSwapSp as scratch
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		nPushRegs++;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//they might be inputs too, so use tmpReg1 for in AND out
		//(when accumulating, the destination regs are read as well, so tmpReg1 needs the SP value)
		if (rmNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_SP || accum) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs	//point to where SP would be
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
		}
		//in or out, either way SP becomes tmpReg1
		//(RdLo != RdHi was checked above, so exactly one of them is SP here)
		if (rdLoNo == EMIT_REG_NO_SP)
			rdLoNo = tmpReg1;
		else
			rdHiNo = tmpReg1;
		
		//longmul[s] RdLo, RdHi, Rn, Rm
		now = jitPrvEmitSimpleLongMul(dest, rdLoNo, rdHiNo, rnNo, rmNo, unsign, accum, s);
		if (now != EmitErrNone)
			return now;

		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (usedRegs & (1 << EMIT_REG_NO_SP)) {	//long mul using SP as input (only Rn or Rm can be SP)
		
		//a destination reg that is not also a source may hold the SP copy; -1 means none found
		int32_t tmpReg = -1;
		
		if (rdLoNo != rnNo && rdLoNo != rmNo)
			tmpReg = rdLoNo;
		else if (rdHiNo != rnNo && rdHiNo != rmNo)
			tmpReg = rdHiNo;
		
		//when accumulating, the destination regs are inputs too and thus cannot be pre-clobbered
		if (tmpReg >= 0 && !accum) {		//a temp reg we can clobber exists
			
			//MOV tmpReg, SP
			EMIT(LLmov, tmpReg, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg;
			
			//longmul[s], RdLo ,RdHi, Rn, Rm
			now = jitPrvEmitSimpleLongMul(dest, rdLoNo, rdHiNo, rnNo, rmNo, unsign, accum, s);
			if (now != EmitErrNone)
				return now;
		}
		else {					//no temp regs exist
			
			tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);	//picked temp regs are ALWAYS LOREGS - no need to test
			pushRegs += 1 << tmpReg1;
			nPushRegs++;
			
			//PUSH {tmpReg1}
			EMIT(HLpush, 1 << tmpReg1);
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			
			//longmul[s], RdLo ,RdHi, Rn, Rm
			now = jitPrvEmitSimpleLongMul(dest, rdLoNo, rdHiNo, rnNo, rmNo, unsign, accum, s);
			if (now != EmitErrNone)
				return now;
			
			//POP {tmpReg1}
			EMIT(HLpop, 1 << tmpReg1);
		}
	}
	else {
		
		//longmul[s], RdLo ,RdHi, Rn, Rm
		now = jitPrvEmitSimpleLongMul(dest, rdLoNo, rdHiNo, rnNo, rmNo, unsign, accum, s);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Reads APSR into a plain register (SP and PC are refused) and massages the value so it looks like
// what ARMv5TE code expects. For a conditional request all three emitted instrs sit under one LLittt.
static enum EmitStatus jitPrvMrsReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo)		//no weird regs please
{
	const bool inItBlock = (cc != EmitCcAl);

	if (rdNo == EMIT_REG_NO_SP || rdNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	//open the IT block only when the instr is actually conditional
	if (inItBlock) {
		EMIT(LLittt, cc);
	}

	//MRS Rd, APSR
	EMIT(LLmrs, rdNo, EMIT_SYSM_APSR);

	//AND Rd, Rd, 0xf8000000				//keep only the flags ARMv5TE has/understands
	EMIT(LLandImm, rdNo, rdNo, 0xf8000000, 0, EmitLeaveFlags);

	//ADD Rd, Rd, #0x10						//pretend to be in ARM mode at user privilege level
	EMIT(LLaddImm, rdNo, rdNo, 0x10, EmitLeaveFlags, inItBlock);

	return EmitErrNone;
}

//Emits an MRS of the (emulated) CPSR into Rd.
//	Rd == PC is refused. Rd == SP is done via r0 as a stand-in which then gets swapped into SP.
//	any other Rd goes straight to jitPrvMrsReg, which handles "cc" by itself
enum EmitStatus jitEmitMrsReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t rdNo)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	if (rdNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	//the easy and common case: destination is a normal reg
	if (rdNo != EMIT_REG_NO_SP)
		return jitPrvMrsReg(dest, cc, rdNo);

	//destination is SP: wrap the multi-instr sequence in a conditional skip
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//push {r0,r1}
	EMIT(HLpush, 0x0003);

	//MRS r0, CPSR		(unconditional - condition is already handled by the skip above)
	now = jitPrvMrsReg(dest, EmitCcAl, 0);
	if (now != EmitErrNone)
		return now;

	//swap into new SP
	now = jitPrvEmitSwapSp(dest, 0, 1, 0x0003, 2);
	if (now != EmitErrNone)
		return now;

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

//Emits "MSR APSR_nzcvq, Rm" for any ARM register Rm.
//	instrAddr is the address of the ARM instr being translated; PC reads as instrAddr + 8
//	Rm == PC or Rm == SP cannot be used directly, so r0 is borrowed: it is pushed, loaded with the
//	value Rm would have read as, used as the MSR source, and then restored with a pop
//	the whole sequence is bracketed by jitPrvHandleCcStart/jitPrvHandleCcEnd for condition "cc"
//returns EmitErrNone, or the first error produced by an emit call
enum EmitStatus jitEmitMsrReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rmNo)
{
	//remember this up front: rmNo itself gets replaced by r0 below, so it cannot be re-tested later
	const bool borrowedR0 = (rmNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_SP);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	if (borrowedR0) {
		
		//push {r0}
		EMIT(HLpush, 0x0001);

		if (rmNo == EMIT_REG_NO_PC) {
			//LDR r0, =PC_VAL		(corrupting flags is ok, we're about to reset them)
			EMIT(HLloadImmToReg, 0, instrAddr + 8, true, true, false);
		}
		else {
			//ADD r0, SP, #4		//SP_VAL before push
			EMIT(LLaddImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitFlagsDoNotCare, false);
		}

		rmNo = 0;
	}
	
	//MSR APSR_nzcvq, rmNo
	EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, rmNo);
	
	if (borrowedR0) {
		
		//pop {r0}				//give r0 back and rebalance the stack
		EMIT(HLpop, 0x0001);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitMsrImm(struct EmitBuf *dest, enum EmitCc cc, uint32_t val)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//mask off the bits we should never set from user mode
	val &= 0xf8000000;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//push {r0}
	EMIT(HLpush, 0x0001);
	
	//LDR r0, =val  (corrupting flags is ok, we're about to reset them)
	now = jitEmitLoadImmToReg(dest, 0, val, true, true, false);
	if (now != EmitErrNone)
		return now;

	//MSR APSR_nzcvq, r0
	EMIT(LLmsr, EMIT_SYSM_APSR, EMIT_MSR_APSR_MASK_NZCVQ, 0);
	
	//pop {r0}
	EMIT(HLpop, 0x0001);

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emits exactly "halfwords" halfwords worth of NOPs: as many wide (two-halfword) NOPs as fit,
// followed by a single narrow NOP if the count was odd
enum EmitStatus jitEmitNopFillLen(struct EmitBuf *dest, uint32_t halfwords)
{
	uint32_t wideLeft;

	//NOP.W for every full pair of halfwords
	for (wideLeft = halfwords / 2; wideLeft; wideLeft--) {
		EMIT(LLnop, true);
	}

	//one halfword left over -> NOP.N
	if (halfwords % 2) {
		EMIT(LLnop, false);
	}

	return EmitErrNone;
}

//Fills whatever is left of the emit buffer (from dest->buf up to dest->bufEnd) with NOPs
enum EmitStatus jitEmitNopFill(struct EmitBuf *dest)
{
	uint16_t *fillStart = (uint16_t*)dest->buf;
	uint16_t *fillEnd = (uint16_t*)dest->bufEnd;

	//pointer difference is the number of halfwords still free
	return jitEmitNopFillLen(dest, fillEnd - fillStart);
}

//Emits Rd = Rn + val (mod 2^32) for an arbitrary 32-bit val, without touching flags.
//	val is split into pieces that are each handed to emitLLaddImm/emitLLsubImm: either a 12-bit
//	low chunk, or the 8 most significant set bits of what remains. Whichever of "+val" or "-(-val)"
//	has fewer set bits is used, as that usually needs fewer pieces.
//	PC as Rd or Rn is refused with EmitErrNotEncodeable
//	isInIt is passed unchanged to every emitted instr
//	NOTE(review): more than one instr may be emitted; presumably callers that pass isInIt == true
//	  either guarantee a one-piece val or have sized their IT block accordingly - confirm
//	val == 0 emits a plain MOV Rd, Rn (or nothing at all if Rd == Rn)
enum EmitStatus jitEmitArbitraryImmAdd(struct EmitBuf *dest, uint32_t rdNo, uint32_t rnNo, uint32_t val, bool isInIt)
{
	uint32_t shift, nowVal;
	bool useSub;
	
	if (rdNo == EMIT_REG_NO_PC || rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	//a quick heuristic to see which is easier: add or sub
	//subtract only when the negated value has strictly fewer set bits than val itself
	useSub = jitPrvPopcount32(val) > jitPrvPopcount32(-val);
	
	if (useSub)
		val = -val;
	
	while (val) {
		
		if ((val & 0x0f00) && (val & 0x0ff)) 	//ADDW/SUBW makes sense
			nowVal = val & 0x0fff;
		else {
			
			//take the top 8 bits starting at the highest set bit (val is nonzero here, so clz is defined)
			shift = __builtin_clz(val);
			nowVal = ((val << shift) & 0xff000000) >> shift;
		}
		
		if (useSub) {
			EMIT(LLsubImm, rdNo, rnNo, nowVal, EmitLeaveFlags, isInIt);
		}
		else {
			EMIT(LLaddImm, rdNo, rnNo, nowVal, EmitLeaveFlags, isInIt);
		}
		
		//after the first piece the running value lives in Rd
		rnNo = rdNo;
		val -= nowVal;
	}
	
	//nothing was emitted (val was zero) and the regs differ: still need to get Rn into Rd
	if (rdNo != rnNo)
		EMIT(LLmov, rdNo, rnNo, EmitShiftLsl, 0, EmitLeaveFlags, isInIt);
	
	return EmitErrNone;
}

//DO NOT CALL this ever. emits precisely one instr!
//Rd is not SP or PC, Rn is not SP or PC, Rm is not SP or PC
static enum EmitStatus jitPrvAluOpRegShiftedImmSimple(struct EmitBuf *dest, enum JitArmDpOp op, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, enum EmitFlagSettingPrefs flagPrefs, bool isInIt)
{
	switch (op) {
		case ArmDpOpAnd:	return emitLLandReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpEor:	return emitLLeorReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpSub:	return emitLLsubReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpRsb:	return emitLLrsbReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpAdd:	return emitLLaddReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpAdc:	return emitLLadcReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpSbc:	return emitLLsbcReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpRsc:	return EmitErrNotEncodeable;
		case ArmDpOpTst:	return emitLLtstReg(dest, rnNo, rmNo, shiftType, shiftAmt);
		case ArmDpOpTeq:	return emitLLteqReg(dest, rnNo, rmNo, shiftType, shiftAmt);
		case ArmDpOpCmp:	return emitLLcmpReg(dest, rnNo, rmNo, shiftType, shiftAmt);
		case ArmDpOpCmn:	return emitLLcmnReg(dest, rnNo, rmNo, shiftType, shiftAmt);
		case ArmDpOpOrr:	return emitLLorrReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpMov:	return emitLLmov(dest, rdNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpBic:	return emitLLbicReg(dest, rdNo, rnNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		case ArmDpOpMvn:	return emitLLmvnReg(dest, rdNo, rmNo, shiftType, shiftAmt, flagPrefs, isInIt);
		default:			__builtin_unreachable();
	}
}

//DO NOT CALL this ever
//Rd is not SP or PC, Rn is not SP or PC, imm IS encodeable as needed for flag setting required
static enum EmitStatus jitPrvAluOpImmSimple(struct EmitBuf *dest, enum JitArmDpOp op, uint32_t rdNo, uint32_t rnNo, uint32_t valIn, uint32_t rotBy, bool s, bool isInIt)
{
	enum EmitFlagSettingPrefs flagPrefs = s ? EmitSetFlags : EmitLeaveFlags;
	uint32_t val = jitPrvRor(valIn, rotBy);
	
	switch (op) {
		case ArmDpOpAnd:	return emitLLandImm(dest, rdNo, rnNo, valIn, rotBy, flagPrefs);
		case ArmDpOpEor:	return emitLLeorImm(dest, rdNo, rnNo, valIn, rotBy, flagPrefs);
		case ArmDpOpSub:	return emitLLsubImm(dest, rdNo, rnNo, val, flagPrefs, isInIt);
		case ArmDpOpRsb:	return emitLLrsbImm(dest, rdNo, rnNo, val, flagPrefs, isInIt);
		case ArmDpOpAdd:	return emitLLaddImm(dest, rdNo, rnNo, val, flagPrefs, isInIt);
		case ArmDpOpAdc:	return emitLLadcImm(dest, rdNo, rnNo, val, flagPrefs, isInIt);
		case ArmDpOpSbc:	return emitLLsbcImm(dest, rdNo, rnNo, val, flagPrefs, isInIt);
		case ArmDpOpRsc:	return EmitErrNotEncodeable;
		case ArmDpOpTst:	return emitLLtstImm(dest, rnNo, valIn, rotBy);
		case ArmDpOpTeq:	return emitLLteqImm(dest, rnNo, valIn, rotBy);
		case ArmDpOpCmp:	return emitLLcmpImm(dest, rnNo, val);
		case ArmDpOpCmn:	return emitLLcmnImm(dest, rnNo, val);
		case ArmDpOpOrr:	return emitLLorrImm(dest, rdNo, rnNo, valIn, rotBy, flagPrefs);
		case ArmDpOpMov:	return emitLLmovImm(dest, rdNo, valIn, rotBy, flagPrefs, isInIt);
		case ArmDpOpBic:	return emitLLbicImm(dest, rdNo, rnNo, valIn, rotBy, flagPrefs);
		case ArmDpOpMvn:	return emitLLmvnImm(dest, rdNo, valIn, rotBy, flagPrefs, isInIt);
		default:			__builtin_unreachable();
	}
}

//true for the logical & move ops (MOV, MVN, AND, EOR, ORR, BIC, TST, TEQ), false for all others
static bool jitPrvDpInstrNeedsRealShifterCarryOut(enum JitArmDpOp op)
{
	//one bit per opcode, bit number is the opcode's enum value
	const uint16_t opsMask =
		(1UL << ArmDpOpAnd) | (1UL << ArmDpOpEor) | (1UL << ArmDpOpOrr) | (1UL << ArmDpOpBic) |
		(1UL << ArmDpOpTst) | (1UL << ArmDpOpTeq) | (1UL << ArmDpOpMov) | (1UL << ArmDpOpMvn);

	return ((opsMask >> op) & 1) != 0;
}

//false for the compare-type ops (TST, TEQ, CMP, CMN) which write no destination reg, true otherwise
static bool jitPrvDpInstrHasRd(enum JitArmDpOp op)
{
	//one bit per opcode, bit number is the opcode's enum value
	const uint16_t noRdMask =
		(1UL << ArmDpOpCmn) | (1UL << ArmDpOpCmp) | (1UL << ArmDpOpTeq) | (1UL << ArmDpOpTst);

	return ((noRdMask >> op) & 1) == 0;
}

//false for the move-type ops (MOV, MVN) which take no first source reg, true otherwise
static bool jitPrvDpInstrHasRn(enum JitArmDpOp op)
{
	//one bit per opcode, bit number is the opcode's enum value
	const uint16_t noRnMask = (1UL << ArmDpOpMvn) | (1UL << ArmDpOpMov);

	return ((noRnMask >> op) & 1) == 0;
}

//true for ADC, SBC and RSC, false otherwise
static bool jitPrvDpInstrConsumesCarry(enum JitArmDpOp op)	//these do not care about shifter carry out but DO need initial carry not clobbered
{
	//one bit per opcode, bit number is the opcode's enum value
	const uint16_t carryInMask = (1UL << ArmDpOpRsc) | (1UL << ArmDpOpSbc) | (1UL << ArmDpOpAdc);

	return ((carryInMask >> op) & 1) != 0;
}

//Tells whether the ARM immediate (valIn ROR rotBy) can be expressed by jitPrvEncodeThumbImm for this op.
//	a rotated encoding is insisted on only when all of these hold: flags are being set, the ARM
//	encoding had a nonzero rotation, and the op is one for which jitPrvDpInstrNeedsRealShifterCarryOut is true
//returns 1 if encodeable, 0 if not
static int32_t jitPrvDpImmEncodeable(enum JitArmDpOp op, uint32_t valIn, uint32_t rotBy, bool s)
{
	const bool mustRotate = s && rotBy && jitPrvDpInstrNeedsRealShifterCarryOut(op);
	const uint32_t finalVal = jitPrvRor(valIn, rotBy);

	//a negative return from the encoder means "no encoding exists"
	return jitPrvEncodeThumbImm(finalVal, mustRotate) >= 0;
}

//Pre-computes "pcVal <op> val" at translation time for the ops that need no flags or carry input.
//	supported: AND, EOR, SUB, RSB, ADD, ORR, BIC
//	on success the result is stored to *valOut (if valOut is not NULL) and true is returned
//	for any other op false is returned and *valOut is left alone
static bool jitPrvAluOpImmEvalPc(uint32_t pcVal, uint32_t val, enum JitArmDpOp op, uint32_t *valOut)
{
	uint32_t result;

	if (op == ArmDpOpAnd)
		result = pcVal & val;
	else if (op == ArmDpOpEor)
		result = pcVal ^ val;
	else if (op == ArmDpOpSub)
		result = pcVal - val;
	else if (op == ArmDpOpRsb)
		result = val - pcVal;
	else if (op == ArmDpOpAdd)
		result = pcVal + val;
	else if (op == ArmDpOpOrr)
		result = pcVal | val;
	else if (op == ArmDpOpBic)
		result = pcVal &~ val;
	else
		return false;

	if (valOut)
		*valOut = result;

	return true;
}

//eval things like "MOV Rx, PC, lsl #5" or "MVN Rx, PC, ror #1" if possible. keep in mind pre-evaluating this does not set flags!
//	only MOV and MVN are handled; any other op returns false
//	shiftAmt is the raw ARM immediate-shift field, so a zero amount means:
//		LSL #0 - no shift
//		LSR #0 - LSR #32, result is zero
//		ASR #0 - ASR #32, result is all copies of the sign bit
//		ROR #0 - RRX, which needs the carry flag and thus cannot be pre-evaluated (returns false)
//	for MVN the inversion is applied AFTER the shift, since MVN is NOT(shifter operand)
//	on success the result is stored to *valOut (if valOut is not NULL) and true is returned
static bool jitPrvAluOpRegShiftImmEvalPc(uint32_t pcVal, enum JitArmDpOp op, enum EmitShiftType shiftType, uint32_t shiftAmt, uint32_t *valOut)
{
	const int32_t spcVal = pcVal;	//signed view of the same bits, for ASR
	uint32_t shifted;
	
	if (op != ArmDpOpMov && op != ArmDpOpMvn)
		return false;
	
	//first produce the shifter operand
	switch (shiftType) {
		case EmitShiftLsl:
			shifted = pcVal << shiftAmt;
			break;
		
		case EmitShiftLsr:
			shifted = shiftAmt ? (pcVal >> shiftAmt) : 0;
			break;
		
		case EmitShiftAsr:
			shifted = shiftAmt ? (spcVal >> shiftAmt) : ((spcVal < 0) ? -1 : 0);
			break;
		
		case EmitShiftRor:
			if (!shiftAmt)
				return false;
			shifted = (pcVal >> shiftAmt) | (pcVal << (32 - shiftAmt));
			break;
		
		default:
			__builtin_unreachable();
	}
	
	//then apply the op itself to it
	if (op == ArmDpOpMvn)
		shifted = ~shifted;
	
	if (valOut)
		*valOut = shifted;
	
	return true;
}

//Gets the ARM immediate (valIn ROR rotBy) into regTo in preparation for a data-processing op.
//	if flags are wanted and the op is not one that consumes carry: MOV of the unrotated value,
//	followed by a flag-setting ROR when there is a rotation (else the MOV itself sets flags)
//	in every other case: a plain load of the final rotated value, allowed to corrupt N & Z only if "s"
static enum EmitStatus jitPrvEmitDpImm(struct EmitBuf *dest, enum JitArmDpOp op, uint32_t regTo, uint32_t valIn, uint32_t rotBy, bool s)
{
	if (!s || jitPrvDpInstrConsumesCarry(op)) {
		
		//LDR regTo, =final rotated value
		EMIT(HLloadImmToReg, regTo, jitPrvRor(valIn, rotBy), s, false, false);
		
		return EmitErrNone;
	}
	
	if (!rotBy) {
		
		//MOVS regTo, #valIn
		EMIT(LLmovImm, regTo, valIn, 0, EmitSetFlags, false);
		
		return EmitErrNone;
	}
	
	//MOV regTo, #valIn			//flags do not matter yet, the rotate below sets them
	EMIT(LLmovImm, regTo, valIn, 0, EmitFlagsDoNotCare, false);
	
	//RORS regTo, regTo, rotBy
	EMIT(LLmov, regTo, regTo, EmitShiftRor, rotBy, EmitSetFlags, false);
	
	return EmitErrNone;
}

//Emits a compare-type data-processing op (flags only, no Rd) of Rn against the ARM immediate (valIn ROR rotBy).
//	pcVal is the value PC reads as for this instr; (pcVal - 8) is what gets passed to jitPrvFindTempRegs as the instr address
//	four cases are handled, chosen by whether Rn needs replacing and whether the immediate is encodeable:
//	 1. Rn is PC/SP and imm is not encodeable: two temps (one for Rn's value, one for the imm)
//	 2. Rn is PC, or Rn is SP with TST/TEQ, imm encodeable: one temp for Rn's value
//	 3. Rn is usable as is, imm is not encodeable: one temp for the imm
//	 4. Rn is usable as is, imm encodeable: a single instr, placed in an IT block if conditional
//	cases 1-3 are bracketed by jitPrvHandleCcStart/jitPrvHandleCcEnd for condition "cc"
static enum EmitStatus jitPrvAluComparesImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t valIn, uint32_t rotBy)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rnNo), pushRegs = 0, nPushRegs;
	bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, true);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	
	if ((rnNo == EMIT_REG_NO_PC || rnNo == EMIT_REG_NO_SP) && !immEncodeable) {		//will need two regs
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, &tmpReg2, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
		}
		
		//this way is USUALLY shorter and faster
			
		//MOV tmpReg2, #valIn		//unrotated value; the rotation is applied by the op itself below
		EMIT(HLloadImmToReg, tmpReg2, valIn, true, false, false);
		
		//op tmpReg1, tmpReg2 {ROR rotBy} or {LSL 0}
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, tmpReg1, tmpReg2, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, EmitSetFlags, false);
		if (now != EmitErrNone)
			return now;

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (rnNo == EMIT_REG_NO_PC || (rnNo == EMIT_REG_NO_SP && (op == ArmDpOpTst || op == ArmDpOpTeq))) {		//imm guaranteed to be encodeable, but Rn is not allowed in T2 compares
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
		}
		
		//opS tmpReg1, #imm
		now = jitPrvAluOpImmSimple(dest, op, 0, tmpReg1, valIn, rotBy, true, false);
		if (now != EmitErrNone)
			return now;

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (!immEncodeable) {				//Rn guaranteed allowed to be used in compare instrs in T2
		
		//number of pushed regs is not needed here since SP's value is never reconstructed
		jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//MOVL tmpReg1, #valIn		//unrotated value; the rotation is applied by the op itself below
		EMIT(HLloadImmToReg, tmpReg1, valIn, true, false, false);
		
		//op Rn, tmpReg1 {ROR rotBy} or {LSL 0}
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, rnNo, tmpReg1, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, EmitSetFlags, false);
		if (now != EmitErrNone)
			return now;

		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {									//imm is encodeable, Rn is allowed
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//op Rn, #imm
		now = jitPrvAluOpImmSimple(dest, op, 0, rnNo, valIn, rotBy, true, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Emits a move-type data-processing op (MOV/MVN, optionally flag-setting) of the ARM immediate (valIn ROR rotBy) into Rd.
//	"val" below is the final value Rd ends up with (already inverted for MVN); it is used on the paths
//	that load the value directly instead of re-creating it with a rotate
//	five cases are handled:
//	 1. Rd is PC, or Rd is SP without flags: direct one-instr load of val (after flushing literal loads)
//	 2. Rd is SP with flags: value is produced in a pushed temp and then swapped into SP
//	 3. imm not encodeable, flags wanted: MOV of valIn followed by a flag-setting MOV/MVN with ROR
//	 4. imm not encodeable, no flags: direct one-instr load of val (after flushing literal loads)
//	 5. imm encodeable: a single instr, placed in an IT block if conditional
static enum EmitStatus jitPrvAluMovesImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t rdNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo), pushRegs = 0, popRegs = 0;
	bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, s);
	uint32_t val = jitPrvRor(valIn, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (op == ArmDpOpMvn)
		val = ~val;
	
	if (rdNo == EMIT_REG_NO_PC || (!s && rdNo == EMIT_REG_NO_SP)) {			//"s" guaranteed clear for Rd == PC
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//LDR Rd, =effective final imm
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, val, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else if (rdNo == EMIT_REG_NO_SP) {										//S is set
	
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//two temps: tmpReg1 gets the new SP value, tmpReg2 is handed to jitPrvEmitSwapSp as scratch
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg1;
		popRegs += 1 << tmpReg1;
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		popRegs += 1 << tmpReg2;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (immEncodeable) {
			
			//op tmpReg1, #imm
			now = jitPrvAluOpImmSimple(dest, op, tmpReg1, 0, valIn, rotBy, s, false);
			if (now != EmitErrNone)
				return now;
		}
		else if (s) {
			
			//MOV[s] tmpReg1, #valIn
			EMIT(LLmovImm, tmpReg1, valIn, 0, s ? (rotBy ? EmitFlagsDoNotCare : EmitSetFlags) : EmitLeaveFlags, false);
			
			if (op == ArmDpOpMov) {
				
				//MOV[s] tmpReg1, tmpReg1, ROR #rotBy
				EMIT(LLmov, tmpReg1, tmpReg1, EmitShiftRor, rotBy, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
			else {
				
				//MVN[s] tmpReg1, tmpReg1, ROR #rotBy
				EMIT(LLmvnReg, tmpReg1, tmpReg1, EmitShiftRor, rotBy, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
		}
		else {
	
			//LDR tmpReg1, =effective final imm
			EMIT(HLloadImmToReg, tmpReg1, val, false, false, false);
		}
		
		//swap into new SP
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, popRegs, 2);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (!immEncodeable && s) {				//Rd guaranteed allowed to be clobbered
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//MOV[s] Rd, #valIn
		EMIT(LLmovImm, rdNo, valIn, 0, rotBy ? EmitFlagsDoNotCare : EmitSetFlags, false);
		
		if (op == ArmDpOpMov) {
			
			//MOV[s] Rd, Rd, ROR #rotBy
			EMIT(LLmov, rdNo, rdNo, EmitShiftRor, rotBy, EmitSetFlags, false);
		}
		else {
			
			//MVN[s] Rd, Rd, ROR #rotBy
			EMIT(LLmvnReg, rdNo, rdNo, EmitShiftRor, rotBy, EmitSetFlags, false);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (!immEncodeable) {
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//LDR Rd, =effective final imm
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, val, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else {									//imm is encodeable, Rd is allowed
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//op Rd, #imm
		now = jitPrvAluOpImmSimple(dest, op, rdNo, 0, valIn, rotBy, s, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Emits the equivalent of "RSC[S] Rd, Rn, #imm" where imm is the ARM immediate (valIn ROR rotBy).
//	no direct emitter for RSC is used; the imm is loaded into a reg and "SBC reg, reg, Rn" is emitted
//	instead, which computes the same thing with the operands swapped
//	pcVal is the value PC reads as for this instr; (pcVal - 8) is what gets passed to jitPrvFindTempRegs as the instr address
//	four cases are handled:
//	 1. Rd is PC: result is stored into the topmost pushed stack slot and popped into PC
//	 2. Rd is SP: result is produced in a temp and then swapped into SP
//	 3. Rn is SP, PC, or the same reg as Rd: imm goes to a temp, Rd itself holds Rn's value
//	 4. otherwise: Rd is pre-loaded with the imm, no temps needed
//	the whole sequence is bracketed by jitPrvHandleCcStart/jitPrvHandleCcEnd for condition "cc"
static enum EmitStatus jitPrvAluRscImm(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rdNo, uint32_t rnNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo) | (1 << rnNo), pushRegs, popRegs, nPushRegs;
	uint32_t val = jitPrvRor(valIn, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	/*
		RSC Rx, PC, #imm is ALMOST pre-calculable (plus or minus one based on APSR.C).
		We could precalc the result and use an "ITE CS" to dispatch if S is clear, BUT
		this instr is so unlikely to occur in any sane code.
		
		Thus: it is not worth the effort, let it take the slow path
	*/
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdNo == EMIT_REG_NO_PC) {		//S bit guaranteed clear
		
		//second temp is only needed when Rn's value has to be materialized
		if (rnNo == EMIT_REG_NO_PC || rnNo == EMIT_REG_NO_SP)
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, &tmpReg2, NULL);
		else
			nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, NULL);
				
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg2, PC_VAL
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			rnNo = tmpReg2;
		} 
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg2, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			rnNo = tmpReg2;
		}
		
		//LDR tmpReg1, =imm
		EMIT(HLloadImmToReg, tmpReg1, val, false, false, false);
		
		//SBC tmpReg1, tmpReg1, Rn
		EMIT(LLsbcReg, tmpReg1, tmpReg1, rnNo, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//STR tmpReg1, [SP, #4 * (nPushRegs - 1)]	//where we'll pop it from
		EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
		
		//POP {..popRegs.., PC}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, false);
		if (now != EmitErrNone)
			return now;
	}
	else if (rdNo == EMIT_REG_NO_SP) {
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, false, &tmpReg1, &tmpReg2, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg2, PC_VAL
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			rnNo = tmpReg2;
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg2, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			rnNo = tmpReg2;
		}
		
		//MOVL[s] tmpReg1, #imm
		now = jitPrvEmitDpImm(dest, ArmDpOpRsc, tmpReg1, valIn, rotBy, s);
		if (now != EmitErrNone)
			return now;
		
		//SBC[s] tmpReg1, tmpReg1, Rn
		EMIT(LLsbcReg, tmpReg1, tmpReg1, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);

		//swap to new sp
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, popRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC || rnNo == rdNo) {	//we'll need a temp reg for sure, but Rd can be used as temp
		
		//keep the pushed-reg count: it is needed below to reconstruct the pre-push SP value
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//MOVL[s] tmpReg1, #imm
		now = jitPrvEmitDpImm(dest, ArmDpOpRsc, tmpReg1, valIn, rotBy, s);
		if (now != EmitErrNone)
			return now;
		
		//get Rn into Rd if it is not there yet
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR Rd, PC_VAL
			EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD Rd, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, rdNo, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		
		//SBC[s] Rd, tmpReg1, Rd
		EMIT(LLsbcReg, rdNo, tmpReg1, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else {	//Rd guaranteed safe to pre-clobber
	
		//MOVL[s] Rd, #imm
		now = jitPrvEmitDpImm(dest, ArmDpOpRsc, rdNo, valIn, rotBy, s);
		if (now != EmitErrNone)
			return now;
			
		//SBC[s] Rd, Rd, Rn
		EMIT(LLsbcReg, rdNo, rdNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emits "op PC, Rn, #imm" where imm is the ARM immediate (valIn ROR rotBy).
//	pcVal is the value PC reads as for this instr
//	if Rn is PC and the op can be evaluated at translation time, the final value is loaded into PC directly
//	otherwise the result is computed in a pushed temp, stored into the topmost pushed stack slot,
//	and a pop that includes PC then restores the temps and performs the jump
static enum EmitStatus jitPrvAluImmOpToPc(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t valIn, uint32_t rotBy)
{
	//3-op instr (not cmp-like or mov-like), not RSC, with PC as destination. S is guaranteed clear
	
	/*
		ADD PC, SP, #imm and SUB PC, SP, #imm can be optimized better than we
		do here, but no sane program would use those, so let them take the
		slow path
	*/
	
	uint32_t tmpReg1, tmpReg2, tmpReg3, usedRegs = (1 << EMIT_REG_NO_PC) | (1 << rnNo), pushRegs = 0, popRegs = 0, nPushRegs = 0;
	bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, false);
	uint32_t val = jitPrvRor(valIn, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//tmpReg1 is a real temp: pushed and later popped back
	tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
	nPushRegs++;
	pushRegs += 1 << tmpReg1;
	
	popRegs += 1 << tmpReg1;
	//tmpReg2 is pushed but (so far) not popped: its stack slot is what will be popped into PC
	tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
	nPushRegs++;
	pushRegs += 1 << tmpReg2;
	
	if (rnNo == EMIT_REG_NO_PC && jitPrvAluOpImmEvalPc(pcVal, val, op, &val)) {	//if we can pre-eval the entire answer, do so
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//LDR PC, =pre-evaluated result
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, EMIT_REG_NO_PC, val, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (!immEncodeable) {
			
			//tmpReg2 is now needed as a real temp (for the imm), so it gets popped back too,
			//and one more reg (tmpReg3) is pushed to provide the slot that will be popped into PC
			popRegs += 1 << tmpReg2;
			tmpReg3 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			nPushRegs++;
			pushRegs += 1 << tmpReg3;
		}
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
			rnNo = tmpReg1;
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			rnNo = tmpReg1;
		}
		
		if (!immEncodeable) {
			
			//this way is USUALLY shorter and faster
			
			//MOV tmpReg2, #valIn		//unrotated value; the rotation is applied by the op itself below
			EMIT(HLloadImmToReg, tmpReg2, valIn, false, false, false);
			
			//op tmpReg1, Rn, tmpReg2, ROR rotBy
			// OR
			//op tmpReg1, Rn, tmpReg2, LSL #0
			now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, rnNo, tmpReg2, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, EmitLeaveFlags, false);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			//op tmpReg1, Rn, #imm
			now = jitPrvAluOpImmSimple(dest, op, tmpReg1, rnNo, valIn, rotBy, false, false);
			if (now != EmitErrNone)
				return now;
		}
		
		//STR tmpReg1, [SP, #4 * (nPushRegs - 1)]	//where we'll pop it from
		EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
		
		//POP {..popRegs.., PC}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, (1 << EMIT_REG_NO_PC) + popRegs, false);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvAluImmOpToSp(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	//3-op instr (not cmp-like or mov-like), not RSC, with SP as destination
	
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << EMIT_REG_NO_SP) | (1 << rnNo), pushRegs = 0, nPushRegs = 0;
	bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, s);
	uint32_t val = jitPrvRor(valIn, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (rnNo == EMIT_REG_NO_PC && !s && jitPrvAluOpImmEvalPc(pcVal, val, op, &val)) {	//if we can pre-eval the entire answer, do so
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, EMIT_REG_NO_SP, val, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else if ((op == ArmDpOpAdd || op == ArmDpOpSub) && !s && rnNo == EMIT_REG_NO_SP && immEncodeable) {		//common stack adjstments
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		now = jitPrvAluOpImmSimple(dest, op, EMIT_REG_NO_SP, EMIT_REG_NO_SP, val, 0, false, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		nPushRegs++;
		pushRegs += 1 << tmpReg1;
		
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		nPushRegs++;
		pushRegs += 1 << tmpReg2;
			
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
			rnNo = tmpReg1;
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			rnNo = tmpReg1;
		}
		
		if (!immEncodeable) {
			
			//this way is USUALLY shorter and faster, and it does set ALL the flags right
			
			//MOV tmpReg2, #valIn
			EMIT(HLloadImmToReg, tmpReg2, valIn, s, false, false);
			
			//op tmpReg1, Rn, tmpReg1, ROR rotBy
			// OR
			//op tmpReg1, Rn, tmpReg2, LSL #0
			now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, rnNo, tmpReg2, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, s ? EmitSetFlags : EmitLeaveFlags, false);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			//op tmpReg1, Rn, #imm
			now = jitPrvAluOpImmSimple(dest, op, tmpReg1, rnNo, valIn, rotBy, s, false);
			if (now != EmitErrNone)
				return now;
		}
			
		//swap to new sp
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}
static enum EmitStatus jitPrvAluImmOpFromPcOrSp(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, bool fromPc, uint32_t pcVal, uint32_t rdNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, s);
	uint32_t tmpReg = rdNo ? 0 : 1, val = jitPrvRor(valIn, rotBy);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//guaranteed: Rd is not SP or PC, Rn *IS* SP or PC
	
	if ((op == ArmDpOpAdd || op == ArmDpOpSub) && !s && immEncodeable && !fromPc) {		//common stack adjstments
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		now = jitPrvAluOpImmSimple(dest, op, rdNo, EMIT_REG_NO_SP, val, 0, false, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else if (fromPc && !s && jitPrvAluOpImmEvalPc(pcVal, val, op, &val)) {	//if we can pre-eval the entire answer, do so
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, val, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else if (immEncodeable) {
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;

		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		if (fromPc) {
			
			//LDR Rd, PC_VAL
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, pcVal, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
		}
		else  {
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
		}
	
		//op Rd, Rd, #imm
		now = jitPrvAluOpImmSimple(dest, op, rdNo, rdNo, valIn, rotBy, s, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (fromPc) {
			
			//LDR Rd, PC_VAL
			EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		}
		else  {
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		}
	
		//PUSH {tmpReg}
		EMIT(HLpush, 1 << tmpReg);
		
		//MOV tmpReg, #valIn
		EMIT(HLloadImmToReg, tmpReg, valIn, s, false, false);
		
		//op Rd, Rd, tmpReg, ROR rotBy
		// OR
		//op Rd, Rd, tmpReg, LSL #0
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rdNo, tmpReg, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, s ? EmitSetFlags : EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
	
		//POP {tmpReg}
		EMIT(HLpop, 1 << tmpReg);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
	}
	
	return EmitErrNone;
}

//Translate a 3-operand immediate ALU instruction where neither Rd nor Rn is SP or PC.
//	rdNo, rnNo   - destination and first operand registers
//	valIn, rotBy - immediate operand and the right-rotation applied to it
//	s            - true if the instruction sets flags
//Returns EmitErrNone on success, otherwise the first error reported by a helper.
static enum EmitStatus jitPrvAluImmOpSimple(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t rdNo, uint32_t rnNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	const bool immEncodeable = jitPrvDpImmEncodeable(op, valIn, rotBy, s);
	const bool inIt = cc != EmitCcAl;
	const uint32_t scratch = (rdNo == 0) ? 1 : 0;	//a low reg guaranteed to differ from Rd
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//best case: the immediate encodes directly
	if (immEncodeable) {
		
		if (inIt) {
			EMIT(LLit, cc);
		}
		
		//op Rd, Rn, #imm
		return jitPrvAluOpImmSimple(dest, op, rdNo, rnNo, valIn, rotBy, s, inIt);
	}
	
	//Rd is distinct from Rn: Rd itself can hold the immediate
	if (rdNo != rnNo) {
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;
		
		if (inIt) {
			EMIT(LLitt, cc);
		}
		
		//MOV Rd, #valIn
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, valIn, inIt);
		if (now != EmitErrNone)
			return now;
		
		//op Rd, Rn, Rd, ROR rotBy
		// OR
		//op Rd, Rn, Rd, LSL #0
		return jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rdNo, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, s ? EmitSetFlags : EmitLeaveFlags, inIt);
	}
	
	//Rd == Rn: borrow a scratch register for the immediate
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//PUSH {scratch}
	EMIT(HLpush, 1 << scratch);
	
	//MOV scratch, #valIn
	EMIT(HLloadImmToReg, scratch, valIn, s, false, false);
	
	//op Rd, Rn, scratch, ROR rotBy
	// OR
	//op Rd, Rn, scratch, LSL #0
	now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, scratch, rotBy ? EmitShiftRor : EmitShiftLsl, rotBy, s ? EmitSetFlags : EmitLeaveFlags, false);
	if (now != EmitErrNone)
		return now;
	
	//POP {scratch}
	EMIT(HLpop, 1 << scratch);
	
	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

//A shift is "null" (leaves the operand unchanged) only when it is LSL by zero
static bool jitPrvIsNullShift(enum EmitShiftType shiftType, uint32_t shiftAmt)
{
	if (shiftType != EmitShiftLsl)
		return false;
	
	return !shiftAmt;
}

//Translate MOV/MVN Rd, Rm, <shiftType> #shiftAmt.
//	dest      - buffer receiving the emitted code
//	cc        - condition code of the instruction being translated
//	op        - ArmDpOpMov emits a MOV; any other value emits an MVN
//	pcVal     - value to substitute when Rm is PC
//	rdNo      - destination register (may be SP or PC)
//	rmNo      - source register (may be SP or PC)
//	shiftType, shiftAmt - immediate shift applied to Rm
//	s         - true if the instruction sets flags (callers guarantee it is clear when Rd is PC)
//Returns EmitErrNone on success, otherwise the first error reported by a helper.
static enum EmitStatus jitPrvAluMovesRegShiftImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rdNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t precalced;
	
	//fast path for PC source
	if (rmNo == EMIT_REG_NO_PC && !s && jitPrvAluOpRegShiftImmEvalPc(pcVal, op, shiftType, shiftAmt, &precalced)) {
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
			
		now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, precalced, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else if (rdNo == EMIT_REG_NO_PC) {		//S is clear. guaranteed
		
		if (jitPrvIsNullShift(shiftType, shiftAmt) && op == ArmDpOpMov && rmNo != EMIT_REG_NO_PC && rmNo != EMIT_REG_NO_SP){
			
			//plain "MOV PC, Rm": a jump through a register, done via callouts
			now = jitPrvLiteralLoadsFlush(dest, 1);
			if (now != EmitErrNone)
				return now;
			
			if (rmNo == EMIT_REG_NO_LR) {
				
				if (cc != EmitCcAl)
					EMIT(LLit, cc);
				
				now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvBxLrArmOnlyCallout);
				if (now != EmitErrNone)
					return now;
			}
			else {
				
				if (cc != EmitCcAl)
					EMIT(LLitt, cc);
				
				//PUSH {Rm}
				EMIT(HLpush, 1 << rmNo);
				
				//POP_noninterworking {pc}
				now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvPopPcArmOnlyCallout);
				if (now != EmitErrNone)
					return now;
			}
		}
		else {
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {R0, R1}
			//both must be pushed: R0 is our scratch reg (restored by the POP below) and
			//R1's stack slot at [SP, #4] is overwritten with the value that gets popped into PC
			EMIT(HLpush, 0x0003);
			
			if (rmNo == EMIT_REG_NO_PC) {
				
				//LDR r0, PC_VAL
				EMIT(HLloadImmToReg, 0, pcVal, false, false, false);
				rmNo = 0;
			}
			else if (rmNo == EMIT_REG_NO_SP) {
				
				//ADD r0, SP, #4 * 2		//SP_VAL before push (two regs were pushed)
				EMIT(LLaddImm, 0, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
				rmNo = 0;
			}
			
			if (op == ArmDpOpMov) {
				
				//MOV R0, Rm, shiftType #shiftAmt
				EMIT(LLmov, 0, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			}
			else {
				
				//MVN R0, Rm, shiftType #shiftAmt
				EMIT(LLmvnReg, 0, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			}
			
			//STR R0, [SP, #4]	//where we'll pop it from
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			
			//POP {R0, PC}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, (1 << EMIT_REG_NO_PC) | 0x0001, false);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (rdNo == EMIT_REG_NO_SP) {
		
		if (jitPrvIsNullShift(shiftType, shiftAmt) && op == ArmDpOpMov && !s && rmNo != EMIT_REG_NO_PC){
			
			if (cc != EmitCcAl)
				EMIT(LLit, cc);
			
			//MOV SP, Rm
			EMIT(LLmov, EMIT_REG_NO_SP, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
		}
		else {
			
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			//PUSH {R0, R1}
			EMIT(HLpush, 0x0003);
			
			if (rmNo == EMIT_REG_NO_PC) {
				
				//LDR r0, PC_VAL
				EMIT(HLloadImmToReg, 0, pcVal, false, false, false);
				rmNo = 0;
			}
			else if (rmNo == EMIT_REG_NO_SP) {
				
				//ADD r0, SP, #4 * 2		//SP_VAL before push (two regs were pushed)
				EMIT(LLaddImm, 0, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
				rmNo = 0;
			}
			
			if (op == ArmDpOpMov) {
				
				//MOV[s] R0, Rm, shiftType #shiftAmt
				EMIT(LLmov, 0, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
			else {
				
				//MVN[s] R0, Rm, shiftType #shiftAmt
				EMIT(LLmvnReg, 0, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
			
			//swap to new sp
			now = jitPrvEmitSwapSp(dest, 0, 1, 0x0003, 2);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (rmNo == EMIT_REG_NO_SP && !s && jitPrvIsNullShift(shiftType, shiftAmt) && op == ArmDpOpMov) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//fast path for simply moving SP somewhere
		//MOV Rd, Rm
		EMIT(LLmov, rdNo, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
	}
	else {		//we know dest is not SP or PC
		
		if (rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC) {		//math on complicated regs - clobber dest
			
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//NOTE(review): when Rm is PC this copies the translated code's PC rather than pcVal.
			//It is only reached if the PC fast path at the top was not taken (S set or pre-evaluation failed) - confirm that is acceptable
			EMIT(LLmov, rdNo, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
			rmNo = rdNo;
		}
		else if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		if (op == ArmDpOpMov) {
			
			//MOV Rd, Rm, shiftType #shiftAmt
			EMIT(LLmov, rdNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
		}
		else {
			
			//MVN Rd, Rm, shiftType #shiftAmt
			EMIT(LLmvnReg, rdNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
		}
	}
	
	return EmitErrNone;
}

enum EmitStatus jitEmitAluOpImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t valIn, uint32_t rotBy, bool s)
{
	uint32_t pcVal = instrAddr + 8;
	
	//we do not suport SPSR-reading instrs
	if (jitPrvDpInstrHasRd(op) && rdNo == EMIT_REG_NO_PC && s)
		return EmitErrNotEncodeable;
	
	//all test instrs have s set
	if (!jitPrvDpInstrHasRd(op) && !s)
		return EmitErrNotEncodeable;
	
	//convert things equivalent to MOV.reg into a MOV.reg
	if (!valIn && !s && (op == ArmDpOpEor || op == ArmDpOpOrr || op == ArmDpOpAdd || op == ArmDpOpBic || op == ArmDpOpSub))
		return jitPrvAluMovesRegShiftImm(dest, cc, ArmDpOpMov, pcVal, rdNo, rnNo, EmitShiftLsl, 0, false);
	
	//convert things equivalent to MOV.imm into a MOV.imm
	if (!valIn && !s && op == ArmDpOpAnd)
		op = ArmDpOpMov;
	
	if (!jitPrvDpInstrHasRd(op))
		return jitPrvAluComparesImm(dest, cc, op, pcVal, rnNo, valIn, rotBy);
	else if (!jitPrvDpInstrHasRn(op))
		return jitPrvAluMovesImm(dest, cc, op, rdNo, valIn, rotBy, s);
	else if (op == ArmDpOpRsc)
		return jitPrvAluRscImm(dest, cc, pcVal, rdNo, rnNo, valIn, rotBy, s);
	else if (rdNo == EMIT_REG_NO_PC)
		return jitPrvAluImmOpToPc(dest, cc, op, pcVal, rnNo, valIn, rotBy);
	else if (rdNo == EMIT_REG_NO_SP)
		return jitPrvAluImmOpToSp(dest, cc, op, pcVal, rnNo, valIn, rotBy, s);
	else if (rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC)
		return jitPrvAluImmOpFromPcOrSp(dest, cc, op, rnNo == EMIT_REG_NO_PC, pcVal, rdNo, valIn, rotBy, s);
	else
		return jitPrvAluImmOpSimple(dest, cc, op, rdNo, rnNo, valIn, rotBy, s);
	
	return EmitErrNotEncodeable;
}

static enum EmitStatus jitPrvAluComparesRegShiftImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rnNo) | (1 << rmNo), pushRegs = 0, nPushRegs = 0;
	bool complexRn = rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC;
	bool complexRm = rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (complexRn || complexRm) {
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (complexRn && complexRm && rnNo != rmNo) {			//we need both PC and SP (who compares SP and PC???)
			
			tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			pushRegs += 1 << tmpReg1;
			nPushRegs++;
			
			tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			pushRegs += 1 << tmpReg2;
			nPushRegs++;
		
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
			
			//ADD tmpReg2, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_PC) {
				
				rnNo = tmpReg1;
				rmNo = tmpReg2;
			}
			else {
				
				rnNo = tmpReg2;
				rmNo = tmpReg1;
			}
		}
		else if (complexRn || complexRm) {			//we need one: PC or SP
			
			tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			pushRegs += 1 << tmpReg1;
			nPushRegs++;
			
			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
				
				//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
				EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
				
				if (rnNo == EMIT_REG_NO_SP)
					rnNo = tmpReg1;
				if (rmNo == EMIT_REG_NO_SP)
					rmNo = tmpReg1;
			}
			else if (rnNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC) {
				
				//LDR tmpReg1, PC_VAL
				EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
				
				if (rnNo == EMIT_REG_NO_PC)
					rnNo = tmpReg1;
				if (rmNo == EMIT_REG_NO_PC)
					rmNo = tmpReg1;
			}
		}
		
		//op Rn, Rm, shiftType #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, rnNo, rmNo, shiftType, shiftAmt, EmitSetFlags, false);
		if (now != EmitErrNone)
			return now;
	
		if (pushRegs) {
			
			//POP {..pushRegs..}
			EMIT(HLpop, pushRegs);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//op Rn, Rm, shiftType #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, rnNo, rmNo, shiftType, shiftAmt, EmitSetFlags, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Translate "RSC[s] Rd, Rn, Rm, <shiftType> #shiftAmt".
//Every path below implements it as an SBC with the operands swapped (SBC x, <shifted Rm>, Rn);
//when a shift is present it is first applied with a separate MOV.
//	pcVal            - value to substitute for a PC operand
//	rdNo, rnNo, rmNo - destination and operand registers (any may be SP or PC)
//	s                - true if the instruction sets flags
//Returns EmitErrNone on success, otherwise the first error reported by a helper.
static enum EmitStatus jitPrvAluRscRegShiftImm(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	uint32_t tmpReg1, tmpReg2, tmpReg3 = 0, usedRegs = (1 << rnNo) | (1 << rdNo) | (1 << rmNo), pushRegs = 0, popRegs = 0, nPushRegs = 0, precalced;
	bool complexRn = rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC;
	bool complexRm = rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC;
	bool bothComplexs = complexRn && complexRm && (rnNo != rmNo);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//we could use pc precalculation here, but RSC makes no sense with PC so let it take the slow path
	
	//pre-pick two temps; only the Rd == PC/SP path pushes both, the other paths push at most tmpReg1
	tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
	pushRegs += 1 << tmpReg1;
	nPushRegs++;
	
	popRegs += 1 << tmpReg1;
	tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
	pushRegs += 1 << tmpReg2;
	nPushRegs++;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rdNo == EMIT_REG_NO_PC || rdNo == EMIT_REG_NO_SP) {
		
		//some cases require an extra reg
		if (bothComplexs || !jitPrvIsNullShift(shiftType, shiftAmt)) {
			
			popRegs += 1 << tmpReg2;
			tmpReg3 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			pushRegs += 1 << tmpReg3;
			nPushRegs++;
		}
		
		//for Rd == PC the last pushed slot is popped into PC instead of back into the reg it came from
		if (rdNo == EMIT_REG_NO_PC)
			popRegs += 1 << EMIT_REG_NO_PC;
		else
			popRegs = pushRegs;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//load complex values as needed
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
		}
		if (rnNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC) {
			
			//tmpReg1 already holds SP when both SP and PC are operands, so PC goes to tmpReg2 then
			uint32_t which = bothComplexs ? tmpReg2 : tmpReg1;
			
			//LDR which, PC_VAL
			EMIT(HLloadImmToReg, which, pcVal, false, false, false);
			
			if (rnNo == EMIT_REG_NO_PC)
				rnNo = which;
			if (rmNo == EMIT_REG_NO_PC)
				rmNo = which;
		}
		
		//if there is no shift, we can SBC directly
		if (jitPrvIsNullShift(shiftType, shiftAmt)) {
			
			//SBC[s] tmpReg1, Rm, Rn
			EMIT(LLsbcReg, tmpReg1, rmNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		}
		else {
			
			//with both complex, Rm already lives in a temp and may be shifted in place; otherwise shift into tmpReg2
			uint32_t clobberable = bothComplexs ? rmNo : tmpReg2;
			
			//MOV clobberable, Rm, shift #shiftAmt
			EMIT(LLmov, clobberable, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
			
			//SBC[s] tmpReg1, clobberable, Rn
			EMIT(LLsbcReg, tmpReg1, clobberable, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		}
		
		if (rdNo == EMIT_REG_NO_SP) {
			
			//swap to new sp (result is in tmpReg1)
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, nPushRegs);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			//STR tmpReg1, [SP, #4 * (nPushRegs - 1)]	//where we'll pop it from
			EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
			
			//POP {..popRegs..}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, false);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (bothComplexs) {	//we can clobber dst, BUT need a temp reg no matter what. we know Rn and Rm are {SP, PC} or {PC, SP}
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//LDR Rd, PC_VAL
		EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		if (rnNo == EMIT_REG_NO_PC)
			rnNo = rdNo;
		if (rmNo == EMIT_REG_NO_PC)
			rmNo = rdNo;
		
		//ADD tmpReg1, SP, #4		//SP_VAL before push
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = tmpReg1;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		
		if (!jitPrvIsNullShift(shiftType, shiftAmt)) {	//Rm is safe to clobber cause it is either tempReg or Rd
		
			//MOV Rm, Rm, shift #shiftAmt
			EMIT(LLmov, rmNo, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		}
		
		//SBC[s] Rd, Rm, Rn
		EMIT(LLsbcReg, rdNo, rmNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else if (complexRn && complexRm) {								//both complex BUT same reg
		
		//the shared operand value goes into Rd (SP is read before the push below moves it)
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR Rd, PC_VAL
			EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		}
		else {
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		}
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//MOV tmpReg1, Rd, shift #shiftAmt
		EMIT(LLmov, tmpReg1, rdNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//SBC[s] Rd, tmpReg1, Rd
		EMIT(LLsbcReg, rdNo, tmpReg1, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else if (complexRn && jitPrvIsNullShift(shiftType, shiftAmt)) {	//Rn is hard, Rm is easy, we can clobber Rd and it helps
		
		//NOTE(review): if rdNo == rmNo, loading Rn into Rd here overwrites Rm before the SBC reads it - confirm callers never pass that
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR Rd, PC_VAL
			EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		}
		else {
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		}
		
		//SBC[s] Rd, Rm, Rd
		EMIT(LLsbcReg, rdNo, rmNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
	}
	else if (complexRn) {	//we can clobber Rd, but it will not help since we need complex math on Rm
			
		//NOTE(review): same aliasing concern as above when rdNo == rmNo (Rd is written before Rm is shifted into tmpReg1)
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR Rd, PC_VAL
			EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
		}
		else {
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
		}
		
		//PUSH {tmpReg1}
		EMIT(HLpush, 1 << tmpReg1);
		
		//MOV tmpReg1, Rm, shift #shiftAmt
		EMIT(LLmov, tmpReg1, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		
		//SBC[s] Rd, tmpReg1, Rd
		EMIT(LLsbcReg, rdNo, tmpReg1, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {tmpReg1}
		EMIT(HLpop, 1 << tmpReg1);
	}
	else {	//we can clobber Rd for Rm math and it helps
		
		//NOTE(review): Rn is an ordinary register here; if rdNo == rnNo the writes to Rd below
		//overwrite Rn before the final SBC reads it - confirm callers never pass that or that it is handled elsewhere
		uint32_t from = rdNo;
			
		if (rmNo == EMIT_REG_NO_PC && jitPrvAluOpRegShiftImmEvalPc(pcVal, ArmDpOpMov, shiftType, shiftAmt, &precalced)) {
				
			//LDR Rd, PC_VAL, shift, #shiftAmt
			EMIT(HLloadImmToReg, rdNo, precalced, false, false, false);
		}
		else {
			
			if (rmNo == EMIT_REG_NO_PC) {
				
				//LDR Rd, PC_VAL
				EMIT(HLloadImmToReg, rdNo, pcVal, false, false, false);
			}
			else if (rmNo == EMIT_REG_NO_SP) {
				
				//MOV Rd, Rm
				EMIT(LLmov, rdNo, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, false);
			}
			else {
				
				from = rmNo;
			}

			//MOV Rd, from, shift #shiftAmt		//if needed, do the shift
			EMIT(LLmov, rdNo, from, shiftType, shiftAmt, EmitLeaveFlags, false);
		}
		
		//SBC[s] Rd, Rd, Rn
		EMIT(LLsbcReg, rdNo, rdNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Translate a 3-operand ALU instruction "op PC, Rn, Rm, <shiftType> #shiftAmt"
//(not mov/mvn/test/rsc, S clear): the result is computed into a temp register, stored
//into the highest slot of the pushed frame and then popped straight into PC.
//	pcVal      - value to substitute for a PC operand (the original instruction address is pcVal - 8)
//	rnNo, rmNo - operand registers (either may be SP or PC)
//Returns EmitErrNone on success, otherwise the first error reported by a helper.
static enum EmitStatus jitPrvAluRegShiftImmOpToPc(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt)
{
	//instr is 3-op (not mov/mvn/test/rsc), Rd is PC, S is clear
	
	uint32_t from, tmpReg1, tmpReg2, usedRegs = (1 << rnNo) | (1 << rmNo), precalced, nPushRegs = 0, pushRegs = 0, popRegs = 1 << EMIT_REG_NO_PC;
	bool complexRn = rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC;
	bool complexRm = rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (!complexRn) {
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, NULL);
		from = tmpReg1;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_PC && jitPrvAluOpRegShiftImmEvalPc(pcVal, ArmDpOpMov, shiftType, shiftAmt, &precalced)) {
				
			//LDR tmpReg1, PC_VAL, shift, #shiftAmt
			//the shift is already folded into the loaded value, so none is left to apply
			EMIT(HLloadImmToReg, tmpReg1, precalced, false, false, false);
			shiftType = EmitShiftLsl;
			shiftAmt = 0;
		}
		else if (rmNo == EMIT_REG_NO_PC) {
				
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else if (rmNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		else {
			
			from = rmNo;
		}
		
		//op tmpReg1, Rn, from, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, rnNo, from, shiftType, shiftAmt, EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
	}
	else if (!complexRm) {
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, NULL);
		from = tmpReg1;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
				
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push (a plain MOV here would read the already-moved SP)
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		else {
			
			from = rnNo;
		}
		
		//op tmpReg1, from, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, from, rmNo, shiftType, shiftAmt, EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
	}
	else if (rmNo == rnNo) {	//both complex BUT same reg
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
				
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		
		//op tmpReg1, tmpReg1, tmpReg1, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, tmpReg1, tmpReg1, shiftType, shiftAmt, EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
	}
	else {		//both Rn & Rm are complex, not same complex reg - we need an extra reg
		
		uint32_t fromN, fromM;
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, &popRegs, true, &tmpReg1, &tmpReg2, NULL);
		fromN = tmpReg1;	//Rn's value is placed in tmpReg1 below
		fromM = tmpReg2;	//Rm's value is placed in tmpReg2 below
		
		//PUSH {..pushRegs..}
		//needed here as on every other path: the temps get clobbered and the code after the branches stores into and pops this frame
		EMIT(HLpush, pushRegs);
		
		if (rmNo == EMIT_REG_NO_PC && jitPrvAluOpRegShiftImmEvalPc(pcVal, ArmDpOpMov, shiftType, shiftAmt, &precalced)) {
				
			//LDR tmpReg2, PC_VAL, shift, #shiftAmt
			EMIT(HLloadImmToReg, tmpReg2, precalced, false, false, false);
			shiftType = EmitShiftLsl;
			shiftAmt = 0;
		}
		else if (rmNo == EMIT_REG_NO_PC) {
				
			//LDR tmpReg2, PC_VAL
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
		}
		else if (rmNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg2, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		else {		//this will never happen, but no harm to keep the code here
			
			fromM = rmNo;
		}
		
		if (rnNo == EMIT_REG_NO_PC) {
				
			//LDR tmpReg1, PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, pcVal, false, false, false);
		}
		else if (rnNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		}
		else {		//also unreachable on this path (Rn is complex here)
			
			fromN = rnNo;
		}
		
		//op tmpReg1, fromN, fromM, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, fromN, fromM, shiftType, shiftAmt, EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
	}
	
	//STR tmpReg1, [SP, #4 * (nPushRegs - 1)]	//where we'll pop it from
	EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushRegs - 1), EmitSzWord, EmitAdrModeIndex);
	
	//POP {..popRegs..}
	now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, false);
	if (now != EmitErrNone)
		return now;

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvAluRegShiftImmOpToSp(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rnNo) | (1 << rmNo), pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//special shortcuts for allowable ops
	if (rnNo == EMIT_REG_NO_SP && shiftType == EmitShiftLsl && shiftAmt < 4 && rmNo != EMIT_REG_NO_SP && rmNo != EMIT_REG_NO_SP && (op == ArmDpOpAdd || op == ArmDpOpSub)) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, EMIT_REG_NO_SP, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		//all hard cases require jitPrvEmitSwapSp() and thus 2 regs
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg1;
		
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;	
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//load complex values as needed
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * 2, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
		}
		if (rnNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg2, PC_VAL
			EMIT(HLloadImmToReg, tmpReg2, pcVal, false, false, false);
			
			if (rnNo == EMIT_REG_NO_PC)
				rnNo = tmpReg2;
			if (rmNo == EMIT_REG_NO_PC)
				rmNo = tmpReg2;
		}
		
		//op tmpReg1, Rn, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, tmpReg1, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
		
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, 2);
		if (now != EmitErrNone)
			return now;
	
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Emit code for "op[s] Rd, Rn, Rm, shiftType #shiftAmt" for three-operand ops whose Rd is neither SP nor PC.
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute
//	op			- the ARM data-processing op
//	pcVal		- value that a read of PC must produce
//	rdNo		- destination reg (callers guarantee it is not SP or PC)
//	rnNo, rmNo	- source regs, either of which may be SP or PC (called "complex" regs below)
//	shiftType, shiftAmt	- immediate shift applied to Rm
//	s			- whether the op sets flags
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvAluRegShiftImmOpCommon(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	//Rd must be in usedRegs: when both sources are complex we keep the SP copy in Rd, so the temp we pick must never alias it
	uint32_t tmpReg, usedRegs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo), pushRegs, nPushRegs;
	bool complexRn = rnNo == EMIT_REG_NO_SP || rnNo == EMIT_REG_NO_PC;
	bool complexRm = rmNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_PC;
	bool bothComplexs = complexRn && complexRm && (rnNo != rmNo);
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//always: Rd is not SP or PC
	
	//special shortcuts for allowable ops on SP, and all things with noncomplex regs
	if ((!complexRn && !complexRm) || (rnNo == EMIT_REG_NO_SP && rmNo != EMIT_REG_NO_SP && rmNo != EMIT_REG_NO_PC && (op == ArmDpOpAdd || op == ArmDpOpSub))) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//op Rd, Rn, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	//special case when doing ADD Rx, Ry, SP.
	else if (rnNo != EMIT_REG_NO_SP && rnNo != EMIT_REG_NO_PC && rmNo == EMIT_REG_NO_SP && op == ArmDpOpAdd && shiftType == EmitShiftLsl && shiftAmt == 0) {
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		//add Rd, SP, Rn		//addition commutes, so put SP in the Rn slot
		EMIT(LLaddReg, rdNo, EMIT_REG_NO_SP, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
	}
	else if (bothComplexs) {			//one source is SP and the other is PC, so Rd is not an input and CAN be clobbered
		
		tmpReg = jitUtilPickLowestClearBit(usedRegs);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {tmpReg}
		EMIT(HLpush, 1 << tmpReg);
		
		//ADD Rd, SP, #4		//SP_VAL before push (exactly one reg was pushed)
		//this must land in Rd (not tmpReg) since Rd is what stands in for SP below and tmpReg is about to get PC_VAL
		EMIT(LLaddImm, rdNo, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
		if (rnNo == EMIT_REG_NO_SP)
			rnNo = rdNo;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = rdNo;
		
		//LDR tmpReg, PC_VAL
		EMIT(HLloadImmToReg, tmpReg, pcVal, false, false, false);
		if (rnNo == EMIT_REG_NO_PC)
			rnNo = tmpReg;
		if (rmNo == EMIT_REG_NO_PC)
			rmNo = tmpReg;
		
		//op Rd, Rn, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
		
		//POP {tmpReg}
		EMIT(HLpop, 1 << tmpReg);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (rdNo == rnNo || rdNo == rmNo) {	//one complex, but Rd not clobberable (it is also an input)
	
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
			
			//ADD tmpReg, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg, EMIT_REG_NO_SP, nPushRegs * sizeof(uint32_t), EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg;
		}
		else {
			
			//LDR tmpReg, PC_VAL
			EMIT(HLloadImmToReg, tmpReg, pcVal, false, false, false);
			
			if (rnNo == EMIT_REG_NO_PC)
				rnNo = tmpReg;
			if (rmNo == EMIT_REG_NO_PC)
				rmNo = tmpReg;
		}
		
		//op Rd, Rn, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else  {										//one complex reg and Rd not an input reg means we can use Rd as clobber
		
		now = jitPrvLiteralLoadsFlush(dest, 1);
		if (now != EmitErrNone)
			return now;
		
		//two instrs follow (fetch the complex value, then the op), hence ITT
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP) {
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = rdNo;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = rdNo;
		}
		else {
			
			//LDR Rd, PC_VAL
			now = jitPrvEmitLoadValueOneInstrIfInIt(dest, rdNo, pcVal, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
			
			if (rnNo == EMIT_REG_NO_PC)
				rnNo = rdNo;
			if (rmNo == EMIT_REG_NO_PC)
				rmNo = rdNo;
		}
		
		//op Rd, Rn, Rm, shiftType, #shiftAmt
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rmNo, shiftType, shiftAmt, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//Entry point for translating an ARM data-processing instr whose second operand is "Rm, shiftType #shiftAmt".
//Rejects the forms we do not translate, then hands the work to the helper for the instr class
// (compare / move / RSC) or, for the remaining three-operand ops, to the helper for the kind of destination (PC / SP / other).
//returns EmitErrNotEncodeable for rejected instrs, else whatever the chosen helper returns
enum EmitStatus jitEmitAluOpRegShiftImm(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, enum EmitShiftType shiftType, uint32_t shiftAmt, bool s)
{
	const uint32_t pcVal = instrAddr + 8;		//what the helpers use as the value of a PC read
	const bool hasRd = jitPrvDpInstrHasRd(op);

	if (!hasRd) {

		//test instrs always have s set, anything else is not a valid instr
		if (!s)
			return EmitErrNotEncodeable;

		return jitPrvAluComparesRegShiftImm(dest, cc, op, pcVal, rnNo, rmNo, shiftType, shiftAmt);
	}

	//flag-setting write to PC is an SPSR-reading instr, which we do not support
	if (s && rdNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;

	//moves have no Rn
	if (!jitPrvDpInstrHasRn(op))
		return jitPrvAluMovesRegShiftImm(dest, cc, op, pcVal, rdNo, rmNo, shiftType, shiftAmt, s);

	//RSC has its own helper
	if (op == ArmDpOpRsc)
		return jitPrvAluRscRegShiftImm(dest, cc, pcVal, rdNo, rnNo, rmNo, shiftType, shiftAmt, s);

	//everything else is split by destination
	if (rdNo == EMIT_REG_NO_PC)
		return jitPrvAluRegShiftImmOpToPc(dest, cc, op, pcVal, rnNo, rmNo, shiftType, shiftAmt);

	if (rdNo == EMIT_REG_NO_SP)
		return jitPrvAluRegShiftImmOpToSp(dest, cc, op, pcVal, rnNo, rmNo, shiftType, shiftAmt, s);

	return jitPrvAluRegShiftImmOpCommon(dest, cc, op, pcVal, rdNo, rnNo, rmNo, shiftType, shiftAmt, s);
}

//Emit code for "MOV[s]/MVN[s] Rd, Rm, shiftType Rs" (a move whose operand is shifted by a reg).
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute
//	op			- ArmDpOpMov or ArmDpOpMvn (MVN is emitted as the shift followed by a MVN of the result)
//	pcVal		- not used by this function
//	rdNo		- destination reg, may be SP
//	rmNo, rsNo	- value to shift and shift amount regs, either may be SP
//	s			- whether the op sets flags
//The caller has already rejected PC in any reg slot.
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvAluMovesRegShiftReg(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rdNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo) | (1 << rmNo) | (1 << rsNo), nPushRegs = 0, pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (rdNo != EMIT_REG_NO_SP && rmNo != EMIT_REG_NO_SP && rsNo != EMIT_REG_NO_SP) {
		
		if (!s || op == ArmDpOpMov) {	//else we'll corrupt flags in IT halfway through
		
			if (cc != EmitCcAl) {
				if (op == ArmDpOpMvn)
					EMIT(LLitt, cc);		//shift + MVN
				else
					EMIT(LLit, cc);			//shift only
			}
			
			//shiftop[s] Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
			
			if (op == ArmDpOpMvn) {
						
				//MVN[s] Rd, Rd
				EMIT(LLmvnReg, rdNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
			}
		}
		else {
			
			//flag-setting MVN: two flag-setting instrs cannot share an IT block, so branch around them instead
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
				
			//shiftop[s] Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, s ? EmitSetFlags : EmitLeaveFlags, false);
			
			if (op == ArmDpOpMvn) {
						
				//MVN[s] Rd, Rd
				EMIT(LLmvnReg, rdNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else {
		
		//SP is involved somewhere. pick a temp that is none of the instr's regs
		tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg1;
		nPushRegs++;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (rdNo == EMIT_REG_NO_SP) {
			
			//writing SP goes via jitPrvEmitSwapSp() which needs a second temp
			tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
			pushRegs += 1 << tmpReg2;
			nPushRegs++;

			//PUSH {..pushRegs..}
			EMIT(HLpush, pushRegs);

			//get SP if needed as a reg
			if (rmNo == EMIT_REG_NO_SP || rsNo == EMIT_REG_NO_SP) {
				
				//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
				EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
				
				if (rmNo == EMIT_REG_NO_SP)
					rmNo = tmpReg1;
				if (rsNo == EMIT_REG_NO_SP)
					rsNo = tmpReg1;
			}
			
			//shiftop[s] tmpReg1, Rm, Rs
			EMIT(LLshiftByReg, tmpReg1, rmNo, rsNo, shiftType, s ? EmitSetFlags : EmitLeaveFlags, false);
			
			if (op == ArmDpOpMvn) {	//must be done here, on the temp, before it becomes SP. the MVN at the end of the func is skipped for this case
				
				//MVN[s] tmpReg1, tmpReg1
				EMIT(LLmvnReg, tmpReg1, tmpReg1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
			}
			
			//swap to new SP
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, nPushRegs);
			if (now != EmitErrNone)
				return now;
		}
		else if (rdNo != rmNo && rdNo != rsNo) {		//Rm and/or Rs are SP, we can clobber Rd
			
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
			
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = rdNo;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = rdNo;
			
			//shiftop[s] Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, s ? EmitSetFlags : EmitLeaveFlags, false);
		}
		else {	//Rm and/or Rs are SP, Rd is NOT clobberable
			
			//PUSH {tmpReg1}
			EMIT(HLpush, 1 << tmpReg1);
			
			//ADD tmpReg1, SP, #4		//SP_VAL before push (exactly one reg was pushed)
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
			
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = tmpReg1;
			
			//shiftop[s] Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, s ? EmitSetFlags : EmitLeaveFlags, false);
			
			//POP {tmpReg1}
			EMIT(HLpop, 1 << tmpReg1);
		}
		
		//The SP-destination path already inverted its result before swapping it into SP.
		//Doing it again here would emit a MVN on SP itself and undo (or fail to encode) the op, so only the Rd != SP paths get this.
		if (op == ArmDpOpMvn && rdNo != EMIT_REG_NO_SP) {
					
			//MVN[s] Rd, Rd
			EMIT(LLmvnReg, rdNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		}
	
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvAluComparesRegShiftReg(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rnNo) | (1 << rmNo) | (1 << rsNo), nPushRegs = 0, pushRegs = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//in these cases we can get away with one temp reg
	if ((op == ArmDpOpCmp || op == ArmDpOpCmn || rnNo != EMIT_REG_NO_SP) && rmNo != EMIT_REG_NO_SP && rsNo != EMIT_REG_NO_SP) {
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);

		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//shiftop[s] tmpReg1, Rm, Rs	//cmp/cmn set all flags so we can corrupt them at will
		EMIT(LLshiftByReg, tmpReg1, rmNo, rsNo, shiftType, EmitSetFlags, false);
		
		//cmp_op Rn, tmpReg1
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, rnNo, tmpReg1, EmitShiftLsl, 0, EmitSetFlags, false);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else {		//in other cases we need two temp regs
		
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, &tmpReg2, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP || rsNo == EMIT_REG_NO_SP) {
		
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = tmpReg1;
		}
		
		//shiftopS tmpReg2, Rm, Rs
		EMIT(LLshiftByReg, tmpReg2, rmNo, rsNo, shiftType, EmitSetFlags, false);
		
		//cmp_op Rn, tmpReg2
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, 0, rnNo, tmpReg2, EmitShiftLsl, 0, EmitSetFlags, false);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit code for "RSC[s] Rd, Rn, Rm, shiftType Rs".
//The op is emitted as the shift into a reg followed by an SBC with the operands swapped (SBC dst, shifted, Rn).
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute (always handled via jitPrvHandleCcStart/End here)
//	pcVal		- not used by this function
//	rdNo		- destination reg, may be SP
//	rnNo, rmNo, rsNo	- source regs, any may be SP
//	s			- whether the SBC sets flags (the shift itself never does)
//Four cases are handled, picked by where SP appears and by whether Rd may be clobbered before all inputs are read.
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvAluRscRegShiftReg(struct EmitBuf *dest, enum EmitCc cc, uint32_t pcVal, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rnNo) | (1 << rdNo) | (1 << rmNo) | (1 << rsNo), nPushRegs = 0, pushRegs = 0;
	bool haveSpSource = rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP || rsNo == EMIT_REG_NO_SP;
	struct EmitBuf ccSkip;
	enum EmitStatus now;


	//first temp is picked up front (it is none of the instr's regs). the last case below ends up not using it
	tmpReg1 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
	pushRegs += 1 << tmpReg1;
	nPushRegs++;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//Rd is SP: compute into temps, then swap SP
	if (rdNo == EMIT_REG_NO_SP) {
		
		//second temp is needed for the SP swap
		tmpReg2 = jitUtilPickLowestClearBit(usedRegs + pushRegs);
		pushRegs += 1 << tmpReg2;
		nPushRegs++;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//get SP if needed and sub it in
		if (haveSpSource) {
			
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = tmpReg1;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = tmpReg1;
		}
		
		//shiftop tmpReg2, Rm, Rs		//flags are left alone
		EMIT(LLshiftByReg, tmpReg2, rmNo, rsNo, shiftType, EmitLeaveFlags, false);
		
		//SBC[s] tmpReg1, tmpReg2, Rn
		EMIT(LLsbcReg, tmpReg1, tmpReg2, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//swap to new SP
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, pushRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (rnNo == EMIT_REG_NO_SP) {		//we know Rn != Rd in here cause we know Rd is not SP (that case was handled above)
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//get SP
		
		//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
		EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
		
		rnNo = tmpReg1;
		if (rmNo == EMIT_REG_NO_SP)
			rmNo = tmpReg1;
		if (rsNo == EMIT_REG_NO_SP)
			rsNo = tmpReg1;
	
		//shiftop Rd, Rm, Rs		//flags are left alone
		EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, EmitLeaveFlags, false);
		
		//SBC[s] Rd, Rd, Rn
		EMIT(LLsbcReg, rdNo, rdNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else if (rdNo == rnNo || (haveSpSource && (rdNo == rmNo || rdNo == rsNo))) {				//Rn == Rd != SP  OR we need SP as a source
	
		//Rd cannot hold an intermediate value here, so the shift result goes to tmpReg1
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (haveSpSource) {
		
			//ADD tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			//Rn cannot be SP here (handled above), so only Rm/Rs need substituting
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = tmpReg1;
		}
		
		//shiftop tmpReg1, Rm, Rs		//flags are left alone
		EMIT(LLshiftByReg, tmpReg1, rmNo, rsNo, shiftType, EmitLeaveFlags, false);
		
		//SBC[s] Rd, tmpReg1, Rn
		EMIT(LLsbcReg, rdNo, tmpReg1, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	else {									//Rn != SP && Rd != SP && Rd != Rn  and either no SP at all or Rd != Rm, Rd != Rs
		
		//Rd is free to hold intermediate values, nothing is pushed
		
		if (haveSpSource) {
		
			//MOV Rd, SP
			EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
			
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = rdNo;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = rdNo;
		}
		
		//shiftop Rd, Rm, Rs		//flags are left alone
		EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, EmitLeaveFlags, false);
		
		//SBC[s] Rd, Rd, Rn
		EMIT(LLsbcReg, rdNo, rdNo, rnNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit code for "op[s] SP, Rn, Rm, shiftType Rs" for three-operand ops (destination is SP, shift amount is in a reg).
//Two regs that are not SP and not inputs of the instr get pushed: one receives the pre-push SP value whenever SP is an input,
// the other receives the shifted operand and then the result of the op, which is finally handed to jitPrvEmitSwapSp().
//	pcVal is not used by this function
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvAluRegShiftRegOpToSp(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	const uint32_t busyRegs = (1 << EMIT_REG_NO_SP) | (1 << rnNo) | (1 << rmNo) | (1 << rsNo);
	const bool readsSp = rnNo == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP || rsNo == EMIT_REG_NO_SP;
	const uint32_t numSaved = 2;
	uint32_t spCopyReg, resultReg, savedMask;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//choose the two scratch regs
	spCopyReg = jitUtilPickLowestClearBit(busyRegs);
	savedMask = 1 << spCopyReg;
	resultReg = jitUtilPickLowestClearBit(busyRegs + savedMask);
	savedMask += 1 << resultReg;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	//PUSH {spCopyReg, resultReg}
	EMIT(HLpush, savedMask);

	if (readsSp) {

		//ADD spCopyReg, SP, #4 * numSaved		//value SP had before the push
		EMIT(LLaddImm, spCopyReg, EMIT_REG_NO_SP, sizeof(uint32_t) * numSaved, EmitLeaveFlags, false);

		//every SP input now reads the copy instead
		rnNo = (rnNo == EMIT_REG_NO_SP) ? spCopyReg : rnNo;
		rmNo = (rmNo == EMIT_REG_NO_SP) ? spCopyReg : rmNo;
		rsNo = (rsNo == EMIT_REG_NO_SP) ? spCopyReg : rsNo;
	}

	//shiftop[s] resultReg, Rm, Rs			//sets flags only if the op needs the shifter's carry out and s is set
	EMIT(LLshiftByReg, resultReg, rmNo, rsNo, shiftType, (jitPrvDpInstrNeedsRealShifterCarryOut(op) && s) ? EmitSetFlags : EmitLeaveFlags, false);

	//op[s] resultReg, Rn, resultReg
	now = jitPrvAluOpRegShiftedImmSimple(dest, op, resultReg, rnNo, resultReg, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
	if (now != EmitErrNone)
		return now;

	//make resultReg the new SP
	now = jitPrvEmitSwapSp(dest, resultReg, spCopyReg, savedMask, numSaved);
	if (now != EmitErrNone)
		return now;

	return jitPrvHandleCcEnd(dest, &ccSkip, cc);
}

//Emit code for "op[s] Rd, Rn, Rm, shiftType Rs" for three-operand ops.
//Rd is not SP, none of the regs are PC, op is a 3-register op (not a move or a compare)
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute
//	pcVal		- value of a PC read; only used here (minus 8) as the instr address for jitPrvFindTempRegs
//	rnNo, rmNo, rsNo	- left operand, value to shift, and shift amount regs; any may be SP
//	s			- whether the op sets flags
//The shifted operand is computed first (into Rd when Rd may be clobbered, else into a temp), then the op is done on it.
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvAluRegShiftRegOpCommon(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t pcVal, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	uint32_t tmpReg1, tmpReg2, usedRegs = (1 << rdNo) | (1 << rnNo) | (1 << rmNo) | (1 << rsNo), nPushRegs = 0, pushRegs = 0;
	bool spSourceRhs = rmNo == EMIT_REG_NO_SP || rsNo == EMIT_REG_NO_SP, spSourceAny = spSourceRhs || rnNo == EMIT_REG_NO_SP;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	if (rnNo != rdNo && (rnNo != EMIT_REG_NO_SP || op == ArmDpOpAdd || op == ArmDpOpSub) && (!spSourceRhs || (rdNo != rmNo && rdNo != rsNo))) {
		
		//we can clobber Rd and use no extra regs
		
		//IT is ok with "s" since op is last instr inside the IT block. EXCEPT if the shift affects us
		if (!jitPrvDpInstrNeedsRealShifterCarryOut(op) || !s) {
		
			if (cc != EmitCcAl) {	
				if (spSourceRhs)
					EMIT(LLittt, cc);		//mov + shift + op
				else
					EMIT(LLitt, cc);		//shift + op
			}
			
			if (spSourceRhs) {
				
				//MOV Rd, SP
				EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, cc != EmitCcAl);
				
				if (rmNo == EMIT_REG_NO_SP)
					rmNo = rdNo;
				if (rsNo == EMIT_REG_NO_SP)
					rsNo = rdNo;
			}
			
			//shiftop Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, EmitLeaveFlags, cc != EmitCcAl);
			
			//op[s] Rd, Rn, Rd
			now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, cc != EmitCcAl);
			if (now != EmitErrNone)
				return now;
		}
		else {
			
			//the shift must set flags before the op does, so an IT block will not do. branch around instead
			now = jitPrvHandleCcStart(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
			
			if (spSourceRhs) {
				
				//MOV Rd, SP
				EMIT(LLmov, rdNo, EMIT_REG_NO_SP, EmitShiftLsl, 0, EmitLeaveFlags, false);
				
				if (rmNo == EMIT_REG_NO_SP)
					rmNo = rdNo;
				if (rsNo == EMIT_REG_NO_SP)
					rsNo = rdNo;
			}
			
			//shiftop[s] Rd, Rm, Rs
			EMIT(LLshiftByReg, rdNo, rmNo, rsNo, shiftType, (jitPrvDpInstrNeedsRealShifterCarryOut(op) && s) ? EmitSetFlags : EmitLeaveFlags, false);
			
			//op[s] Rd, Rn, Rd
			now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, rdNo, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
			if (now != EmitErrNone)
				return now;
		}
	}
	else {
		
		//Rd cannot take the shift result (it is still needed as an input, or Rn is an SP we cannot operate on directly), so use a temp
		nPushRegs = jitPrvFindTempRegs(pcVal - 8, usedRegs, &pushRegs, NULL, false, &tmpReg1, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (spSourceRhs) {
			
			//add tmpReg1, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = tmpReg1;
			if (rsNo == EMIT_REG_NO_SP)
				rsNo = tmpReg1;
		}
		
		//shiftop[s] tmpReg1, Rm, Rs
		EMIT(LLshiftByReg, tmpReg1, rmNo, rsNo, shiftType, (jitPrvDpInstrNeedsRealShifterCarryOut(op) && s) ? EmitSetFlags : EmitLeaveFlags, false);
	
		//Rn == SP: the live SP cannot be used as an operand here. Most ops cannot take SP at all, and even ADD/SUB
		// (which can) would read an SP that the PUSH above has already moved by 4 * nPushRegs.
		//Rd is never Rn in this case (Rd is not SP) and Rm/Rs have already been consumed by the shift, so Rd is free to hold the pre-push SP.
		if (rnNo == EMIT_REG_NO_SP) {
			
			//add Rd, SP, #4 * nPushRegs		//SP_VAL before push
			EMIT(LLaddImm, rdNo, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushRegs, EmitLeaveFlags, false);
			rnNo = rdNo;
		}
	
		//op[s] Rd, Rn, tmpReg1
		now = jitPrvAluOpRegShiftedImmSimple(dest, op, rdNo, rnNo, tmpReg1, EmitShiftLsl, 0, s ? EmitSetFlags : EmitLeaveFlags, false);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

//Entry point for translating an ARM data-processing instr whose second operand is "Rm, shiftType Rs" (shift amount in a reg).
//Rejects the forms we do not translate, then hands the work to the helper for the instr class
// (compare / move / RSC) or, for the remaining three-operand ops, to the helper for the kind of destination (SP / other).
//returns EmitErrNotEncodeable for rejected instrs, else whatever the chosen helper returns
enum EmitStatus jitEmitAluOpRegShiftReg(struct EmitBuf *dest, enum EmitCc cc, enum JitArmDpOp op, uint32_t instrAddr, uint32_t rdNo, uint32_t rnNo, uint32_t rmNo, uint32_t rsNo, enum EmitShiftType shiftType, bool s)
{
	const uint32_t pcVal = instrAddr + 8;
	const bool anyRegIsPc = rdNo == EMIT_REG_NO_PC || rmNo == EMIT_REG_NO_PC || rnNo == EMIT_REG_NO_PC || rsNo == EMIT_REG_NO_PC;

	//in this mode PC is NEVER allowed as Rd, Rn, Rm, or Rs
	if (anyRegIsPc)
		return EmitErrNotEncodeable;

	if (!jitPrvDpInstrHasRd(op)) {

		//test instrs always have s set, anything else is not a valid instr
		if (!s)
			return EmitErrNotEncodeable;

		return jitPrvAluComparesRegShiftReg(dest, cc, op, pcVal, rnNo, rmNo, rsNo, shiftType);
	}

	//moves have no Rn
	if (!jitPrvDpInstrHasRn(op))
		return jitPrvAluMovesRegShiftReg(dest, cc, op, pcVal, rdNo, rmNo, rsNo, shiftType, s);

	//RSC has its own helper
	if (op == ArmDpOpRsc)
		return jitPrvAluRscRegShiftReg(dest, cc, pcVal, rdNo, rnNo, rmNo, rsNo, shiftType, s);

	//everything else is split by destination
	if (rdNo == EMIT_REG_NO_SP)
		return jitPrvAluRegShiftRegOpToSp(dest, cc, op, pcVal, rnNo, rmNo, rsNo, shiftType, s);

	return jitPrvAluRegShiftRegOpCommon(dest, cc, op, pcVal, rdNo, rnNo, rmNo, rsNo, shiftType, s);
}

//////////////////////////////////////////////////////// START LDMIA ////////////////////////////////////////////////////////


//Emit code for an LDMIA whose base reg is SP.
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute
//	instrAddr	- not used by this function
//	regsMask	- bitmask of regs to load (bit N = reg N)
//	wbak		- whether the instr writes the final address back to SP
//Three situations are told apart: SP itself is in the list, a plain LDMIA suffices (LR and PC are not both in the list),
// or both LR and PC are in the list.
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvLdmiaToSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t nRegs;
	
	if (regsMask & spMask) {		//we have SP in the list (we thus know that no wbak is being used, not that it matters since we're replacing SP)
		
		uint32_t regsBelowSp = regsMask & (spMask - 1);
		nRegs = jitPrvPopcount16(regsBelowSp);		//computed, but not read again in this branch
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//SP is being loaded, so wbak-increment as we load to make our life easier later
		//after this, SP points at the stored SP value, with LR and PC values (if in the list) right after it
		EMIT(HLldmia, EMIT_REG_NO_SP, regsBelowSp, true);
		
		//if no pc, we do not need anything fancy
		if (!(regsMask & pcMask)) {
			
			if (regsMask & lrMask) {
				
				//LDR lr, [SP, #4]							//load LR
				EMIT(LLloadImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			}
			
			//LDR sp, [SP, #0]								//new sp
			EMIT(LLloadImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, 0, EmitSzWord, false, EmitAdrModeIndex);
		}
		else {
			
			//PUSH {r0, r1}
			EMIT(HLpush, 0x0003);
			
			if (regsMask & lrMask) {		//LR is in the list too
				
				//LDRD r0, lr, [SP, #8] 					//load SP (into r0) and LR
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitAdrModeIndex);
			}
			else {							//LR is not in the list
				
				//LDR r0, [SP, #8]							//load SP into r0
				EMIT(LLloadImm, 0, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			}
			
			//swap to new SP, possibly popping PC
			//PC is always in the list in this branch, so the mask passed is always {r0, r1, pc} and the count always 3
			//NOTE(review): nothing visible here puts the PC value to be loaded into the third stack slot ([SP, #8] holds the new SP value
			// at this point), whereas jitPrvLdmiaToLr stores it there explicitly before its swap - confirm against jitPrvEmitSwapSp()
			now = jitPrvEmitSwapSp(dest, 0, 1, 0x0003 | (regsMask & pcMask), (regsMask & pcMask) ? 3 : 2);
			if (now != EmitErrNone)
				return now;
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else if (!(regsMask & pcMask) || !(regsMask & lrMask)) {			//we can use a normal LDMIA
		
		if ((regsMask & pcMask) && wbak) {								//can we use a pop pc callout?
			
			now = jitPrvEmitPopWithOpts(dest, cc, regsMask, true);
			if (now != EmitErrNone)
				return now;
		}
		else {
		
			if (cc != EmitCcAl)
				EMIT(LLit, cc);
			
			//LDMIA {regsMask}
			EMIT(HLldmia, EMIT_REG_NO_SP, regsMask, wbak);
		}
	}
	else {																//LR & PC, both in the list
		
		if (wbak) {														//wbak is on
			
			now = jitPrvLiteralLoadsFlush(dest, 1);
			if (now != EmitErrNone)
				return now;
			
			//two instrs follow (the LDMIA and the jump), hence ITT
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//LDMIA {regsMask minus PC}
			EMIT(HLldmia, EMIT_REG_NO_SP, regsMask &~ pcMask, wbak);
		
			//POP {pc} via callout
			now = jitPrvEmitJumpOneInstr(dest, (uintptr_t)&jitPrvPopPcCallout);
			if (now != EmitErrNone)
				return now;
		}
		else {															//wbak is off
			
			nRegs = jitPrvPopcount16(regsMask &~ pcMask);				//find the offset of where to load PC from
			
			//two instrs follow (the LDMIA and the PC load), hence ITT
			if (cc != EmitCcAl)
				EMIT(LLitt, cc);
			
			//LDMIA {regsMask minus PC}
			EMIT(HLldmia, EMIT_REG_NO_SP, regsMask &~ pcMask, wbak);
		
			//LDR PC, [SP, #ofst_to_pc]	//no callout helps us here
			EMIT(LLloadImm, EMIT_REG_NO_PC, EMIT_REG_NO_SP, sizeof(uint32_t) * nRegs, EmitSzWord, false, EmitAdrModeIndex);
		}
	}
	
	return EmitErrNone;
}

//Emit code for an LDMIA whose base reg is LR.
//	dest		- buffer being emitted into
//	cc			- condition under which the emitted code is to execute (always handled via jitPrvHandleCcStart/End here)
//	instrAddr	- not used by this function
//	regsMask	- bitmask of regs to load (bit N = reg N)
//	wbak		- whether the instr writes the final address back to LR
//Three situations are told apart: SP is in the list, a plain LDMIA suffices (LR and PC are not both in the list),
// or both LR and PC are in the list (without SP).
//returns EmitErrNone on success, else the first error reported by a lower-level emitter
static enum EmitStatus jitPrvLdmiaToLr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC;
	uint32_t nPushRegs = 2, nRegs, popRegs;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
		
	if (regsMask & spMask) {		//we have SP in the list
		
		uint32_t regsBelowSp = regsMask & (spMask - 1);
		
		nRegs = jitPrvPopcount16(regsBelowSp);
		
		if (regsBelowSp) {
			
			//ldmia the regs below SP
			EMIT(HLldmia, EMIT_REG_NO_LR, regsBelowSp, wbak);
		}
		
		//if just sp, directly load it, keep in mind we cannot load sp with wbak
		if ((regsMask & (spMask | lrMask | pcMask)) == spMask) {
			
			//LDR sp, [Rn] 									//if wbak mode
			//LDR sp, [Rn, #4 * n_regs_below_sp]			//if not wbak mode
			EMIT(LLloadImm, EMIT_REG_NO_SP, EMIT_REG_NO_LR, (wbak ? 0 : nRegs) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			
			if (wbak) {
				
				//add Rn, Rn, #4							//postincrement as is expected of us
				EMIT(LLaddImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t), EmitLeaveFlags, false);
			}
		}
		else {
		
			//LR and/or PC are in the list as well: the new SP is installed via jitPrvEmitSwapSp() using r0 and r1
			if (regsMask & pcMask) {
				
				//PUSH {r0,r1,r2}							//the r2 slot is overwritten with the PC value further down
				EMIT(HLpush, 0x0007);
				popRegs = 0x8003;							//{r0, r1, pc}
				nPushRegs = 3;
			}
			else {
				
				//PUSH {r0,r1}
				EMIT(HLpush, 0x0003);
				popRegs = 0x0003;
				nPushRegs = 2;
			}
			
			//load SPval to r0, LR as needed, PCval to R1 if needed
			if ((regsMask & lrMask) && (regsMask & pcMask)) {
				
				//PC value is loaded first since the LDRD below replaces the base reg (LR)
				
				//LDR r1, [Rn, #8] 								//if wbak mode
				//LDR r1, [Rn, #proper_ofst]					//if not wbak mode
				EMIT(LLloadImm, 1, EMIT_REG_NO_LR, (wbak ? 2 : (nRegs + 2)) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
				
				//LDRD r0, lr, [Rn], #12 						//if wbak mode
				//LDRD r0, lr, [Rn, #4 * n_Regs_below_sp]		//if not wbak mode
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_LR, (wbak ? 3 : nRegs) * sizeof(uint32_t), wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				nRegs += 3;
			}
			else if (regsMask & lrMask) {		//SP (to r0), LR (to lr)
				
				//LDRD r0, lr, [Rn], #8 						//if wbak mode
				//LDRD r0, lr, [Rn, #4 * n_Regs_below_sp]		//if not wbak mode
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_LR, (wbak ? 2 : nRegs) * sizeof(uint32_t), wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				nRegs += 2;
			}
			else if (regsMask & pcMask) {	//SP (to r0) and PC (to r1)
				
				//LDRD r0, r1, [Rn], #8 						//if wbak mode
				//LDRD r0, r1, [Rn, #4 * n_Regs_below_sp]		//if not wbak mode
				EMIT(LLldrdImm, 0, 1, EMIT_REG_NO_LR, (wbak ? 2 : nRegs) * sizeof(uint32_t), wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				nRegs += 2;
			}
			
			//new sp val is in r0, new PC val is r1 if we need it
			if (regsMask & pcMask) {
				
				//STR r1, [SP, #8]		//	store it where it needs to be on the stack (last pushed slot)
				EMIT(LLstoreImm, 1, EMIT_REG_NO_SP, (nPushRegs - 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			}
			
			now = jitPrvEmitSwapSp(dest, 0, 1, popRegs, nPushRegs);
			if (now != EmitErrNone)
				return now;
		}
	}
	else if (!(regsMask & pcMask) || !(regsMask & lrMask)) {			//we can use a normal LDMIA
		
		//LDMIA {regsMask}
		EMIT(HLldmia, EMIT_REG_NO_LR, regsMask, wbak);
	}
	else {																//LR & PC, both in the list, which means we are guaranteed to not be in wbak mode (since Rn is in the list)
		
		//LDMIA LR, {regsMask minus PC and LR}, use wbak mode to make life easier, we do not care since we're about to override it
		EMIT(HLldmia, EMIT_REG_NO_LR, regsMask & (lrMask - 1), true);
		
		//PUSH {r0, r1, r2}								//the r2 slot is overwritten with the PC value below
		EMIT(HLpush, 0x0007);
		
		//LDMIA LR!, {r0, r1}							//r0 = new LR value, r1 = new PC value
		EMIT(HLldmia, EMIT_REG_NO_LR, 0x0003, true);
		
		//MOV LR, r0
		EMIT(LLmov, EMIT_REG_NO_LR, 0, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//STR r1, [SP, #8]		//	store it where it needs to be on the stack
		EMIT(LLstoreImm, 1, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		
		//POP {r0, r1, pc}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, 0x8003, true);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

/*
 * Emits the translation of "LDMIA Rn, {regsMask}" for the case where the base register Rn is
 * itself in the register list. No writeback parameter exists here: the value loaded into Rn
 * is what Rn holds afterwards. jitEmitLdmia() only dispatches here when Rn is neither SP nor LR.
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction (not used by this function)
 *  rnNo      - base register number
 *  regsMask  - bitmask of the registers to load (bit n set = register n); contains bit rnNo
 *
 * Returns EmitErrNone on success, else the first error reported by a helper.
 *
 * NOTE(review): EMIT() is defined outside this chunk. It appears to call emit<Name>(dest, ...)
 * and to return from this function if that call fails - confirm against its definition.
 * NOTE(review): jitPrvHandleCcStart()/jitPrvHandleCcEnd() presumably bracket the sequence so it
 * is skipped when "cc" does not hold - confirm.
 */
static enum EmitStatus jitPrvLdmiaWithRn(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC, rnMask = 1 << rnNo;
	uint32_t regsBelow, regsMiddle, highRegs;
	bool needPc = !!(regsMask & pcMask);	//cleared below once the PC value has been (or need not be) fetched separately
	enum EmitStatus now;
	
	//the simple path is taken when SP is not in the list and LR and PC are not BOTH in the list
	if (!(regsMask & spMask) && (regsMask & (lrMask | pcMask)) != (lrMask | pcMask)) { 	//if we can use a normal LDMIA, do so
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		EMIT(HLldmia, rnNo, regsMask, false);
	}
	else {
		
		//usedRegs: mask of scratch regs we push; nRegs: how many words that push occupies
		uint32_t usedRegs = 0, tmpReg1, tmpReg2, tmpReg3 = 0, nRegs = 2;
		struct EmitBuf ccSkip;

		//split the list into three groups around Rn so Rn itself can be loaded last
		regsBelow = regsMask & (rnMask - 1);						//regs below Rn
		regsMiddle = regsMask & ((spMask - 1) - (2 * rnMask - 1));	//regs above Rn and below SP
		highRegs = regsMask & (spMask | lrMask | pcMask);			//SP,LR,PC
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//LDMIA Rn!, {..regsBelow..}
		EMIT(HLldmia, rnNo, regsBelow, true);
		
		//ADD Rn #4			//skip Rn's own slot for now, it is loaded last
		EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		
		//LDMIA Rn!, {..regsMiddle..}
		EMIT(HLldmia, rnNo, regsMiddle, true);
	
		//we need some temp regs that are not Rn
		//NOTE(review): jitUtilPickLowestClearBit() presumably returns the index of the lowest zero bit - confirm
		tmpReg1 = jitUtilPickLowestClearBit(rnMask + usedRegs);
		usedRegs += 1 << tmpReg1;
		tmpReg2 = jitUtilPickLowestClearBit(rnMask + usedRegs);
		usedRegs += 1 << tmpReg2;
		
		if (regsMask & spMask) {
	
			//we need some more temp regs that are not Rn if loading pc
			//tmpReg3 is never written by the emitted code; its pushed slot is later overwritten with the PC value
			if (regsMask & pcMask) {
				
				tmpReg3 = jitUtilPickLowestClearBit(rnMask + usedRegs);
				usedRegs += 1 << tmpReg3;
				nRegs++;
			}
			
			//PUSH {..usedRegs..}
			EMIT(HLpush, usedRegs);
			
			//load SPval to tmpReg1, LR to lr as needed, PCval to tmpReg2 if needed
			if (regsMask & lrMask) {		//SP (to tmpReg1), LR (to lr)
				
				//LDRD tmpReg1, lr, [Rn], #8
				EMIT(LLldrdImm, tmpReg1, EMIT_REG_NO_LR, rnNo, 2 * sizeof(uint32_t), EmitAdrModePostindex);
				//needPc is left untouched here: if PC is also in the list it is fetched by the LDR below
			}
			else if (regsMask & pcMask) {	//SP (to tmpReg1) and PC (to tmpReg2)
				
				//LDRD tmpReg1, tmpReg2, [Rn], #8
				EMIT(LLldrdImm, tmpReg1, tmpReg2, rnNo, 2 * sizeof(uint32_t), EmitAdrModePostindex);
				needPc = false;				//PC value already fetched
			}
			else {							//just SP (to tmpReg1)
				
				//LDR tmpReg1, [Rn], #4
				EMIT(LLloadImm, tmpReg1, rnNo, sizeof(uint32_t), EmitSzWord, false, EmitAdrModePostindex);
				needPc = false;				//PC is not in the list in this arm
			}
			
			if (needPc) {					//PC goes to tmpReg2
				
				//LDR tmpReg2, [Rn], #4
				EMIT(LLloadImm, tmpReg2, rnNo, sizeof(uint32_t), EmitSzWord, false, EmitAdrModePostindex);
			}
			
			//now we can load Rn
			//Rn has been advanced past its own slot (+1), the middle regs and all high regs, so step back that many words
			EMIT(LLloadImm, rnNo, rnNo, -(jitPrvPopcount16(highRegs) + 1 + jitPrvPopcount16(regsMiddle)) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			
			//if we had PC loaded, store it on the stack so it will be popped, and remember that
			if (regsMask & pcMask) {
				
				//STR tmpReg2, [SP, #8]		//nRegs is 3 here, so this is the last pushed word (the slot pushed for tmpReg3)
				EMIT(LLstoreImm, tmpReg2, EMIT_REG_NO_SP, (nRegs - 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				
				//in the mask handed on below, tmpReg3 is replaced by PC
				usedRegs &=~ (1 << tmpReg3);
				usedRegs |= 1 << EMIT_REG_NO_PC;
			}
			
			//now swap SP
			//NOTE(review): jitPrvEmitSwapSp() is defined elsewhere; presumably it installs tmpReg1 as the new SP
			//and restores the regs in usedRegs from the nRegs words pushed on the old stack - confirm
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, usedRegs, nRegs);
			if (now != EmitErrNone)
				return now;
			
		}
		else {		//both LR and PC in list
			
			//PUSH {tmpReg1,tmpReg2}
			EMIT(HLpush, usedRegs);
			
			//LDRD lr, tmpReg1, [Rn], #8	//LR value to lr, PC value to tmpReg1
			EMIT(LLldrdImm, EMIT_REG_NO_LR, tmpReg1, rnNo, 2 * sizeof(uint32_t), EmitAdrModePostindex);
			
			//STR tmpReg1, [SP, #4]	//store PC value on stack where it is expected (second pushed word)
			EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			
			//now we can load Rn (same step-back computation as in the SP case above)
			EMIT(LLloadImm, rnNo, rnNo, -(jitPrvPopcount16(highRegs) + 1 + jitPrvPopcount16(regsMiddle)) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);

			//POP {tmpReg1, pc}		//restores tmpReg1 and jumps to the loaded PC value
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, pcMask | (1 << tmpReg1), true);
			if (now != EmitErrNone)
				return now;
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Emits the translation of "LDMIA Rn{!}, {regsMask}" for the case where the base register Rn is
 * NOT in the register list. jitEmitLdmia() only dispatches here when Rn is neither SP nor LR.
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction (not used by this function)
 *  rnNo      - base register number
 *  regsMask  - bitmask of the registers to load (bit n set = register n)
 *  wbak      - true if Rn is to be written back (advanced past the loaded words)
 *
 * Returns EmitErrNone on success, else the first error reported by a helper.
 *
 * NOTE(review): EMIT() is defined outside this chunk. It appears to call emit<Name>(dest, ...)
 * and to return from this function if that call fails - confirm against its definition.
 */
static enum EmitStatus jitPrvLdmiaWithoutRn(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC, rnMask = 1 << rnNo;
	uint32_t regsBelow, nRegs = 0;		//nRegs: words consumed so far; used as the word offset from Rn when not in wbak mode
	
	//the simple path is taken when SP is not in the list and LR and PC are not BOTH in the list
	if (!(regsMask & spMask) && (regsMask & (lrMask | pcMask)) != (lrMask | pcMask)) { 	//if we can use a normal LDMIA, do so
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		EMIT(HLldmia, rnNo, regsMask, wbak);
	}
	else {
		
		//usedRegs: mask of scratch regs we push; nPushRegs: how many words that push occupies
		//tmpReg3 is only assigned and only read when PC is in the list
		uint32_t usedRegs = 0, tmpReg1, tmpReg2, tmpReg3, nPushRegs = 2;
		struct EmitBuf ccSkip;
		enum EmitStatus now;
		
		regsBelow = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));					//regs below the first reg needing special handling (SP if listed, else PC)
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//LDMIA Rn{!}, {..regsBelow..}
		EMIT(HLldmia, rnNo, regsBelow, wbak);
		nRegs = jitPrvPopcount16(regsBelow);
	
		//we need some temp regs that are not Rn (they are only used by the SP-with-PC path below)
		//NOTE(review): jitUtilPickLowestClearBit() presumably returns the index of the lowest zero bit - confirm
		tmpReg1 = jitUtilPickLowestClearBit(rnMask + usedRegs);
		usedRegs += 1 << tmpReg1;
		tmpReg2 = jitUtilPickLowestClearBit(rnMask + usedRegs);
		usedRegs += 1 << tmpReg2;
		
		if (regsMask & spMask) {
			
			
			//if no pc (and we know that no Rn as well), we can do a more direct load
			if (!(regsMask & pcMask)) {
				
				//LR is loaded before SP although it lives one word above it
				if (regsMask & lrMask) {
					
					//LDR lr, [Rn, #4] 							//if wbak mode
					//LDR lr, [Rn, #4 * (nRegs + 1)]			//if not wbak mode
					EMIT(LLloadImm, EMIT_REG_NO_LR, rnNo, (wbak ? 1 : (nRegs+ 1)) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
				}
			
				//LDR sp, [Rn] 									//if wbak mode
				//LDR sp, [Rn, #4 * n_regs_below_sp]			//if not wbak mode
				EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, (wbak ? 0 : nRegs) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
				
				if (wbak) {
					
					//add Rn, Rn, #4 or 8							//postincrement as is expected of us (the loads above did not write back)
					EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * ((regsMask & lrMask) ? 2 : 1), EmitLeaveFlags, false);
				}
			}
			else {
			
				//PC is known to be in the list in this branch, so this condition always holds
				if (regsMask & pcMask) {
				
					//we need some more temp regs that are not Rn
					//tmpReg3 is never written by the emitted code; its pushed slot is later overwritten with the PC value
					tmpReg3 = jitUtilPickLowestClearBit(rnMask + usedRegs);
					usedRegs += 1 << tmpReg3;
					nPushRegs++;
				}
				
				//PUSH {..usedRegs..}
				EMIT(HLpush, usedRegs);
				
				//load SPval to tmpReg1, LR to lr as needed, PCval to tmpReg2 if needed
				if (regsMask & lrMask) {		//SP (to tmpReg1), LR (to lr)
					
					//LDRD tmpReg1, lr, [Rn], #8 					//if wbak mode
					//LDRD tmpReg1, lr, [Rn, #4 * nRegs]			//if not wbak mode
					EMIT(LLldrdImm, tmpReg1, EMIT_REG_NO_LR, rnNo, (wbak ? 2 : nRegs) * sizeof(uint32_t), wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs += 2;
					
					if (regsMask & pcMask) {
				
						//LDR tmpReg2, [Rn], #4 							//if wbak mode
						//LDR tmpReg2, [Rn, #4 * nRegs]						//if not wbak mode
						EMIT(LLloadImm, tmpReg2, rnNo, (wbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, false, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					}
				}
				else if (regsMask & pcMask) {	//SP (to tmpReg1) and PC (to tmpReg2)
					
					//LDRD tmpReg1, tmpReg2, [Rn], #8 					//if wbak mode
					//LDRD tmpReg1, tmpReg2, [Rn, #4 * nRegs]			//if not wbak mode
					EMIT(LLldrdImm, tmpReg1, tmpReg2, rnNo, (wbak ? 2 : nRegs) * sizeof(uint32_t), wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs += 2;
				}
				else {							//just SP (to tmpReg1) - not reachable, since PC is always in the list in this branch
					
					//LDR tmpReg1, [Rn], #4 							//if wbak mode
					//LDR tmpReg1, [Rn, #4 * nRegs]						//if not wbak mode
					EMIT(LLloadImm, tmpReg1, rnNo, (wbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, false, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs++;
				}
				
				
				//if we had PC loaded, store it on the stack so it will be popped, and remember that
				if (regsMask & pcMask) {
					
					//STR tmpReg2, [SP, #8]		//nPushRegs is 3 here, so this is the last pushed word (the slot pushed for tmpReg3)
					EMIT(LLstoreImm, tmpReg2, EMIT_REG_NO_SP, (nPushRegs - 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
					
					//in the mask handed on below, tmpReg3 is replaced by PC
					usedRegs &=~ (1 << tmpReg3);
					usedRegs |= 1 << EMIT_REG_NO_PC;
				}
				
				//now swap SP
				//NOTE(review): jitPrvEmitSwapSp() is defined elsewhere; presumably it installs tmpReg1 as the new SP
				//and restores the regs in usedRegs from the nPushRegs words pushed on the old stack - confirm
				now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, usedRegs, nPushRegs);
				if (now != EmitErrNone)
					return now;
			}
		}
		else {		//both LR and PC in list (LR was already loaded as part of regsBelow, only PC is left)
			
			//LDR pc, [Rn], #4 					//if wbak mode
			//LDR pc, [Rn, #4 * nRegs]			//if not wbak mode
			EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, (wbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, false, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Emits the translation of "LDMIA Rn{!}, {regsMask}" by validating the request and handing it
 * to the handler for the kind of base register involved.
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction
 *  rnNo      - base register number
 *  regsMask  - bitmask of the registers to load (bit n set = register n)
 *  wbak      - true if Rn is to be written back
 *
 * Returns EmitErrNotEncodeable if the list is empty, if the base is PC, or if Rn is in the
 * list while writeback is requested; otherwise whatever the selected handler returns.
 */
enum EmitStatus jitEmitLdmia(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	bool rnInList;
	
	//we need at least one reg, PC is not allowed as the base reg
	if (!regsMask || rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	rnInList = !!(regsMask & (1 << rnNo));
	
	//Rn cannot be in the list if wbak is on.
	//This must be a logical AND: bitwise-ANDing the Rn bit with the bool only ever matched for Rn == r0
	if (rnInList && wbak)
		return EmitErrNotEncodeable;

	if (rnNo == EMIT_REG_NO_SP)
		return jitPrvLdmiaToSp(dest, cc, instrAddr, regsMask, wbak);
	
	if (rnNo == EMIT_REG_NO_LR)
		return jitPrvLdmiaToLr(dest, cc, instrAddr, regsMask, wbak);
	
	if (rnInList)
		return jitPrvLdmiaWithRn(dest, cc, instrAddr, rnNo, regsMask);
	
	return jitPrvLdmiaWithoutRn(dest, cc, instrAddr, rnNo, regsMask, wbak);
}

//////////////////////////////////////////////////////// END LDMIA ////////////////////////////////////////////////////////


//////////////////////////////////////////////////////// START STMIA ////////////////////////////////////////////////////////

/*
 * Emits the translation of "STMIA SP, {regsMask}" (base register is SP, no writeback).
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction; instrAddr + 8 is used as the value stored for PC
 *  regsMask  - bitmask of the registers to store (bit n set = register n)
 *  wbak      - writeback request; rejected with EmitErrNotEncodeable (and an error log) if set
 *
 * Returns EmitErrNone on success, else the first error reported by a helper.
 *
 * NOTE(review): EMIT() is defined outside this chunk. It appears to call emit<Name>(dest, ...)
 * and to return from this function if that call fails - confirm against its definition.
 */
static enum EmitStatus jitPrvStmiaToSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC;
	uint32_t lowRegs, hiRegs, tmpReg, nRegs;	//nRegs: words stored so far, i.e. the word offset of the next slot from SP
	
	if (wbak) {
		loge("STMIA SP!, {...} makes no sense\n");
		return EmitErrNotEncodeable;
	}
	
	//if we can do it directly, do so (neither PC nor SP is being stored)
	if (!(regsMask & (pcMask | spMask))){
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		
		EMIT(HLstmia, EMIT_REG_NO_SP, regsMask, false);
	}
	else {	//PC or SP in the list
		
		struct EmitBuf ccSkip;
		enum EmitStatus now;
		
		//lowRegs: everything below the first reg needing special handling (SP if listed, else PC)
		//note that when SP is not listed, lowRegs may include LR
		lowRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
		nRegs = jitPrvPopcount16(lowRegs);
		hiRegs = regsMask &~ lowRegs;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//store low regs (could be empty set)
		EMIT(HLstmia, EMIT_REG_NO_SP, lowRegs, false);
	
		//store SP (if requested)
		if (hiRegs & spMask) {
			
			//STR SP, [SP, #proper_ofst]
			EMIT(LLstoreImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			nRegs++;
		}
		
		if (hiRegs & pcMask) {						//pc ? this gets hard
			
			if (lowRegs) {							//we can use the lowest low reg
				
				//pick lowest low reg as temporary reg (its value is already saved at [SP], so it can be reloaded afterwards)
				tmpReg = __builtin_ctz(lowRegs);
				
				//LDR tmpReg, =PC_VAL
				EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
				
				if (hiRegs & lrMask) {				//we need LR too
					
					//STRD LR, tmpReg, [SP, #proper_ofst]
					EMIT(LLstrdImm, EMIT_REG_NO_LR, tmpReg, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitAdrModeIndex);
				}
				else {								//no LR
					
					//STR tmpReg, [SP, #proper_ofst]
					EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				}
				
				//LDR tmpReg, [SP]		//reload tmpReg from the first stored word
				EMIT(LLloadImm, tmpReg, EMIT_REG_NO_SP, 0, EmitSzWord, false, EmitAdrModeIndex);
			}
			else if (hiRegs & lrMask) {				//no low regs but LR is being stored (we only get here if SP was stored)
				
				//STR LR, [SP, #proper_ofst]		//store LR
				EMIT(LLstoreImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				nRegs++;
				
				//LDR LR, =PC_VAL					//LR is safely stored, so it doubles as the temp reg
				EMIT(HLloadImmToReg, EMIT_REG_NO_LR, instrAddr + 8, false, false, false);
				
				//STR LR, [SP, #proper_ofst]
				EMIT(LLstoreImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				nRegs++;
				
				//LDR LR, [SP, #proper_ofst]		//restore LR from the slot written two stores ago
				EMIT(LLloadImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, (nRegs - 2) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			}
			else {									//storing just PC (possibly after SP)
				
				//PUSH {r0}
				EMIT(HLpush, 0x0001);
				
				//LDR r0, =PC_VAL
				EMIT(HLloadImmToReg, 0, instrAddr + 8, false, false, false);
				
				//STR r0, [SP, #proper_ofst]	//4 added since we pushed and thus decremented SP
				EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, (nRegs + 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				
				//POP {r0}
				EMIT(HLpop, 0x0001);
			}
		}
		else if (hiRegs & lrMask) {					//lr with no PC (we only get here if SP was stored)
		
			//STR LR, [SP, #proper_ofst]
			EMIT(LLstoreImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, nRegs * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			nRegs++;
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

static enum EmitStatus jitPrvStmiaToLr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool originalWbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC;
	bool effectiveWbak = originalWbak && !(regsMask & lrMask);	//v7 does not allow wbak with Rn in list, even if lowest reg, v5 does	
	uint32_t lowRegs, hiRegs, tmpReg, nRegs;
	
	//if we can do it directly, do so
	if (!(regsMask & (pcMask | spMask))) {
		
		if (cc != EmitCcAl) {
			if (originalWbak && !effectiveWbak)
				EMIT(LLitt, cc);
			else
				EMIT(LLit, cc);
		}
		
		EMIT(HLstmia, EMIT_REG_NO_LR, regsMask, effectiveWbak);
		
		if (originalWbak && !effectiveWbak) {		//increment LR since we could not writeback as we went
			
			//ADD LR, LR, #4 * numRegs
			EMIT(LLaddImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, cc != EmitCcAl);
		}
	}
	else {
	
		struct EmitBuf ccSkip;
		enum EmitStatus now;
		
		lowRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
		nRegs = jitPrvPopcount16(lowRegs);
		hiRegs = regsMask &~ lowRegs;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//store low regs (could be empty set)
		EMIT(HLstmia, EMIT_REG_NO_LR, lowRegs, effectiveWbak);
	
		//store SP (if requested)
		if (hiRegs & spMask) {
			
			EMIT(LLstoreImm, EMIT_REG_NO_SP, EMIT_REG_NO_LR, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
			nRegs++;
		}
		
		if (hiRegs & pcMask) {							//pc ? this gets hard
			
			if (lowRegs &~ lrMask) {					//we can use the lowest low reg (ignore LR, since no regs above LR can be in lowregs, we're ok)
				
				//pick lowest low reg as temporary reg
				tmpReg = __builtin_ctz(lowRegs);
				
				//LDR tmpReg, =PC_VAL
				EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
				
				if (hiRegs & lrMask) {					//we need LR too
					
					//STRD LR, tmpReg, [LR, #proper_ofst]	//if no wbak
					//STRD LR, tmpReg, [LR], #8				//if wbak (will never happen since we know LR will never be stored to LR with wbak)
					EMIT(LLstrdImm, EMIT_REG_NO_LR, tmpReg, EMIT_REG_NO_LR, (effectiveWbak ? 2 : nRegs) * sizeof(uint32_t), effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs += 2;
				}
				else {									//no LR
					
					//STR tmpReg, [LR, #proper_ofst]	//if no wbak
					//STR tmpReg, [LR], #4				//if wbak
					EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_LR, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs++;
				}
				
				//LDR tmpRegs, [LR]						//if no wbak	//reload tmpReg
				//LDR tmpRegs, [LR, #-nregs * 4]		//if wbak		//reload tmpReg
				EMIT(LLloadImm, tmpReg, EMIT_REG_NO_LR, (effectiveWbak ? -nRegs : 0) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			}
			else {									//storing just PC
				
				//PUSH {r0}
				EMIT(HLpush, 0x0001);
				
				//LDR r0, =PC_VAL
				EMIT(HLloadImmToReg, 0, instrAddr + 8, false, false, false);
				
				//STR r0, [LR]							//if no wbak	//reload tmpReg
				//STR r0, [LR, #-nregs * 4]				//if wbak		//reload tmpReg
				EMIT(LLstoreImm, 0, EMIT_REG_NO_LR, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				
				//POP {r0}
				EMIT(HLpop, 0x0001);
			}
		}
		else if (hiRegs & lrMask) {					//lr with no PC (we only get here is SP was stored)
		
			//STR LR, [LR, #proper_ofst]			//if no wbak
			//STR LR, [LR], #4						//if wbak (will never happen since we know LR will never be stored to LR with wbak)
			EMIT(LLstoreImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
			nRegs++;
		}
	
		if (originalWbak && !effectiveWbak) {		//increment LR since we could not writeback as we went
			
			//ADD LR, LR, #4 * numRegs
			EMIT(LLaddImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, false);
		}
		
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

/*
 * Emits the translation of "STMIA Rn{!}, {regsMask}" for a base register that is not SP or LR
 * (jitEmitStmia() handles those two bases with dedicated functions).
 *
 *  dest         - emit buffer the generated instructions are appended to
 *  cc           - condition code of the original instruction
 *  instrAddr    - address of the original instruction; instrAddr + 8 is used as the value stored for PC
 *  rnNo         - base register number
 *  regsMask     - bitmask of the registers to store (bit n set = register n)
 *  originalWbak - true if the original instruction requested writeback of Rn
 *
 * When Rn is itself in the list, writeback is not done as we go; the stores use plain offsets
 * and Rn is advanced by a separate ADD at the end.
 *
 * Returns EmitErrNone on success, else the first error reported by a helper.
 *
 * NOTE(review): EMIT() is defined outside this chunk. It appears to call emit<Name>(dest, ...)
 * and to return from this function if that call fails - confirm against its definition.
 */
static enum EmitStatus jitPrvStmiaSimpler(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool originalWbak)
{
	const uint32_t spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, pcMask = 1 << EMIT_REG_NO_PC, rnMask = 1 << rnNo;
	bool effectiveWbak = originalWbak && !(regsMask & rnMask);	//v7 does not allow wbak with Rn in list, even if lowest reg, v5 does	
	uint32_t lowRegs, hiRegs, tmpReg, nRegs;	//nRegs: words stored so far
	
	//if we can do it directly, do so (neither PC nor SP is being stored)
	if (!(regsMask & (pcMask | spMask))) {
		
		//one conditional instruction normally, two if we must add the writeback ourselves
		if (cc != EmitCcAl) {
			if (originalWbak && !effectiveWbak)
				EMIT(LLitt, cc);
			else
				EMIT(LLit, cc);
		}
		
		EMIT(HLstmia, rnNo, regsMask, effectiveWbak);
		
		if (originalWbak && !effectiveWbak) {		//increment Rn since we could not writeback as we went
			
			//ADD Rn, Rn, #4 * numRegs
			EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, cc != EmitCcAl);
		}
	}
	else {
		struct EmitBuf ccSkip;
		enum EmitStatus now;
		
		//lowRegs: everything below the first reg needing special handling (SP if listed, else PC)
		//note that when SP is not listed, lowRegs may include LR
		lowRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
		nRegs = jitPrvPopcount16(lowRegs);
		hiRegs = regsMask &~ lowRegs;
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
			
		//store low regs (could be empty set)
		EMIT(HLstmia, rnNo, lowRegs, effectiveWbak);
	
		//store SP (if requested)
		if (hiRegs & spMask) {
			
			//STR SP, [Rn, #proper_ofst]	//if no wbak
			//STR SP, [Rn], #4				//if wbak
			EMIT(LLstoreImm, EMIT_REG_NO_SP, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
			nRegs++;
		}
		
		if (hiRegs & pcMask) {						//pc ? this gets hard
			
			if (lowRegs &~ rnMask) {				//we can use the lowest low reg that is not Rn
				
				int32_t extraOfst = 0;				//byte offset of tmpReg's stored copy from the first stored word
				
				//pick lowest low reg as temporary reg (its value is already stored, so it can be reloaded afterwards)
				tmpReg = __builtin_ctz(lowRegs &~ rnMask);
				
				//if Rn is the lowest stored reg, tmpReg is the second lowest and thus lives in the second word
				if ((uint32_t)__builtin_ctz(lowRegs) != tmpReg)	//we might not have picked the lowest reg (if Rn is lowest)
					extraOfst = 4;
				
				//LDR tmpReg, =PC_VAL
				EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
				
				if (hiRegs & lrMask) {					//we need LR too
					
					//STRD LR, tmpReg, [Rn, #proper_ofst]	//if no wbak
					//STRD LR, tmpReg, [Rn], #8				//if wbak
					EMIT(LLstrdImm, EMIT_REG_NO_LR, tmpReg, rnNo, (effectiveWbak ? 2 : nRegs) * sizeof(uint32_t), effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs += 2;
				}
				else {								//no LR
					
					//STR tmpReg, [Rn, #proper_ofst]	//if no wbak
					//STR tmpReg, [Rn], #4				//if wbak
					EMIT(LLstoreImm, tmpReg, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
					nRegs++;
				}
				
				//LDR tmpReg, [Rn, #extraOfst]						//if no wbak	//reload tmpReg
				//LDR tmpReg, [Rn, #-nregs * 4 + extraOfst]			//if wbak		//reload tmpReg
				EMIT(LLloadImm, tmpReg, rnNo, (effectiveWbak ? -nRegs : 0) * sizeof(uint32_t) + extraOfst, EmitSzWord, false, EmitAdrModeIndex);
			}
			else if (hiRegs & lrMask) {				//no low regs other than possibly Rn, but LR is being stored (we only get here if SP was stored)
				
				//STR LR, [Rn, #proper_ofst]	//if no wbak
				//STR LR, [Rn], #4				//if wbak
				EMIT(LLstoreImm, EMIT_REG_NO_LR, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				nRegs++;
				
				//LDR LR, =PC_VAL				//LR is safely stored, so it doubles as the temp reg
				EMIT(HLloadImmToReg, EMIT_REG_NO_LR, instrAddr + 8, false, false, false);
				
				//STR LR, [Rn, #proper_ofst]	//if no wbak
				//STR LR, [Rn], #4				//if wbak
				EMIT(LLstoreImm, EMIT_REG_NO_LR, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				nRegs++;
				
				//LDR LR, [Rn, #proper_ofst]	//restore LR from the slot written two stores ago
				EMIT(LLloadImm, EMIT_REG_NO_LR, rnNo, ((effectiveWbak ? 0 : nRegs) - 2) * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			}
			else {									//storing just PC (possibly after Rn and/or SP)
				
				//borrow r0 as the temp reg, or r1 if Rn is r0
				tmpReg = rnNo ? 0 : 1;
				
				//PUSH {tmpReg}
				EMIT(HLpush, 1 << tmpReg);
				
				//LDR tmpReg, =PC_VAL
				EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
				
				//STR tmpReg, [Rn, #proper_ofst]			//if no wbak
				//STR tmpReg, [Rn], #4						//if wbak
				EMIT(LLstoreImm, tmpReg, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
				
				//POP {tmpReg}
				EMIT(HLpop, 1 << tmpReg);
			}
		}
		else if (hiRegs & lrMask) {					//lr with no PC (we only get here if SP was stored)
		
			//STR LR, [Rn, #proper_ofst]			//if no wbak
			//STR LR, [Rn], #4						//if wbak
			EMIT(LLstoreImm, EMIT_REG_NO_LR, rnNo, (effectiveWbak ? 1 : nRegs) * sizeof(uint32_t), EmitSzWord, effectiveWbak ? EmitAdrModePostindex : EmitAdrModeIndex);
			nRegs++;
		}
	
		if (originalWbak && !effectiveWbak) {		//increment Rn since we could not writeback as we went
			
			//ADD Rn, Rn, #4 * numRegs
			EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, false);
		}
	
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}

	return EmitErrNone;
}

/*
 * Emits the translation of "STMIA Rn{!}, {regsMask}" by validating the request and handing it
 * to the handler for the kind of base register involved.
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction
 *  rnNo      - base register number
 *  regsMask  - bitmask of the registers to store (bit n set = register n)
 *  wbak      - true if Rn is to be written back
 *
 * Returns EmitErrNotEncodeable if the list is empty, if the base is PC, or if writeback is
 * requested while Rn is in the list without being its lowest-numbered register; otherwise
 * whatever the selected handler returns.
 */
enum EmitStatus jitEmitStmia(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	uint32_t rnMask;
	
	//we need at least one reg, PC is not allowed as the base reg
	if (!regsMask || rnNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	rnMask = 1 << rnNo;
	
	//Rn cannot be in the list if wbak is on and it is not the lowest-numbered reg.
	//These must be logical ANDs: bitwise-ANDing the Rn bit with the bool could only be nonzero
	//for Rn == r0, which never has lower regs, so the check could never fire
	if ((regsMask & rnMask) && wbak && (regsMask & (rnMask - 1)))
		return EmitErrNotEncodeable;

	if (rnNo == EMIT_REG_NO_SP)
		return jitPrvStmiaToSp(dest, cc, instrAddr, regsMask, wbak);
	
	if (rnNo == EMIT_REG_NO_LR)
		return jitPrvStmiaToLr(dest, cc, instrAddr, regsMask, wbak);
	
	return jitPrvStmiaSimpler(dest, cc, instrAddr, rnNo, regsMask, wbak);
}

//////////////////////////////////////////////////////// END STMIA ////////////////////////////////////////////////////////

//////////////////////////////////////////////////////// START STMIB ////////////////////////////////////////////////////////

/*
 * Emits the translation of "STMIB Rn{!}, {regsMask}" (store multiple, increment before).
 *
 *  dest      - emit buffer the generated instructions are appended to
 *  cc        - condition code of the original instruction
 *  instrAddr - address of the original instruction; instrAddr + 8 is used as the value stored for PC
 *  rnNo      - base register number
 *  regsMask  - bitmask of the registers to store (bit n set = register n)
 *  wbak      - true if Rn is to be written back
 *
 * Returns EmitErrNotEncodeable (before anything is emitted) if the list is empty, if the base is
 * PC, or if writeback is requested while Rn is in the list without being its lowest-numbered
 * register. Otherwise returns EmitErrNone on success or the first error reported by a helper.
 *
 * NOTE(review): EMIT() is defined outside this chunk. It appears to call emit<Name>(dest, ...)
 * and to return from this function if that call fails - confirm against its definition.
 */
enum EmitStatus jitEmitStmib(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	//stmib is not very common, and v7 has no stmib instr. Yes, sometimes we could use stmia, but it is not always worth it.
	//we do it in some cases but we do not see the need to do it often
	
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, rnMask = 1 << rnNo;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on and it is not the lowest-numbered reg.
	//Checked before anything is emitted. The tests are logical ANDs: bitwise-ANDing the Rn bit with the bool could never fire
	if (!regsMask || rnNo == EMIT_REG_NO_PC || ((regsMask & rnMask) && wbak && (regsMask & (rnMask - 1))))
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
		
	if (rnNo == EMIT_REG_NO_SP && wbak) {
		
		//everything written would almost immediately be below the stack, only the last store would matter
		
		logw("STMIB SP!, {...} makes almost no sense, trying anyways\n");
		
		//add SP, SP, #4 * numRegs
		EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, false);

		//str highest_reg_in_set, [SP]
		now = jitEmitImmMemStr(dest, EmitCcAl, instrAddr, 31 - __builtin_clz(regsMask), EMIT_REG_NO_SP, 0, EmitAdrModeIndex, EmitSzWord);
		if (now != EmitErrNone)
			return now;
	}
	else if (!(regsMask & (rnMask | pcMask | spMask)) && jitPrvPopcount16(regsMask) >= 6 && rnNo != EMIT_REG_NO_SP) {
		//if a simple STMIA can be used, and at least 6 regs are in the list, use a stmia
		
		//add Rn, Rn, #4
		EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		
		//stmia Rn{!}, {regsMask}
		EMIT(HLstmia, rnNo, regsMask, wbak);
		
		//sub Rn, Rn, #4
		EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
	}
	else {
		
		//loRegs: everything below the first reg needing special handling (SP if listed, else PC)
		uint32_t loRegs = (regsMask & spMask) ? regsMask & (spMask - 1) : regsMask & (pcMask - 1);
		uint32_t hiRegs = regsMask - loRegs, tmpReg1, tmpReg2, nRegs = 0;
		//inc4: in wbak mode, true when Rn points at the last word written (next pre-increment is 4),
		//false after a two-word store (next pre-increment is 8)
		//eWbak: writeback as we go; not possible when Rn itself is in the list
		bool inc4 = true, eWbak = wbak && !(regsMask & rnMask);
		bool pushedTmp = false;			//true if a scratch reg had to be pushed to hold the PC value
		
		while (loRegs & (loRegs - 1)) {	//while at least two regs remain
			
			tmpReg1 = __builtin_ctz(loRegs);
			loRegs &= loRegs - 1;
			tmpReg2 = __builtin_ctz(loRegs);
			loRegs &= loRegs - 1;
			
			//strd tmpReg1, tmpReg2, [Rn, #proper offset]	//if no wbak
			//strd tmpReg1, tmpReg2, [Rn, #4]!				//if wbak and (inc4)
			//strd tmpReg1, tmpReg2, [Rn, #8]!				//if wbak and (!inc4)
			now = emitLLstrdImm(dest, tmpReg1, tmpReg2, rnNo, sizeof(uint32_t) * (eWbak ? (inc4 ? 1 : 2) : (nRegs + 1)), eWbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			inc4 = false;
			nRegs += 2;
		}
		
		//whatever is left (at most one low reg, plus SP/LR if listed) is stored one reg at a time; PC is split off
		hiRegs |= loRegs;
		loRegs = hiRegs &~ (1 << EMIT_REG_NO_PC);
		hiRegs &= (1 << EMIT_REG_NO_PC);
		
		//sp can be stored as single reg but not in a STRD, so this loop might have more than one iteration
		while (loRegs) {
			
			tmpReg1 = __builtin_ctz(loRegs);
			loRegs &= loRegs - 1;
			
			//str tmpReg1, [Rn, #proper offset]				//if no wbak
			//str tmpReg1, [Rn, #4]!						//if wbak (inc4)
			//str tmpReg1, [Rn, #8]!						//if wbak (!inc4)
			now = emitLLstoreImm(dest, tmpReg1, rnNo, sizeof(uint32_t) * (eWbak ? (inc4 ? 1 : 2) : (nRegs + 1)), EmitSzWord, eWbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			inc4 = true;
			nRegs++;
		}
		
		//at this point only PC can be left over. see if we have a non-Rn, non-SP reg, non-PC to use as temp
		if (hiRegs) {
			//if base is SP, we know no wbak will be used
			loRegs = regsMask &~ (pcMask | spMask | rnMask);
			
			if (loRegs)
				tmpReg1 = __builtin_ctz(loRegs);	//already stored above, so it can be reloaded afterwards
			else {
				
				//nothing suitable in the list: borrow r0 (or r1 if Rn is r0) via the stack
				tmpReg1 = rnNo ? 0 : 1;
				
				//push {tmpReg1}
				EMIT(HLpush, 1 << tmpReg1);
				pushedTmp = true;
			}
			
			//ldr tmpReg1, =PC_VAL
			EMIT(HLloadImmToReg, tmpReg1, instrAddr + 8, false, false, false);
			
			//store it as needed
			
			//str tmpReg1, [Rn, #proper offset]				//if no wbak, and either not SP base or nothing was pushed
			//str tmpReg1, [Rn, #proper offset + 4]			//if no wbak, SP base and we pushed (the push moved SP down by one word)
			//str tmpReg1, [Rn, #4]!						//if wbak (inc4)
			//str tmpReg1, [Rn, #8]!						//if wbak (!inc4)
			//the extra word only applies when we pushed: adding it whenever the base was SP
			//placed PC one slot too high for lists like {r0, PC}
			now = emitLLstoreImm(dest, tmpReg1, rnNo, sizeof(uint32_t) * (eWbak ? (inc4 ? 1 : 2) : ((nRegs + 1) + ((pushedTmp && rnNo == EMIT_REG_NO_SP) ? 1 : 0))), EmitSzWord, eWbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			inc4 = true;
			nRegs++;
			
			if (loRegs) {
				
				int32_t ofst;
				
				//sort out our offset (in words) to the slot tmpReg1 was stored in
				if (eWbak)
					ofst = -jitPrvPopcount16(regsMask &~ ((2 << tmpReg1) - 1));		//Rn points at the last slot: step back by the number of regs above tmpReg1
				else
					ofst = jitPrvPopcount16(regsMask & ((1 << tmpReg1) - 1)) + 1;	//Rn unchanged: slots start at Rn + 4, skip the regs below tmpReg1
				
				//ldr tmpReg1, [Rn, #proper_ofst]		//reload tmpReg1
				now = emitLLloadImm(dest, tmpReg1, rnNo, ofst * sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
				if (now != EmitErrNone)
					return now;
			}
			else {
				
				//pop {tmpReg1}
				EMIT(HLpop, 1 << tmpReg1);
			}
		}

		//if we got this far and we are in wbak mode and inc4 is false, we need to increment Rn by 4 so it points where it needs to
		if (eWbak && !inc4) {
			
			//add Rn, Rn, #4
			EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		}

		//if we needed wbak but did not execute it yet, do so now
		if (wbak && !eWbak) {
			
			//add Rn, Rn, #4 * numRegs
			EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, false);
		}
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//////////////////////////////////////////////////////// END STMIB ////////////////////////////////////////////////////////


//////////////////////////////////////////////////////// START LDMIB ////////////////////////////////////////////////////////

//Emit a replacement sequence for ARM "LDMIB Rn{!}, {regsMask}" (load multiple, increment before)
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated (only forwarded to jitEmitLdmia here)
// rnNo      - base register number
// regsMask  - bit N set means register N is loaded
// wbak      - true if Rn must end up pointing at the last word loaded
//returns EmitErrNone on success, EmitErrNotEncodeable for an empty list, PC as the base, or wbak with Rn in the list.
// Errors from the lower-level emitters are passed through unchanged
enum EmitStatus jitEmitLdmib(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	//ldmib is not very common, and v7 has no ldmib instr. Yes, sometimes we could use ldmia, but it is not always worth it.
	//we do it in some cases but we do not see the need to do it often
	
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	uint32_t nRegs = 0, tmpReg1, tmpReg2, regsNow, regsHi;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	bool inc4 = true;		//in wbak mode: true if the next word to load is at [Rn, #4], false if it is at [Rn, #8]

	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on
	//this is checked before anything is emitted. The test must be a logical AND: "(regsMask & rnMask) & wbak"
	// ANDs a bit mask with 0/1 and thus could only ever be nonzero for Rn == r0
	if (!regsMask || rnNo == EMIT_REG_NO_PC || (wbak && (regsMask & rnMask)))
		return EmitErrNotEncodeable;

	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
		
	if (!(regsMask & rnMask)) {					//handle all the cases of Rn not being in the list
		
		//sort out the easy regs
		regsNow = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
		regsHi = regsMask - regsNow;
		
		//load them two at a time with wbak
		while (regsNow & (regsNow - 1)) {	//while at least two regs remain
			
			tmpReg1 = __builtin_ctz(regsNow);
			regsNow &= regsNow - 1;
			tmpReg2 = __builtin_ctz(regsNow);
			regsNow &= regsNow - 1;
			
			//ldrd tmpReg1, tmpReg2, [Rn, #proper offset]	//if no wbak
			//ldrd tmpReg1, tmpReg2, [Rn, #4]!				//if wbak and (inc4)
			//ldrd tmpReg1, tmpReg2, [Rn, #8]!				//if wbak and (!inc4)
			now = emitLLldrdImm(dest, tmpReg1, tmpReg2, rnNo, sizeof(uint32_t) * (wbak ? (inc4 ? 1 : 2) : (nRegs + 1)), wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			inc4 = false;
			nRegs += 2;
		}
		
		//we might have a reg left now. if only SP or only PC is left, we can just load that one reg directly too, except we
		// cannot load SP with wbak, so we do handle that one carefully
		if (regsHi == spMask || regsHi == pcMask) {
			regsNow |= regsHi;
			regsHi = 0;
		}
		
		//load regs one at a time (guaranteed to leave Rn precisely where needed, so it is safe to do even when loading PC)
		while (regsNow) {
			
			tmpReg1 = __builtin_ctz(regsNow);
			regsNow &= regsNow - 1;
			
			//ldr tmpReg1, [Rn, #proper offset]				//if no wbak
			//ldr tmpReg1, [Rn, #4]!						//if wbak (inc4)
			//ldr tmpReg1, [Rn, #8]!						//if wbak (!inc4)
			now = emitLLloadImm(dest, tmpReg1, rnNo, sizeof(uint32_t) * (wbak ? (inc4 ? 1 : 2) : (nRegs + 1)), EmitSzWord, false, (wbak && (tmpReg1 != EMIT_REG_NO_SP)) ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			
			if (wbak && (tmpReg1 == EMIT_REG_NO_SP)) { 	//we did not wbak, we need to
				
				//add Rn, Rn, #proper_amt
				EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * (inc4 ? 1 : 2), EmitLeaveFlags, false);
			}
			
			inc4 = true;
			nRegs++;
		}
		
		//if we are done now, just adjust for writeback if needed
		if (!regsHi) {
			
			if (wbak && !inc4) {
			
				//add Rn, Rn, #4
				EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
			}
		}
		else {		//we're loading SP (possibly with LR and/or PC). We know Rn is not SP (thus we can adjust it for wbak final value right away)
			
			const uint32_t nHiRegs = jitPrvPopcount16(regsHi);
			uint32_t whichSecond;
			
			//possible combos are: {SP PC} {SP LR} {SP LR PC}
			
			//find us some temporary regs to use (the only requirement is that they not be Rn)
			regsNow = 0;
			tmpReg1 = jitUtilPickLowestClearBit(regsNow | rnMask);
			regsNow |= 1 << tmpReg1;
			tmpReg2 = jitUtilPickLowestClearBit(regsNow | rnMask);
			regsNow |= 1 << tmpReg2;
			
			if (regsMask & pcMask)	 	//we'll need to push another reg, we do not care which, just use tmpReg2 + 1
				regsNow |= 1 << (tmpReg2 + 1);
			
			//in wbak mode, move Rn to its FINAL value right away. All later offsets are then relative to that final value:
			// the last word loaded (the highest reg) is at [Rn, #0] and SP's word is (nHiRegs - 1) words below it
			if (wbak) {
				
				//add Rn, Rn, #proper_amt
				EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * ((inc4 ? 0 : 1) + nHiRegs), EmitLeaveFlags, false);
			}

			//push {..regsNow..}
			EMIT(HLpush, regsNow);
			
			
			//if we need three regs (sp,lr,pc), load pc first (into tmpReg2)
			if ((regsMask & pcMask) && (regsMask & lrMask)) { 	//load pc new val
				
				now = emitLLloadImm(dest, tmpReg2, rnNo, sizeof(uint32_t) * (wbak ? 0 : (nRegs + 3)), EmitSzWord, false, EmitAdrModeIndex);
				if (now != EmitErrNone)
					return now;
			}
			
			//we definitely have two regs to load, first being SP (into tmpReg1)
			whichSecond = (regsMask & lrMask) ? EMIT_REG_NO_LR : tmpReg2;
			
			//ldrd tmpReg1, whichSecond, [Rn, #proper_ofst]
			//with wbak the offset depends on how many hi regs there are, NOT on how many low regs were already loaded
			now = emitLLldrdImm(dest, tmpReg1, whichSecond, rnNo, sizeof(uint32_t) * (wbak ? (1 - nHiRegs) : (nRegs + 1)), EmitAdrModeIndex);
			if (now != EmitErrNone)
				return now;
			
			//from here on nRegs is reused as the number of regs we pushed
			if (regsMask & pcMask) {
				
				//save PC if we loaded it (into the third pushed slot)
				EMIT(LLstoreImm, tmpReg2, EMIT_REG_NO_SP, sizeof(uint32_t) * 2, EmitSzWord, EmitAdrModeIndex);
				
				//the spacer slot now gets popped as PC
				regsNow &=~ (1 << (tmpReg2 + 1));
				regsNow |= pcMask;
				nRegs = 3;
			}
			else
				nRegs = 2;
		
			//swap sp
			now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, regsNow, nRegs);
			if (now != EmitErrNone)
				return now;
		}
	}
	else {		//non-wbak, loads Rn, can use ldmia, even for SP
		
		//add Rn, Rn, #4
		EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		
		now = jitEmitLdmia(dest, EmitCcAl, instrAddr, rnNo, regsMask, false);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}


//////////////////////////////////////////////////////// END LDMIB ////////////////////////////////////////////////////////

//////////////////////////////////////////////////////// START STMDB ////////////////////////////////////////////////////////

//Emit a replacement for "STMDB SP!, {regsMask}" when the list contains SP and/or PC.
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated; instrAddr + 8 is the value stored for PC
// regsMask  - bit N set means register N is stored
//The list is split into "lo" regs (everything below SP if SP is listed, else everything below PC), which are stored
// with one stmdb, and "hi" regs (the rest), which are stored one by one through a temporary register.
//returns EmitErrNone on success. NOTE(review): EMIT() is defined elsewhere; it presumably returns the emitter's
// error from this function on failure - confirm against its definition
static enum EmitStatus jitPrvStmdbToSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask)	//this is basically a push, wbak guaranteed. we only get here if SP or PC are in the list
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	//storeOfst: word offset from the current SP of the next hi reg slot. restoreOfst: word offset the temp reg is reloaded from
	uint32_t loRegs, hiRegs, nHiRegs, tmpReg = 0, storeOfst = 0, restoreOfst = 0;
	bool needPop = false, needReload = false;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	loRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
	hiRegs = regsMask - loRegs;
	nHiRegs = jitPrvPopcount16(hiRegs);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//make space for the hi regs first, so that the lo regs land directly below them
	//sub sp, sp, #4 * nHiRegs
	EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
	
	//stmfd sp!, {..loRegs..}
	EMIT(HLstmdb, EMIT_REG_NO_SP, loRegs, true);
	storeOfst = jitPrvPopcount16(loRegs);

	//now we need to store the complex regs. how depends on whether we have temp regs
	if (loRegs) {
		
		//lowest lo reg was already stored at [sp, #0] (restoreOfst stays 0), so it is free to be clobbered
		tmpReg = __builtin_ctz(loRegs);
	}
	else if (regsMask & lrMask) {		//we can use LR as a temp reg (we only get here if sp is being stored and no lo regs exist)
		
		//store LR to its final slot now (SP's slot is at [sp, #0]), then it can be clobbered
		//str LR, [sp, #4]
		EMIT(LLstoreImm, EMIT_REG_NO_LR, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		tmpReg = EMIT_REG_NO_LR;
		restoreOfst = 1;
	}
	else if (nHiRegs) {
		//no free reg: save r0 on the stack. This moves SP by one more word, so all slots shift by one
		//push {r0}
		EMIT(HLpush, 0x0001);
		
		tmpReg = 0;
		storeOfst += 1;
		needPop = true;
	}

	if (hiRegs & spMask) {
		
		//add tmpReg, sp, #4 * (nHiRegs + storeOfst)		//calc SP val we should have pushed
		EMIT(LLaddImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t) * (nHiRegs + storeOfst), EmitLeaveFlags, false);
		
		needReload = true;
		if ((hiRegs & lrMask) && tmpReg != EMIT_REG_NO_LR) {
			
			//SP and LR slots are adjacent, store both at once
			//strd tmpReg, LR, [sp, #proper_ofst]
			EMIT(LLstrdImm, tmpReg, EMIT_REG_NO_LR, EMIT_REG_NO_SP, sizeof(uint32_t) * storeOfst, EmitAdrModeIndex);
			storeOfst += 2;
		}
		else {
			
			//str tmpReg, [sp, #proper_ofst]
			EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t) * storeOfst, EmitSzWord, EmitAdrModeIndex);
			storeOfst++;
			
			//skip the already-stored LR
			if (tmpReg == EMIT_REG_NO_LR)
				storeOfst++;
		}
	}
	
	if (hiRegs & pcMask) {
		
		needReload = true;
		
		//LDR tmpReg, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, true, true, false);
		
		//str tmpReg, [sp, #proper_ofst]
		EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t) * storeOfst, EmitSzWord, EmitAdrModeIndex);
		storeOfst++;
	}
	
	//put the original value back into the temp reg: either pop the r0 we pushed, or reload from the slot it was stored to
	if (needPop) {
		
		//pop {r0}
		EMIT(HLpop, 0x0001);
	}
	else if (needReload) {
		
		//ldr tmpReg, [sp, #proper_ofst]	//reload orig reg
		EMIT(LLloadImm, tmpReg, EMIT_REG_NO_SP, sizeof(uint32_t) * restoreOfst, EmitSzWord, false, EmitAdrModeIndex);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit a replacement for "STMDB LR{!}, {regsMask}" for lists the native stmdb cannot take.
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated; instrAddr + 8 is the value stored for PC
// regsMask  - bit N set means register N is stored
// wbak      - true if LR must end up pointing at the lowest word stored
//The list is split into "lo" regs (everything below SP if SP is listed, else everything below PC) and "hi" regs (the rest).
// hiRegsStart is the word offset, relative to the current value of LR, of the first hi reg slot.
//returns EmitErrNone on success, errors from the lower-level emitters are passed through
static enum EmitStatus jitPrvStmdbToLr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	uint32_t loRegs, hiRegs, nHiRegs, nLoRegs, tmpReg, regA, regB;
	bool needPush = false, needPop = false, needReload = false;
	int32_t hiRegsStart, reloadOfst = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	loRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
	hiRegs = regsMask - loRegs;
	nHiRegs = jitPrvPopcount16(hiRegs);
	nLoRegs = jitPrvPopcount16(loRegs);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	switch (nLoRegs) {
		case 0:
			if (wbak) {
				
				//sub lr, lr, #4 * nHiRegs
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
				hiRegsStart = nLoRegs;
			}
			else {
				
				hiRegsStart = -nHiRegs;
			}
			tmpReg = 0;
			needPush = true;
			break;
		
		case 1:
			regA = 31 - __builtin_clz(loRegs);
			
			//str regA, [lr, #-4 * total_num_regs] {!}
			EMIT(LLstoreImm, regA, EMIT_REG_NO_LR, -sizeof(uint32_t) * (nHiRegs + 1), EmitSzWord, wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			tmpReg = regA;
			
			//without wbak LR is unchanged, so the hi regs occupy the nHiRegs words just below it (right above regA's slot)
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		case 2:
			regA = __builtin_ctz(loRegs);
			regB = 31 - __builtin_clz(loRegs &~ (1 << regA));
		
			//strd regA, regB, [lr, #-4 * total_num_regs] {!}
			EMIT(LLstrdImm, regA, regB, EMIT_REG_NO_LR, -sizeof(uint32_t) * (nHiRegs + 2), wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			tmpReg = regA;
			
			//without wbak LR is unchanged, so the hi regs occupy the nHiRegs words just below it (right above regB's slot)
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		default:	//more than 2 regs - use an stmdb
			
			//sub lr, lr, #4 * nHiRegs
			EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			
			//stmdb lr{!}, {loRegs}
			EMIT(HLstmdb, EMIT_REG_NO_LR, loRegs, wbak);
			tmpReg = __builtin_ctz(loRegs);
			
			if (!wbak) {
				//add lr, lr, #4 * nHiRegs
				EMIT(LLaddImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			}
			
			break;
	}
	
	//sort out where we'll reload our temp reg from (the slot of the lowest lo reg, which sits nLoRegs words below the hi regs)
	reloadOfst = hiRegsStart - nLoRegs;
	
	//ok now we store the complex regs as needed
	if (hiRegs & spMask) {
		
		//str sp, [lr, #4 * hiRegsStart]
		EMIT(LLstoreImm, EMIT_REG_NO_SP, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	if (hiRegs & lrMask) {
		
		uint32_t whichReg = EMIT_REG_NO_LR;
		
		if (wbak) {	//LR was already moved, so the value to store has to be recalculated in a temp reg
			
			if (needPush && !needPop) {
			
				//push {r0}
				EMIT(HLpush, 0x0001);
				
				needPop = true;
			}
			needReload = true;
		
			//add tmpReg, lr, #4 * num_stored_regs 		//calc the original LR val, which is what we store
			EMIT(LLaddImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * (nLoRegs + nHiRegs), EmitLeaveFlags, false);
			whichReg = tmpReg;
		}
		
		//str whichReg, [lr, #4 * hiRegsStart]
		EMIT(LLstoreImm, whichReg, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	if (hiRegs & pcMask) {
		
		if (needPush && !needPop) {
			
			//push {r0}
			EMIT(HLpush, 0x0001);
			
			needPop = true;
		}
		needReload = true;
		
		//LDR tmpReg, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, true, true, false);
		
		//str tmpReg, [lr, #4 * hiRegsStart]
		EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	
	//put the original value back into the temp reg
	if (needPop) {
		
		//pop {r0}
		EMIT(HLpop, 0x0001);
	}
	else if (needReload) {
		
		//ldr tmpReg, [lr, #proper_ofst]	//reload orig reg
		EMIT(LLloadImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * reloadOfst, EmitSzWord, false, EmitAdrModeIndex);
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit a replacement for "STMDB Rn{!}, {regsMask}" for lists the native stmdb cannot take. The caller in this
// file only passes an Rn that is not SP, LR, or PC.
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated; instrAddr + 8 is the value stored for PC
// rnNo      - base register number
// regsMask  - bit N set means register N is stored
// wbakOrig  - true if the original instruction had writeback
//If Rn is itself in the list, its original value is what gets stored, and the writeback (if requested) is done
// as a separate subtract at the very end (eWbak is false in that case).
//The list is split into "lo" regs (everything below SP if SP is listed, else everything below PC) and "hi" regs (the rest).
// hiRegsStart is the word offset, relative to the current value of Rn, of the first hi reg slot.
//returns EmitErrNone on success, errors from the lower-level emitters are passed through
static enum EmitStatus jitPrvStmdbSimpler(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbakOrig)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	bool eWbak = wbakOrig && !(regsMask & rnMask), needPush = false, needPop = false, needReload = false;
	const uint32_t scratchReg = rnNo ? 0 : 1;	//reg we push & pop when no stored reg can serve as a temp. must never be Rn
	uint32_t loRegs, hiRegs, nHiRegs, nLoRegs, tmpReg, regA, regB;
	int32_t hiRegsStart, reloadOfst = 0;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	loRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
	hiRegs = regsMask - loRegs;
	nLoRegs = jitPrvPopcount16(loRegs);
	nHiRegs = jitPrvPopcount16(hiRegs);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	switch (nLoRegs) {
		case 0:
			if (eWbak) {
				
				//sub Rn, Rn, #4 * nHiRegs
				EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
				hiRegsStart = nLoRegs;
			}
			else {
				
				hiRegsStart = -nHiRegs;
			}
			tmpReg = scratchReg;
			needPush = true;
			break;
		
		case 1:
			regA = 31 - __builtin_clz(loRegs);	//if regA is Rn, proper value *IS* stored
			
			//str regA, [Rn, #-4 * total_num_regs] {!}
			EMIT(LLstoreImm, regA, rnNo, -sizeof(uint32_t) * (nHiRegs + 1), EmitSzWord, eWbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			if (regA != rnNo)
				tmpReg = regA;
			else {
				tmpReg = scratchReg;
				needPush = true;
			}
			
			//without wbak Rn is unchanged, so the hi regs occupy the nHiRegs words just below it (right above regA's slot)
			if (eWbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		case 2:
			regA = __builtin_ctz(loRegs);						//if regA or regB is Rn, proper value *IS* stored
			regB = 31 - __builtin_clz(loRegs &~ (1 << regA));
		
			//strd regA, regB, [Rn, #-4 * total_num_regs] {!}
			EMIT(LLstrdImm, regA, regB, rnNo, -sizeof(uint32_t) * (nHiRegs + 2), eWbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			if (regA != rnNo)
				tmpReg = regA;
			else {
				tmpReg = regB;		//regB's slot is one word above regA's
				reloadOfst++;
			}
			
			//without wbak Rn is unchanged, so the hi regs occupy the nHiRegs words just below it (right above regB's slot)
			if (eWbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		default:	//more than 2 regs - use an stmdb
			
			//sub Rn, Rn, #4 * nHiRegs
			EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			
			if (eWbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			
			//stmdb Rn{!}, {loRegs}
			EMIT(HLstmdb, rnNo, loRegs, eWbak);
			tmpReg = __builtin_ctz(loRegs);
			
			if (tmpReg == rnNo) {
				
				//Rn is the lowest lo reg, use the next one up, whose slot is one word higher
				tmpReg = __builtin_ctz(loRegs &~ rnMask);
				reloadOfst++;
			}
			
			if (!eWbak) {
				//add Rn, Rn, #4 * nHiRegs
				EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			}
			
			//we stored wrong Rn value since we decremented Rn to do the stmdb. fix it. We know that if Rn is in the list, eWbak is false!
			if (regsMask & rnMask) {
				
				//str Rn, [Rn, #-4 * (num_regs_above_rn + 1)]
				EMIT(LLstoreImm, rnNo, rnNo, -sizeof(uint32_t) * jitPrvPopcount16(regsMask &~ (rnMask - 1)), EmitSzWord, EmitAdrModeIndex);
			}
			break;
	}
	
	//sort out where we'll reload our temp reg from
	reloadOfst += hiRegsStart - nLoRegs;
	
	//ok now we store the complex regs as needed (in theory LR can be combined with one of them using a strd, but this is rare enough that we do not bother)
	if (hiRegs & spMask) {
		
		//str SP, [Rn, #4 * hiRegsStart]
		EMIT(LLstoreImm, EMIT_REG_NO_SP, rnNo, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	if (hiRegs & lrMask) {
		
		//str lr, [Rn, #4 * hiRegsStart]
		EMIT(LLstoreImm, EMIT_REG_NO_LR, rnNo, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	if (hiRegs & pcMask) {
		
		if (needPush && ! needPop) {
			
			//push {scratchReg}		//SP (if listed) was already stored above, so moving it here is harmless
			EMIT(HLpush, 1 << scratchReg);
			
			needPop = true;
		}
		
		needReload = true;
		
		//LDR tmpReg, =PC_VAL
		EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, true, true, false);
		
		//str tmpReg, [Rn, #4 * hiRegsStart]
		EMIT(LLstoreImm, tmpReg, rnNo, sizeof(uint32_t) * hiRegsStart, EmitSzWord, EmitAdrModeIndex);
		hiRegsStart++;
	}
	
	//put the original value back into the temp reg
	if (needPop) {
		
		//pop {scratchReg}
		EMIT(HLpop, 1 << scratchReg);
	}
	else if (needReload) {
		
		//ldr tmpReg, [Rn, #proper_ofst]	//reload orig reg
		EMIT(LLloadImm, tmpReg, rnNo, sizeof(uint32_t) * reloadOfst, EmitSzWord, false, EmitAdrModeIndex);
	}
	
	//if we needed wbak but did not do it yet, do so now
	if (wbakOrig && !eWbak) {
		
		//sub Rn, Rn, #4 * num_Regs
		EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * jitPrvPopcount16(regsMask), EmitLeaveFlags, false);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	return EmitErrNone;
}

//Emit a replacement sequence for ARM "STMDB Rn{!}, {regsMask}" (store multiple, decrement before)
// dest      - buffer the code is emitted into
// cc        - condition the sequence is predicated on
// instrAddr - address of the instruction being translated (forwarded to the helpers)
// rnNo      - base register number
// regsMask  - bit N set means register N is stored
// wbak      - true if Rn must end up pointing at the lowest word stored
//Dispatch order: a single reg becomes a STR; lists without SP/PC (and without the wbak + Rn-in-list combination) use
// the native stmdb under an IT; everything else goes to a helper picked by the base register (SP, LR, other).
//returns EmitErrNone on success, EmitErrNotEncodeable for an empty list, PC as the base, or SP as the base without wbak
enum EmitStatus jitEmitStmdb(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, rnMask = 1 << rnNo;
	enum EmitStatus now;
	uint32_t nRegs;
	
	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on and it is not the lowest reg in the list
	//NOTE(review): the last clause below can never be true as written. "(regsMask & (1 << rnNo)) & wbak" ANDs a bit mask
	// with 0/1, so it can only be nonzero for Rn == r0, and for r0 "(1 << rnNo) - 1" is zero. Thus wbak with Rn in the
	// list is in fact never rejected here. The helpers below do contain handling for that combination, so it is not clear
	// whether the rejection or the handling is the intended behaviour - confirm before changing this to a logical &&
	if (!regsMask || rnNo == EMIT_REG_NO_PC || ((regsMask & (1 << rnNo)) & wbak && (regsMask & ((1 << rnNo) - 1))))
		return EmitErrNotEncodeable;
	
	if (rnNo == EMIT_REG_NO_SP && !wbak){
		loge("STMDB SP, {...} makes no sense\n");
		return EmitErrNotEncodeable;
	}
	
	nRegs = jitPrvPopcount16(regsMask);
	
	if (nRegs == 1) {	//if just one reg, store it using STR
		
		//the only reg goes to [Rn, #-4], with or without writeback
		now = jitEmitImmMemStr(dest, cc, instrAddr, 31 -__builtin_clz(regsMask), rnNo, -4, wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex, EmitSzWord);
		if (now != EmitErrNone)
			return now;
	}
	else if (!(regsMask & (pcMask | spMask)) && (!wbak || !(regsMask & rnMask))) {	//if we can use existing stmdb, do so
		
		//conditional execution is done with an IT in front of the single instruction
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		EMIT(LLstmdb, rnNo, regsMask, wbak);
	}
	else if (rnNo == EMIT_REG_NO_SP)
		return jitPrvStmdbToSp(dest, cc, instrAddr, regsMask);
	else if (rnNo == EMIT_REG_NO_LR)
		return jitPrvStmdbToLr(dest, cc, instrAddr, regsMask, wbak);
	else
		return jitPrvStmdbSimpler(dest, cc, instrAddr, rnNo, regsMask, wbak);

	return EmitErrNone;
}

//////////////////////////////////////////////////////// END STMDB ////////////////////////////////////////////////////////

//////////////////////////////////////////////////////// START LDMDB ////////////////////////////////////////////////////////

static enum EmitStatus jitPrvLdmdbToLr(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	uint32_t loRegs, hiRegs, nHiRegs, nLoRegs, regA, regB;
	struct EmitBuf ccSkip;
	int32_t hiRegsStart;
	enum EmitStatus now;

	loRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : ((regsMask & lrMask) ? (regsMask & (lrMask - 1)) : (regsMask & (pcMask - 1)));
	hiRegs = regsMask - loRegs;
	nLoRegs = jitPrvPopcount16(loRegs);
	nHiRegs = jitPrvPopcount16(hiRegs);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	switch (nLoRegs) {
		case 0:
			if (wbak) {
				
				//sub lr, lr, #4 * nHiRegs
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
				hiRegsStart = nLoRegs;
			}
			else {
				
				hiRegsStart = -nHiRegs;
			}
			break;
		
		case 1:
			regA = 31 - __builtin_clz(loRegs);
			
			//ldr regA, [lr, #-4 * total_num_regs] {!}
			EMIT(LLloadImm, regA, EMIT_REG_NO_LR, -sizeof(uint32_t) * (nHiRegs + 1), EmitSzWord, false, wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs - 1;
			break;
		
		case 2:
			regA = __builtin_ctz(loRegs);
			regB = 31 - __builtin_clz(loRegs &~ (1 << regA));
		
			//ldrd regA, regB, [lr, #-4 * total_num_regs] {!}
			EMIT(LLldrdImm, regA, regB, EMIT_REG_NO_LR, -sizeof(uint32_t) * (nHiRegs + 2), wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs - 2;
			break;
		
		default:	//more than 2 regs - use an stmdb
			
			//sub lr, lr, #4 * nHiRegs
			EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			
			//ldmdb lr{!}, {loRegs}
			EMIT(HLldmdb, EMIT_REG_NO_LR, loRegs, wbak);
			
			if (!wbak) {
				//add lr, lr, #4 * nHiRegs
				EMIT(LLaddImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			}
			break;
	}
	
	if (!hiRegs) {
		
		//nothing. we should not even here here
	}
	else if (!(hiRegs & (hiRegs - 1))) {		//only one hireg - load it directly
		
		regA = __builtin_ctz(hiRegs);
		
		//ldr regA, [lr, #4 * hiRegsStart]
		EMIT(LLloadImm, regA, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitSzWord, false, EmitAdrModeIndex);
		hiRegsStart++;
	}
	else {										//more than one hiReg - this get complex
		
		uint32_t pushRegs = 0x0003, popRegs = 0x0003, nPushedRegs = 2;
		
		if (hiRegs & spMask) {					//SP and at least one more reg
			
			if (hiRegs & pcMask) {
				
				pushRegs |= 1 << 2;	//push r2 as well (for space)
				popRegs |= pcMask;	//pop it as pc
				nPushedRegs++;
			}
			
			//push {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			if (hiRegs == (spMask | lrMask)) {		//just sp and lr - ldrd them
				
				//ldrd r0, lr, [lr, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			else if (hiRegs == (spMask | pcMask)) {	//just sp and pc - ldrd them
				
				//ldrd r0, r1, [lr, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, 1, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			else {									//sp, lr, and pc all being loaded
				
				//ldr r1, [lr, #4 * (hiRegsStart + 2)]   //first load Pc
				EMIT(LLloadImm, 1, EMIT_REG_NO_LR, sizeof(uint32_t) * (hiRegsStart + 2), EmitSzWord, false, EmitAdrModeIndex);
	
				//ldrd r0, lr, [lr, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			
			if (hiRegs & pcMask) {			//loaded pc? store it to proper place
				
				//str r1, [sp, #4 * (nPushedRegs - 1)]
				EMIT(LLstoreImm, 1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushedRegs - 1), EmitSzWord, EmitAdrModeIndex);
			}
			
			//swap into new sp
			now = jitPrvEmitSwapSp(dest, 0, 1, popRegs, nPushedRegs);
			if (now != EmitErrNone)
				return now;
		}
		else if (hiRegs & lrMask) {		//lr and pc both
			
			//push {r0, r1}
			EMIT(HLpush, 0x0003);
			
			//we cnanot use ldrd due to m3 erratum on it, so do two loads
			
			//ldr r0, [lr, #4 * (hiRegsStart + 1)]   //first load Pc
			EMIT(LLloadImm, 0, EMIT_REG_NO_LR, sizeof(uint32_t) * (hiRegsStart + 1), EmitSzWord, false, EmitAdrModeIndex);
	
			//ldr lr, [lr, #4 * hiRegsStart]   		//the load lr
			EMIT(LLloadImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * hiRegsStart, EmitSzWord, false, EmitAdrModeIndex);
	
			//str r0, [sp, #4 * (nPushedRegs - 1)]	//store sp to proper place
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			
			//pop {r0, pc}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, pcMask | 0x0001, true);
			if (now != EmitErrNone)
				return now;
		}
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit a replacement for "LDMDB Rn{!}, {regsMask}" where Rn is not in the list, for lists the native ldmdb cannot take.
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated (not used in this function)
// rnNo      - base register number
// regsMask  - bit N set means register N is loaded
// wbak      - true if Rn must end up pointing at the lowest word loaded
//The list is split into "lo" regs (everything below SP if SP is listed, else everything below PC), which are loaded first,
// and "hi" regs (the rest). hiRegsStart is the word offset, relative to the current value of Rn, of the first hi reg's word.
//returns EmitErrNone on success, errors from the lower-level emitters are passed through
//NOTE(review): the multi-hi-reg paths use r0/r1 as scratch without checking them against rnNo. With Rn == r1 and
// {sp, lr, pc} all listed, the first load overwrites the base before the ldrd uses it - verify whether callers can hit this
static enum EmitStatus jitPrvLdmdbWithoutRn(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)	//Rn is not SP, LR, or PC
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	uint32_t loRegs, hiRegs, nHiRegs, nLoRegs, regA, regB;
	struct EmitBuf ccSkip;
	int32_t hiRegsStart;
	enum EmitStatus now;

	loRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
	hiRegs = regsMask - loRegs;
	nLoRegs = jitPrvPopcount16(loRegs);
	nHiRegs = jitPrvPopcount16(hiRegs);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	switch (nLoRegs) {
		case 0:
			if (wbak) {
				
				//sub Rn, Rn, #4 * nHiRegs
				EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
				hiRegsStart = nLoRegs;
			}
			else {
				
				hiRegsStart = -nHiRegs;
			}
			break;
		
		case 1:
			regA = 31 - __builtin_clz(loRegs);
			
			//ldr regA, [Rn, #-4 * total_num_regs] {!}
			EMIT(LLloadImm, regA, rnNo, -sizeof(uint32_t) * (nHiRegs + 1), EmitSzWord, false, wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			//without wbak Rn is unchanged, so the hi regs' words are the nHiRegs words just below it (right above regA's word)
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		case 2:
			regA = __builtin_ctz(loRegs);
			regB = 31 - __builtin_clz(loRegs &~ (1 << regA));
		
			//ldrd regA, regB, [Rn, #-4 * total_num_regs] {!}
			EMIT(LLldrdImm, regA, regB, rnNo, -sizeof(uint32_t) * (nHiRegs + 2), wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex);
			
			//without wbak Rn is unchanged, so the hi regs' words are the nHiRegs words just below it (right above regB's word)
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			break;
		
		default:	//more than 2 regs - use an ldmdb
			
			//sub Rn, Rn, #4 * nHiRegs
			EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			
			if (wbak)
				hiRegsStart = nLoRegs;
			else
				hiRegsStart = -nHiRegs;
			
			//ldmdb Rn{!}, {loRegs}
			EMIT(HLldmdb, rnNo, loRegs, wbak);
			
			if (!wbak) {
				//add Rn, Rn, #4 * nHiRegs
				EMIT(LLaddImm, rnNo, rnNo, sizeof(uint32_t) * nHiRegs, EmitLeaveFlags, false);
			}
			break;
	}
	
	if (!hiRegs) {
		
		//nothing. we should not even be here
	}
	else if (!(hiRegs & (hiRegs - 1))) {		//only one hireg - load it directly
		
		regA = __builtin_ctz(hiRegs);
		
		//ldr regA, [Rn, #4 * hiRegsStart]		//the base here is Rn, not LR
		EMIT(LLloadImm, regA, rnNo, sizeof(uint32_t) * hiRegsStart, EmitSzWord, false, EmitAdrModeIndex);
		hiRegsStart++;
	}
	else {										//more than one hiReg - this gets complex
		
		uint32_t pushRegs = 0x0003, popRegs = 0x0003, nPushedRegs = 2;
		
		if (hiRegs & spMask) {					//SP and at least one more reg
			
			if (hiRegs & pcMask) {
				
				pushRegs |= 1 << 2;	//push r2 as well (for space)
				popRegs |= pcMask;	//pop it as pc
				nPushedRegs++;
			}
			
			//push {..pushRegs..}
			EMIT(HLpush, pushRegs);
			
			if (hiRegs == (spMask | lrMask)) {		//just sp and lr - ldrd them
				
				//ldrd r0, lr, [Rn, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, rnNo, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			else if (hiRegs == (spMask | pcMask)) {	//just sp and pc - ldrd them
				
				//ldrd r0, r1, [Rn, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, 1, rnNo, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			else {									//sp, lr, and pc all being loaded
				
				//ldr r1, [Rn, #4 * (hiRegsStart + 2)]   //first load Pc
				EMIT(LLloadImm, 1, rnNo, sizeof(uint32_t) * (hiRegsStart + 2), EmitSzWord, false, EmitAdrModeIndex);
	
				//ldrd r0, lr, [Rn, #4 * hiRegsStart]
				EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, rnNo, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			}
			
			if (hiRegs & pcMask) {			//loaded pc? store it to proper place
				
				//str r1, [sp, #4 * (nPushedRegs - 1)]
				EMIT(LLstoreImm, 1, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushedRegs - 1), EmitSzWord, EmitAdrModeIndex);
			}
			
			//swap into new sp (new sp value is in r0)
			now = jitPrvEmitSwapSp(dest, 0, 1, popRegs, nPushedRegs);
			if (now != EmitErrNone)
				return now;
		}
		else if (hiRegs & lrMask) {		//lr and pc both
			
			//push {r0, r1}
			EMIT(HLpush, 0x0003);
			
			//ldrd lr, r0, [Rn, #4 * hiRegsStart]
			EMIT(LLldrdImm, EMIT_REG_NO_LR, 0, rnNo, sizeof(uint32_t) * hiRegsStart, EmitAdrModeIndex);
			
			//str r0, [sp, #4 * (nPushedRegs - 1)]	//store pc to proper place (over the pushed r1)
			EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			
			//pop {r0, pc}
			now = jitPrvEmitPopWithOpts(dest, EmitCcAl, pcMask | 0x0001, true);
			if (now != EmitErrNone)
				return now;
		}
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit a replacement for "LDMDB Rn, {regsMask}" where Rn itself is in the list.
//Rn is in the list, no wbak, Rn is not SP or LR
// dest      - buffer the code is emitted into
// cc        - condition for the whole sequence (wrapped using jitPrvHandleCcStart/jitPrvHandleCcEnd)
// instrAddr - address of the instruction being translated (passed to jitPrvFindTempRegs)
// rnNo      - base register number
// regsMask  - bit N set means register N is loaded
//The list is split into regs below Rn, Rn itself, "mid" regs (above Rn, below SP if SP is listed, else below PC) and
// "hi" regs (the rest). All offsets are relative to the unmodified Rn, so Rn is always loaded after every other
// reg that is read through it.
//returns EmitErrNone on success, errors from the lower-level emitters are passed through
static enum EmitStatus jitPrvLdmdbWithRn(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	//ofst: how many words below Rn the next reg's word is. rnOfst: same, for Rn's own word
	uint32_t i, loRegs, midRegs, hiRegs, regA, regB, rnOfst = 0, ofst = jitPrvPopcount16(regsMask);
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	loRegs = regsMask & (rnMask - 1);
	regsMask -= loRegs + rnMask;
	midRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));
	regsMask -= midRegs;
	hiRegs = regsMask;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	//load regs below Rn and then above Rn that are easy
	for (i = 0, regsMask = loRegs; i < 2; i++, regsMask = midRegs) {	//get the low and mid regs loaded - it is easy
		
		while (regsMask) {
		
			regA = __builtin_ctz(regsMask);
			regsMask &= regsMask - 1;
			
			if (regsMask) {			//at least two regs in the list
				
				regB = __builtin_ctz(regsMask);
				regsMask &= regsMask - 1;
				
				//ldrd regA, regB, [Rn, #proper_ofst]
				EMIT(LLldrdImm, regA, regB, rnNo, -sizeof(uint32_t) * ofst, EmitAdrModeIndex);
				ofst -= 2;
			}
			else {					//one reg only
				
				//ldr regA, [Rn, #proper_ofst]
				EMIT(LLloadImm, regA, rnNo, -sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
				ofst--;
			}
		}
		if (!i)						//between the low and the mid regs is Rn's own word: remember it and skip it for now
			rnOfst = ofst--;
	}
	
	if (hiRegs & spMask) {		//loading sp and regs after it is hard. use temp regs for sp and for pc
		
		uint32_t tmpRegSp, tmpRegPc, pushRegs, popRegs, nPushedRegs;
		
		nPushedRegs = jitPrvFindTempRegs(instrAddr, rnMask, &pushRegs, &popRegs, !!(hiRegs & pcMask), &tmpRegSp, &tmpRegPc, NULL);
		
		//push {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (hiRegs & lrMask) {
			
			//ldrd tmpRegSp, lr, [Rn, #proper_ofst]
			EMIT(LLldrdImm, tmpRegSp, EMIT_REG_NO_LR, rnNo, -sizeof(uint32_t) * ofst, EmitAdrModeIndex);
			ofst -= 2;
			
			if (hiRegs & pcMask) {
				
				//ldr tmpRegPc, [Rn, #proper_ofst]
				EMIT(LLloadImm, tmpRegPc, rnNo, -sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
				ofst--;
			}
		}
		else if (hiRegs & pcMask) {
			
			//ldrd tmpRegSp, tmpRegPc, [Rn, #proper_ofst]
			EMIT(LLldrdImm, tmpRegSp, tmpRegPc, rnNo, -sizeof(uint32_t) * ofst, EmitAdrModeIndex);
			ofst -= 2;
		}
		else {
			
			//ldr tmpRegSp, [Rn, #proper_ofst]
			EMIT(LLloadImm, tmpRegSp, rnNo, -sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
			ofst--;
		}
		
		//ldr Rn, [Rn, #proper_ofst]	//load Rn now
		EMIT(LLloadImm, rnNo, rnNo, -sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
		
		if (hiRegs & pcMask) {			//loaded pc? store it to proper place
			
			//str tmpRegPc, [sp, #4 * (nPushedRegs - 1)]
			EMIT(LLstoreImm, tmpRegPc, EMIT_REG_NO_SP, sizeof(uint32_t) * (nPushedRegs - 1), EmitSzWord, EmitAdrModeIndex);
		}
		
		//swap into new sp
		now = jitPrvEmitSwapSp(dest, tmpRegSp, tmpRegPc, popRegs, nPushedRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (hiRegs & pcMask) {		//load pc
		
		regA = rnNo ? 0 : 1;
		regB = regA + 1;	//we do not care if it is Rn. we'll never pop it anyways - this just makes stack space
		
		//push {regA, regB}
		EMIT(HLpush, (1 << regA) | (1 << regB));
		
		//ldr regA, [Rn, #proper_ofst]
		EMIT(LLloadImm, regA, rnNo, -sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
		
		//ldr Rn, [Rn, #proper_ofst]	//load Rn now
		EMIT(LLloadImm, rnNo, rnNo, -sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
		
		//str regA, [sp, #4 * (nPushedRegs - 1)]
		EMIT(LLstoreImm, regA, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		
		//pop {regA, pc}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, pcMask | (1 << regA), true);
		if (now != EmitErrNone)
			return now;
	}
	else {							//neither sp nor pc in the list: Rn is the only reg left and nothing else is read through it anymore
		
		//ldr Rn, [Rn, #proper_ofst]	//load Rn now. without this Rn would never get loaded at all
		EMIT(LLloadImm, rnNo, rnNo, -sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emit a translation of "LDMDB Rn{!}, {regsMask}" executed under condition cc.
//	dest      - buffer the code is emitted into
//	cc        - condition of the original instruction
//	instrAddr - address of the original instruction (passed on to the helpers)
//	rnNo      - base register number
//	regsMask  - bitmask of registers to load
//	wbak      - true if the base register is written back
//Returns EmitErrNotEncodeable for an empty list, a PC or SP base, or a writeback form that also loads Rn.
//Errors from the helper emitters are returned as-is.
enum EmitStatus jitEmitLdmdb(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	enum EmitStatus now;
	uint32_t nRegs;

	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on
	//(this has to be a logical AND - a bitwise "& wbak" only tests bit 0 and thus only ever caught Rn == r0)
	if (!regsMask || rnNo == EMIT_REG_NO_PC || ((regsMask & rnMask) && wbak))
		return EmitErrNotEncodeable;
	
	if (rnNo == EMIT_REG_NO_SP) {
		loge("LDMDB SP{!}, {...} makes no sense\n");
		return EmitErrNotEncodeable;
	}
	
	nRegs = jitPrvPopcount16(regsMask);
	
	if (nRegs == 1) {	//if just one reg, load it using LDR
		
		now = jitEmitImmMemLdr(dest, cc, instrAddr, false, 31 -__builtin_clz(regsMask), rnNo, -4, wbak ? EmitAdrModeIndexWbak : EmitAdrModeIndex, EmitSzWord);
		if (now != EmitErrNone)
			return now;
	}
	else if ((!wbak || !(regsMask & rnMask)) && (regsMask & (pcMask | lrMask)) != (pcMask | lrMask) && !(regsMask & spMask)) { //if we can use existing ldmdb, do so
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		EMIT(LLldmdb, rnNo, regsMask, wbak);
	}
	else if (rnNo == EMIT_REG_NO_LR) {	//LR as the base has its own handler
		
		now = jitPrvLdmdbToLr(dest, cc, instrAddr, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}
	else if (regsMask & rnMask) {		//Rn is loaded too (no wbak possible here, so the helper does not take it)
		
		now = jitPrvLdmdbWithRn(dest, cc, instrAddr, rnNo, regsMask);
		if (now != EmitErrNone)
			return now;
	}
	else {
	
		now = jitPrvLdmdbWithoutRn(dest, cc, instrAddr, rnNo, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

//////////////////////////////////////////////////////// END LDMDB ////////////////////////////////////////////////////////

//////////////////////////////////////////////////////// START STMDA ////////////////////////////////////////////////////////

//Emit "STMDA SP!, {regsMask}". Caller guarantees writeback and more than one reg in the list.
//The emitted code leaves the regs lowest-first in the words at [SP - 4 * numRegs + 4] .. [SP] (old SP)
//and SP lowered by 4 * numRegs. SP is stored as its value before the instruction, PC as instrAddr + 8;
//both are built in a scratch reg, which is either the lowest listed reg (reloaded from memory
//afterwards) or r0 saved with a push/pop pair.
static enum EmitStatus jitPrvStmdaToSp(struct EmitBuf *dest, uint32_t instrAddr, uint32_t regsMask)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	const uint32_t nRegs = jitPrvPopcount16(regsMask);
	uint32_t easyRegs, hardRegs, first, second, slot, scratch, pushAdj;
	bool scratchOnStack, scratchDirty = false;
	
	//the list is "easy" up to the first reg needing manual work: SP if listed, else PC
	easyRegs = regsMask & (((regsMask & spMask) ? spMask : pcMask) - 1);
	hardRegs = regsMask - easyRegs;
	
	if (hardRegs & (hardRegs - 1)) {	//two or more hard regs - make room first, then fill the slots one by one
		
		//sub SP, SP, #4 * numRegs
		EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
		
		for (slot = 1; easyRegs;) {
			
			first = __builtin_ctz(easyRegs);
			easyRegs &= easyRegs - 1;
			
			if (!easyRegs) {	//odd one out goes alone
				
				//str first, [SP, #4 * slot]
				EMIT(LLstoreImm, first, EMIT_REG_NO_SP, sizeof(uint32_t) * slot, EmitSzWord, EmitAdrModeIndex);
				slot++;
				break;
			}
			
			second = __builtin_ctz(easyRegs);
			easyRegs &= easyRegs - 1;
			
			//strd first, second, [SP, #4 * slot]
			EMIT(LLstrdImm, first, second, EMIT_REG_NO_SP, sizeof(uint32_t) * slot, EmitAdrModeIndex);
			slot += 2;
		}
	}
	else {								//at most one hard reg (the top one) - an stmdb handles all the others
		
		//stmdb SP!, {regsMask minus top reg}
		EMIT(HLstmdb, EMIT_REG_NO_SP, regsMask &~ (1 << (31 - __builtin_clz(regsMask))), true);
		
		//sub SP, SP, #4		//skip over the slot of the top reg
		EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
		
		slot = nRegs;
	}
	
	//with easy regs still pending here (stmdb path only) the lowest listed reg is already in memory and
	//can serve as scratch. otherwise r0 is borrowed and kept on the stack, which shifts all SP offsets by a word
	scratchOnStack = !easyRegs;
	scratch = scratchOnStack ? 0 : __builtin_ctz(regsMask);
	pushAdj = scratchOnStack ? 1 : 0;
	
	if (regsMask & spMask) {
		
		if (scratchOnStack) {
			
			//push {scratch}
			EMIT(HLpush, 1 << scratch);
		}
		scratchDirty = true;
		
		//add scratch, SP, #4 * (numRegs + 1_if_pushed)		//recreate the SP value from before this instr
		EMIT(LLaddImm, scratch, EMIT_REG_NO_SP, sizeof(uint32_t) * (nRegs + pushAdj), EmitLeaveFlags, false);
		
		if (regsMask & lrMask) {
			
			//strd scratch, lr, [SP, #4 * (slot + 1_if_pushed)]
			EMIT(LLstrdImm, scratch, EMIT_REG_NO_LR, EMIT_REG_NO_SP, sizeof(uint32_t) * (slot + pushAdj), EmitAdrModeIndex);
			slot += 2;
		}
		else {
			
			//str scratch, [SP, #4 * (slot + 1_if_pushed)]
			EMIT(LLstoreImm, scratch, EMIT_REG_NO_SP, sizeof(uint32_t) * (slot + pushAdj), EmitSzWord, EmitAdrModeIndex);
			slot++;
		}
	}
	
	if (regsMask & pcMask) {
		
		if (scratchOnStack && !scratchDirty) {
			
			//push {scratch}
			EMIT(HLpush, 1 << scratch);
		}
		scratchDirty = true;
		
		//ldr scratch, =PC_VAL
		EMIT(HLloadImmToReg, scratch, instrAddr + 8, false, false, false);
		
		//str scratch, [SP, #4 * (slot + 1_if_pushed)]
		EMIT(LLstoreImm, scratch, EMIT_REG_NO_SP, sizeof(uint32_t) * (slot + pushAdj), EmitSzWord, EmitAdrModeIndex);
	}
	
	//give the scratch reg its value back
	if (scratchDirty) {
		
		if (scratchOnStack) {
			
			//pop {scratch}
			EMIT(HLpop, 1 << scratch);
		}
		else {
			
			//ldr scratch, [sp, #4]	//lowest listed reg lives in the first slot
			EMIT(LLloadImm, scratch, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
		}
	}
	
	return EmitErrNone;
}

//Emit "STMDA LR{!}, {regsMask}" (LR is the base register).
//Regs are stored lowest-first in the words at [LR - 4 * numRegs + 4] .. [LR]; with wbak, LR is then
//lowered by 4 * numRegs. Regs that need manual work ("hiRegs") are SP and everything above it when SP
//is listed, LR and PC when LR is listed and wbak is on, else just PC (stored as instrAddr + 8).
static enum EmitStatus jitPrvStmdaToLr(struct EmitBuf *dest, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	uint32_t loRegs, hiRegs, nRegs, regA, regB, tmpReg;
	bool needPop = false, usedTmpReg = false;
	int32_t ofst;
	
	if (regsMask & spMask)					//sp requires manual work
		loRegs = regsMask & (spMask - 1);
	else if ((regsMask & lrMask) && wbak)	//lr does if we use wbak (and thus LR will be different)
		loRegs = regsMask & (lrMask - 1);
	else
		loRegs = regsMask & (pcMask - 1);	//pc always requires work
	hiRegs = regsMask - loRegs;
	
	nRegs = jitPrvPopcount16(regsMask);
	regA = __builtin_ctz(regsMask);			//lowest reg
	regB = 31 - __builtin_clz(regsMask);	//highest reg

	if (!hiRegs) {		//simple case - no complex regs
		
		if (nRegs == 1) {
			
			//STR reg [LR]
			//or
			//STR reg [LR], #-4
			//(size goes before the addressing mode, like in every other LLstoreImm use)
			EMIT(LLstoreImm, regA, EMIT_REG_NO_LR, wbak ? -4 : 0, EmitSzWord, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
		}
		else if (nRegs == 2) {
			
			//STRD loreg, hireg, [LR, #-4]
			EMIT(LLstrdImm, regA, regB, EMIT_REG_NO_LR, -4, EmitAdrModeIndex);
			
			if (wbak) {
				//SUB LR, LR, #8
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, 2 * sizeof(uint32_t), EmitLeaveFlags, false);
			}
		}
		else {
			
			//stmdb LR, {regsMask minus top reg}
			EMIT(HLstmdb, EMIT_REG_NO_LR, regsMask &~ (1 << regB), false);
			
			//STR topreg [LR]	//the one reg stmdb left out is the TOP one (regB), it belongs at [LR]
			EMIT(LLstoreImm, regB, EMIT_REG_NO_LR, 0, EmitSzWord, EmitAdrModeIndex);
			
			if (wbak) {
				//SUB LR, LR, #4 * numRegs
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, nRegs * sizeof(uint32_t), EmitLeaveFlags, false);
			}
		}
	}
	else {
		
		//word offset from LR (as it is while we store) to the slot of the lowest reg
		ofst = wbak ? 1 : 1 - nRegs;
	
		//if there is only one hiReg, we can use a stmdb (cases of "no hi regs" would have been handled by now)
		if (!(hiRegs & (hiRegs - 1))) {
			
			//stmdb LR{!}, {regsMask minus top reg}
			//LR may only move if the original instr had wbak - otherwise we'd corrupt it, and the
			//offset math below (which ends at 0 for the no-wbak case) already assumes it stays put
			EMIT(HLstmdb, EMIT_REG_NO_LR, regsMask &~ (1 << (31 - __builtin_clz(regsMask))), wbak);
			
			if (wbak) {
				
				//sub LR, LR, #4	//account for the top reg's slot
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t), EmitLeaveFlags, false);
			}
			
			ofst += nRegs - 1;		//now points at the top reg's slot
		}	
		else {
	
			if (wbak) {
				
				//sub LR, LR, #4 * numRegs
				EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
			}
			
			while (loRegs) {
				
				regA = __builtin_ctz(loRegs);
				loRegs &= loRegs - 1;
				
				if (loRegs) {
					
					regB = __builtin_ctz(loRegs);
					loRegs &= loRegs - 1;
				
					//strd regA, regB, [LR, #proper_imm]
					EMIT(LLstrdImm, regA, regB, EMIT_REG_NO_LR, sizeof(uint32_t) * ofst, EmitAdrModeIndex);
					ofst += 2;
				}
				else {
					
					//str regA, [LR, #proper_imm]
					EMIT(LLstoreImm, regA, EMIT_REG_NO_LR, sizeof(uint32_t) * ofst, EmitSzWord, EmitAdrModeIndex);
					ofst++;
				}
			}
		}
		if (hiRegs & spMask) {
			
			//str SP, [LR, #proper_imm]
			EMIT(LLstoreImm, EMIT_REG_NO_SP, EMIT_REG_NO_LR, sizeof(uint32_t) * ofst, EmitSzWord, EmitAdrModeIndex);
			ofst++;
		}
		
		//pick a scratch reg: the lowest listed reg if it was already stored and is not LR itself
		//(LR is our base - loading a constant into it would wreck every later address), else push r0
		if (loRegs & ~lrMask) {
			
			tmpReg = __builtin_ctz(regsMask);
		}
		else {
			
			tmpReg = 0;
			needPop = true;
		}
		
		if (hiRegs & lrMask) {
			
			uint32_t whichReg = EMIT_REG_NO_LR;
			
			if (wbak) {		//wbak
				
				if (!usedTmpReg && needPop) {
				
					//push {tmpReg}
					EMIT(HLpush, 1 << tmpReg);
				}
				usedTmpReg = true;
				whichReg = tmpReg;
				
				//add tmpReg, LR, #4 * numRegs	//LR was already lowered by 4 * numRegs, so this is its original value
				EMIT(LLaddImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
			}
			
			//str whichReg, [LR, #proper_imm]
			EMIT(LLstoreImm, whichReg, EMIT_REG_NO_LR, sizeof(uint32_t) * ofst, EmitSzWord, EmitAdrModeIndex);
			ofst++;
		}
		
		if (hiRegs & pcMask) {
			
			if (!usedTmpReg && needPop) {
				
				//push {tmpReg}
				EMIT(HLpush, 1 << tmpReg);
			}
			usedTmpReg = true;
			
			//ldr tmpReg, =PC_VAL
			EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
			
			//str tmpReg, [LR, #proper_imm]
			EMIT(LLstoreImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * ofst, EmitSzWord, EmitAdrModeIndex);
			ofst++;
		}
		
		if (usedTmpReg) {
			
			if (needPop) {
				
				//pop {tmpReg}
				EMIT(HLpop, 1 << tmpReg);
			}
			else {
				
				//ldr tmpReg, [lr, #proper_ofst]	//reload tmpReg from the slot of the lowest reg
				EMIT(LLloadImm, tmpReg, EMIT_REG_NO_LR, sizeof(uint32_t) * (wbak ? 1 : (1 - nRegs)), EmitSzWord, false, EmitAdrModeIndex);
			}
		}
	}
	
	return EmitErrNone;
}

//Emit "STMDA Rn{!}, {regsMask}" for a base that is neither SP nor PC.
//If Rn is in the list and wbak is on, the caller guarantees it is the lowest listed reg.
//Regs are stored lowest-first in the words at [Rn - 4 * numRegs + 4] .. [Rn]. Since Rn has to be
//decremented after the stores anyway, no writeback stmdb is used - a single SUB at the end does it.
static enum EmitStatus jitPrvStmdaSimpler(struct EmitBuf *dest, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, rnMask = 1 << rnNo;
	const uint32_t nRegs = jitPrvPopcount16(regsMask);
	uint32_t easyRegs, hardRegs, scratch, scratchPool;
	
	//easy regs end where SP (if listed) or else PC begins
	easyRegs = regsMask & (((regsMask & spMask) ? spMask : pcMask) - 1);
	hardRegs = regsMask - easyRegs;
	
	if (nRegs >= 3 && !(hardRegs & (hardRegs - 1))) {	//three or more regs, at most one of them hard: stmdb does the bulk
		
		const uint32_t topReg = 31 - __builtin_clz(regsMask);
		
		//stmdb Rn, {..all but the top reg..}
		EMIT(HLstmdb, rnNo, regsMask - (1 << topReg), false);
		
		//a top reg other than PC can be stored right away. PC is handled further down
		if (topReg != EMIT_REG_NO_PC) {
			
			//str topReg, [Rn]
			EMIT(LLstoreImm, topReg, rnNo, 0, EmitSzWord, EmitAdrModeIndex);
		}
	}
	else {												//otherwise go piecewise with strd/str
		
		uint32_t first, second, depth = nRegs - 1;		//how many words below Rn the next slot is
		
		while (easyRegs) {
			
			first = __builtin_ctz(easyRegs);
			easyRegs &= easyRegs - 1;
			
			if (easyRegs) {
				
				second = __builtin_ctz(easyRegs);
				easyRegs &= easyRegs - 1;
				
				//strd first, second, [Rn, #-4 * depth]
				EMIT(LLstrdImm, first, second, rnNo, -sizeof(uint32_t) * depth, EmitAdrModeIndex);
				depth -= 2;
			}
			else {
				
				//str first, [Rn, #-4 * depth]
				EMIT(LLstoreImm, first, rnNo, -sizeof(uint32_t) * depth, EmitSzWord, EmitAdrModeIndex);
				depth--;
			}
		}
		
		//SP and/or LR (whichever of them counted as hard) get stored one at a time
		for (easyRegs = hardRegs & (pcMask - 1); easyRegs; easyRegs &= easyRegs - 1) {
			
			first = __builtin_ctz(easyRegs);
			
			//str first, [Rn, #-4 * depth]
			EMIT(LLstoreImm, first, rnNo, -sizeof(uint32_t) * depth, EmitSzWord, EmitAdrModeIndex);
			depth--;
		}
		
		//only PC can be left at this point
	}
	
	if (regsMask & pcMask) {	//PC is the top reg, so its slot is always [Rn, #0]
		
		//candidates for a scratch reg: listed regs (their values are in memory already) except Rn and SP
		scratchPool = regsMask &~ (rnMask | spMask | pcMask);
		
		if (scratchPool) {
			
			scratch = __builtin_ctz(scratchPool);
		}
		else {					//nothing suitable in the list - borrow r0 (or r1 when Rn is r0)
			
			scratch = rnNo ? 0 : 1;
			
			//push {scratch}
			EMIT(HLpush, 1 << scratch);
		}
		
		//ldr scratch, =PC_VAL
		EMIT(HLloadImmToReg, scratch, instrAddr + 8, false, false, false);
		
		//str scratch, [Rn]
		EMIT(LLstoreImm, scratch, rnNo, 0, EmitSzWord, EmitAdrModeIndex);
		
		if (!scratchPool) {
			
			//pop {scratch}
			EMIT(HLpop, 1 << scratch);
		}
		else {
			
			//ldr scratch, [Rn, #proper_ofst]	//reload it: its slot sits (number of listed regs at or above it) - 1 words below Rn
			EMIT(LLloadImm, scratch, rnNo, sizeof(uint32_t) * (1 - jitPrvPopcount16(regsMask &~ ((1 << scratch) - 1))), EmitSzWord, false, EmitAdrModeIndex);
		}
	}
	
	if (wbak) {
		
		//sub Rn, Rn, #4 * num_regs
		EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
	}
	
	return EmitErrNone;
}

//Emit a translation of "STMDA Rn{!}, {regsMask}" executed under condition cc.
//	dest      - buffer the code is emitted into
//	cc        - condition of the original instruction (handled via jitPrvHandleCcStart/End)
//	instrAddr - address of the original instruction
//	rnNo      - base register number
//	regsMask  - bitmask of registers to store
//	wbak      - true if the base register is written back
//Returns EmitErrNotEncodeable for an empty list, a PC base, or a writeback form where Rn is listed
//but is not the lowest listed reg. Errors from the helper emitters are returned as-is.
enum EmitStatus jitEmitStmda(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	uint32_t nRegs;
	
	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on and it is not the lowest reg in the list
	//(the wbak test has to be a logical AND - with a bitwise "& wbak" this check could never fire,
	// and the helpers below rely on it having been done)
	if (!regsMask || rnNo == EMIT_REG_NO_PC || ((regsMask & rnMask) && wbak && (regsMask & (rnMask - 1))))
		return EmitErrNotEncodeable;
	
	if (rnNo == EMIT_REG_NO_SP && !wbak && (regsMask & (regsMask - 1))){
		loge("STMDA SP, {...} almost makes sense, but we can do it anyways\n");		//leaves garbage on stack
		
		//only the top reg will be written
		regsMask = 1 << (31 - __builtin_clz(regsMask));
		nRegs = 1;
	}
	else {
		
		nRegs = jitPrvPopcount16(regsMask);
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (nRegs == 1) {	//if just one reg and no wbak, store it using STR, if wbak, decrement after (as required)
		
		now = jitEmitImmMemStr(dest, EmitCcAl, instrAddr, 31 -__builtin_clz(regsMask), rnNo, 0, EmitAdrModeIndex, EmitSzWord);
		if (now != EmitErrNone)
			return now;
		
		if (wbak) {
			
			//sub Rn, Rn, #4
			EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		}
	}
	//we can use STMFD sometimes. Rn is ok if no wbak, SP is ok if last, PC is never ok
	//we wbak in here so this work on SP
	else if (nRegs >= 5 && (!(regsMask & rnMask) || !wbak) && (!(regsMask & spMask) || !(regsMask & (lrMask | pcMask))) && !(regsMask & pcMask)) {
	
		//str top_reg, [Rn]	//store top reg in place
		EMIT(LLstoreImm, 31 - __builtin_clz(regsMask), rnNo, 0, EmitSzWord, EmitAdrModeIndex);

		//stmdb Rn {!}, {regsMask minus top reg}
		EMIT(HLstmdb, rnNo, regsMask &~ (1 << (31 - __builtin_clz(regsMask))), wbak);

		if (wbak) {

			//sub Rn, Rn, #4	//stmdb only accounted for numRegs - 1 regs
			EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t), EmitLeaveFlags, false);
		}
	}
	else if (rnNo == EMIT_REG_NO_SP) {
		
		//multi-reg no-wbak lists on SP were cut down to a single reg above, so this should not trigger
		if (!wbak)
			fatal("STMDA to SP with no WBAK makes no sense\n");
		
		now = jitPrvStmdaToSp(dest, instrAddr, regsMask);		//yes wbak for sure
		if (now != EmitErrNone)
			return now;
	}
	else if (rnNo == EMIT_REG_NO_LR) {
		
		now = jitPrvStmdaToLr(dest, instrAddr, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}
	else {
		now = jitPrvStmdaSimpler(dest, instrAddr, rnNo, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}

	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	return EmitErrNone;
}

//////////////////////////////////////////////////////// END STMDA ////////////////////////////////////////////////////////

//////////////////////////////////////////////////////// START LDMDA ////////////////////////////////////////////////////////

//Emit "LDMDA LR{!}, {regsMask}" (LR is the base register).
//Regs are read lowest-first from the words at [LR - 4 * numRegs + 4] .. [LR]. With wbak, LR is lowered
//by 4 * numRegs up front and all offsets are taken from the new value.
//"Easy" regs are loaded directly; SP, LR and PC (the "hard" ones) are loaded last, since LR is the
//base, SP needs jitPrvEmitSwapSp and PC ends the sequence.
static enum EmitStatus jitPrvLdmdaToLr(struct EmitBuf *dest, uint32_t instrAddr, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR;
	const bool wantSp = !!(regsMask & spMask), wantLr = !!(regsMask & lrMask), wantPc = !!(regsMask & pcMask);
	uint32_t easyRegs, nRegs = jitPrvPopcount16(regsMask), first, second, pushMask, popMask, nPushed;
	enum EmitStatus now;
	int32_t slot = 1;
	
	//the easy part of the list ends at the first of SP, LR, PC that is listed
	if (wantSp)
		easyRegs = regsMask & (spMask - 1);
	else if (wantLr)
		easyRegs = regsMask & (lrMask - 1);
	else if (wantPc)
		easyRegs = regsMask & (pcMask - 1);
	else
		easyRegs = regsMask;
	
	//slot is the word offset from LR to the next reg's data
	if (wbak) {
		
		//sub LR, LR, #4 * numRegs		//lowest reg is now at [LR, #4]
		EMIT(LLsubImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
	}
	else {
		
		slot -= nRegs;					//lowest reg is at [LR, #4 * (1 - numRegs)]
	}
	
	//easy regs first, in pairs where possible
	while (easyRegs) {
		
		first = __builtin_ctz(easyRegs);
		easyRegs &= easyRegs - 1;
		
		if (!easyRegs) {				//a lone last reg
			
			//ldr first, [LR, #4 * slot]
			EMIT(LLloadImm, first, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitSzWord, false, EmitAdrModeIndex);
			slot++;
		}
		else {
			
			second = __builtin_ctz(easyRegs);
			easyRegs &= easyRegs - 1;
			
			//ldrd first, second, [LR, #4 * slot]
			EMIT(LLldrdImm, first, second, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitAdrModeIndex);
			slot += 2;
		}
	}
	
	//now the hard ones
	if (wantSp) {
		
		//r0 will carry the new SP and r1 the new PC (if any). with PC a third word is pushed as well,
		//its stack slot is where the new PC gets parked
		if (wantPc) {
			
			pushMask = 0x0007;
			popMask = pcMask | 0x0003;
			nPushed = 3;
		}
		else {
			
			pushMask = 0x0003;
			popMask = 0x0003;
			nPushed = 2;
		}
		
		//push {pushMask}
		EMIT(HLpush, pushMask);
		
		if (wantLr) {
			
			if (wantPc) {
				
				//ldr r1, [LR, #4 * (slot + 2)]		//PC's data follows SP's and LR's. must be read before LR changes
				EMIT(LLloadImm, 1, EMIT_REG_NO_LR, sizeof(uint32_t) * (slot + 2), EmitSzWord, false, EmitAdrModeIndex);
			}
			
			//ldrd r0, lr, [LR, #4 * slot]
			EMIT(LLldrdImm, 0, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitAdrModeIndex);
			slot += 2;
		}
		else if (wantPc) {
			
			//ldrd r0, r1, [LR, #4 * slot]
			EMIT(LLldrdImm, 0, 1, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitAdrModeIndex);
			slot += 2;
		}
		else {
			
			//ldr r0, [LR, #4 * slot]
			EMIT(LLloadImm, 0, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitSzWord, false, EmitAdrModeIndex);
			slot++;
		}
		
		if (wantPc) {
			
			//str r1, [SP, #4 * (nPushed - 1)]		//park new PC in the topmost pushed slot
			EMIT(LLstoreImm, 1, EMIT_REG_NO_SP, (nPushed - 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		}
		
		//switch over to the new SP
		now = jitPrvEmitSwapSp(dest, 0, 1, popMask, nPushed);
		if (now != EmitErrNone)
			return now;
	}
	else if (wantLr && !wantPc) {		//LR alone
		
		//ldr lr, [LR, #4 * slot]
		EMIT(LLloadImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitSzWord, false, EmitAdrModeIndex);
		slot++;
	}
	else if (wantLr) {					//LR together with PC
		
		//push {r0, r1}						//r1's slot only makes room for the new PC
		EMIT(HLpush, 0x0003);
		
		//ldr r0, [LR, #4 * (slot + 1)]		//fetch new PC
		EMIT(LLloadImm, 0, EMIT_REG_NO_LR, sizeof(uint32_t) * (slot + 1), EmitSzWord, false, EmitAdrModeIndex);
		
		//str r0, [SP, #4]					//park it where the pop will find it
		EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		
		//ldr lr, [LR, #4 * slot]			//fetch new LR
		EMIT(LLloadImm, EMIT_REG_NO_LR, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitSzWord, false, EmitAdrModeIndex);
		
		//pop {r0, pc}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, (1 << EMIT_REG_NO_PC) + 0x0001, true);
		if (now != EmitErrNone)
			return now;
	}
	else if (wantPc) {					//PC alone
		
		//ldr pc, [LR, #4 * slot]
		EMIT(LLloadImm, EMIT_REG_NO_PC, EMIT_REG_NO_LR, sizeof(uint32_t) * slot, EmitSzWord, false, EmitAdrModeIndex);
	}
	
	return EmitErrNone;
}

//Emit "LDMDA Rn{!}, {regsMask}" for the base registers that jitEmitLdmda does not route elsewhere.
//Regs are read lowest-first from the words at [Rn - 4 * numRegs + 4] .. [Rn]. With wbak, Rn is lowered
//by 4 * numRegs up front and all offsets are taken from the new value.
//Load order: easy regs below Rn, easy regs above Rn, then the complex ones (SP/LR/PC as applicable).
//If Rn itself is listed, its slot is skipped while walking the list (rnOfst remembers it) and Rn is
//loaded near the end, after every other memory access through it is done.
static enum EmitStatus jitPrvLdmdaLowRegBase(struct EmitBuf *dest, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	uint32_t loRegs, midRegs, i, nRegs = jitPrvPopcount16(regsMask), tmpReg1, tmpReg2, tmpReg3, pushRegs = 0, popRegs, nPushRegs = 0;
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	bool needPc = !!(regsMask & pcMask);
	int32_t ofst, rnOfst = 0;
	enum EmitStatus now;
	
	//sort out the easy regs from the hard regs
	midRegs = (regsMask & spMask) ? (regsMask & (spMask - 1)) : (regsMask & (pcMask - 1));	//sort out where complex regs start
	loRegs = midRegs & (rnMask - 1);			//regs below Rn
	midRegs &=~ (2 * rnMask - 1);				//regs above Rn and below the complex ones
	
	//wbak if needed, find where we'll start
	if (wbak) {
		
		//sub Rn, Rn, #wbak_adjust
		EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
		
		ofst = 1;
	}
	else {
		
		ofst = 1 - nRegs;
	}
	
	//load easy regs (pass 0: those below Rn, pass 1: those above Rn)
	for (i = 0; i < 2; i++) {
		while (loRegs) {
			
			tmpReg1 = __builtin_ctz(loRegs);
			loRegs &= loRegs - 1;
			
			if (loRegs) {			//at least two regs in the list
				
				tmpReg2 = __builtin_ctz(loRegs);
				loRegs &= loRegs - 1;
				
				//ldrd tmpReg1, tmpReg2, [Rn, #proper_ofst]
				EMIT(LLldrdImm, tmpReg1, tmpReg2, rnNo, sizeof(uint32_t) * ofst, EmitAdrModeIndex);
				ofst += 2;
			}
			else {					//one reg only
				
				//ldr tmpReg1, [Rn, #proper_ofst]
				EMIT(LLloadImm, tmpReg1, rnNo, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
				ofst++;
			}
		}
		
		if (!i) {
			loRegs = midRegs;
			
			//between the two passes is where Rn's own slot lives - remember it and step over it
			//(test against rnMask: rnNo is a register NUMBER and says nothing about list membership)
			if (regsMask & rnMask)
				rnOfst = ofst++;
		}
	}
	
	//see what we need to do for hard regs
	if (regsMask & spMask) {	
		
		//we need 3 temp regs, 4 if loading pc. We can use r0..r2(r3)
		
		tmpReg1 = jitUtilPickLowestClearBit(pushRegs | rnMask);
		pushRegs |= 1 << tmpReg1;
		nPushRegs++;
		tmpReg2 = jitUtilPickLowestClearBit(pushRegs | rnMask);
		pushRegs |= 1 << tmpReg2;
		nPushRegs++;
		popRegs = pushRegs;
		
		if (regsMask & pcMask) {
			
			//one more pushed word: its stack slot is where the new pc gets parked
			tmpReg3 = jitUtilPickLowestClearBit(pushRegs | rnMask);
			pushRegs |= 1 << tmpReg3;
			nPushRegs++;
			popRegs |= pcMask;
		}
		
		//we want new sp val in tmpReg1, new pc val in tmpReg2
		
		//push {pushRegs}
		EMIT(HLpush, pushRegs);
		
		if (regsMask & lrMask) {
			
			//ldrd tmpReg1, lr, [Rn, #proper_ofst]
			EMIT(LLldrdImm, tmpReg1, EMIT_REG_NO_LR, rnNo, sizeof(uint32_t) * ofst, EmitAdrModeIndex);
			ofst += 2;
		}
		else if (regsMask & pcMask) {
			
			//ldrd tmpReg1, tmpReg2, [Rn, #proper_ofst]
			EMIT(LLldrdImm, tmpReg1, tmpReg2, rnNo, sizeof(uint32_t) * ofst, EmitAdrModeIndex);
			ofst += 2;
			
			needPc = false;			//pc val already fetched by the ldrd
		}
		else {
			
			//ldr tmpReg1, [Rn, #proper_ofst]
			EMIT(LLloadImm, tmpReg1, rnNo, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
			ofst++;
		}
		
		if (needPc){
			
			//ldr tmpReg2, [Rn, #proper_ofst]
			EMIT(LLloadImm, tmpReg2, rnNo, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
			ofst++;
		}
		
		if (regsMask & rnMask) {		//load Rn if needed
			
			//ldr Rn, [Rn, #proper_ofst]
			EMIT(LLloadImm, rnNo, rnNo, sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
		}
		
		if (regsMask & pcMask) {		//store pc into proper place
			
			//str tmpReg2, [SP, #4 * (nPushRegs - 1)]
			EMIT(LLstoreImm, tmpReg2, EMIT_REG_NO_SP, (nPushRegs - 1) * sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		}
		
		//swap sp
		now = jitPrvEmitSwapSp(dest, tmpReg1, tmpReg2, popRegs, nPushRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if ((regsMask & (pcMask | rnMask)) == (pcMask | rnMask)) {				//rn and pc both
		
		//tmpReg1 carries the new pc, tmpReg2 is only pushed to make the stack slot the pc is popped from
		tmpReg1 = jitUtilPickLowestClearBit(pushRegs | rnMask);
		pushRegs |= 1 << tmpReg1;
		popRegs = pushRegs | pcMask;
		tmpReg2 = jitUtilPickLowestClearBit(pushRegs | rnMask);
		pushRegs |= 1 << tmpReg2;
		
		//push {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		//ldr tmpReg1, [Rn, #proper_ofst]		//load pc
		EMIT(LLloadImm, tmpReg1, rnNo, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
		
		//ldr Rn, [Rn, #proper_ofst]
		EMIT(LLloadImm, rnNo, rnNo, sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
		
		//str tmpReg1, [SP, #4]					//store pc into the proper place
		EMIT(LLstoreImm, tmpReg1, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
		
		//pop {..popRegs..}
		now = jitPrvEmitPopWithOpts(dest, EmitCcAl, popRegs, true);
		if (now != EmitErrNone)
			return now;
	}
	else if (regsMask & pcMask) {								//just pc
		
		//ldr pc, [Rn, #proper_ofst]		//load pc
		EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, sizeof(uint32_t) * ofst, EmitSzWord, false, EmitAdrModeIndex);
	}
	else if (regsMask & rnMask) {								//need to load Rn
		
		//ldr Rn, [Rn, #proper_ofst]
		EMIT(LLloadImm, rnNo, rnNo, sizeof(uint32_t) * rnOfst, EmitSzWord, false, EmitAdrModeIndex);
	}
	
	return EmitErrNone;
}

//Emit a translation of "LDMDA Rn{!}, {regsMask}" executed under condition cc.
//	dest      - buffer the code is emitted into
//	cc        - condition of the original instruction (handled via jitPrvHandleCcStart/End)
//	instrAddr - address of the original instruction
//	rnNo      - base register number
//	regsMask  - bitmask of registers to load
//	wbak      - true if the base register is written back
//Returns EmitErrNotEncodeable for an empty list, a PC base, a writeback form that also loads Rn,
//or an SP base with more than one reg. Errors from the helper emitters are returned as-is.
enum EmitStatus jitEmitLdmda(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rnNo, uint32_t regsMask, bool wbak)
{
	const uint32_t pcMask = 1 << EMIT_REG_NO_PC, spMask = 1 << EMIT_REG_NO_SP, lrMask = 1 << EMIT_REG_NO_LR, rnMask = 1 << rnNo;
	uint32_t nRegs, tmpReg;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//we need at least one reg, PC is not allowed as the base reg, Rn cannot be in the list if wbak is on
	//(this has to be a logical AND - a bitwise "& wbak" only tests bit 0 and thus only ever caught Rn == r0.
	// the paths below that handle a listed Rn do no writeback at all, so they depend on this check)
	if (!regsMask || rnNo == EMIT_REG_NO_PC || ((regsMask & rnMask) && wbak))
		return EmitErrNotEncodeable;
	
	nRegs = jitPrvPopcount16(regsMask);
	
	if (rnNo == EMIT_REG_NO_SP && nRegs != 1) {
		loge("LDMDA SP{!}, {...} makes no sense with more than one reg\n");
		return EmitErrNotEncodeable;
	}
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (nRegs == 1) {	//if just one reg, load it using LDR
		
		now = jitEmitImmMemLdr(dest, EmitCcAl, instrAddr, false, 31 -__builtin_clz(regsMask), rnNo, wbak ? -4 : 0, wbak ? EmitAdrModePostindex : EmitAdrModeIndex, EmitSzWord);
		if (now != EmitErrNone)
			return now;
	}
	else if (nRegs >= 3 && !(regsMask & (pcMask | spMask))) {	//no PC or SP, Rn possibly in the list
		
		if ((regsMask &~ (rnMask - 1)) == rnMask) {				//Rn present & is the highest reg - load others first
			
			//ldmdb Rn, {..regs_except_top_which_is_rn..}
			EMIT(LLldmdb, rnNo, regsMask &~ rnMask, false);
			
			//ldr Rn, [Rn]
			EMIT(LLloadImm, rnNo, rnNo, 0, EmitSzWord, false, EmitAdrModeIndex);
		}
		else if (regsMask & rnMask) {							//Rn present, not top reg (and we know Rn is not SP since sp not present and Rn is)
			
			tmpReg = 31 - __builtin_clz(regsMask);	//find top reg
			
			//ldr tmpReg, [Rn]		//top reg first, while Rn is still intact
			EMIT(LLloadImm, tmpReg, rnNo, 0, EmitSzWord, false, EmitAdrModeIndex);
			
			//ldmdb Rn, {..regs_except_top_which_is_not_rn..}
			EMIT(LLldmdb, rnNo, regsMask &~ (1 << tmpReg), false);
		}
		else {													//Rn not present, might be wbak
			
			tmpReg = 31 - __builtin_clz(regsMask);	//find top reg
			
			//ldmdb Rn, {..regs_except_top..}
			EMIT(LLldmdb, rnNo, regsMask &~ (1 << tmpReg), false);
			
			//ldr tmpReg, [Rn], #-wbak_adjust
			EMIT(LLloadImm, tmpReg, rnNo, wbak ? -sizeof(uint32_t) * nRegs : 0, EmitSzWord, false, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
		}
	}
	else if (nRegs >= 3 && (regsMask & (rnMask | spMask | lrMask | pcMask)) == spMask) {	//SP present & is highest reg, no Rn in the list
		
		//ldmdb Rn, {..regs_except_top..}
		EMIT(LLldmdb, rnNo, regsMask &~ spMask, false);
		
		//ldr sp, [Rn]	//cannot wbak because loading sp with wbak isnt allowed
		EMIT(LLloadImm, EMIT_REG_NO_SP, rnNo, 0, EmitSzWord, false, EmitAdrModeIndex);
		
		if (wbak) {
			
			//sub Rn, Rn, #wbak_adjust
			EMIT(LLsubImm, rnNo, rnNo, sizeof(uint32_t) * nRegs, EmitLeaveFlags, false);
		}
	}
	else if (nRegs >= 3 && (regsMask & (rnMask | spMask | lrMask | pcMask)) == pcMask) {	//PC present, no SP in list, no LR in list, no Rn in the list
		
		//ldmdb Rn, {..regs_except_top..}
		EMIT(LLldmdb, rnNo, regsMask &~ pcMask, false);
		
		//ldr pc, [Rn], #-wbak_adjust_if_needed
		EMIT(LLloadImm, EMIT_REG_NO_PC, rnNo, wbak ? -sizeof(uint32_t) * nRegs : 0, EmitSzWord, false, wbak ? EmitAdrModePostindex : EmitAdrModeIndex);
	}
	else if (rnNo == EMIT_REG_NO_LR) {
		
		now = jitPrvLdmdaToLr(dest, instrAddr, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		now = jitPrvLdmdaLowRegBase(dest, instrAddr, rnNo, regsMask, wbak);
		if (now != EmitErrNone)
			return now;
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;

	return EmitErrNone;
}

//////////////////////////////////////////////////////// END LDMDA ////////////////////////////////////////////////////////



//Emit "STRD Rt, Rt2, [Rn, #imm]" in the given addressing mode, executed under condition cc.
//will not be asked to handle Rn == PC or index mode on SP with imm > 0
//Three strategies, picked by operands:
//	- two STRs when SP is one of the stored regs, or when Rn is one of them in plain index mode
//	- adjust Rn / STRD [Rn] / adjust Rn when imm is not a multiple of 4 or Rn is one of the stored regs
//	- a plain STRD otherwise
//Conditional execution is done with an IT block sized to the number of instructions emitted.
static enum EmitStatus jitPrvImmMemStrdNormal(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rtNo2, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	if ((rtNo == EMIT_REG_NO_SP || rtNo2 == EMIT_REG_NO_SP) || ((rtNo == rnNo || rtNo2 == rnNo) && adrMode == EmitAdrModeIndex)){
		
		if (cc != EmitCcAl)
			EMIT(LLitt, cc);
		
		//sp means no STRD, must use two STRs, as does index mode with an unencodeable imm
		switch (adrMode) {
			
			case EmitAdrModeIndex:
				EMIT(LLstoreImm, rtNo, rnNo, imm, EmitSzWord, EmitAdrModeIndex);
				EMIT(LLstoreImm, rtNo2, rnNo, imm + sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				break;
			
			case EmitAdrModePostindex:
				//Rn has moved by imm after the first store, so the second word is at [Rn, #4 - imm]
				EMIT(LLstoreImm, rtNo, rnNo, imm, EmitSzWord, EmitAdrModePostindex);
				EMIT(LLstoreImm, rtNo2, rnNo, sizeof(uint32_t) - imm, EmitSzWord, EmitAdrModeIndex);
				break;
			
			case EmitAdrModeIndexWbak:
				EMIT(LLstoreImm, rtNo, rnNo, imm, EmitSzWord, EmitAdrModeIndexWbak);
				EMIT(LLstoreImm, rtNo2, rnNo, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
				break;
		}
	}
	else if ((imm & 3) || rtNo == rnNo || rtNo2 == rnNo) {											//imm not multiple of 4 is unencodeable as a strd, nor is wbak mode with rN
		
		//index mode emits three instrs (adjust, strd, undo adjust), postindex and index-wbak only two
		//(strd plus one adjust). the IT block must cover exactly that many, or it would make
		//whatever instr gets emitted after us conditional too
		if (cc != EmitCcAl) {
			
			if (adrMode == EmitAdrModeIndex)
				EMIT(LLittt, cc);
			else
				EMIT(LLitt, cc);
		}
		
		if (adrMode != EmitAdrModePostindex) {		//offset applies before the store
			
			if (imm >= 0)
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else if (imm < 0)
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
			
			if (adrMode == EmitAdrModeIndex)		//plain index: Rn must be restored afterwards
				imm = -imm;
		}
		
		EMIT(LLstrdImm, rtNo, rtNo2, rnNo, 0, EmitAdrModeIndex);
		
		if (adrMode != EmitAdrModeIndexWbak) {		//postindex: apply offset now. index: undo the adjustment
			
			if (imm >= 0)
				EMIT(LLaddImm, rnNo, rnNo, imm, EmitLeaveFlags, cc != EmitCcAl);
			else if (imm < 0)
				EMIT(LLsubImm, rnNo, rnNo, -imm, EmitLeaveFlags, cc != EmitCcAl);
		}
	}
	else {																//normal STRD
		
		if (cc != EmitCcAl)
			EMIT(LLit, cc);
		EMIT(LLstrdImm, rtNo, rtNo2, rnNo, imm, adrMode);
	}
	
	return EmitErrNone;
}


//Emit "STRD Rt, Rt2, [PC, #imm]" executed under condition cc.
//The effective address (instrAddr + 8 + imm) is known at translation time, so it is loaded into a
//temp reg as a constant and the store goes through jitPrvImmMemStrdNormal with that reg as the base.
//If SP is one of the stored regs and temp regs had to be pushed, the pre-push SP value is rebuilt in
//a second temp reg and stored in SP's place.
//Returns EmitErrNotEncodeable for any addressing mode other than EmitAdrModeIndex.
static enum EmitStatus jitPrvImmMemStrdToPc(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rtNo2, int32_t imm, enum EmitAddrMode adrMode)
{
	uint32_t nPushedRegs, pushedRegs, tmpBaseReg, spPlaceholderReg, ea = instrAddr + 8 + imm;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	
	//pc base cannot be used with writeback
	if (adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//NOTE(review): return value is used as the number of regs that will be pushed - confirm against jitPrvFindTempRegs
	nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rtNo2), &pushedRegs, NULL, false, &tmpBaseReg, NULL);
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
		
	if (nPushedRegs && (rtNo == EMIT_REG_NO_SP || rtNo2 == EMIT_REG_NO_SP)) {
		
		//pushing will move SP, so ask again, this time also for a reg to hold the original SP value
		nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rtNo2), &pushedRegs, NULL, false, &tmpBaseReg, &spPlaceholderReg, NULL);
		
		//PUSH {..pushedRegs..}
		EMIT(HLpush, pushedRegs);
		
		//ADD spPlaceholderReg, #sp, #4 * nPushedRegs		//if sp is needed, calc it
		EMIT(LLaddImm, spPlaceholderReg, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);
		
		if (rtNo == EMIT_REG_NO_SP)
			rtNo = spPlaceholderReg;
		if (rtNo2 == EMIT_REG_NO_SP)
			rtNo2 = spPlaceholderReg;
	}
	else {
		
		//PUSH {..pushedRegs..}
		EMIT(HLpush, pushedRegs);
	}
	
	//LDR tmpBaseReg, =ea									//get ea
	EMIT(HLloadImmToReg, tmpBaseReg, ea, false, false, false);
	
	//strd rtNo, rtNo2, [tmpBaseReg]
	//ea already has imm folded in, so the offset passed on must be zero (passing imm would apply it twice)
	now = jitPrvImmMemStrdNormal(dest, EmitCcAl, instrAddr, rtNo, rtNo2, tmpBaseReg, 0, EmitAdrModeIndex);
	if (now != EmitErrNone)
		return now;
	
	//POP {..pushedRegs..}
	EMIT(HLpop, pushedRegs);
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Emits the translation of a STRD whose base register is SP.
//Rejects offsets that are not word-aligned, index-mode stores that land entirely below
//SP, and postindex stores whose SP adjustment would move SP past the stored data.
//Everything else is handed to jitPrvImmMemStrdNormal() with SP as the base and the
//caller's addressing mode.
//	dest      - buffer to emit into
//	cc        - condition under which the store should execute
//	instrAddr - address of the instruction being translated
//	rtNo      - first reg to store
//	rtNo2     - second reg to store
//	imm       - signed offset from SP
//	adrMode   - addressing mode of the original instruction
//returns EmitErrNone on success, EmitErrNotEncodeable for the rejected forms, else the
//error reported by whichever callee failed
static enum EmitStatus jitPrvImmMemStrdToSp(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rtNo2, int32_t imm, enum EmitAddrMode adrMode)
{
	enum EmitStatus now;
	
	//sp offsets must be word-aligned.
	if (imm & 3) {
		
		loge("stack will always be aligned and thus unaligned imm in STRD makes no sense\n");
		return EmitErrNotEncodeable;
	}
	
	if (adrMode == EmitAdrModeIndex && imm <= -8) {
		
		loge("STRD fully below SP makes no sense\n");
		return EmitErrNotEncodeable;
	}
	else if (adrMode == EmitAdrModePostindex && imm >= 8) {
		
		loge("STRD adjusts SP to hide what was stored!\n");
		return EmitErrNotEncodeable;
	}
	
	//NOTE(review): the alignment check above already rejects every imm with (imm & 3) set, so this
	//branch cannot currently be entered. it is kept (and kept correct) in case that check is ever relaxed
	if (adrMode == EmitAdrModeIndex && (imm & 3) && imm > 0) {			//we cannot move SP up as we would in jitPrvImmMemStrdNormal(), so we have work to do...
		
		uint32_t nPushedRegs, pushedRegs, tmpBaseReg, spPlaceholderReg, spSrc = EMIT_REG_NO_SP;
		struct EmitBuf ccSkip;
		int32_t baseOfst = imm;
		
		nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rtNo2), &pushedRegs, NULL, false, &tmpBaseReg, NULL);
		
		now = jitPrvHandleCcStart(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
		
		if (nPushedRegs && (rtNo == EMIT_REG_NO_SP || rtNo2 == EMIT_REG_NO_SP)) {
			
			nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rtNo2), &pushedRegs, NULL, false, &tmpBaseReg, &spPlaceholderReg, NULL);
			
			//PUSH {..pushedRegs..}
			EMIT(HLpush, pushedRegs);
			
			//ADD spPlaceholderReg, #sp, #4 * nPushedRegs		//if sp is needed, calc it
			EMIT(LLaddImm, spPlaceholderReg, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);
			
			if (rtNo == EMIT_REG_NO_SP)
				rtNo = spPlaceholderReg;
			if (rtNo2 == EMIT_REG_NO_SP)
				rtNo2 = spPlaceholderReg;
			
			spSrc = spPlaceholderReg;
		}
		else {
			
			//PUSH {..pushedRegs..}
			EMIT(HLpush, pushedRegs);
		}
		
		//when we index off the live SP, the PUSH above has moved it down by 4 * nPushedRegs. undo that in the offset
		//(spPlaceholderReg already holds the pre-push value and needs no correction)
		if (spSrc == EMIT_REG_NO_SP)
			baseOfst += (int32_t)(sizeof(uint32_t) * nPushedRegs);
		
		//ADD tmpBaseReg, spSrc, #baseOfst
		EMIT(LLaddImm, tmpBaseReg, spSrc, baseOfst, EmitLeaveFlags, false);
		
		//strd rtNo, rtNo2, [tmpBaseReg]
		//tmpBaseReg already includes imm, so the store itself uses a zero offset
		now = jitPrvImmMemStrdNormal(dest, EmitCcAl, instrAddr, rtNo, rtNo2, tmpBaseReg, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
		
		//POP {..pushedRegs..}
		EMIT(HLpop, pushedRegs);
	
		now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		//strd rtNo, rtNo2, SP_WITH_ADR_MODE
		//the caller's addressing mode must be honoured here, else writeback/postindex forms would never update SP
		now = jitPrvImmMemStrdNormal(dest, cc, instrAddr, rtNo, rtNo2, EMIT_REG_NO_SP, imm, adrMode);
		if (now != EmitErrNone)
			return now;
	}
	
	return EmitErrNone;
}

enum EmitStatus jitEmitImmMemStrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	uint32_t rtNo2 = rtNo + 1;
	
	//with zero imm, all addr modes are equal
	if (!imm)
		adrMode = EmitAdrModeIndex;
	
	//reg must be even, cannot be the LR & PC pair
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR)
		return EmitErrNotEncodeable;
	
	//we restrict offsets to what ARMv5 allows
	if (imm <= -0x100 || imm >= 0x100)
		return EmitErrNotEncodeable;
	
	if (rnNo == EMIT_REG_NO_PC)		//index mode guaranteed
		return jitPrvImmMemStrdToPc(dest, cc, instrAddr, rtNo, rtNo2, imm, adrMode);
	else if (rnNo == EMIT_REG_NO_SP)
		return jitPrvImmMemStrdToSp(dest, cc, instrAddr, rtNo, rtNo2, imm, adrMode);
	else
		return jitPrvImmMemStrdNormal(dest, cc, instrAddr, rtNo, rtNo2, rnNo, imm, adrMode);
}

enum EmitStatus jitEmitImmMemLdrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, int32_t imm, enum EmitAddrMode adrMode)
{
	uint32_t rtNo2 = rtNo + 1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//with zero imm, all addr modes are equal
	if (!imm)
		adrMode = EmitAdrModeIndex;
	
	//reg must be even, cannot be the LR & PC pair
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR)
		return EmitErrNotEncodeable;
	
	//load to base reg not allowed with wbak
	if ((rtNo == rnNo || rtNo2 == rnNo) && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//we restrict offsets to what ARMv5 allows
	if (imm <= -0x100 || imm >= 0x100)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (rnNo == EMIT_REG_NO_PC) {
		
		//pc base cannot be used with writeback
		if (adrMode != EmitAdrModeIndex)
			return EmitErrNotEncodeable;
		
		rnNo = (rtNo2 == EMIT_REG_NO_SP) ? rtNo : rtNo2;
			
		//pc base cannot be used with writeback
		if (adrMode != EmitAdrModeIndex)
			return EmitErrNotEncodeable;
		
		EMIT(HLloadImmToReg, rnNo, instrAddr + 8 + imm, false, false, false);
	}
	
	if (rtNo2 == EMIT_REG_NO_SP && rtNo == rnNo) {			//loading SP and base reg is hard
		
		uint32_t nPushedRegs, pushRegs, spNewVal, swpSwapReg;
		
		nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rtNo2) | (1 << rnNo), &pushRegs, NULL, false, &spNewVal, &swpSwapReg, NULL);
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if(rtNo == rnNo) {									//indexed mode guaranteed in this case
			
			//LDR spNewVal, [Rn, #imm + 4]
			EMIT(LLloadImm, spNewVal, rnNo, imm + sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			
			//LDR Rn, [Rn, #imm]
			EMIT(LLloadImm, rnNo, rnNo, imm, EmitSzWord, false, EmitAdrModeIndex);
		}
		else {												//if adrMode is not index, Rn guaranteed not to be SP since SP being loaded
			
			//LDRD Rt, spNewVal, Rn_WITH_ADDR_MODE
			EMIT(LLldrdImm, rtNo, spNewVal, rnNo, imm, adrMode);
		}

		now = jitPrvEmitSwapSp(dest, spNewVal, swpSwapReg, pushRegs, nPushedRegs);
		if (now != EmitErrNone)
			return now;
	}
	else if (rtNo == rnNo) {	//work around erratum 602117. index mode guaranteed since else Rt could not equal Rn
		
		//LDR Rt2, [Rn, #imm + 4]
		EMIT(LLloadImm, rtNo2, rnNo, imm + sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
			
		//LDR Rt, [Rn, #imm]
		EMIT(LLloadImm, rtNo, rnNo, imm, EmitSzWord, false, EmitAdrModeIndex);
	}
	else if (!(imm & 3) && rtNo2 != EMIT_REG_NO_SP) {
		
		//LDRD Rt, Rt2, Rn_WITH_ADDR_MODE
		EMIT(LLldrdImm, rtNo, rtNo2, rnNo, imm, adrMode);
	}
	else if (adrMode == EmitAdrModeIndex) {
		
		//LDR Rt, [Rn, #imm]
		EMIT(LLloadImm, rtNo, rnNo, imm, EmitSzWord, false, EmitAdrModeIndex);
		
		//LDR Rt2, [Rn, #imm + 4]
		EMIT(LLloadImm, rtNo2, rnNo, imm + sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
	}
	else if (adrMode == EmitAdrModeIndexWbak) {
		
		//LDR Rt, [Rn, #imm]!
		EMIT(LLloadImm, rtNo, rnNo, imm, EmitSzWord, false, EmitAdrModeIndexWbak);
		
		//LDR Rt2, [Rn, #4]
		EMIT(LLloadImm, rtNo2, rnNo, sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
	}
	else {			//postindex mode
		
		//LDR Rt2, [Rn, #4]
		EMIT(LLloadImm, rtNo2, rnNo, sizeof(uint32_t), EmitSzWord, false, EmitAdrModeIndex);
		
		//LDR Rt, [Rn], #imm
		EMIT(LLloadImm, rtNo, rnNo, imm, EmitSzWord, false, EmitAdrModePostindex);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitRegRegMemLdrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitAddrMode adrMode)
{
	uint32_t rtNo2 = rtNo + 1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;

	//reg must be even, cannot be the LR & PC pair
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR)
		return EmitErrNotEncodeable;
	
	//load to base reg not allowed with wbak
	if ((rtNo == rnNo || rtNo2 == rnNo) && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	//Rm not allowed as one of the output regs
	if (rtNo == rmNo || rtNo2 == rmNo)
		return EmitErrNotEncodeable;
	
	//PC not allowed as Rm
	if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	//Rn == PC cannot use writeback
	if (rnNo == EMIT_REG_NO_PC && adrMode != EmitAdrModeIndex)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//ADD/SUB Rn, Rn, Rm
		now = jitEmitAluOpRegShiftImm(dest, EmitCcAl, isAdd ? ArmDpOpAdd : ArmDpOpSub, instrAddr, rnNo, rnNo, rmNo, EmitShiftLsl, 0, false);
		if (now != EmitErrNone)
			return now;
		
		//LDRD Rt, Rt2, [Rn]
		now = jitEmitImmMemLdrd(dest, EmitCcAl, instrAddr, rtNo, rnNo, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
	}
	else if (adrMode == EmitAdrModePostindex) {	//safe to do this since Rm is not a destination
		
		//LDRD Rt, Rt2, [Rn]
		now = jitEmitImmMemLdrd(dest, EmitCcAl, instrAddr, rtNo, rnNo, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
		
		//ADD/SUB Rn, Rn, Rm
		now = jitEmitAluOpRegShiftImm(dest, EmitCcAl, isAdd ? ArmDpOpAdd : ArmDpOpSub, instrAddr, rnNo, rnNo, rmNo, EmitShiftLsl, 0, false);
		if (now != EmitErrNone)
			return now;
	}
	else {
		
		uint32_t tmpReg = (rtNo2 == EMIT_REG_NO_SP) ? rtNo : rtNo2;
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR tmpReg, =PC_VAL
			EMIT(HLloadImmToReg, tmpReg, instrAddr + 8, false, false, false);
			rnNo = tmpReg;
		}
		
		//ADD/SUB tmpReg, Rn, Rm
		now = jitEmitAluOpRegShiftImm(dest, EmitCcAl, isAdd ? ArmDpOpAdd : ArmDpOpSub, instrAddr, tmpReg, rnNo, rmNo, EmitShiftLsl, 0, false);
		if (now != EmitErrNone)
			return now;
		
		//LDRD Rt, Rt2, [tmpReg]
		now = jitEmitImmMemLdrd(dest, EmitCcAl, instrAddr, rtNo, tmpReg, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

enum EmitStatus jitEmitRegRegMemStrd(struct EmitBuf *dest, enum EmitCc cc, uint32_t instrAddr, uint32_t rtNo, uint32_t rnNo, bool isAdd, uint32_t rmNo, enum EmitAddrMode adrMode)
{
	uint32_t nPushedRegs, pushRegs, baseReg, spCopyReg, rtNo2 = rtNo + 1;
	struct EmitBuf ccSkip;
	enum EmitStatus now;
	bool wantSp, needSp;

	//reg must be even, cannot be the LR & PC pair
	if ((rtNo & 1) || rtNo == EMIT_REG_NO_LR)
		return EmitErrNotEncodeable;
	
	//PC not allowed as Rm
	if (rmNo == EMIT_REG_NO_PC)
		return EmitErrNotEncodeable;
	
	now = jitPrvHandleCcStart(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	if (adrMode == EmitAdrModeIndexWbak) {
		
		//no wbak to PC
		if (rnNo == EMIT_REG_NO_PC)
			return EmitErrNotEncodeable;
		
		//ADD/SUB Rn, Rn, Rm
		now = jitEmitAluOpRegShiftImm(dest, cc, isAdd ? ArmDpOpAdd : ArmDpOpSub, instrAddr, rnNo, rnNo, rmNo, EmitShiftLsl, 0, false);
		if (now != EmitErrNone)
			return now;
		
		//STRD Rt, Rt2, [Rn]
		now = jitEmitImmMemStrd(dest, EmitCcAl, instrAddr, rtNo, rnNo, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
	}
	else if (adrMode == EmitAdrModePostindex) {
		
		//no wbak to PC
		if (rnNo == EMIT_REG_NO_PC)
			return EmitErrNotEncodeable;
		
		//STRD Rt, Rt2, [Rn]
		now = jitEmitImmMemStrd(dest, EmitCcAl, instrAddr, rtNo, rnNo, 0, EmitAdrModeIndex);
		if (now != EmitErrNone)
			return now;
		
		//ADD/SUB Rn, Rn, Rm
		now = jitEmitAluOpRegShiftImm(dest, EmitCcAl, isAdd ? ArmDpOpAdd : ArmDpOpSub, instrAddr, rnNo, rnNo, rmNo, EmitShiftLsl, 0, false);
		if (now != EmitErrNone)
			return now;
	}
	else {			//index mode
		
		nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rnNo) | (1 << rmNo), &pushRegs, NULL, false, &baseReg, NULL);
		wantSp = rtNo2 == EMIT_REG_NO_SP || rmNo == EMIT_REG_NO_SP;
		needSp = !isAdd && (rmNo == EMIT_REG_NO_SP);
		
		if ((nPushedRegs && wantSp) || needSp)
			nPushedRegs = jitPrvFindTempRegs(instrAddr, (1 << rtNo) | (1 << rnNo) | (1 << rmNo), &pushRegs, NULL, false, &baseReg, &spCopyReg, NULL);
		else
			spCopyReg = EMIT_REG_NO_SP;
		
		//PUSH {..pushRegs..}
		EMIT(HLpush, pushRegs);
		
		if (rnNo == EMIT_REG_NO_PC) {
			
			//LDR baseReg, =PC_VAL
			EMIT(HLloadImmToReg, baseReg, instrAddr + 8, false, false, false);
			rnNo = baseReg;
		}
		
		if (spCopyReg != EMIT_REG_NO_SP) {
			
			//ADD spCopyReg, SP, #4 * nPushedRegs
			EMIT(LLaddImm, spCopyReg, EMIT_REG_NO_SP, sizeof(uint32_t) * nPushedRegs, EmitLeaveFlags, false);
			
			if (rnNo == EMIT_REG_NO_SP)
				rnNo = spCopyReg;
			if (rmNo == EMIT_REG_NO_SP)
				rmNo = spCopyReg;
			if (rtNo2 == EMIT_REG_NO_SP)
				rtNo2 = spCopyReg;
		}
		
		//ADD/SUB baseReg, rnNo, Rm
		if (isAdd)
			EMIT(LLaddReg, baseReg, rnNo, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, false);
		else
			EMIT(LLsubReg, baseReg, rnNo, rmNo, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//STRD Rt, Rt2, [baseReg]
		EMIT(LLstrdImm, rtNo, rtNo2, baseReg, 0, EmitAdrModeIndex);
		
		//POP {..pushRegs..}
		EMIT(HLpop, pushRegs);
	}
	
	now = jitPrvHandleCcEnd(dest, &ccSkip, cc);
	if (now != EmitErrNone)
		return now;
	
	return EmitErrNone;
}

//Loads the constant val into rdNo, choosing between a literal load and the generic
//immediate-load sequence.
//variable length - not suitable for cc
//The literal load is chosen only when all of these hold: rdNo is a low reg, the top
//halfword of val is nonzero and differs from the bottom halfword, the top 17 bits of
//val are not all ones, and the distance from val's highest to lowest set bit exceeds 8 bits.
//canCorruptNZ, canCorruptC and isInIt are only consulted on the generic path.
//returns EmitErrNone on success, else the error reported by the path taken
enum EmitStatus jitEmitLoadImmToReg(struct EmitBuf *dest, uint32_t rdNo, uint32_t val, bool canCorruptNZ, bool canCorruptC, bool isInIt)
{
	const uint32_t topHalf = val >> 16, bottomHalf = val & 0xffff;
	bool viaLiteral = false;
	
	//heuristic: could we do it better via a literal load?
	if (EMIT_IS_LOREG(rdNo) && topHalf && topHalf != bottomHalf && (val >> 15) != 0x1ffff) {
		
		//val is known nonzero here (topHalf != 0), so clz/ctz are well-defined
		const int significantSpan = 32 - __builtin_clz(val) - __builtin_ctz(val);
		
		viaLiteral = significantSpan > 8;
	}
	
	if (!viaLiteral) {
		
		EMIT(HLloadImmToReg, rdNo, val, canCorruptNZ, canCorruptC, isInIt);
		return EmitErrNone;
	}
	
	return jitPrvLiteralLoad(dest, rdNo, val);
}

//Reports where a jump coming from another TU should land in this one.
//In this backend that is simply the start of the TU's code: *jumpToP receives startOfCodeTu.
//always returns EmitErrNone
enum EmitStatus jitEmitWhereToJumpFromAnotherTu(uint16_t *startOfCodeTu, uint16_t **jumpToP)
{
	uint16_t *entryPoint = startOfCodeTu;
	
	*jumpToP = entryPoint;
	return EmitErrNone;
}

//Emits a branch (conditional on cc) to the start of code of another TU.
//returns whatever emitLLbranch() returns
enum EmitStatus jitEmitJumpToAnotherTu(struct EmitBuf *dest, const uint16_t* startOfCodeInOtherTu, enum EmitCc cc)
{
	const uintptr_t branchTarget = (uintptr_t)startOfCodeInOtherTu;
	
	return emitLLbranch(dest, branchTarget, cc);
}

enum EmitStatus jitEmitJumpToAbsThumbAddrNotInTu(struct EmitBuf *dest, uintptr_t dstAddr)
{
	return emitHLjump(dest, dstAddr);
}

//Hook for emitting code at the start of every TU. This backend emits nothing:
//both parameters are ignored and EmitErrNone is always returned.
enum EmitStatus jitEmitTuPrologue(struct EmitBuf *dest, uint32_t sourceStartAddr)
{
	//no prologue needed
	(void)dest;
	(void)sourceStartAddr;
	
	return EmitErrNone;
}

//Emits a branch, conditional on cc, to address "to" within the current TU.
//returns EmitErrNone once the branch has been emitted
//NOTE(review): on failure EMIT presumably returns the emitter's error from this function - confirm against the macro
enum EmitStatus jitEmitIntraTuBranch(struct EmitBuf *dest, uintptr_t to, enum EmitCc cc)
{
	EMIT(LLbranch, to, cc);
	
	return EmitErrNone;
}

//Reports how many halfwords must be reserved for a conditional skip-over.
//In this backend the answer does not depend on cc: *nHalfwordsP is always set to 1.
//always returns EmitErrNone
enum EmitStatus jitEmitNumHalfwordsNeededForConditionalSkipover(enum EmitCc cc, uint32_t *nHalfwordsP)
{
	//same size for every condition code
	(void)cc;
	
	nHalfwordsP[0] = 1;
	return EmitErrNone;
}

//Emits an extend operation by passing all operands straight through to the LLextend emitter.
//	rdNo     - destination reg
//	rmNo     - source reg
//	rotateBy - rotation handed to the emitter
//	byte     - presumably selects byte (vs halfword) extension - confirm against emitLLextend
//	unsign   - presumably selects zero (vs sign) extension - confirm against emitLLextend
//returns EmitErrNone once the instruction has been emitted
enum EmitStatus jitEmitExtend(struct EmitBuf *dest, uint32_t rdNo, uint32_t rmNo, uint32_t rotateBy, bool byte, bool unsign)
{
	EMIT(LLextend, rdNo, rmNo, rotateBy, byte, unsign);
	
	return EmitErrNone;
}

enum EmitStatus jitEmitSemihostingCall(struct EmitBuf *dest, uint32_t instrAddr, enum EmitCc cc)
{
	struct EmitBuf space;
	enum EmitStatus now;
	
	if (cc != EmitCcAl)
		EMIT(SaveSpace, &space, 1);
	
	//semihosting swi might need pc, but is unlikely to need r4. we agree (with our kernel) to pass ARM pc as if after SWI (that is addr of instr + 4) in r4
	
	//push {r4}
	EMIT(HLpush, 1 << 4);
	
	//ldr r4, =effective_pc
	now = jitEmitLoadImmToReg(dest, 4, instrAddr + 4, false, false, false);
	if (now != EmitErrNone)
		return now;
	
	//svc #KERNEL_ARM_SEMIHOSTING_SWI
	EMIT(LLsvc, KERNEL_ARM_SEMIHOSTING_SWI);

	//pop {r4}
	EMIT(HLpop, 1 << 4);
	
	if (cc != EmitCcAl)
		EMIT_TO(LLbranch, &space, emitGetPtrToJumpHere(dest), emitCcInvert(cc));
	
	return EmitErrNone;
}

//Hook for initializing backend-specific runtime data. This backend keeps none,
//so both parameters are ignored and nothing is done.
void jitStateBackendInit(struct JitBackendRuntimeData *rtd, bool isUiThreadPreallocatedState)
{
	//nothing to set up
	(void)rtd;
	(void)isUiThreadPreallocatedState;
}
