//not a real include file. include directly into emuJit to provide the proper pattern matcher

#include "emuJitBackend_m0.h"


// Unsigned divide-with-remainder helper, called from translated code in place of the ADS udivmod routine.
// The asm reads its arguments straight from registers: r0 = num, r1 = denom, r2 = divByZeroHandler.
//	denom == 0: control is handed to divByZeroHandler via "bx r2" (lr is left untouched, so the handler
//	            sees the callout's own return address)
//	denom != 0: with HAVE_v8M_BASE the quotient ends up in r0 and the remainder in r1;
//	            otherwise we tail-call __aeabi_uidivmod (presumably the same r0/r1 result convention
//	            as defined by the ARM run-time ABI - confirm)
// The function is naked, so the asm below is the entire body - there is no compiler prologue/epilogue.
static void __attribute__((naked)) jitPrvUdivmodCallout(uint32_t num, uint32_t denom, void (*divByZeroHandler)(void))
{
	#ifdef HAVE_v8M_BASE
	
		asm volatile(
			".syntax unified				\n\t"
			"	cbz  r1, 1f					\n\t"	// denom == 0 -> go to the handler
			"	udiv r2, r0, r1				\n\t"	// r2 = num / denom
			"	muls r1, r2					\n\t"	// r1 = denom * quotient
			"	subs r1, r0, r1				\n\t"	// r1 = num - denom * quotient = remainder
			"	mov  r0, r2					\n\t"	// r0 = quotient
			"	bx   lr						\n\t"
			"1:								\n\t"
			"	bx   r2						\n\t"	// jump to divByZeroHandler
			:
			:
			:"cc"
		);
	
	#else
	
		asm volatile(
			".syntax unified					\n\t"
			"	cmp r1, #0						\n\t"
			"	beq 1f							\n\t"	// denom == 0 -> r2 still holds divByZeroHandler
			"	ldr r2, =__aeabi_uidivmod + 1	\n\t"	// else replace r2 with the divide routine ("+ 1" presumably forces the thumb bit - confirm)
			"1:									\n\t"
			"	bx  r2							\n\t"	// tail-call whichever target r2 holds; it returns directly to our caller
			"	.ltorg							\n\t"	// literal pool for the ldr above
			:
			:
			:"cc"
		);
		
	#endif
}

// Signed divide-with-remainder helper, called from translated code in place of the ADS sdivmod routine.
// The asm reads its arguments straight from registers: r0 = num, r1 = denom, r2 = divByZeroHandler.
//	denom == 0: control is handed to divByZeroHandler via "bx r2" (lr is left untouched)
//	denom != 0: with HAVE_v8M_BASE the quotient ends up in r0 and the remainder in r1;
//	            otherwise we tail-call __aeabi_idivmod (presumably the same r0/r1 result convention
//	            as defined by the ARM run-time ABI - confirm)
// The function is naked, so the asm below is the entire body.
static void __attribute__((naked)) jitPrvSdivmodCallout(int32_t num, int32_t denom, void (*divByZeroHandler)(void))
{
	#ifdef HAVE_v8M_BASE
	
		asm volatile(
			".syntax unified				\n\t"
			"	cbz  r1, 1f					\n\t"	// denom == 0 -> go to the handler
			"	sdiv r2, r0, r1				\n\t"	// r2 = num / denom (signed)
			"	muls r1, r2					\n\t"	// r1 = denom * quotient
			"	subs r1, r0, r1				\n\t"	// r1 = num - denom * quotient = remainder
			"	mov  r0, r2					\n\t"	// r0 = quotient
			"	bx   lr						\n\t"
			"1:								\n\t"
			"	bx   r2						\n\t"	// jump to divByZeroHandler
			:
			:
			:"cc"
		);
	
	#else
	
		asm volatile(
			".syntax unified					\n\t"
			"	cmp r1, #0						\n\t"
			"	beq 1f							\n\t"	// denom == 0 -> r2 still holds divByZeroHandler
			"	ldr r2, =__aeabi_idivmod + 1	\n\t"	// else replace r2 with the divide routine ("+ 1" presumably forces the thumb bit - confirm)
			"1:									\n\t"
			"	bx  r2							\n\t"	// tail-call whichever target r2 holds; it returns directly to our caller
			"	.ltorg							\n\t"	// literal pool for the ldr above
			:
			:
			:"cc"
		);
		
	#endif
}

// Unsigned divide-by-10 helper, called from translated code in place of the ADS udiv10 routine.
// In: r0 = num. Out: r0 = num / 10, r1 = num % 10.
// With HAVE_v8M_BASE the hardware udiv is used; otherwise the quotient is estimated with shifts and adds
// (see the reference C below) and then corrected by one if the resulting remainder is above 9.
// The function is naked, so the asm below is the entire body. r2 and r3 are used as scratch.
static void __attribute__((naked)) jitPrvUdivmod10Callout(uint32_t num)
{
	#ifdef HAVE_v8M_BASE
	
		asm volatile(
			".syntax unified				\n\t"
			"	movs r1, #10				\n\t"
			"	udiv r2, r0, r1				\n\t"	// r2 = num / 10
			"	muls r1, r2					\n\t"	// r1 = 10 * quotient
			"	subs r1, r0, r1				\n\t"	// r1 = num - 10 * quotient = remainder
			"	mov  r0, r2					\n\t"	// r0 = quotient
			"	bx   lr						\n\t"
			:
			:
			:"cc"
		);
	
	#else
	
		/*
			unsigned divu10(unsigned n) {
				unsigned q, r;
				q = (n >> 1) + (n >> 2);
				q = q + (q >> 4);
				q = q + (q >> 8);
				q = q + (q >> 16);
				q = q >> 3;
				r = n - q * 10;
				return q + (r > 9);
			}
		*/
		asm volatile(							// n[r0], q[r1]
			".syntax unified			\n\t"
			"	lsrs  r1, r0, #1		\n\t"	// q = (n >> 1) + (n >> 2)
			"	lsrs  r2, r0, #2		\n\t"
			"	adds  r1, r2			\n\t"
			"	lsrs  r2, r1, #4		\n\t"	// q += q >> 4
			"	adds  r1, r2			\n\t"
			"	lsrs  r2, r1, #8		\n\t"	// q += q >> 8
			"	adds  r1, r2			\n\t"
			"	lsrs  r2, r1, #16		\n\t"	// q += q >> 16
			"	adds  r1, r2			\n\t"
			"	lsrs  r2, r1, #3		\n\t"	// r2 = proposed q
			"	movs  r3, #10			\n\t"
			"	muls  r3, r2			\n\t"	// r3 = proposed q * 10
			"	subs  r1, r0, r3		\n\t"	//proposed remainder
			"	cmp   r1, #9			\n\t"
			"	bhi   1f				\n\t"	// remainder > 9 -> the estimate was one too low
			"	movs  r0, r2			\n\t"	// estimate was exact: return it, remainder already in r1
			"	bx    lr				\n\t"
			"1:							\n\t"
			"	subs  r1, #10			\n\t"	// fix up the remainder ...
			"	adds  r0, r2, #1		\n\t"	// ... and the quotient
			"	bx    lr				\n\t"
			:
			:
			:"cc"
		);
	#endif
}


// Signed divide-by-10 helper, called from translated code in place of the ADS sdiv10 routine.
// In: r0 = num. With HAVE_v8M_BASE: r0 = num / 10, r1 = num % 10 on return.
// Without it we load the constant divisor into r1 and tail-call __aeabi_idivmod
// (presumably the same r0 = quotient / r1 = remainder convention, per the ARM run-time ABI - confirm).
// The function is naked, so the asm below is the entire body.
static void __attribute__((naked)) jitPrvSdivmod10Callout(int32_t num)
{
	#ifdef HAVE_v8M_BASE
	
		asm volatile(
			".syntax unified			\n\t"
			"	movs r1, #10				\n\t"
			"	sdiv r2, r0, r1				\n\t"	// r2 = num / 10 (signed)
			"	muls r1, r2					\n\t"	// r1 = 10 * quotient
			"	subs r1, r0, r1				\n\t"	// r1 = num - 10 * quotient = remainder
			"	movs r0, r2					\n\t"	// r0 = quotient
			"	bx   lr						\n\t"
			:
			:
			:"cc"
		);
	
	#else
	
		asm volatile(
			".syntax unified					\n\t"
			"	movs r1, #10					\n\t"	// divisor
			"	ldr  r2, =__aeabi_idivmod + 1	\n\t"	// "+ 1" presumably forces the thumb bit - confirm
			"	bx   r2							\n\t"	// tail-call; the routine returns directly to our caller
			"	.ltorg							\n\t"	// literal pool for the ldr above
			:
			:
			:"cc"
		);
	#endif
}

// Memory copy helper, called from translated code in place of the ADS memcpy routine that was matched.
// In: r0 = dst, r1 = src, r2 = sz (byte count). Nothing meaningful is returned (r0/r1 are advanced as we go).
// Strategy: 20 bytes per iteration (five registers via ldmia/stmia), then one word per iteration,
// then the 0..3 bytes that are left using a halfword and/or a byte access.
// NOTE(review): the name, the pointer types and the use of ldmia/stmia suggest both pointers must be
// word aligned - confirm against the callers.
// The function is naked, so the asm below is the entire body; r4-r7 are saved/restored by hand.
static void __attribute__((naked)) jitPrvAlignedMemcpyCallout(uint32_t *dst, const uint32_t *src, uint32_t sz)
{
	asm volatile(
		".syntax unified			\n\t"
		"	push  {r4-r7, lr}		\n\t"
		"1:							\n\t"	// 20-bytes-at-a-time loop
		"	subs  r2, #20			\n\t"
		"	bmi   2f				\n\t"	// went negative -> fewer than 20 bytes remain
		"	ldmia r1!, {r3-r7}		\n\t"
		"	stmia r0!, {r3-r7}		\n\t"
		"	b     1b				\n\t"
		"2:							\n\t"
		"	adds  r2, #20			\n\t"	// undo the last subtraction: r2 = bytes remaining
		"3:							\n\t"	// word-at-a-time loop
		"	subs  r2, #4			\n\t"
		"	bmi   4f				\n\t"	// went negative -> fewer than 4 bytes remain
		"	ldmia r1!, {r3}			\n\t"
		"	stmia r0!, {r3}			\n\t"
		"	b     3b				\n\t"
		"4:							\n\t"	// r2 = remaining - 4, so its low two bits equal the remaining byte count (0..3)
		"	lsls  r2, #31			\n\t"	// C = bit 1 (a halfword remains), N = bit 0 (an odd byte remains)
		"	bcc   5f				\n\t"	// 0 or 1 bytes remain
		"	ldrh  r3, [r1]			\n\t"	// 2 or 3 bytes remain: copy a halfword (loads/stores leave the flags alone)
		"	strh  r3, [r0]			\n\t"
		"	bpl   7f				\n\t"	// exactly 2 remained -> done
		"	ldrb  r3, [r1, #2]		\n\t"	// third byte
		"	strb  r3, [r0, #2]		\n\t"
		"7:							\n\t"
		"	pop   {r4-r7, pc}		\n\t"
		"5:							\n\t"
		"	bpl   8f				\n\t"	// nothing remains -> done
		"	ldrb  r3, [r1]			\n\t"	// exactly 1 byte remains
		"	strb  r3, [r0]			\n\t"
		"8:							\n\t"
		"	pop   {r4-r7, pc}		\n\t"
	:
	:
	:"r3","cc"
	);
}

// Tries to recognize one of a fixed set of known multi-instruction ARM sequences at "code" and, on a match,
// emits a hand-made replacement into "dest" rather than having the sequence translated one instruction at a time.
// Patterns are checked in this order: direct linker stub (treated as fatal), OS call thunk, long-jump thunks
// (ARM->ARM, then ARM->thumb), lazy-loaded library call stubs, and the ADS runtime helpers
// udivmod, sdivmod, udiv10, sdiv10 and memcpy.
//	tuP        - in/out: the TU being built. The library-stub path may insert the current TU (re-purposed
//	             for the per-library stub) and replace *tuP with a newly allocated one
//	dest       - emit buffer receiving the generated code (re-initialized if *tuP is replaced)
//	code       - address of the first ARM instruction word to examine; several words past it are read
//	tcFlushedP - optional (checked for NULL); set to true when jitTuAllocate reports a flush
// Returns EmitErrNone if a pattern matched and its replacement was emitted, EmitErrInvalidInput if nothing
// matched, or the error reported by whichever emit helper failed.
// When SUPPORT_ICACHE_FLUSH is defined, (*tuP)->srcLen is set to the number of source words the pattern covers.
static enum EmitStatus jitPatternMatcherMatch(struct TU **tuP, struct EmitBuf *dest, const uint32_t *code, bool *tcFlushedP)
{
	struct JitBackendRuntimeData *rtd = jitPrvGetRuntimeDataPtr();
	struct EmitBuf space;
	enum EmitStatus now;
	int32_t mod;

	//we HAVE to match & fix Linker Stub since it sometimes violates the ABI. We match its callers too, since our stub sets the "thumb supported" bit
	
	//match LinkerStub invocation
	mod = jitPatternMatchLinkerStubInvocation(code);
	if (mod != -1) {	//anything other than -1 is treated as a matched module ID
		
		fatal("SysLinkerStub found at 0x%08x for module ID 0x%08x. we do NOT expect this to ever be directly called! this shouldn't happen\n", code, mod);
		return EmitErrNotEncodeable;
	}
	
	//match OS calls
	// e519c???	LDR R12, [R9, -0x...]
	// e59cf???	LDR PC, [R12, #0x...]
	//the masks additionally require the first offset to fit in 8 bits and both offsets to be multiples of 4
	if ((code[0] & 0xffffff03) == 0xe519c000 && (code[1] & 0xfffff003) == 0xe59cf000) {
		
		EMIT(LLmov, 0, 9, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
		EMIT(LLsubImm, 0, 0, code[0] & 0xff, EmitFlagsDoNotCare, false);
		EMIT(LLloadImm, 0, 0, 0, EmitSzWord, false, EmitAdrModeIndex);
		now = jitPrvEmitLoadImmToLoreg(dest, 2, code[1] & 0xfff, true, true);
		if (now != EmitErrNone)
			return now;
		EMIT(LLloadRegReg, 2, 2, 0, 0, EmitSzWord, false);
		EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
		//	mov r0, r9
		//	subs r0, #imm
		//	ldr r0, [r0]	//we could store r12 here but wont
		//	ldr r2, = imm2
		//	ldr r2, [r2, r0]
		//	interworking jump to r2
		
		#ifdef SUPPORT_ICACHE_FLUSH
			(*tuP)->srcLen = 2;		//2 words
		#endif
		return EmitErrNone;
	}

	//match long jump ARM->ARM thunks in ROM (ARM ARM says that ADD to PC will not switch modes)
	// e59fc###	LDR      R12, [PC, ###]
	// e08ff00c	ADD      PC, PC, R12
	if ((code[0] & 0xfffff003) == 0xe59fc000 && code[1] == 0xe08ff00c) {
		
		if ((uintptr_t)code >= CPU_ROM_BASE && (uintptr_t)code - CPU_ROM_BASE < CPU_ROM_SIZE) {	//in rom (safe to assume literal does not change)
				
			//the LDR sees PC as code + 2 words (hence the literal index), the ADD sees PC as code + 3 words (hence dst)
			uint32_t pcVal = (uintptr_t)(code + 2), imm = code[2 + (code[0] & 0xfff) / 4], dst = imm + (uintptr_t)(code + 3);
			
			now = jitEmitJumpToArm(dest, EmitCcAl, dst, NULL);
			if (now != EmitErrNone)
				return now;
		}
		else {																					//not in rom - must assume literal can change
			
			//literal lives at (code + 2 words) + imm, which we express as (code + 3 words) + (imm - 4)
			uint32_t pcValAtSecondInstr = (uintptr_t)(code + 3);
			int32_t finalAdd = 0, immOfst = (code[0] & 0xfff) - 4;
						
			//a negative offset is folded into the base address instead, and compensated for after the add below
			//(presumably because the immediate-offset load only takes non-negative offsets - confirm)
			if (immOfst < 0 && immOfst > -0x100) {
				
				pcValAtSecondInstr += immOfst;
				finalAdd = -immOfst;
				immOfst = 0;
			}
			
			now = jitPrvEmitLoadImmToLoreg(dest, 0, pcValAtSecondInstr, true, true);	//load pc
			if (now != EmitErrNone)
				return now;
			
			if (immOfst < 0x7c) {	//small offset: load with an immediate offset (0x7c is presumably the encoding limit - confirm)
				
				EMIT(LLloadImm, 2, 0, immOfst, EmitSzWord, false, EmitAdrModeIndex);
			}
			else {					//large offset: materialize it in r2 and use a register-offset load
				
				now = jitPrvEmitLoadImmToLoreg(dest, 2, immOfst, true, true);	//load pc
				if (now != EmitErrNone)
					return now;
				
				EMIT(LLloadRegReg, 2, 2, 0, 0, EmitSzWord, false);
			}
			
			//r2 = literal + pc value (+ whatever we folded into the base above)
			EMIT(LLaddReg, 2, 2, 0, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			
			EMIT(LLaddImm, 2, 2, finalAdd, EmitFlagsDoNotCare, false);
			
			EMIT(LLbl, (uintptr_t)rtd->epilogueNoninterworking);
			
		}
		
		#ifdef SUPPORT_ICACHE_FLUSH
			(*tuP)->srcLen = 2;		//2 words
		#endif
		return EmitErrNone;
	}

	//match long jump ARM->thumb thunks in ROM
	// E59FC###		LDR	R12, [PC, ###]
	// E08CC00F	ADD	R12, R12, PC
	// E12FFF1C	BX	R12
	if ((code[0] & 0xfffff003) == 0xe59fc000 && code[1] == 0xe08cc00f && code[2] == 0xe12fff1c) {
		
		if ((uintptr_t)code >= CPU_ROM_BASE && (uintptr_t)code - CPU_ROM_BASE < CPU_ROM_SIZE) {		//in rom (safe to assume literal does not change)
			
			//same address math as for the ARM->ARM thunk: literal index relative to code + 2 words, sum relative to code + 3 words
			uint32_t pcVal = (uintptr_t)(code + 2), imm = code[2 + (code[0] & 0xfff) / 4], dst = imm + (uintptr_t)(code + 3);
			
			if (dst & 1) {	//jump to thumb
				
				now = jitPrvEmitLoadImmToLoreg(dest, 2, dst, true, true);
				if (now != EmitErrNone)
					return now;
				
				EMIT(LLbl, (uintptr_t)rtd->epilogueThumbOnly);
			}
			else {			//jump to ARM
				
				now = jitEmitJumpToArm(dest, EmitCcAl, dst, NULL);
				if (now != EmitErrNone)
					return now;
			}
		}
		else {																					//not in rom - must assume literal can change
		
			//note: immOfst is unsigned here, so an LDR offset of 0 wraps around to 0xfffffffc and takes the register-offset path below
			uint32_t pcValAtSecondInstr = (uintptr_t)(code + 3), immOfst = (code[0] & 0xfff) - 4;
			
			now = jitPrvEmitLoadImmToLoreg(dest, 0, pcValAtSecondInstr, true, true);	//load pc
			if (now != EmitErrNone)
				return now;
			
			if (immOfst < 0x7c) {
				
				EMIT(LLloadImm, 2, 0, immOfst, EmitSzWord, false, EmitAdrModeIndex);
			}
			else {
				
				now = jitPrvEmitLoadImmToLoreg(dest, 2, immOfst, true, true);	//load pc
				if (now != EmitErrNone)
					return now;
				
				EMIT(LLloadRegReg, 2, 2, 0, 0, EmitSzWord, false);
			}
			
			//r2 = literal + pc value; the target's mode is only known at runtime, hence the interworking epilogue
			EMIT(LLaddReg, 2, 2, 0, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			EMIT(LLbl, (uintptr_t)rtd->epilogueInterworking);
		}
				
		#ifdef SUPPORT_ICACHE_FLUSH
			(*tuP)->srcLen = 3;		//3 words
		#endif
		return EmitErrNone;
		
	}

	//normal callsite looks like this and handles libraries of up to 256 entries
	// LDR    R12, [R9]
	// LDR    R12, [R12, #moduleid * 4]
	// LDR    R12, [R12, #globalsTableOfst]
	// CMP    R12, #0
	// ADDNE  PC, R12, #funcnum * 4
	// STMFD  SP!, {PC}
	// B      per_lib_stub
	
	//ique was seen using this instead for entrypts whose indices were >= 256
    // LDR             R12, [R9]
	// LDR             R12, [R12,#0x1BC]
	// LDR             R12, [R12,#0x28]
	// CMP             R12, #0
	// ADDNE           R12, R12, #0x3C
	// STMEQFD         SP!, {PC}
	// BEQ             per_lib_stub
	// ADD             PC, R12, #0x400
	
	//the first four instructions are common to both forms
	if (code[0] == 0xe599c000 && (code[1] & 0xfffff003) == 0xe59cc000 && (code[2] & 0xfffff003) == 0xe59cc000 && code[3] == 0xe35c0000) {
	
		//linker stub invocation likely - > check further
		int32_t libTrapNo = -1;		//stays -1 unless one of the two tails below matches
		uint32_t codeLen = 0;
		
		//7-instruction form
		if ((code[4] & 0xfffff000) == 0x128cf000 && code[5] == 0xe92d8000 && (code[6] & 0xff000000) == 0xea000000) {
			
			libTrapNo = armShifterImmDecode(code[4] & 0xFFF);
			codeLen = 7 * sizeof(uint32_t);
		}
		//8-instruction form: the function offset is split over two ADDs
		else if ((code[4] & 0xfffff000) == 0x128cc000 && code[5] == 0x092d8000 && (code[6] & 0xff000000) == 0x0a000000 && (code[7] & 0xfffff000) == 0xe28cf000) {
			
			libTrapNo = armShifterImmDecode(code[4] & 0xFFF) + armShifterImmDecode(code[7] & 0xFFF);
			codeLen = 8 * sizeof(uint32_t);
		}
		(void)codeLen;	//codeLen is only read when SUPPORT_ICACHE_FLUSH is defined
		
		if (libTrapNo >= 0) {
		
			uint32_t allegedOwnModuleId = (code[1] & 0xfff) >> 2;
			uint32_t globalsOffset = code[2] & 0xfff;
			const uint32_t *perLibStubAddr, *commonStub;
			
			//in both forms code[6] is the branch to the per-library stub
			perLibStubAddr = (uint32_t*)jitWorkOutArmBranchTarget(&code[6]);
			
			//now verify func offset is divisibile by 4, the target here is valid
			if (!(libTrapNo & 3) && perLibStubAddr[0] == 0xe28fc004 && perLibStubAddr[1] == 0xe92d1000 && (perLibStubAddr[2] & 0xff000000) == 0xea000000) {
			
				int32_t modId;
				
				libTrapNo /= 4;		//byte offset -> function index
				commonStub = (uint32_t*)jitWorkOutArmBranchTarget(&perLibStubAddr[2]);
				modId = jitPatternMatchLinkerStubInvocation(commonStub);
				
				if (modId >= 0 && (uint32_t)modId == allegedOwnModuleId) {	//match!
					
					struct TU* perLibStub = jitTuFindByExactAddr((uintptr_t)perLibStubAddr);
					
					logt("Found LazyLoaded lib call stub for func %u and common stub at 0x%08x\n", libTrapNo, commonStub);
					
					if (!perLibStub) {
						
						uint32_t maxCodeWords;
						bool tcFlushed = false;
						
						logt("PerLibStub not found - creating translation\n");
						
						//we do not need to actually translate - we know what is there, so just generate code...
						//we'll reuse currently open TU since we must close it before we can close another
						perLibStub = *tuP;
						(*tuP)->baseAddr = (uintptr_t)perLibStubAddr;	// reset addr. maxCodeWords is still valid
						#ifdef SUPPORT_ICACHE_FLUSH
							(*tuP)->srcLen = 0x30 / sizeof(uint32_t);	//lib stub is that big always
						#endif
						
						//LDR r12, =perLibStubAddr + 0x0c (where descritor is)
						now = jitEmitLoadImmToReg(dest, 12, (uintptr_t)(perLibStubAddr + 3), false, false, false);
						if (now != EmitErrNone)
							return now;
						
						//push {r12}
						now = jitEmitImmMemStr(dest, EmitCcAl, (uintptr_t)code, 12, EMIT_REG_NO_SP, -4, EmitAdrModeIndexWbak, EmitSzWord);
						if (now != EmitErrNone)
							return now;

						//and now jump to common stub
						now = jitEmitJumpToAbsThumbAddrNotInTu(dest, (uintptr_t)&LinkerStubCallout);
						if (now != EmitErrNone)
							return now;
						
						//spill any literals we might have
						now = jitPrvSpillLiterals(jitGetState(true), dest, false);
						if (now != EmitErrNone)
							return now;
						
						jitTuInsert(*tuP, dest);
						
						//now create a new tu for our actual translation
						*tuP = jitTuAllocate(&maxCodeWords, &tcFlushed);
						if (tcFlushed) {
							//a flush was reported: tell the caller (if it asked) and give up on this attempt
							if (tcFlushedP)
								*tcFlushedP = true;
							return EmitErrNoSpace;
						}
						
						(*tuP)->baseAddr = (uintptr_t)code;
						emitBufferInit(dest, (*tuP)->code, maxCodeWords * sizeof(uint16_t));
						
						now = jitEmitTuPrologue(dest, (*tuP)->baseAddr);
						if (now != EmitErrNone){
							if (now != EmitErrNoSpace)
								logw("prologue failed");
							return now;
						}
					}
					
					logt("now actually translating per-entry stub\n");
					
					//perLibStub is either the TU that was found or the one we just built and inserted above
					now = jitPrvDynamicLibStubM0(dest, allegedOwnModuleId, globalsOffset, libTrapNo, (uint32_t)code, perLibStub->code);
					if (now != EmitErrNone)
						return now;
					
					logt("Translated lib stub for call %u from client %u at 0x%08x -> 0x%08x\n", libTrapNo, allegedOwnModuleId, code, dest->bufStart);
					
					#ifdef SUPPORT_ICACHE_FLUSH
						(*tuP)->srcLen = codeLen / sizeof(uint32_t);
					#endif
					return EmitErrNone;
				}
			}
		}
		//any failed check above falls through to the remaining patterns
	}
	
	//ADS's udivmod
	//the routine is matched as first[] + one conditional branch (top byte 0x2A) whose target may vary + second[]
	{
		static const uint32_t first[] = {0xE3A02000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
		static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE1A00002, 0xE12FFF1E};
		
		if (!memcmp(code, first, sizeof(first)) && (code[sizeof(first) / 4] >> 24) == 0x2A && !memcmp(code + sizeof(first) / 4 + 1, second, sizeof(second))) {
			
			//register maps for jitPrvCalloutM0: -1 terminated lists of register pairs
			//(presumably emulated reg / native reg - confirm against jitPrvCalloutM0). Note r0 and r1 trade places on the way in
			static const int8_t regsInMap[] = {1, 0, 0, 1, 2, 2, -1};
			static const int8_t regsOutMap[] = {0, 0, 1, 1, -1};
			
			logjt("ADS udivmod matched at 0x%08x\n", code);
			//(r1 / r0) -> (r0 = quo, r1 = rem), we load r2 with div-by-zero func ptr
			
			//the target of the variable branch is what gets passed along as the div-by-zero handler
			now = jitEmitLoadImmToReg(dest, 2, jitWorkOutArmBranchTarget(&code[sizeof(first) / 4]), true, true, false);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvCalloutM0(dest, &jitPrvUdivmodCallout, regsInMap, regsOutMap);
			if (now != EmitErrNone)
				return now;
			
			//return
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = (sizeof(first) + sizeof(second) + 4) / sizeof(uint32_t);	//+ 4 bytes for the branch in between
			#endif
			return EmitErrNone;
		}
	}
	
	//ADS's sdivmod
	//matched the same way as udivmod: first[] + one variable conditional branch + second[]
	{
		static const uint32_t first[] = {0xE2102480, 0x42600000, 0xE0323041, 0x22611000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
		static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE0320FC3, 0xE0800FA3, 0x22611000, 0xE12FFF1E};
		
		if (!memcmp(code, first, sizeof(first)) && (code[sizeof(first) / 4] >> 24) == 0x2A && !memcmp(code + sizeof(first) / 4 + 1, second, sizeof(second))) {
			
			static const int8_t regsInMap[] = {1, 0, 0, 1, 2, 2, -1};
			static const int8_t regsOutMap[] = {0, 0, 1, 1, -1};
			
			logjt("ADS sdivmod matched at 0x%08x\n", code);
			//(r1 / r0) -> (r0 = quo, r1 = rem), we load r2 with div-by-zero func ptr
			
			now = jitEmitLoadImmToReg(dest, 2, jitWorkOutArmBranchTarget(&code[sizeof(first) / 4]), true, true, false);
			if (now != EmitErrNone)
				return now;
			
			now = jitPrvCalloutM0(dest, &jitPrvSdivmodCallout, regsInMap, regsOutMap);
			if (now != EmitErrNone)
				return now;
			
			//return
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = (sizeof(first) + sizeof(second) + 4) / sizeof(uint32_t);	//+ 4 bytes for the branch in between
			#endif
			return EmitErrNone;
		}
	}

	//ADS's udiv10
	//no variable parts here, so the whole routine is compared verbatim
	{
		static const uint32_t match[] = {0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			static const int8_t regsInMap[] = {0, 0, -1};
			static const int8_t regsOutMap[] = {0, 0, 1, 1, -1};
			
			logjt("ADS udiv10 matched at 0x%08x\n", code);
			//(r0 / 10) -> (r0 = quo, r1 = rem)
			
			now = jitPrvCalloutM0(dest, &jitPrvUdivmod10Callout, regsInMap, regsOutMap);
			if (now != EmitErrNone)
				return now;
			
			//return
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}

	//ADS's sdiv10
	{
		static const uint32_t match[] = {0xE1B03000, 0x42600000, 0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE1B03003, 0x42600000, 0x42611000, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			static const int8_t regsInMap[] = {0, 0, -1};
			static const int8_t regsOutMap[] = {0, 0, 1, 1, -1};
			
			logjt("ADS sdiv10 matched at 0x%08x\n", code);
			//(r0 / 10) -> (r0 = quo, r1 = rem)
			
			now = jitPrvCalloutM0(dest, &jitPrvSdivmod10Callout, regsInMap, regsOutMap);
			if (now != EmitErrNone)
				return now;
			
			//return
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}

	//ADS's memcpy
	{
		static const uint32_t match[] = {0xE92D4010, 0xE2522020, 0x3A000005, 0x28B15018, 0x28A05018, 0x28B15018, 0x28A05018, 0x22522020, 0x2AFFFFF9, 0xE1B0CE02, 0x28B15018, 0x28A05018, 0x48B10018, 0x48A00018, 0xE8BD4010, 0xE1B0CF02, 0x24913004, 0x24803004, 0x012FFF1E, 0xE1B02F82, 0x44D12001, 0x24D13001, 0x24D1C001, 0x44C02001, 0x24C03001, 0x24C0C001, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			static const int8_t regsInMap[] = {0, 0, 1, 1, 2, 2, -1};
			
			logjt("ADS aligned memcpy matched at 0x%08x\n", code);
			
			//no output map is passed: nothing is copied back from the callout
			now = jitPrvCalloutM0(dest, &jitPrvAlignedMemcpyCallout, regsInMap, NULL);
			if (now != EmitErrNone)
				return now;
			
			//return
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}

	//nothing matched - the caller has to handle this code some other way
	return EmitErrInvalidInput;
}


static enum EmitStatus jitPeepholeCodeReader(struct EmitBuf *dest, uint32_t **codeP, bool *terminateP)
{
	static const char* strCc[] = {"EQ","NE","CS","CC","MI","PL","VS","VC","HI","LS","GE","LT","GT","LE", "AL","NV"};
	uint32_t *code = *codeP, instr, instr2;
	struct EmitBuf space;
	enum EmitStatus now;
	
	//emit {u,s}xt{b,h} for
	//	uxt_: MOV Rx, Ry, LSL #num1   ;  MOV Rx, Rx, LSR #num2		x may be == y
	//	sxt_: MOV Rx, Ry, LSL #num1   ;  MOV Rx, Rx, ASR #num2		x may be == y
	{
		uint8_t rdNo1, rmNo1, cc1, shiftAmt1, shiftType1;
		uint8_t rdNo2, rmNo2, cc2, shiftAmt2, shiftType2;
		
		if (jitParseArmMovToRegFomRegShiftImm(code[0], &rdNo1, &rmNo1, &shiftType1, &shiftAmt1, &cc1) &&
					(rdNo1 != EMIT_REG_NO_PC || cc1 != EmitCcAl) && // make sure next instr is even safe to read
					jitParseArmMovToRegFomRegShiftImm(code[1], &rdNo2, &rmNo2, &shiftType2, &shiftAmt2, &cc2) && cc1 == cc2 &&
					cc1 != EmitCcNv &&  shiftType1 == EmitShiftLsl && (shiftType2 == EmitShiftLsr || shiftType2 == EmitShiftAsr) &&
					shiftAmt2 >= shiftAmt1 && shiftAmt2 < 32 &&  rmNo2 == rdNo1 && rdNo2 == rmNo2 &&
					rmNo1 != EMIT_REG_NO_PC && rmNo1 != EMIT_REG_NO_SP && rdNo1 != EMIT_REG_NO_PC && rdNo1 != EMIT_REG_NO_SP) {
			
			//we have a match - emit a potentially conditional {u/s}bfx
			uint32_t msbDesired = 31 - shiftAmt1;
			uint32_t numBitsDesired = 32 - shiftAmt2;
			uint32_t lsbDesired = msbDesired - numBitsDesired + 1;
			bool isUnsigned = (shiftType2 == EmitShiftLsr);
			
			//if the conditions for a using a {US}XT{HB} are met, use those
			if (lsbDesired == 0 && (numBitsDesired == 8 || numBitsDesired == 16)) {
				
				logt("converting MOV%s R%u, R%u, LSL #%u; MOV%s R%u, R%u, %cSR #%u => %cXT%c R%u, R%u\n",
					(cc1 == EmitCcAl) ? "" : strCc[cc1], rdNo1, rmNo1, shiftAmt1,
					(cc2 == EmitCcAl) ? "" : strCc[cc2], rdNo2, rmNo2, isUnsigned ? 'L' : 'A', shiftAmt2,
					isUnsigned ? 'U' : 'S', (numBitsDesired == 8) ? 'B' : 'H', rdNo2, rmNo1);
				
				//handle conditionality
				if (cc1 != EmitCcAl) {
					
					uint32_t spaceNeeded;
					
					now = jitEmitNumHalfwordsNeededForConditionalSkipover(cc1, &spaceNeeded);
					if (now != EmitErrNone)
						return now;
					
					EMIT(SaveSpace, &space, spaceNeeded);
				}
			
				now = jitEmitExtend(dest, rdNo2, rmNo1, 0, numBitsDesired == 8, isUnsigned);
				if (now != EmitErrNone)
					return now;
				
				//handle conditionality
				if (cc1 != EmitCcAl) {
					
					now = jitEmitIntraTuBranch(&space, emitGetPtrToJumpHere(dest), emitCcInvert(cc1));
					if (now != EmitErrNone)
						return now;
				}
				
				(*codeP) += 2;
				return EmitErrNone;
			}
		}
	}
	

	//match switches of the form: CMP Rx, #yy; ADDLS PC, PC, Rx, LSL #2; B default_label; B label....   (x times)
	if ((((instr = code[0]) & 0xfff0ff00) == 0xe3500000) && (((instr2 = code[1]) & 0xfffffff0) == 0x908ff100)) {
		
		uint32_t i, regNo = (instr >> 16) & 0x0F, ncases = (instr & 0xff) + 1;
		bool valid = true;
		
		if (regNo != EMIT_REG_NO_PC && (instr2 & 0x0f) == regNo && ncases <= MAX_SWITCH_SIZE) {
			
			for (i = 0; i < ncases + 1; i++) {
				
				if ((code[2 + i] >> 24) != 0xea) {
					valid = false;
					break;
				}
			}
			
			if (valid) {		//we have deduced that this switch is worth optimizing
				
				struct SwitchBranchStateM0 sbs;
				
				now = jitPrvSwitchBranchM0(dest, &sbs, regNo, ncases, 3 /* why not? :) */);
				if (now != EmitErrNone)
					return now;
				
				//default jump (here cause it is unlikely)
				now = jitEmitJumpToArm(dest, EmitCcAl, jitWorkOutArmBranchTarget(code + 2), NULL);
				if (now != EmitErrNone)
					return now;
				
				//cases
				
				for (i = 0; i < ncases; i++) {
					
					struct EmitBuf caseSpace;
					
					now = jitPrvSwitchBranchWriteCaseM0(&sbs, i, &caseSpace);
					if (now != EmitErrNone)
						return now;
					
					//if the tc entry exists this will be a short branch, if not it will be a UDF we'll replace later
					now = jitEmitJumpToArm(&caseSpace, EmitCcAl, jitWorkOutArmBranchTarget(code + 3 + i), NULL);
					if (now != EmitErrNone)
						return now;
				}
				
				*terminateP = true;
				(*codeP) += 2 + ncases + 1;
				
				return EmitErrNone;
			}
		}
	}

	//no match?
	return EmitErrInvalidInput;
}

