#include <SystemResources.h>
#include <stdbool.h>
#include <string.h>
#include <string.h>
#include <stdint.h>
#include "halDrawing.h"
#include "halDisplay.h"
#include "boot.h"
#include "printf.h"
#include "heap.h"	//so we can use hal static mem early in the boot to draw boot screen
#include "dal.h"



//XXX: this code WILL not properly handle displays whose bit order for <8bpp has earlier bits in lower bits. It will look like it almost works, but will not actually work!



//fast plz
#pragma GCC push_options
#pragma GCC optimize("Ofast")


#ifdef BUILDING_FOR_BIG_ARM
	#pragma GCC target ("arm")		//faster
#endif



/* - for reference garnet used this
struct PrvConvertInfo {
	uint16_t dstPixelSz;			// 0x00
	int16_t dstX;					// 0x02 - used for dithering
	int16_t dstY;					// 0x04 - used for dithering
	uint8_t rfu1[2];				// 0x06
	const struct PalmClut *dstClut;	// 0x08
	const uint8_t *translate;		// 0x0C - array of uint8, at least as many as 1 << depth
	uint16_t needsBitswap;			// 0x10 - set if pixFmt is 3
	uint16_t srcPixelSize;			// 0x12
	uint32_t scaling;				// 0x14 - rounded and quite inaccurate
	uint8_t rfu3[6];				// 0x18
	int16_t outputCount;			// 0x1E - in pixels
};

struct BlitPatInfo {
	int16_t dstRowBytes;			// 0x00
	int16_t depth;					// 0x02
	int16_t depthShift;				// 0x04 - log_2(depth)
	uint8_t rfu1[2];				// 0x06
	int16_t pixelFormat;			// 0x08
	uint8_t rfu2[2];				// 0x0A
	PrvBlitPatFunc blitF;			// 0x0C
	uint16_t pattern;				// 0x10
	uint16_t patRowBytesAcross;		// 0x12
	uint16_t expPatRowBytesShift;	// 0x14
	uint16_t expPatRowPreShift;		// 0x16
	uint8_t expPat[256];			// 0x18		//must be 4-byte aligned, hence why the struct must be 4-byte aligned
	uint8_t expMask[256];			// 0x118
	uint16_t swap1;					// 0x218
	uint16_t swap16;				// 0x21A
} __attribute__((aligned(4)));

struct PrvBlitInfo {
	
	uint16_t depth;				//	0x00
	uint16_t width;				//	0x02 - seemingly unused
	int16_t pixelsPerByte;		//	0x04
	int16_t srcOfst;			//	0x06 - in bytes
	int16_t dstOfst;			//	0x08 - in bytes
	int16_t initialPixels;		//	0x0A - number of pixels before we can process full bytes (only used for bpp < 8)
	int16_t trailingPixels;		//	0x0C - number of pixels after process all full bytes (only used for bpp < 8)
	uint8_t initialPxsMask;		//	0x0E
	uint8_t finalPxsMask;		//	0x0F
	int16_t initialPixelShift;	//	0x10 - should probably be (8 - initialPixels * bpp)
	int16_t trailingPixelShift;	//	0x12 - should probably be 0
	int16_t numFullPixels;		//	0x14 - for bpp < 8, this is number of FULL bytes affected in this row
	int16_t needsSwap;			//	0x16 - used as boolean to indicate if we need to bitswap data in/out (only for bpp < 8)
	uint16_t transparentColor;	//	0x18 - color in src
	uint16_t backgroundColor;	//	0x1A - eraseColor - the color to erase dst with
	uint8_t rfu[4];				//	0x1C
	const uint16_t* xlate;		//	0x20 - used for XOR mode when dst is 16 bit (translates the 8-bit source color to 16-bit XOR-able value which is stored inverted for some reason)
};

*/



//Three overlapping views of one colour table entry: the structured entry, the
// raw 32-bit word, and an 8-bit index plus the remaining 24 bits as one unit.
// NOTE(review): the views only line up if struct PalmClutEntry is 4 bytes with idx in its first byte - confirm against its definition
union ClutEntryAccess {
	struct PalmClutEntry entry;
	uint32_t val32;
	struct {
		uint32_t idx			: 8;	//assumes compiler allocates bitfields consecutively and in the low to high bit order
		uint32_t rgbComponents	: 24;
	};	
};

//Result of comparing the current colour table against the default one.
// The numeric values (0, 1, 2) follow declaration order.
enum HalDrawingClutState {
	halDrawingClutIsStandard,		//0: standard
	halDrawingClutIsNotYetChecked,	//1: not yet checked
	halDrawingClutIsNonstandard,	//2: nonstandard
};

//Scratch state for the bitmap (de)compressors. It is a union, so only one of
// the three members is live at any time (member names suggest one per
// compression scheme - presumably PackBits, RLE and ScanLine).
union ComprState {
	struct {	//used for compression & decompression
		uint8_t bpp;				// 0x00
	} packBits;
	struct {	//used for decompression only
		uint8_t prevByteCout;		// 0x00
		uint8_t prevByte;			// 0x01
	} rle;
	struct {	//used for compression only
		const uint8_t* prevLine;	// 0x00
		bool havePrevLine;			// 0x04
	} scanLine;
};

//Horizontal scaling applied while converting a row of pixels.
// SD / 1.5D / DD refer to single, one-and-a-half and double density.
enum HalDrawingScalingMode {
	halDrawScaleNone,			// do not scale
	halDrawScaleOneHalf,		// shrink by 50%	(DD img on SD screen)
	halDrawScaleThreeQuarters,	// shrink by 25%	(DD img on 1.5D screen)		//palm os blitter does not support this - this is our addition
	halDrawScaleTwoThirds,		// shrink by 33%	(1.5D img on SD screen)
	halDrawScaleOneAndHalf,		// grow by 50%		(SD img on 1.5D screen)
	halDrawScaleDouble,			// grow by 100%		(SD img on DD screen)
	halDrawScaleTriple,			// grow by 200%		(SD img on 3x screen)
	halDrawScaleQuadruple,		// grow by 300%		(SD img on 4x screen)
};



//forward declarations: the function-pointer types below mention these structs before they are defined
struct BlitPatInfo;
struct PrvConvertInfo;
struct BitonalBlitInfo;
struct PrvBlitInfo;
struct PrvCopyLineInfo;

//worker routine signatures
// PrvBlitPatFunc          - stored in BlitPatInfo.blitF
// PrvBlit8_or_16func      - takes a source row, a destination row and a PrvBlitInfo
// PrvConvertFunc          - signature shared by the PrvConvert* row converters in this file
// PrvBitonalBlitXferFunc  - stored in BitonalBlitInfo.transferFunc
// PrvCopyLineFunc         - takes src, dst, a height and a PrvCopyLineInfo
typedef void (*PrvBlitPatFunc)(const struct BlitPatInfo *info, int32_t patOffsetV, void* dst, int32_t dstOffset, int32_t width, int32_t height);
typedef void (*PrvBlit8_or_16func)(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info);
typedef void (*PrvConvertFunc)(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info);
typedef void (*PrvBitonalBlitXferFunc)(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height);
typedef void (*PrvCopyLineFunc)(const void* src, void* dst, int32_t height, const struct PrvCopyLineInfo* info);


//Parameter block handed to a PrvCopyLineFunc.
// NOTE(review): the per-field notes below are inferred from the field names only - confirm against the copy-line routines
struct PrvCopyLineInfo {
	uint16_t leftMask;				//presumably selects the affected bits at the left edge
	uint16_t rightMask;				//presumably selects the affected bits at the right edge
	int32_t srcWidthBytes;
	int32_t srcOffset;
	int32_t dstOffset;
	uint32_t srcRowBytes;
	uint32_t dstRowBytes;
	const uint8_t* swapTbl;			//presumably one of the swap1/swap2/swap4 tables, or NULL - TODO confirm
	
	int8_t srcToDstBitOfst;			
	uint8_t needExtraReadUpFront	: 1;
	uint8_t needExtraReadAtEnd		: 1;
};

//Describes one glyph's source bits. Packed, and width/hasPad share a single
// 16-bit word, to keep the per-glyph stack cost low.
struct GlyphInfo {		//changed from original format since we are the only users left - this is more compact and saves a lot of stack
	
	const uint8_t *srcP;		//glyph source data
	uint16_t srcRowBytes;		//bytes per row of the source data
	uint16_t fromLeft;
	uint16_t width	: 15;
	uint16_t hasPad	: 1;
} __attribute__((packed));	//we store a lot of these on stack and packing is worth it

//Destination description plus colours for a bitonal blit. It is passed as the
// first argument to the PrvBitonalBlitXferFunc stored in transferFunc.
// The trailing hex numbers are byte offsets of each field.
struct BitonalBlitInfo {
	
	PrvBitonalBlitXferFunc transferFunc;	// 0x00
	uint16_t background;					// 0x04
	uint16_t foreground;					// 0x06
	uint32_t depth;							// 0x08
	uint32_t depthShift;					// 0x0C
	uint8_t *dstP;							// 0x10 - accounts for Y displacement
	uint32_t dstRowBytes;					// 0x14
	uint32_t dstPixelFormat;				// 0x18
};

//State shared by the PrvConvert* row converters (see PrvConvertFunc).
struct PrvConvertInfo {
	uint16_t dstPixelSz;
	int16_t dstX;							//starting column into the dither bias matrix (PrvConvert16To8Dither)
	int16_t dstY;							//selects the dither bias row; PrvConvert16To8Dither increments it on every call
	int16_t outputCount;					//number of pixels the 16-to-16 converters must produce
	const struct PalmClut *dstClut;			//colour table handed to PrvRGB565ToIndices
	const uint8_t *translate;				//lookup applied to 8-bit results; k8UnityMapping means "no translation"
	uint16_t needsBitswap;					//nonzero/zero flag read by the converters
	uint16_t srcPixelSize;					//bits per source pixel (read by the N-to-8 converters)
	enum HalDrawingScalingMode scaleMode;	//horizontal scaling to apply
};

//Pattern-blit state. It is passed as the first argument to the PrvBlitPatFunc
// stored in blitF. Compared with the Garnet reference layout kept in the
// comment near the top of this file, blitF moved to the front and the
// expanded pattern/mask buffers grew from 256 to 512 bytes.
struct BlitPatInfo {
	PrvBlitPatFunc blitF;
	int16_t dstRowBytes;
	int16_t depth;
	int16_t depthShift;				//log_2(depth), per the Garnet reference layout
	int16_t pixelFormat;
	uint16_t pattern;
	uint16_t patRowBytesAcross;
	uint16_t expPatRowBytesShift;
	uint16_t expPatRowPreShift;
	//these both can be 256 as long as we do not need quad-density support
	uint8_t expPat[512];			//must be 4-byte aligned, hence why the struct must be 4-byte aligned
	uint8_t expMask[512];
	uint16_t swap1;
	uint16_t swap16;
} __attribute__((aligned(4)));

//Per-row blit parameters passed to a PrvBlit8_or_16func.
// Field descriptions are carried over from the Garnet reference layout kept
// in the comment near the top of this file.
struct PrvBlitInfo {
	
	uint16_t depth;
	uint16_t width;					//seemingly unused
	int16_t pixelsPerByte;	
	int16_t srcOfst;				//in bytes
	int16_t dstOfst;				//in bytes
	int16_t initialPixels;			//number of pixels before we can process full bytes (only used for bpp < 8)
	int16_t trailingPixels;			//number of pixels after processing all full bytes (only used for bpp < 8)
	uint8_t initialPxsMask;	
	uint8_t finalPxsMask;
	int16_t initialPixelShift;		//should probably be (8 - initialPixels * bpp)
	int16_t trailingPixelShift;		//should probably be 0
	int16_t numFullPixels;			//for bpp < 8, this is number of FULL bytes affected in this row
	int16_t needsSwap;				//boolean: need to bitswap data in/out (only for bpp < 8)
	uint16_t transparentColor;		//color in src
	uint16_t backgroundColor;		//eraseColor - the color to erase dst with
	const uint16_t* xlate;			//used for XOR mode when dst is 16 bit
};




//Ordered-dither bias values, 64 entries in the range 0x00..0x3F.
// PrvConvert16To8Dither treats it as 8 rows of 8: the row is chosen with
// (dstY & 7) * 8 and the column with (x & 7). Note that the source layout
// below shows 16 values per line, i.e. two matrix rows per line.
static const uint8_t kBias16Gray[] = {
		0x00, 0x20, 0x08, 0x28, 0x02, 0x22, 0x0A, 0x2A, 0x30, 0x10, 0x38, 0x18, 0x32, 0x12, 0x3A, 0x1A,
		0x0C, 0x2C, 0x04, 0x24, 0x0E, 0x2E, 0x06, 0x26, 0x3C, 0x1C, 0x34, 0x14, 0x3E, 0x1E, 0x36, 0x16,
		0x03, 0x23, 0x0B, 0x2B, 0x01, 0x21, 0x09, 0x29, 0x33, 0x13, 0x3B, 0x1B, 0x31, 0x11, 0x39, 0x19,
		0x0F, 0x2F, 0x07, 0x27, 0x0D, 0x2D, 0x05, 0x25, 0x3F, 0x1F, 0x37, 0x17, 0x3D, 0x1D, 0x35, 0x15
	};

//Identity translation table: entry i holds i, for every byte value.
// PrvConvert16To8 compares info->translate against this table's address and
// skips its translation pass when they match.
static const uint8_t k8UnityMapping[] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
		0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
		0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
		0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
		0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
		0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
		0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
		0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
		0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
	};

//Lookup from an 8-bit value to a 12-bit value (entries run from 0x0000 to 0x0FFF).
// Every pair of input bits becomes three output bits: the two originals with
// their AND in between (e.g. 0x02 -> 0x0004, 0x03 -> 0x0007, 0x0C -> 0x0038).
// NOTE(review): the name suggests this drives 1.5x scaling of 1-bit-per-pixel data - confirm at the use site
static const uint16_t mOneAndOneHalfScaleNibble1bit[] = {0x0000, 0x0001, 0x0004, 0x0007, 0x0008, 0x0009, 0x000C, 0x000F, 0x0020, 0x0021, 0x0024, 0x0027, 0x0038, 0x0039, 0x003C, 0x003F, 0x0040, 0x0041, 0x0044, 0x0047, 0x0048, 0x0049, 0x004C, 0x004F, 0x0060, 0x0061, 0x0064, 0x0067, 0x0078, 0x0079, 0x007C, 0x007F, 0x0100, 0x0101, 0x0104, 0x0107, 0x0108, 0x0109, 0x010C, 0x010F, 0x0120, 0x0121, 0x0124, 0x0127, 0x0138, 0x0139, 0x013C, 0x013F, 0x01C0, 0x01C1, 0x01C4, 0x01C7, 0x01C8, 0x01C9, 0x01CC, 0x01CF, 0x01E0, 0x01E1, 0x01E4, 0x01E7, 0x01F8, 0x01F9, 0x01FC, 0x01FF, 0x0200, 0x0201, 0x0204, 0x0207, 0x0208, 0x0209, 0x020C, 0x020F, 0x0220, 0x0221, 0x0224, 0x0227, 0x0238, 0x0239, 0x023C, 0x023F, 0x0240, 0x0241, 0x0244, 0x0247, 0x0248, 0x0249, 0x024C, 0x024F, 0x0260, 0x0261, 0x0264, 0x0267, 0x0278, 0x0279, 0x027C, 0x027F, 0x0300, 0x0301, 0x0304, 0x0307, 0x0308, 0x0309, 0x030C, 0x030F, 0x0320, 0x0321, 0x0324, 0x0327, 0x0338, 0x0339, 0x033C, 0x033F, 0x03C0, 0x03C1, 0x03C4, 0x03C7, 0x03C8, 0x03C9, 0x03CC, 0x03CF, 0x03E0, 0x03E1, 0x03E4, 0x03E7, 0x03F8, 0x03F9, 0x03FC, 0x03FF, 0x0800, 0x0801, 0x0804, 0x0807, 0x0808, 0x0809, 0x080C, 0x080F, 0x0820, 0x0821, 0x0824, 0x0827, 0x0838, 0x0839, 0x083C, 0x083F, 0x0840, 0x0841, 0x0844, 0x0847, 0x0848, 0x0849, 0x084C, 0x084F, 0x0860, 0x0861, 0x0864, 0x0867, 0x0878, 0x0879, 0x087C, 0x087F, 0x0900, 0x0901, 0x0904, 0x0907, 0x0908, 0x0909, 0x090C, 0x090F, 0x0920, 0x0921, 0x0924, 0x0927, 0x0938, 0x0939, 0x093C, 0x093F, 0x09C0, 0x09C1, 0x09C4, 0x09C7, 0x09C8, 0x09C9, 0x09CC, 0x09CF, 0x09E0, 0x09E1, 0x09E4, 0x09E7, 0x09F8, 0x09F9, 0x09FC, 0x09FF, 0x0E00, 0x0E01, 0x0E04, 0x0E07, 0x0E08, 0x0E09, 0x0E0C, 0x0E0F, 0x0E20, 0x0E21, 0x0E24, 0x0E27, 0x0E38, 0x0E39, 0x0E3C, 0x0E3F, 0x0E40, 0x0E41, 0x0E44, 0x0E47, 0x0E48, 0x0E49, 0x0E4C, 0x0E4F, 0x0E60, 0x0E61, 0x0E64, 0x0E67, 0x0E78, 0x0E79, 0x0E7C, 0x0E7F, 0x0F00, 0x0F01, 0x0F04, 0x0F07, 0x0F08, 0x0F09, 0x0F0C, 0x0F0F, 0x0F20, 0x0F21, 0x0F24, 0x0F27, 0x0F38, 0x0F39, 0x0F3C, 0x0F3F, 0x0FC0, 0x0FC1, 
0x0FC4, 0x0FC7, 0x0FC8, 0x0FC9, 0x0FCC, 0x0FCF, 0x0FE0, 0x0FE1, 0x0FE4, 0x0FE7, 0x0FF8, 0x0FF9, 0x0FFC, 0x0FFF, };

//Per-byte lookup that exchanges the two 4-bit halves of a byte (0x01 -> 0x10), i.e. reverses pixel order within a byte at 4 bits per pixel.
static const uint8_t swap4[] = {0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0xE0, 0xF0, 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71, 0x81, 0x91, 0xA1, 0xB1, 0xC1, 0xD1, 0xE1, 0xF1, 0x02, 0x12, 0x22, 0x32, 0x42, 0x52, 0x62, 0x72, 0x82, 0x92, 0xA2, 0xB2, 0xC2, 0xD2, 0xE2, 0xF2, 0x03, 0x13, 0x23, 0x33, 0x43, 0x53, 0x63, 0x73, 0x83, 0x93, 0xA3, 0xB3, 0xC3, 0xD3, 0xE3, 0xF3, 0x04, 0x14, 0x24, 0x34, 0x44, 0x54, 0x64, 0x74, 0x84, 0x94, 0xA4, 0xB4, 0xC4, 0xD4, 0xE4, 0xF4, 0x05, 0x15, 0x25, 0x35, 0x45, 0x55, 0x65, 0x75, 0x85, 0x95, 0xA5, 0xB5, 0xC5, 0xD5, 0xE5, 0xF5, 0x06, 0x16, 0x26, 0x36, 0x46, 0x56, 0x66, 0x76, 0x86, 0x96, 0xA6, 0xB6, 0xC6, 0xD6, 0xE6, 0xF6, 0x07, 0x17, 0x27, 0x37, 0x47, 0x57, 0x67, 0x77, 0x87, 0x97, 0xA7, 0xB7, 0xC7, 0xD7, 0xE7, 0xF7, 0x08, 0x18, 0x28, 0x38, 0x48, 0x58, 0x68, 0x78, 0x88, 0x98, 0xA8, 0xB8, 0xC8, 0xD8, 0xE8, 0xF8, 0x09, 0x19, 0x29, 0x39, 0x49, 0x59, 0x69, 0x79, 0x89, 0x99, 0xA9, 0xB9, 0xC9, 0xD9, 0xE9, 0xF9, 0x0A, 0x1A, 0x2A, 0x3A, 0x4A, 0x5A, 0x6A, 0x7A, 0x8A, 0x9A, 0xAA, 0xBA, 0xCA, 0xDA, 0xEA, 0xFA, 0x0B, 0x1B, 0x2B, 0x3B, 0x4B, 0x5B, 0x6B, 0x7B, 0x8B, 0x9B, 0xAB, 0xBB, 0xCB, 0xDB, 0xEB, 0xFB, 0x0C, 0x1C, 0x2C, 0x3C, 0x4C, 0x5C, 0x6C, 0x7C, 0x8C, 0x9C, 0xAC, 0xBC, 0xCC, 0xDC, 0xEC, 0xFC, 0x0D, 0x1D, 0x2D, 0x3D, 0x4D, 0x5D, 0x6D, 0x7D, 0x8D, 0x9D, 0xAD, 0xBD, 0xCD, 0xDD, 0xED, 0xFD, 0x0E, 0x1E, 0x2E, 0x3E, 0x4E, 0x5E, 0x6E, 0x7E, 0x8E, 0x9E, 0xAE, 0xBE, 0xCE, 0xDE, 0xEE, 0xFE, 0x0F, 0x1F, 0x2F, 0x3F, 0x4F, 0x5F, 0x6F, 0x7F, 0x8F, 0x9F, 0xAF, 0xBF, 0xCF, 0xDF, 0xEF, 0xFF, };
//Per-byte lookup that reverses the order of the four 2-bit fields in a byte (0x01 -> 0x40), i.e. reverses pixel order within a byte at 2 bits per pixel.
static const uint8_t swap2[] = {0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, };
//Per-byte lookup that reverses all eight bits of a byte (0x01 -> 0x80), i.e. reverses pixel order within a byte at 1 bit per pixel.
static const uint8_t swap1[] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF, };





//single shared pattern-blit state, placed via the XRAM1 attribute macro
static struct BlitPatInfo XRAM1 mPatBlitInfo;	// this is indeed not thread safe, but PalmOS does the same

static uint32_t mClutEndMismatchMask = 0;			//bit i set if current clut's (i + 0xE6)th entry mismatched default clut's (i + 0xE6)th entry. only valid if the first 0xE6 entries match
													// this is aka "userColorMask"

//starts out as "standard"; see enum HalDrawingClutState for the three states
static enum HalDrawingClutState mClutState = halDrawingClutIsStandard;		// 0 = standard, 1 = not yet checked, 2 = nonstandard





//Convert `count` RGB565 pixels into 8-bit colour table indices.
//  srcP            - source pixels, 16 bits each
//  dstP            - receives one index byte per source pixel
//  count           - number of pixels to convert; zero is a no-op
//  colorTblP       - colour table the indices refer to (passed through to HALDraw_FindIndexes)
//  swapSourceBytes - when false each pixel is byte-swapped before being decoded
// Each channel is widened to 8 bits by placing it in the top bits and filling
// the vacated low bits with the channel's own low bits. The actual lookup is
// done by HALDraw_FindIndexes, which leaves its answer in rgb.idx. Because
// neighbouring pixels often repeat, the last few distinct pixel values and
// their indices are remembered in a small round-robin cache.
static void PrvRGB565ToIndices(const uint16_t *srcP, uint8_t *dstP, uint32_t count, const struct PalmClut *colorTblP, bool swapSourceBytes)
{
	const uint32_t cacheSz = 4;
	
	uint32_t i, resultIdx, rgb565, cacheIdx = 0;
	struct PalmClutEntry rgb;
	bool first = true;
	uint16_t pixel[cacheSz];	//cache
	uint8_t idx[cacheSz];		//cache
	
	if (!count)		//the do/while below tests after decrementing: a zero count would wrap and run ~2^32 times
		return;
	
	do {
		
		rgb565 = *srcP++;
		
		if (!swapSourceBytes)	//do not ask
			rgb565 = __builtin_bswap16(rgb565);
		
		rgb.r = ((rgb565 & 0xF800) >> 8) | ((rgb565 & 0x3800) >> 11);	//some weird form of dithering??
		rgb.g = ((rgb565 & 0x07E0) >> 3) | ((rgb565 & 0x0060) >>  5);	//some weird form of dithering??
		rgb.b = ((rgb565 & 0x001F) << 3) | ((rgb565 & 0x0007)      );	//some weird form of dithering??
		
		if (first) {
			//seed every cache slot with the first pixel so no slot ever holds garbage
			HALDraw_FindIndexes(1, &rgb, colorTblP);
			resultIdx = rgb.idx;
			for (i = 0; i < cacheSz; i++) {
				pixel[i] = rgb565;
				idx[i] = resultIdx;
			}
			first = false;
		}
		else {
			//check cache
			for (i = 0; i < cacheSz; i++) {
				if (pixel[i] == rgb565) {
					resultIdx = idx[i];
					goto result_found;
				}
			}
			
			//miss: look it up and overwrite the next slot in round-robin order
			HALDraw_FindIndexes(1, &rgb, colorTblP);
			resultIdx = rgb.idx;
			pixel[cacheIdx] = rgb565;
			idx[cacheIdx] = resultIdx;
			if (++cacheIdx == cacheSz)
				cacheIdx = 0;
		}
		
result_found:
		*dstP++ = resultIdx;
	} while (--count);
}

//Shrink a row of 16-bit pixels to two thirds of its width: of every three
// source pixels the first two are copied and the third is dropped.
//  srcP        - source row
//  dstP        - destination row
//  numSrcBytes - unused here; the amount of work is set by info->outputCount
//  info        - outputCount gives the number of pixels to write to dstP
static void PrvConvert16To16TwoThirds(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t* src = (const uint16_t*)srcP;
	uint16_t* dst = (uint16_t*)dstP;
	int32_t npix = info->outputCount;
	
	while (npix >= 2) {
		
		*dst++ = *src++;
		*dst++ = *src;
		src += 2;		//step over the second pixel just copied and the third, dropped one
		npix -= 2;		//two output pixels were produced this round
	}
	if (npix)			//odd output count: one more pixel, taken from the start of the next group
		*dst = *src;
}

//Shrink a row of 16-bit pixels to three quarters of its width: of every four
// source pixels the first three are copied and the fourth is dropped.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16ThreeQuarters(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	int32_t left;
	
	//whole groups: three pixels out for four pixels in
	for (left = info->outputCount; left >= 3; left -= 3, in += 4, out += 3) {
		
		out[0] = in[0];
		out[1] = in[1];
		out[2] = in[2];
	}
	
	//leftover output pixels come straight from the start of the next group
	for (; left; left--)
		*out++ = *in++;
}

//Stretch a row of 16-bit pixels to one and a half times its width: of every
// two source pixels the first is written twice and the second once.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16OneAndOneHalf(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	uint32_t px;
	int32_t left;
	
	//whole groups: three pixels out for two pixels in
	for (left = info->outputCount; left >= 3; left -= 3, in += 2, out += 3) {
		
		px = in[0];
		out[0] = px;
		out[1] = px;
		out[2] = in[1];
	}
	
	switch (left) {
		case 0:
			break;
		
		case 2:		//room for the doubled pixel only
			px = in[0];
			out[0] = px;
			out[1] = px;
			break;
		
		default:	//any other leftover gets a single pixel
			out[0] = in[0];
			break;
	}
}

//Shrink a row of 16-bit pixels to half its width by keeping every second
// source pixel, starting with the first.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16Halved(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	int32_t left;
	
	for (left = info->outputCount; left > 0; left--, in += 2)
		*out++ = *in;
}

//Stretch a row of 16-bit pixels to twice its width: every source pixel is
// written twice.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16Doubled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	int32_t left;
	
	//whole pairs
	for (left = info->outputCount; left >= 2; left -= 2, out += 2) {
		uint32_t px = *in++;
		
		out[0] = px;
		out[1] = px;
	}
	
	//odd output count: one last single pixel
	for (; left; left--)
		*out++ = *in++;
}

//Stretch a row of 16-bit pixels to three times its width: every source pixel
// is written three times.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16Tripled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	int32_t left;
	
	//whole triples
	for (left = info->outputCount; left >= 3; left -= 3, out += 3) {
		uint32_t px = *in++;
		
		out[0] = px;
		out[1] = px;
		out[2] = px;
	}
	
	//partial triple: all remaining outputs repeat the same source pixel
	for (; left; left--)
		*out++ = *in;
}

//Stretch a row of 16-bit pixels to four times its width: every source pixel
// is written four times.
// info->outputCount is the number of pixels to write; numSrcBytes is unused.
static void PrvConvert16To16Quadrupled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)dstP;
	int32_t left;
	
	//whole quads
	for (left = info->outputCount; left >= 4; left -= 4, out += 4) {
		uint32_t px = *in++;
		
		out[0] = px;
		out[1] = px;
		out[2] = px;
		out[3] = px;
	}
	
	//partial quad: all remaining outputs repeat the same source pixel
	for (; left; left--)
		*out++ = *in;
}

//Convert one row of RGB565 pixels to 8-bit colour indices, applying the
// horizontal scaling selected by info->scaleMode.
//  srcP        - source row, 16 bits per pixel
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - size of the source row in bytes; a row shorter than one pixel is ignored
//  info        - supplies scaleMode, dstClut and needsBitswap (both forwarded to
//                PrvRGB565ToIndices) and translate (applied to every output byte
//                unless it is k8UnityMapping)
// Handled modes: None, OneHalf, TwoThirds, ThreeQuarters, OneAndHalf, Double.
// Any other mode is declared unreachable.
static void PrvConvert16To8(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	uint32_t v, i, j, nInputPixels, nOutputBytes;
	enum HalDrawingScalingMode scaleMode = info->scaleMode;
	const uint16_t *src = (const uint16_t*)srcP;
	const uint8_t *translate = info->translate;
	bool needBitswap = !!info->needsBitswap;
	uint8_t *dst = (uint8_t*)dstP;
	
	if (numSrcBytes < 2)	//less than one whole pixel: a zero pixel count would underflow the loops below
		return;
	
	nInputPixels = numSrcBytes / 2;
	
	if (scaleMode == halDrawScaleOneHalf) {
		
		//keep every second source pixel
		nOutputBytes = nInputPixels / 2;
		
		for (i = 0; i < nOutputBytes; i++)
			PrvRGB565ToIndices(&src[i * 2], &dst[i], 1, info->dstClut, needBitswap);
	}
	else if (scaleMode == halDrawScaleTwoThirds) {
		
		//keep two source pixels, drop the third
		for (i = 0, j = 0; i + 2 <= nInputPixels; i += 3, j += 2)
			PrvRGB565ToIndices(&src[i + 0], &dst[j + 0], 2, info->dstClut, needBitswap);

		if (i < nInputPixels)	// the only possible case is one input pixel left
			PrvRGB565ToIndices(&src[i + 0], &dst[j++], 1, info->dstClut, needBitswap);
		
		nOutputBytes = j;
	}
	else if (scaleMode == halDrawScaleThreeQuarters) {
		
		//keep three source pixels, drop the fourth
		for (i = 0, j = 0; i + 3 <= nInputPixels; i += 4, j += 3)
			PrvRGB565ToIndices(&src[i + 0], &dst[j + 0], 3, info->dstClut, needBitswap);

		if (i < nInputPixels) {	// one or two input pixels left - both fall in the "kept" part of a group
			
			v = nInputPixels - i;
			PrvRGB565ToIndices(&src[i + 0], &dst[j + 0], v, info->dstClut, needBitswap);
			j += v;
		}
		
		nOutputBytes = j;
	}
	else {
		//first convert into place with no scaling, then scale
		PrvRGB565ToIndices(src, dst, nInputPixels, info->dstClut, needBitswap);
		
		if (scaleMode == halDrawScaleNone) {
			
			nOutputBytes = nInputPixels;	//nothing more to do
		}
		else if (scaleMode == halDrawScaleDouble) {
			
			nOutputBytes = nInputPixels * 2;
			
			//expand back to front so unread indices are never overwritten
			i = nInputPixels;
			while (i) {
				
				i--;
				v = dst[i];
				dst[i * 2 + 0] = v;
				dst[i * 2 + 1] = v;
			}
		}
		else if (scaleMode == halDrawScaleOneAndHalf) {
			
			//expand back to front, reading the indices that were just written to dst
			// (the 16-bit source pixels are not indices and must not be copied here)
			i = nInputPixels - 1;
			j = i + i / 2 + 1;
			nOutputBytes = j + 1;
			
			while (i >= 2) {
				
				v = dst[i--];
				dst[j--] = v;
				dst[j--] = v;
				dst[j--] = dst[i--];
			}
			if (i) {
				v = dst[i--];
				dst[j--] = v;
				dst[j--] = v;
			}
		}
		else {
			
			__builtin_unreachable();
			nOutputBytes = 0;
		}
	}
	
	if (translate != k8UnityMapping) {
		
		for (i = 0; i < nOutputBytes; i++)
			dst[i] = translate[dst[i]];
	}
}

//Convert one row of RGB565 pixels to 8-bit intensity values, applying the
// horizontal scaling selected by info->scaleMode.
//  srcP        - source row, 16 bits per pixel
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - size of the source row in bytes; zero or negative is a no-op
//  info        - supplies scaleMode and needsBitswap (when zero each pixel is
//                byte-swapped before being decoded)
// Shrinking modes drop the source pixel that follows a kept one (OneHalf:
// after every pixel, TwoThirds: after every other pixel, ThreeQuarters: after
// every third pixel). Growing modes repeat the output (Double: every pixel,
// OneAndHalf: every other pixel).
static void PrvConvert16To8Intensity(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	enum HalDrawingScalingMode scaleMode = info->scaleMode;
	uint32_t iter = 0, numSourcePixels = numSrcBytes / 2;
	const uint16_t *src = (const uint16_t*)srcP;
	bool needBitswap = !!info->needsBitswap;
	uint8_t *dst = (uint8_t*)dstP;
	
	if (numSrcBytes <= 0)
		return;
	
	while (numSourcePixels--) {
		
		uint32_t intensity, r, g, b, pix = *src++;
		
		if ((scaleMode == halDrawScaleOneHalf) || ((scaleMode == halDrawScaleTwoThirds) && ((++iter) & 1)) || ((scaleMode == halDrawScaleThreeQuarters) && !((++iter) % 3))) {
			
			//when this was the row's last pixel there is nothing to drop;
			// decrementing anyway would wrap the counter and keep the loop running
			if (numSourcePixels) {
				src++;
				numSourcePixels--;
			}
		}
		
		if (!needBitswap)
			pix = __builtin_bswap16(pix);
		
		r = (pix >> 11) & 0x1F;
		g = (pix >> 5) & 0x3F;
		b = pix & 0x1F;

		//weighted sum of the channels; tops out at 255 for full white
		intensity = g / 2 + b + g * 2 + r * 2;
		intensity += (g >> 3) & 5;
		
		*dst++ = intensity;
		
		if ((scaleMode == halDrawScaleDouble) || ((scaleMode == halDrawScaleOneAndHalf) && ((++iter) & 1)))
			*dst++ = intensity;
	}
}

//Convert one row of RGB565 pixels to 8-bit indices using ordered dithering,
// applying the horizontal scaling selected by info->scaleMode.
//  srcP        - source row, 16 bits per pixel
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - size of the source row in bytes; zero or negative writes nothing
//  info        - supplies scaleMode, needsBitswap, translate (applied to every
//                result), dstX (starting column in the bias matrix) and dstY
//                (bias matrix row). dstY is incremented on every call, even
//                when no pixels are converted, so successive calls walk down
//                the matrix.
// Pixels whose three 5-bit channels are equal go through the 16-entry gray
// part of the web table; everything else is reduced to a 6x6x6 cube position
// (index 16 + 36*r + 6*g + b). Shrinking and growing follow the same rules as
// PrvConvert16To8Intensity; each repeated output pixel gets its own bias value.
static void PrvConvert16To8Dither(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	static const uint8_t web[] = {0xFF, 0xD7, 0xD8, 0xBE, 0xD9, 0xDA, 0xA5, 0xDB, 0xDC, 0x32, 0xDD, 0xDE, 0x19, 0xDF, 0xE0, 0x00, 0xFF, 0xD1, 0xCB, 0x6B, 0x65, 0x5F, 0xD6, 0xD0, 0xCA, 0x6A, 0x64, 0x5E, 0xD5, 0xCF, 0xC9, 0x69, 0x63, 0x5D, 0xD4, 0xCE, 0xC8, 0x68, 0x62, 0x5C, 0xD3, 0xCD, 0xC7, 0x67, 0x61, 0x5B, 0xD2, 0xCC, 0xC6, 0x66, 0x60, 0x5A, 0xC5, 0xBF, 0xB9, 0x59, 0x53, 0x4D, 0xC4, 0xBE, 0xB8, 0x58, 0x52, 0x4C, 0xC3, 0xBD, 0xB7, 0x57, 0x51, 0x4B, 0xC2, 0xBC, 0xB6, 0x56, 0x50, 0x4A, 0xC1, 0xBB, 0xB5, 0x55, 0x4F, 0x49, 0xC0, 0xBA, 0xB4, 0x54, 0x4E, 0x48, 0xB3, 0xAD, 0xA7, 0x47, 0x41, 0x3B, 0xB2, 0xAC, 0xA6, 0x46, 0x40, 0x3A, 0xB1, 0xAB, 0xA5, 0x45, 0x3F, 0x39, 0xB0, 0xAA, 0xA4, 0x44, 0x3E, 0x38, 0xAF, 0xA9, 0xA3, 0x43, 0x3D, 0x37, 0xAE, 0xA8, 0xA2, 0x42, 0x3C, 0x36, 0xA1, 0x9B, 0x95, 0x35, 0x2F, 0x29, 0xA0, 0x9A, 0x94, 0x34, 0x2E, 0x28, 0x9F, 0x99, 0x93, 0x33, 0x2D, 0x27, 0x9E, 0x98, 0x92, 0x32, 0x2C, 0x26, 0x9D, 0x97, 0x91, 0x31, 0x2B, 0x25, 0x9C, 0x96, 0x90, 0x30, 0x2A, 0x24, 0x8F, 0x89, 0x83, 0x23, 0x1D, 0x17, 0x8E, 0x88, 0x82, 0x22, 0x1C, 0x16, 0x8D, 0x87, 0x81, 0x21, 0x1B, 0x15, 0x8C, 0x86, 0x80, 0x20, 0x1A, 0x14, 0x8B, 0x85, 0x7F, 0x1F, 0x19, 0x13, 0x8A, 0x84, 0x7E, 0x1E, 0x18, 0x12, 0x7D, 0x77, 0x71, 0x11, 0x0B, 0x05, 0x7C, 0x76, 0x70, 0x10, 0x0A, 0x04, 0x7B, 0x75, 0x6F, 0x0F, 0x09, 0x03, 0x7A, 0x74, 0x6E, 0x0E, 0x08, 0x02, 0x79, 0x73, 0x6D, 0x0D, 0x07, 0x01, 0x78, 0x72, 0x6C, 0x0C, 0x06, 0x00, }; //kIndexToWebTranslate
	const uint8_t *translate = info->translate, *biasArr = kBias16Gray + ((info->dstY & 7) * 8);
	uint32_t iter = 0, xMatrix = info->dstX, numSourcePixels = numSrcBytes / 2;
	enum HalDrawingScalingMode scaleMode = info->scaleMode;
	const uint16_t *src = (const uint16_t*)srcP;
	bool needBitswap = !!info->needsBitswap;
	uint8_t *dst = (uint8_t*)dstP;
	
	info->dstY++;
	
	if (numSrcBytes <= 0)	//a negative size would otherwise turn into a huge unsigned pixel count
		return;
	
	while (numSourcePixels--) {
		
		uint32_t intensity, r, g, b, pix = *src++;
		
		
		if (!needBitswap)
			pix = __builtin_bswap16(pix);
		
		r = (pix >> 11) & 0x1F;
		g = (pix >> 6) & 0x1F;		//loses one bit of precision!
		b = pix & 0x1F;
		
		if ((scaleMode == halDrawScaleOneHalf) || ((scaleMode == halDrawScaleTwoThirds) && ((++iter) & 1)) || ((scaleMode == halDrawScaleThreeQuarters) && !((++iter) % 3))) {
			
			//when this was the row's last pixel there is nothing to drop;
			// decrementing anyway would wrap the counter and keep the loop running
			if (numSourcePixels) {
				src++;
				numSourcePixels--;
			}
		}
		
		if (r == g && r == b) {		//gray color
			
			//scale the gray level so that, with the bias added, ">> 6" lands on web[0..15]
			intensity = g * 5 + b + r * 2;
			intensity = (intensity << 2) | (intensity >> 3);
			intensity -= intensity >> 4;
			*dst++ = translate[web[(intensity + biasArr[xMatrix++ & 7]) >> 6]];
			
			if ((scaleMode == halDrawScaleDouble) || ((scaleMode == halDrawScaleOneAndHalf) && ((++iter) & 1)))
				*dst++ = translate[web[(intensity + biasArr[xMatrix++ & 7]) >> 6]];
		}
		else {
			
			uint32_t z, bias;
			
			//scale each channel so that, with the bias added, ">> 6" yields a cube coordinate 0..5
			r *= 10;
			g *= 10;
			b *= 10;
			
			bias = biasArr[xMatrix++ & 7];
			z = (r + bias) >> 6;
			z *= 6;
			z += (g + bias) >> 6;
			z *= 6;
			z += (b + bias) >> 6;
			z += 16;		//skip the gray entries at the start of the table
			
			*dst++ = translate[web[z]];
			if ((scaleMode == halDrawScaleDouble) || ((scaleMode == halDrawScaleOneAndHalf) && ((++iter) & 1))) {
				
				bias = biasArr[xMatrix++ & 7];
				z = (r + bias) >> 6;
				z *= 6;
				z += (g + bias) >> 6;
				z *= 6;
				z += (b + bias) >> 6;
				z += 16;
				*dst++ = translate[web[z]];
			}
		}
	}
}

//Expand a row of packed pixels (info->srcPixelSize bits each) to 8 bits per
// pixel through info->translate, writing every pixel twice.
//  srcP        - packed source row
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - number of source bytes to consume
//  info        - supplies srcPixelSize, translate and needsBitswap
static void PrvConvertNto8Doubled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint32_t bpp = info->srcPixelSize, pixMask = (1 << bpp) - 1;
	const uint8_t *lut = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t shift, packed, px;
	
	if (bpp == 8) {
		
		//one pixel per source byte
		for (; numSrcBytes > 0; numSrcBytes--, out += 2) {
			
			px = lut[*in++];
			out[0] = px;
			out[1] = px;
		}
		return;
	}
	
	if (info->needsBitswap) {
		
		//first pixel sits in the low bits: peel pixels off the bottom of the byte
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			
			for (shift = 0; shift < 8; shift += bpp, packed >>= bpp, out += 2) {
				
				px = lut[packed & pixMask];
				out[0] = px;
				out[1] = px;
			}
		}
		return;
	}
	
	//first pixel sits in the high bits: walk the shift down from the top of the byte
	for (; numSrcBytes > 0; numSrcBytes--) {
		
		packed = *in++;
		shift = 8;
		
		do {
			shift -= bpp;
			px = lut[(packed >> shift) & pixMask];
			out[0] = px;
			out[1] = px;
			out += 2;
		} while (shift);
	}
}

//Expand a row of packed pixels (info->srcPixelSize bits each) to 8 bits per
// pixel through info->translate, writing every pixel three times.
//  srcP        - packed source row
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - number of source bytes to consume
//  info        - supplies srcPixelSize, translate and needsBitswap
static void PrvConvertNto8Tripled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint32_t bpp = info->srcPixelSize, pixMask = (1 << bpp) - 1;
	const uint8_t *lut = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t shift, packed, px;
	
	if (bpp == 8) {
		
		//one pixel per source byte
		for (; numSrcBytes > 0; numSrcBytes--, out += 3) {
			
			px = lut[*in++];
			out[0] = px;
			out[1] = px;
			out[2] = px;
		}
		return;
	}
	
	if (info->needsBitswap) {
		
		//first pixel sits in the low bits: peel pixels off the bottom of the byte
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			
			for (shift = 0; shift < 8; shift += bpp, packed >>= bpp, out += 3) {
				
				px = lut[packed & pixMask];
				out[0] = px;
				out[1] = px;
				out[2] = px;
			}
		}
		return;
	}
	
	//first pixel sits in the high bits: walk the shift down from the top of the byte
	for (; numSrcBytes > 0; numSrcBytes--) {
		
		packed = *in++;
		shift = 8;
		
		do {
			shift -= bpp;
			px = lut[(packed >> shift) & pixMask];
			out[0] = px;
			out[1] = px;
			out[2] = px;
			out += 3;
		} while (shift);
	}
}

//Expand a row of packed pixels (info->srcPixelSize bits each) to 8 bits per
// pixel through info->translate, writing every pixel four times.
//  srcP        - packed source row
//  dstP        - destination row, one byte per output pixel
//  numSrcBytes - number of source bytes to consume
//  info        - supplies srcPixelSize, translate and needsBitswap
static void PrvConvertNto8Quadrupled(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint32_t bpp = info->srcPixelSize, pixMask = (1 << bpp) - 1;
	const uint8_t *lut = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t shift, packed, px;
	
	if (bpp == 8) {
		
		//one pixel per source byte
		for (; numSrcBytes > 0; numSrcBytes--, out += 4) {
			
			px = lut[*in++];
			out[0] = px;
			out[1] = px;
			out[2] = px;
			out[3] = px;
		}
		return;
	}
	
	if (info->needsBitswap) {
		
		//first pixel sits in the low bits: peel pixels off the bottom of the byte
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			
			for (shift = 0; shift < 8; shift += bpp, packed >>= bpp, out += 4) {
				
				px = lut[packed & pixMask];
				out[0] = px;
				out[1] = px;
				out[2] = px;
				out[3] = px;
			}
		}
		return;
	}
	
	//first pixel sits in the high bits: walk the shift down from the top of the byte
	for (; numSrcBytes > 0; numSrcBytes--) {
		
		packed = *in++;
		shift = 8;
		
		do {
			shift -= bpp;
			px = lut[(packed >> shift) & pixMask];
			out[0] = px;
			out[1] = px;
			out[2] = px;
			out[3] = px;
			out += 4;
		} while (shift);
	}
}

//Expand a row of packed pixels (info->srcPixelSize bits each) to 8 bits per
// pixel through info->translate while keeping only every second pixel,
// starting with the first.
//  srcP        - packed source row
//  dstP        - destination row, one byte per kept pixel
//  numSrcBytes - number of source bytes to consume
//  info        - supplies srcPixelSize, translate and needsBitswap
static void PrvConvertNto8Halved(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint32_t bpp = info->srcPixelSize, pixMask = (1 << bpp) - 1;
	const uint8_t *lut = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t shift, packed;
	
	if (bpp == 8) {
		
		//one pixel per byte: take a byte, skip a byte, for as long as a whole pair remains
		for (numSrcBytes -= 2; numSrcBytes >= 0; numSrcBytes -= 2, in += 2)
			*out++ = lut[*in];
		return;
	}
	
	if (info->needsBitswap) {
		
		//first pixel sits in the low bits: step two pixels at a time from the bottom
		const uint32_t stride = bpp * 2;
		
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			
			for (shift = 0; shift < 8; shift += stride, packed >>= stride)
				*out++ = lut[packed & pixMask];
		}
		return;
	}
	
	{
		//first pixel sits in the high bits: visit every pixel, keep alternate ones.
		// the keep/drop phase carries over from one source byte to the next
		bool keep = true;
		
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			shift = 8;
			
			do {
				shift -= bpp;
				
				if (keep)
					*out++ = lut[(packed >> shift) & pixMask];
				keep = !keep;
			} while (shift);
		}
	}
}

//Convert packed source pixels to 8bpp while shrinking to 3/4 width:
// out of every four source pixels the first three are mapped through info->translate
// and stored, the fourth is dropped.
//  srcP        - packed source pixels, info->srcPixelSize bits each (1, 2, 4 or 8)
//  dstP        - 8bpp output
//  numSrcBytes - number of source BYTES to consume; values <= 0 produce no output
//  info        - provides srcPixelSize, translate and needsBitswap
// With info->needsBitswap set, pixels are taken from the low bits of each byte first,
// otherwise from the high bits first. Unsupported depths produce no output.
static void PrvConvertNto8ThreeQuarters(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	uint32_t v, srcDepth = info->srcPixelSize;
	const uint8_t *translate = info->translate;
	const uint8_t *src = (const uint8_t*)srcP;
	uint8_t *dst = (uint8_t*)dstP;
	
	if (srcDepth == 8) {
		
		while (numSrcBytes >= 4) {
		
			numSrcBytes -= 4;
			*dst++ = translate[*src++];
			*dst++ = translate[*src++];
			*dst++ = translate[*src++];
			src++;
		}
		//leftover of 2 or 3 source pixels yields two outputs, 1 yields one
		if (numSrcBytes >= 2) {
			*dst++ = translate[*src++];
			*dst++ = translate[*src++];
		}
		else if (numSrcBytes > 0)	//explicit "> 0" so a negative count never writes
			*dst++ = translate[*src++];
	}
	else if (info->needsBitswap) switch (srcDepth) {
		case 1:		//8 pixels per byte -> 6 outputs (pixels 3 and 7 dropped)
			while (numSrcBytes-- > 0) {
				v = *src++;
				*dst++ = translate[(v >> 0) & 1];
				*dst++ = translate[(v >> 1) & 1];
				*dst++ = translate[(v >> 2) & 1];
				*dst++ = translate[(v >> 4) & 1];
				*dst++ = translate[(v >> 5) & 1];
				*dst++ = translate[(v >> 6) & 1];
			}
			break;
		
		case 2:		//4 pixels per byte -> 3 outputs
			while (numSrcBytes-- > 0) {
				v = *src++;
				*dst++ = translate[(v >> 0) & 3];
				*dst++ = translate[(v >> 2) & 3];
				*dst++ = translate[(v >> 4) & 3];
			}
			break;
		
		case 4:		//two bytes hold 4 pixels -> 3 outputs
			while (numSrcBytes >= 2) {
				numSrcBytes -= 2;
				v = *src++;
				*dst++ = translate[v & 15];
				*dst++ = translate[v >> 4];
				*dst++ = translate[*src++ & 15];
			}
			if (numSrcBytes > 0)
				*dst++ = translate[*src++ & 15];
			break;
		
		default:
			break;
	}
	else switch (srcDepth) {
		case 1:		//8 pixels per byte -> 6 outputs (pixels 3 and 7 dropped)
			while (numSrcBytes-- > 0) {
				v = *src++;
				*dst++ = translate[(v >> 7) & 1];
				*dst++ = translate[(v >> 6) & 1];
				*dst++ = translate[(v >> 5) & 1];
				*dst++ = translate[(v >> 3) & 1];
				*dst++ = translate[(v >> 2) & 1];
				*dst++ = translate[(v >> 1) & 1];
			}
			break;
		
		case 2:		//4 pixels per byte -> 3 outputs
			while (numSrcBytes-- > 0) {
				v = *src++;
				*dst++ = translate[(v >> 6) & 3];
				*dst++ = translate[(v >> 4) & 3];
				*dst++ = translate[(v >> 2) & 3];
			}
			break;
		
		case 4:		//two bytes hold 4 pixels -> 3 outputs
			while (numSrcBytes >= 2) {
				numSrcBytes -= 2;
				v = *src++;
				*dst++ = translate[v >> 4];
				*dst++ = translate[v & 15];
				*dst++ = translate[*src++ >> 4];
			}
			if (numSrcBytes > 0)
				*dst++ = translate[*src++ >> 4];
			break;
		
		default:
			break;
	}
}

static void PrvConvertNto8OneAndOneHalf(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t* oneAndOneHalfScaleNibble1bit = mOneAndOneHalfScaleNibble1bit;
	uint32_t j, bits, v, srcDepth = info->srcPixelSize;
	const uint8_t *translate = info->translate;
	const uint8_t *src = (const uint8_t*)srcP;
	uint8_t *dst = (uint8_t*)dstP;
	
	if (srcDepth == 8) {
		
		while (numSrcBytes >= 2) {
		
			numSrcBytes -= 2;
			v = translate[*src++];
			*dst++ = v;
			*dst++ = v;
			*dst++ = translate[*src++];
		}
		if (numSrcBytes > 0) //one source byte left
			*dst++ = translate[*src++];
	}
	else if (info->needsBitswap) switch (srcDepth) {
		
		case 1:
			while (numSrcBytes-- > 0) {
				
				bits = oneAndOneHalfScaleNibble1bit[*src++];
				
				for (j = 0; j < 12; j++, bits >>= 1)
					*dst++ = translate[bits & 1];
			}
			break;
		
		case 2:
			while (numSrcBytes-- > 0) {
				
				bits = *src++;
				v = translate[bits & 3];
				*dst++ = v;
				*dst++ = v;
				bits >>= 2;
				*dst++ = translate[bits & 3];
				bits >>= 2;
				v = translate[bits & 3];
				*dst++ = v;
				*dst++ = v;
				bits >>= 2;
				*dst++ = translate[bits & 3];
			}
			break;
		
		case 4:
			while (numSrcBytes-- > 0) {
				
				bits = *src++;
				v = translate[bits & 15];
				*dst++ = v;
				*dst++ = v;
				bits >>= 4;
				*dst++ = translate[bits & 15];
			}
			break;
	}
	else switch (srcDepth) {
		
		case 1:
			while (numSrcBytes-- > 0) {
				
				bits = oneAndOneHalfScaleNibble1bit[*src++];
				j = 12;
				
				while (j) {
					j--;
					*dst++ = translate[(bits >> j) & 1];
				}
			}
			break;
		
		case 2:
			while (numSrcBytes-- > 0) {
				
				bits = *src++;
				v = translate[(bits >> 6) & 3];
				*dst++ = v;
				*dst++ = v;
				*dst++ = translate[(bits >> 4) & 3];
				v = translate[(bits >> 2) & 3];
				*dst++ = v;
				*dst++ = v;
				*dst++ = translate[bits & 3];
			}
			break;
		
		case 4:
			while (numSrcBytes-- > 0) {
				
				bits = *src++;
				v = translate[(bits >> 4) & 15];
				*dst++ = v;
				*dst++ = v;
				*dst++ = translate[bits & 15];
			}
			break;
	}
}

//Helper for PrvConvertNto8TwoThirds: emit numOut translated pixels from a group of packed pixels.
// Pixels are consumed in the pattern keep, keep, drop, keep, keep, drop, ...
//  bits      - the group, groupBits wide. If lsbFirst, the first pixel is in the lowest bits,
//              else the first pixel is in the highest bits of the group
//  depth     - bits per source pixel
// Returns the advanced destination pointer. Reading past the end of the group yields zero bits.
static inline uint8_t* PrvTwoThirdsEmitGroup(uint8_t *dst, const uint8_t *translate, uint32_t bits, uint32_t groupBits, uint32_t depth, uint32_t numOut, bool lsbFirst)
{
	const uint32_t mask = (1u << depth) - 1, top = groupBits - depth;
	uint32_t k;
	
	for (k = 0; k < numOut; k++) {
		
		//after an odd-numbered output also step over the pixel being dropped
		const uint32_t step = (k & 1) ? 2 * depth : depth;
		
		if (lsbFirst) {
			*dst++ = translate[bits & mask];
			bits >>= step;
		}
		else {
			*dst++ = translate[(bits >> top) & mask];
			bits <<= step;
		}
	}
	
	return dst;
}

//Convert packed source pixels to 8bpp while shrinking to 2/3 width:
// out of every three source pixels the first two are mapped through info->translate
// and stored, the third is dropped.
//  srcP        - packed source pixels, info->srcPixelSize bits each (1, 2, 4 or 8)
//  dstP        - 8bpp output
//  numSrcBytes - number of source BYTES to consume
//  info        - provides srcPixelSize, translate and needsBitswap
// Sub-byte depths are processed in groups of three bytes (a whole number of pixel triplets),
// followed by a two-byte or one-byte tail. Unsupported depths produce no output.
//
// Bytes of a group are always consumed in memory order. When info->needsBitswap is clear the
// first pixel is in the high bits of the FIRST byte, so the group is assembled with src[0]
// most significant (the previous code assembled it with src[0] least significant, which
// emitted the pixels of the last byte first).
static void PrvConvertNto8TwoThirds(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	uint32_t bits, srcDepth = info->srcPixelSize;
	uint32_t outPerThree, outPerTwo, outPerOne;
	const uint8_t *translate = info->translate;
	const uint8_t *src = (const uint8_t*)srcP;
	uint8_t *dst = (uint8_t*)dstP;
	bool lsbFirst;
	
	if (srcDepth == 8) {
		
		while (numSrcBytes >= 2) {	//">= 2" is intended: two leftover bytes need exactly what the loop body does
			
			numSrcBytes -= 3;
			
			*dst++ = translate[*src++];
			*dst++ = translate[*src];
			src += 2;
		}
		if (numSrcBytes == 1)
			*dst++ = translate[*src++];
		return;
	}
	
	//number of outputs produced for a full 3-byte group and for 2-byte and 1-byte tails.
	// These counts are the ones the code has always produced.
	// NOTE(review): for 1bpp the 2-byte tail emits 12 pixels, the last of which samples past the 16 source bits (reads as zero) - confirm callers tolerate this
	switch (srcDepth) {
		case 1:
			outPerThree = 16;
			outPerTwo = 12;
			outPerOne = 6;
			break;
		
		case 2:
			outPerThree = 8;
			outPerTwo = 5;
			outPerOne = 2;
			break;
		
		case 4:
			outPerThree = 4;
			outPerTwo = 2;
			outPerOne = 1;
			break;
		
		default:
			return;
	}
	
	lsbFirst = !!info->needsBitswap;
	
	while (numSrcBytes >= 3) {
		
		numSrcBytes -= 3;
		if (lsbFirst)
			bits = ((uint32_t)src[2] << 16) + ((uint32_t)src[1] << 8) + src[0];
		else
			bits = ((uint32_t)src[0] << 16) + ((uint32_t)src[1] << 8) + src[2];
		src += 3;
		
		dst = PrvTwoThirdsEmitGroup(dst, translate, bits, 24, srcDepth, outPerThree, lsbFirst);
	}
	
	if (numSrcBytes == 2) {
		
		if (lsbFirst)
			bits = ((uint32_t)src[1] << 8) + src[0];
		else
			bits = ((uint32_t)src[0] << 8) + src[1];
		
		dst = PrvTwoThirdsEmitGroup(dst, translate, bits, 16, srcDepth, outPerTwo, lsbFirst);
	}
	else if (numSrcBytes == 1) {
		
		bits = *src;
		
		dst = PrvTwoThirdsEmitGroup(dst, translate, bits, 8, srcDepth, outPerOne, lsbFirst);
	}
}

//Blit a 1bpp source rectangle onto a 16bpp destination, copy mode:
// set source bits (tested high bit first) store info->foreground, clear bits store info->background.
//  info        - provides dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To16Copy(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	const uint32_t srcSkip = srcRowBytes - (width - firstCount + 7) / 8 - 1;
	const uint32_t dstSkip = info->dstRowBytes / sizeof(uint16_t) - width;
	const uint16_t colors[2] = {info->background, info->foreground};	//indexed by the source bit
	const uint8_t *in = ((const uint8_t*)srcP) + (srcOffset / 8);
	uint16_t *out = ((uint16_t*)info->dstP) + dstOffset;
	uint32_t bits, todo, n;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		todo = width;
		
		//leading (possibly partial) source byte
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1)
			*out++ = colors[(bits >> 7) & 1];
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			for (n = 0; n < 8; n++, bits <<= 1)
				*out++ = colors[(bits >> 7) & 1];
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1)
				*out++ = colors[(bits >> 7) & 1];
		}
		
		//advance both pointers to the start of the span on the next row
		in += srcSkip;
		out += dstSkip;
	}
}

//Blit a 1bpp source rectangle onto an 8bpp destination, copy mode:
// set source bits (tested high bit first) store info->foreground, clear bits store info->background.
//  info        - provides dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To8Copy(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	const uint32_t srcSkip = srcRowBytes - (width - firstCount + 7) / 8 - 1;
	const uint32_t dstSkip = info->dstRowBytes / sizeof(uint8_t) - width;
	const uint16_t colors[2] = {info->background, info->foreground};	//indexed by the source bit; truncated to a byte on store
	const uint8_t *in = ((const uint8_t*)srcP) + (srcOffset / 8);
	uint8_t *out = ((uint8_t*)info->dstP) + dstOffset;
	uint32_t bits, todo, n;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		todo = width;
		
		//leading (possibly partial) source byte
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1)
			*out++ = colors[(bits >> 7) & 1];
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			for (n = 0; n < 8; n++, bits <<= 1)
				*out++ = colors[(bits >> 7) & 1];
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1)
				*out++ = colors[(bits >> 7) & 1];
		}
		
		//advance both pointers to the start of the span on the next row
		in += srcSkip;
		out += dstSkip;
	}
}

//Blit a 1bpp source rectangle onto a 4bpp destination, copy mode:
// set source bits (tested high bit first) store the low nibble of info->foreground,
// clear bits store the low nibble of info->background.
// The first pixel of each destination byte is kept in its high nibble.
//  info        - provides dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To4Copy(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const uint8_t bgLo = info->background & 0x0F, fgLo = info->foreground & 0x0F;
	const uint8_t bgHi = bgLo << 4, fgHi = fgLo << 4;
	//complete destination byte for a pair of source bits (index = the two bits, first pixel in bit 1)
	const uint8_t pairLut[] = {(uint8_t)(bgHi + bgLo), (uint8_t)(bgHi + fgLo), (uint8_t)(bgLo + fgHi), (uint8_t)(fgHi + fgLo)};
	//per-nibble tables, first index: 0 = high nibble, 1 = low nibble
	const uint8_t keep[2] = {0x0F, 0xF0};
	const uint8_t ink[2][2] = {{bgHi, fgHi}, {bgLo, fgLo}};
	const uint8_t *in, *rowSrc = ((const uint8_t*)srcP) + (srcOffset / 8);
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	uint8_t *out, *rowDst = ((uint8_t*)info->dstP) + dstOffset / 2;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		uint32_t n, todo = width, bits;
		uint32_t lowNibble = dstOffset & 1;
		
		in = rowSrc;
		out = rowDst;
		
		//leading (possibly partial) source byte, one nibble at a time
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1) {
			
			*out = (*out & keep[lowNibble]) | ink[lowNibble][(bits >> 7) & 1];
			out += lowNibble;
			lowNibble ^= 1;
		}
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			
			if (!lowNibble) {
				
				//destination is byte aligned: four complete bytes
				for (n = 0; n < 4; n++, bits <<= 2)
					*out++ = pairLut[(bits >> 6) & 3];
			}
			else {
				
				//finish the current byte, write three complete bytes, start the next byte
				*out = (*out & 0xF0) | ink[1][(bits >> 7) & 1];
				out++;
				bits <<= 1;
				
				for (n = 0; n < 3; n++, bits <<= 2)
					*out++ = pairLut[(bits >> 6) & 3];
				
				*out = (*out & 0x0F) | ink[0][(bits >> 7) & 1];
				//the next pixel again goes to a low nibble, so lowNibble is unchanged
			}
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1) {
				
				*out = (*out & keep[lowNibble]) | ink[lowNibble][(bits >> 7) & 1];
				out += lowNibble;
				lowNibble ^= 1;
			}
		}
		
		//next row
		rowSrc += srcRowBytes;
		rowDst += info->dstRowBytes;
	}
}

//Blit a 1bpp source rectangle onto a 2bpp destination, copy mode:
// set source bits (tested high bit first) store the low 2 bits of info->foreground,
// clear bits store the low 2 bits of info->background.
// The first pixel of each destination byte is kept in its highest bits.
//  info        - provides dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// Source bytes are addressed per pixel, so only bytes that actually hold a pixel of the
// span are read (no read past the end of the row, and none at all when width <= 0).
static void PrvBlit1To2Copy(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint8_t bg = info->background & 0x03, fg = info->foreground & 0x03;
	int32_t y, x;
	
	for (y = 0; y < height; y++) {	//slow, but this depth is rare
		
		const uint8_t *src = ((const uint8_t*)srcP) + srcRowBytes * y;
		uint8_t *dst = ((uint8_t*)info->dstP) + info->dstRowBytes * y;
		
		for (x = 0; x < width; x++) {
			
			uint32_t srcBit = srcOffset + x, dstPix = dstOffset + x;
			uint32_t isSet = src[srcBit >> 3] & (0x80 >> (srcBit & 7));
			uint32_t shift = (3 - (dstPix & 3)) * 2;	//bit position of this pixel inside its byte
			uint8_t *cell = dst + (dstPix >> 2);
			
			*cell = (*cell &~ (3 << shift)) | ((isSet ? fg : bg) << shift);
		}
	}
}

//Blit a 1bpp source rectangle onto a 1bpp destination, copy mode:
// set source bits (tested high bit first) store the low bit of info->foreground,
// clear bits store the low bit of info->background.
// The first pixel of each destination byte is kept in its highest bit.
//  info        - provides dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// Source bytes are addressed per pixel, so only bytes that actually hold a pixel of the
// span are read (no read past the end of the row, and none at all when width <= 0).
static void PrvBlit1To1Copy(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint8_t bg = info->background & 0x01, fg = info->foreground & 0x01;
	int32_t y, x;
	
	for (y = 0; y < height; y++) {	//slow, but this depth is rare
		
		const uint8_t *src = ((const uint8_t*)srcP) + srcRowBytes * y;
		uint8_t *dst = ((uint8_t*)info->dstP) + info->dstRowBytes * y;
		
		for (x = 0; x < width; x++) {
			
			uint32_t srcBit = srcOffset + x, dstPix = dstOffset + x;
			uint32_t isSet = src[srcBit >> 3] & (0x80 >> (srcBit & 7));
			uint32_t shift = 7 - (dstPix & 7);	//bit position of this pixel inside its byte
			uint8_t *cell = dst + (dstPix >> 3);
			
			*cell = (*cell &~ (1 << shift)) | ((isSet ? fg : bg) << shift);
		}
	}
}

//Blit a 1bpp source rectangle onto a 16bpp destination, overlay mode:
// set source bits (tested high bit first) store info->foreground, clear bits leave the destination untouched.
//  info        - provides dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To16Over(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	const uint32_t srcSkip = srcRowBytes - (width - firstCount + 7) / 8 - 1;
	const uint32_t dstSkip = info->dstRowBytes / sizeof(uint16_t) - width;
	const uint16_t ink = info->foreground;
	const uint8_t *in = ((const uint8_t*)srcP) + (srcOffset / 8);
	uint16_t *out = ((uint16_t*)info->dstP) + dstOffset;
	uint32_t bits, todo, n;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		todo = width;
		
		//leading (possibly partial) source byte
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1, out++) {
			if (bits & 0x80)
				*out = ink;
		}
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			for (n = 0; n < 8; n++, bits <<= 1, out++) {
				if (bits & 0x80)
					*out = ink;
			}
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1, out++) {
				if (bits & 0x80)
					*out = ink;
			}
		}
		
		//advance both pointers to the start of the span on the next row
		in += srcSkip;
		out += dstSkip;
	}
}

//Blit a 1bpp source rectangle onto an 8bpp destination, overlay mode:
// set source bits (tested high bit first) store info->foreground, clear bits leave the destination untouched.
//  info        - provides dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To8Over(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	const uint32_t srcSkip = srcRowBytes - (width - firstCount + 7) / 8 - 1;
	const uint32_t dstSkip = info->dstRowBytes / sizeof(uint8_t) - width;
	const uint16_t ink = info->foreground;	//truncated to a byte on store
	const uint8_t *in = ((const uint8_t*)srcP) + (srcOffset / 8);
	uint8_t *out = ((uint8_t*)info->dstP) + dstOffset;
	uint32_t bits, todo, n;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		todo = width;
		
		//leading (possibly partial) source byte
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1, out++) {
			if (bits & 0x80)
				*out = ink;
		}
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			for (n = 0; n < 8; n++, bits <<= 1, out++) {
				if (bits & 0x80)
					*out = ink;
			}
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1, out++) {
				if (bits & 0x80)
					*out = ink;
			}
		}
		
		//advance both pointers to the start of the span on the next row
		in += srcSkip;
		out += dstSkip;
	}
}

//Blit a 1bpp source rectangle onto a 4bpp destination, overlay mode:
// set source bits (tested high bit first) store the low nibble of info->foreground,
// clear bits leave the destination untouched.
// The first pixel of each destination byte is kept in its high nibble.
//  info        - provides dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels; nothing is drawn if width <= 0 or height <= 0
static void PrvBlit1To4Over(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const uint8_t fgLo = info->foreground & 0x0F, fgHi = fgLo << 4;
	//per-nibble tables, index: 0 = high nibble, 1 = low nibble
	const uint8_t keep[2] = {0x0F, 0xF0};
	const uint8_t ink[2] = {fgHi, fgLo};
	const uint8_t *in, *rowSrc = ((const uint8_t*)srcP) + (srcOffset / 8);
	const int32_t firstShift = srcOffset & 7;
	const int32_t firstCount = (8 - firstShift >= width) ? width : 8 - firstShift;
	uint8_t *out, *rowDst = ((uint8_t*)info->dstP) + dstOffset / 2;
	
	if (width <= 0)
		return;
	
	for (; height > 0; height--) {
		
		uint32_t n, todo = width, bits;
		uint32_t lowNibble = dstOffset & 1;
		
		out = rowDst;
		in = rowSrc;
		
		//leading (possibly partial) source byte
		bits = *in++ << firstShift;
		for (n = firstCount; n != 0; n--, bits <<= 1) {
			
			if (bits & 0x80)
				*out = (*out & keep[lowNibble]) | ink[lowNibble];
			out += lowNibble;	//move on once the low nibble has been handled
			lowNibble ^= 1;
		}
		todo -= firstCount;
		
		//whole source bytes
		for (; todo >= 8; todo -= 8) {
			
			bits = *in++;
			for (n = 0; n < 8; n++, bits <<= 1) {
				
				if (bits & 0x80)
					*out = (*out & keep[lowNibble]) | ink[lowNibble];
				out += lowNibble;
				lowNibble ^= 1;
			}
		}
		
		//trailing partial source byte
		if (todo != 0) {
			
			bits = *in++;
			for (; todo != 0; todo--, bits <<= 1) {
				
				if (bits & 0x80)
					*out = (*out & keep[lowNibble]) | ink[lowNibble];
				out += lowNibble;
				lowNibble ^= 1;
			}
		}
		
		//next row
		rowSrc += srcRowBytes;
		rowDst += info->dstRowBytes;
	}
}

//Blit a 1bpp source rectangle onto a 2bpp destination, overlay mode:
// set source bits (tested high bit first) store the low 2 bits of info->foreground,
// clear bits leave the destination untouched.
// The first pixel of each destination byte is kept in its highest bits.
//  info        - provides dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// Source bytes are addressed per pixel, so only bytes that actually hold a pixel of the
// span are read (no read past the end of the row, and none at all when width <= 0).
static void PrvBlit1To2Over(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint8_t fg = info->foreground & 0x03;
	int32_t y, x;
	
	for (y = 0; y < height; y++) {	//slow, but this depth is rare
		
		const uint8_t *src = ((const uint8_t*)srcP) + srcRowBytes * y;
		uint8_t *dst = ((uint8_t*)info->dstP) + info->dstRowBytes * y;
		
		for (x = 0; x < width; x++) {
			
			uint32_t srcBit = srcOffset + x, dstPix = dstOffset + x;
			
			if (src[srcBit >> 3] & (0x80 >> (srcBit & 7))) {
				
				uint32_t shift = (3 - (dstPix & 3)) * 2;	//bit position of this pixel inside its byte
				uint8_t *cell = dst + (dstPix >> 2);
				
				*cell = (*cell &~ (3 << shift)) | (fg << shift);
			}
		}
	}
}

//Blit a 1bpp source rectangle onto a 1bpp destination, overlay mode:
// set source bits (tested high bit first) store the low bit of info->foreground,
// clear bits leave the destination untouched.
// The first pixel of each destination byte is kept in its highest bit.
//  info        - provides dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// Source bytes are addressed per pixel, so only bytes that actually hold a pixel of the
// span are read (no read past the end of the row, and none at all when width <= 0).
static void PrvBlit1To1Over(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint8_t fg = info->foreground & 0x01;
	int32_t y, x;
	
	for (y = 0; y < height; y++) {	//slow, but this depth is rare
		
		const uint8_t *src = ((const uint8_t*)srcP) + srcRowBytes * y;
		uint8_t *dst = ((uint8_t*)info->dstP) + info->dstRowBytes * y;
		
		for (x = 0; x < width; x++) {
			
			uint32_t srcBit = srcOffset + x, dstPix = dstOffset + x;
			
			if (src[srcBit >> 3] & (0x80 >> (srcBit & 7))) {
				
				uint32_t shift = 7 - (dstPix & 7);	//bit position of this pixel inside its byte
				uint8_t *cell = dst + (dstPix >> 3);
				
				*cell = (*cell &~ (1 << shift)) | (fg << shift);
			}
		}
	}
}

//Blit a 1bpp source rectangle onto a destination of any supported depth, XOR mode:
// each destination pixel is XORed with a value derived from info->foreground (source bit set)
// or info->background (source bit clear). Source bits are tested high bit first.
//  info        - provides depth, depthShift, dstPixelFormat, dstP, dstRowBytes, foreground, background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// For 16bpp the colours are bit-inverted before being XORed in; for 8bpp they are used as is.
// For depths below 8 the colours are masked down to a single pixel and moved to the pixel's
// position inside the destination byte.
// NOTE(review): the sub-byte paths only work if foreground/background carry the pixel value in
// the masked position (e.g. replicated across the byte) - confirm against the code that fills the info struct
static void PrvBlit1ToNXOR(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint32_t depth = info->depth, dstRowBytes = info->dstRowBytes, bg = info->background, fg = info->foreground;
	const uint8_t *src = (const uint8_t*)srcP;
	uint8_t *dst = info->dstP;
	int32_t x;
	
	if (depth == 16) {
		bg = ~bg;
		fg = ~fg;
		
		while (height-- > 0) {
			
			uint16_t *dst16 = (uint16_t*)dst;
			
			dst16 += dstOffset;
			for (x = 0; x < width; x++)
				*dst16++ ^= (src[(x + srcOffset) >> 3] & (0x80 >> ((x + srcOffset) & 7))) ? fg : bg;
			
			src += srcRowBytes;
			dst += dstRowBytes;
		}
	}
	else if (depth == 8) {
		
		while (height-- > 0) {
			
			//one BYTE per pixel (this used to walk the row through a 16-bit pointer,
			// touching two bytes per pixel and advancing twice as far as it should)
			uint8_t *dst8 = dst;
			
			dst8 += dstOffset;
			for (x = 0; x < width; x++)
				*dst8++ ^= (src[(x + srcOffset) >> 3] & (0x80 >> ((x + srcOffset) & 7))) ? fg : bg;
			
			src += srcRowBytes;
			dst += dstRowBytes;
		}
	}
	else if (info->dstPixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		//first pixel of a byte lives in the LOW bits: isolate the low pixel, shift it up into place
		uint32_t depthMultShift = info->depthShift, dstXShift = 3 - depthMultShift, pixelsPerDstByteMinusOne = (8 >> depthMultShift) - 1, pixelMask = 0xFF >> (8 - depth);
		
		bg &= pixelMask;
		fg &= pixelMask;
		
		while (height-- > 0) {
			
			for (x = 0; x < width; x++) {
				
				uint32_t pixel = (src[(x + srcOffset) >> 3] & (0x80 >> ((x + srcOffset) & 7))) ? fg : bg;
				uint32_t position = (x + dstOffset) & pixelsPerDstByteMinusOne;
				
				pixel <<= (position << depthMultShift);
				position = (x + dstOffset) >> dstXShift;
				dst[position] ^= pixel;
			}
			src += srcRowBytes;
			dst += dstRowBytes;
		}
	}
	else {
		
		//first pixel of a byte lives in the HIGH bits: isolate the top pixel, shift it DOWN into place
		// (shifting up, as was done before, pushed every pixel but the first out of the byte)
		uint32_t depthMultShift = info->depthShift, dstXShift = 3 - depthMultShift, pixelsPerDstByteMinusOne = (8 >> depthMultShift) - 1, pixelMask = (0xFF00 >> depth) & 0xFF;
		
		bg &= pixelMask;
		fg &= pixelMask;
		
		while (height-- > 0) {
			
			for (x = 0; x < width; x++) {
				
				uint32_t pixel = (src[(x + srcOffset) >> 3] & (0x80 >> ((x + srcOffset) & 7))) ? fg : bg;
				uint32_t position = (x + dstOffset) & pixelsPerDstByteMinusOne;
				
				pixel >>= (position << depthMultShift);
				position = (x + dstOffset) >> dstXShift;
				dst[position] ^= pixel;
			}
			src += srcRowBytes;
			dst += dstRowBytes;
		}
	}
}

//Blit a 1bpp source rectangle onto a sub-byte-depth destination whose first pixel of each
// byte lives in the LOW bits, overlay mode: set source bits (tested high bit first) take the
// corresponding bits of info->foreground, clear bits leave the destination untouched.
//  info        - provides depth, depthShift, dstP, dstRowBytes and foreground
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// The pixel value is picked out of info->foreground with the position mask, so foreground is
// expected to hold the colour at every pixel position of a byte.
// The draw condition matches the other overlay blitters (paint where the source bit is SET);
// it used to be inverted here.
// NOTE(review): confirm no caller relied on the inverted behaviour
static void PrvBlit1ToNOverLE(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	uint32_t depth = info->depth, dstRowBytes = info->dstRowBytes, depthMultShift = info->depthShift, pixel = info->foreground;
	uint32_t dstXShift = 3 - depthMultShift, pixelsPerDstByteMinusOne = (8 >> depthMultShift) - 1, pixelMask = 0xFF >> (8 - depth);
	const uint8_t *src = (const uint8_t*)srcP;
	uint8_t *dst = info->dstP;
	int32_t x;
	
	while (height-- > 0) {
		
		for (x = 0; x < width; x++) {
			
			uint32_t srcByte = src[(x + srcOffset) >> 3];
			uint32_t srcMask = 0x80 >> ((x + srcOffset) & 7);
			
			if (srcByte & srcMask) {
			
				uint32_t position = (x + dstOffset) & pixelsPerDstByteMinusOne;
				uint32_t mask = pixelMask << (position << depthMultShift);	//bits of this pixel inside its byte
				
				position = (x + dstOffset) >> dstXShift;	//byte index inside the row
				dst[position] = (dst[position] &~ mask) | (pixel & mask);
			}
		}
		src += srcRowBytes;
		dst += dstRowBytes;
	}
}

//Blit a 1bpp source rectangle onto a sub-byte-depth destination whose first pixel of each
// byte lives in the LOW bits, copy mode: set source bits (tested high bit first) take the
// corresponding bits of info->foreground, clear bits those of info->background.
//  info        - provides depth, depthShift, dstP, dstRowBytes, foreground and background
//  srcP        - first source row
//  srcRowBytes - bytes between source rows
//  srcOffset   - x position of the first source pixel, in bits
//  dstOffset   - x position of the first destination pixel, in pixels
//  width/height- rectangle size in pixels
// The pixel value is picked out of the colour with the position mask, so both colours are
// expected to hold their value at every pixel position of a byte.
static void PrvBlit1ToNCopyLE(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t depth = info->depth, rowStride = info->dstRowBytes;
	const uint32_t offColor = info->background, onColor = info->foreground;
	const uint32_t log2Depth = info->depthShift;
	const uint32_t byteIdxShift = 3 - log2Depth;			//pixel index -> byte index
	const uint32_t slotMask = (8 >> log2Depth) - 1;			//pixel index -> slot inside the byte
	const uint32_t onePixel = 0xFF >> (8 - depth);			//mask of the pixel in slot 0
	const uint8_t *srcRow = (const uint8_t*)srcP;
	uint8_t *dstRow = info->dstP;
	int32_t col;
	
	for (; height > 0; height--, srcRow += srcRowBytes, dstRow += rowStride) {
		
		for (col = 0; col < width; col++) {
			
			const uint32_t srcBit = col + srcOffset, dstPix = col + dstOffset;
			const uint32_t ink = (srcRow[srcBit >> 3] & (0x80 >> (srcBit & 7))) ? onColor : offColor;
			const uint32_t slotBits = onePixel << ((dstPix & slotMask) << log2Depth);
			uint8_t *cell = dstRow + (dstPix >> byteIdxShift);
			
			*cell = (*cell &~ slotBits) | (ink & slotBits);
		}
	}
}

//Pick the byte lookup table (swap1, swap2 or swap4) matching a pixel depth of 1, 2 or 4.
// Any other depth is declared unreachable to the compiler, so callers must never pass one.
static const uint8_t* SwapForDepth(uint32_t depth)
{
	if (depth == 1)
		return swap1;
	
	if (depth == 2)
		return swap2;
	
	if (depth == 4)
		return swap4;
	
	__builtin_unreachable();
	return NULL;
}

//Run both bytes of a 16-bit value through the swap table for the given depth
// (see SwapForDepth) and return them recombined in their original byte positions.
// Not called with depth >= 8.
static uint32_t SwapPixelEndian(uint32_t pix, uint32_t depth)
{
	const uint8_t *table = SwapForDepth(depth);
	const uint32_t upper = table[pix >> 8];
	const uint32_t lower = table[pix & 0xFF];
	
	return (upper << 8) + lower;
}

//Convert 8bpp pixels to 8bpp by mapping every byte through info->translate.
// When the table is k8UnityMapping the data is simply copied with memcpy.
//  srcP        - source pixels, one byte each
//  dstP        - output pixels, one byte each
//  numSrcBytes - number of pixels to convert
//  info        - provides translate
static void PrvConvert8To8(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint8_t *xlat = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	
	if (xlat == k8UnityMapping) {
		memcpy(out, in, numSrcBytes);
		return;
	}
	
	//eight pixels per iteration
	for (; numSrcBytes >= 8; numSrcBytes -= 8, in += 8, out += 8) {
		out[0] = xlat[in[0]];
		out[1] = xlat[in[1]];
		out[2] = xlat[in[2]];
		out[3] = xlat[in[3]];
		out[4] = xlat[in[4]];
		out[5] = xlat[in[5]];
		out[6] = xlat[in[6]];
		out[7] = xlat[in[7]];
	}
	
	//leftovers
	for (; numSrcBytes > 0; numSrcBytes--)
		*out++ = xlat[*in++];
}

//Convert 4bpp pixels to 8bpp: each source byte yields two output bytes, mapped through info->translate.
// With info->needsBitswap set the low nibble is the first pixel, otherwise the high nibble is.
//  srcP        - packed source pixels
//  dstP        - output pixels, one byte each
//  numSrcBytes - number of source bytes to convert
//  info        - provides translate and needsBitswap
static void PrvConvert4To8(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t lowFirst = info->needsBitswap;
	const uint8_t *xlat = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	//bit positions of the first and second pixel inside a source byte
	const uint32_t firstShift = lowFirst ? 0 : 4;
	const uint32_t secondShift = lowFirst ? 4 : 0;
	
	for (; numSrcBytes > 0; numSrcBytes--) {
		
		const uint32_t packed = *in++;
		
		*out++ = xlat[(packed >> firstShift) & 15];
		*out++ = xlat[(packed >> secondShift) & 15];
	}
}

//Convert 2bpp pixels to 8bpp: each source byte yields four output bytes, mapped through info->translate.
// Pixels are taken from the high bits first; with info->needsBitswap set the source byte is
// first passed through the swap2 table.
//  srcP        - packed source pixels
//  dstP        - output pixels, one byte each
//  numSrcBytes - number of source bytes to convert
//  info        - provides translate and needsBitswap
static void PrvConvert2To8(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint16_t viaSwapTable = info->needsBitswap;
	const uint8_t *xlat = info->translate;
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t packed, shift;
	
	for (; numSrcBytes > 0; numSrcBytes--) {
		
		packed = *in++;
		if (viaSwapTable)
			packed = swap2[packed];
		
		//shifts 6, 4, 2, 0
		for (shift = 8; shift != 0; ) {
			shift -= 2;
			*out++ = xlat[(packed >> shift) & 3];
		}
	}
}

//Convert 1bpp pixels to 8bpp: each source byte yields eight output bytes, written as two 32-bit words.
// Clear bits become info->translate[0], set bits become info->translate[1].
// With info->needsBitswap set the source byte is first passed through the swap1 table.
//  srcP        - packed source pixels
//  dstP        - output pixels, one byte each; written through a uint32_t pointer
//  numSrcBytes - number of source bytes to convert
//  info        - provides translate and needsBitswap
static void PrvConvert1To8(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	//expands a nibble into a word in which every bit is replicated 8 times; the nibble's
	// highest bit ends up in the lowest byte of the word (e.g. 0b0001 -> 0xFF000000)
	static const uint32_t expand[] = {
		0x00000000,0xFF000000,0x00FF0000,0xFFFF0000,
		0x0000FF00,0xFF00FF00,0x00FFFF00,0xFFFFFF00,
		0x000000FF,0xFF0000FF,0x00FF00FF,0xFFFF00FF,
		0x0000FFFF,0xFF00FFFF,0x00FFFFFF,0xFFFFFFFF,
	};
	const uint32_t offColor = info->translate[0], onColor = info->translate[1];
	const uint8_t *in = (const uint8_t*)srcP;
	const bool viaSwapTable = !!info->needsBitswap;
	uint32_t *out = (uint32_t*)dstP;
	uint32_t packed;
	
	//common case: plain black & white, the expanded mask already is the pixel data
	if (offColor == 0 && onColor == 0xff) {
		
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			if (viaSwapTable)
				packed = swap1[packed];
			
			out[0] = expand[packed >> 4];
			out[1] = expand[packed & 15];
			out += 2;
		}
		return;
	}
	
	//general case: pick between the two colours, byte by byte, using the expanded mask
	{
		const uint32_t offWord = offColor * 0x01010101;	//colour replicated into all four bytes
		const uint32_t onWord = onColor * 0x01010101;
		const uint32_t diff = offWord ^ onWord;
		
		for (; numSrcBytes > 0; numSrcBytes--) {
			
			packed = *in++;
			if (viaSwapTable)
				packed = swap1[packed];
			
			//offWord where the mask is 0, onWord where it is 1
			out[0] = offWord ^ (diff & expand[packed >> 4]);
			out[1] = offWord ^ (diff & expand[packed & 15]);
			out += 2;
		}
	}
}

//Copy one row of byte-per-pixel source data into a destination of info->depth bits per pixel,
// replacing the destination pixels.
//  srcP - source row; info->srcOfst is added to reach the first pixel
//  dstP - destination row; info->dstOfst is added to reach the first byte
//  info - span description, as filled in by PrvBlit8ToNInit
// 16bpp: every source byte is mapped to a 16-bit colour through info->xlate.
// 8bpp:  the bytes are copied with memcpy.
// <8bpp: source bytes are packed, first pixel in the highest bits, in three steps: a partial
//        leading byte (info->initialPixels), whole bytes (info->numFullPixels of them) and a
//        partial trailing byte (info->trailingPixels). Source values are assumed to already fit
//        in `depth` bits. If info->needsSwap is set, bytes are run through the table from
//        SwapForDepth on the way in and out.
static void PrvBlit8ToNCopy(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)
{
	int32_t i, j, num = info->numFullPixels;
	uint32_t depth = info->depth;
	uint32_t c;

	srcP += info->srcOfst;
	dstP += info->dstOfst;
	if (depth == 16) {
		const uint16_t* xlate16 = info->xlate;
		uint16_t *dst16 = (uint16_t*)dstP;
		
		for (i = 0; i < num; i++) {
			
			c = *srcP++;
			*dst16++ = xlate16[c];
		}
	}
	else if (depth == 8) {
		
		memcpy(dstP, srcP, num);
	}
	else {
		const uint8_t *lookup = info->needsSwap ? SwapForDepth(depth) : k8UnityMapping;
		uint32_t val;
		
		//prologue: partial first byte
		if (info->initialPixels) {
			
			//bits occupied by the pixels being written. This mask must survive the packing loop
			// below - it used to be zeroed there, so old destination bits were never cleared and
			// the new pixels got ORed on top of them
			uint32_t mask = (1 << (info->initialPixels * depth)) - 1;
			
			for (val = 0, i = 0; i < info->initialPixels; i++) {
				val <<= depth;
				val += *srcP++;
			}
			mask <<= info->initialPixelShift;
			val <<= info->initialPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ mask) | val];
			dstP++;
		}
		//body - this is a hot path and thus gets lots of love
		switch (depth) {
			case 1:
				for (j = 0; j < num; j++) {
					
					val = 0;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					val = (val << 1) + *srcP++;
					*dstP++ = lookup[val];
				}
				break;
			case 2:
				for (j = 0; j < num; j++) {
					
					val = 0;
					val = (val << 2) + *srcP++;
					val = (val << 2) + *srcP++;
					val = (val << 2) + *srcP++;
					val = (val << 2) + *srcP++;
					*dstP++ = lookup[val];
				}
				break;
			case 4:
				for (j = 0; j < num; j++) {
					
					val = 0;
					val = (val << 4) + *srcP++;
					val = (val << 4) + *srcP++;
					*dstP++ = lookup[val];
				}
				break;
		}
		//epilogue: partial last byte; info->finalPxsMask selects the destination bits to keep
		if (info->trailingPixels) {
			
			for (val = 0, i = 0; i < info->trailingPixels; i++) {
				val <<= depth;
				val += *srcP++;
			}
			val <<= info->trailingPixelShift;
			*dstP = lookup[(lookup[*dstP] & info->finalPxsMask) | val];
		}
	}
}

//Prepares *info for blitting `width` 8bpp source pixels into a destination row of `depth` bpp.
//srcOffset is stored unchanged; dstOffset is a pixel position that gets converted to a destination byte offset.
//For depths 1, 2 and 4 the run is split into three parts: leading pixels that complete a partially covered first
//byte, whole destination bytes (their count goes into numFullPixels) and trailing pixels that begin a last byte.
//For depths 8 and 16 numFullPixels is simply the pixel count. Any other depth only gets the common fields set.
static void PrvBlit8ToNInit(struct PrvBlitInfo *info, uint32_t depth, uint32_t pixelFormat, uint32_t srcOffset, uint32_t dstOffset, uint32_t width)
{
	uint32_t ppbLog2, pxPerByte, pxIntoByte;
	
	//fields that do not depend on the depth
	info->depth = depth;
	info->width = width;
	info->srcOfst = srcOffset;
	info->initialPixelShift = 0;
	info->xlate = NULL;
	info->needsSwap = (pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE);
	
	if (depth == 16) {
		
		info->pixelsPerByte = 0;
		info->dstOfst = dstOffset * 2;		//two bytes per destination pixel
		info->initialPixels = 0;
		info->numFullPixels = width;
		info->trailingPixels = 0;
		return;
	}
	
	if (depth == 8) {
		
		info->pixelsPerByte = 1;
		info->dstOfst = dstOffset;
		info->initialPixels = 0;
		info->numFullPixels = width;
		info->trailingPixels = 0;
		return;
	}
	
	//log2 of the number of pixels packed into one destination byte
	if (depth == 1)
		ppbLog2 = 3;
	else if (depth == 2)
		ppbLog2 = 2;
	else if (depth == 4)
		ppbLog2 = 1;
	else
		return;
	
	pxPerByte = 1U << ppbLog2;
	info->pixelsPerByte = pxPerByte;
	info->dstOfst = dstOffset >> ppbLog2;
	
	pxIntoByte = dstOffset & (pxPerByte - 1);
	if (pxIntoByte == 0)
		info->initialPixels = 0;
	else {
		uint32_t leadPx = pxPerByte - pxIntoByte;	//pixels left until the next byte boundary
		
		info->initialPixels = leadPx;
		info->initialPxsMask = 0xFF << (leadPx * depth);
		
		if (width < leadPx) {
			
			//the whole run ends inside the first byte: remember how far the pixels sit from the byte's low end
			uint32_t unusedBits = (leadPx - width) * depth;
			
			info->initialPixelShift = unusedBits;
			info->initialPxsMask |=~ (0xFF << unusedBits);
			leadPx = width;
			info->initialPixels = leadPx;
		}
		info->width -= leadPx;
	}
	
	//what is left in info->width now starts on a byte boundary
	info->numFullPixels = info->width >> ppbLog2;
	info->trailingPixels = info->width & (pxPerByte - 1);
	
	if (info->trailingPixels) {
		info->finalPxsMask = 0xFF >> (info->trailingPixels * depth);
		info->trailingPixelShift = (pxPerByte - info->trailingPixels) * depth;
	}
}

//Exchanges two pixel values (info->swap1 and info->swap16) inside a rectangle of the destination; every other pixel value is left as it was.
//	info       - supplies depth, depthShift, pixelFormat, dstRowBytes and the two values to exchange
//	patOffsetV - not used by this routine
//	dstParam   - start of the first destination row
//	dstOffset  - horizontal start position, width - horizontal extent, height - number of rows
//A depth of 1 matches none of the branches below, so nothing is done for it.
static void PrvBlitPatSwap(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)	//swap 2 colors with each other, leave others untouched. inapplicable to 1bpp
{
	uint32_t numSrcBytes, swap1 = info->swap1, swap16 = info->swap16, depth = info->depth;
	int32_t j, dstRowBytes = info->dstRowBytes;
	uint8_t *dst = (uint8_t*)dstParam;
	uint8_t *ptr;
	uint32_t i;
	
	//scale up by depthShift ...
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	//... and then halve once per power of two in depth; when depth == 1 << depthShift this undoes the scaling above
	for (i = depth; i > 1; i >>= 1) {
		dstOffset >>= 1;
		width >>= 1;
	}
	
	if (depth == 2 || depth == 4) {
		
		//sub-byte depths: expand each row to one byte per pixel (translating as we go), then pack it back over itself
		struct PrvBlitInfo bltInfo;
		struct PrvConvertInfo convertInfo;
		PrvConvertFunc cvtFunc = NULL;
		uint8_t translate[16];
		uint8_t local[32];
		
		cvtFunc = (depth == 2) ? PrvConvert2To8 : PrvConvert4To8;
		ptr = width > 16 ? (uint8_t*)kheapAlloc(width + 16) : local;	//small opt
		if (!ptr)
			return;	//oopsie
		//number of destination bytes touched by [dstOffset, dstOffset + width)
		numSrcBytes = (((dstOffset + width) * depth + 7) >> 3) - ((dstOffset * depth) >> 3);
		
		//the expanded row starts at the first touched byte, so source and destination share the same sub-byte pixel offset
		PrvBlit8ToNInit(&bltInfo, depth, info->pixelFormat, dstOffset & ((depth == 2) ? 3 : 1), dstOffset & ((depth == 2) ? 3 : 1), width);
		
		//init xlate table
		for (i = 0; i < (1U << depth); i++) {
			
			if (i == swap16)
				translate[i] = swap1;
			else if (i == swap1)
				translate[i] = swap16;
			else
				translate[i] = i;
		}
		
		dst += (dstOffset * depth) >> 3;
		//NOTE(review): only these two fields of convertInfo are set here - confirm the converters read nothing else
		convertInfo.translate = translate;
		convertInfo.needsBitswap = info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE;
		
		while (height-- > 0) {
			
			cvtFunc(dst, ptr, numSrcBytes, &convertInfo);
			PrvBlit8ToNCopy(ptr, dst, &bltInfo);
			
			dst += dstRowBytes;
		}
		
		if (ptr != local)
			kheapFree(ptr);
	}
	else if (depth == 8) {
		
		//one byte per pixel: swap in place
		uint32_t dstRowAdd = dstRowBytes - width;
		
		dst += dstOffset;
		
		while (height-- > 0) {
		
			for (j = 0; j < width; j++) {
				uint32_t px = *dst;
				if (px == swap16)
					*dst = swap1;
				else if (px == swap1)
					*dst = swap16;
				dst++;
			}
			dst += dstRowAdd;
		}
	}
	else if (depth == 16){
		
		//two bytes per pixel: swap in place, one 16-bit word at a time
		dst += dstOffset << 1;
		
		while (height-- > 0) {
			
			uint16_t* dst16 = (uint16_t*)dst;
			
			for (j = 0; j < width; j++) {
				uint32_t px = *dst16;
				if (px == swap16)
					*dst16 = swap1;
				else if (px == swap1)
					*dst16 = swap16;
				dst16++;
			}
			
			dst += dstRowBytes;
		}
	}
}

//Pattern fill through a mask: inside the rectangle every destination word becomes (dst AND expMask) OR expPat.
//The destination is processed as 16-bit words: an optional partial first word, a run of whole words and an
//optional partial last word. The edge masks limit the change to the bits that belong to the rectangle.
//dstOffset and width are scaled by info->depthShift before use; patOffsetV selects the pattern row (it
//advances by one per destination row and wraps through 8 rows after being shifted by expPatRowPreShift).
static void PrvBlitPatOverlay(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t rowBytesShift = info->expPatRowBytesShift, rowPreShift = info->expPatRowPreShift;
	const uint32_t patWords = info->patRowBytesAcross >> 1;
	const int32_t dstRowBytes = info->dstRowBytes;
	const uint8_t *maskBytes = info->expMask;
	const uint8_t *patBytes = info->expPat;
	uint32_t headMask, tailMask, wrapMask;
	int32_t bodyWords, rowSkip, n;
	uint16_t *out;
	
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	if (width < 0)
		return;
	
	headMask = 0xFFFF >> (dstOffset & 0x0F);
	tailMask = (uint16_t)~(0xFFFF >> ((dstOffset + width) & 0x0F));
	out = (uint16_t*)(((uint8_t*)dstParam) + ((dstOffset >> 3) & 0xFFFE));
	bodyWords = ((dstOffset + width) >> 4) - (dstOffset >> 4) - 1;
	rowSkip = dstRowBytes - bodyWords * 2 - 4;	//assumes head + body + tail get written, corrected below
	wrapMask = patWords - 1;
	
	if (bodyWords < 0) {
		
		//start and end share one word: a single mask covers it all
		headMask = ((uint16_t)~(0xFFFF >> width)) >> (dstOffset & 0x0F);
		tailMask = 0;
		bodyWords = 0;
	}
	else if (!tailMask)
		rowSkip += 2;	//no tail word will be written
	
	if (headMask == 0xFFFF) {
		
		//a fully covered first word is handled as part of the body
		headMask = 0;
		bodyWords++;
	}
	
	if (info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		headMask = SwapPixelEndian(headMask, info->depth);
		tailMask = SwapPixelEndian(tailMask, info->depth);
	}
	
	headMask = __builtin_bswap16(headMask);
	tailMask = __builtin_bswap16(tailMask);
	
	for (; height > 0; height--, patOffsetV++) {
		
		const uint32_t rowStart = ((patOffsetV >> rowPreShift) & 7) << rowBytesShift;
		const uint16_t *patRow = (const uint16_t*)&patBytes[rowStart];
		const uint16_t *maskRow = (const uint16_t*)&maskBytes[rowStart];
		uint32_t word = dstOffset >> 4;	//destination word index, used to pick the matching pattern word
		
		if (headMask) {
			
			const uint32_t idx = word & wrapMask;
			
			*out = (*out & ((~headMask) | maskRow[idx])) | (patRow[idx] & headMask);
			out++;
			word++;
		}
		
		for (n = 0; n < bodyWords; n++, word++, out++) {
			
			const uint32_t idx = word & wrapMask;
			
			*out = (*out & maskRow[idx]) | patRow[idx];
		}
		
		if (tailMask) {
			
			const uint32_t idx = word & wrapMask;
			
			*out = (*out & ((~tailMask) | maskRow[idx])) | (patRow[idx] & tailMask);
			out++;
		}
		
		out += rowSkip / 2;
	}
}

//Fills a rectangle of the destination with the expanded pattern (info->expPat), replacing what was there.
//	patOffsetV - vertical pattern phase; it advances by one per row and selects one of 8 pattern rows
//	dstParam   - start of the first destination row
//	dstOffset, width - horizontal start and extent, scaled by info->depthShift before use
//	height     - number of rows
//The destination is handled as 16-bit words: an optional partial first word (leftMask), a run of whole
//words (centerWords) and an optional partial last word (rightMask).
static void PrvBlitPatCopy(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	int32_t i, j, centerWords, dstRowAdd, dstRowBytes = info->dstRowBytes, patWidth = info->patRowBytesAcross >> 1;
	uint32_t expPatRowBytesShift = info->expPatRowBytesShift, expPatRowPreShift = info->expPatRowPreShift;
	uint32_t leftMask, rightMask, alignMask;
	const uint8_t *expPat = info->expPat;
	uint16_t *dst;
	
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	if (width <= 0 || height <= 0)
		return;
	
	leftMask = 0xFFFF >> (dstOffset & 0x0F);
	rightMask = (uint16_t)~(0xFFFF >> ((dstOffset + width) & 0x0F));
	dst = (uint16_t*)(((uint8_t*)dstParam) + ((dstOffset >> 3) & 0xFFFE));
	centerWords = ((dstOffset + width) >> 4) - (dstOffset >> 4) - 1;
	//bytes to skip after a row; assumes left + center + right words were written, corrected below
	dstRowAdd = dstRowBytes - centerWords * 2 - 4;
	
	//pattern width in words minus one, used to wrap the word index
	alignMask = patWidth - 1;
	
	if (centerWords < 0) {
		
		//start and end fall into the same word: one mask covers everything
		leftMask = ((uint16_t)~(0xFFFF >> width)) >> (dstOffset & 0x0F);
		rightMask = 0;
		centerWords = 0;
	}
	else if (!rightMask)
		dstRowAdd += 2;	//no right word gets written
	
	if (leftMask == 0xFFFF) {
		//a fully covered first word is treated as a center word
		leftMask = 0;
		centerWords++;
	}
	
	if (info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		leftMask = SwapPixelEndian(leftMask, info->depth);
		rightMask = SwapPixelEndian(rightMask, info->depth);
	}
	
	leftMask = __builtin_bswap16(leftMask);
	rightMask = __builtin_bswap16(rightMask);
	
	if (patWidth == 1) {		//common cases are worth optimizing
		
		//the pattern row is a single word, so no index tracking is needed
		uint32_t pat;
		
		while (height-- > 0) {
			
			pat = *(uint16_t*)&expPat[((patOffsetV++ >> expPatRowPreShift) & 7) << expPatRowBytesShift];	//do not ask
			
			if (leftMask) {
				
				*dst = (*dst &~ leftMask) | (pat & leftMask);
				dst++;
			}
			if (centerWords > 0) {
				
				for (i = 0; i < centerWords; i++)
					*dst++ = pat;
			}
			if (rightMask) {
				
				*dst = (*dst &~ rightMask) | (pat & rightMask);
				dst++;
			}
			dst += dstRowAdd / 2;
		}
	}
	else if (patWidth == 2) {	//common cases are worth optimizing
		
		const uint16_t *patP;
		uint32_t align;

		while (height-- > 0) {
			
			patP = (uint16_t*)&expPat[((patOffsetV++ >> expPatRowPreShift) & 7) << expPatRowBytesShift];	//do not ask
			align = dstOffset >> 4;
			
			if (leftMask) {
				
				*dst = (*dst &~ leftMask) | (patP[align++ & alignMask] & leftMask);
				dst++;
			}
			if (centerWords > 0) {
				
				//write one word first if the count is odd, so that the rest is whole pairs
				if (centerWords & 1)
					*dst++ = patP[align++ & alignMask];
				
				if (centerWords >= 2) {
					
					uint32_t first, second, cycles = centerWords / 2;
					
					//align moves on by two here, which leaves its position within the 2-word pattern unchanged for the right edge
					first = patP[align++ & alignMask];
					second = patP[align++ & alignMask];
					
					while (cycles--) {
						
						*dst++ = first;
						*dst++ = second;
					}
				}
			}
			if (rightMask) {
				
				*dst = (*dst &~ rightMask) | (patP[align++ & alignMask] & rightMask);
				dst++;
			}
			dst += dstRowAdd / 2;
		}
	}
	else {						//generic case - it can handle the above cases too (once you remove the quick loop in the middle that assumes patWidth >= 4), but that would be slower
		
		const uint16_t *patP;
		uint32_t align;
		
		while (height-- > 0) {
			
			patP = (uint16_t*)&expPat[((patOffsetV++ >> expPatRowPreShift) & 7) << expPatRowBytesShift];	//do not ask
			align = dstOffset >> 4;
			
			if (leftMask) {
				
				*dst = (*dst &~ leftMask) | (patP[align++ & alignMask] & leftMask);
				dst++;
			}
			if (centerWords > 0) {
				
				i = centerWords;
				
				//word by word until the pattern index wraps to 0 (or the words run out)
				while ((align & alignMask) && (i-- > 0))
					*dst++ = patP[align++ & alignMask];
				
				//whole pattern rows, four words per step; align is left alone since its wrapped index is 0 before and after
				while (i >= patWidth) {
					
					for (j = 0; j < patWidth; j += 4) {
						*dst++ = *patP++;
						*dst++ = *patP++;
						*dst++ = *patP++;
						*dst++ = *patP++;
					}
					
					i -= patWidth;
					patP -= patWidth;	//rewind to the start of the pattern row
				}
			
				//whatever is left, word by word
				while(i-- > 0)
					*dst++ = patP[align++ & alignMask];
			}
			if (rightMask) {
				
				*dst = (*dst &~ rightMask) | (patP[align++ & alignMask] & rightMask);
				dst++;
			}
		
			dst += dstRowAdd / 2;
		}
	}
}

//XORs the expanded pattern (info->expPat) into a rectangle of the destination.
//The destination is processed as 16-bit words: an optional partial first word, a run of whole words and an
//optional partial last word; the edge masks keep bits outside the rectangle unchanged.
//dstOffset and width are scaled by info->depthShift before use; patOffsetV selects the pattern row and
//advances by one per destination row.
static void PrvBlitPatXOR(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t rowBytesShift = info->expPatRowBytesShift, rowPreShift = info->expPatRowPreShift;
	const uint32_t patWords = info->patRowBytesAcross >> 1;
	const int32_t dstRowBytes = info->dstRowBytes;
	const uint8_t *patBytes = info->expPat;
	uint32_t headMask, tailMask, wrapMask;
	int32_t bodyWords, rowSkip, n;
	uint16_t *out;
	
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	if (width < 0)
		return;
	
	headMask = 0xFFFF >> (dstOffset & 0x0F);
	tailMask = (uint16_t)~(0xFFFF >> ((dstOffset + width) & 0x0F));
	out = (uint16_t*)(((uint8_t*)dstParam) + ((dstOffset >> 3) & 0xFFFE));
	bodyWords = ((dstOffset + width) >> 4) - (dstOffset >> 4) - 1;
	rowSkip = dstRowBytes - bodyWords * 2 - 4;	//assumes head + body + tail get written, corrected below
	wrapMask = patWords - 1;
	
	if (bodyWords < 0) {
		
		//start and end share one word: a single mask covers it all
		headMask = ((uint16_t)~(0xFFFF >> width)) >> (dstOffset & 0x0F);
		tailMask = 0;
		bodyWords = 0;
	}
	else if (!tailMask)
		rowSkip += 2;	//no tail word will be written
	
	if (headMask == 0xFFFF) {
		
		//a fully covered first word is handled as part of the body
		headMask = 0;
		bodyWords++;
	}
	
	if (info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		headMask = SwapPixelEndian(headMask, info->depth);
		tailMask = SwapPixelEndian(tailMask, info->depth);
	}
	
	headMask = __builtin_bswap16(headMask);
	tailMask = __builtin_bswap16(tailMask);
	
	for (; height > 0; height--, patOffsetV++) {
		
		const uint16_t *patRow = (const uint16_t*)&patBytes[((patOffsetV >> rowPreShift) & 7) << rowBytesShift];
		uint32_t word = dstOffset >> 4;	//destination word index, used to pick the matching pattern word
		
		if (headMask) {
			
			*out ^= patRow[word & wrapMask] & headMask;
			out++;
			word++;
		}
		
		for (n = 0; n < bodyWords; n++, word++, out++)
			*out ^= patRow[word & wrapMask];
		
		if (tailMask) {
			
			*out ^= patRow[word & wrapMask] & tailMask;
			out++;
		}
		
		out += rowSkip / 2;
	}
}

//Pattern blitter that deliberately does nothing: it has the same signature as the other PrvBlitPat* routines and leaves the destination untouched.
static void PrvBlitPatNOP(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	//every argument is intentionally ignored
	(void)info;
	(void)patOffsetV;
	(void)dstParam;
	(void)dstOffset;
	(void)width;
	(void)height;
}

//XORs the constant 16-bit pattern word info->pattern into a rectangle of the destination.
//	patOffsetV - not used (the pattern is the same on every row)
//	dstParam   - start of the first destination row
//	dstOffset, width - horizontal start and extent, scaled by info->depthShift before use
//	height     - number of rows
//The destination is handled as 16-bit words: an optional partial first word (leftMask), a run of whole
//words (centerWords) and an optional partial last word (rightMask).
static void PrvBlitPatConstXOR(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	uint32_t value = info->pattern, value32 = (value << 16) + value, leftMask, rightMask;
	int32_t i, centerWords, dstRowAdd, dstRowBytes = info->dstRowBytes;
	uint16_t *dst;
	
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	leftMask = 0xFFFF >> (dstOffset & 0x0F);
	rightMask = (uint16_t)~(0xFFFF >> ((dstOffset + width) & 0x0F));
	dst = (uint16_t*)(((uint8_t*)dstParam) + ((dstOffset >> 3) & 0xFFFE));
	centerWords = ((dstOffset + width) >> 4) - (dstOffset >> 4) - 1;
	//bytes to skip after a row; assumes left + center + right words were written, corrected below
	dstRowAdd = dstRowBytes - centerWords * 2 - 4;
	
	if (centerWords < 0) {
		
		//start and end fall into the same word: one mask covers everything
		leftMask = ((uint16_t)~(0xFFFF >> width)) >> (dstOffset & 0x0F);
		rightMask = 0;
		centerWords = 0;
	}
	else if (!rightMask)
		dstRowAdd += 2;	//no right word gets written
	
	if (leftMask == 0xFFFF) {
		//a fully covered first word is treated as a center word
		leftMask = 0;
		centerWords++;
	}
	
	if (info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		leftMask = SwapPixelEndian(leftMask, info->depth);
		rightMask = SwapPixelEndian(rightMask, info->depth);
	}
	
	leftMask = __builtin_bswap16(leftMask);
	rightMask = __builtin_bswap16(rightMask);
	
	if (centerWords <= 0 && !rightMask) {
		
		//only one word per row is touched. dstRowAdd does not count that word, hence the "+ 1":
		//the pointer has to move down a whole row each time, not just on to the next word
		while (height-- > 0) {
			
			*dst ^= value & leftMask;
			dst += dstRowAdd / 2 + 1;
		}
	}
	else {
		
		while (height-- > 0) {
			
			if (leftMask) {
				
				*dst++ ^= value & leftMask;
			}
			
			if (centerWords > 0) {
				
				i = centerWords;
				//get to a 4-byte boundary so the bulk loop can use 32-bit accesses
				if (((uintptr_t)dst) & 2) {
					
					*dst++ ^= value;
					i--;
				}
				//eight words (four 32-bit accesses) per pass
				while (i >= 8) {
					
					uint32_t *dst32;
					
					i -= 8;
					dst32 = (uint32_t*)dst;
					*dst32++ ^= value32;
					*dst32++ ^= value32;
					*dst32++ ^= value32;
					*dst32++ ^= value32;
					dst = (uint16_t*)dst32;
				}
				while (i--) {
					
					*dst++ ^= value;
				}
			}
			
			if (rightMask) {
				
				*dst++ ^= value & rightMask;
			}
			
			dst += dstRowAdd / 2;
		}
	}
}

//Fills a rectangle of the destination with the constant 16-bit pattern word info->pattern.
//patOffsetV is not used since every row gets the same word. dstOffset and width are scaled by
//info->depthShift before use. The destination is processed as 16-bit words: an optional partial first
//word, a run of whole words and an optional partial last word; edge masks protect bits outside the rectangle.
static void PrvBlitPatConstCopy(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t fill16 = info->pattern, fill32 = (fill16 << 16) + fill16;
	const int32_t dstRowBytes = info->dstRowBytes;
	uint32_t headMask, tailMask;
	int32_t bodyWords, rowSkip, left;
	uint16_t *out;
	
	dstOffset <<= info->depthShift;
	width <<= info->depthShift;
	
	headMask = 0xFFFF >> (dstOffset & 0x0F);
	tailMask = (uint16_t)~(0xFFFF >> ((dstOffset + width) & 0x0F));
	out = (uint16_t*)(((uint8_t*)dstParam) + ((dstOffset >> 3) & 0xFFFE));
	bodyWords = ((dstOffset + width) >> 4) - (dstOffset >> 4) - 1;
	rowSkip = dstRowBytes - bodyWords * 2 - 4;	//assumes head + body + tail get written, corrected below
	
	if (bodyWords < 0) {
		
		//start and end share one word: a single mask covers it all
		headMask = ((uint16_t)~(0xFFFF >> width)) >> (dstOffset & 0x0F);
		tailMask = 0;
		bodyWords = 0;
	}
	else if (!tailMask)
		rowSkip += 2;	//no tail word will be written
	
	if (headMask == 0xFFFF) {
		
		//a fully covered first word is handled as part of the body
		headMask = 0;
		bodyWords++;
	}
	
	if (info->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
		
		headMask = SwapPixelEndian(headMask, info->depth);
		tailMask = SwapPixelEndian(tailMask, info->depth);
	}
	
	headMask = __builtin_bswap16(headMask);
	tailMask = __builtin_bswap16(tailMask);
	
	if (bodyWords <= 0 && !tailMask) {
		
		//a single word per row: merge it and step straight down to the same word of the next row
		for (; height > 0; height--) {
			
			*out = (*out &~ headMask) | (fill16 & headMask);
			out += rowSkip / 2 + 1;
		}
		return;
	}
	
	for (; height > 0; height--) {
		
		if (headMask) {
			
			*out = (*out &~ headMask) | (fill16 & headMask);
			out++;
		}
		
		if (bodyWords > 0) {
			
			left = bodyWords;
			
			//reach a 4-byte boundary so that the bulk loop can store 32 bits at a time
			if (((uintptr_t)out) & 2) {
				
				*out++ = fill16;
				left--;
			}
			
			//eight words (four 32-bit stores) per pass
			for (; left >= 8; left -= 8) {
				
				uint32_t *out32 = (uint32_t*)out;
				
				out32[0] = fill32;
				out32[1] = fill32;
				out32[2] = fill32;
				out32[3] = fill32;
				out = (uint16_t*)(out32 + 4);
			}
			
			for (; left > 0; left--)
				*out++ = fill16;
		}
		
		if (tailMask) {
			
			*out = (*out &~ tailMask) | (fill16 & tailMask);
			out++;
		}
		
		out += rowSkip / 2;
	}
}

//Fills a width x height rectangle of an 8bpp destination with a constant value.
//Single bytes are written with the low 8 bits of info->pattern; the bulk loop writes 32-bit words made of
//info->pattern repeated twice (so presumably the pattern already holds the pixel byte in both halves - confirm at the caller).
//dstOffset is the byte offset of the first pixel in a row; patOffsetV is not used.
static void PrvBlitPatConstCopy8(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t pat = info->pattern;
	const uint32_t fill32 = (pat << 16) + pat;
	const int32_t dstRowBytes = info->dstRowBytes;
	uint8_t *rowStart = ((uint8_t*)dstParam) + dstOffset;
	int32_t row;
	
	if (width <= 0 || height <= 0)
		return;
	
	for (row = 0; row < height; row++) {
		
		uint8_t *out = rowStart;
		int32_t left = width;
		
		//single bytes until the pointer is 4-byte aligned
		while (left > 0 && (((uintptr_t)out) & 3)) {
			*out++ = fill32;
			left--;
		}
		
		//eight pixels (two 32-bit stores) per pass
		while (left >= 8) {
			uint32_t *out32 = (uint32_t*)out;
			
			out32[0] = fill32;
			out32[1] = fill32;
			out += 8;
			left -= 8;
		}
		
		//leftovers
		while (left > 0) {
			*out++ = fill32;
			left--;
		}
		
		rowStart += dstRowBytes;
	}
}

//Fills a width x height rectangle of a 16bpp destination with the constant pixel info->pattern.
//Single pixels are written as 16-bit stores; the bulk loop writes 32-bit words holding the pixel twice.
//dstOffset is the pixel index of the first pixel in a row; patOffsetV is not used.
static void PrvBlitPatConstCopy16(const struct BlitPatInfo *info, int32_t patOffsetV, void* dstParam, int32_t dstOffset, int32_t width, int32_t height)
{
	const uint32_t pat = info->pattern;
	const uint32_t fill32 = (pat << 16) + pat;
	const int32_t dstRowBytes = info->dstRowBytes;
	uint16_t *rowStart = ((uint16_t*)dstParam) + dstOffset;
	int32_t row;
	
	if (width <= 0 || height <= 0)
		return;
	
	for (row = 0; row < height; row++) {
		
		uint16_t *out = rowStart;
		int32_t left = width;
		
		//single pixels until the pointer is 4-byte aligned
		while (left > 0 && (((uintptr_t)out) & 3)) {
			*out++ = fill32;
			left--;
		}
		
		//eight pixels (four 32-bit stores) per pass
		while (left >= 8) {
			uint32_t *out32 = (uint32_t*)out;
			
			out32[0] = fill32;
			out32[1] = fill32;
			out32[2] = fill32;
			out32[3] = fill32;
			out += 8;
			left -= 8;
		}
		
		//leftovers
		while (left > 0) {
			*out++ = fill32;
			left--;
		}
		
		rowStart += dstRowBytes / sizeof(*rowStart);
	}
}

//Transparent copy of one row of 8bpp source pixels into a destination of info->depth bpp: a source pixel equal
//to info->transparentColor (low 8 bits) leaves the destination pixel alone, any other pixel replaces it.
//For 16bpp the source value is first looked up in info->xlate to get the 16-bit color.
//For depths below 8 the row is processed as: leading partial byte, whole bytes, trailing partial byte.
static void PrvBlit8ToNOverlay(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)
{
	const uint32_t keyColor = (uint8_t)info->transparentColor;
	const uint32_t depth = info->depth;
	const int32_t wholeCount = info->numFullPixels;
	uint32_t px;
	int32_t n, k;
	
	srcP += info->srcOfst;
	dstP += info->dstOfst;
	
	if (depth == 16) {
		const uint16_t* colors16 = info->xlate;
		uint16_t *out16 = (uint16_t*)dstP;
		
		for (n = wholeCount; n > 0; n--, out16++) {
			
			px = *srcP++;
			if (px != keyColor)
				*out16 = colors16[px];
		}
	}
	else if (depth == 8) {
		
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			px = *srcP++;
			if (px != keyColor)
				*dstP = px;
		}
	}
	else {
		const uint8_t *lookup = info->needsSwap ? SwapForDepth(depth) : k8UnityMapping;
		const uint32_t onePixel = (1 << depth) - 1;
		const int32_t ppb = info->pixelsPerByte;
		uint32_t sel, bits;	//sel: bits of the byte to replace, bits: their new contents
		
		//leading partial byte
		if (info->initialPixels) {
			
			sel = 0;
			bits = 0;
			for (k = 0; k < info->initialPixels; k++) {
				px = *srcP++;
				sel <<= depth;
				bits <<= depth;
				if (px != keyColor) {
					sel |= onePixel;
					bits += px;
				}
			}
			sel <<= info->initialPixelShift;
			bits <<= info->initialPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | bits];
			dstP++;
		}
		
		//whole bytes
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			sel = 0;
			bits = 0;
			for (k = 0; k < ppb; k++) {
				px = *srcP++;
				sel <<= depth;
				bits <<= depth;
				if (px != keyColor) {
					sel |= onePixel;
					bits += px;
				}
			}
			*dstP = lookup[(lookup[*dstP] &~ sel) | bits];
		}
		
		//trailing partial byte
		if (info->trailingPixels) {
			
			sel = 0;
			bits = 0;
			for (k = 0; k < info->trailingPixels; k++) {
				px = *srcP++;
				sel <<= depth;
				bits <<= depth;
				if (px != keyColor) {
					sel |= onePixel;
					bits += px;
				}
			}
			sel <<= info->trailingPixelShift;
			bits <<= info->trailingPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | bits];
		}
	}
}

//XORs one row of 8bpp source pixels into a destination of info->depth bpp.
//For 16bpp the source value is looked up in info->xlate and the COMPLEMENT of that 16-bit value is XORed in;
//for 8bpp and below the source values themselves are XORed in.
//For depths below 8 the row is processed as: leading partial byte, whole bytes, trailing partial byte.
static void PrvBlit8ToNXOR(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)
{
	const uint32_t depth = info->depth;
	const int32_t wholeCount = info->numFullPixels;
	int32_t n, k;
	
	srcP += info->srcOfst;
	dstP += info->dstOfst;
	
	if (depth == 16) {
		const uint16_t* colors16 = info->xlate;
		uint16_t *out16 = (uint16_t*)dstP;
		
		for (n = wholeCount; n > 0; n--, out16++, srcP++)
			*out16 ^= ~colors16[*srcP];
	}
	else if (depth == 8) {
		
		for (n = wholeCount; n > 0; n--, dstP++, srcP++)
			*dstP ^= *srcP;
	}
	else {
		const uint8_t *lookup = info->needsSwap ? SwapForDepth(depth) : k8UnityMapping;
		const int32_t ppb = info->pixelsPerByte;
		uint32_t packed;
		
		//leading partial byte
		if (info->initialPixels) {
			
			packed = 0;
			for (k = 0; k < info->initialPixels; k++) {
				packed <<= depth;
				packed |= *srcP++;
			}
			packed <<= info->initialPixelShift;
			*dstP ^= lookup[packed & 0xFF];
			dstP++;
		}
		
		//whole bytes
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			packed = 0;
			for (k = 0; k < ppb; k++) {
				packed <<= depth;
				packed |= *srcP++;
			}
			*dstP ^= lookup[packed & 0xFF];
		}
		
		//trailing partial byte
		if (info->trailingPixels) {
			
			packed = 0;
			for (k = 0; k < info->trailingPixels; k++) {
				packed <<= depth;
				packed |= *srcP++;
			}
			packed <<= info->trailingPixelShift;
			*dstP ^= lookup[packed & 0xFF];
		}
	}
}

//Paints info->backgroundColor into every destination pixel whose source pixel DIFFERS from
//info->transparentColor (low 8 bits); destination pixels under a matching source pixel are left alone.
//For depths below 8 the row is processed as: leading partial byte, whole bytes, trailing partial byte,
//and the selected bits are taken from backgroundColor at the same bit positions.
static void PrvBlit8ToNMask(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)
{
	const uint32_t depth = info->depth, fill = info->backgroundColor;
	const uint32_t keyColor = (uint8_t)info->transparentColor;
	const int32_t wholeCount = info->numFullPixels;
	int32_t n, k;
	
	srcP += info->srcOfst;
	dstP += info->dstOfst;
	
	if (depth == 16) {
		uint16_t *out16 = (uint16_t*)dstP;
		
		for (n = wholeCount; n > 0; n--, out16++) {
			
			if (*srcP++ != keyColor)
				*out16 = fill;
		}
	}
	else if (depth == 8) {
		
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			if (*srcP++ != keyColor)
				*dstP = fill;
		}
	}
	else {
		const uint8_t *lookup = info->needsSwap ? SwapForDepth(depth) : k8UnityMapping;
		const uint32_t onePixel = (1 << depth) - 1;
		const int32_t ppb = info->pixelsPerByte;
		uint32_t sel;	//bits of the current byte that get the fill color
		
		//leading partial byte
		if (info->initialPixels) {
			
			sel = 0;
			for (k = 0; k < info->initialPixels; k++) {
				sel <<= depth;
				if (*srcP++ != keyColor)
					sel |= onePixel;
			}
			sel <<= info->initialPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
			dstP++;
		}
		
		//whole bytes
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			sel = 0;
			for (k = 0; k < ppb; k++) {
				sel <<= depth;
				if (*srcP++ != keyColor)
					sel |= onePixel;
			}
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
		}
		
		//trailing partial byte
		if (info->trailingPixels) {
			
			sel = 0;
			for (k = 0; k < info->trailingPixels; k++) {
				sel <<= depth;
				if (*srcP++ != keyColor)
					sel |= onePixel;
			}
			sel <<= info->trailingPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
		}
	}
}

//Paints info->backgroundColor into every destination pixel whose source pixel EQUALS
//info->transparentColor (low 8 bits); destination pixels under any other source pixel are left alone.
//For depths below 8 the row is processed as: leading partial byte, whole bytes, trailing partial byte,
//and the selected bits are taken from backgroundColor at the same bit positions.
static void PrvBlit8ToNErase(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)
{
	const uint32_t depth = info->depth, fill = info->backgroundColor;
	const uint32_t keyColor = (uint8_t)info->transparentColor;
	const int32_t wholeCount = info->numFullPixels;
	int32_t n, k;
	
	srcP += info->srcOfst;
	dstP += info->dstOfst;
	
	if (depth == 16) {
		uint16_t *out16 = (uint16_t*)dstP;
		
		for (n = wholeCount; n > 0; n--, out16++) {
			
			if (*srcP++ == keyColor)
				*out16 = fill;
		}
	}
	else if (depth == 8) {
		
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			if (*srcP++ == keyColor)
				*dstP = fill;
		}
	}
	else {
		const uint8_t *lookup = info->needsSwap ? SwapForDepth(depth) : k8UnityMapping;
		const uint32_t onePixel = (1 << depth) - 1;
		const int32_t ppb = info->pixelsPerByte;
		uint32_t sel;	//bits of the current byte that get the fill color
		
		//leading partial byte
		if (info->initialPixels) {
			
			sel = 0;
			for (k = 0; k < info->initialPixels; k++) {
				sel <<= depth;
				if (*srcP++ == keyColor)
					sel |= onePixel;
			}
			sel <<= info->initialPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
			dstP++;
		}
		
		//whole bytes
		for (n = wholeCount; n > 0; n--, dstP++) {
			
			sel = 0;
			for (k = 0; k < ppb; k++) {
				sel <<= depth;
				if (*srcP++ == keyColor)
					sel |= onePixel;
			}
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
		}
		
		//trailing partial byte
		if (info->trailingPixels) {
			
			sel = 0;
			for (k = 0; k < info->trailingPixels; k++) {
				sel <<= depth;
				if (*srcP++ == keyColor)
					sel |= onePixel;
			}
			sel <<= info->trailingPixelShift;
			*dstP = lookup[(lookup[*dstP] &~ sel) | (fill & sel)];
		}
	}
}

//Copies an 8bpp image scaled 2x in both directions: every source pixel becomes a 2x2 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 2x2 blocks are drawn: a leftover odd column or row is not touched.
static void PrvDraw8BitCopyDoubled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2;
	int32_t x;
	
	if (width < 2 || height < 2)	//not even one block fits
		return;
	
	srcPixels = width / 2;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually written per row, so that an odd width does not skew the following rows
	dstAdd = dstRowBytes * 2 - srcPixels * 2;
	dst2 = dst1 + dstRowBytes;
	
	while ((height -= 2) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
	}
}

//Copies an 8bpp image scaled 3x in both directions: every source pixel becomes a 3x3 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 3x3 blocks are drawn: leftover columns or rows are not touched.
static void PrvDraw8BitCopyTripled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2, *dst3;
	int32_t x;
	
	if (width < 3 || height < 3)	//not even one block fits
		return;
	
	srcPixels = width / 3;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually written per row, so that a width that is not a multiple of 3 does not skew the following rows
	dstAdd = dstRowBytes * 3 - srcPixels * 3;
	dst2 = dst1 + dstRowBytes;
	dst3 = dst2 + dstRowBytes;
	
	while ((height -= 3) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
			*dst3++ = pixel;
			*dst3++ = pixel;
			*dst3++ = pixel;
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
		dst3 += dstAdd;
	}
}

//Copies an 8bpp image scaled 4x in both directions: every source pixel becomes a 4x4 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 4x4 blocks are drawn: leftover columns or rows are not touched.
static void PrvDraw8BitCopyQuadrupled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2, *dst3, *dst4;
	int32_t x;
	
	if (width < 4 || height < 4)	//not even one block fits
		return;
	
	srcPixels = width / 4;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually written per row, so that a width that is not a multiple of 4 does not skew the following rows
	dstAdd = dstRowBytes * 4 - srcPixels * 4;
	dst2 = dst1 + dstRowBytes;
	dst3 = dst2 + dstRowBytes;
	dst4 = dst3 + dstRowBytes;
	
	while ((height -= 4) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst1++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
			*dst2++ = pixel;
			*dst3++ = pixel;
			*dst3++ = pixel;
			*dst3++ = pixel;
			*dst3++ = pixel;
			*dst4++ = pixel;
			*dst4++ = pixel;
			*dst4++ = pixel;
			*dst4++ = pixel;
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
		dst3 += dstAdd;
		dst4 += dstAdd;
	}
}

//Draws an 8bpp image onto an 8bpp destination at 1:1 scale, skipping every source pixel equal to transparentColor.
//	srcRowAdd, dstRowAdd - bytes to skip at the END of each row (row stride minus width), not the strides themselves
//	height, width        - size of the area in pixels
static void PrvDraw8BitTransBitmap(const uint8_t *src, uint8_t *dst, uint32_t srcRowAdd, uint32_t dstRowAdd, uint32_t transparentColor, int32_t height, int32_t width)
{
	int32_t row, left;
	
	for (row = 0; row < height; row++) {
		
		//four pixels per pass
		for (left = width; left >= 4; left -= 4, src += 4, dst += 4) {
			
			if (src[0] != transparentColor)
				dst[0] = src[0];
			if (src[1] != transparentColor)
				dst[1] = src[1];
			if (src[2] != transparentColor)
				dst[2] = src[2];
			if (src[3] != transparentColor)
				dst[3] = src[3];
		}
		
		//up to three leftover pixels
		for (; left > 0; left--, src++, dst++) {
			
			if (*src != transparentColor)
				*dst = *src;
		}
		
		src += srcRowAdd;
		dst += dstRowAdd;
	}
}

//Draws an 8bpp image scaled 2x in both directions, skipping source pixels equal to transparentPixel:
//every other source pixel becomes a 2x2 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 2x2 blocks are drawn: a leftover odd column or row is not touched.
static void PrvDraw8BitTransBitmapDoubled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, uint32_t transparentPixel, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2;
	int32_t x;
	
	if (width < 2 || height < 2)	//not even one block fits
		return;
	
	srcPixels = width / 2;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually covered per row, so that an odd width does not skew the following rows
	dstAdd = dstRowBytes * 2 - srcPixels * 2;
	dst2 = dst1 + dstRowBytes;
	
	while ((height -= 2) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			if (pixel == transparentPixel) {
				dst1 += 2;
				dst2 += 2;
			}
			else {
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
			}
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
	}
}

//Draws an 8bpp image scaled 3x in both directions, skipping source pixels equal to transparentPixel:
//every other source pixel becomes a 3x3 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 3x3 blocks are drawn: leftover columns or rows are not touched.
static void PrvDraw8BitTransBitmapTripled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, uint32_t transparentPixel, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2, *dst3;
	int32_t x;
	
	if (width < 3 || height < 3)	//not even one block fits
		return;
	
	srcPixels = width / 3;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually covered per row, so that a width that is not a multiple of 3 does not skew the following rows
	dstAdd = dstRowBytes * 3 - srcPixels * 3;
	dst2 = dst1 + dstRowBytes;
	dst3 = dst2 + dstRowBytes;
	
	while ((height -= 3) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			if (pixel == transparentPixel) {
				dst1 += 3;
				dst2 += 3;
				dst3 += 3;
			}
			else {
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
				*dst3++ = pixel;
				*dst3++ = pixel;
				*dst3++ = pixel;
			}
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
		dst3 += dstAdd;
	}
}

//Draws an 8bpp image scaled 4x in both directions, skipping source pixels equal to transparentPixel:
//every other source pixel becomes a 4x4 block in the destination.
//	src, srcRowBytes  - source pixels and source row stride in bytes
//	dst1, dstRowBytes - first destination row and destination row stride in bytes
//	height, width     - size of the DESTINATION area in pixels
//Only whole 4x4 blocks are drawn: leftover columns or rows are not touched.
static void PrvDraw8BitTransBitmapQuadrupled(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, uint32_t transparentPixel, int32_t height, int32_t width)
{
	uint32_t srcPixels, dstAdd, srcAdd;
	uint8_t *dst2, *dst3, *dst4;
	int32_t x;
	
	if (width < 4 || height < 4)	//not even one block fits
		return;
	
	srcPixels = width / 4;
	srcAdd = srcRowBytes - srcPixels;
	//row advance is based on the bytes actually covered per row, so that a width that is not a multiple of 4 does not skew the following rows
	dstAdd = dstRowBytes * 4 - srcPixels * 4;
	dst2 = dst1 + dstRowBytes;
	dst3 = dst2 + dstRowBytes;
	dst4 = dst3 + dstRowBytes;
	
	while ((height -= 4) >= 0) {
		
		for (x = srcPixels; x > 0; x--) {
			
			uint32_t pixel = *src++;
			
			if (pixel == transparentPixel) {
				dst1 += 4;
				dst2 += 4;
				dst3 += 4;
				dst4 += 4;
			}
			else {
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst1++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
				*dst2++ = pixel;
				*dst3++ = pixel;
				*dst3++ = pixel;
				*dst3++ = pixel;
				*dst3++ = pixel;
				*dst4++ = pixel;
				*dst4++ = pixel;
				*dst4++ = pixel;
				*dst4++ = pixel;
			}
		}
		src += srcAdd;
		dst1 += dstAdd;
		dst2 += dstAdd;
		dst3 += dstAdd;
		dst4 += dstAdd;
	}
}

//Draws an 8bpp image scaled 1.5x in both directions, skipping source pixels equal to transparentPixel.
//width and height are in DESTINATION pixels; the source size is derived from them as two thirds (rounded down).
//Scaling is done by alternation: source columns 0, 2, 4, ... are drawn two pixels wide and columns 1, 3, 5, ...
//one pixel wide; likewise source rows 0, 2, 4, ... are drawn onto two destination rows and rows 1, 3, 5, ... onto one.
static void PrvDraw8BitTransBitmapOneAndOneHalf(const uint8_t *src, uint8_t *dst1, uint32_t srcRowBytes, uint32_t dstRowBytes, uint32_t transparentPixel, int32_t height, int32_t width)
{
	uint32_t skipOneRow = dstRowBytes - width, skipTwoRows = dstRowBytes * 2 - width, srcSkip;
	const int32_t srcCols = width * 2 / 3, srcRows = height * 2 / 3;
	uint8_t *dst2 = dst1 + dstRowBytes;	//the row right below dst1
	int32_t row, col;
	
	//an odd source width ends on a double-width column: account for the extra destination pixel
	if (srcCols & 1) {
		skipOneRow--;
		skipTwoRows--;
		width++;
	}
	
	srcSkip = srcRowBytes - srcCols;
	
	for (row = 0; row < srcRows; row++) {
		
		const int tall = !(row & 1);	//even source rows cover two destination rows
		
		for (col = 0; col < srcCols; col++) {
			
			const uint32_t px = *src++;
			const int wide = !(col & 1);	//even source columns cover two destination columns
			
			if (px != transparentPixel) {
				
				dst1[0] = px;
				if (wide)
					dst1[1] = px;
				
				if (tall) {
					dst2[0] = px;
					if (wide)
						dst2[1] = px;
				}
			}
			
			dst1 += wide ? 2 : 1;
			if (tall)
				dst2 += wide ? 2 : 1;
		}
		
		src += srcSkip;
		
		if (tall) {
			dst1 += skipTwoRows;
			dst2 += skipTwoRows;
		}
		else {
			//dst2 was not moved along this row, so catch it up first
			dst2 += width;
			dst1 += skipOneRow;
			dst2 += skipOneRow;
		}
	}
}

static void PrvBlit16To16NotCopy(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)	//replace destination with the bitwise inverse of source
{
	//Writes ~source into the destination (starting info->dstOfst bytes into dstP) for info->numFullPixels >> 1
	//16-bit values. When info->needsSwap is set, each source value is byte-swapped before being inverted.
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	if (info->needsSwap) {
		
		for (i = 0; i < count; i++)
			out[i] = ~__builtin_bswap16(in[i]);
	}
	else {
		
		for (i = 0; i < count; i++)
			out[i] = ~in[i];
	}
}

static void PrvBlit16To16Overlay(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)	//if source color != transparent color, replace destination with source
{
	//Copies every source value that differs from info->transparentColor; destination values under transparent
	//source pixels are left alone. The comparison is done on the raw source value, the optional byte swap
	//(info->needsSwap) is applied only to what gets stored.
	const uint32_t skipValue = info->transparentColor;
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	if (info->needsSwap) {
		
		for (i = 0; i < count; i++) {
			
			uint32_t px = in[i];
			
			if (px != skipValue)
				out[i] = __builtin_bswap16(px);
		}
	}
	else {
		
		for (i = 0; i < count; i++) {
			
			uint32_t px = in[i];
			
			if (px != skipValue)
				out[i] = px;
		}
	}
}

static void PrvBlit16To16XOR(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)		//XOR NOT src into dst
{
	//For info->numFullPixels >> 1 16-bit values: dst ^= ~src. When info->needsSwap is set the source value
	//is byte-swapped before it is inverted.
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	if (info->needsSwap) {
		
		for (i = 0; i < count; i++)
			out[i] ^= ~__builtin_bswap16(in[i]);
	}
	else {
		
		for (i = 0; i < count; i++)
			out[i] ^= ~in[i];
	}
}

static void PrvBlit16To16Mask(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)		//if source color != transparent color, replace destination with background color
{
	//Stamps info->backgroundColor into the destination wherever the source value is NOT info->transparentColor.
	//No byte swapping is performed here; info->needsSwap is not consulted.
	const uint32_t skipValue = info->transparentColor, fill = info->backgroundColor;
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	for (i = 0; i < count; i++) {
		
		if (in[i] != skipValue)
			out[i] = fill;
	}
}

static void PrvBlit16To16Erase(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)		//if source color == transparent color, replace destination with background color
{
	//Stamps info->backgroundColor into the destination wherever the source value IS info->transparentColor.
	//No byte swapping is performed here; info->needsSwap is not consulted.
	const uint32_t matchValue = info->transparentColor, fill = info->backgroundColor;
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	for (i = 0; i < count; i++) {
		
		if (in[i] == matchValue)
			out[i] = fill;
	}
}

static void PrvBlit16To16Copy(const uint8_t *srcP, uint8_t* dstP, const struct PrvBlitInfo *info)		//replace destination with source
{
	//Plain copy of info->numFullPixels >> 1 16-bit values. Without info->needsSwap this is a single memcpy,
	//otherwise each value is byte-swapped on its way to the destination.
	const uint16_t *in = (const uint16_t*)srcP;
	uint16_t *out = (uint16_t*)(dstP + info->dstOfst);
	int32_t i, count = info->numFullPixels >> 1;
	
	if (info->needsSwap) {
		
		for (i = 0; i < count; i++)
			out[i] = __builtin_bswap16(in[i]);
	}
	else
		memcpy(out, in, count * 2);
}

static uint8_t PrvFindBinIndex(uint8_t rgbComponent)
{
	//Rounds an 8-bit color component to the nearest multiple of 0x33 and returns that multiple's index (0..5)
	const unsigned binWidth = 0x33, halfBin = 0x19;
	
	return (uint8_t)((rgbComponent + halfBin) / binWidth);
}

static void PrvGetIndexList(uint8_t redBinIndex, uint8_t greenBinIndex, uint8_t blueBinIndex, uint16_t *startP, uint16_t *endP)
{
	//Looks up the half-open range [*startP, *endP) of candidate-list positions for one (red, green, blue) bin.
	//"offsets" holds 48 entries per red bin and 8 per green bin; the entry following a bin's own entry is that
	//bin's end. "sliceIndexOffsets" is the per-red-bin base that both values are relative to.
	static const uint16_t sliceIndexOffsets[] = {0, 0x33, 0x6D, 0xA7, 0xE8, 0x122};
	static const uint8_t offsets[] = {
		0x00, 0x02, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x09, 0x0C, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x00,
		0x13, 0x15, 0x17, 0x19, 0x1B, 0x1C, 0x1D, 0x00, 0x1D, 0x1F, 0x21, 0x23, 0x25, 0x26, 0x27, 0x00,
		0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x00, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x00,
		0x00, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x00, 0x0A, 0x0D, 0x10, 0x13, 0x14, 0x15, 0x16, 0x00,
		0x16, 0x18, 0x1C, 0x20, 0x22, 0x23, 0x24, 0x00, 0x24, 0x26, 0x28, 0x2A, 0x2C, 0x2D, 0x2E, 0x00,
		0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00,
		0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x0A, 0x00, 0x0A, 0x0C, 0x10, 0x14, 0x16, 0x17, 0x18, 0x00,
		0x18, 0x19, 0x1C, 0x1F, 0x22, 0x23, 0x24, 0x00, 0x24, 0x25, 0x26, 0x29, 0x2C, 0x2D, 0x2E, 0x00,
		0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00,
		0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x0A, 0x00, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x13, 0x14, 0x00,
		0x14, 0x15, 0x16, 0x19, 0x1C, 0x1D, 0x1E, 0x00, 0x1E, 0x1F, 0x20, 0x23, 0x26, 0x2A, 0x2C, 0x00,
		0x2C, 0x2D, 0x2E, 0x2F, 0x33, 0x37, 0x39, 0x00, 0x39, 0x3A, 0x3B, 0x3C, 0x3E, 0x40, 0x41, 0x00,
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x00, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x00,
		0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x00, 0x12, 0x13, 0x14, 0x15, 0x19, 0x1D, 0x1F, 0x00,
		0x1F, 0x20, 0x21, 0x22, 0x26, 0x2A, 0x2E, 0x00, 0x2E, 0x2F, 0x30, 0x31, 0x33, 0x37, 0x3A, 0x00,
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x00, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x00,
		0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x00, 0x12, 0x13, 0x14, 0x15, 0x17, 0x19, 0x1A, 0x00,
		0x1A, 0x1B, 0x1C, 0x1D, 0x1F, 0x23, 0x26, 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2D, 0x2F, 0x00,
	};
	
	//position of this bin's entry; the one right after it is the end marker
	const uint8_t *cell = &offsets[(uint16_t)(redBinIndex * 48 + greenBinIndex * 8 + blueBinIndex)];
	
	*startP = sliceIndexOffsets[redBinIndex] + cell[0];
	*endP = sliceIndexOffsets[redBinIndex] + cell[1];
}

static void PrvFindDefaultColorIndex8Bit(struct PalmClutEntry *matchColor, const struct PalmClutEntry *refColors, uint32_t userColorMask)
{
	//Finds the palette index whose color in refColors is closest (squared RGB distance) to matchColor and stores
	//it in matchColor->idx.
	//	matchColor		- in: r/g/b to match; out: idx
	//	refColors		- palette, indexed by the values found in indexData
	//	userColorMask	- one bit per extra candidate at indexData[0x151..0x169] (palette indices 0xE6..0xFE),
	//					  lowest bit first; a set bit enables that candidate (presumably "user color defined" - confirm with callers)
	//Candidates are the short per-bin list selected by PrvGetIndexList(), followed by the enabled extra entries.
	//If no examined candidate is closer than the initial bestDistance, the first entry of the bin list is used.
	static const uint8_t indexData[] = {
		0xFF, 0xD7, 0xD1, 0xD7, 0xD8, 0xCB, 0x6B, 0x65, 0x5F, 0xD6, 0xD7, 0xD8, 0xD7, 0xD0, 0xD8, 0xCA,
		0x6A, 0x64, 0x5E, 0xD5, 0xE4, 0xCF, 0xE4, 0xC9, 0xE5, 0x69, 0xE5, 0x63, 0x5D, 0xE4, 0xD4, 0xE4,
		0xCE, 0xC8, 0xE5, 0xE5, 0x68, 0x62, 0x5C, 0xD3, 0xCD, 0xC7, 0x67, 0x61, 0x5B, 0xD2, 0xCC, 0xC6,
		0x66, 0x60, 0x5A, 0xC5, 0xD7, 0xD8, 0xD7, 0xBF, 0xD8, 0xB9, 0x59, 0x53, 0x4D, 0xD7, 0xD8, 0xC4,
		0xD8, 0xBE, 0xD9, 0xB8, 0xD9, 0xDA, 0x58, 0x52, 0x4C, 0xC3, 0xE4, 0xBD, 0xE4, 0xD9, 0xDA, 0xB7,
		0xE5, 0xD9, 0xDA, 0x57, 0xE5, 0x51, 0x4B, 0xE4, 0xC2, 0xE4, 0xBC, 0xB6, 0xE5, 0xE5, 0x56, 0x50,
		0x4A, 0xC1, 0xBB, 0xB5, 0x55, 0x4F, 0x49, 0xC0, 0xBA, 0xB4, 0x54, 0x4E, 0x48, 0xB3, 0xE2, 0xAD,
		0xE2, 0xA7, 0xE3, 0x47, 0xE3, 0x41, 0x3B, 0xB2, 0xE2, 0xAC, 0xD9, 0xDA, 0xE2, 0xA6, 0xD9, 0xDA,
		0xE3, 0x46, 0xE3, 0x40, 0x3A, 0xB1, 0xAB, 0xD9, 0xDA, 0xDA, 0xA5, 0xDB, 0x45, 0xDB, 0xDC, 0x3F,
		0x39, 0xB0, 0xAA, 0xA4, 0xDB, 0xDC, 0x44, 0xDB, 0xDC, 0x3E, 0x38, 0xAF, 0xA9, 0xA3, 0x43, 0x3D,
		0x37, 0xAE, 0xA8, 0xA2, 0x42, 0x3C, 0x36, 0xE2, 0xA1, 0xE2, 0x9B, 0x95, 0xE3, 0xE3, 0x35, 0x2F,
		0x29, 0xE2, 0xA0, 0xE2, 0x9A, 0x94, 0xE3, 0xE3, 0x34, 0x2E, 0x28, 0x9F, 0x99, 0x93, 0xDB, 0xDC,
		0x33, 0xDB, 0xDC, 0x2D, 0x27, 0x9E, 0x98, 0x92, 0xDB, 0xDC, 0xDC, 0x32, 0xDD, 0x2C, 0xDD, 0xDE,
		0xE1, 0x26, 0xE1, 0x9D, 0x97, 0x91, 0x31, 0xDD, 0xDE, 0xE1, 0x2B, 0xDD, 0xDE, 0xE1, 0x25, 0xE1,
		0x9C, 0x96, 0x90, 0x30, 0xE1, 0x2A, 0xE1, 0x24, 0x8F, 0x89, 0x83, 0x23, 0x1D, 0x17, 0x8E, 0x88,
		0x82, 0x22, 0x1C, 0x16, 0x8D, 0x87, 0x81, 0x21, 0x1B, 0x15, 0x8C, 0x86, 0x80, 0x20, 0xDD, 0xDE,
		0xE1, 0x1A, 0xDD, 0xDE, 0xE1, 0x14, 0xE1, 0x8B, 0x85, 0x7F, 0x1F, 0xDD, 0xDE, 0xE1, 0xDE, 0xE1,
		0x19, 0xDF, 0xE1, 0x13, 0xDF, 0xE0, 0x8A, 0x84, 0x7E, 0x1E, 0xE1, 0xE1, 0x18, 0xDF, 0xE0, 0x12,
		0xDF, 0xE0, 0x7D, 0x77, 0x71, 0x11, 0x0B, 0x05, 0x7C, 0x76, 0x70, 0x10, 0x0A, 0x04, 0x7B, 0x75,
		0x6F, 0x0F, 0x09, 0x03, 0x7A, 0x74, 0x6E, 0x0E, 0xE1, 0x08, 0xE1, 0x02, 0x79, 0x73, 0x6D, 0x0D,
		0xE1, 0xE1, 0x07, 0xDF, 0xE0, 0x01, 0xDF, 0xE0, 0x78, 0x72, 0x6C, 0x0C, 0x06, 0xDF, 0xE0, 0xE0,
		0x00, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4,
		0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x00, 0x00,
	};
	uint32_t bestIdx, bestDistance = 60000;
	uint16_t startIdx, endIdx;
	
	PrvGetIndexList(PrvFindBinIndex(matchColor->r), PrvFindBinIndex(matchColor->g), PrvFindBinIndex(matchColor->b), &startIdx, &endIdx);
	bestIdx = indexData[startIdx];
	
	//a search is only needed if the bin offers a choice or there are extra candidates to look at
	if (startIdx + 1 < endIdx || userColorMask) {
		
		uint32_t numBinCandidates = endIdx - startIdx;
		
		//build one "enabled" bit per candidate in visiting order: all of the bin's candidates (low bits, always
		//enabled) followed by the caller's extra-candidate bits. Written as (1 << n) - 1 so that n == 0 is well defined
		userColorMask <<= numBinCandidates;
		userColorMask ^= ((uint32_t)1 << numBinCandidates) - 1;
		
		do {
			
			for (;startIdx < endIdx; startIdx++, userColorMask >>= 1) {
			
				const struct PalmClutEntry *refClr;
				uint32_t dist = 0, delta;
				
				//only enabled candidates take part. (This test used to be inverted, which skipped every bin
				//candidate and made the nearest-color search a no-op.)
				if (!(userColorMask & 1))
					continue;
				
				refClr = refColors + indexData[startIdx];
				
				//unsigned wraparound in delta is harmless: the low 32 bits of the square are still correct
				delta = (uint32_t)refClr->r - matchColor->r;
				dist += delta * delta;
				delta = (uint32_t)refClr->g - matchColor->g;
				dist += delta * delta;
				delta = (uint32_t)refClr->b - matchColor->b;
				dist += delta * delta;
				
				if (dist < bestDistance) {
					bestDistance = dist;
					bestIdx = indexData[startIdx];
				}
			}
			
			//next pass (if any enabled bits remain): the extra candidates at the tail of indexData
			startIdx = 0x0151;
			endIdx = 0x016A;
			
		} while (userColorMask);
	}
	matchColor->idx = bestIdx;
}

//Fills the two lookup tables used to convert an indexed source to 16-bit color.
//	srcColorTableP	- source color table, or NULL to use the built-in default tables
//	srcDepth		- source bits per pixel
//	translate		- out: per-source-pixel-value index into translate16
//	translate16		- out: 16-bit color for each index
//	pixelFormat		- destination pixel format; selects the byte order of the values in translate16
//Returns true when the tables were built from srcColorTableP, false when the built-in defaults were copied.
static bool PrvGetTranslation16Table(const struct PalmClut *srcColorTableP, uint32_t srcDepth, uint8_t* translate, uint16_t* translate16, uint32_t pixelFormat)
{
	if (srcColorTableP) {
		
		const struct PalmClutEntry *rgbP = srcColorTableP->entries;
		uint32_t color565, i;
	
		//one entry per possible source pixel value (1 << srcDepth of them); translate becomes the identity mapping
		for (i = 0; i < (1u << srcDepth); i++) {
			
			//pack the top 5 bits of red, 6 of green and 5 of blue into one 16-bit value
			color565 = ((((uint32_t)rgbP[i].r) & 0xF8) << 8) + ((((uint32_t)rgbP[i].g) & 0xFC) << 3) + ((((uint32_t)rgbP[i].b) & 0xF8) >> 3);
			if (pixelFormat == PALM_BMP_PIXEL_FORMAT_RGB565_BE)
				color565 = __builtin_bswap16(color565);
			
			translate16[i] = color565;
			translate[i] = i;
		}
		
		return true;
	}
	else {
		
		//per-depth mappings from a source pixel value to an index into the 256-entry 16-bit tables below
		static const uint8_t k1ToWebTranslate[] = {0xff, 0x00};
		static const uint8_t k2ToWebTranslate[] = {0x00, 0xDD, 0xDA, 0xFF};
		static const uint8_t k4ToWebTranslate[] = {0x00, 0xE0, 0xDF, 0x19, 0xDE, 0xDD, 0x32, 0xDC, 0xDB, 0xA5, 0xDA, 0xD9, 0xBE, 0xD8, 0xD7, 0xFF};

		//used for every pixelFormat other than PALM_BMP_PIXEL_FORMAT_RGB565_LE. NOTE(review): each entry looks like
		//the byte-swapped counterpart of the same entry in kWebTo16LETranslate - confirm if the tables are ever regenerated
		static const uint16_t kWebTo16Translate[] = {
			0xFFFF, 0x7FFE, 0xDFFC, 0x3FFB, 0x9FF9, 0x1FF8, 0xF9FF, 0x79FE, 0xD9FC, 0x39FB, 0x99F9, 0x19F8, 0xF3FF, 0x73FE, 0xD3FC, 0x33FB,
			0x93F9, 0x13F8, 0xFFCF, 0x7FCE, 0xDFCC, 0x3FCB, 0x9FC9, 0x1FC8, 0xF9CF, 0x79CE, 0xD9CC, 0x39CB, 0x99C9, 0x19C8, 0xF3CF, 0x73CE,
			0xD3CC, 0x33CB, 0x93C9, 0x13C8, 0xFF9F, 0x7F9E, 0xDF9C, 0x3F9B, 0x9F99, 0x1F98, 0xF99F, 0x799E, 0xD99C, 0x399B, 0x9999, 0x1998,
			0xF39F, 0x739E, 0xD39C, 0x339B, 0x9399, 0x1398, 0xFF67, 0x7F66, 0xDF64, 0x3F63, 0x9F61, 0x1F60, 0xF967, 0x7966, 0xD964, 0x3963,
			0x9961, 0x1960, 0xF367, 0x7366, 0xD364, 0x3363, 0x9361, 0x1360, 0xFF37, 0x7F36, 0xDF34, 0x3F33, 0x9F31, 0x1F30, 0xF937, 0x7936,
			0xD934, 0x3933, 0x9931, 0x1930, 0xF337, 0x7336, 0xD334, 0x3333, 0x9331, 0x1330, 0xFF07, 0x7F06, 0xDF04, 0x3F03, 0x9F01, 0x1F00,
			0xF907, 0x7906, 0xD904, 0x3903, 0x9901, 0x1900, 0xF307, 0x7306, 0xD304, 0x3303, 0x9301, 0x1300, 0xECFF, 0x6CFE, 0xCCFC, 0x2CFB,
			0x8CF9, 0x0CF8, 0xE6FF, 0x66FE, 0xC6FC, 0x26FB, 0x86F9, 0x06F8, 0xE0FF, 0x60FE, 0xC0FC, 0x20FB, 0x80F9, 0x00F8, 0xECCF, 0x6CCE,
			0xCCCC, 0x2CCB, 0x8CC9, 0x0CC8, 0xE6CF, 0x66CE, 0xC6CC, 0x26CB, 0x86C9, 0x06C8, 0xE0CF, 0x60CE, 0xC0CC, 0x20CB, 0x80C9, 0x00C8,
			0xEC9F, 0x6C9E, 0xCC9C, 0x2C9B, 0x8C99, 0x0C98, 0xE69F, 0x669E, 0xC69C, 0x269B, 0x8699, 0x0698, 0xE09F, 0x609E, 0xC09C, 0x209B,
			0x8099, 0x0098, 0xEC67, 0x6C66, 0xCC64, 0x2C63, 0x8C61, 0x0C60, 0xE667, 0x6666, 0xC664, 0x2663, 0x8661, 0x0660, 0xE067, 0x6066,
			0xC064, 0x2063, 0x8061, 0x0060, 0xEC37, 0x6C36, 0xCC34, 0x2C33, 0x8C31, 0x0C30, 0xE637, 0x6636, 0xC634, 0x2633, 0x8631, 0x0630,
			0xE037, 0x6036, 0xC034, 0x2033, 0x8031, 0x0030, 0xEC07, 0x6C06, 0xCC04, 0x2C03, 0x8C01, 0x0C00, 0xE607, 0x6606, 0xC604, 0x2603,
			0x8601, 0x0600, 0xE007, 0x6006, 0xC004, 0x2003, 0x8001, 0x8210, 0x0421, 0x2842, 0xAA52, 0xAE73, 0x518C, 0x55AD, 0xD7BD, 0xFBDE,
			0x7DEF, 0x18C6, 0x0080, 0x1080, 0x0004, 0x1004, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
			0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
		};
	
		//used when pixelFormat is PALM_BMP_PIXEL_FORMAT_RGB565_LE
		static const uint16_t kWebTo16LETranslate[] = {
			0xFFFF, 0xFE7F, 0xFCDF, 0xFB3F, 0xF99F, 0xF81F, 0xFFF9, 0xFE79, 0xFCD9, 0xFB39, 0xF999, 0xF819, 0xFFF3, 0xFE73, 0xFCD3, 0xFB33,
			0xF993, 0xF813, 0xCFFF, 0xCE7F, 0xCCDF, 0xCB3F, 0xC99F, 0xC81F, 0xCFF9, 0xCE79, 0xCCD9, 0xCB39, 0xC999, 0xC819, 0xCFF3, 0xCE73,
			0xCCD3, 0xCB33, 0xC993, 0xC813, 0x9FFF, 0x9E7F, 0x9CDF, 0x9B3F, 0x999F, 0x981F, 0x9FF9, 0x9E79, 0x9CD9, 0x9B39, 0x9999, 0x9819,
			0x9FF3, 0x9E73, 0x9CD3, 0x9B33, 0x9993, 0x9813, 0x67FF, 0x667F, 0x64DF, 0x633F, 0x619F, 0x601F, 0x67F9, 0x6679, 0x64D9, 0x6339,
			0x6199, 0x6019, 0x67F3, 0x6673, 0x64D3, 0x6333, 0x6193, 0x6013, 0x37FF, 0x367F, 0x34DF, 0x333F, 0x319F, 0x301F, 0x37F9, 0x3679,
			0x34D9, 0x3339, 0x3199, 0x3019, 0x37F3, 0x3673, 0x34D3, 0x3333, 0x3193, 0x3013, 0x07FF, 0x067F, 0x04DF, 0x033F, 0x019F, 0x001F,
			0x07F9, 0x0679, 0x04D9, 0x0339, 0x0199, 0x0019, 0x07F3, 0x0673, 0x04D3, 0x0333, 0x0193, 0x0013, 0xFFEC, 0xFE6C, 0xFCCC, 0xFB2C,
			0xF98C, 0xF80C, 0xFFE6, 0xFE66, 0xFCC6, 0xFB26, 0xF986, 0xF806, 0xFFE0, 0xFE60, 0xFCC0, 0xFB20, 0xF980, 0xF800, 0xCFEC, 0xCE6C,
			0xCCCC, 0xCB2C, 0xC98C, 0xC80C, 0xCFE6, 0xCE66, 0xCCC6, 0xCB26, 0xC986, 0xC806, 0xCFE0, 0xCE60, 0xCCC0, 0xCB20, 0xC980, 0xC800,
			0x9FEC, 0x9E6C, 0x9CCC, 0x9B2C, 0x998C, 0x980C, 0x9FE6, 0x9E66, 0x9CC6, 0x9B26, 0x9986, 0x9806, 0x9FE0, 0x9E60, 0x9CC0, 0x9B20,
			0x9980, 0x9800, 0x67EC, 0x666C, 0x64CC, 0x632C, 0x618C, 0x600C, 0x67E6, 0x6666, 0x64C6, 0x6326, 0x6186, 0x6006, 0x67E0, 0x6660,
			0x64C0, 0x6320, 0x6180, 0x6000, 0x37EC, 0x366C, 0x34CC, 0x332C, 0x318C, 0x300C, 0x37E6, 0x3666, 0x34C6, 0x3326, 0x3186, 0x3006,
			0x37E0, 0x3660, 0x34C0, 0x3320, 0x3180, 0x3000, 0x07EC, 0x066C, 0x04CC, 0x032C, 0x018C, 0x000C, 0x07E6, 0x0666, 0x04C6, 0x0326,
			0x0186, 0x0006, 0x07E0, 0x0660, 0x04C0, 0x0320, 0x0180, 0x1082, 0x2104, 0x4228, 0x52AA, 0x73AE, 0x8C51, 0xAD55, 0xBDD7, 0xDEFB,
			0xEF7D, 0xC618, 0x8000, 0x8010, 0x0400, 0x0410, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
			0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
		};

		//pick the index mapping for this depth; any other depth leaves "translate" untouched
		switch (srcDepth) {
			case 1:
				memcpy(translate, k1ToWebTranslate, 2);
				break;
			case 2:
				memcpy(translate, k2ToWebTranslate, 4);
				break;
			case 4:
				memcpy(translate, k4ToWebTranslate, 16);
				break;
			case 8:
				memcpy(translate, k8UnityMapping, 256);
				break;
		}
		
		//512 bytes = all 256 16-bit entries of whichever table matches the destination format
		memcpy(translate16, (pixelFormat == PALM_BMP_PIXEL_FORMAT_RGB565_LE) ? kWebTo16LETranslate : kWebTo16Translate, 512);
		
		return false;
	}
}

static void PrvCopyLineShiftBB(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)	// BE indexed to BE indexed
{
	//Copies "height" rows whose source and destination start at different bit positions within a byte.
	//Source bytes are pushed into the low end of a shift window (older bytes move up) and every output byte is
	//the window shifted right by info->srcToDstBitOfst. The first and last output bytes are merged into the
	//destination through info->leftMask / info->rightMask; info->srcWidthBytes whole bytes lie in between.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	const int32_t bitShift = info->srcToDstBitOfst;
	
	for (; height > 0; height--) {
		
		const uint8_t *in = srcRow;
		uint8_t *out = dstRow;
		int32_t wholeBytes = info->srcWidthBytes;
		uint32_t window;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		window = *in++;
		
		//partial byte at the left edge
		if (info->leftMask) {
			
			if (info->needExtraReadUpFront)
				window = (window << 8) | *in++;
			
			*out = (*out &~ info->leftMask) | ((window >> bitShift) & 0xFF & info->leftMask);
			out++;
		}
		
		//whole bytes in the middle
		for (; wholeBytes > 0; wholeBytes--) {
			
			window = (window << 8) | *in++;
			*out++ = window >> bitShift;
		}
		
		//partial byte at the right edge; one more source byte is read only if flagged as needed
		if (info->rightMask) {
			
			window <<= 8;
			
			if (info->needExtraReadAtEnd)
				window |= *in;
			
			*out = (*out &~ info->rightMask) | ((window >> bitShift) & 0xFF & info->rightMask);
		}
	}
}

static void PrvCopyLineShiftBL(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)	// BE indexed to LE indexed
{
	//Copies "height" rows whose source and destination start at different bit positions within a byte, converting
	//every source byte through info->swapTbl on the way in.
	//The shift window works in the opposite direction to the BE-destination variants: the newest byte sits in
	//bits 8..15, older data moves DOWN, and each output byte is the window shifted right by info->srcToDstBitOfst.
	//The first and last output bytes are merged into the destination through info->leftMask / info->rightMask;
	//info->srcWidthBytes whole bytes lie in between.
	const uint8_t* src = ((const uint8_t*)srcP) + info->srcOffset;
	uint8_t* dst = ((uint8_t*)dstP) + info->dstOffset;
	int32_t i, shift = info->srcToDstBitOfst;
	const uint8_t *swapTab = info->swapTbl;
	uint32_t srcBits;

	while (height-- > 0) {
		
		const uint8_t* src8 = src;
		uint8_t* dst8 = dst;
		
		src += info->srcRowBytes;
		dst += info->dstRowBytes;
		i = info->srcWidthBytes;
		
		srcBits = ((uint32_t)swapTab[*src8++]) << 8;
		
		//partial byte at the left edge
		if (info->leftMask) {
			
			if (info->needExtraReadUpFront)
				srcBits = (srcBits >> 8) | (((uint32_t)swapTab[*src8++]) << 8);

			*dst8 = (*dst8 &~ info->leftMask) | ((srcBits >> shift) & 0xFF & info->leftMask);
			dst8++;
		}
		
		//whole bytes in the middle
		while (i-- > 0) {
			
			srcBits = (srcBits >> 8) | (((uint32_t)swapTab[*src8++]) << 8);
			*dst8++ = srcBits >> shift;
		}
		
		//partial byte at the right edge
		if (info->rightMask) {
			
			//move the previous byte down, exactly as every other step of this function does. (This used to shift
			//LEFT like the BE-destination variants, which put the bits for the last byte in the wrong place.)
			srcBits >>= 8;
				
			if (info->needExtraReadAtEnd)
				srcBits |= ((uint32_t)swapTab[*src8]) << 8;
			
			*dst8 = (*dst8 &~ info->rightMask) | ((srcBits >> shift) & 0xFF & info->rightMask);
		}
	}
}

static void PrvCopyLineShiftLB(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)	// LE indexed to BE indexed
{
	//Same scheme as the BE-to-BE shifted copy, except that every source byte is first converted through
	//info->swapTbl: converted bytes enter the low end of a shift window and each output byte is the window
	//shifted right by info->srcToDstBitOfst. Edge bytes are merged via info->leftMask / info->rightMask.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	const uint8_t *convert = info->swapTbl;
	const int32_t bitShift = info->srcToDstBitOfst;
	
	for (; height > 0; height--) {
		
		const uint8_t *in = srcRow;
		uint8_t *out = dstRow;
		int32_t wholeBytes = info->srcWidthBytes;
		uint32_t window;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		window = convert[*in++];
		
		//partial byte at the left edge
		if (info->leftMask) {
			
			if (info->needExtraReadUpFront)
				window = (window << 8) | convert[*in++];
			
			*out = (*out &~ info->leftMask) | ((window >> bitShift) & 0xFF & info->leftMask);
			out++;
		}
		
		//whole bytes in the middle
		for (; wholeBytes > 0; wholeBytes--) {
			
			window = (window << 8) | convert[*in++];
			*out++ = window >> bitShift;
		}
		
		//partial byte at the right edge; one more source byte is read only if flagged as needed
		if (info->rightMask) {
			
			window <<= 8;
			
			if (info->needExtraReadAtEnd)
				window |= convert[*in];
			
			*out = (*out &~ info->rightMask) | ((window >> bitShift) & 0xFF & info->rightMask);
		}
	}
}

static void PrvCopyLineShiftLL(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)	// LE indexed to LE indexed
{
	//Shifted copy for LE pixel order on both sides. The newest source byte is placed in bits 8..15 of the shift
	//window while older data moves down into bits 0..7; each output byte is the window shifted right by
	//info->srcToDstBitOfst. Edge bytes are merged via info->leftMask / info->rightMask.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	const int32_t bitShift = info->srcToDstBitOfst;
	
	for (; height > 0; height--) {
		
		const uint8_t *in = srcRow;
		uint8_t *out = dstRow;
		int32_t wholeBytes = info->srcWidthBytes;
		uint32_t window;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		window = ((uint32_t)*in++) << 8;
		
		//partial byte at the left edge
		if (info->leftMask) {
			
			if (info->needExtraReadUpFront)
				window = (window >> 8) | (((uint32_t)*in++) << 8);
			
			*out = (*out &~ info->leftMask) | ((window >> bitShift) & 0xFF & info->leftMask);
			out++;
		}
		
		//whole bytes in the middle
		for (; wholeBytes > 0; wholeBytes--) {
			
			window = (window >> 8) | (((uint32_t)*in++) << 8);
			*out++ = window >> bitShift;
		}
		
		//partial byte at the right edge; one more source byte is read only if flagged as needed
		if (info->rightMask) {
			
			window >>= 8;
			
			if (info->needExtraReadAtEnd)
				window |= ((uint32_t)*in) << 8;
			
			*out = (*out &~ info->rightMask) | ((window >> bitShift) & 0xFF & info->rightMask);
		}
	}
}

static void PrvCopyLineAligned16(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)
{
	//Straight copy of "height" rows, info->srcWidthBytes / 2 16-bit words per row, with no masking at either
	//edge. Does nothing when info->srcWidthBytes is not positive.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	
	if (info->srcWidthBytes <= 0)
		return;
	
	for (; height > 0; height--) {
		
		const uint16_t *in = (const uint16_t*)srcRow;
		uint16_t *out = (uint16_t*)dstRow;
		int32_t words = info->srcWidthBytes / 2;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		//four words per pass, then whatever is left over
		for (; words >= 4; words -= 4, in += 4, out += 4) {
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			out[3] = in[3];
		}
		for (; words > 0; words--)
			*out++ = *in++;
	}
}

static void PrvCopyLineSwap565(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)
{
	//Copies "height" rows of info->srcWidthBytes / 2 16-bit words, byte-swapping each word on the way.
	//Does nothing when info->srcWidthBytes is not positive.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	
	if (info->srcWidthBytes <= 0)
		return;
	
	for (; height > 0; height--) {
		
		const uint16_t *in = (const uint16_t*)srcRow;
		uint16_t *out = (uint16_t*)dstRow;
		int32_t words = info->srcWidthBytes / 2;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		//four words per pass, then whatever is left over
		for (; words >= 4; words -= 4, in += 4, out += 4) {
			out[0] = __builtin_bswap16(in[0]);
			out[1] = __builtin_bswap16(in[1]);
			out[2] = __builtin_bswap16(in[2]);
			out[3] = __builtin_bswap16(in[3]);
		}
		for (; words > 0; words--)
			*out++ = __builtin_bswap16(*in++);
	}
}

static void PrvCopyLineAligned16Mask(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)
{
	//Word-aligned copy with masked edges. Per row: the first 16-bit word is ALWAYS merged through
	//info->leftMask, then info->srcWidthBytes / 2 whole words are copied, then one final word is merged
	//through info->rightMask if that mask is nonzero.
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	
	for (; height > 0; height--) {
		
		const uint16_t *in = (const uint16_t*)srcRow;
		uint16_t *out = (uint16_t*)dstRow;
		int32_t words = info->srcWidthBytes / 2;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		//left edge: keep destination bits outside the mask
		*out = (*out &~ info->leftMask) | (*in & info->leftMask);
		in++;
		out++;
		
		//whole words: four per pass, then whatever is left over
		for (; words >= 4; words -= 4, in += 4, out += 4) {
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			out[3] = in[3];
		}
		for (; words > 0; words--)
			*out++ = *in++;
		
		//right edge, only when there is one
		if (info->rightMask)
			*out = (*out &~ info->rightMask) | (*in & info->rightMask);
	}
}

static void PrvCopyLineAligned8Mask(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)
{
	//Byte-aligned copy with masked edges. Per row: one byte merged through info->leftMask (if nonzero),
	//info->srcWidthBytes whole bytes, then one byte merged through info->rightMask (if nonzero).
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	
	for (; height > 0; height--) {
		
		const uint8_t *in = srcRow;
		uint8_t *out = dstRow;
		int32_t wholeBytes = info->srcWidthBytes;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		//left edge: keep destination bits outside the mask
		if (info->leftMask) {
			*out = (*out &~ info->leftMask) | (*in & info->leftMask);
			in++;
			out++;
		}
		
		//whole bytes: four per pass, then whatever is left over
		for (; wholeBytes >= 4; wholeBytes -= 4, in += 4, out += 4) {
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			out[3] = in[3];
		}
		for (; wholeBytes > 0; wholeBytes--)
			*out++ = *in++;
		
		//right edge, only when there is one
		if (info->rightMask)
			*out = (*out &~ info->rightMask) | (*in & info->rightMask);
	}
}

static void PrvCopyLineAligned8MaskSwap(const void* srcP, void* dstP, int32_t height, const struct PrvCopyLineInfo* info)
{
	//Byte-aligned copy with masked edges in which every source byte is converted through info->swapTbl.
	//Per row: one byte merged through info->leftMask (if nonzero), info->srcWidthBytes whole bytes, then one
	//byte merged through info->rightMask (if nonzero).
	const uint8_t *srcRow = (const uint8_t*)srcP + info->srcOffset;
	uint8_t *dstRow = (uint8_t*)dstP + info->dstOffset;
	const uint8_t *convert = info->swapTbl;
	
	for (; height > 0; height--) {
		
		const uint8_t *in = srcRow;
		uint8_t *out = dstRow;
		int32_t wholeBytes = info->srcWidthBytes;
		
		srcRow += info->srcRowBytes;
		dstRow += info->dstRowBytes;
		
		//left edge: keep destination bits outside the mask
		if (info->leftMask) {
			*out = (*out &~ info->leftMask) | (convert[*in] & info->leftMask);
			in++;
			out++;
		}
		
		//whole bytes: four per pass, then whatever is left over
		for (; wholeBytes >= 4; wholeBytes -= 4, in += 4, out += 4) {
			out[0] = convert[in[0]];
			out[1] = convert[in[1]];
			out[2] = convert[in[2]];
			out[3] = convert[in[3]];
		}
		for (; wholeBytes > 0; wholeBytes--)
			*out++ = convert[*in++];
		
		//right edge, only when there is one
		if (info->rightMask)
			*out = (*out &~ info->rightMask) | (convert[*in] & info->rightMask);
	}
}

//Chooses the row-copy routine for a same-depth copy and fills in "info" for it.
//	depth				- bits per pixel (16 is handled as whole pixels, everything else by bit position)
//	srcFormat/dstFormat	- pixel formats of the two sides
//	srcX, dstX, width	- horizontal start positions and width, in pixels
//	info				- out: offsets, widths, masks and shift for the returned routine
//Returns the routine to call. info->srcRowBytes and info->dstRowBytes are set to 0 here; a caller that wants the
//routine to step through several rows has to store the strides itself afterwards.
static PrvCopyLineFunc PrvCopyLineInit(uint32_t depth, uint32_t srcFormat, uint32_t dstFormat, int32_t srcX, int32_t dstX, int32_t width, struct PrvCopyLineInfo *info)
{
	const uint8_t *swap;
	
	info->srcRowBytes = 0;
	info->dstRowBytes = 0;
	
	if (depth == 16) {
		//whole 16-bit pixels: no masks needed, only possibly a byte swap
		info->srcWidthBytes = width * 2;
		info->srcOffset = srcX * 2;
		info->dstOffset = dstX * 2;
		
		return (srcFormat == dstFormat) ? PrvCopyLineAligned16 : PrvCopyLineSwap565;
	}
	else {
		
		//from here on positions and width are in BITS
		int32_t t, srcOffset = srcX * depth, dstOffset = dstX * depth;
		uint16_t leftMask, rightMask;
		
		width *= depth;
		
		if (srcFormat == dstFormat && (srcOffset & 0x0F) == (dstOffset & 0x0F)) {
			
			//same format and same bit position inside a 16-bit word: copy word by word
			info->srcOffset = (srcOffset >> 4) * 2;
			info->dstOffset = (dstOffset >> 4) * 2;
			
			//t = bytes of whole words after the first word; negative when the span ends inside its first word
			t = ((dstOffset + width) >> 4) - (dstOffset >> 4);
			t = (t - 1) * 2;
			
			info->srcWidthBytes = t;
			
			if (t < 0) {
				//everything lives in one word: "width" set bits, moved to the start position
				leftMask = ((0xFFFF0000 >> width) & 0xFFFF) >> (dstOffset & 0x0F);
				rightMask = 0;
				info->srcWidthBytes = 0;
			}
			else {
				
				//first word: from the start position to the end of the word; last word: its leading bits
				//(the 16-bit truncation makes rightMask 0 when the span ends exactly on a word boundary)
				leftMask = 0xFFFF >> (dstOffset & 0x0F);
				rightMask = 0xFFFF0000 >> ((dstOffset + width) & 0x0F);
			}
			
			if (leftMask == 0xFFFF && !rightMask) {
				
				//both edges are whole words: plain copy, with the first word counted as a normal one
				info->srcWidthBytes = t + 2;
				
				return PrvCopyLineAligned16;
			}
			else {
				
				//NOTE(review): the masks above are built with the first pixel in the top bit; the swap presumably
				//adapts them to 16-bit loads on a little-endian CPU - confirm
				leftMask = __builtin_bswap16(leftMask);
				rightMask = __builtin_bswap16(rightMask);
				
				if (dstFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
					
					//LE pixel order: pass each byte of the masks through the per-depth swap table
					swap = SwapForDepth(depth);
					
					leftMask = swap[leftMask & 0xFF] + (((uint32_t)swap[leftMask >> 8]) << 8);
					rightMask = swap[rightMask & 0xFF] + (((uint32_t)swap[rightMask >> 8]) << 8);
				}
				info->leftMask = leftMask;
				info->rightMask = rightMask;
				return PrvCopyLineAligned16Mask;
			}
		}
		else {
			//formats differ or word positions differ: work byte by byte, possibly with a bit shift
			info->srcToDstBitOfst = (dstOffset & 7) - (srcOffset & 7);
			
			leftMask = 0xFF >> (dstOffset & 7);
			rightMask = (uint8_t)~(0xFF >> ((dstOffset + width) & 7));
			//whole bytes after the first byte; negative when the span ends inside its first byte
			info->srcWidthBytes = t = ((dstOffset + width) >> 3) - (dstOffset >> 3) - 1;
			info->swapTbl = swap = SwapForDepth(depth);
			info->srcOffset = srcOffset >> 3;
			info->dstOffset = dstOffset >> 3;
			
			if (t < 0) {
				
				//everything lives in one byte: "width" set bits, moved to the start position
				leftMask = ((uint8_t)~(0xFF >> width)) >> (dstOffset & 7);
				rightMask = 0;
				info->srcWidthBytes = 0;
			}
			
			//turn the signed bit offset into the right-shift amount the shifting routines apply to their window,
			//and note whether they must read a second source byte before producing the first output byte.
			//(needExtraReadUpFront is left untouched when the offset is 0)
			if (info->srcToDstBitOfst < 0) {
				
				if (dstFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE)
					info->srcToDstBitOfst = -info->srcToDstBitOfst;
				else
					info->srcToDstBitOfst += 8;
				info->needExtraReadUpFront = true;
			}
			else if (info->srcToDstBitOfst) {
				
				if (dstFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE)
					info->srcToDstBitOfst = 8 - info->srcToDstBitOfst;
				info->needExtraReadUpFront = false;
			}
			
			//an extra source byte is read at the right edge only when the bits already in the window
			//do not cover everything rightMask selects
			if (dstFormat != PALM_BMP_PIXEL_FORMAT_INDEXED_LE)
				info->needExtraReadAtEnd = ((0xFF00 >> info->srcToDstBitOfst) & 0xFF) < rightMask;
			else {
				
				//LE pixel order: masks go through the swap table as well
				leftMask = swap[leftMask];
				rightMask = swap[rightMask];
				
				info->needExtraReadAtEnd = (0xFF >> info->srcToDstBitOfst) < rightMask;
			}
			
			info->rightMask = rightMask;
			info->leftMask = leftMask;
			
			//no shift: masked byte copy (with table conversion if formats differ); otherwise pick the shifting
			//routine by source and destination pixel order
			if (!info->srcToDstBitOfst)
				return (srcFormat == dstFormat) ? PrvCopyLineAligned8Mask : PrvCopyLineAligned8MaskSwap;
			else if (dstFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE)
				return (srcFormat == PALM_BMP_PIXEL_FORMAT_INDEXED) ? PrvCopyLineShiftBL : PrvCopyLineShiftLL;
			else
				return (srcFormat == PALM_BMP_PIXEL_FORMAT_INDEXED) ? PrvCopyLineShiftBB : PrvCopyLineShiftLB;
		}
	}
}

static void PrvBlitCopyMatched(const void* srcP_, int32_t srcRowBytes, int32_t srcX, uint32_t srcPixelFormat, void* dstP_, int32_t dstRowBytes, int32_t dstX, uint32_t dstPixelFormat, uint32_t depth, int32_t width, int32_t height, bool rightLeftScan)
{
	const uint8_t* srcP = (const uint8_t*)srcP_;
	uint8_t* dstP = (uint8_t*)dstP_;
	struct PrvCopyLineInfo info = {};
	PrvCopyLineFunc func;
	
	if (rightLeftScan) {
		
		uint32_t shift = 0, copySize = 0;
		uint8_t *tempLineBuffer = NULL;
		
		shift = __builtin_ctz(depth);
		srcX <<= shift;
		width <<= shift;
		srcP += ((srcX >> 4) * 2);
		srcX &= 0x0F;
		copySize = ((srcX + width + 0x0F) >> 4) * 2;
		tempLineBuffer = (uint8_t*)kheapAlloc(copySize);
		if (!tempLineBuffer)
			return;
		
		srcX >>= shift;
		width >>= shift;
		func = PrvCopyLineInit(depth, srcPixelFormat, dstPixelFormat, srcX, dstX, width, &info);
		
		while (height-- > 0) {
			
			memcpy(tempLineBuffer, srcP, copySize);
			func(tempLineBuffer, dstP, 1, &info);
			srcP += srcRowBytes;
			dstP += dstRowBytes;
		}
		
		if (tempLineBuffer)
			kheapFree(tempLineBuffer);
	}
	else {
		
		func = PrvCopyLineInit(depth, srcPixelFormat, dstPixelFormat, srcX, dstX, width, &info);

		info.srcRowBytes = srcRowBytes;
		info.dstRowBytes = dstRowBytes;
		
		func(srcP, dstP, height, &info);
	}
}

//Returns the standard index translation table used to convert pixel indices
// from srcPixelSize bits per pixel to dstPixelSize bits per pixel.
// Tables named kStdTrans_S_D map S-bit source indices to D-bit destination indices and
// have one entry per possible source index.
// Any depth pair without a dedicated table (including equal depths) gets k8UnityMapping.
static const uint8_t* PrvStandardIndexMapping(uint32_t srcPixelSize, uint32_t dstPixelSize)
{
	static const uint8_t kStdTrans_8_4[] = {0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x00, 0x02, 0x05, 0x07, 0x09, 0x0B, 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0C, 0x01, 0x03, 0x05, 0x08, 0x0A, 0x0C, 0x01, 0x03, 0x06, 0x08, 0x0A, 0x0C, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x02, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0E, 0x03, 0x05, 0x07, 0x09, 0x0C, 0x0E, 0x03, 0x05, 0x08, 0x0A, 0x0C, 0x0E, 0x03, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x01, 0x03, 0x05, 0x07, 0x0A, 0x0C, 0x01, 0x03, 0x05, 0x08, 0x0A, 0x0C, 0x01, 0x03, 0x06, 0x08, 0x0A, 0x0C, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x02, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0E, 0x03, 0x05, 0x07, 0x09, 0x0C, 0x0E, 0x03, 0x05, 0x08, 0x0A, 0x0C, 0x0E, 0x03, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x0E, 0x0D, 0x0B, 0x0A, 0x08, 0x07, 0x05, 0x04, 0x02, 0x01, 0x04, 0x0D, 0x0D, 0x0A, 0x09, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, };
	static const uint8_t kStdTrans_8_2[] = {0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x02, 0x02, 0x02, 0x01, 0x01, 0x01, 0x00, 0x01, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, };
	static const uint8_t kStdTrans_8_1[] = {0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, };
	static const uint8_t kStdTrans_4_8[] = {0x00, 0xE0, 0xDF, 0x19, 0xDE, 0xDD, 0x32, 0xDC, 0xDB, 0xA5, 0xDA, 0xD9, 0xBE, 0xD8, 0xD7, 0xFF, };
	static const uint8_t kStdTrans_4_2[] = {0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, };
	static const uint8_t kStdTrans_4_1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, };
	static const uint8_t kStdTrans_2_8[] = {0x00, 0xDD, 0xDA, 0xFF, };
	static const uint8_t kStdTrans_2_4[] = {0x00, 0x05, 0x0A, 0x0F, };
	static const uint8_t kStdTrans_2_1[] = {0x00, 0x00, 0x01, 0x01, };
	
	//sparse lookup indexed by (srcPixelSize + 8 * dstPixelSize - 10); slots holding a real table:
	//  0: 2->1,  2: 4->1,  6: 8->1,  10: 4->2,  14: 8->2,  24: 2->4,  30: 8->4,  56: 2->8,  58: 4->8
	//the pointer array itself is const too, so the whole table can live in read-only memory
	static const uint8_t * const map[] = {
		kStdTrans_2_1, k8UnityMapping, kStdTrans_4_1, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		kStdTrans_8_1, k8UnityMapping, k8UnityMapping, k8UnityMapping, kStdTrans_4_2, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, kStdTrans_8_2, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		kStdTrans_2_4, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, 
		kStdTrans_8_4, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping, k8UnityMapping,
		k8UnityMapping, k8UnityMapping, kStdTrans_2_8, k8UnityMapping, kStdTrans_4_8};
	
	//unsigned arithmetic: combinations that would index below zero wrap to a huge value and fail the range check
	uint32_t idx = srcPixelSize + 8 * dstPixelSize - 10;
	
	if (idx >= sizeof(map) / sizeof(*map))
		return k8UnityMapping;
	
	return map[idx];
}

//Packs an RGB colour entry into a 16-bit 5:6:5 pixel (red in the top 5 bits, green in the
// middle 6, blue in the low 5). For PALM_BMP_PIXEL_FORMAT_RGB565_BE the two bytes of the
// result are swapped.
static uint32_t PrvRGBColorToRGB565(const struct PalmClutEntry *rgbP, uint32_t pixelFormat)
{
	const uint32_t redBits = (((uint32_t)rgbP->r) & 0xF8) << 8;
	const uint32_t greenBits = (((uint32_t)rgbP->g) & 0xFC) << 3;
	const uint32_t blueBits = (((uint32_t)rgbP->b) & 0xF8) >> 3;
	const uint32_t packed = redBits | greenBits | blueBits;		//the three fields never overlap
	
	return (pixelFormat == PALM_BMP_PIXEL_FORMAT_RGB565_BE) ? __builtin_bswap16(packed) : packed;
}

//Produces the value used to fill destination pixels for a given colour.
// For 1/2/4/8 bpp bitmaps the index is replicated until it fills 16 bits;
// for 16 bpp bitmaps the RGB colour is converted to the bitmap's RGB565 format instead.
// Any other pixel size is declared unreachable.
static uint32_t PrvExpandPixel(const struct PalmBitmapV3fat *bmp, uint32_t pixel, const struct PalmClutEntry *pixelRGB)
{
	const uint32_t bitsPerPixel = bmp->pixelSz;
	uint32_t shift;
	
	if (bitsPerPixel == 16)
		return PrvRGBColorToRGB565(pixelRGB, bmp->pixelFormat);
	
	if (bitsPerPixel != 1 && bitsPerPixel != 2 && bitsPerPixel != 4 && bitsPerPixel != 8)
		__builtin_unreachable();
	
	//double the replicated width each round: depth, 2*depth, ... up to and including 8
	for (shift = bitsPerPixel; shift <= 8; shift <<= 1)
		pixel |= pixel << shift;
	
	return pixel;
}

static void PrvSetupBitonalBlit(enum PalmWinDrawOperation drawMode, struct PalmBitmapV3fat* dstBmp, int32_t dstY, uint32_t transparentColor, uint32_t backColor, uint32_t foreColor, struct BitonalBlitInfo *info)
{
	uint32_t depthShift = 0, depth = dstBmp->pixelSz, stride = dstBmp->stride, pixFmt = dstBmp->pixelFormat;
	static const PrvBitonalBlitXferFunc copyFuncs[] = {[0] = PrvBlit1To1Copy, [1] = PrvBlit1To2Copy, [2] = PrvBlit1To4Copy, [3] = PrvBlit1To8Copy, [4] = PrvBlit1To16Copy,};
	static const PrvBitonalBlitXferFunc overFuncs[] = {[0] = PrvBlit1To1Over, [1] = PrvBlit1To2Over, [2] = PrvBlit1To4Over, [3] = PrvBlit1To8Over, [4] = PrvBlit1To16Over,};

	depthShift = __builtin_ctz(depth);
	
	info->depth = depth;
	info->depthShift = depthShift;
	info->dstP = ((uint8_t*)dstBmp->data) + dstY * stride;
	info->dstRowBytes = stride;
	info->dstPixelFormat = pixFmt;

	switch (drawMode) {
		case palmWinDrawOpWinPaint:
			info->transferFunc = (pixFmt == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) ? PrvBlit1ToNCopyLE : copyFuncs[depthShift];
			info->background = backColor;
			info->foreground = foreColor;
			break;
		
		case palmWinDrawOpWinErase:
			info->transferFunc = (pixFmt == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) ? PrvBlit1ToNOverLE : overFuncs[depthShift];
			//these ARE correct
			info->background = transparentColor ? 0 : 1;
			info->foreground = backColor;
			break;
		
		case palmWinDrawOpWinMask:
			info->transferFunc = (pixFmt == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) ? PrvBlit1ToNOverLE : overFuncs[depthShift];
			//these ARE correct
			info->background = transparentColor;
			info->foreground = backColor;
			break;
		
		case palmWinDrawOpWinInvert:
			info->transferFunc = PrvBlit1ToNXOR;
			info->background = backColor;
			info->foreground = foreColor;
			break;
		
		case palmWinDrawOpWinOverlay:
			info->transferFunc = (pixFmt == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) ? PrvBlit1ToNOverLE : overFuncs[depthShift];
			info->background = transparentColor;
			info->foreground = transparentColor ? backColor : foreColor;
			break;
		
		case palmWinDrawOpWinPaintInverse:
			info->transferFunc = (pixFmt == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) ? PrvBlit1ToNCopyLE : copyFuncs[depthShift];
			//these ARE correct
			info->background = foreColor;
			info->foreground = backColor;
			break;
		
		default:	//doesnt happen
			break;
	}
}

//Runs a prepared bitonal blit: calls the transfer function stored in info->transferFunc
// (set up by PrvSetupBitonalBlit) and forwards every argument to it unchanged.
static void PrvBlitBitonal(const struct BitonalBlitInfo *info, const void* srcP, uint32_t srcRowBytes, uint32_t srcOffset, uint32_t dstOffset, int32_t width, uint32_t height)
{
	info->transferFunc(info, srcP, srcRowBytes, srcOffset, dstOffset, width, height);
}

//Decodes one scan-line-compressed row of `width` bytes into dst.
// The stream is a sequence of groups: one flag byte followed by the bytes it announces.
// Flag bits are consumed MSB first, one per output byte; a set bit means "take the next
// source byte", a clear bit means "leave dst as it is" (so dst must already hold the
// previous row). The last group may cover fewer than 8 bytes.
// Returns the number of source bytes consumed.
static int32_t PrvScrDecompressScanLine(const uint8_t *src, uint8_t *dst, uint32_t width)		//XXX: TODO: there is a copy of this in boot screen drawing code - consolidate
{
	const uint8_t *const streamStart = src;
	
	while (width) {
		
		const uint32_t groupLen = width < 8 ? width : 8;
		uint32_t flags = *src++;
		uint32_t k;
		
		for (k = 0; k < groupLen; k++, flags <<= 1) {
			
			if (flags & 0x80)
				dst[k] = *src++;
		}
		
		dst += groupLen;
		width -= groupLen;
	}
	
	return src - streamStart;
}

//Decodes dstSz bytes of RLE data ((count, value) byte pairs) into dst.
// A run may extend past the end of the requested output; the unused part is carried to the
// next call through *prevByteCountP (bytes still owed) and *prevByteP (their value).
// Returns the number of source bytes consumed (0 when the carried-over run alone was enough).
static int32_t PrvScrDecompressRLE(const uint8_t *src, uint8_t *dst, uint32_t dstSz, uint8_t *prevByteCountP, uint8_t *prevByteP)
{
	const uint8_t *const streamStart = src;
	uint32_t runLen = *prevByteCountP;
	uint32_t runVal = *prevByteP;
	uint32_t n;
	
	//first use up whatever the previous call left over
	if (runLen) {
		
		if (runLen >= dstSz) {
			
			//leftover covers the whole request - no source bytes needed
			(*prevByteCountP) -= dstSz;
			for (n = dstSz; n; n--)
				*dst++ = runVal;
			
			return 0;
		}
		
		for (n = runLen; n; n--)
			*dst++ = runVal;
		
		dstSz -= runLen;
		*prevByteCountP = 0;
	}
	
	//then read runs until one reaches (or passes) the end of the output
	for (;;) {
		
		runLen = *src++;
		runVal = *src++;
		
		if (runLen >= dstSz)
			break;
		
		for (n = runLen; n; n--)
			*dst++ = runVal;
		dstSz -= runLen;
	}
	
	//final run: emit what fits, remember the rest
	for (n = dstSz; n; n--)
		*dst++ = runVal;
	
	*prevByteCountP = runLen - dstSz;
	*prevByteP = runVal;
	
	return src - streamStart;
}

//Decodes exactly dstSz bytes of PackBits data into dst.
// Each packet starts with a signed code byte c:
//   c >= 0: the next c + 1 bytes are copied literally
//   c <  0: the next byte is repeated 1 - c times
// Returns the number of source bytes consumed, or -1 if a packet would write past dstSz.
static int32_t PrvScrDecompressPackBits(const uint8_t *src, uint8_t *dst, uint32_t dstSz)
{
	const uint8_t *const streamStart = src;
	
	while (dstSz) {
		
		const int32_t code = (int8_t)*src++;
		uint32_t count, k;
		
		if (code < 0) {		//repeated byte
			
			const uint8_t fill = *src++;
			
			count = 1 - code;
			if (count > dstSz)	//trying to output too much data - bad
				return -1;
			
			for (k = 0; k < count; k++)
				*dst++ = fill;
		}
		else {				//literal copy
			
			count = code + 1;
			if (count > dstSz)	//trying to output too much data - bad
				return -1;
			
			for (k = 0; k < count; k++)
				*dst++ = *src++;
		}
		
		dstSz -= count;
	}
	
	return src - streamStart;
}

//PackBits decoder for 16-bit pixels. Same packet scheme as the 8-bit variant
// (code >= 0: code + 1 literal items; code < 0: one item repeated 1 - code times),
// except that every item is two source bytes.
// Each halfword is assembled as src[0] + (src[1] << 8) (original author's note: the byte
// order matters here - BE data).
// Returns the number of source bytes consumed, or -1 if a packet would write past dstSz.
static int32_t PrvScrDecompressPackBits16(const uint8_t *src, uint16_t *dst, uint32_t dstSz)	//dstSz is in halfwords, our return value is in bytes (consumed from source)
{
	const uint8_t *const streamStart = src;
	
	while (dstSz) {
		
		const int32_t code = (int8_t)*src++;
		uint32_t count, k, item;
		
		if (code < 0) {		//repeated halfword
			
			item = ((uint32_t)src[1] << 8) + src[0];
			src += 2;
			
			count = 1 - code;
			if (count > dstSz)	//trying to output too much data - bad
				return -1;
			
			for (k = 0; k < count; k++)
				*dst++ = item;
		}
		else {				//literal copy
			
			count = code + 1;
			if (count > dstSz)	//trying to output too much data - bad
				return -1;
			
			for (k = 0; k < count; k++, src += 2) {
				
				item = ((uint32_t)src[1] << 8) + src[0];
				*dst++ = item;
			}
		}
		
		dstSz -= count;
	}
	
	return src - streamStart;
}

//Dispatches to the decoder matching compressionMethod.
// Most, but not all, of the decoders produce exactly one line per call, and some of them
// expect dstP to still contain the previously decompressed line.
// dstBufLen is in bytes (it is halved for the 16-bit PackBits decoder, which counts halfwords).
// Returns whatever the decoder returns, or -1 for an unknown compression method.
static int32_t PrvScrDecompress(uint32_t compressionMethod, const uint8_t *srcP, uint32_t srcBufLen, uint8_t *dstP, uint32_t dstBufLen, union ComprState *decompStateP)
{
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_SCAN_LINE)
		return PrvScrDecompressScanLine(srcP, dstP, dstBufLen);
	
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_RLE)
		return PrvScrDecompressRLE(srcP, dstP, dstBufLen, &decompStateP->rle.prevByteCout, &decompStateP->rle.prevByte);
	
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_PACK_BITS) {
		
		if (decompStateP->packBits.bpp != 16)
			return PrvScrDecompressPackBits(srcP, dstP, dstBufLen);
		
		return PrvScrDecompressPackBits16(srcP, (uint16_t*)dstP, dstBufLen / sizeof(uint16_t));
	}
	
	//unknown compression type
	return -1;
}

//Scan-line compresses one row of `width` bytes into dst.
// Output is a sequence of groups covering up to 8 input bytes each: a flag byte (MSB = first
// byte of the group) followed by every byte whose flag bit is set. A bit is set when the byte
// differs from the same position in prevLine; for the first line every bit is set and
// prevLine is not touched. A short final group has its flags left-aligned in the flag byte.
// Returns the number of bytes written to dst.
static int32_t PrvScrCompressScanLine(const uint8_t *line, const uint8_t *prevLine, uint32_t width, uint8_t *dst, bool isFirstLine)
{
	uint8_t *const outStart = dst;
	
	while (width) {
		
		const uint32_t groupLen = width < 8 ? width : 8;
		uint8_t *const flagsP = dst++;		//reserve the flag byte, fill it in after the group
		uint32_t flags = 0, k;
		
		for (k = 0; k < groupLen; k++) {
			
			const uint8_t cur = *line++;
			
			flags <<= 1;
			if (isFirstLine || cur != *prevLine++) {
				
				flags |= 1;
				*dst++ = cur;
			}
		}
		
		*flagsP = flags << (8 - groupLen);
		width -= groupLen;
	}
	
	return dst - outStart;
}

//RLE-compresses srcBufLen bytes into (count, value) byte pairs; a run is at most 255 long.
// Returns the number of bytes written, or -1 as soon as dst has no room for another pair
// (pairs written before that point stay in dst).
static int32_t PrvScrCompressRLE(const uint8_t *src, uint32_t srcBufLen, uint8_t *dst, uint32_t dstBufLen)
{
	uint32_t pos = 0, written = 0;
	
	while (pos < srcBufLen) {
		
		uint32_t runVal, runLen;
		
		if (dstBufLen - written < 2)
			return -1;
		
		//measure the run starting at pos
		runVal = src[pos];
		for (runLen = 1; runLen < 255 && pos + runLen < srcBufLen && src[pos + runLen] == runVal; runLen++)
			;
		pos += runLen;
		
		dst[written++] = runLen;
		dst[written++] = runVal;
	}
	
	return written;
}

//PackBits-compresses srcBufLen bytes from src into dst.
// Output packets:
//   run     : code byte (1 - n), then the repeated byte, for 3 <= n <= 128 equal bytes
//   literal : code byte (n - 1), then n raw bytes, for 1 <= n <= 127
// dst must be large enough for the worst case; no bounds checking is done on it.
// Returns the number of bytes written to dst.
//
// Fix: the literal scan used to extend to the end of input only when it stopped exactly two
// bytes before the end. When a run packet left a single trailing byte (e.g. "aaab") the
// literal length came out as zero and the loop never advanced. The tail is now handled for
// any remainder of two bytes or fewer, and no pointer before the start of src is formed.
static int32_t PrvScrCompressPackBits(const uint8_t *src, uint32_t srcBufLen, uint8_t *dst)
{
	const uint8_t *srcEnd = src + srcBufLen, *runEnd;
	uint8_t *dstOrig = dst;
	uint32_t v, runLen;
	
	while (src != srcEnd) {
		
		//maybe a run?
		v = *src;
		runEnd = src + 1;
		while (runEnd != srcEnd && *runEnd == v)
			runEnd++;
		runLen = runEnd - src;
		
		if (runLen > 2) {	//makes no sense to encode runs of 2 bytes or less
			
			if (runLen > 128)
				runLen = 128;
			
			*dst++ = 1 - runLen;
			*dst++ = v;
			src += runLen;
			continue;		//go on try to find the next run
		}
		
		//if we are here, we have no run to record - emit the "copy bytes" code
		//the literal extends until three equal bytes in a row are found...
		runEnd = src;
		while (srcEnd - runEnd > 2 && (runEnd[0] != runEnd[1] || runEnd[0] != runEnd[2]))
			runEnd++;
		
		//...or, if fewer than three bytes remain to look at, until the end of the input
		if (srcEnd - runEnd <= 2)
			runEnd = srcEnd;
		runLen = runEnd - src;		//always at least 1: a run of 3+ at src was handled above
		
		if (runLen > 127)
			runLen = 127;
		
		*dst++ = runLen - 1;
		while (runLen--)
			*dst++ = *src++;
	}
	
	return dst - dstOrig;
}

//PackBits-compresses srcLen 16-bit items from src into dst.
// Output packets:
//   run     : code byte (1 - n), then the repeated item, for 3 <= n <= 128 equal items
//   literal : code byte (n - 1), then n raw items, for 1 <= n <= 127
// Every item is written low byte first.
// dst must be large enough for the worst case; no bounds checking is done on it.
// Returns the number of bytes written to dst.
//
// Fix: the literal scan used to extend to the end of input only when it stopped exactly two
// items before the end. When a run packet left a single trailing item the literal length
// came out as zero and the loop never advanced. The tail is now handled for any remainder of
// two items or fewer, and no pointer before the start of src is formed.
static int32_t PrvScrCompressPackBits16(const uint16_t *src, uint32_t srcLen, uint8_t *dst)
{
	const uint16_t *srcEnd = src + srcLen, *runEnd;
	uint8_t *dstOrig = dst;
	uint32_t v, runLen;
	
	while (src != srcEnd) {
		
		//maybe a run?
		v = *src;
		runEnd = src + 1;
		while (runEnd != srcEnd && *runEnd == v)
			runEnd++;
		runLen = runEnd - src;
		
		if (runLen > 2) {	//makes no sense to encode runs of 2 items or less
			
			if (runLen > 128)
				runLen = 128;
			
			*dst++ = 1 - runLen;
			*dst++ = v;
			*dst++ = v >> 8;
			src += runLen;
			continue;		//go on try to find the next run
		}
		
		//if we are here, we have no run to record - emit the "copy items" code
		//the literal extends until three equal items in a row are found...
		runEnd = src;
		while (srcEnd - runEnd > 2 && (runEnd[0] != runEnd[1] || runEnd[0] != runEnd[2]))
			runEnd++;
		
		//...or, if fewer than three items remain to look at, until the end of the input
		if (srcEnd - runEnd <= 2)
			runEnd = srcEnd;
		runLen = runEnd - src;		//always at least 1: a run of 3+ at src was handled above
		
		if (runLen > 127)
			runLen = 127;
		
		*dst++ = runLen - 1;
		while (runLen--) {
			v = *src++;
			*dst++ = v;
			*dst++ = v >> 8;
		}
	}
	
	return dst - dstOrig;
}

//Dispatches to the encoder matching compressionMethod.
// Scan-line compression diffs against the previously compressed line, which is tracked in
// compStateP (the source pointer itself is remembered, not a copy of the data).
// srcBufLen is in bytes (it is halved for the 16-bit PackBits encoder, which counts halfwords).
// Returns whatever the encoder returns, or -1 for an unknown compression method.
static int32_t PrvScrCompress(uint32_t compressionMethod, const uint8_t *srcP, uint32_t srcBufLen, uint8_t *dstP, uint32_t dstBufLen, union ComprState *compStateP)
{
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_SCAN_LINE) {
		
		const int32_t written = PrvScrCompressScanLine(srcP, compStateP->scanLine.prevLine, srcBufLen, dstP, !compStateP->scanLine.havePrevLine);
		
		//this line is the reference for the next one
		compStateP->scanLine.prevLine = srcP;
		compStateP->scanLine.havePrevLine = true;
		
		return written;
	}
	
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_RLE)
		return PrvScrCompressRLE(srcP, srcBufLen, dstP, dstBufLen);
	
	if (compressionMethod == PALM_BMP_COMPRES_TYPE_PACK_BITS) {
		
		if (compStateP->packBits.bpp != 16)
			return PrvScrCompressPackBits(srcP, srcBufLen, dstP);
		
		return PrvScrCompressPackBits16((uint16_t*)srcP, srcBufLen / 2, dstP);
	}
	
	//unknown compression type
	return -1;
}


//Draws one row of underline pixels.
// dstP       - start of the destination row; pixels are addressed from here by index
// dstOffset  - index of the first pixel to touch
// width      - number of pixels to touch
// depth      - destination bits per pixel (1, 2, 4, 8 or 16; anything else does nothing)
// which      - 2 selects the dashed pattern (densityMul pixels drawn, densityMul skipped); other values draw every pixel
// skip       - added to the pixel index before the dash pattern is evaluated (pattern phase)
// skipIndex  - pixels that currently hold this value are left alone
// writeIndex - value written to all other pixels
// densityMul - dash length in pixels
// pixelFormat- for depths below 8, PALM_BMP_PIXEL_FORMAT_INDEXED_LE places the first pixel of a byte in its low bits,
//              every other format places it in the high bits
static void PrvBlitUnderline(uint8_t* dstP, int32_t dstOffset, int32_t width, uint32_t depth, uint32_t which, int32_t skip, uint32_t skipIndex, uint32_t writeIndex, uint32_t densityMul, uint32_t pixelFormat)
{
	//defaults are for 4bpp: 2 pixels per byte. the fallthrough cases below adjust them for 2bpp and 1bpp
	uint32_t byteShift = 1, pixelMask = (1 << depth) - 1, pixelShiftMask = 1;
	int32_t i, stop = dstOffset + width;
	uint16_t *dst16 = (uint16_t*)dstP;
	
	switch (depth) {
		case 1:
			//ends up as pixelShiftMask = 7, byteShift = 3 (8 pixels per byte)
			pixelShiftMask += 4;
			byteShift++;
			//fallthrough
		case 2:
			//for 2bpp ends up as pixelShiftMask = 3, byteShift = 2 (4 pixels per byte)
			pixelShiftMask += 2;
			byteShift++;
			//fallthrough
		case 4:
			for (i = dstOffset; i < stop; i++) {
				
				uint32_t pixelShift, mask, byte;
				
				if (which == 2) {
				
					//dashed pattern: skip the second half of every (densityMul * 2)-pixel period
					if (((i + skip) % (densityMul * 2)) >= densityMul)
						continue;
				}
				
				//bit position of pixel i inside its byte when the first pixel sits in the high bits
				pixelShift = 8 - depth - depth * (i & pixelShiftMask);
				byte = dstP[i >> byteShift];
				
				if (pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE) {
					
					//mirrored position: (8 - depth - pixelShift) == depth * (i & pixelShiftMask)
					mask = pixelMask << (8 - depth - pixelShift);
					
					if ((byte & mask) != (skipIndex << (8 - depth - pixelShift))) {
						
						byte &=~ mask;
						byte |= writeIndex << (8 - depth - pixelShift);
						
						dstP[i >> byteShift] = byte;
					}
				}
				else {
					
					mask = pixelMask << pixelShift;
					
					if ((byte & mask) != (skipIndex << pixelShift)) {
						
						byte &=~ mask;
						byte |= writeIndex << pixelShift;
						
						dstP[i >> byteShift] = byte;
					}
				}
			}
			break;
		
		case 8:
			//one byte per pixel
			for (i = dstOffset; i < stop; i++) {
				
				if (which == 2) {
				
					if (((i + skip) % (densityMul * 2)) >= densityMul)
						continue;
				}
				
				if (dstP[i] != skipIndex)
					dstP[i] = writeIndex;
			}
			break;
		
		case 16:
			//one halfword per pixel
			for (i = dstOffset; i < stop; i++) {
				
				if (which == 2) {
				
					if (((i + skip) % (densityMul * 2)) >= densityMul)
						continue;
				}
				
				if (dst16[i] != (skipIndex & 0xFF))	//not sure why anding with 0xFF but this is the same in ARM & x86
					dst16[i] = writeIndex;
			}
			break;
	}
}

static void PrvDrawUnderline(struct PalmCanvas* canvas, int32_t start, int32_t right, int32_t top)
{
	enum PalmUnderlineMode underline = canvas->drawState->underlineMode;
	struct PalmBitmapV3fat* bmp = canvas->bmp;
	uint32_t densityMul = bmp->density / PALM_DENSITY_STANDARD;	//this treats 1.5x as 1x, this is INTENDED
	int32_t currentTop, left = start, bottom = top + densityMul;
	int32_t clipLeft = canvas->clippingRect.topLeft.x, clipRight = clipLeft + canvas->clippingRect.extent.x;
	int32_t clipTop = canvas->clippingRect.topLeft.y, clipBottom = clipTop + canvas->clippingRect.extent.y;
	uint32_t foreColor, textColor, dstRowBytes = bmp->stride, dstDepth = bmp->pixelSz, mask = (1 << dstDepth) - 1;
	uint8_t *dstP, *dstBaseAddr = (uint8_t*)bmp->data;

	//no underline? bail
	if (underline == palmUnderlineModeNoUnderline)
		return;
	
	//clip
	if (left < clipLeft)
		left = clipLeft;
	
	if (right > clipRight)
		right = clipRight;
	
	if (top < clipTop)
		top = clipTop;
	
	if (bottom > clipBottom)
		bottom = clipBottom;
	
	//no pixels in clip area? bail
	if (left >= right || top >= bottom)
		return;
	
	//prepare colors
	foreColor = PrvExpandPixel(bmp, canvas->drawState->foreColorIdx, &canvas->drawState->foreColorRGB);
	textColor = PrvExpandPixel(bmp, canvas->drawState->textColorIdx, &canvas->drawState->textColorRGB);
	
	//calculate starting address
	dstP = dstBaseAddr + dstRowBytes * top;
	
	//draw
	for (currentTop = top; currentTop < bottom; currentTop++, dstP += dstRowBytes)
		PrvBlitUnderline(dstP, left, right - left, dstDepth, underline == palmUnderlineModeGrayUnderline ? 2 : 0, left - start, textColor & mask, foreColor & mask, densityMul, bmp->pixelFormat);
	
	if (bmp->forScreen && halScreenIsLive())	//screen refresh code doesnt apply for offscreen windows
		HALScreenDrawNotify(left, top, right - left, bottom - top);
}

//Scales a 1-bit image by 1.5x in both directions.
// Horizontally: every source byte is looked up in mOneAndOneHalfScaleNibble1bit and the low 12 bits of
// the result are emitted, so two source bytes become three destination bytes. An odd trailing source
// byte produces two destination bytes (the second one only has its high nibble filled).
// Vertically: source rows alternate between being written to two consecutive destination rows and to
// one, so two source rows become three destination rows.
// widthBytes is the number of source bytes per row to convert; height is the number of source rows.
static void PrvOneAndOneHalfBitonal(const uint8_t *src, uint32_t srcRowBytes, uint32_t widthBytes, uint32_t height, uint8_t *dst, uint32_t dstRowBytes)
{
	//srcRowAdd/dstRowAdd: distance from the end of the converted part of a row to the start of the next row
	int32_t x, srcRowAdd = srcRowBytes - widthBytes, dstRowAdd = dstRowBytes - widthBytes * 3 / 2;
	const uint16_t* oneAndOneHalfScaleNibble1bit = mOneAndOneHalfScaleNibble1bit;
	uint32_t t, scaledBits1, scaledBits2, toggle = 1;
	uint8_t *dst2 = dst + dstRowBytes;	//second output row for the doubled source rows
	
	//an odd width writes one byte more than widthBytes * 3 / 2 (integer division) accounts for
	if (widthBytes & 1)
		dstRowAdd--;
	
	while (height-- > 0) {
		
		x = widthBytes;
		
		if (toggle) {
			
			//this source row is emitted twice: to dst and to the row below it (dst2)
			while (x >= 2) {
				
				scaledBits1 = oneAndOneHalfScaleNibble1bit[*src++];
				scaledBits2 = oneAndOneHalfScaleNibble1bit[*src++];
				
				//2 x 12 bits -> 3 bytes
				t = scaledBits1 >> 4;
				*dst++ = t;
				*dst2++ = t;
				
				t = ((scaledBits1 & 0x0F) << 4) | (scaledBits2 >> 8);
				*dst++ = t;
				*dst2++ = t;
				
				t = scaledBits2;
				*dst++ = t;
				*dst2++ = t;
				
				x -= 2;
			}
			
			if (x == 1) {
				//odd byte at the end: 12 bits -> 2 bytes, low nibble of the second one is zero
				scaledBits1 = oneAndOneHalfScaleNibble1bit[*src++];
				
				t = scaledBits1 >> 4;
				*dst++ = t;
				*dst2++ = t;
				
				t = (scaledBits1 & 0x0F) << 4;
				*dst++ = t;
				*dst2++ = t;
			}
			
			//dst skips over the row dst2 just filled; dst2 moves on to the second row of the next doubled pair
			dst += dstRowBytes + dstRowAdd;
			dst2 += dstRowBytes * 2 + dstRowAdd;
		}
		else {
			
			//this source row is emitted once, to dst only
			while (x >= 2) {
			
				scaledBits1 = oneAndOneHalfScaleNibble1bit[*src++];
				scaledBits2 = oneAndOneHalfScaleNibble1bit[*src++];
				
				t = scaledBits1 >> 4;
				*dst++ = t;
				
				t = ((scaledBits1 & 0x0F) << 4) | (scaledBits2 >> 8);
				*dst++ = t;
				
				t = scaledBits2;
				*dst++ = t;
				
				x -= 2;
			}
			
			if (x == 1) {
				scaledBits1 = oneAndOneHalfScaleNibble1bit[*src++];
				
				t = scaledBits1 >> 4;
				*dst++ = t;
				
				t = (scaledBits1 & 0x0F) << 4;
				*dst++ = t;
			}
			
			dst += dstRowAdd;
		}
		
		src += srcRowAdd;
		toggle = 1 - toggle;	//alternate between doubled and single rows
	}
}

//Scales a 1-bit image by 2x in both directions.
// Every source byte becomes two destination bytes (each bit doubled), and every source row
// is written to two consecutive destination rows.
// widthBytes is the number of source bytes per row to convert; height is the number of source rows.
static void PrvDoubleBitonal(const uint8_t *src, int32_t srcRowBytes, int32_t widthBytes, int32_t height, uint8_t *dst, int32_t dstRowBytes)
{
	static const uint8_t nibbleToByte[] = {0x00, 0x03, 0x0C, 0x0F, 0x30, 0x33, 0x3C, 0x3F, 0xC0, 0xC3, 0xCC, 0xCF, 0xF0, 0xF3, 0xFc, 0xFF};	//nibble to byte expansion
	int32_t row, col;
	
	for (row = 0; row < height; row++, src += srcRowBytes, dst += dstRowBytes * 2) {
		
		uint8_t *upper = dst, *lower = dst + dstRowBytes;
		
		for (col = 0; col < widthBytes; col++) {
			
			const uint32_t in = src[col];
			const uint8_t left = nibbleToByte[in >> 4], right = nibbleToByte[in & 15];
			
			upper[2 * col] = left;
			lower[2 * col] = left;
			upper[2 * col + 1] = right;
			lower[2 * col + 1] = right;
		}
	}
}

//Scales a 1-bit image by 3x in both directions.
// Every source byte becomes three destination bytes (each bit tripled), and every source row
// is written to three consecutive destination rows.
// widthBytes is the number of source bytes per row to convert; height is the number of source rows.
static void PrvTripleBitonal(const uint8_t *src, int32_t srcRowBytes, int32_t widthBytes, int32_t height, uint8_t *dst, int32_t dstRowBytes)
{
	//nibble to 12-bit expansion
	static const uint16_t nibbleTo12Bits[] = {0x0000, 0x0007, 0x0038, 0x003f, 0x01c0, 0x01c7, 0x01f8, 0x01ff, 0x0e00, 0x0e07, 0x0e38, 0x0e3f, 0x0fc0, 0x0fc7, 0x0ff8, 0x0fff};
	int32_t row, col, copy;
	
	for (row = 0; row < height; row++, src += srcRowBytes, dst += dstRowBytes * 3) {
		
		for (col = 0; col < widthBytes; col++) {
			
			const uint32_t in = src[col];
			const uint32_t bits24 = ((uint32_t)nibbleTo12Bits[in >> 4] << 12) | nibbleTo12Bits[in & 15];
			uint8_t *out = dst + 3 * col;
			
			//same three bytes into each of the three output rows
			for (copy = 0; copy < 3; copy++, out += dstRowBytes) {
				
				out[0] = bits24 >> 16;
				out[1] = bits24 >> 8;
				out[2] = bits24;
			}
		}
	}
}

//Scales a 1-bit image by 4x in both directions.
// Every source byte becomes four destination bytes (each bit quadrupled), and every source row
// is written to four consecutive destination rows.
// widthBytes is the number of source bytes per row to convert; height is the number of source rows.
static void PrvQuadrupleBitonal(const uint8_t *src, int32_t srcRowBytes, int32_t widthBytes, int32_t height, uint8_t *dst, int32_t dstRowBytes)
{
	//nibble to 16-bit expansion
	static const uint16_t nibbleToHalfword[] = {0x0000, 0x000f, 0x00f0, 0x00ff, 0x0f00, 0x0f0f, 0x0ff0, 0x0fff, 0xf000, 0xf00f, 0xf0f0, 0xf0ff, 0xff00, 0xff0f, 0xfff0, 0xffff, };
	int32_t row, col, copy;
	
	for (row = 0; row < height; row++, src += srcRowBytes, dst += dstRowBytes * 4) {
		
		for (col = 0; col < widthBytes; col++) {
			
			const uint32_t in = src[col];
			const uint32_t left = nibbleToHalfword[in >> 4], right = nibbleToHalfword[in & 15];
			uint8_t *out = dst + 4 * col;
			
			//same four bytes into each of the four output rows
			for (copy = 0; copy < 4; copy++, out += dstRowBytes) {
				
				out[0] = left >> 8;
				out[1] = left;
				out[2] = right >> 8;
				out[3] = right;
			}
		}
	}
}

//Returns a pointer to the glyph bitmap data of a font.
// V2 fonts: located via the glyphBitsOffset (relative to the start of the font) of the requested density.
// V1 fonts: the data directly follows the PalmFontV1 header; densityIndex is ignored.
static const uint8_t* PrvGetFontImagePtr(const struct PalmFont *fontP, uint32_t densityIndex)
{
	const struct PalmFontV1 *legacyFont;
	
	if (!(fontP->metrics.fontType & FONT_TYPE_MASK_V2)) {
		
		legacyFont = (struct PalmFontV1*)fontP;
		return (const uint8_t*)(legacyFont + 1);
	}
	
	return ((const uint8_t*)fontP) + fontP->densities[densityIndex].glyphBitsOffset;
}

//Returns a pointer to the font's bitmap location table.
// V2 fonts: the table starts right after the last density entry.
// V1 fonts: the table follows the glyph image, which is rowWords * fRectHeight halfwords
// placed right after the PalmFontV1 header.
static const uint16_t* PrvGetFontLocTablePtr(const struct PalmFont *fontP)
{
	const struct PalmFontV1 *legacyFont;
	const uint16_t *imageStart;
	
	if (fontP->metrics.fontType & FONT_TYPE_MASK_V2)
		return (const uint16_t*)&fontP->densities[fontP->densityCount];		//one past the last density map is location table
	
	legacyFont = (struct PalmFontV1*)fontP;
	imageStart = (const uint16_t*)(legacyFont + 1);
	
	return imageStart + ((uint32_t)legacyFont->metrics.rowWords * legacyFont->metrics.fRectHeight);
}

//we can upscale fonts, but we never downscale them! upscaling by non-integer amounts also sucks
//this also means that while we can use a 2x font to draw on a 3x screen, we prefer 1.5x or 1x fonts as noninteger scaling sucks!
//
//Picks which density variant of a font to use when drawing to a destination of density dstDensity.
// Returns the index into fontP->densities[] and stores the chosen variant's density in *fontDensityP.
// V1 fonts have a single, standard-density image: index 0, PALM_DENSITY_STANDARD.
//
//Fix: the 1.5x candidate test used to be (candidateDensity * 2 / 3 == dstDensity), which matches
// fonts that are 1.5x DENSER than the destination (that would need downscaling) and could also
// match by accident because of the truncating division. It now matches fonts for which the
// destination is exactly 1.5x the font density, as the comments above and below describe.
static uint32_t PrvGetFontDensityIndex(const struct PalmFont *fontP, uint32_t dstDensity, uint32_t *fontDensityP)
{
	int32_t i, halfMatch = -1, thirdMatch = -1, threeHalvesMatch = -1, exactMatch = -1, standardDensityMatch = -1;
	
	//v1 fonts are always standard density
	if (!(fontP->metrics.fontType & FONT_TYPE_MASK_V2)) {
		
		*fontDensityP = PALM_DENSITY_STANDARD;
		return 0;
	}
	
	//go over all options. shortcut if exact match found
	//walking from the last entry down means the lowest matching index wins within each category
	for (i = fontP->densityCount - 1; i >= 0; i--) {
		
		uint32_t candidateDensity = fontP->densities[i].density;
		
		if (candidateDensity == dstDensity) {
			exactMatch = i;
			break;
		}
		
		//font needs a 2x upscale
		if (candidateDensity * 2 == dstDensity)
			halfMatch = i;
		
		//font needs a 3x upscale
		if (candidateDensity * 3 == dstDensity)
			thirdMatch = i;
		
		//font needs a 1.5x upscale (dst / candidate == 3 / 2, compared without division)
		if (candidateDensity * 3 == dstDensity * 2)
			threeHalvesMatch = i;
		
		if (candidateDensity == PALM_DENSITY_STANDARD)
			standardDensityMatch = i;
	}
	
	
	//we prefer things in this order: exact, 2x, 3x, standard, 1.5x, whatever first density in the font was
	if (exactMatch >= 0)
		i = exactMatch;
	else if (halfMatch >= 0)
		i = halfMatch;
	else if (thirdMatch >= 0)
		i = thirdMatch;
	else if (standardDensityMatch >= 0)
		i = standardDensityMatch;
	else if (threeHalvesMatch >= 0)
		i = threeHalvesMatch;
	else
		i = 0;
	
	//set i to zero here forcibly to force use of lo-res fonts always
	//i = 0;
	
	*fontDensityP = fontP->densities[i].density;
	return i;
}

//Draws a run of already-located 1-bit glyphs onto a canvas, left to right.
// canvas      - destination; its clipping rectangle and draw state (draw mode, back and text colours) are honoured
// topLeft     - pen position of the first glyph; on return topLeft->x holds the pen position after the last glyph handled
// glyphHeight - height of the glyph rows in destination pixels (before clipping)
// glyphs      - per-glyph source pointer, row bytes, starting bit offset (fromLeft), width and padding flag
// glyphCount  - number of entries in glyphs
static void HALDraw_Glyphs(struct PalmCanvas* canvas, struct PointType *topLeft, uint32_t glyphHeight, const struct GlyphInfo* glyphs, uint32_t glyphCount)
{
	int32_t penX = topLeft->x, penY = topLeft->y, clipLeft = canvas->clippingRect.topLeft.x, clipRight = clipLeft + canvas->clippingRect.extent.x;
	//glyphTopSkip: glyph rows above the clip rectangle; remainingWidth: pixels left before the right clip edge
	int32_t i, glyphTopSkip = canvas->clippingRect.topLeft.y - penY, remainingWidth = clipRight - penX;
	enum PalmWinDrawOperation xferMode = canvas->drawState->drawMode;
	struct PalmDrawState *drawState = canvas->drawState;
	struct PalmBitmapV3fat *bmp = canvas->bmp;
	static const uint32_t empty[8] = {};	//all-zero 1-bit source used to draw the padding column
	const struct GlyphInfo* glyph;
	uint32_t backColor, textColor;
	struct BitonalBlitInfo info;
	
	//prepare colors
	backColor = PrvExpandPixel(bmp, drawState->backColorIdx, &drawState->backColorRGB);
	textColor = PrvExpandPixel(bmp, drawState->textColorIdx, &drawState->textColorRGB);
	
	//clip on top
	if (glyphTopSkip <= 0)
		glyphTopSkip = 0;
	else {
		penY += glyphTopSkip;
		glyphHeight -= glyphTopSkip;
	}
	
	//clip on bottom
	i = penY + glyphHeight - ((int32_t)canvas->clippingRect.topLeft.y + canvas->clippingRect.extent.y);
	if (i > 0)
		glyphHeight -= i;
	
	//prepare to blit
	PrvSetupBitonalBlit(xferMode, bmp, penY, 0, backColor, textColor, &info);
	
	//more clipping adjustments
	//from here on clipLeft is no longer an x coordinate: it is the number of pixels still to be skipped before the left clip edge
	clipLeft = clipLeft <= penX ? 0 : clipLeft - penX;
	
	//per-glyph loop
	for (glyph = glyphs; glyphCount; glyphCount--, glyph++) {
		
		int32_t fromLeft = glyph->fromLeft;
		int32_t width = glyph->width;
		
		if (clipLeft) {
			
			if (width <= clipLeft) {
				
				//glyph lies entirely left of the clip area: advance the pen without drawing (padding column included)
				if (glyph->hasPad && !((width + penX - 1) % 3)) {	//not sure why mod 3..
					
					clipLeft--;
					remainingWidth--;
					penX++;
				}
				clipLeft -= width;
				remainingWidth -= width;
				penX += width;
				continue;
			}
				
			//glyph straddles the left clip edge: drop its leftmost clipLeft columns
			penX += clipLeft;
			width -= clipLeft;
			remainingWidth -= clipLeft;
			fromLeft += clipLeft;
			clipLeft = 0;
		}
		
		//clip on the right; stop entirely once nothing more fits
		if (width > remainingWidth) {
			
			width = remainingWidth;
			if (width <= 0)
				break;
		}
		
		//draw glyph
		PrvBlitBitonal(&info, glyph->srcP + glyphTopSkip * glyph->srcRowBytes, glyph->srcRowBytes, fromLeft, penX, width, glyphHeight);
		
		//post-glyph spacing?
		if (glyph->hasPad && !((width + penX - 1) % 3)) {	//not sure why mod 3..
			
			//one extra column drawn from the all-zero source
			PrvBlitBitonal(&info, empty, 1, 0, penX + width, 1, glyphHeight);
			penX++;
			remainingWidth--;
		}
		
		penX += width;
		remainingWidth -= width;
	}
	
	//report the final pen position back to the caller
	topLeft->x = penX;
	
	if (bmp->forScreen && halScreenIsLive()) {	//screen refresh code doesnt apply for offscreen windows
		
		//NOTE(review): at this point topLeft->x has already been overwritten with the final penX and clipLeft holds the
		// leftover skip count rather than the clip rectangle's left edge, so `left` is the smaller of those two values
		// and not the x of the first pixel drawn - confirm this (apparently over-wide) refresh rectangle is intended
		int32_t left = topLeft->x > clipLeft ? clipLeft : topLeft->x;
		int32_t right = clipRight < penX ? clipRight : penX;
		
		HALScreenDrawNotify(left, penY, right - left, glyphHeight);
	}
}

/*
 * Draw a run of text onto a canvas.
 *
 * Characters are looked up in the font's location table, converted into
 * GlyphInfo records (up to NUM_GLYPHS_IN_ARR at a time) and flushed through
 * HALDraw_Glyphs(). When the font density differs from the destination
 * density (and text scaling is enabled) each glyph is first magnified into a
 * heap-allocated scratch strip and drawn from there. After the text, the
 * underline (if the draw state asks for one) is drawn.
 *
 *  canvas         - destination; its bitmap density, clipping rect and draw state are used
 *  x_in, y_in     - starting pen position
 *  charsP         - text bytes
 *  len            - number of bytes of charsP to consume (2-byte chars consume 2)
 *  fontP          - the font; when fontMap is non-NULL this pointer is instead
 *                   reinterpreted as an array of font pointers indexed via the map
 *  fontMap        - optional; supplies the metrics and a per-lead-byte entry table
 *  charCheckProc  - called for a 2-byte char whose second byte is outside the selected
 *                   font's range; a false return makes the lead byte count as a 1-byte
 *                   char. It is not NULL-checked here.
 */
void DALEXPORT impl_HALDraw_Chars(struct PalmCanvas* canvas, int16_t x_in, int16_t y_in, const char *charsP, int16_t len, const struct PalmFont *fontP, const struct PalmFontMap *fontMap, PalmDrawCharsVerifyFunc charCheckProc)
{
	//max glyphs batched before a flush to HALDraw_Glyphs (note: this macro is not #undef'd after the function)
	#define NUM_GLYPHS_IN_ARR	32
	
	uint32_t fontIndex = 0, charSize, dstScale = 2, fntScale = 2, dstDensity = canvas->bmp->density, fontDensity = PALM_DENSITY_STANDARD, fromX, width, densityIndex = 0, scaledFromX, scaledWidth;
	int32_t glyphIndex = 0, penX, clipLeft = canvas->clippingRect.topLeft.x, clipRight = clipLeft + canvas->clippingRect.extent.x, fontScratchPenX = 0;
	bool unscaledText = canvas->drawState->flags.unscaledText, unpaddedText = canvas->drawState->flags.unpaddedText, useFontScratch = false;
	uint32_t fontScratchRowBytes = 0, thisChar, charIndex, srcRowBytes = 0, srcBytesWidth, scaledSrcBytesWidth;
	int32_t srcHeightApprox, srcHeight = -1, glyphHeight = -1, glyphHeightApprox, toY = y_in, toX = x_in;
	const struct PalmFontMetrics* metricsP = fontMap ? &fontMap->metrics : &fontP->metrics;
	const struct PalmFont *lastFontP = NULL, **fontsP = NULL;
	struct GlyphInfo glyphArr[NUM_GLYPHS_IN_ARR], *glyph;
	const struct PalmFontMapEntry *fontCharMap = NULL;
	struct PointType glyphStartPoint = {};
	uint8_t *fontScratchP = NULL;
	const uint16_t *locP = NULL;	//bitmap location table in the font
	const uint8_t *srcP = NULL;

	//sort out some scaling up front
	//dstScale is 31.1 fixint representing dstDensity / PALM_DENSITY_STANDARD (so 2 means 1x)
	if (!unscaledText)
		dstScale = dstDensity * 2 / PALM_DENSITY_STANDARD;

	//this value is not really indicative of much. for example for 1.5 density fonts, if we scale it up, it will be too much
	srcHeightApprox = metricsP->fRectHeight + metricsP->leading;
		
	//glyphHeight is approximate and likely overestimated here
	//we cannot know it for sure till we pick a font component
	//we calc it here just for the quick-exit code that follows
	glyphHeightApprox = (srcHeightApprox * dstScale + 1)/ 2;
	
	//check y boundaries for a quick shortcut out of here
	if (toY >= canvas->clippingRect.topLeft.y + canvas->clippingRect.extent.y)
		return;
	if (toY + glyphHeightApprox < canvas->clippingRect.topLeft.y)
		return;
	
	if (fontMap) {
		
		fontCharMap = (const struct PalmFontMapEntry*)fontMap->entries;
		fontsP = (const struct PalmFont**)fontP;	//this is a very strange way to do this, but such is life
	}
	
	penX = toX;
	glyphStartPoint.x = penX;
	glyphStartPoint.y = toY;
	
	//per-character loop: stops when the text runs out or the pen passes the right clip edge
	while (len > 0 && penX < clipRight) {
		
		charSize = 1;
		
		if (fontCharMap) {
			
			//the lead byte picks the sub-font; an entry value of 2 marks a two-byte character
			fontIndex = (uint8_t)*charsP;
			fontP = fontsP[fontCharMap[fontIndex].fontIdx];
			if (fontCharMap[fontIndex].unk_0x01 == 2) {
				charsP++;
				charSize = 2;
			}
		}
		
		//font changed (always true on the first pass): flush pending glyphs and recompute per-font state
		if (lastFontP != fontP) {
			
			lastFontP = fontP;
			
			if (glyphIndex > 0) {
				
				HALDraw_Glyphs(canvas, &glyphStartPoint, glyphHeight, glyphArr, glyphIndex);
				glyphIndex = 0;
				fontScratchPenX = 0;
			}
			
			glyphArr[0].hasPad = 0;
			if (!unscaledText) {
				densityIndex = PrvGetFontDensityIndex(fontP, dstDensity, &fontDensity);
				fntScale = fontDensity * 2 / PALM_DENSITY_STANDARD;
			}
			else {
				densityIndex = 0;
				fontDensity = PALM_DENSITY_STANDARD;
				fntScale = 2;
			}
			
			srcHeight = metricsP->fRectHeight + metricsP->leading;
			if (fntScale == 3 && !(srcHeight & 1))	//1.5 density fonts are weird, man!
				srcHeight--;						//do not ask me why, but this is correct!
			
			glyphHeight = dstScale * metricsP->ascent / 2 + dstScale * metricsP->descent / 2 + dstScale * metricsP->leading / 2;
			//some fonts (like ones made by PilRC) do not set ANY of those values. fall back to older methods then
			if (!glyphHeight)
				glyphHeight = dstScale * srcHeight / 2;
			
			//os does this (at start since it doesnt support scaling non std fonts) even though it clearly doesnt mean that since it breaks for 3x
			//	if (dstScale == 108 && !unscaledText)
			//		glyphHeight = dstScale * metricsP->ascent / 2 + dstScale * metricsP->descent / 2 + dstScale * metricsP->leading / 2;
			//	else
			//		glyphHeight = srcHeight * dstScale / 2;
		
			//source row size in bytes: rowWords scaled by the font's density, rounded up to whole 16-bit words
			srcRowBytes = ((fntScale * fontP->metrics.rowWords + 1) / 2) * 2;
			
			if (fontDensity == PALM_DENSITY_ONE_AND_A_HALF && !unpaddedText)
				glyphArr[0].hasPad = 1;
			
			srcP = PrvGetFontImagePtr(fontP, densityIndex);
			locP = PrvGetFontLocTablePtr(fontP);
			
			//matching densities (or unscaled text) draw straight from the font image; otherwise glyphs get magnified into a scratch strip first
			if (dstDensity == fontDensity || unscaledText)
				useFontScratch = false;
			else {
				useFontScratch = true;
				fontScratchPenX = 0;
				fontScratchRowBytes = 16;
				
				//the scratch strip is allocated once and reused for the rest of the call
				//NOTE(review): it is sized from the glyphHeight computed for the first font that needs it, while glyphHeight is recomputed per glyph below - confirm the +4 rows of slack always suffice
				if (!fontScratchP) {
					
					fontScratchP = (uint8_t*)kheapAlloc((glyphHeight + 4) * fontScratchRowBytes);
					if (!fontScratchP)
						return;
				}
			}
		}
		
		//grab the char
		thisChar = (uint8_t)*charsP++;
		
		//get its index in the font (or missing char glyph if no such image in the font)
		if (thisChar >= fontP->metrics.firstChar && thisChar <= fontP->metrics.lastChar)
			charIndex = thisChar - fontP->metrics.firstChar;
		else {
			charIndex = (uint32_t)fontP->metrics.lastChar - fontP->metrics.firstChar + 1;
		
			if (charSize > 1) {
				
				//callback rejected this 2-byte char: count only the lead byte, and back up so the second byte is read again as its own char
				if (!charCheckProc((fontIndex << 8) | thisChar)) {
					charSize = 1;
					charsP--;
				}
			}
		}
		
		//glyph horizontal extent in the font image comes from adjacent location table entries
		fromX = locP[charIndex];
		width = locP[charIndex + 1] - fromX;
		
		//zero-width entry: use the slot just past lastChar (the missing-char glyph) instead
		if (!width) {
			
			fromX = locP[(uint32_t)fontP->metrics.lastChar - fontP->metrics.firstChar + 1];
			width = locP[(uint32_t)fontP->metrics.lastChar - fontP->metrics.firstChar + 2] - fromX;
		}
		
		len -= charSize;
		glyph = &glyphArr[glyphIndex];
		if (glyphIndex > 0)
			glyph->hasPad = 0;
		
		
		if (unscaledText) {
			scaledFromX = fromX;
			scaledWidth = width;
		}
		else {
			scaledFromX = fromX * dstScale / 2;
			scaledWidth = width * dstScale / 2;
		}
		
		if (useFontScratch) {
			
			uint32_t stdCoordWidth = (fromX + width + 7) / 8 - fromX / 8;	//source bytes touched by the glyph, in standard-density coordinates
			uint32_t scaledSrcHeight = srcHeight * fntScale / 2;
			uint32_t fntScaledFromX, fntScaledWidth;
			uint32_t magnificationFactor;
			
			//magnificationFactor uses the same half-unit scale as dstScale (3 = 1.5x, 4 = 2x, ...)
			magnificationFactor = dstDensity * 2 / fontDensity;
			glyphHeight = srcHeight * fntScale / 2 * magnificationFactor / 2;	//order matters!
			scaledWidth = width * fntScale / 2 * magnificationFactor / 2;		//order matters!
			
			fntScaledFromX = fromX * fntScale / 2;
			fntScaledWidth = (width + fromX) * fntScale / 2 - fromX * fntScale / 2;
			
			srcBytesWidth = (fntScaledFromX + fntScaledWidth + 7) / 8 - fntScaledFromX / 8;
			//NOTE(review): the next assignment is immediately overwritten by the one after it
			scaledSrcBytesWidth = (stdCoordWidth * dstScale + 1) / 2;
			scaledSrcBytesWidth = ((stdCoordWidth * fntScale + 1) / 2 * magnificationFactor + 1) / 2;		//order matters!
			
			//no room left in the scratch strip for this glyph: draw what is queued and restart at the strip's left edge
			if (fontScratchPenX + scaledSrcBytesWidth * 8 > fontScratchRowBytes * 8) {
				
				HALDraw_Glyphs(canvas, &glyphStartPoint, glyphHeight, glyphArr, glyphIndex);
				glyphIndex = 0;
				glyph = &glyphArr[0];
				fontScratchPenX = 0;
			}
			
			switch (magnificationFactor) {
				
				case 3:		//1.5x upscale
					glyph->hasPad = 1;
					PrvOneAndOneHalfBitonal(srcP + fntScaledFromX / 8, srcRowBytes, srcBytesWidth, scaledSrcHeight, fontScratchP + fontScratchPenX / 8, fontScratchRowBytes);
					break;
				
				case 4:		//2.0x upscale
					PrvDoubleBitonal(srcP + fntScaledFromX / 8, srcRowBytes, srcBytesWidth, scaledSrcHeight, fontScratchP + fontScratchPenX / 8, fontScratchRowBytes);
					break;
				
				case 6:		//3.0x upscale
					PrvTripleBitonal(srcP + fntScaledFromX / 8, srcRowBytes, srcBytesWidth, scaledSrcHeight, fontScratchP + fontScratchPenX / 8, fontScratchRowBytes);
					break;
				
				case 8:		//4x upscale
					PrvQuadrupleBitonal(srcP + fntScaledFromX / 8, srcRowBytes, srcBytesWidth, scaledSrcHeight, fontScratchP + fontScratchPenX / 8, fontScratchRowBytes);
					break;
				
				default:
					//presumably fatal() does not return - TODO confirm; otherwise execution continues below
					fatal("Not sure how to scale a font from %u to %u ppi\n", fontDensity, dstDensity);
			}
			
			//whole source bytes were magnified, so the glyph starts at the scaled sub-byte offset within its scratch slot
			glyph->fromLeft = fontScratchPenX + ((fntScaledFromX & 7) * magnificationFactor / 2);
			glyph->srcP = fontScratchP;
			glyph->srcRowBytes = fontScratchRowBytes;
			fontScratchPenX += scaledSrcBytesWidth * 8;
		}
		else {
			
			if (fontDensity == PALM_DENSITY_ONE_AND_A_HALF && !unpaddedText)
				glyph->hasPad = 1;
			
			glyph->srcP = srcP;
			glyph->srcRowBytes = srcRowBytes;
			glyph->fromLeft = scaledFromX;
		}
		
		glyph->width = scaledWidth;
		penX += scaledWidth;
		//batch full: draw it and start a new one
		if (++glyphIndex >= NUM_GLYPHS_IN_ARR) {
			
			HALDraw_Glyphs(canvas, &glyphStartPoint, glyphHeight, glyphArr, glyphIndex);
			glyphIndex = 0;
			fontScratchPenX = 0;
		}
	}
	
	//draw whatever is still queued
	if (glyphIndex)
		HALDraw_Glyphs(canvas, &glyphStartPoint, glyphHeight, glyphArr, glyphIndex);
	
	//underline spans from the starting x to glyphStartPoint.x (presumably advanced by HALDraw_Glyphs as it draws - confirm against that function)
	if (canvas->drawState->underlineMode != palmUnderlineModeNoUnderline) {
	
		if (dstDensity != PALM_DENSITY_ONE_AND_A_HALF || unscaledText)
			PrvDrawUnderline(canvas, toX, glyphStartPoint.x, toY + glyphHeight - (dstScale / 2));
		else {
			//1.5x destination with scaled text: underline row is derived from fRectHeight, rounding controlled by the useFloor flag
			int32_t underLineY = toY + ((metricsP->fRectHeight - 1) * 3 + (canvas->drawState->flags.useFloor ? 0 : 1)) / 2;
			
			if (glyphHeight != srcHeight * 3 / 2) {
				
				//first paint that row with a solid white underline (draw state temporarily overridden, then restored), then draw the real underline over it
				static const struct PalmClutEntry white = {.idx = 0, .r = 255, .g = 255, .b = 255,};
				struct PalmDrawState *drawState = canvas->drawState;
				struct PalmClutEntry saveForeRGB = drawState->foreColorRGB;
				struct PalmClutEntry saveTextRGB = drawState->textColorRGB;
				enum PalmUnderlineMode saveUnd = drawState->underlineMode;
				uint8_t saveFore = drawState->foreColorIdx;
				uint8_t saveText = drawState->textColorIdx;
				
				drawState->underlineMode = palmUnderlineModeSolidUnderline;
				drawState->foreColorIdx = 0;
				drawState->textColorIdx = 0;
				drawState->foreColorRGB = white;
				drawState->textColorRGB = white;
				
				PrvDrawUnderline(canvas, toX, glyphStartPoint.x, underLineY);
				
				drawState->foreColorRGB = saveForeRGB;
				drawState->textColorRGB = saveTextRGB;
				drawState->underlineMode = saveUnd;
				drawState->foreColorIdx = saveFore;
				drawState->textColorIdx = saveText;
			}
			PrvDrawUnderline(canvas, toX, glyphStartPoint.x, underLineY);
		}
	}
	
	//release the magnification scratch strip if one was allocated
	if (fontScratchP)
		kheapFree(fontScratchP);
}

/*
 * Read one pixel out of a bitmap.
 *
 *  bitmapP  - bitmap to read from
 *  x, y     - pixel coordinates; anything outside the bitmap yields 0
 *  asIndex  - only consulted for 16bpp bitmaps: when set, the RGB565 value is
 *             converted to a palette index using the bitmap's own clut, or the
 *             screen's colortable if the bitmap has none (or an empty one)
 *
 * Returns the raw pixel value (an index for 1/2/4/8bpp, host-order RGB565 or
 * an index for 16bpp). Unsupported pixel sizes yield 0.
 */
uint32_t DALEXPORT impl_HALDraw_GetPixel(const struct PalmBitmapV3fat *bitmapP, int16_t x, int16_t y, bool asIndex)
{
	const uint32_t bitsPerPixel = bitmapP->pixelSz;
	const uint8_t *row;
	
	//reject coordinates that fall outside the bitmap
	if (x < 0 || y < 0 || x >= bitmapP->width || y >= bitmapP->height)
		return 0;
	
	//start of the requested row
	row = (const uint8_t*)bitmapP->data + (uint32_t)y * (uint32_t)bitmapP->stride;
	
	if (bitsPerPixel == 8)
		return row[x];
	
	if (bitsPerPixel == 16) {
		
		uint32_t pixel = ((const uint16_t*)row)[x];
		
		//every 16-bit format other than RGB565_LE is stored byte-swapped
		if (bitmapP->pixelFormat != PALM_BMP_PIXEL_FORMAT_RGB565_LE)
			pixel = __builtin_bswap16(pixel);
		
		if (asIndex) {
			
			const struct PalmClut *clut = bitmapP->clut;
			uint16_t rgb565 = pixel;
			uint8_t index;
			
			if (!clut || !clut->nEntries)
				clut = HALScreenGetColortable();
			
			PrvRGB565ToIndices(&rgb565, &index, 1, clut, true);
			
			pixel = index;
		}
		
		return pixel;
	}
	
	if (bitsPerPixel == 1 || bitsPerPixel == 2 || bitsPerPixel == 4) {
		
		const uint32_t bitOfst = x * bitsPerPixel;
		const uint32_t bitInByte = bitOfst & 7;
		uint32_t pixel = row[bitOfst >> 3];
		
		//INDEXED packs the first pixel into the top bits of the byte, other formats into the bottom bits
		if (bitmapP->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED)
			pixel >>= 8 - bitInByte - bitsPerPixel;
		else
			pixel >>= bitInByte;
		
		return pixel & ((1 << bitsPerPixel) - 1);
	}
	
	//unknown pixel size
	return 0;
}

/*
 * Expand an 8x8 one-bit pattern into destination-format pixel data.
 *
 *  depth        - bits per pattern pixel in the output: 1, 2, 4, 8, 16, 32 or 64
 *                 (32 and 64 are what callers pass for doubled / quadrupled patterns)
 *  pat          - 8 bytes of pattern, one byte per row, top bit is the leftmost pixel
 *  background   - value written for clear pattern bits
 *  foreground   - value written for set pattern bits
 *  expPat       - output buffer; it is also written through uint16_t/uint32_t pointers.
 *                 The widest case (64) writes 8 * 8 * 8 = 512 bytes.
 *  pixelFormat  - when PALM_BMP_PIXEL_FORMAT_INDEXED_LE, the 1/2/4-bit outputs are passed
 *                 through the swap1/swap2/swap4 tables
 *
 * Returns the number of bytes per expanded pattern row (2 for depth 1, otherwise
 * equal to depth), or 0 if depth is not supported.
 */
static uint32_t PrvExpandPattern(uint32_t depth, const uint8_t *pat, uint32_t background, uint32_t foreground, uint8_t *expPat, uint32_t pixelFormat)
{
	//a halfword for each byte value, doubling each bit in place 0xqwertyui -> 0xqqwweerrttyyuuii, except also byteswapped since result will be interpreted as bytes (we do 2 at a time for speed)
	static const uint16_t doubleBitExpandTab[] = {
		0x0000, 0x0300, 0x0C00, 0x0F00, 0x3000, 0x3300, 0x3C00, 0x3F00, 0xC000, 0xC300, 0xCC00, 0xCF00, 0xF000, 0xF300, 0xFC00, 0xFF00,
		0x0003, 0x0303, 0x0C03, 0x0F03, 0x3003, 0x3303, 0x3C03, 0x3F03, 0xC003, 0xC303, 0xCC03, 0xCF03, 0xF003, 0xF303, 0xFC03, 0xFF03,
		0x000C, 0x030C, 0x0C0C, 0x0F0C, 0x300C, 0x330C, 0x3C0C, 0x3F0C, 0xC00C, 0xC30C, 0xCC0C, 0xCF0C, 0xF00C, 0xF30C, 0xFC0C, 0xFF0C,
		0x000F, 0x030F, 0x0C0F, 0x0F0F, 0x300F, 0x330F, 0x3C0F, 0x3F0F, 0xC00F, 0xC30F, 0xCC0F, 0xCF0F, 0xF00F, 0xF30F, 0xFC0F, 0xFF0F,
		0x0030, 0x0330, 0x0C30, 0x0F30, 0x3030, 0x3330, 0x3C30, 0x3F30, 0xC030, 0xC330, 0xCC30, 0xCF30, 0xF030, 0xF330, 0xFC30, 0xFF30,
		0x0033, 0x0333, 0x0C33, 0x0F33, 0x3033, 0x3333, 0x3C33, 0x3F33, 0xC033, 0xC333, 0xCC33, 0xCF33, 0xF033, 0xF333, 0xFC33, 0xFF33,
		0x003C, 0x033C, 0x0C3C, 0x0F3C, 0x303C, 0x333C, 0x3C3C, 0x3F3C, 0xC03C, 0xC33C, 0xCC3C, 0xCF3C, 0xF03C, 0xF33C, 0xFC3C, 0xFF3C,
		0x003F, 0x033F, 0x0C3F, 0x0F3F, 0x303F, 0x333F, 0x3C3F, 0x3F3F, 0xC03F, 0xC33F, 0xCC3F, 0xCF3F, 0xF03F, 0xF33F, 0xFC3F, 0xFF3F,
		0x00C0, 0x03C0, 0x0CC0, 0x0FC0, 0x30C0, 0x33C0, 0x3CC0, 0x3FC0, 0xC0C0, 0xC3C0, 0xCCC0, 0xCFC0, 0xF0C0, 0xF3C0, 0xFCC0, 0xFFC0,
		0x00C3, 0x03C3, 0x0CC3, 0x0FC3, 0x30C3, 0x33C3, 0x3CC3, 0x3FC3, 0xC0C3, 0xC3C3, 0xCCC3, 0xCFC3, 0xF0C3, 0xF3C3, 0xFCC3, 0xFFC3,
		0x00CC, 0x03CC, 0x0CCC, 0x0FCC, 0x30CC, 0x33CC, 0x3CCC, 0x3FCC, 0xC0CC, 0xC3CC, 0xCCCC, 0xCFCC, 0xF0CC, 0xF3CC, 0xFCCC, 0xFFCC,
		0x00CF, 0x03CF, 0x0CCF, 0x0FCF, 0x30CF, 0x33CF, 0x3CCF, 0x3FCF, 0xC0CF, 0xC3CF, 0xCCCF, 0xCFCF, 0xF0CF, 0xF3CF, 0xFCCF, 0xFFCF,
		0x00F0, 0x03F0, 0x0CF0, 0x0FF0, 0x30F0, 0x33F0, 0x3CF0, 0x3FF0, 0xC0F0, 0xC3F0, 0xCCF0, 0xCFF0, 0xF0F0, 0xF3F0, 0xFCF0, 0xFFF0,
		0x00F3, 0x03F3, 0x0CF3, 0x0FF3, 0x30F3, 0x33F3, 0x3CF3, 0x3FF3, 0xC0F3, 0xC3F3, 0xCCF3, 0xCFF3, 0xF0F3, 0xF3F3, 0xFCF3, 0xFFF3,
		0x00FC, 0x03FC, 0x0CFC, 0x0FFC, 0x30FC, 0x33FC, 0x3CFC, 0x3FFC, 0xC0FC, 0xC3FC, 0xCCFC, 0xCFFC, 0xF0FC, 0xF3FC, 0xFCFC, 0xFFFC,
		0x00FF, 0x03FF, 0x0CFF, 0x0FFF, 0x30FF, 0x33FF, 0x3CFF, 0x3FFF, 0xC0FF, 0xC3FF, 0xCCFF, 0xCFFF, 0xF0FF, 0xF3FF, 0xFCFF, 0xFFFF,
	};
	const bool swappedBitOrder = (pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE);
	uint16_t *out16 = (uint16_t*)expPat;
	uint32_t *out32 = (uint32_t*)expPat;
	uint32_t rowBytes = depth;
	uint32_t row, col, n, bits;
	
	switch (depth) {
		
		case 1:
			//each 1-byte pattern row is stored twice, so a row is 2 bytes wide
			rowBytes = 2;
			
			if (background == foreground) {
				
				//solid fill: 16 bytes of the (replicated) colour
				const uint32_t fill = foreground | (foreground << 16);
				
				for (n = 0; n < 4; n++)
					out32[n] = fill;
				break;
			}
			
			for (row = 0; row < 8; row++) {
				
				bits = pat[row];
				if (swappedBitOrder)
					bits = swap1[bits];
				if (!foreground)			//foreground of zero means set pattern bits must come out clear
					bits = ~bits;
				expPat[2 * row + 0] = bits;
				expPat[2 * row + 1] = bits;
			}
			break;
		
		case 2:
			for (row = 0; row < 8; row++) {
				
				//the table doubles every pattern bit, giving a per-pixel foreground/background selector
				const uint32_t select = doubleBitExpandTab[pat[row]];
				
				out16[row] = (foreground & select) | (background & ~select);
			}
			if (swappedBitOrder) {
				
				for (n = 0; n < 16; n++)
					expPat[n] = swap2[expPat[n]];
			}
			break;
		
		case 4: {
			//byte values for each pair of pattern bits: high nibble is the first pixel, low nibble the second
			const uint8_t nibblePair[] = {
				(uint8_t)background,
				(uint8_t)((background & 0xF0) + (foreground & 0x0F)),
				(uint8_t)((background & 0x0F) + (foreground & 0xF0)),
				(uint8_t)foreground,
			};
			
			for (row = 0; row < 8; row++) {
				
				bits = pat[row];
				for (col = 0; col < 4; col++)
					expPat[4 * row + col] = nibblePair[(bits >> (6 - 2 * col)) & 3];
			}
			if (swappedBitOrder) {
				
				for (n = 0; n < 32; n++)
					expPat[n] = swap4[expPat[n]];
			}
			break;
		}
		
		case 8: {
			//reuse the generic 1bpp -> 8bpp converter with a two-entry translation table
			uint8_t xlate[] = {(uint8_t)background, (uint8_t)foreground};
			struct PrvConvertInfo cvtNfo = {};
			
			cvtNfo.translate = xlate;
			PrvConvert1To8(pat, expPat, 8, &cvtNfo);
			break;
		}
		
		case 16:
			for (row = 0; row < 8; row++) {
				
				bits = pat[row];
				for (col = 0; col < 8; col++)
					out16[8 * row + col] = (bits & (0x80 >> col)) ? foreground : background;
			}
			break;
		
		case 32: {	//this IS used (when pattern is doubled for double density)
			const uint32_t fg32 = foreground | (foreground << 16);
			const uint32_t bg32 = background | (background << 16);
			
			for (row = 0; row < 8; row++) {
				
				bits = pat[row];
				for (col = 0; col < 8; col++)
					out32[8 * row + col] = (bits & (0x80 >> col)) ? fg32 : bg32;
			}
			break;
		}
		
		case 64: {	//this IS used (when pattern is quadrupled for 4x density)
			const uint32_t fg32 = foreground | (foreground << 16);
			const uint32_t bg32 = background | (background << 16);
			
			for (row = 0; row < 8; row++) {
				
				bits = pat[row];
				for (col = 0; col < 8; col++) {
					
					const uint32_t word = (bits & (0x80 >> col)) ? fg32 : bg32;
					
					//two words per pattern pixel
					out32[16 * row + 2 * col + 0] = word;
					out32[16 * row + 2 * col + 1] = word;
				}
			}
			break;
		}
		
		default:
			return 0;
	}

	return rowBytes;
}

/*
 * Prepare a BlitPatInfo for pattern drawing on a canvas.
 *
 * Reads the canvas's draw mode, pattern selection and fore/back colours and fills
 * in `info` with the destination geometry (row bytes, depth, pixel format), the
 * blit function to call, and either a constant fill value (info->pattern) or an
 * expanded pattern (info->expPat, plus info->expMask for erase/mask/overlay).
 *
 * The built-in gray patterns are always expanded at standard density; a custom
 * pattern is expanded at 2x / 4x pixel width for double / quadruple density bitmaps.
 *
 * Fix over the previous version: the pattern-expansion depth (which is doubled for
 * high density) is now tracked separately from the bitmap's pixel depth. Before,
 * the doubled value leaked into the "is this a 16bpp bitmap" checks, so e.g. an
 * 8bpp double-density bitmap got 16bpp colour inversion and the 16bpp constant-copy
 * routine, while a 16bpp double-density bitmap got neither.
 */
static void PrvSetupPatBlit(const struct PalmCanvas* canvas, struct BlitPatInfo* info)
{
	const struct PalmDrawState *state = canvas->drawState;
	struct PalmBitmapV3fat *bmp = canvas->bmp;
	uint32_t depthShift = 0, depth = bmp->pixelSz, density = bmp->density;
	enum PalmWinDrawOperation mode = state->drawMode;
	enum PalmPatternType which = state->pattern;
	const uint8_t *pat = state->patternData.data;
	uint16_t background, foreground;
	
	background = PrvExpandPixel(bmp, canvas->drawState->backColorIdx, &canvas->drawState->backColorRGB);
	foreground = PrvExpandPixel(bmp, canvas->drawState->foreColorIdx, &canvas->drawState->foreColorRGB);
	
	depthShift = __builtin_ctz(depth);
		
	info->dstRowBytes = bmp->stride;
	info->depth = depth;
	info->depthShift = depthShift;
	info->pixelFormat = bmp->pixelFormat;

	if (mode == palmWinDrawOpWinSwap) {	//this one is special
		
		if (foreground == background)
			info->blitF = PrvBlitPatNOP;
		else if (depth == 1) {
			//with only two possible values, swapping them is the same as inverting every pixel
			info->pattern = 0xFFFF;
			info->blitF = PrvBlitPatConstXOR;
		}
		else {
			info->swap1 = background & ((1 << depth) - 1);
			info->swap16 = foreground & ((1 << depth) - 1);
			info->blitF = PrvBlitPatSwap;
		}
	}
	else {
	
		static const uint8_t pattGrey[] = {0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55};
		static const uint8_t pattLightGrey[] = {0x88, 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x22};
		static const uint8_t pattDarkGrey[] = {0xDD, 0x77, 0xDD, 0x77, 0xDD, 0x77, 0xDD, 0x77};
	
		info->patRowBytesAcross = 0;
		info->expPatRowBytesShift = 0;

		//built-in grays become custom patterns that are never density-scaled
		switch (which) {
			case palmPatternGrayPattern:
				pat = pattGrey;
				which = palmPatternCustomPattern;
				density = PALM_DENSITY_STANDARD;
				break;
			case palmPatternLightGrayPattern:
				pat = pattLightGrey;
				which = palmPatternCustomPattern;
				density = PALM_DENSITY_STANDARD;
				break;
			case palmPatternDarkGrayPattern:
				pat = pattDarkGrey;
				which = palmPatternCustomPattern;
				density = PALM_DENSITY_STANDARD;
				break;
			default:
				;//nothing;
		}
		
		if (which == palmPatternCustomPattern) {
			
			uint16_t t, low, high, lowMask, highMask;
			uint32_t patDepth = depth;	//bits per *pattern* pixel; kept apart from the bitmap's pixel depth
			
			//each pattern pixel covers 2 (double density) or 4 (quadruple density) bitmap pixels across
			if (density >= PALM_DENSITY_DOUBLE)
				patDepth *= 2;
			
			if (density >= PALM_DENSITY_QUADRUPLE)
				patDepth *= 2;
			
			switch (mode) {
				case palmWinDrawOpWinPaintInverse:
					t = background;
					background = foreground;
					foreground = t;
					//fallthrough
					
				case palmWinDrawOpWinPaint:
					
					if (background == foreground) {
						
						//both colours equal: the pattern is irrelevant, do a plain fill
						info->pattern = foreground;
						info->blitF = PrvBlitPatConstCopy;
					}
					else {
						
						info->patRowBytesAcross = PrvExpandPattern(patDepth, pat, background, foreground, info->expPat, bmp->pixelFormat);
						info->blitF = PrvBlitPatCopy;
					}
					break;
					
				case palmWinDrawOpWinErase:
					low = background;
					high = 0;
					lowMask = 0;
					highMask = 0xFFFF;
					goto custom_patt_modes_common;
				
				case palmWinDrawOpWinMask:
					low = 0;
					high = background;
					lowMask = 0xFFFF;
					highMask = 0;
					goto custom_patt_modes_common;
				
				case palmWinDrawOpWinOverlay:
					low = 0;
					high = foreground;
					lowMask = 0xFFFF;
					highMask = 0;
					//fallthrough to the label
					
			custom_patt_modes_common:
					//expPat holds the colour to apply, expMask is expanded from the same pattern with the lowMask/highMask pair
					info->patRowBytesAcross = PrvExpandPattern(patDepth, pat, low, high, info->expPat, bmp->pixelFormat);
					PrvExpandPattern(patDepth, pat, lowMask, highMask, info->expMask, bmp->pixelFormat);
					info->blitF = PrvBlitPatOverlay;
					break;
				
				case palmWinDrawOpWinInvert:
					//16bpp bitmaps get complemented colours before the XOR, same as the solid-pattern path below
					//this must test the bitmap's depth, not the density-scaled pattern depth
					if (depth == 16) {
						background = ~background;
						foreground = ~foreground;
					}
					info->patRowBytesAcross = PrvExpandPattern(patDepth, pat, background, foreground, info->expPat, bmp->pixelFormat);
					info->blitF = PrvBlitPatXOR;
					break;

				default:
					info->blitF = PrvBlitPatNOP;
					break;
			}
			
			info->expPatRowPreShift = (density < PALM_DENSITY_DOUBLE) ? 0 : ((density < PALM_DENSITY_QUADRUPLE) ? 1 : 2);
			if (info->patRowBytesAcross) {
				//expPatRowBytesShift = floor(log2(patRowBytesAcross))
				uint32_t shift = 0, i = info->patRowBytesAcross;
				
				while (i >>= 1)
					shift++;
				
				info->expPatRowBytesShift = shift;
			}
		}
		else {
			
			//anything that is not the white pattern is treated as solid black
			if (which != palmPatternWhitePattern)
				which = palmPatternBlackPattern;
			
			switch (mode) {
				case palmWinDrawOpWinPaint:
					info->pattern = (which == palmPatternBlackPattern) ? foreground : background;
					info->blitF = PrvBlitPatConstCopy;
					break;
				
				case palmWinDrawOpWinPaintInverse:
					info->pattern = (which == palmPatternBlackPattern) ? background : foreground;
					info->blitF = PrvBlitPatConstCopy;
					break;
				
				case palmWinDrawOpWinMask:
					which = (which == palmPatternBlackPattern) ? palmPatternWhitePattern : palmPatternBlackPattern;
					//fallthrough
				
				case palmWinDrawOpWinErase:
					info->pattern = background;
					info->blitF = (which == palmPatternBlackPattern) ? PrvBlitPatNOP : PrvBlitPatConstCopy;
					break;
				
				case palmWinDrawOpWinOverlay:
					info->pattern = foreground;
					info->blitF = (which == palmPatternBlackPattern) ? PrvBlitPatConstCopy : PrvBlitPatNOP;
					break;

				case palmWinDrawOpWinInvert:
					info->pattern = (which == palmPatternBlackPattern) ? foreground : background;
					if (depth == 16)
						info->pattern = ~info->pattern;
					info->blitF = PrvBlitPatConstXOR;
					break;

				default:
					info->blitF = PrvBlitPatNOP;
					break;
			}
		}
		
		//pick the depth-specialised constant fill; depth here is the bitmap's real pixel depth
		if (info->blitF == PrvBlitPatConstCopy) switch (depth) {
			case 8:
				info->blitF = PrvBlitPatConstCopy8;
				break;
			case 16:
				info->blitF = PrvBlitPatConstCopy16;
				break;
			default:
				break;
		}
	}
}

/*
 * Run the prepared pattern blit over a rectangle without any clipping.
 *
 *  left, right  - horizontal span (right is exclusive; the width passed on is right - left)
 *  top, height  - vertical span
 *
 * If the bitmap belongs to the screen and the screen is live, the touched
 * rectangle is reported through HALScreenDrawNotify().
 */
static void PrvPatBlitUnclipped(struct PalmCanvas* canvas, struct BlitPatInfo* info, int32_t left, int32_t top, int32_t right, int32_t height)
{
	struct PalmBitmapV3fat *bmp = canvas->bmp;
	uint8_t *firstRow = (uint8_t*)bmp->data + top * bmp->stride;
	int32_t width = right - left;
	
	//do the drawing
	info->blitF(info, top, firstRow, left, width, height);

	//screen refresh code doesnt apply for offscreen windows
	if (!bmp->forScreen)
		return;
	
	if (halScreenIsLive())
		HALScreenDrawNotify(left, top, width, height);
}

/*
 * Run the prepared pattern blit over a rectangle after intersecting it with `clip`.
 *
 *  clip         - absolute clip rectangle (left/top inclusive, right/bottom exclusive)
 *  left, right  - requested horizontal span (right exclusive)
 *  top, height  - requested vertical span
 *
 * Nothing is drawn when the intersection is empty. If the bitmap belongs to the
 * screen and the screen is live, the drawn rectangle is reported through
 * HALScreenDrawNotify().
 */
static void PrvPatBlitClipped(struct PalmCanvas* canvas, const struct AbsRectType *clip, struct BlitPatInfo* info, int32_t left, int32_t top, int32_t right, int32_t height)
{
	struct PalmBitmapV3fat *bmp = canvas->bmp;
	int32_t bottom = top + height;
	int32_t drawW, drawH;
	
	//intersect the request with the clip rectangle
	left = (left < clip->left) ? clip->left : left;
	right = (right > clip->right) ? clip->right : right;
	top = (top < clip->top) ? clip->top : top;
	bottom = (bottom > clip->bottom) ? clip->bottom : bottom;
	
	//nothing left after clipping?
	if (right <= left || bottom <= top)
		return;
	
	drawW = right - left;
	drawH = bottom - top;
	
	//do the drawing
	info->blitF(info, top, (uint8_t*)bmp->data + top * bmp->stride, left, drawW, drawH);
	
	//screen refresh code doesnt apply for offscreen windows
	if (bmp->forScreen && halScreenIsLive())
		HALScreenDrawNotify(left, top, drawW, drawH);
}

/*
 * Draw a list of points using the canvas's current pattern/draw mode.
 *
 *  canvas    - destination canvas; its clipping rect bounds every point
 *  nPoints   - number of entries in pts (0 is a no-op)
 *  pts       - point coordinates
 *  penWidth  - each point is drawn as a penWidth x penWidth square whose
 *              top-left corner is the point
 *
 * Uses the file-level mPatBlitInfo as working state.
 *
 * Fix over the previous version: the clip rectangle's left/top were loaded from
 * the clipping rect's y/x respectively (transposed), so points were clipped
 * against the wrong rectangle whenever the clip origin had x != y.
 */
void DALEXPORT impl_HALDraw_Pixels(struct PalmCanvas* canvas, uint32_t nPoints, const struct PointType *pts, int16_t penWidth)
{
	struct AbsRectType clip;
	
	if (!nPoints)
		return;
	
	PrvSetupPatBlit(canvas, &mPatBlitInfo);
	
	//convert the origin+extent clipping rect into absolute edges
	clip.left = canvas->clippingRect.topLeft.x;
	clip.top = canvas->clippingRect.topLeft.y;
	clip.right = clip.left + canvas->clippingRect.extent.x;
	clip.bottom = clip.top + canvas->clippingRect.extent.y;

	do {
		int32_t x = pts->x, y = pts->y;
		
		PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, x, y, x + penWidth, penWidth);
		pts++;

	} while (--nPoints);
}

/*
 * Classify a 256-entry colour table against the default 8-bit table and record
 * the result in mClutState / mClutEndMismatchMask.
 *
 * The table counts as standard when entry 255 and entries 0..0xE5 all have the
 * same rgbComponents as the default. For a standard table, entries 0xE6..0xFE
 * may differ; each one that does sets bit (index - 0xE6) in the mismatch mask.
 * For a non-standard table the mask is left at 0.
 */
static void PrvCompareColorTableToDefault(const struct PalmClutEntry *currentClut)
{
	const union ClutEntryAccess *reference = (const union ClutEntryAccess*)(halDrawingGetDefaultColorTableForDepth(8)->entries);
	const union ClutEntryAccess *candidate = (const union ClutEntryAccess*)currentClut;
	uint32_t tailMismatch = 0, idx;
	bool standard;
	
	//the last entry must match before anything else is looked at
	standard = (reference[255].rgbComponents == candidate[255].rgbComponents);
	
	//the fixed part of the table must match exactly
	for (idx = 0; standard && idx < 0xE6; idx++) {
		
		if (reference[idx].rgbComponents != candidate[idx].rgbComponents)
			standard = false;
	}
	
	//the remaining entries are allowed to differ; remember which ones do
	if (standard) {
		
		for (idx = 0xE6; idx < 0xFF; idx++) {
			
			if (reference[idx].rgbComponents != candidate[idx].rgbComponents)
				tailMismatch |= 1 << (idx - 0xE6);
		}
	}
	
	mClutEndMismatchMask = tailMismatch;
	if (standard)
		mClutState = halDrawingClutIsStandard;
	else
		mClutState = halDrawingClutIsNonstandard;
}

void DALEXPORT impl_HALDrawFindIndexes() __attribute__((alias ("impl_HALDraw_FindIndexes")));	//this entry is duplicated in the table, so we dupe it here

/*
 * Fill in the palette index (.idx) of each colour in colorsToMatch with the
 * best-matching entry of a reference colour table.
 *
 *  numEntries     - number of colours in colorsToMatch
 *  colorsToMatch  - colours to resolve; only .idx is updated (by this function directly,
 *                   or by PrvFindDefaultColorIndex8Bit for the standard 256-entry table)
 *  referenceClut  - table to match against; NULL means the screen's colortable
 *
 * Matching order for each colour:
 *   1. its current idx already names an identical entry -> untouched
 *   2. same rgb as the LAST table entry -> last index
 *   3. table is the (checked) standard 256-entry table -> PrvFindDefaultColorIndex8Bit
 *   4. first entry that is identical (val32 compare)
 *   5. greyscale table -> closest by weighted intensity (2r + 5g + b) / 8
 *   6. otherwise -> smallest sum of squared per-channel differences
 *
 * Always returns errNone.
 */
Err DALEXPORT impl_HALDraw_FindIndexes(uint32_t numEntries, union ClutEntryAccess *colorsToMatch, const struct PalmClut* referenceClut)
{
	bool intensityAllowed = true, greynessUnverified = true;
	const union ClutEntryAccess *palette;
	uint32_t paletteLen, entryNo;
	
	if (!referenceClut)
		referenceClut = HALScreenGetColortable();
	
	paletteLen = referenceClut->nEntries;
	palette = (const union ClutEntryAccess*)referenceClut->entries;
	
	//first use with a 256-entry table: find out whether it is the standard one
	if (mClutState == halDrawingClutIsNotYetChecked && paletteLen == 256)
		PrvCompareColorTableToDefault(referenceClut->entries);
	
	for (entryNo = 0; entryNo < numEntries; entryNo++) {
		
		union ClutEntryAccess *target = &colorsToMatch[entryNo];
		const union ClutEntryAccess wanted = *target;
		uint32_t candidate, best = 0, bestDist = 256;
		bool useRgbDistance;
		
		//already pointing at an identical entry: nothing to do
		if (wanted.idx < paletteLen && wanted.val32 == palette[wanted.idx].val32)
			continue;
		
		//same colour as the final entry: use that one
		if (wanted.rgbComponents == palette[paletteLen - 1].rgbComponents) {
			target->idx = paletteLen - 1;
			continue;
		}
		
		//standard 8-bit table has a dedicated lookup
		if (mClutState == halDrawingClutIsStandard && paletteLen == 256) {
			
			PrvFindDefaultColorIndex8Bit(&target->entry, referenceClut->entries, mClutEndMismatchMask);
			continue;
		}
		
		//look for an identical entry
		candidate = 0;
		while (candidate < paletteLen && wanted.val32 != palette[candidate].val32)
			candidate++;
		
		if (candidate < paletteLen) {
			target->idx = candidate;
			continue;
		}
		
		//intensity matching is used until the table is seen to contain a non-grey entry
		useRgbDistance = !intensityAllowed;
		
		if (!useRgbDistance) {
			
			const uint32_t grey = (2UL * wanted.entry.r + 5UL * wanted.entry.g + wanted.entry.b) / 8;
			
			for (candidate = 0; candidate < paletteLen; candidate++) {
				
				const uint32_t level = palette[candidate].entry.r;
				uint32_t dist;
				
				//until one full pass has proven the table grey, check each entry's channels as we go
				if (greynessUnverified && (palette[candidate].entry.r != palette[candidate].entry.g || palette[candidate].entry.g != palette[candidate].entry.b)) {
					intensityAllowed = false;
					useRgbDistance = true;
					break;
				}
				
				dist = grey > level ? grey - level : level - grey;
				if (dist < bestDist) {
					bestDist = dist;
					best = candidate;
				}
			}
			
			//a complete pass means every entry is grey; no need to re-check for later colours
			if (!useRgbDistance)
				greynessUnverified = false;
		}
		
		if (useRgbDistance) {
			
			best = 0;
			bestDist = 0x80000;
			
			for (candidate = 0; candidate < paletteLen; candidate++) {
				
				uint32_t dist = 0;
				int32_t delta;
				
				//NOTE(review): each channel difference is truncated to int8_t, so differences beyond +/-127 wrap around before being squared
				delta = (int8_t)(palette[candidate].entry.r - wanted.entry.r);
				dist += delta * delta;
				delta = (int8_t)(palette[candidate].entry.g - wanted.entry.g);
				dist += delta * delta;
				delta = (int8_t)(palette[candidate].entry.b - wanted.entry.b);
				dist += delta * delta;
				
				if (dist < bestDist) {
					bestDist = dist;
					best = candidate;
				}
			}
		}
		
		target->idx = best;
	}
	
	return errNone;
}

/*
 * Report whether the colour table last classified by this file was found to be
 * the standard one (mClutState == halDrawingClutIsStandard). Logs its own name
 * on every call.
 */
bool DALEXPORT impl_HALScreenDefaultPalette(void)
{
	bool isStandard;
	
	logt("%s\n", __func__);
	
	isStandard = (mClutState == halDrawingClutIsStandard);
	return isStandard;
}

/*
 * Return the built-in default colour table for an indexed pixel depth.
 *
 *  depth - bits per pixel; 1, 2, 4 and 8 are supported
 *
 * Returns a pointer to a static constant table with 2^depth entries, or NULL
 * for any other depth. Each entry initializer has four values, the first of
 * which equals the entry's position in the table (presumably {idx, r, g, b} -
 * confirm against the struct PalmClutEntry declaration).
 */
const struct PalmClut* halDrawingGetDefaultColorTableForDepth(uint32_t depth)
{
	//1bpp: index 0 is white, index 1 is black
	static const struct PalmClut stdColorTable1 = {
		.nEntries = 2, 
		.entries = { {0x00, 0xFF, 0xFF, 0xFF}, {0x01, 0x00, 0x00, 0x00}, },
	};
	
	//2bpp: four evenly spaced greys, white first
	static const struct PalmClut stdColorTable2 = {
		.nEntries = 4, 
		.entries = { {0x00, 0xFF, 0xFF, 0xFF}, {0x01, 0xAA, 0xAA, 0xAA}, {0x02, 0x55, 0x55, 0x55}, {0x03, 0x00, 0x00, 0x00}, },
	};
	
	//4bpp: sixteen evenly spaced greys, white first
	static const struct PalmClut stdColorTable4 = {
		.nEntries = 16, 
		.entries = {
			{0x00, 0xFF, 0xFF, 0xFF}, {0x01, 0xEE, 0xEE, 0xEE}, {0x02, 0xDD, 0xDD, 0xDD}, {0x03, 0xCC, 0xCC, 0xCC},
			{0x04, 0xBB, 0xBB, 0xBB}, {0x05, 0xAA, 0xAA, 0xAA}, {0x06, 0x99, 0x99, 0x99}, {0x07, 0x88, 0x88, 0x88},
			{0x08, 0x77, 0x77, 0x77}, {0x09, 0x66, 0x66, 0x66}, {0x0A, 0x55, 0x55, 0x55}, {0x0B, 0x44, 0x44, 0x44},
			{0x0C, 0x33, 0x33, 0x33}, {0x0D, 0x22, 0x22, 0x22}, {0x0E, 0x11, 0x11, 0x11}, {0x0F, 0x00, 0x00, 0x00},
		},
	};
	
	//8bpp layout:
	//	0x00..0xd6 - combinations of the channel levels ff/cc/99/66/33/00 (all except pure black)
	//	0xd7..0xe0 - extra greys (11, 22, 44, 55, 77, 88, aa, bb, dd, ee)
	//	0xe1..0xe5 - c0c0c0, 800000, 800080, 008000, 008080
	//	0xe6..0xff - all black
	static const struct PalmClut stdColorTable8 = {
		.nEntries = 256, 
		.entries = {
			{0x00, 0xff, 0xff, 0xff}, {0x01, 0xff, 0xcc, 0xff}, {0x02, 0xff, 0x99, 0xff}, {0x03, 0xff, 0x66, 0xff},
			{0x04, 0xff, 0x33, 0xff}, {0x05, 0xff, 0x00, 0xff}, {0x06, 0xff, 0xff, 0xcc}, {0x07, 0xff, 0xcc, 0xcc},
			{0x08, 0xff, 0x99, 0xcc}, {0x09, 0xff, 0x66, 0xcc}, {0x0a, 0xff, 0x33, 0xcc}, {0x0b, 0xff, 0x00, 0xcc},
			{0x0c, 0xff, 0xff, 0x99}, {0x0d, 0xff, 0xcc, 0x99}, {0x0e, 0xff, 0x99, 0x99}, {0x0f, 0xff, 0x66, 0x99},
			{0x10, 0xff, 0x33, 0x99}, {0x11, 0xff, 0x00, 0x99}, {0x12, 0xcc, 0xff, 0xff}, {0x13, 0xcc, 0xcc, 0xff},
			{0x14, 0xcc, 0x99, 0xff}, {0x15, 0xcc, 0x66, 0xff}, {0x16, 0xcc, 0x33, 0xff}, {0x17, 0xcc, 0x00, 0xff},
			{0x18, 0xcc, 0xff, 0xcc}, {0x19, 0xcc, 0xcc, 0xcc}, {0x1a, 0xcc, 0x99, 0xcc}, {0x1b, 0xcc, 0x66, 0xcc},
			{0x1c, 0xcc, 0x33, 0xcc}, {0x1d, 0xcc, 0x00, 0xcc}, {0x1e, 0xcc, 0xff, 0x99}, {0x1f, 0xcc, 0xcc, 0x99},
			{0x20, 0xcc, 0x99, 0x99}, {0x21, 0xcc, 0x66, 0x99}, {0x22, 0xcc, 0x33, 0x99}, {0x23, 0xcc, 0x00, 0x99},
			{0x24, 0x99, 0xff, 0xff}, {0x25, 0x99, 0xcc, 0xff}, {0x26, 0x99, 0x99, 0xff}, {0x27, 0x99, 0x66, 0xff},
			{0x28, 0x99, 0x33, 0xff}, {0x29, 0x99, 0x00, 0xff}, {0x2a, 0x99, 0xff, 0xcc}, {0x2b, 0x99, 0xcc, 0xcc},
			{0x2c, 0x99, 0x99, 0xcc}, {0x2d, 0x99, 0x66, 0xcc}, {0x2e, 0x99, 0x33, 0xcc}, {0x2f, 0x99, 0x00, 0xcc},
			{0x30, 0x99, 0xff, 0x99}, {0x31, 0x99, 0xcc, 0x99}, {0x32, 0x99, 0x99, 0x99}, {0x33, 0x99, 0x66, 0x99},
			{0x34, 0x99, 0x33, 0x99}, {0x35, 0x99, 0x00, 0x99}, {0x36, 0x66, 0xff, 0xff}, {0x37, 0x66, 0xcc, 0xff},
			{0x38, 0x66, 0x99, 0xff}, {0x39, 0x66, 0x66, 0xff}, {0x3a, 0x66, 0x33, 0xff}, {0x3b, 0x66, 0x00, 0xff},
			{0x3c, 0x66, 0xff, 0xcc}, {0x3d, 0x66, 0xcc, 0xcc}, {0x3e, 0x66, 0x99, 0xcc}, {0x3f, 0x66, 0x66, 0xcc},
			{0x40, 0x66, 0x33, 0xcc}, {0x41, 0x66, 0x00, 0xcc}, {0x42, 0x66, 0xff, 0x99}, {0x43, 0x66, 0xcc, 0x99},
			{0x44, 0x66, 0x99, 0x99}, {0x45, 0x66, 0x66, 0x99}, {0x46, 0x66, 0x33, 0x99}, {0x47, 0x66, 0x00, 0x99},
			{0x48, 0x33, 0xff, 0xff}, {0x49, 0x33, 0xcc, 0xff}, {0x4a, 0x33, 0x99, 0xff}, {0x4b, 0x33, 0x66, 0xff},
			{0x4c, 0x33, 0x33, 0xff}, {0x4d, 0x33, 0x00, 0xff}, {0x4e, 0x33, 0xff, 0xcc}, {0x4f, 0x33, 0xcc, 0xcc},
			{0x50, 0x33, 0x99, 0xcc}, {0x51, 0x33, 0x66, 0xcc}, {0x52, 0x33, 0x33, 0xcc}, {0x53, 0x33, 0x00, 0xcc},
			{0x54, 0x33, 0xff, 0x99}, {0x55, 0x33, 0xcc, 0x99}, {0x56, 0x33, 0x99, 0x99}, {0x57, 0x33, 0x66, 0x99},
			{0x58, 0x33, 0x33, 0x99}, {0x59, 0x33, 0x00, 0x99}, {0x5a, 0x00, 0xff, 0xff}, {0x5b, 0x00, 0xcc, 0xff},
			{0x5c, 0x00, 0x99, 0xff}, {0x5d, 0x00, 0x66, 0xff}, {0x5e, 0x00, 0x33, 0xff}, {0x5f, 0x00, 0x00, 0xff},
			{0x60, 0x00, 0xff, 0xcc}, {0x61, 0x00, 0xcc, 0xcc}, {0x62, 0x00, 0x99, 0xcc}, {0x63, 0x00, 0x66, 0xcc},
			{0x64, 0x00, 0x33, 0xcc}, {0x65, 0x00, 0x00, 0xcc}, {0x66, 0x00, 0xff, 0x99}, {0x67, 0x00, 0xcc, 0x99},
			{0x68, 0x00, 0x99, 0x99}, {0x69, 0x00, 0x66, 0x99}, {0x6a, 0x00, 0x33, 0x99}, {0x6b, 0x00, 0x00, 0x99},
			{0x6c, 0xff, 0xff, 0x66}, {0x6d, 0xff, 0xcc, 0x66}, {0x6e, 0xff, 0x99, 0x66}, {0x6f, 0xff, 0x66, 0x66},
			{0x70, 0xff, 0x33, 0x66}, {0x71, 0xff, 0x00, 0x66}, {0x72, 0xff, 0xff, 0x33}, {0x73, 0xff, 0xcc, 0x33},
			{0x74, 0xff, 0x99, 0x33}, {0x75, 0xff, 0x66, 0x33}, {0x76, 0xff, 0x33, 0x33}, {0x77, 0xff, 0x00, 0x33},
			{0x78, 0xff, 0xff, 0x00}, {0x79, 0xff, 0xcc, 0x00}, {0x7a, 0xff, 0x99, 0x00}, {0x7b, 0xff, 0x66, 0x00},
			{0x7c, 0xff, 0x33, 0x00}, {0x7d, 0xff, 0x00, 0x00}, {0x7e, 0xcc, 0xff, 0x66}, {0x7f, 0xcc, 0xcc, 0x66},
			{0x80, 0xcc, 0x99, 0x66}, {0x81, 0xcc, 0x66, 0x66}, {0x82, 0xcc, 0x33, 0x66}, {0x83, 0xcc, 0x00, 0x66},
			{0x84, 0xcc, 0xff, 0x33}, {0x85, 0xcc, 0xcc, 0x33}, {0x86, 0xcc, 0x99, 0x33}, {0x87, 0xcc, 0x66, 0x33},
			{0x88, 0xcc, 0x33, 0x33}, {0x89, 0xcc, 0x00, 0x33}, {0x8a, 0xcc, 0xff, 0x00}, {0x8b, 0xcc, 0xcc, 0x00},
			{0x8c, 0xcc, 0x99, 0x00}, {0x8d, 0xcc, 0x66, 0x00}, {0x8e, 0xcc, 0x33, 0x00}, {0x8f, 0xcc, 0x00, 0x00},
			{0x90, 0x99, 0xff, 0x66}, {0x91, 0x99, 0xcc, 0x66}, {0x92, 0x99, 0x99, 0x66}, {0x93, 0x99, 0x66, 0x66},
			{0x94, 0x99, 0x33, 0x66}, {0x95, 0x99, 0x00, 0x66}, {0x96, 0x99, 0xff, 0x33}, {0x97, 0x99, 0xcc, 0x33},
			{0x98, 0x99, 0x99, 0x33}, {0x99, 0x99, 0x66, 0x33}, {0x9a, 0x99, 0x33, 0x33}, {0x9b, 0x99, 0x00, 0x33},
			{0x9c, 0x99, 0xff, 0x00}, {0x9d, 0x99, 0xcc, 0x00}, {0x9e, 0x99, 0x99, 0x00}, {0x9f, 0x99, 0x66, 0x00},
			{0xa0, 0x99, 0x33, 0x00}, {0xa1, 0x99, 0x00, 0x00}, {0xa2, 0x66, 0xff, 0x66}, {0xa3, 0x66, 0xcc, 0x66},
			{0xa4, 0x66, 0x99, 0x66}, {0xa5, 0x66, 0x66, 0x66}, {0xa6, 0x66, 0x33, 0x66}, {0xa7, 0x66, 0x00, 0x66},
			{0xa8, 0x66, 0xff, 0x33}, {0xa9, 0x66, 0xcc, 0x33}, {0xaa, 0x66, 0x99, 0x33}, {0xab, 0x66, 0x66, 0x33},
			{0xac, 0x66, 0x33, 0x33}, {0xad, 0x66, 0x00, 0x33}, {0xae, 0x66, 0xff, 0x00}, {0xaf, 0x66, 0xcc, 0x00},
			{0xb0, 0x66, 0x99, 0x00}, {0xb1, 0x66, 0x66, 0x00}, {0xb2, 0x66, 0x33, 0x00}, {0xb3, 0x66, 0x00, 0x00},
			{0xb4, 0x33, 0xff, 0x66}, {0xb5, 0x33, 0xcc, 0x66}, {0xb6, 0x33, 0x99, 0x66}, {0xb7, 0x33, 0x66, 0x66},
			{0xb8, 0x33, 0x33, 0x66}, {0xb9, 0x33, 0x00, 0x66}, {0xba, 0x33, 0xff, 0x33}, {0xbb, 0x33, 0xcc, 0x33},
			{0xbc, 0x33, 0x99, 0x33}, {0xbd, 0x33, 0x66, 0x33}, {0xbe, 0x33, 0x33, 0x33}, {0xbf, 0x33, 0x00, 0x33},
			{0xc0, 0x33, 0xff, 0x00}, {0xc1, 0x33, 0xcc, 0x00}, {0xc2, 0x33, 0x99, 0x00}, {0xc3, 0x33, 0x66, 0x00},
			{0xc4, 0x33, 0x33, 0x00}, {0xc5, 0x33, 0x00, 0x00}, {0xc6, 0x00, 0xff, 0x66}, {0xc7, 0x00, 0xcc, 0x66},
			{0xc8, 0x00, 0x99, 0x66}, {0xc9, 0x00, 0x66, 0x66}, {0xca, 0x00, 0x33, 0x66}, {0xcb, 0x00, 0x00, 0x66},
			{0xcc, 0x00, 0xff, 0x33}, {0xcd, 0x00, 0xcc, 0x33}, {0xce, 0x00, 0x99, 0x33}, {0xcf, 0x00, 0x66, 0x33},
			{0xd0, 0x00, 0x33, 0x33}, {0xd1, 0x00, 0x00, 0x33}, {0xd2, 0x00, 0xff, 0x00}, {0xd3, 0x00, 0xcc, 0x00},
			{0xd4, 0x00, 0x99, 0x00}, {0xd5, 0x00, 0x66, 0x00}, {0xd6, 0x00, 0x33, 0x00}, {0xd7, 0x11, 0x11, 0x11},
			{0xd8, 0x22, 0x22, 0x22}, {0xd9, 0x44, 0x44, 0x44}, {0xda, 0x55, 0x55, 0x55}, {0xdb, 0x77, 0x77, 0x77},
			{0xdc, 0x88, 0x88, 0x88}, {0xdd, 0xaa, 0xaa, 0xaa}, {0xde, 0xbb, 0xbb, 0xbb}, {0xdf, 0xdd, 0xdd, 0xdd},
			{0xe0, 0xee, 0xee, 0xee}, {0xe1, 0xc0, 0xc0, 0xc0}, {0xe2, 0x80, 0x00, 0x00}, {0xe3, 0x80, 0x00, 0x80},
			{0xe4, 0x00, 0x80, 0x00}, {0xe5, 0x00, 0x80, 0x80}, {0xe6, 0x00, 0x00, 0x00}, {0xe7, 0x00, 0x00, 0x00},
			{0xe8, 0x00, 0x00, 0x00}, {0xe9, 0x00, 0x00, 0x00}, {0xea, 0x00, 0x00, 0x00}, {0xeb, 0x00, 0x00, 0x00},
			{0xec, 0x00, 0x00, 0x00}, {0xed, 0x00, 0x00, 0x00}, {0xee, 0x00, 0x00, 0x00}, {0xef, 0x00, 0x00, 0x00},
			{0xf0, 0x00, 0x00, 0x00}, {0xf1, 0x00, 0x00, 0x00}, {0xf2, 0x00, 0x00, 0x00}, {0xf3, 0x00, 0x00, 0x00},
			{0xf4, 0x00, 0x00, 0x00}, {0xf5, 0x00, 0x00, 0x00}, {0xf6, 0x00, 0x00, 0x00}, {0xf7, 0x00, 0x00, 0x00},
			{0xf8, 0x00, 0x00, 0x00}, {0xf9, 0x00, 0x00, 0x00}, {0xfa, 0x00, 0x00, 0x00}, {0xfb, 0x00, 0x00, 0x00},
			{0xfc, 0x00, 0x00, 0x00}, {0xfd, 0x00, 0x00, 0x00}, {0xfe, 0x00, 0x00, 0x00}, {0xff, 0x00, 0x00, 0x00},
		},
	};
	//pick the table for the requested depth; depths without a table get NULL
	switch (depth) {
		case 1:
			return &stdColorTable1;
		case 2:
			return &stdColorTable2;
		case 4:
			return &stdColorTable4;
		case 8:
			return &stdColorTable8;
		default:
			return NULL;
	}
}

//Pick the colour table that applies to a bitmap:
// 1. the bitmap's own clut, if it has one
// 2. the current screen colour table, if both the bitmap and the current screen bitmap are 8bpp
// 3. otherwise the default table for the bitmap's depth
static const struct PalmClut* PrvGetColorTable(struct PalmBitmapV3fat *bmp)
{
	const struct PalmClut *table = bmp->clut;
	
	if (!table) {
		
		if (bmp->pixelSz == 8 && halScreenGetCurBitmap()->pixelSz == 8)
			table = HALScreenGetColortable();
		else
			table = halDrawingGetDefaultColorTableForDepth(bmp->pixelSz);
	}
	
	return table;
}

//Build a table mapping source pixel indices to destination pixel indices.
// srcColorTableP: source clut, or NULL if the source uses the standard palette for its depth
// dstColorTableP: destination clut; if NULL or empty, the screen colour table is used instead
// srcDepth/dstDepth: bits per pixel of source/destination
// translation: output, one byte per source index. With a source clut, "nEntries" bytes are written;
//  without one, (1 << srcDepth) bytes are written.
//  NOTE(review): nothing here bounds nEntries by the size of the caller's buffer - confirm callers
//  never pass a source clut with more entries than their translation buffer holds
// returns: true if a source clut was provided, false otherwise
static bool PrvGetTranslationTable(const struct PalmClut *srcColorTableP, const struct PalmClut *dstColorTableP, uint32_t srcDepth, uint32_t dstDepth, uint8_t* translation)
{
	uint32_t i = 0;
	
	if (!dstColorTableP || !dstColorTableP->nEntries)
		dstColorTableP = HALScreenGetColortable();
	
	if (srcColorTableP) {
		
		const union ClutEntryAccess *srcColors = (const union ClutEntryAccess*)srcColorTableP->entries;
		const union ClutEntryAccess *dstColors = (const union ClutEntryAccess*)dstColorTableP->entries;
		uint32_t numColor = srcColorTableP->nEntries;
		uint32_t numDstColor = dstColorTableP->nEntries;
		uint32_t numComparable = (numDstColor < numColor) ? numDstColor : numColor;
		
		if (srcDepth == dstDepth) {		//they might match (at least in start). This saves us effort - check for that
			
			//only compare entries that exist in BOTH tables - the destination table may be shorter than the source one
			for (; i < numComparable && srcColors[i].rgbComponents == dstColors[i].rgbComponents; i++)
				translation[i] = i;
		}
		
		//whatever did not match positionally gets looked up in the destination table
		for (; i < numColor; i++) {
			
			struct PalmClutEntry colorTmp = srcColors[i].entry;
			
			HALDraw_FindIndexes(1, &colorTmp, dstColorTableP);
			
			translation[i] = colorTmp.idx;
		}
	}
	else if (srcDepth == dstDepth) {
			
		//no source clut and same depth: identity mapping
		for (i = 0; i < (1U << srcDepth); i++)
			translation[i] = i;
	}
	else {
		
		//no source clut and different depth: use the premade standard palette translation
		memcpy(translation, halScreenGetStandardPaletteXlationTableForDepth(srcDepth), 1 << srcDepth);
	}
	
	return !!srcColorTableP;
}

//Ordered-dither one line of 8-bit intensities (one byte per pixel) down to 1-, 2- or 4-bit pixel values
//(still stored one per byte). The bias row is picked from kBias16Gray using the low 3 bits of info->dstY and
//indexed by the low 3 bits of the running X position, which starts at info->dstX.
//info->dstY is advanced by one on every call. Output is "highest pixel value" minus the quantized intensity.
//srcP and dstP may be the same buffer: every byte is read before the matching output byte is written.
static void PrvDitherIntensity(const void* srcP, void* dstP, int32_t numSrcBytes, struct PrvConvertInfo *info)
{
	const uint8_t *biasRow = kBias16Gray + 8 * (info->dstY & 7);
	const uint8_t *in = (const uint8_t*)srcP;
	uint8_t *out = (uint8_t*)dstP;
	uint32_t depth = info->dstPixelSz;
	uint32_t highest = (1 << depth) - 1;
	int32_t col = info->dstX;
	uint32_t level;
	int32_t n;
	
	info->dstY++;
	
	if (depth == 1) {
		
		for (n = 0; n < numSrcBytes; n++, col++) {
			level = in[n];
			level -= level / 2;
			level += biasRow[col & 7] * 2;
			out[n] = highest - (level >> 7);
		}
	}
	else if (depth == 2) {
		
		for (n = 0; n < numSrcBytes; n++, col++) {
			level = in[n];
			level -= level / 4;
			level += biasRow[col & 7];
			out[n] = highest - (level >> 6);
		}
	}
	else {	//4bpp destination. no other cases exist anyways, and PalmOS does this
		
		for (n = 0; n < numSrcBytes; n++, col++) {
			level = in[n];
			level -= level / 16;
			level += biasRow[col & 7] / 4;
			out[n] = highest - (level >> 4);
		}
	}
}

//Midpoint (Bresenham) circle walk over one octant, mirrored into the second one.
//On return, points[i] holds the circle's extent along one axis at distance "i" along the other axis.
//Indices written are in 0..radius, so "points" needs room for "radius + 1" elements at most.
//radius must not be negative (a negative radius would index before the start of the array).
static void PrvBresenhamCircle(int32_t radius, int16_t* points)
{
	int32_t col, row = radius, decision = 1 - radius, stepE = 3, stepSE = 5 - 2 * radius;
	
	for (col = 0; row > col; col++, stepE += 2) {
		
		points[col] = row;
		
		if (decision >= 0) {	//step diagonally: also record the mirrored point
			
			points[row] = col;
			
			decision += stepSE;
			stepSE += 4;
			row--;
		}
		else {					//step straight
			
			decision += stepE;
			stepSE += 2;
		}
	}
	
	//the point where the two octants meet
	points[row] = col;
}

//Copy a rectangle of an uncompressed source bitmap into a compressed destination bitmap, compressing it
//line by line as we go.
// fromX/fromY: top left of the area to take from the source
// width/height: size of the area
//The compressed size (including the size field itself) is stored right before the destination data, as a u16
//when the destination's struct is 2 bytes longer than 0x24, else as a u32.
//Returns errNone on success, or 0x2302 if memory could not be allocated, the compressor reported failure,
//or the output grew past the size budget.
static Err PrvBlitCompress(const struct PalmBitmapV3fat *srcBitmapP, struct PalmBitmapV3fat *dstBitmapP, int32_t fromX, int32_t fromY, int32_t width, int32_t height)
{
	uint32_t srcRowBytes = srcBitmapP->stride;
	uint32_t dstRowBytes = dstBitmapP->stride;
	uint32_t scanLineSize = (dstRowBytes < srcRowBytes) ? srcRowBytes : dstRowBytes;
	uint32_t budget = dstRowBytes * height;
	uint8_t *outStart = (uint8_t*)dstBitmapP->data;
	uint8_t *outCur = outStart, *outLimit;
	uint8_t *lineCur = NULL, *linePrev = NULL, *swapTmp;
	const uint8_t *srcLine = (const uint8_t*)srcBitmapP->data;
	uint8_t comprType = PALM_BMP_COMPRES_TYPE_SCAN_LINE;
	union ComprState compState = {};
	struct PrvCopyLineInfo copyInfo;
	PrvCopyLineFunc func;
	Err err = errNone;
	int32_t result;
	
	//only v2+ bitmaps get to specify a compression type
	if (dstBitmapP->version >= 2)
		comprType = dstBitmapP->compressionType;
	
	//two line buffers are needed since some compression protocols need the previous line in its entirety
	lineCur = (uint8_t*)kheapAlloc(scanLineSize);
	linePrev = (uint8_t*)kheapAlloc(scanLineSize);
	if (!lineCur || !linePrev) {
		err = 0x2302;
		goto out;
	}
	
	memset(lineCur, 0, scanLineSize);
	memset(linePrev, 0, scanLineSize);
	
	func = PrvCopyLineInit(dstBitmapP->pixelSz, srcBitmapP->pixelFormat, dstBitmapP->pixelFormat, fromX, 0, width, &copyInfo);
	
	if (comprType == PALM_BMP_COMPRES_TYPE_PACK_BITS)
		compState.packBits.bpp = dstBitmapP->pixelSz;
	
	srcLine += fromY * srcRowBytes;
	
	//v2 bitmaps store the compressed size in a u16, which limits what we can represent
	if (dstBitmapP->structSz - 0x24 == 2 && budget > 65533)
		budget = 65533;
	
	//leave headroom of two rows and four bytes (unsigned math, same as doing the two subtractions separately)
	budget -= dstRowBytes * 2 + 4;
	outLimit = outCur + budget;
	
	for (; height > 0; height--) {
		
		if (outCur >= outLimit) {
			err = 0x2302;
			break;
		}
		
		func(srcLine, lineCur, 1, &copyInfo);
		result = PrvScrCompress(comprType, lineCur, dstRowBytes, outCur, 0x4000000, &compState);
		if (result < 0) {
			err = 0x2302;
			break;
		}
		
		outCur += result;
		srcLine += srcRowBytes;
		
		//exchange the buffers so that the line we just did is kept around as the "previous line"
		swapTmp = lineCur;
		lineCur = linePrev;
		linePrev = swapTmp;
	}
	
	//record the compressed size (this is done even if we bailed out of the loop above)
	if (dstBitmapP->structSz - 0x24 == sizeof(uint16_t))
		((uint16_t*)outStart)[-1] = outCur - outStart + sizeof(uint16_t);
	else
		((uint32_t*)outStart)[-1] = outCur - outStart + sizeof(uint32_t);

out:
	if (lineCur)
		kheapFree(lineCur);
	if (linePrev)
		kheapFree(linePrev);
	
	return err;
}

//Draw a bitmap onto a canvas.
// canvasP: destination canvas. Its bitmap is drawn into, its drawState supplies the transfer mode and the fore/back colors
// srcBitmapP: the bitmap to draw (may be compressed, may be of another depth and/or density than the destination)
// dstClippedP: destination rectangle, in destination coordinates
// offsetX/offsetY: subtracted from the destination top left to get the (destination-density) source coordinates
//The work is done in this order:
// 1. figure out density scaling mode
// 2. if destination is compressed: try to compress straight into it, else treat it as uncompressed from here on
// 3. try the fast paths (matched-color copy, 8bpp transparent blit, small 1bpp blit)
// 4. else go line by line: (decompress) -> convert to 8bpp or scale 16bpp -> (dither) -> transfer into destination
//Always returns errNone, even if we ran out of memory or did not know how to draw.
Err DALEXPORT impl_HALDraw_Bitmap(struct PalmCanvas* canvasP, const struct PalmBitmapV3fat *srcBitmapP, const struct RectangleType *dstClippedP, int32_t offsetX, int32_t offsetY)
{
	static const uint8_t k8ToIntensity[] = {0xFF, 0xDB, 0xB6, 0x92, 0x6D, 0x49, 0xFB, 0xD7, 0xB2, 0x8E, 0x69, 0x45, 0xF8, 0xD3, 0xAF, 0x8A, 0x66, 0x41, 0xF4, 0xD0, 0xAB, 0x87, 0x62, 0x3E, 0xF0, 0xCC, 0xA8, 0x83, 0x5F, 0x3A, 0xED, 0xC8, 0xA4, 0x7F, 0x5B, 0x36, 0xE9, 0xC5, 0xA0, 0x7C, 0x57, 0x33, 0xE6, 0xC1, 0x9D, 0x78, 0x54, 0x2F, 0xE2, 0xBD, 0x99, 0x75, 0x50, 0x2C, 0xDE, 0xBA, 0x95, 0x71, 0x4D, 0x28, 0xDB, 0xB6, 0x92, 0x6D, 0x49, 0x24, 0xD7, 0xB3, 0x8E, 0x6A, 0x45, 0x21, 0xD4, 0xAF, 0x8B, 0x66, 0x42, 0x1D, 0xD0, 0xAB, 0x87, 0x62, 0x3E, 0x1A, 0xCC, 0xA8, 0x83, 0x5F, 0x3A, 0x16, 0xC9, 0xA4, 0x80, 0x5B, 0x37, 0x12, 0xC5, 0xA1, 0x7C, 0x58, 0x33, 0x0F, 0xC1, 0x9D, 0x78, 0x54, 0x2F, 0x0B, 0xF4, 0xD0, 0xAB, 0x87, 0x62, 0x3E, 0xF0, 0xCC, 0xA7, 0x83, 0x5E, 0x3A, 0xED, 0xC8, 0xA4, 0x7F, 0x5B, 0x36, 0xE9, 0xC5, 0xA0, 0x7C, 0x57, 0x33, 0xE5, 0xC1, 0x9D, 0x78, 0x54, 0x2F, 0xE2, 0xBD, 0x99, 0x74, 0x50, 0x2B, 0xDE, 0xBA, 0x95, 0x71, 0x4C, 0x28, 0xDB, 0xB6, 0x92, 0x6D, 0x49, 0x24, 0xD7, 0xB2, 0x8E, 0x6A, 0x45, 0x21, 0xD3, 0xAF, 0x8A, 0x66, 0x42, 0x1D, 0xD0, 0xAB, 0x87, 0x62, 0x3E, 0x19, 0xCC, 0xA8, 0x83, 0x5F, 0x3A, 0x16, 0xC9, 0xA4, 0x80, 0x5B, 0x37, 0x12, 0xC5, 0xA0, 0x7C, 0x57, 0x33, 0x0F, 0xC1, 0x9D, 0x78, 0x54, 0x2F, 0x0B, 0xBE, 0x99, 0x75, 0x50, 0x2C, 0x07, 0xBA, 0x96, 0x71, 0x4D, 0x28, 0x04, 0xB6, 0x92, 0x6D, 0x49, 0x24, 0x11, 0x22, 0x44, 0x55, 0x77, 0x88, 0xAA, 0xBB, 0xDD, 0xEE, 0xC0, 0x1B, 0x24, 0x5C, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
	static const uint8_t k4ToIntensity[] = {0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00};
	static const uint8_t k2ToIntensity[] = {0xff, 0xaa, 0x55, 0x00};
	static const uint8_t k1ToIntensity[] = {0xff, 0x00};
	
	
	int32_t i, y, fromX, fromY, toX, toY, width, height, srcRowBytes, dstRowBytes, srcYDecision, srcOffset, count, decrement;
	uint32_t transparentPixel, backPixel, forePixel, srcByteOffset = 0, pixelsPerWord, numSrcBytes = 0;
	bool scrolling, matchedColors, canCopy = false, dithering, compressed, mustReconvert, convertNow;
	enum PalmWinDrawOperation transferMode = canvasP->drawState->drawMode;
	uint8_t *dstP, *srcP, *compressedP = NULL, *compPtr = NULL;
	enum HalDrawingScalingMode scaleMode = halDrawScaleNone;
	struct PalmBitmapV3fat *dstBitmapP = canvasP->bmp;
	uint8_t compType = PALM_BMP_COMPRES_TYPE_NONE;
	uint8_t srcPixelSize = srcBitmapP->pixelSz;
	uint8_t dstPixelSize = dstBitmapP->pixelSz;
	uint16_t srcDensity = srcBitmapP->density;
	uint16_t dstDensity = dstBitmapP->density;
	struct PrvConvertInfo convertInfo = {};
	PrvBlit8_or_16func transferFunc = NULL;
	struct PrvCopyLineInfo copyInfo = {};
	PrvConvertFunc ditherFunc = NULL;
	PrvCopyLineFunc copyFunc = NULL;
	union ComprState compState = {};
	PrvConvertFunc cvtFunc = NULL;
	uint8_t translateMem[256];
	struct PrvBlitInfo info = {};	//zeroed so that fields we read later (eg: needsSwap) are defined even on paths that never set them here
	void* ptr = NULL;
	
	static uint16_t XRAM1 xlate16_tmp[256];	//not sure if ever used. in garnet this is srcGlobals + 0x12c
	
	//pick a scaling mode based on the ratio of densities (16.16 fixed point)
	if ((srcDensity != dstDensity) && !srcBitmapP->neverDensityScale) {
		
		switch ((((uint32_t)dstDensity) << 16) / srcDensity) {
			case 0x8000:
				scaleMode = halDrawScaleOneHalf;
				break;
			case 0xAAAA:
			case 0xAAAB:
				scaleMode = halDrawScaleTwoThirds;
				break;
			case 0xC000:
				scaleMode = halDrawScaleThreeQuarters;
				break;
			case 0x18000:
				scaleMode = halDrawScaleOneAndHalf;
				break;
			case 0x20000:
				scaleMode = halDrawScaleDouble;
				break;
			case 0x30000:
				scaleMode = halDrawScaleTriple;
				break;
			case 0x40000:
				scaleMode = halDrawScaleQuadruple;
				break;
			default:
				logw("Not sure how to scale bitmap from density %u to density %u. Will not scale\n", srcDensity, dstDensity);
		}
	}
	
	toX = dstClippedP->topLeft.x;
	toY = dstClippedP->topLeft.y;
	width = dstClippedP->extent.x;
	height = dstClippedP->extent.y;
	fromX = toX - offsetX;
	fromY = toY - offsetY;
	
	if (width <= 0 || height <= 0)
		goto cleanup;
	
	if (dstBitmapP->compressed) {
	
		uint32_t fieldSize;
		
		//simplest case can be compressed directly into the destination
		if (!srcBitmapP->compressed && srcPixelSize == dstPixelSize && transferMode == palmWinDrawOpWinPaint && !toX && !toY && scaleMode == halDrawScaleNone && errNone == PrvBlitCompress(srcBitmapP, dstBitmapP, fromX, fromY, width, height))
			goto exit_notify;
		
		//else: reclaim the "compressed size" field and treat the destination as uncompressed from now on
		fieldSize = dstBitmapP->structSz - sizeof(struct PalmBitmapV3fat);
		dstBitmapP->data = ((uint8_t*)dstBitmapP->data) - fieldSize;
		dstBitmapP->compressed = 0;
	}
	
	scrolling = dstBitmapP->data == srcBitmapP->data;
	backPixel = canvasP->drawState->backColorIdx;
	forePixel = canvasP->drawState->foreColorIdx;
	srcRowBytes = srcBitmapP->stride;
	dstRowBytes = dstBitmapP->stride;
	
	dstP = (uint8_t*)dstBitmapP->data;
	dstP += toY * dstRowBytes;
	
	matchedColors = scrolling || (srcPixelSize == dstPixelSize && (!srcBitmapP->hasColorTable || !dstBitmapP->hasColorTable) && (srcPixelSize != 1 || (backPixel == 0 && forePixel == 1)));
	
	//convert the source row we start at from destination density to source density
	switch (scaleMode) {
		case halDrawScaleNone:
			//nothing
			break;
		
		case halDrawScaleOneHalf:
			fromY *= 2;
			break;
		
		case halDrawScaleTwoThirds:
			fromY = (fromY * 3 + 1) / 2;
			break;
		
		case halDrawScaleThreeQuarters:
			fromY = (fromY * 4 + 1) / 3;
			break;
		
		case halDrawScaleOneAndHalf:
			fromY = fromY * 2 / 3;
			break;
		
		case halDrawScaleDouble:
			fromY = fromY / 2;
			break;
		
		case halDrawScaleTriple:
			fromY = fromY / 3;
			break;
		
		case halDrawScaleQuadruple:
			fromY = fromY / 4;
			break;
			
	}
	srcP = srcBitmapP->compressed ? NULL : ((uint8_t*)srcBitmapP->data) + fromY * srcRowBytes;	//NUL to catch if we ever use SRC on compressed data (unsafe)

	if (transferMode == palmWinDrawOpWinPaint && srcBitmapP->hasTransparency)
		transferMode = palmWinDrawOpWinOverlay;

	//shortcut some likely drawing operations in paint mode
	if (transferMode == palmWinDrawOpWinPaint && matchedColors && !srcBitmapP->compressed) {

		if (scaleMode == halDrawScaleNone) {
			
			//scrolling down in the same bitmap: go bottom-up so we do not overwrite rows before we read them
			if (fromY < toY && scrolling) {
				
				int32_t offset = (height - 1) * srcRowBytes;

				srcP += offset;
				dstP += offset;
				srcRowBytes = -srcRowBytes;
				dstRowBytes = -dstRowBytes;
			}
			
			PrvBlitCopyMatched(srcP, srcRowBytes, fromX, srcBitmapP->pixelFormat, dstP, dstRowBytes, toX, dstBitmapP->pixelFormat, dstPixelSize, width, height, fromY == toY && scrolling && fromX < toX);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleDouble && srcPixelSize == 8 && !((fromX | fromY | width | height) & 1)) {
			
			PrvDraw8BitCopyDoubled(srcP + (fromX >> 1), dstP + toX, srcRowBytes, dstRowBytes, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleTriple && srcPixelSize == 8 && !(fromX % 3) && !(fromY % 3) && !(width % 3) && !(height % 3)) {
			
			PrvDraw8BitCopyTripled(srcP + (fromX / 3), dstP + toX, srcRowBytes, dstRowBytes, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleQuadruple && srcPixelSize == 8 && !((fromX | fromY | width | height) & 3)) {
			
			PrvDraw8BitCopyQuadrupled(srcP + (fromX / 4), dstP + toX, srcRowBytes, dstRowBytes, height, width);
			goto exit_notify;
		}
	}
	
	//shortcut some likely drawing operations in overlay mode
	transparentPixel = srcBitmapP->transparentValue;
	if (transferMode == palmWinDrawOpWinOverlay && srcPixelSize == 8 && dstPixelSize == 8 && srcBitmapP != dstBitmapP && matchedColors && !srcBitmapP->compressed) {

		if (scaleMode == halDrawScaleNone) {
			
			PrvDraw8BitTransBitmap(srcP + fromX, dstP + toX, srcRowBytes - width, dstRowBytes - width, transparentPixel, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleDouble && !((fromX | fromY | width | height) & 1)) {
			
			PrvDraw8BitTransBitmapDoubled(srcP + (fromX >> 1), dstP + toX, srcRowBytes, dstRowBytes, transparentPixel, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleQuadruple && !((fromX | fromY | width | height) & 3)) {
			
			PrvDraw8BitTransBitmapQuadrupled(srcP + (fromX >> 2), dstP + toX, srcRowBytes, dstRowBytes, transparentPixel, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleTriple && !(fromX % 3) && !(fromY % 3) && !(width % 3) && !(height % 3)) {
			
			PrvDraw8BitTransBitmapTripled(srcP + (fromX / 3), dstP + toX, srcRowBytes, dstRowBytes, transparentPixel, height, width);
			goto exit_notify;
		}
		else if (scaleMode == halDrawScaleOneAndHalf) {
			
			PrvDraw8BitTransBitmapOneAndOneHalf(srcP + fromX, dstP + toX, srcRowBytes, dstRowBytes, transparentPixel, height, width);
			goto exit_notify;
		}
	}
	
	//figure out the translation table
	if (dstPixelSize <= 8) {
		
		if (matchedColors || srcPixelSize == 16) {
			
			convertInfo.translate = k8UnityMapping;
		}
		else if (srcPixelSize == 1 && !srcBitmapP->clut) {
			
			translateMem[0] = backPixel;
			translateMem[1] = forePixel;
			convertInfo.translate = translateMem;
		}
		else if (!srcBitmapP->clut && (dstBitmapP->clut || (dstBitmapP->forScreen && !mClutEndMismatchMask && mClutState == halDrawingClutIsStandard))) {
			
			convertInfo.translate = PrvStandardIndexMapping(srcPixelSize, dstPixelSize);
		}
		else if (srcPixelSize > 8) {
			
			convertInfo.translate = k8UnityMapping;
		}
		else {
			
			if (PrvGetTranslationTable(srcBitmapP->clut, dstBitmapP->clut, srcPixelSize, dstPixelSize, translateMem) && srcPixelSize == 1) {
				
				translateMem[0] = backPixel;
				translateMem[1] = forePixel;
			}
			convertInfo.translate = translateMem;
		}
	}
	else {
		
		forePixel = PrvRGBColorToRGB565(&canvasP->drawState->foreColorRGB, dstBitmapP->pixelFormat);
		backPixel = PrvRGBColorToRGB565(&canvasP->drawState->backColorRGB, dstBitmapP->pixelFormat);

		if (srcPixelSize == 1 && !srcBitmapP->clut) {
			convertInfo.translate = k8UnityMapping;
			xlate16_tmp[0] = backPixel;
			xlate16_tmp[1] = forePixel;
		}
		else if (srcPixelSize <= 8) {
			
			if (PrvGetTranslation16Table(srcBitmapP->clut, srcPixelSize, translateMem, xlate16_tmp, dstBitmapP->pixelFormat) && srcPixelSize == 1) {
				
				backPixel = xlate16_tmp[translateMem[0]];
				forePixel = xlate16_tmp[translateMem[1]];
			}
			convertInfo.translate = translateMem;
		}
	}
	
	//handle likely ops for 1bpp source
	if (srcPixelSize == 1 && !srcBitmapP->compressed) {
		
		if (scaleMode == halDrawScaleNone && srcBitmapP->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED && (width < 40 || height < 5)) {
			
			struct BitonalBlitInfo bitonalInfo;
			
			//replicate the pixel values so that they fill 16 bits, whatever the destination depth is
			switch (dstPixelSize) {
			
				case 1:
					backPixel |= backPixel << 1;
					forePixel |= forePixel << 1;
					//fallthrough
				case 2:
					backPixel |= backPixel << 2;
					forePixel |= forePixel << 2;
					//fallthrough
				case 4:
					backPixel |= backPixel << 4;
					forePixel |= forePixel << 4;
					//fallthrough
				case 8:
					backPixel |= backPixel << 8;
					forePixel |= forePixel << 8;
					//fallthrough
				default:
					PrvSetupBitonalBlit(transferMode, dstBitmapP, toY, transparentPixel, backPixel, forePixel, &bitonalInfo);
					PrvBlitBitonal(&bitonalInfo, srcP, srcRowBytes, fromX, toX, width, height);
					goto exit_notify;
			}
		}
	}

	srcYDecision = dstDensity;

	//sort out whether we can dither
	dithering = srcPixelSize > dstPixelSize && !srcBitmapP->noDither && transferMode == palmWinDrawOpWinPaint;
	
	if (srcBitmapP->compressed) {
		compPtr = (uint8_t*)kheapAlloc(srcRowBytes);
		if (!compPtr)
			goto cleanup;
	
		memset(compPtr, 0, srcRowBytes);
		compType = srcBitmapP->version < 2 ? PALM_BMP_COMPRES_TYPE_SCAN_LINE : srcBitmapP->compressionType;
		if (compType == PALM_BMP_COMPRES_TYPE_PACK_BITS)
			compState.packBits.bpp = srcPixelSize;

		srcP = compPtr;
		compressedP = (uint8_t*)srcBitmapP->data;
		
		//fast forward to source line that has the first data we care about (we need to decompress along the way as decompression is stateful)
		compressedP += PrvScrDecompress(compType, compressedP, 0x4000000, compPtr, srcRowBytes, &compState);
		
		if (0) {		//XXX: fromY is already scaled by now so we do not need to do this this way
						// on the other hand, this IS what treo680 does...
			for (y = 0; y < fromY; y++) {
				srcYDecision -= srcDensity;
				while (srcYDecision <= 0) {
					srcYDecision += dstDensity;
					compressedP += PrvScrDecompress(compType, compressedP, 0x4000000, compPtr, srcRowBytes, &compState);
				}
			}
		}
		else {
			
			for (y = 0; y < fromY; y++) {
				compressedP += PrvScrDecompress(compType, compressedP, 0x4000000, compPtr, srcRowBytes, &compState);
			}
		}
	}
	
	if (srcPixelSize == 16 && dstPixelSize == 16) {
		
		info.backgroundColor = backPixel;
		info.numFullPixels = width * 2;
		info.srcOfst = 0;
		info.dstOfst = toX * 2;
		info.needsSwap = srcBitmapP->pixelFormat != dstBitmapP->pixelFormat;
		info.transparentColor = (srcBitmapP->pixelFormat == PALM_BMP_PIXEL_FORMAT_RGB565_BE) ? __builtin_bswap16(transparentPixel) : transparentPixel;

		switch (scaleMode) {
			case halDrawScaleNone:
				srcByteOffset = fromX * 2;
				break;
			
			case halDrawScaleOneHalf:
				cvtFunc = PrvConvert16To16Halved;
				convertInfo.outputCount = width;
				srcByteOffset = fromX * 4;
				break;
				
			case halDrawScaleTwoThirds:
				cvtFunc = PrvConvert16To16TwoThirds;
				convertInfo.outputCount = width;
				srcByteOffset = (fromX * 3) / 2 * 2;	//bottom bit clear is a must
				break;
				
			case halDrawScaleThreeQuarters:
				cvtFunc = PrvConvert16To16ThreeQuarters;
				convertInfo.outputCount = width;
				srcByteOffset = (fromX * 4) / 3 * 2;	//bottom bit clear is a must
				break;
				
			case halDrawScaleOneAndHalf:
				cvtFunc = PrvConvert16To16OneAndOneHalf;
				convertInfo.outputCount = width;
				srcByteOffset = fromX * 2 / 3 * 2;	//bottom bit clear is a must
				break;
			
			case halDrawScaleDouble:
				cvtFunc = PrvConvert16To16Doubled;
				convertInfo.outputCount = width;
				srcByteOffset = fromX &~ 1;
				info.srcOfst = fromX & 1;
				break;
			
			case halDrawScaleTriple:
				cvtFunc = PrvConvert16To16Tripled;
				convertInfo.outputCount = width;
				srcByteOffset = (fromX * 2 / 3) &~ 1;
				info.srcOfst = fromX - srcByteOffset * 3 / 2;
				break;
			
			case halDrawScaleQuadruple:
				cvtFunc = PrvConvert16To16Quadrupled;
				convertInfo.outputCount = width;
				srcByteOffset = (fromX * 2 / 4) &~ 1;
				info.srcOfst = fromX - srcByteOffset * 4 / 2;
				break;
				
			default:
				goto cleanup;
		}
		
		//scaled lines get generated into a temporary buffer
		if (scaleMode != halDrawScaleNone) {
			
			ptr = kheapAlloc(2 + width * 2);
			if (!ptr)
				goto cleanup;
		}
		
		switch (transferMode) {
			case palmWinDrawOpWinErase:			transferFunc = PrvBlit16To16Erase;		break;
			case palmWinDrawOpWinMask:			transferFunc = PrvBlit16To16Mask;		break;
			case palmWinDrawOpWinInvert:		transferFunc = PrvBlit16To16XOR;		break;
			case palmWinDrawOpWinOverlay:		transferFunc = PrvBlit16To16Overlay;	break;
			case palmWinDrawOpWinPaintInverse:	transferFunc = PrvBlit16To16NotCopy;	break;
			default:							transferFunc = PrvBlit16To16Copy;		break;
		}
		
		//goes to join9
	}
	else {
		switch (srcPixelSize) {
			case 1:
				cvtFunc = PrvConvert1To8;
				pixelsPerWord = 16;
				break;
			case 2:
				cvtFunc = PrvConvert2To8;
				pixelsPerWord = 8;
				break;
			case 4:
				cvtFunc = PrvConvert4To8;
				pixelsPerWord = 4;
				break;
			case 8:
				cvtFunc = PrvConvert8To8;
				pixelsPerWord = 2;
				break;
			case 16:
				pixelsPerWord = 1;
				convertInfo.dstPixelSz = dstPixelSize;
				if (!dithering) {
					cvtFunc = PrvConvert16To8;
					convertInfo.dstClut = PrvGetColorTable(dstBitmapP);
				}
				else {
					//both of these get recalculated in the per-scale-mode switch below
					convertInfo.dstX = toX;
					convertInfo.dstY = toY;
					if (dstPixelSize == 8)
						cvtFunc = PrvConvert16To8Dither;
					else {
						cvtFunc = PrvConvert16To8Intensity;
						ditherFunc = PrvDitherIntensity;
					}
				}
				break;
			default:
				goto cleanup;
		}
		
		if (srcBitmapP->pixelFormat == PALM_BMP_PIXEL_FORMAT_RGB565_LE || srcBitmapP->pixelFormat == PALM_BMP_PIXEL_FORMAT_INDEXED_LE)
			convertInfo.needsBitswap = 1;

		if (scaleMode != halDrawScaleNone) {
			
			switch (scaleMode) {
				case halDrawScaleOneHalf:
					pixelsPerWord = pixelsPerWord / 2;
					break;
				
				case halDrawScaleTwoThirds:
					pixelsPerWord = pixelsPerWord * 2 / 3;
					break;
				
				case halDrawScaleThreeQuarters:
					pixelsPerWord = pixelsPerWord * 3 / 4;
					break;
				
				case halDrawScaleOneAndHalf:
					pixelsPerWord = pixelsPerWord * 3 / 2;
					break;
				
				case halDrawScaleDouble:
					pixelsPerWord = pixelsPerWord * 2;
					break;
				
				case halDrawScaleTriple:
					pixelsPerWord = pixelsPerWord * 3;
					break;
				
				case halDrawScaleQuadruple:
					pixelsPerWord = pixelsPerWord * 4;
					break;
				
				default:
					goto cleanup;
			}
			
			convertInfo.scaleMode = scaleMode;
			if (cvtFunc != PrvConvert16To8Dither && cvtFunc != PrvConvert16To8Intensity && cvtFunc != PrvConvert16To8) {
			
				switch (scaleMode) {
					case halDrawScaleOneHalf:
						cvtFunc = PrvConvertNto8Halved;
						break;
					
					case halDrawScaleTwoThirds:
						cvtFunc = PrvConvertNto8TwoThirds;
						break;
					
					case halDrawScaleThreeQuarters:
						cvtFunc = PrvConvertNto8ThreeQuarters;
						break;
					
					case halDrawScaleOneAndHalf:
						cvtFunc = PrvConvertNto8OneAndOneHalf;
						break;
					
					case halDrawScaleDouble:
						cvtFunc = PrvConvertNto8Doubled;
						break;
					
					case halDrawScaleTriple:
						cvtFunc = PrvConvertNto8Tripled;
						break;
					
					case halDrawScaleQuadruple:
						cvtFunc = PrvConvertNto8Quadrupled;
						break;
					
					default:
						goto cleanup;
				}
				convertInfo.srcPixelSize = srcPixelSize;
			}
		}

		if (pixelsPerWord <= 1)
			srcOffset = 0;
		else if (scaleMode == halDrawScaleOneAndHalf || scaleMode == halDrawScaleTwoThirds || scaleMode == halDrawScaleThreeQuarters)
			srcOffset = fromX % (pixelsPerWord >> 1);
		else
			srcOffset = fromX & ((pixelsPerWord >> 1) - 1);

		//where in each source line we start, how many source bytes we need per line, and the dithering origin
		switch (scaleMode) {
			case halDrawScaleNone:
				srcByteOffset = (srcPixelSize * fromX) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 7) >> 3) - ((srcPixelSize * fromX) >> 3);
				convertInfo.dstX = fromX;
				convertInfo.dstY = fromY;
				break;
			
			case halDrawScaleOneHalf:
				srcByteOffset = (srcPixelSize * 2 * fromX) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 3) >> 2) - ((srcPixelSize * fromX) >> 2);
				convertInfo.dstX = fromX / 2;
				convertInfo.dstY = fromY / 2;
				break;
			
			case halDrawScaleTwoThirds:
				srcByteOffset = (((3 * srcPixelSize * fromX) >> 3) + 1) / 2;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 7) >> 3) - ((srcPixelSize * fromX) >> 3);
				numSrcBytes = (3 * numSrcBytes) / 2;	//safe since we know value fits in 30 bits (see above)
				convertInfo.dstX = 2 * fromX / 3;
				convertInfo.dstY = 2 * fromY / 3;
				break;
			
			case halDrawScaleThreeQuarters:
				srcByteOffset = (srcPixelSize * (2 * fromX / 3)) >> 2;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 7) >> 3) - ((srcPixelSize * fromX) >> 3);
				numSrcBytes = (4 * numSrcBytes) / 3;	//safe since we know value fits in 30 bits (see above)
				convertInfo.dstX = 3 * fromX / 4;
				convertInfo.dstY = 3 * fromY / 4;
				break;
			
			case halDrawScaleOneAndHalf:
				srcByteOffset = (srcPixelSize * (2 * fromX / 3)) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 7) >> 3) - ((srcPixelSize * fromX) >> 3);
				numSrcBytes = (2 * numSrcBytes + 2) / 3;
				if (width + srcOffset > (int32_t)(pixelsPerWord * numSrcBytes / 2))
					numSrcBytes++;
				convertInfo.dstX = 3 * fromX / 2;
				convertInfo.dstY = 3 * fromY / 2;
				break;
			
			case halDrawScaleDouble:
				srcByteOffset = (srcPixelSize * (fromX >> 1)) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 15) >> 4) - ((srcPixelSize * fromX) >> 4);
				convertInfo.dstX = fromX * 2;
				convertInfo.dstY = fromY * 2;
				break;
			
			case halDrawScaleTriple:
				srcByteOffset = (srcPixelSize * (fromX / 3)) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 23) / 24) - ((srcPixelSize * fromX) / 24);
				convertInfo.dstX = fromX * 3;
				convertInfo.dstY = fromY * 3;
				break;
			
			case halDrawScaleQuadruple:
				srcByteOffset = (srcPixelSize * (fromX / 4)) >> 3;
				numSrcBytes = ((srcPixelSize * (fromX + width) + 31) / 32) - ((srcPixelSize * fromX) / 32);
				convertInfo.dstX = fromX * 4;
				convertInfo.dstY = fromY * 4;
				break;
			
			default:
				goto cleanup;
		}

		PrvBlit8ToNInit(&info, dstPixelSize, dstBitmapP->pixelFormat, srcOffset, toX, width);
		info.xlate = xlate16_tmp;
		convertInfo.dstX += 8 - info.srcOfst;
		ptr = kheapAlloc((scaleMode == halDrawScaleTwoThirds || scaleMode == halDrawScaleOneAndHalf || scaleMode == halDrawScaleThreeQuarters) ? (3LL * (width + 32)) / 2 : width + 32);
		if (!ptr)
			goto cleanup;
		
		//when dithering down from an indexed source, we convert indices to intensities, and then dither those
		if (dithering && srcPixelSize <= 8 && dstPixelSize < srcPixelSize) {
			
			convertInfo.dstPixelSz = dstPixelSize;
			ditherFunc = PrvDitherIntensity;
			
			switch (srcPixelSize) {
				case 8:	convertInfo.translate = k8ToIntensity;	break;
				case 4:	convertInfo.translate = k4ToIntensity;	break;
				case 2:	convertInfo.translate = k2ToIntensity;	break;
				case 1:	convertInfo.translate = k1ToIntensity;	break;
			}
		}

		//modes that care about transparency need the transparent color expressed as a destination index
		if (transferMode == palmWinDrawOpWinErase || transferMode == palmWinDrawOpWinMask || transferMode == palmWinDrawOpWinOverlay) {
			
			if (srcPixelSize != 16) 
				info.transparentColor = convertInfo.translate[transparentPixel];
			else {
				
				uint16_t rgb565 = transparentPixel;
				uint8_t index;
				
				PrvRGB565ToIndices(&rgb565, &index, 1, PrvGetColorTable(dstBitmapP), true);
				info.transparentColor = index;
			}
		}

		switch (transferMode) {
			
			case palmWinDrawOpWinPaint:
				transferFunc = PrvBlit8ToNCopy;
				break;
			
			case palmWinDrawOpWinErase:
				transferFunc = PrvBlit8ToNErase;
				info.backgroundColor = backPixel;
				break;
			
			case palmWinDrawOpWinMask:
				transferFunc = PrvBlit8ToNMask;
				info.backgroundColor = backPixel;
				break;
			
			case palmWinDrawOpWinInvert:
				transferFunc = PrvBlit8ToNXOR;
				break;
			
			case palmWinDrawOpWinOverlay:
				transferFunc = PrvBlit8ToNOverlay;
				break;
			
			case palmWinDrawOpWinPaintInverse:
				//inverse paint is done as a plain copy through a reversed translation table
				count = (srcPixelSize > 8) ? (1 << dstPixelSize) : (1 << srcPixelSize);
				if (convertInfo.translate == translateMem) {
					for (i = 0; i < count / 2; i++) {
						uint32_t tmp = translateMem[i];
						translateMem[i] = translateMem[count - i - 1];
						translateMem[count - i - 1] = tmp;
					}
				}
				else {
					for (i = 0; i < count; i++)
						translateMem[i] = convertInfo.translate[count - i - 1];
					convertInfo.translate = translateMem;
				}
				transferFunc = PrvBlit8ToNCopy;	//without this transferFunc would stay NULL and get called in the line loop below
				break;
			
			default:
				transferFunc = PrvBlit8ToNCopy;
				break;	
		}
	}

	if (srcBitmapP->compressed) {
	
		//if no conversion of any sort is needed, decompressed lines can be copied directly to the destination
		canCopy = (scaleMode == halDrawScaleNone) && ((transferFunc == PrvBlit8ToNCopy && convertInfo.translate == k8UnityMapping) || transferFunc == PrvBlit16To16Copy) && !info.needsSwap && srcPixelSize == dstPixelSize;

		if (canCopy)
			copyFunc = PrvCopyLineInit(dstPixelSize, srcBitmapP->pixelFormat, dstBitmapP->pixelFormat, fromX, toX, width, &copyInfo);
		srcP = compPtr;
	}

	srcP += srcByteOffset;
	compressed = srcBitmapP->compressed;
	mustReconvert = (cvtFunc == PrvConvert16To8Dither) || ditherFunc;	//dithered output differs line to line, even for a repeated source line
	convertNow = !!cvtFunc;
	decrement = scaleMode == halDrawScaleNone ? dstDensity : srcDensity;

	//one iteration per destination line. srcYDecision tracks when it is time to advance to the next source line
	for (i = 0; i < height; i++) {
		
		while (srcYDecision <= 0) {
			
			srcYDecision += dstDensity;
			
			if (compressed)
				compressedP += PrvScrDecompress(compType, compressedP, 0x4000000, compPtr, srcRowBytes, &compState);
			else
				srcP += srcRowBytes;
			convertNow = !!cvtFunc;
		}
		
		if (canCopy)
			copyFunc(compPtr, dstP, 1, &copyInfo);
		else {
			if (convertNow) {
				
				cvtFunc(srcP, ptr, numSrcBytes, &convertInfo);
				if (!mustReconvert)
					convertNow = false;
			}
			
			if (ditherFunc)
				ditherFunc(ptr, ptr, width + info.srcOfst, &convertInfo);
			
			transferFunc(cvtFunc ? (uint8_t*)ptr : srcP, dstP, &info);
		}
		dstP += dstRowBytes;
		srcYDecision -= decrement;
	}
	
exit_notify:
	if (canvasP->bmp->forScreen && halScreenIsLive()) {	//screen refresh code doesnt apply for offscreen windows
		HALScreenDrawNotify(toX, toY, width, height);
	}
		
cleanup:
	if (ptr)
		kheapFree(ptr);
	if (compPtr)
		kheapFree(compPtr);
	
	return errNone;	//yes, always
}

//Draw a rectangle using the canvas' current pattern state, clipped to the canvas' clipping rectangle
// rect: the rectangle to draw
// radius: corner radius. If above zero, corners are rounded
// penWidth: with a radius: nonzero means "draw a rounded frame this thick", zero means "fill a rounded rectangle"
//           with no radius: above zero means "draw a frame this thick", else "fill the rectangle"
//If memory for a large radius cannot be had, nothing is drawn
void DALEXPORT impl_HALDraw_Rectangle(struct PalmCanvas* canvas, const struct RectangleType *rect, int32_t radius, int32_t penWidth)
{
	#define MAX_RECT_RADIUS_ON_STACK	25
	
	struct AbsRectType clip = {
		.left = canvas->clippingRect.topLeft.x,
		.top = canvas->clippingRect.topLeft.y,
		.right = (Coord)(canvas->clippingRect.topLeft.x + canvas->clippingRect.extent.x),
		.bottom = (Coord)(canvas->clippingRect.topLeft.y + canvas->clippingRect.extent.y),
	};
	int32_t left = rect->topLeft.x, top = rect->topLeft.y;
	int32_t right = left + rect->extent.x, bottom = top + rect->extent.y;
	int16_t stackCircle[MAX_RECT_RADIUS_ON_STACK + 1], *circle = stackCircle;
	int32_t row;
	
	PrvSetupPatBlit(canvas, &mPatBlitInfo);
	
	if (radius <= 0) {
		
		if (penWidth > 0) {		//plain frame: top edge, left edge, right edge, bottom edge
			
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, top, right, penWidth);
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, top + penWidth, left + penWidth, bottom - top - 2 * penWidth);
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, right - penWidth, top + penWidth, right, bottom - top - 2 * penWidth);
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, bottom - penWidth, right, penWidth);
		}
		else {					//plain filled rectangle
			
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, top, right, bottom - top);
		}
	}
	else {						//rounded corners
		
		//small radii use the on-stack table, larger ones need heap
		if (radius > MAX_RECT_RADIUS_ON_STACK) {
			
			circle = (int16_t*)kheapAlloc(sizeof(int16_t) * (radius + 1));
			if (!circle)
				return;
		}
		PrvBresenhamCircle(radius, circle);
		bottom -= radius;
		top += radius;
		
		if (!penWidth) {		//filled rounded rectangle
			
			//the middle part is a simple rectangle
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, top, right, bottom - top);
			
			//the rounded caps get drawn one line at a time, going outwards from the middle part
			right -= radius;
			left += radius;
			
			for (row = 0; row < radius; row++) {
				
				int32_t reach = circle[row], nextReach = circle[row + 1] + 1;
				
				if (nextReach < reach)
					reach = nextReach;
				
				PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - reach, top - row - 1, right + reach, 1);	//top cap
				PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - reach, bottom + row, right + reach, 1);	//bottom cap
			}
		}
		else {					//rounded frame
			
			int32_t outerRad = radius + penWidth - 1;
			int32_t outerX, innerX, innerNext;
			
			bottom -= penWidth - 1;
			top += penWidth - 1;
			
			//straight left and right sides
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left, top + 1, left + penWidth, bottom - top - 2);
			PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, right - penWidth, top + 1, right, bottom - top - 2);
			
			left += outerRad;
			right -= outerRad;
			
			for (row = 0; row <= outerRad; row++) {
				
				//outer edge of the frame on this line
				if (row < penWidth - 1)
					outerX = circle[0];
				else
					outerX = circle[row - (penWidth - 1)];
				outerX += penWidth - 1;
				
				if (row >= radius) {	//past the inner arc: solid lines across the top and bottom
					
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - outerX, top - row, right + outerX, 1);
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - outerX, bottom + row - 1, right + outerX, 1);
				}
				else {					//four separate corner pieces, between the inner and the outer arcs
					
					innerX = circle[row];
					innerNext = circle[row + 1] + 1;
					if (innerNext < innerX)
						innerX = innerNext;
					innerX--;
					
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - outerX, top - row, left - innerX, 1);
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, right + innerX, top - row, right + outerX, 1);
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, left - outerX, bottom + row - 1, left - innerX, 1);
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, right + innerX, bottom + row - 1, right + outerX, 1);
				}
			}
		}
	}
	
	//only free the table if it came from the heap
	if (circle != stackCircle)
		kheapFree(circle);
}

//Draws a line from (x1,y1) to (x2,y2) on "canvas" using the global pattern-blit state "mPatBlitInfo".
// The line is rendered as a series of small rectangles: axis-aligned lines (and single points) are one rectangle,
// sloped lines are stepped through with an integer decision variable and each run of pixels becomes one rectangle.
// penWidth is the rectangle thickness: runs are penWidth wide for steep lines and penWidth tall for shallow lines.
// All drawing is limited to canvas->clippingRect.
// For sloped lines drawn to a "forScreen" bitmap, per-rectangle screen notifications are suppressed and a single
// HALScreenDrawNotify() covering the (clipped) bounding box is sent at the end.
void DALEXPORT impl_HALDraw_Line(struct PalmCanvas* canvas, int32_t x1, int32_t y1, int32_t x2, int32_t y2, int32_t penWidth)	//palm's func doesnt use the global "mPatBlitInfo" and instead uses stack. we use the global since i assume all ui drawing must be on the ui thread
{
	//clip rect converted from (topLeft, extent) form to absolute edges
	struct AbsRectType clip = {.left = canvas->clippingRect.topLeft.x, .top = canvas->clippingRect.topLeft.y, .right = (Coord)(canvas->clippingRect.topLeft.x + canvas->clippingRect.extent.x), .bottom = (Coord)(canvas->clippingRect.topLeft.y + canvas->clippingRect.extent.y)};
	int32_t temp, width, height;
	
	PrvSetupPatBlit(canvas, &mPatBlitInfo);
	
	if (x1 > x2) {	//sort points left to right
		temp = x1;
		x1 = x2;
		x2 = temp;
		temp = y1;
		y1 = y2;
		y2 = temp;
	}
	
	//after the sort width is never negative; height still carries its sign
	width = x2 - x1;
	height = y2 - y1;
	
	if (x1 != x2 && height) {	//sloped
		
		bool dstIsforScreen = !!canvas->bmp->forScreen, clipping;
		struct AbsRectType notify;
		int32_t decision;
		
		//we do not want each pixel we draw to send a draw notify, so we temporarily clear the "forScreen" flag in the dst bitmap
		canvas->bmp->forScreen = 0;
		
		//from here on height is the absolute vertical distance
		if (y2 < y1)
			height = -height;
		
		if (height >= width) {	//steep (y-major): walk down the rows, one x step per emitted rectangle
			
			int32_t oldY, xDirection = 1;
			
			decision = -height;
			if (y2 < y1) {	//make y1 the top point; x then runs right-to-left
				
				temp = x1;
				x1 = x2;
				x2 = temp;
				temp = y1;
				y1 = y2;
				y2 = temp;
				xDirection = -xDirection;
			}
			
			//extend the end row by the pen size, then compute the box used for the clip test and the final screen notify
			y2 += penWidth;
			notify.left = ((xDirection <= 0) ? x2 : x1) - penWidth + 1;
			notify.top = y1;
			notify.right = ((xDirection <= 0) ? x1 : x2) + penWidth * 2 - 1;
			notify.bottom = y2;
			
			//only pay for clipped blits if the box pokes outside the clip rect
			clipping = notify.left < clip.left || notify.top < clip.top || notify.right > clip.right || notify.bottom > clip.bottom;
			
			while (y1 != y2) {
				
				//collect the run of rows that share the current x
				oldY = y1;
				do {
					y1++;
					decision += width * 2;
				} while (decision < 0 && y1 != y2);
				
				//blit args as used throughout this file: left, top, right, height
				if (clipping)
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, x1, oldY, penWidth + x1, y1 - oldY);
				else
					PrvPatBlitUnclipped(canvas, &mPatBlitInfo, x1, oldY, penWidth + x1, y1 - oldY);
					
				x1 += xDirection;
				decision -= height * 2;
			}
		}
		else {	//shallow (x-major): walk along the columns, one y step per emitted rectangle
			
			int32_t oldX, yDirection = (y2 > y1) ? 1 : -1;
			
			decision = -width;
			
			//extend the end column by the pen size, then compute the box used for the clip test and the final screen notify
			x2 += penWidth;
			notify.left = x1;
			notify.top = ((yDirection <= 0) ? y2 : y1) - penWidth + 1;
			notify.right = x2;
			notify.bottom = ((yDirection <= 0) ? y1 : y2) + penWidth * 2 - 1;
			
			//only pay for clipped blits if the box pokes outside the clip rect
			clipping = notify.left < clip.left || notify.top < clip.top || notify.right > clip.right || notify.bottom > clip.bottom;
			
			while (x1 != x2) {
				
				//collect the run of columns that share the current y
				oldX = x1;
				do {
					x1++;
					decision += height * 2;
				} while (decision < 0 && x1 != x2);
				
				if (clipping)
					PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, oldX, y1, x1, penWidth);
				else
					PrvPatBlitUnclipped(canvas, &mPatBlitInfo, oldX, y1, x1, penWidth);
					
				y1 += yDirection;
				decision -= width * 2;
			}
		}

		//and now we restore the "forScreen" flag and send a single update after clipping the drawn area to the clip area
		//NOTE(review): if the box lies entirely outside the clip rect, the width/height passed below can be negative - confirm HALScreenDrawNotify() tolerates that
		if (dstIsforScreen) {
			canvas->bmp->forScreen = 1;
			if (notify.left < clip.left)
				notify.left = clip.left;
			if (notify.top < clip.top)
				notify.top = clip.top;
			if (notify.right > clip.right)
				notify.right = clip.right;
			if (notify.bottom > clip.bottom)
				notify.bottom = clip.bottom;
			HALScreenDrawNotify(notify.left, notify.top, notify.right - notify.left, notify.bottom - notify.top);
		}
	}
	else {						//horizontal or vertical line (or a single point): just one rectangle
		
		if (height < 0) {	//make y1 the top
			temp = y1;
			y1 = y2;
			y2 = temp;
			height = -height;
		}
		//grow both dimensions by the pen size
		width += penWidth;
		height += penWidth;
		
		PrvPatBlitClipped(canvas, &clip, &mPatBlitInfo, x1, y1, width + x1, height);
	}
	
}


//Density iterator: on entry *densityP holds the previously returned density (0 to start),
// on successful return it holds the next one in the list, or 0 once the list is exhausted.
// Returns errNone on success, 0xFFFF (leaving *densityP untouched) if the input is not a known density.
Err DALEXPORT impl_HALDraw_GetSupportedDensity(uint16_t *densityP)
{
	//each entry is followed by its successor; the leading 0 starts the walk, the trailing 0 ends it
	static const uint16_t sequence[] = {
		0,
		PALM_DENSITY_STANDARD,
		PALM_DENSITY_ONE_AND_A_HALF,
		PALM_DENSITY_DOUBLE,
		PALM_DENSITY_TRIPLE,
		PALM_DENSITY_QUADRUPLE,
		0,
	};
	const uint32_t numSteps = sizeof(sequence) / sizeof(*sequence) - 1;
	uint32_t idx;
	
	logt("%s(&(%u))\n", __func__, *densityP);
	
	for (idx = 0; idx < numSteps; idx++) {
		
		if (sequence[idx] != *densityP)
			continue;
		
		*densityP = sequence[idx + 1];
		return errNone;
	}
	
	//not a density we know about
	return 0xFFFF;
}

//Draws the given input area ("silk") bitmap family member best matching the current display depth onto the current screen bitmap.
// clip: optional (can be NULL). When given, it replaces the destination rectangle, with its y offset by the top of the input area.
// bmp:  first bitmap of the family to pick from (must not be NULL).
// Returns whatever HALDraw_Bitmap() returns.
static Err halDrawPrvDrawDiaBitmap(const struct RectangleType *clip, struct PalmBitmap *bmp)	//clip can be NULL
{
	struct PalmBitmapV3fat bmpSilk, bmpScreen;
	struct PalmDrawState drawState = {};
	uint32_t curDispDepth, dispW, ofst;
	struct PalmBitmap *prv = NULL;
	struct RectangleType silkRect;
	struct PalmCanvas cnv = {};
	
	cnv.drawState = &drawState;
	cnv.bmp = &bmpScreen;
	
	(void)HALDisplayGetAttributes(hwrDispDepth, &curDispDepth);
	(void)HALDisplayGetAttributes(hwrDispHorizontal, &dispW);	//dispW is fetched but not used below
	
	//pick highest bit depth that is not higher than screen's
	while (bmp->pixelSz < curDispDepth) {
		prv = bmp;
		bmp = (struct PalmBitmap*)BmpGetNextBitmap((BitmapPtr)bmp);
		if (!bmp) {
			bmp = prv;
			break;
		}
	}
	
	//if we overshot, step back to the previous (shallower) bitmap. If the very first bitmap is already deeper
	// than the screen there is no previous one ("prv" is still NULL); keep the first bitmap in that case
	// instead of dereferencing NULL below
	if (bmp->pixelSz > curDispDepth && prv)
		bmp = prv;
	
	(void)BmpPrvConvertBitmap((BitmapPtr)bmp, &bmpSilk);
	(void)HALDisplayGetAttributes(hwrDispInputAreaRect, &silkRect);
	
	if (bmp->height < silkRect.extent.y) {		//bottom-align not-tall-enough images
		
		silkRect.topLeft.y += silkRect.extent.y - bmp->height;
		silkRect.extent.y = bmp->height;
	}
	
	//remember where (vertically) the image lands on screen
	ofst = silkRect.topLeft.y;
	
	//make sure the screen bitmap matches the current display depth before we wrap it
	if (halScreenGetCurBitmap()->pixelSz != curDispDepth)
		(void)HALScreenUpdateBitmap(curDispDepth);

	(void)BmpPrvConvertBitmap((BitmapPtr)halScreenGetCurBitmap(), &bmpScreen);
	
	//foreground color: all index bits set for indexed depths, 0xFF index for direct color
	if (curDispDepth > 8) {
		drawState.foreColorRGB.idx = 0xFF;
		drawState.foreColorIdx = 0xFF;
	}
	else
		drawState.foreColorIdx = (1 << curDispDepth) - 1;
	
	//cnv.clippingRect is ignored by HALDraw_Bitmap()
	
	if (clip) {
		silkRect = *clip;
		silkRect.topLeft.y += ofst;
	}
	
	//sometimes density isn't set, we need to fix it
	if (bmpSilk.width > 240 && bmpSilk.density == 72)
		bmpSilk.density = 144;
	else if (bmpSilk.width > 160 && bmpSilk.density == 72)
		bmpSilk.density = 108;
	
	return HALDraw_Bitmap(&cnv, &bmpSilk, &silkRect, 0, ofst);
}

//Redraws the input area (selected or unselected look).
// rectP:    area to update; may be NULL for a full redraw, but is required when "selected" is set.
// selected: which of the two input area bitmaps to use.
// A sysNotifyInputAreaDrawingEvent notification is broadcast first so that someone else may do the drawing;
// if the broadcast fails or nobody marks it handled, the bitmap is drawn here.
// Returns 0xffff if "selected" is set without bounds, the error from fetching the bitmap, the result of the
// fallback draw, or errNone otherwise.
Err DALEXPORT impl_HALRedrawInputArea(const struct RectangleType *rectP, bool selected)
{
	struct SysNotifyInputAreaDrawingDetailsTag notifData = {};
	struct RectangleType inputAreaRectAsPerHal;
	SysNotifyParamType notif = {};		//must start zeroed: "handled" is read after the broadcast and nothing here sets it otherwise
	struct PalmBitmap *bmp;
	Err e = errNone;
	
	notif.notifyType = sysNotifyInputAreaDrawingEvent;
	notif.broadcaster = sysFileCSystem;
	notif.notifyDetailsP = &notifData;
	
	if (selected && !rectP) {	// "Needs bounds for selected state"
		loge("We need bounds for drawing selected input area state\n");
		return 0xffff;
	}
	
	if(rectP)
		logt("%s(((%d,%d),(%d,%d)), %d)\n", __func__, rectP->topLeft.x, rectP->topLeft.y, rectP->extent.x, rectP->extent.y, selected);
	else
		logt("%s(NULLRECT, %d)\n", __func__, selected);
	
	//NOTE(review): failure to get the input area rect is reported as success - presumably "no input area, nothing to draw"; confirm
	e = HALDisplayGetAttributes(hwrDispInputAreaRect, &inputAreaRectAsPerHal);
	if (e)
		return errNone;
	
	e = HALDisplayGetAttributes(selected ? hwrDispInputAreaBitmapSelected : hwrDispInputAreaBitmapUnselected, &bmp);
	if (e) {
		loge("Failed to get input area bitmap\n");
		return e;
	}
	
	//no bitmap is not an error, there is just nothing to draw
	if (!bmp)
		return errNone;
	
	notifData.bitmapP = (BitmapPtr)bmp;
	if (rectP) {
		memcpy(&notifData.updateBounds, rectP, sizeof(notifData.updateBounds));
		notifData.selected = selected;
	}
	else
		notifData.fullRedraw = true;
	
	if (SysNotifyBroadcast(&notif) || !notif.handled)			//nobody drew it? then we must draw it
		return halDrawPrvDrawDiaBitmap(rectP, bmp);
	
	return errNone;
}


//Draws the boot image directly into the early-boot framebuffer and then triggers a manual display refresh.
// x, y:   passed through to HALDraw_Bitmap() as the position arguments.
// bmpPtr: the boot bitmap. If it is followed by further bitmaps (a family), the deepest one that is not deeper
//         than the display is used.
// The relative density of screen and image is guessed from their dimensions (a match on either width or height
// is enough) for the scale factors 1, 1.5, 2, 3 and 4 in both directions.
// Always returns errNone; the result of the draw itself is intentionally ignored.
Err DALEXPORT impl_HALDisplayDrawBootScreen(int16_t x, int16_t y, struct PalmBitmap *bmpPtr)
{
	struct PalmBitmapV3fat scrBmp = {}, bmpImg;
	uint32_t dispW, dispH, dispS, dispD;
	struct RectangleType clipRect = {};
	struct PalmCanvas canvas = {};
	struct PalmDrawState ds = {};
	bool indexedFormatIsLe;
	
	clipRect.extent.x = bmpPtr->width;
	clipRect.extent.y = bmpPtr->height;
	
	//dispS is fetched but not used below
	halDisplayEarlyBootGetMetrics((void**)&scrBmp.data, &dispW, &dispH, &dispS, NULL, &dispD, &indexedFormatIsLe);

	//hand-build a canvas + V3 bitmap describing the whole framebuffer
	canvas.clippingRect.extent.x = dispW;
	canvas.clippingRect.extent.y = dispH;
	canvas.drawState = &ds;
	canvas.bmp = &scrBmp;
	
	scrBmp.width = dispW;
	scrBmp.height = dispH;
	scrBmp.stride = dispW * dispD / 8;
	scrBmp.pixelSz = dispD;
	scrBmp.version = 3;
	scrBmp.structSz = sizeof(scrBmp);
	scrBmp.density = PALM_DENSITY_STANDARD;
	
	if (dispD <= 8) {
		
		ds.foreColorIdx = (1 << dispD) - 1;
		scrBmp.pixelFormat = indexedFormatIsLe ? PALM_BMP_PIXEL_FORMAT_INDEXED_LE : PALM_BMP_PIXEL_FORMAT_INDEXED;
	}
	else {
		
		//in case our boot image is 1bpp, this needs proper setup
		ds.backColorRGB.r = 0xFF;
		ds.backColorRGB.g = 0xFF;
		ds.backColorRGB.b = 0xFF;
		
		ds.foreColorRGB.idx = 0xFF;
		ds.foreColorIdx = 0xFF;
		scrBmp.pixelFormat = PALM_BMP_PIXEL_FORMAT_RGB565_LE;
	}
	
	//pick highest bit depth that is not higher than screen's. a following bitmap whose depth is
	// exactly the screen's depth is acceptable too, so only stop on one that is strictly deeper
	while (bmpPtr->pixelSz < dispD) {
		
		struct PalmBitmap *next = (struct PalmBitmap*)BmpGetNextBitmap((BitmapPtr)bmpPtr);
		
		if (!next || next->pixelSz > dispD)
			break;
		bmpPtr = next;
	}
	
	(void)BmpPrvConvertBitmap((BitmapPtr)bmpPtr, &bmpImg);
	
	//guess densities from the size ratio. image smaller than screen: raise screen density (and scale the
	// destination rect to match); image larger than screen: raise image density
	bmpImg.density = PALM_DENSITY_STANDARD;
	if (bmpImg.width == dispW ||  bmpImg.height == dispH) {
		
		//nothing
	}
	else if (bmpImg.width * 3 / 2 == dispW || bmpImg.height * 3 / 2 == dispH) {
		
		scrBmp.density = PALM_DENSITY_ONE_AND_A_HALF;
		clipRect.extent.x = clipRect.extent.x * 3 / 2;
		clipRect.extent.y = clipRect.extent.y * 3 / 2;
	}
	else if (bmpImg.width * 2 == dispW || bmpImg.height * 2 == dispH) {
		
		scrBmp.density = PALM_DENSITY_DOUBLE;
		clipRect.extent.x *= 2;
		clipRect.extent.y *= 2;
	}
	else if (bmpImg.width * 3 == dispW || bmpImg.height * 3 == dispH) {
		
		scrBmp.density = PALM_DENSITY_TRIPLE;
		clipRect.extent.x *= 3;
		clipRect.extent.y *= 3;
	}
	else if (bmpImg.width * 4 == dispW || bmpImg.height * 4 == dispH) {
		
		scrBmp.density = PALM_DENSITY_QUADRUPLE;
		clipRect.extent.x *= 4;
		clipRect.extent.y *= 4;
	}
	else if (bmpImg.width == dispW * 3 / 2 || bmpImg.height == dispH * 3 / 2) {
		
		bmpImg.density = PALM_DENSITY_ONE_AND_A_HALF;
	}
	else if (bmpImg.width == dispW * 2 || bmpImg.height == dispH * 2) {
		
		bmpImg.density = PALM_DENSITY_DOUBLE;
	}
	else if (bmpImg.width == dispW * 3 || bmpImg.height == dispH * 3) {
		
		bmpImg.density = PALM_DENSITY_TRIPLE;
	}
	else if (bmpImg.width == dispW * 4 || bmpImg.height == dispH * 4) {
		
		bmpImg.density = PALM_DENSITY_QUADRUPLE;
	}
	
	//the last two values are densities, not bit depths
	logi("screen is %ux%u, boot bmp is %ux%u. treating screen as density %u, bmp as density %u\n", dispW, dispH, bmpImg.width, bmpImg.height, scrBmp.density, bmpImg.density);
	
	//never let the destination rect exceed the screen
	if (clipRect.extent.x > (int32_t)dispW)
		clipRect.extent.x = dispW;
	if (clipRect.extent.y > (int32_t)dispH)
		clipRect.extent.y = dispH;
	
	(void)HALDraw_Bitmap(&canvas, &bmpImg, &clipRect, x, y);
	halDisplayRefreshManual();
		
	return errNone;
}

//Resets the cached CLUT state to "not yet checked".
// Presumably this forces the CLUTs to be re-examined on next use - the consumer of mClutState is not visible here, confirm.
void halDrawingSetClutsDirtyFlag(void)
{
	mClutState = halDrawingClutIsNotYetChecked;
}
