/*
 * emu.c -- F-CPU instruction-level emulator core
 * Copyright (C) 2002, 2003 Michael Riepe <michael@stud.uni-hannover.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

static const char rcsid[] = "@(#) $Id: emu.c,v 1.22 2003/01/29 22:39:57 michael Exp $";

#if HAVE_CONFIG_H
#include <config.h>
#endif

#if STDC_HEADERS
#include <stdlib.h>
#include <string.h>
#endif

#include <stdio.h>

#if HAVE_UNISTD_H
#include <unistd.h>
#else
int read(), write();
#endif

#if HAVE_ERRNO_H
#include <errno.h>
#else
extern int errno;
#endif

#if HAVE_MATH_H
#include <math.h>
#else
int isnan();
double sqrt(), log(), exp();
#endif

#include <fcpu_opcodes/fcpu_opcodes.h>
#include "./emu.h"

/*
 * rounding
 *
 * When both <fenv.h> and fesetround() are available they are used as is.
 * Otherwise fesetround() is defined away (calls expand to nothing) and the
 * FE_* rounding-mode names get local placeholder values so that code
 * referring to them still compiles.
 */
#if HAVE_FENV_H && HAVE_FESETROUND
#include <fenv.h>
#else
#define fesetround(x)	/**/
#define FE_TONEAREST	0
#define FE_TOWARDZERO	1
#define FE_DOWNWARD		2
#define FE_UPWARD		3
#endif

/* initialised to round-to-nearest */
int default_rounding = FE_TONEAREST;

/*
 * One-letter rounding-mode suffixes (r, t, f, c) mapped to the FE_*
 * constants; RND(rm) picks the constant by pasting the suffix.
 */
#define RND_r	FE_TONEAREST
#define RND_t	FE_TOWARDZERO
#define RND_f	FE_DOWNWARD
#define RND_c	FE_UPWARD
#define RND(rm)	RND_##rm

/* the register set */
struct regs regs;

/* exception code variable; initialised to EX_NONE */
unsigned excode = EX_NONE;

/* prototype only; the definition is not in this part of the file */
void *memmap(U64 virtaddr, U64 align, U64 len, int write_mode);

/* condition codes */
static int
cc_zero(unsigned reg) {
	unsigned i;

	if (reg)
		for_all_chunks(i,best)
			if (r(reg).C(best,i))
				return 0;
	return 1;
}

/*
 * cc_nan -- returns 1 if chunk 0 of register `reg', read through the D
 * accessor, is a NaN; register number 0 always yields 0.
 */
static int
cc_nan(unsigned reg) {
	if (!reg)
		return 0;
	return isnan(r(reg).C(D,0)) ? 1 : 0;
}

/*
 * cc_lsb -- returns 1 if bit 0 of byte chunk 0 of register `reg' is set;
 * register number 0 always yields 0.
 */
static int
cc_lsb(unsigned reg) {
	if (!reg)
		return 0;
	return (r(reg).C(b,0) & 1) != 0;
}

/* XXX: always use bit 63? */
/*
 * cc_msb -- returns 1 if bit 7 (mask 0x80) of byte chunk 7 of register
 * `reg' is set; register number 0 always yields 0.
 */
static int
cc_msb(unsigned reg) {
	if (!reg)
		return 0;
	return (r(reg).C(b,7) & 0x80) != 0;
}

/* the instruction set, (almost) in manual order */

/* function generators */
/* GEN4: instantiate `gen' once per chunk size (b, d, q, o) with simd = 0 */
#define GEN4(op,gen,impl) \
	gen(op,b,0,impl) \
	gen(op,d,0,impl) \
	gen(op,q,0,impl) \
	gen(op,o,0,impl)

/* GEN4S: same four sizes with simd = 1; the function name gets an `s' prefix */
#define GEN4S(op,gen,impl) \
	gen(s##op,b,1,impl) \
	gen(s##op,d,1,impl) \
	gen(s##op,q,1,impl) \
	gen(s##op,o,1,impl)

/* GEN8: both the non-SIMD and the SIMD set */
#define GEN8(op,gen,impl) \
	GEN4(op,gen,impl) \
	GEN4S(op,gen,impl)

/* definition styles */
/* def_1: emit <op>_<width>(opcode); the body runs only when R1 is nonzero */
#define def_1(op,width,vec,impl) \
	void op##_##width(U32 opcode) { if (R1) impl(width,vec); }
/* def_2: emit <op>_<width>(opcode); the body always runs */
#define def_2(op,width,vec,impl) \
	void op##_##width(U32 opcode) { impl(width,vec); }
/* def_rop2: like def_1, but the body macro takes one extra argument */
#define def_rop2(op,width,vec,impl,extra) \
	void op##_##width(U32 opcode) { if (R1) impl(width,vec,extra); }

/* add: R1 = R2 + R3 per chunk; only chunk 0 is computed when simd is 0 */
#define ADD(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(add,def_1,ADD)

/*
 * addc: R1 = R2 + R3 per chunk; RA receives 1 in each chunk whose sum
 * wrapped around (sum < first operand), else 0.  Each result is written
 * only if its register number is nonzero.
 */
#define ADDC(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + r(R3).C(sz,k); \
			aux.C(sz,k) = res.C(sz,k) < r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(addc,def_2,ADDC)

/* adds: saturating add; a chunk whose sum wrapped is set to all ones */
#define ADDS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + r(R3).C(sz,k); \
			if (res.C(sz,k) < r(R2).C(sz,k)) \
				res.C(sz,k) = ~(UT(sz))0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(adds,def_1,ADDS)

/* addi: R1 = R2 + UIMM8 per chunk */
#define ADDI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + UIMM8; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(addi,def_1,ADDI)

/* sub: R1 = R2 - R3 per chunk */
#define SUB(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) - r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(sub,def_1,SUB)

/*
 * subb: R1 = R2 - R3 per chunk; RA receives 1 in each chunk where a
 * borrow occurred (R2 < R3), else 0.  Each result is written only if its
 * register number is nonzero.
 */
#define SUBB(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) - r(R3).C(sz,k); \
			aux.C(sz,k) = r(R2).C(sz,k) < r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(subb,def_2,SUBB)

/* subf: R1 = R2 - R3 per chunk, clamped to 0 when R2 <= R3 */
#define SUBF(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) <= r(R3).C(sz,k)) \
				? 0 \
				: r(R2).C(sz,k) - r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(subf,def_1,SUBF)

/* subi: R1 = R2 - UIMM8 per chunk */
#define SUBI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) - UIMM8; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(subi,def_1,SUBI)

/*
 * mul / mac core: R1 = R2 * R3 per chunk; when `mac' is nonzero the old
 * R1 chunk is added to the product.
 */
#define MULMAC(sz,simd,mac) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) * r(R3).C(sz,k); \
			if (mac) \
				res.C(sz,k) += r(R1).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

/* mul: plain multiply (no accumulation) */
#define MUL(sz,simd)	MULMAC(sz,simd,0)

GEN8(mul,def_1,MUL)

/*
 * HI: upper half of an sz-sized value (shift right by half the chunk
 * width).  LO: lower half, obtained by shifting left and back right.
 * NOTE(review): for chunk types narrower than int the left shift happens
 * after integer promotion, so LO would not mask there; both macros are
 * only reached at run time in the BITS(sz) > 32 paths below.
 */
#define HI(sz,x)	((x) >> 4*BYTES(sz))
#define LO(sz,x)	HI(sz,(x) << 4*BYTES(sz))

/*
 * Unsigned widening multiply: the low half of each chunk product goes to
 * R1, the high half to RA (each written only if its register number is
 * nonzero).
 *
 * Chunks wider than 32 bits: the low half is the ordinary wrapping
 * product; the high half is assembled from the four half-width partial
 * products, using scratch chunk t.C(sz,0) to carry the middle terms.
 * Narrower chunks: the product is formed in a U64 and split by shifting.
 *
 * When `mac' is nonzero the old R1 chunk is added to the low half and the
 * carry out of that addition is added to the high half.
 */
#define MULMACH(sz,simd,mac) \
	do { \
		union reg r1 = reginit(R1); \
		union reg ra = reginit(RA); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			if (BITS(sz) > 32) { \
				union reg t; \
				r1.C(sz,i) = r(R2).C(sz,i) * r(R3).C(sz,i); \
				ra.C(sz,i) = HI(sz,r(R2).C(sz,i)) * HI(sz,r(R3).C(sz,i)); \
				t.C(sz,0) = LO(sz,r(R2).C(sz,i)) * LO(sz,r(R3).C(sz,i)); \
				t.C(sz,0) = HI(sz,t.C(sz,0)) \
				        + LO(sz,r(R2).C(sz,i)) * HI(sz,r(R3).C(sz,i)); \
				ra.C(sz,i) += HI(sz,t.C(sz,0)); \
				t.C(sz,0) = LO(sz,t.C(sz,0)) \
				        + HI(sz,r(R2).C(sz,i)) * LO(sz,r(R3).C(sz,i)); \
				ra.C(sz,i) += HI(sz,t.C(sz,0)); \
			} \
			else { \
				U64 t = (U64)r(R2).C(sz,i) * (U64)r(R3).C(sz,i); \
				r1.C(sz,i) = t; \
				ra.C(sz,i) = t >> 1 >> (BITS(sz)-1); \
			} \
			if (mac) { \
				r1.C(sz,i) += r(R1).C(sz,i); \
				ra.C(sz,i) += r1.C(sz,i) < r(R1).C(sz,i); \
			} \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
		if (RA) r(RA) = ra; \
	} while (0)

/* mulh: widening multiply without accumulation */
#define MULH(sz,simd)	MULMACH(sz,simd,0)

GEN8(mulh,def_2,MULH)

/*
 * Signed widening multiply: the low half of each chunk product goes to
 * R1, the high half to RA (each written only if its register number is
 * nonzero).
 *
 * Chunks wider than 32 bits: the unsigned high half is assembled from
 * half-width partial products (scratch chunk t.C(sz,0)), then corrected
 * for signed operands by subtracting the other operand for each operand
 * that is negative.  The correction is applied to the chunk being
 * computed (index i), so SIMD chunks other than chunk 0 are handled too.
 * Narrower chunks: the operands are read through the signed accessor SC
 * so that they are sign-extended before the 64-bit multiply.
 *
 * When `mac' is nonzero the old R1 chunk is added to the low half and the
 * carry out of that addition is added to the high half.
 */
#define MULMACHS(sz,simd,mac) \
	do { \
		union reg r1 = reginit(R1); \
		union reg ra = reginit(RA); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			if (BITS(sz) > 32) { \
				union reg t; \
				r1.C(sz,i) = r(R2).C(sz,i) * r(R3).C(sz,i); \
				ra.C(sz,i) = HI(sz,r(R2).C(sz,i)) * HI(sz,r(R3).C(sz,i)); \
				t.C(sz,0) = LO(sz,r(R2).C(sz,i)) * LO(sz,r(R3).C(sz,i)); \
				t.C(sz,0) = HI(sz,t.C(sz,0)) \
				        + LO(sz,r(R2).C(sz,i)) * HI(sz,r(R3).C(sz,i)); \
				ra.C(sz,i) += HI(sz,t.C(sz,0)); \
				t.C(sz,0) = LO(sz,t.C(sz,0)) \
				        + HI(sz,r(R2).C(sz,i)) * LO(sz,r(R3).C(sz,i)); \
				ra.C(sz,i) += HI(sz,t.C(sz,0)); \
				/* unsigned -> signed high half, on the current chunk */ \
				if (r(R2).SC(sz,i) < 0) \
					ra.C(sz,i) -= r(R3).C(sz,i); \
				if (r(R3).SC(sz,i) < 0) \
					ra.C(sz,i) -= r(R2).C(sz,i); \
			} \
			else { \
				/* sign-extend the operands before multiplying */ \
				I64 t = (I64)r(R2).SC(sz,i) * (I64)r(R3).SC(sz,i); \
				r1.C(sz,i) = t; \
				ra.C(sz,i) = t >> 1 >> (BITS(sz)-1); \
			} \
			if (mac) { \
				r1.C(sz,i) += r(R1).C(sz,i); \
				ra.C(sz,i) += r1.C(sz,i) < r(R1).C(sz,i); \
			} \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
		if (RA) r(RA) = ra; \
	} while (0)

/* mulhs: signed widening multiply without accumulation */
#define MULHS(sz,simd)	MULMACHS(sz,simd,0)

GEN8(mulhs,def_2,MULHS)

/* muli: R1 = R2 * SIMM8 per chunk */
#define MULI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) * SIMM8; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(muli,def_1,MULI)

/*
 * div: unsigned R1 = R2 / R3 per chunk.  A zero divisor chunk raises
 * EX_ZERO and leaves the function without writing any result.
 */
#define DIV(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (!r(R3).C(sz,k)) { \
				ex(EX_ZERO); \
				return; \
			} \
			res.C(sz,k) = r(R2).C(sz,k) / r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
	} while (0)

GEN8(div,def_2,DIV)

/*
 * divs: signed R1 = R2 / R3 per chunk.  A zero divisor chunk raises
 * EX_ZERO and leaves the function without writing any result.
 *
 * A divisor of -1 is handled by negating the dividend in unsigned
 * arithmetic: dividing the most negative value by -1 overflows, which is
 * undefined in C and traps on common hosts.  The negation yields the same
 * wrapped result for every dividend.
 */
#define DIVS(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			if (!r(R3).C(sz,i)) { \
				ex(EX_ZERO); \
				return; \
			} \
			if (r(R3).SC(sz,i) == -1) \
				r1.C(sz,i) = -r(R2).C(sz,i); \
			else \
				r1.C(sz,i) = r(R2).SC(sz,i) / r(R3).SC(sz,i); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
	} while (0)

GEN8(divs,def_2,DIVS)

/*
 * divrem: unsigned quotient R2 / R3 to R1 and remainder R2 % R3 to RA,
 * per chunk.  A zero divisor chunk raises EX_ZERO and leaves the function
 * without writing any result.
 */
#define DIVREM(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			if (!r(R3).C(sz,k)) { \
				ex(EX_ZERO); \
				return; \
			} \
			res.C(sz,k) = r(R2).C(sz,k) / r(R3).C(sz,k); \
			aux.C(sz,k) = r(R2).C(sz,k) % r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(divrem,def_2,DIVREM)

/*
 * divrems: signed quotient R2 / R3 to R1 and remainder R2 % R3 to RA,
 * per chunk.  A zero divisor chunk raises EX_ZERO and leaves the function
 * without writing any result.
 *
 * A divisor of -1 is special-cased (quotient = negated dividend in
 * unsigned arithmetic, remainder = 0) because dividing the most negative
 * value by -1 overflows, which is undefined in C and traps on common
 * hosts.
 */
#define DIVREMS(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		union reg ra = reginit(RA); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			if (!r(R3).C(sz,i)) { \
				ex(EX_ZERO); \
				return; \
			} \
			if (r(R3).SC(sz,i) == -1) { \
				r1.C(sz,i) = -r(R2).C(sz,i); \
				ra.C(sz,i) = 0; \
			} \
			else { \
				r1.C(sz,i) = r(R2).SC(sz,i) / r(R3).SC(sz,i); \
				ra.C(sz,i) = r(R2).SC(sz,i) % r(R3).SC(sz,i); \
			} \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
		if (RA) r(RA) = ra; \
	} while (0)

GEN8(divrems,def_2,DIVREMS)

/*
 * divi: signed R1 = R2 / SIMM8 per chunk.  A zero immediate raises
 * EX_ZERO and leaves the function without writing any result.
 *
 * An immediate of -1 is handled by negating the dividend in unsigned
 * arithmetic: dividing the most negative value by -1 overflows, which is
 * undefined in C and traps on common hosts.
 */
#define DIVI(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		if (!SIMM8) { \
			ex(EX_ZERO); \
			return; \
		} \
		for_all_chunks(i,sz) { \
			if (SIMM8 == -1) \
				r1.C(sz,i) = -r(R2).C(sz,i); \
			else \
				r1.C(sz,i) = r(R2).SC(sz,i) / SIMM8; \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
	} while (0)

GEN8(divi,def_2,DIVI)

/*
 * divremi: signed quotient R2 / SIMM8 to R1 and remainder R2 % SIMM8 to
 * RA, per chunk.  A zero immediate raises EX_ZERO and leaves the function
 * without writing any result.
 *
 * An immediate of -1 is special-cased (quotient = negated dividend in
 * unsigned arithmetic, remainder = 0) because dividing the most negative
 * value by -1 overflows, which is undefined in C and traps on common
 * hosts.
 */
#define DIVREMI(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		union reg ra = reginit(RA); \
		unsigned i; \
		if (!SIMM8) { \
			ex(EX_ZERO); \
			return; \
		} \
		for_all_chunks(i,sz) { \
			if (SIMM8 == -1) { \
				r1.C(sz,i) = -r(R2).C(sz,i); \
				ra.C(sz,i) = 0; \
			} \
			else { \
				r1.C(sz,i) = r(R2).SC(sz,i) / SIMM8; \
				ra.C(sz,i) = r(R2).SC(sz,i) % SIMM8; \
			} \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
		if (RA) r(RA) = ra; \
	} while (0)

GEN8(divremi,def_2,DIVREMI)

/*
 * rem: unsigned R1 = R2 % R3 per chunk.  A zero divisor chunk raises
 * EX_ZERO and leaves the function without writing any result.
 */
#define REM(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (!r(R3).C(sz,k)) { \
				ex(EX_ZERO); \
				return; \
			} \
			res.C(sz,k) = r(R2).C(sz,k) % r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
	} while (0)

GEN8(rem,def_2,REM)

/*
 * rems: signed R1 = R2 % R3 per chunk.  A zero divisor chunk raises
 * EX_ZERO and leaves the function without writing any result.
 *
 * A divisor of -1 always gives remainder 0; it is special-cased because
 * evaluating (most negative value) % -1 is undefined in C and traps on
 * common hosts.
 */
#define REMS(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			if (!r(R3).C(sz,i)) { \
				ex(EX_ZERO); \
				return; \
			} \
			if (r(R3).SC(sz,i) == -1) \
				r1.C(sz,i) = 0; \
			else \
				r1.C(sz,i) = r(R2).SC(sz,i) % r(R3).SC(sz,i); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
	} while (0)

GEN8(rems,def_2,REMS)

/*
 * remi: signed R1 = R2 % SIMM8 per chunk.  A zero immediate raises
 * EX_ZERO and leaves the function without writing any result.
 *
 * An immediate of -1 always gives remainder 0; it is special-cased
 * because evaluating (most negative value) % -1 is undefined in C and
 * traps on common hosts.
 */
#define REMI(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		if (!SIMM8) { \
			ex(EX_ZERO); \
			return; \
		} \
		for_all_chunks(i,sz) { \
			if (SIMM8 == -1) \
				r1.C(sz,i) = 0; \
			else \
				r1.C(sz,i) = r(R2).SC(sz,i) % SIMM8; \
			if (!simd) break; \
		} \
		if (R1) r(R1) = r1; \
	} while (0)

GEN8(remi,def_2,REMI)

/* N_<sz>: destination size for the widening MACs (b->d, d->q, q->o, o->o) */
#define N_b d
#define N_d q
#define N_q o
#define N_o o

/* XXX: 64-bit chunks not handled correctly when MAXSIZE != 8 */
/*
 * macl core: for each destination chunk k, R1 += R2 * R3 using source
 * chunk k of size `ssz', with both operands converted to UT(dsz) first.
 */
#define MACL3(ssz,dsz,simd)	\
	do { \
		unsigned k; \
		union reg acc = reginit(R1); \
		for_all_chunks(k,dsz) { \
			acc.C(dsz,k) = r(R1).C(dsz,k) \
				+ (UT(dsz))r(R2).C(ssz,k) * (UT(dsz))r(R3).C(ssz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = acc; \
	} while (0)

#define MACL(ssz,simd)	MACL3(ssz,N_##ssz,simd)

GEN8(macl,def_1,MACL)

/* XXX: 64-bit chunks not handled correctly when MAXSIZE != 8 */
/*
 * macls core: signed counterpart of MACL3; source chunks are read through
 * SC and converted to ST(dsz) before multiplying.
 */
#define MACLS3(ssz,dsz,simd)	\
	do { \
		unsigned k; \
		union reg acc = reginit(R1); \
		for_all_chunks(k,dsz) { \
			acc.C(dsz,k) = r(R1).C(dsz,k) \
				+ (ST(dsz))r(R2).SC(ssz,k) * (ST(dsz))r(R3).SC(ssz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = acc; \
	} while (0)

#define MACLS(ssz,simd)	MACLS3(ssz,N_##ssz,simd)

GEN8(macls,def_1,MACLS)

/* XXX: 64-bit chunks not handled correctly when MAXSIZE != 8 */
/*
 * mach core: like MACL3, but the source chunks are taken starting at
 * index CHUNKS(ssz)/2, i.e. destination chunk k uses source chunk
 * half + k.
 */
#define MACH3(ssz,dsz,simd)	\
	do { \
		const unsigned half = CHUNKS(ssz) / 2; \
		unsigned k; \
		union reg acc = reginit(R1); \
		for_all_chunks(k,dsz) { \
			acc.C(dsz,k) = r(R1).C(dsz,k) \
				+ (UT(dsz))r(R2).C(ssz,half+k) * (UT(dsz))r(R3).C(ssz,half+k); \
			if (!(simd)) break; \
		} \
		r(R1) = acc; \
	} while (0)

#define MACH(ssz,simd)	MACH3(ssz,N_##ssz,simd)

GEN8(mach,def_1,MACH)

/* XXX: 64-bit chunks not handled correctly when MAXSIZE != 8 */
/*
 * machs core: signed counterpart of MACH3; source chunks half + k are
 * read through SC and converted to ST(dsz) before multiplying.
 */
#define MACHS3(ssz,dsz,simd)	\
	do { \
		const unsigned half = CHUNKS(ssz) / 2; \
		unsigned k; \
		union reg acc = reginit(R1); \
		for_all_chunks(k,dsz) { \
			acc.C(dsz,k) = r(R1).C(dsz,k) \
				+ (ST(dsz))r(R2).SC(ssz,half+k) * (ST(dsz))r(R3).SC(ssz,half+k); \
			if (!(simd)) break; \
		} \
		r(R1) = acc; \
	} while (0)

#define MACHS(ssz,simd)	MACHS3(ssz,N_##ssz,simd)

GEN8(machs,def_1,MACHS)

/* XXX: alternative mac (undocumented) */
/* Note: also used for FP mac! */
/* amac: MULMAC with the accumulate flag set (product + old R1 chunk) */
#define AMAC(sz,simd)	MULMAC(sz,simd,1)

GEN8(amac,def_1,AMAC)

/* amach: MULMACH with the accumulate flag set */
#define AMACH(sz,simd)	MULMACH(sz,simd,1)

GEN8(amach,def_2,AMACH)

/* amachs: MULMACHS with the accumulate flag set */
#define AMACHS(sz,simd)	MULMACHS(sz,simd,1)

GEN8(amachs,def_2,AMACHS)

/*
 * addsub: per chunk, the sum R2 + R3 goes to R1 and the difference
 * R2 - R3 goes to RA; each is written only if its register number is
 * nonzero.
 */
#define ADDSUB(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + r(R3).C(sz,k); \
			aux.C(sz,k) = r(R2).C(sz,k) - r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(addsub,def_2,ADDSUB)

/*
 * popc: count the set bits of each R2 chunk (byte by byte), then subtract
 * the matching R3 chunk, clamping at 0.
 */
#define POPC(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) cnt = 0; \
			unsigned n; \
			for (n = 0; n < BYTES(sz); n++) { \
				U8 bits = cbyte(r(R2),sz,k,n); \
				/* pairwise bit sums: 2-bit, 4-bit, then whole byte */ \
				bits = (bits & 0x55) + ((bits >> 1) & 0x55); \
				bits = (bits & 0x33) + ((bits >> 2) & 0x33); \
				bits = (bits & 0x0f) + ((bits >> 4) & 0x0f); \
				cnt += bits; \
			} \
			res.C(sz,k) = (cnt > r(R3).C(sz,k)) \
				? cnt - r(R3).C(sz,k) \
				: 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(popc,def_1,POPC)

#if 0
/* XXX: undocumented extension */
/*
 * popch: like popc, but every chunk subtracts chunk 0 of R3 (clamped at
 * 0).  Currently compiled out.
 */
#define POPCH(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) cnt = 0; \
			unsigned n; \
			for (n = 0; n < BYTES(sz); n++) { \
				U8 bits = cbyte(r(R2),sz,k,n); \
				bits = (bits & 0x55) + ((bits >> 1) & 0x55); \
				bits = (bits & 0x33) + ((bits >> 2) & 0x33); \
				bits = (bits & 0x0f) + ((bits >> 4) & 0x0f); \
				cnt += bits; \
			} \
			res.C(sz,k) = (cnt > r(R3).C(sz,0)) \
				? cnt - r(R3).C(sz,0) \
				: 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(popch,def_1,POPCH)
#endif

/*
 * popci: count the set bits of each R2 chunk (byte by byte), then
 * subtract UIMM8, clamping at 0.
 */
#define POPCI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) cnt = 0; \
			unsigned n; \
			for (n = 0; n < BYTES(sz); n++) { \
				U8 bits = cbyte(r(R2),sz,k,n); \
				bits = (bits & 0x55) + ((bits >> 1) & 0x55); \
				bits = (bits & 0x33) + ((bits >> 2) & 0x33); \
				bits = (bits & 0x0f) + ((bits >> 4) & 0x0f); \
				cnt += bits; \
			} \
			res.C(sz,k) = (cnt > UIMM8) ? cnt - UIMM8 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(popci,def_1,POPCI)

/* inc: R1 = R2 + 1 per chunk */
#define INC(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) + 1; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(inc,def_1,INC)

/* dec: R1 = R2 - 1 per chunk */
#define DEC(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) - 1; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(dec,def_1,DEC)

/* neg: R1 = -R2 per chunk */
#define NEG(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = -r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(neg,def_1,NEG)

/*
 * lsb1: 1-based position of the lowest set bit of each R2 chunk, or 0 if
 * the chunk is zero.
 */
#define LSB1(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) v = r(R2).C(sz,k); \
			unsigned pos; \
			res.C(sz,k) = 0; \
			for (pos = 1; v && pos <= BITS(sz); pos++, v >>= 1) { \
				if (v & 1) { \
					res.C(sz,k) = pos; \
					break; \
				} \
			} \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(lsb1,def_1,LSB1)

/*
 * lsb0: 1-based position of the lowest clear bit of each R2 chunk (found
 * as the lowest set bit of the complement), or 0 if the chunk is all
 * ones.
 */
#define LSB0(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) v = ~r(R2).C(sz,k); \
			unsigned pos; \
			res.C(sz,k) = 0; \
			for (pos = 1; v && pos <= BITS(sz); pos++, v >>= 1) { \
				if (v & 1) { \
					res.C(sz,k) = pos; \
					break; \
				} \
			} \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(lsb0,def_1,LSB0)

/*
 * msb1: 1-based position of the highest set bit of each R2 chunk, or 0 if
 * the chunk is zero.
 *
 * The scan works on an unsigned copy and tests the top bit with a mask;
 * left-shifting a signed value into its sign bit is undefined in C.
 */
#define MSB1(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			UT(sz) x = r(R2).C(sz,i); \
			const UT(sz) top = (UT(sz))1 << (BITS(sz) - 1); \
			unsigned j; \
			r1.C(sz,i) = 0; \
			if (x) \
				for (j = BITS(sz); j >= 1; j--, x <<= 1) \
					if (x & top) { \
						r1.C(sz,i) = j; \
						break; \
					} \
			if (!simd) break; \
		} \
		r(R1) = r1; \
	} while (0)

GEN8(msb1,def_1,MSB1)

/*
 * msb0: 1-based position of the highest clear bit of each R2 chunk (found
 * as the highest set bit of the complement), or 0 if the chunk is all
 * ones.
 *
 * The scan works on an unsigned copy and tests the top bit with a mask;
 * left-shifting a signed value into its sign bit is undefined in C.
 */
#define MSB0(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			UT(sz) x = ~r(R2).C(sz,i); \
			const UT(sz) top = (UT(sz))1 << (BITS(sz) - 1); \
			unsigned j; \
			r1.C(sz,i) = 0; \
			if (x) \
				for (j = BITS(sz); j >= 1; j--, x <<= 1) \
					if (x & top) { \
						r1.C(sz,i) = j; \
						break; \
					} \
			if (!simd) break; \
		} \
		r(R1) = r1; \
	} while (0)

GEN8(msb0,def_1,MSB0)

/* cmpg: unsigned R2 > R3 per chunk; result chunk is -1 (all ones) or 0 */
#define CMPG(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) > r(R3).C(sz,k)) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmpg,def_1,CMPG)

/* cmple: unsigned R2 <= R3 per chunk; result chunk is -1 (all ones) or 0 */
#define CMPLE(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) <= r(R3).C(sz,k)) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmple,def_1,CMPLE)

/* cmpgi: unsigned R2 > UIMM8 per chunk; result chunk is -1 or 0 */
#define CMPGI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) > UIMM8) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmpgi,def_1,CMPGI)

/* cmplei: unsigned R2 <= UIMM8 per chunk; result chunk is -1 or 0 */
#define CMPLEI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) <= UIMM8) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmplei,def_1,CMPLEI)

/* cmpgs: signed R2 > R3 per chunk; result chunk is -1 or 0 */
#define CMPGS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) > r(R3).SC(sz,k)) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmpgs,def_1,CMPGS)

/* cmples: signed R2 <= R3 per chunk; result chunk is -1 or 0 */
#define CMPLES(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) <= r(R3).SC(sz,k)) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmples,def_1,CMPLES)

/* cmpgsi: signed R2 > SIMM8 per chunk; result chunk is -1 or 0 */
#define CMPGSI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) > SIMM8) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmpgsi,def_1,CMPGSI)

/* cmplesi: signed R2 <= SIMM8 per chunk; result chunk is -1 or 0 */
#define CMPLESI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) <= SIMM8) ? -1 : 0; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(cmplesi,def_1,CMPLESI)

/* abs: per chunk, negate R2 if it is negative as a signed value, else copy */
#define ABS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) < 0) \
				? -r(R2).C(sz,k) \
				: r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(abs,def_1,ABS)

/* nabs: per chunk, copy R2 if it is negative as a signed value, else negate */
#define NABS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) < 0) \
				? r(R2).C(sz,k) \
				: -r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(nabs,def_1,NABS)

/* max: unsigned per-chunk maximum of R2 and R3 (R3 is taken on ties) */
#define MAX(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) > r(R3).C(sz,k)) \
				? r(R2).C(sz,k) \
				: r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(max,def_1,MAX)

/* min: unsigned per-chunk minimum of R2 and R3 (R3 is taken on ties) */
#define MIN(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).C(sz,k) < r(R3).C(sz,k)) \
				? r(R2).C(sz,k) \
				: r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(min,def_1,MIN)

/* maxi: unsigned per-chunk maximum of R2 and UIMM8 */
#define MAXI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) <= UIMM8) \
				res.C(sz,k) = UIMM8; \
			else \
				res.C(sz,k) = r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(maxi,def_1,MAXI)

/* mini: unsigned per-chunk minimum of R2 and UIMM8 */
#define MINI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) >= UIMM8) \
				res.C(sz,k) = UIMM8; \
			else \
				res.C(sz,k) = r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(mini,def_1,MINI)

/* Note: legacy name is `sort' */
/*
 * minmax: per chunk, the unsigned smaller of R2/R3 goes to R1 and the
 * larger to RA; each is written only if its register number is nonzero.
 */
#define MINMAX(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const int lt = r(R2).C(sz,k) < r(R3).C(sz,k); \
			res.C(sz,k) = lt ? r(R2).C(sz,k) : r(R3).C(sz,k); \
			aux.C(sz,k) = lt ? r(R3).C(sz,k) : r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(minmax,def_2,MINMAX)

/* XXX: undocumented */
/*
 * minmaxi: per chunk, the unsigned smaller of R2/UIMM8 goes to R1 and the
 * larger to RA; each is written only if its register number is nonzero.
 */
#define MINMAXI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) >= UIMM8) { \
				res.C(sz,k) = UIMM8; \
				aux.C(sz,k) = r(R2).C(sz,k); \
			} \
			else { \
				res.C(sz,k) = r(R2).C(sz,k); \
				aux.C(sz,k) = UIMM8; \
			} \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(minmaxi,def_2,MINMAXI)

/* maxs: signed per-chunk maximum of R2 and R3 (R3 is taken on ties) */
#define MAXS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) > r(R3).SC(sz,k)) \
				? r(R2).C(sz,k) \
				: r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(maxs,def_1,MAXS)

/* mins: signed per-chunk minimum of R2 and R3 (R3 is taken on ties) */
#define MINS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = (r(R2).SC(sz,k) < r(R3).SC(sz,k)) \
				? r(R2).C(sz,k) \
				: r(R3).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(mins,def_1,MINS)

/* maxsi: signed per-chunk maximum of R2 and SIMM8 */
#define MAXSI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).SC(sz,k) <= SIMM8) \
				res.C(sz,k) = SIMM8; \
			else \
				res.C(sz,k) = r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(maxsi,def_1,MAXSI)

/* minsi: signed per-chunk minimum of R2 and SIMM8 */
#define MINSI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).SC(sz,k) >= SIMM8) \
				res.C(sz,k) = SIMM8; \
			else \
				res.C(sz,k) = r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(minsi,def_1,MINSI)

/* Note: legacy name is `sorts' */
/*
 * minmaxs: per chunk, the signed smaller of R2/R3 goes to R1 and the
 * larger to RA; each is written only if its register number is nonzero.
 */
#define MINMAXS(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const int lt = r(R2).SC(sz,k) < r(R3).SC(sz,k); \
			res.C(sz,k) = lt ? r(R2).C(sz,k) : r(R3).C(sz,k); \
			aux.C(sz,k) = lt ? r(R3).C(sz,k) : r(R2).C(sz,k); \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(minmaxs,def_2,MINMAXS)

/* XXX: undocumented */
/*
 * minmaxsi: per chunk, the signed smaller of R2/SIMM8 goes to R1 and the
 * larger to RA; each is written only if its register number is nonzero.
 */
#define MINMAXSI(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			if (r(R2).SC(sz,k) >= SIMM8) { \
				res.C(sz,k) = SIMM8; \
				aux.C(sz,k) = r(R2).C(sz,k); \
			} \
			else { \
				res.C(sz,k) = r(R2).C(sz,k); \
				aux.C(sz,k) = SIMM8; \
			} \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(minmaxsi,def_2,MINMAXSI)

/* XXX: LNS operations are unimplemented */

/* shiftl: R1 = R2 << (R3 mod chunk width), count taken per chunk */
#define SHIFTL(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftl,def_1,SHIFTL)

/* shiftlh: like shiftl, but every chunk uses the count from R3 chunk 0 */
#define SHIFTLH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(shiftlh,def_1,SHIFTLH)

/*
 * dshiftl: R1 = R2 << n with n = R3 mod chunk width (per chunk); RA
 * receives the bits shifted out at the top (0 when n is 0).
 */
#define DSHIFTL(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) >> (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftl,def_2,DSHIFTL)

/* dshiftlh: like dshiftl, but every chunk uses the count from R3 chunk 0 */
#define DSHIFTLH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) >> (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN4S(dshiftlh,def_2,DSHIFTLH)

/* shiftr: logical R1 = R2 >> (R3 mod chunk width), count taken per chunk */
#define SHIFTR(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftr,def_1,SHIFTR)

/* shiftrh: like shiftr, but every chunk uses the count from R3 chunk 0 */
#define SHIFTRH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(shiftrh,def_1,SHIFTRH)

/*
 * dshiftr: logical R1 = R2 >> n with n = R3 mod chunk width (per chunk);
 * RA receives the bits shifted out at the bottom, left-aligned (0 when n
 * is 0).
 */
#define DSHIFTR(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftr,def_2,DSHIFTR)

/* dshiftrh: like dshiftr, but every chunk uses the count from R3 chunk 0 */
#define DSHIFTRH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN4S(dshiftrh,def_2,DSHIFTRH)

/* shiftli: R1 = R2 << (UIMM8 mod chunk width) per chunk */
#define SHIFTLI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftli,def_1,SHIFTLI)

/*
 * dshiftli: R1 = R2 << n with n = UIMM8 mod chunk width; RA receives the
 * bits shifted out at the top (0 when n is 0).
 */
#define DSHIFTLI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) << amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) >> (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftli,def_2,DSHIFTLI)

/* shiftri: logical R1 = R2 >> (UIMM8 mod chunk width) per chunk */
#define SHIFTRI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftri,def_1,SHIFTRI)

/*
 * dshiftri: logical R1 = R2 >> n with n = UIMM8 mod chunk width; RA
 * receives the bits shifted out at the bottom, left-aligned (0 when n is
 * 0).
 */
#define DSHIFTRI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).C(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftri,def_2,DSHIFTRI)

/*
 * shiftra: R1 = (signed R2) >> (R3 mod chunk width), count taken per
 * chunk; the source is read through the signed accessor SC.
 */
#define SHIFTRA(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftra,def_1,SHIFTRA)

/* shiftrah: like shiftra, but every chunk uses the count from R3 chunk 0 */
#define SHIFTRAH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(shiftrah,def_1,SHIFTRAH)

/*
 * dshiftra: R1 = (signed R2) >> n with n = R3 mod chunk width (per
 * chunk); RA receives the bits shifted out at the bottom, left-aligned
 * (0 when n is 0).
 */
#define DSHIFTRA(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftra,def_2,DSHIFTRA)

/* dshiftrah: like dshiftra, but every chunk uses the count from R3 chunk 0 */
#define DSHIFTRAH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN4S(dshiftrah,def_2,DSHIFTRAH)

/* shiftrai: R1 = (signed R2) >> (UIMM8 mod chunk width) per chunk */
#define SHIFTRAI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(shiftrai,def_1,SHIFTRAI)

/*
 * dshiftrai: R1 = (signed R2) >> n with n = UIMM8 mod chunk width; RA
 * receives the bits shifted out at the bottom, left-aligned (0 when n is
 * 0).
 */
#define DSHIFTRAI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			res.C(sz,k) = r(R2).SC(sz,k) >> amt; \
			if (amt) \
				aux.C(sz,k) = r(R2).C(sz,k) << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!(simd)) break; \
		} \
		if (R1) r(R1) = res; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dshiftrai,def_2,DSHIFTRAI)

/*
 * rotl: rotate each R2 chunk left by R3 mod chunk width (per chunk); a
 * count of 0 copies the chunk unchanged.
 */
#define ROTL(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			if (amt == 0) \
				res.C(sz,k) = r(R2).C(sz,k); \
			else \
				res.C(sz,k) = (r(R2).C(sz,k) << amt) \
					| (r(R2).C(sz,k) >> (BITS(sz) - amt)); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(rotl,def_1,ROTL)

/* rotlh: like rotl, but every chunk uses the count from R3 chunk 0 */
#define ROTLH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (amt == 0) \
				res.C(sz,k) = r(R2).C(sz,k); \
			else \
				res.C(sz,k) = (r(R2).C(sz,k) << amt) \
					| (r(R2).C(sz,k) >> (BITS(sz) - amt)); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(rotlh,def_1,ROTLH)

/*
 * rotr: rotate each R2 chunk right by R3 mod chunk width (per chunk); a
 * count of 0 copies the chunk unchanged.
 */
#define ROTR(sz,simd) \
	do { \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			if (amt == 0) \
				res.C(sz,k) = r(R2).C(sz,k); \
			else \
				res.C(sz,k) = (r(R2).C(sz,k) >> amt) \
					| (r(R2).C(sz,k) << (BITS(sz) - amt)); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN8(rotr,def_1,ROTR)

/* rotrh: like rotr, but every chunk uses the count from R3 chunk 0 */
#define ROTRH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg res = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (amt == 0) \
				res.C(sz,k) = r(R2).C(sz,k); \
			else \
				res.C(sz,k) = (r(R2).C(sz,k) >> amt) \
					| (r(R2).C(sz,k) << (BITS(sz) - amt)); \
			if (!(simd)) break; \
		} \
		r(R1) = res; \
	} while (0)

GEN4S(rotrh,def_1,ROTRH)

/*
 * ROTLI: rotate each chunk of R2 left by the immediate UIMM8, taken
 * modulo the chunk width.  A zero count copies the chunk unchanged.
 */
#define ROTLI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		union reg dst = reginit(R1); \
		unsigned k; \
		for_all_chunks(k,sz) { \
			if (amt) \
				dst.C(sz,k) = (r(R2).C(sz,k) << amt) \
					| (r(R2).C(sz,k) >> (BITS(sz) - amt)); \
			else \
				dst.C(sz,k) = r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(rotli,def_1,ROTLI)

/*
 * ROTRI: rotate each chunk of R2 right by the immediate UIMM8, taken
 * modulo the chunk width.  A zero count copies the chunk unchanged.
 */
#define ROTRI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		union reg dst = reginit(R1); \
		unsigned k; \
		for_all_chunks(k,sz) { \
			if (amt) \
				dst.C(sz,k) = (r(R2).C(sz,k) >> amt) \
					| (r(R2).C(sz,k) << (BITS(sz) - amt)); \
			else \
				dst.C(sz,k) = r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(rotri,def_1,ROTRI)

/*
 * BTST: isolate one bit per chunk.  The bit index is the matching chunk
 * of R3 modulo the chunk width; the result chunk is the R2 chunk ANDed
 * with that single-bit mask.
 */
#define BTST(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned bit = r(R3).C(sz,k) % BITS(sz); \
			dst.C(sz,k) = ((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(btst,def_1,BTST)

#if 0
/* XXX: undocumented */
/*
 * BTSTH (disabled): BTST with a single bit index taken from chunk 0 of R3
 * and applied to every chunk.
 */
#define BTSTH(sz,simd) \
	do { \
		const unsigned bit = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(btsth,def_1,BTSTH)
#endif

/*
 * BCLR: clear one bit per chunk.  The bit index is the matching chunk of
 * R3 modulo the chunk width; all other bits of the R2 chunk pass through.
 */
#define BCLR(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned bit = r(R3).C(sz,k) % BITS(sz); \
			dst.C(sz,k) = ~((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bclr,def_1,BCLR)

#if 0
/* XXX: undocumented */
/*
 * BCLRH (disabled): BCLR with a single bit index taken from chunk 0 of R3
 * and applied to every chunk.
 */
#define BCLRH(sz,simd) \
	do { \
		const unsigned bit = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ~((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(bclrh,def_1,BCLRH)
#endif

/*
 * BCHG: toggle one bit per chunk.  The bit index is the matching chunk of
 * R3 modulo the chunk width.
 */
#define BCHG(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned bit = r(R3).C(sz,k) % BITS(sz); \
			dst.C(sz,k) = ((UT(sz))1 << bit) ^ r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bchg,def_1,BCHG)

#if 0
/* XXX: undocumented */
/*
 * BCHGH (disabled): BCHG with a single bit index taken from chunk 0 of R3
 * and applied to every chunk.
 */
#define BCHGH(sz,simd) \
	do { \
		const unsigned bit = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) ^ r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(bchgh,def_1,BCHGH)
#endif

/*
 * BSET: set one bit per chunk.  The bit index is the matching chunk of
 * R3 modulo the chunk width.
 */
#define BSET(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned bit = r(R3).C(sz,k) % BITS(sz); \
			dst.C(sz,k) = ((UT(sz))1 << bit) | r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bset,def_1,BSET)

#if 0
/* XXX: undocumented */
/*
 * BSETH (disabled): BSET with a single bit index taken from chunk 0 of R3
 * and applied to every chunk.
 */
#define BSETH(sz,simd) \
	do { \
		const unsigned bit = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) | r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(bseth,def_1,BSETH)
#endif

/* XXX: verify immediate size */
/*
 * BTSTI: isolate the bit numbered UIMM8 (modulo the chunk width) in each
 * chunk of R2.
 */
#define BTSTI(sz,simd) \
	do { \
		const unsigned bit = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(btsti,def_1,BTSTI)

/* XXX: verify immediate size */
/*
 * BCLRI: clear the bit numbered UIMM8 (modulo the chunk width) in each
 * chunk of R2.
 */
#define BCLRI(sz,simd) \
	do { \
		const unsigned bit = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ~((UT(sz))1 << bit) & r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bclri,def_1,BCLRI)

/* XXX: verify immediate size */
/*
 * BCHGI: toggle the bit numbered UIMM8 (modulo the chunk width) in each
 * chunk of R2.
 */
#define BCHGI(sz,simd) \
	do { \
		const unsigned bit = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) ^ r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bchgi,def_1,BCHGI)

/* XXX: verify immediate size */
/*
 * BSETI: set the bit numbered UIMM8 (modulo the chunk width) in each
 * chunk of R2.
 */
#define BSETI(sz,simd) \
	do { \
		const unsigned bit = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = ((UT(sz))1 << bit) | r(R2).C(sz,k); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bseti,def_1,BSETI)

/*
 * MIXL: interleave the chunks of R2 (even slots) and R3 (odd slots) into
 * a two-register scratch pair through the C2() accessor, then store the
 * element picked by HOST_BIG_ENDIAN != 0 in R1.
 */
#define MIXL(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			pair[0].C2(sz,2*k+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,2*k+1) = r(R3).C(sz,k); \
		} \
		r(R1) = pair[HOST_BIG_ENDIAN != 0]; \
	} while (0)

GEN4(mixl,def_1,MIXL)

/*
 * MIXH: same interleave as MIXL, but R1 receives the other element of the
 * scratch pair (the one picked by HOST_BIG_ENDIAN == 0).
 */
#define MIXH(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			pair[0].C2(sz,2*k+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,2*k+1) = r(R3).C(sz,k); \
		} \
		r(R1) = pair[HOST_BIG_ENDIAN == 0]; \
	} while (0)

GEN4(mixh,def_1,MIXH)

/* XXX: undocumented */
/*
 * MIX: interleave R2 and R3 chunks as in MIXL/MIXH and deliver both
 * halves: R1 gets pair[HOST_BIG_ENDIAN != 0], RA gets the other element.
 * A destination is stored only if its register number is non-zero.
 */
#define MIX(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			pair[0].C2(sz,2*k+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,2*k+1) = r(R3).C(sz,k); \
		} \
		if (R1) r(R1) = pair[HOST_BIG_ENDIAN != 0]; \
		if (RA) r(RA) = pair[HOST_BIG_ENDIAN == 0]; \
	} while (0)

GEN4(mix,def_2,MIX)

/*
 * EXPANDL: scatter R2/R3 chunk pairs into a two-register scratch area.
 * Even source indices land at slot i, odd ones at CHUNKS(sz) - 1 + i;
 * the R3 chunk always follows the R2 chunk.  R1 receives the element
 * picked by HOST_BIG_ENDIAN != 0.
 */
#define EXPANDL(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			const unsigned slot = (k % 2) ? CHUNKS(sz) - 1 + k : k; \
			pair[0].C2(sz,slot+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,slot+1) = r(R3).C(sz,k); \
		} \
		r(R1) = pair[HOST_BIG_ENDIAN != 0]; \
	} while (0)

GEN4(expandl,def_1,EXPANDL)

/*
 * EXPANDH: same scatter as EXPANDL, but R1 receives the other element of
 * the scratch pair (the one picked by HOST_BIG_ENDIAN == 0).
 */
#define EXPANDH(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			const unsigned slot = (k % 2) ? CHUNKS(sz) - 1 + k : k; \
			pair[0].C2(sz,slot+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,slot+1) = r(R3).C(sz,k); \
		} \
		r(R1) = pair[HOST_BIG_ENDIAN == 0]; \
	} while (0)

GEN4(expandh,def_1,EXPANDH)

/* XXX: undocumented */
/*
 * EXPAND: scatter as in EXPANDL/EXPANDH and deliver both halves: R1 gets
 * pair[HOST_BIG_ENDIAN != 0], RA gets the other element.  A destination
 * is stored only if its register number is non-zero.
 */
#define EXPAND(sz,ignore) \
	do { \
		unsigned k; \
		union reg pair[2]; \
		for_all_chunks(k,sz) { \
			const unsigned slot = (k % 2) ? CHUNKS(sz) - 1 + k : k; \
			pair[0].C2(sz,slot+0) = r(R2).C(sz,k); \
			pair[0].C2(sz,slot+1) = r(R3).C(sz,k); \
		} \
		if (R1) r(R1) = pair[HOST_BIG_ENDIAN != 0]; \
		if (RA) r(RA) = pair[HOST_BIG_ENDIAN == 0]; \
	} while (0)

GEN4(expand,def_2,EXPAND)

/*
 * CSHIFTL: shift R2 up by one whole chunk.  Result chunk k+1 is R2 chunk
 * k; the vacated chunk 0 is filled from chunk 0 of R3.
 */
#define CSHIFTL(sz,ignore) \
	do { \
		union reg dst; \
		unsigned k; \
		for (k = 0; k + 1 < CHUNKS(sz); k++) \
			dst.C(sz,k+1) = r(R2).C(sz,k); \
		dst.C(sz,0) = r(R3).C(sz,0); \
		r(R1) = dst; \
	} while (0)

GEN4(cshiftl,def_1,CSHIFTL)

/*
 * CSHIFTR: shift R2 down by one whole chunk.  Result chunk k is R2 chunk
 * k+1; the vacated top chunk is filled from chunk 0 of R3.
 */
#define CSHIFTR(sz,ignore) \
	do { \
		union reg dst; \
		unsigned k; \
		dst.C(sz,CHUNKS(sz)-1) = r(R3).C(sz,0); \
		for (k = 0; k + 1 < CHUNKS(sz); k++) \
			dst.C(sz,k) = r(R2).C(sz,k+1); \
		r(R1) = dst; \
	} while (0)

GEN4(cshiftr,def_1,CSHIFTR)

/*
 * VSEL: chunk select.  For each result chunk, the matching chunk of R3
 * (modulo the number of chunks) names the R2 chunk to copy.
 *
 * The result starts from reginit(R1), like the other operations in this
 * file.  When simd is 0 the loop stops after chunk 0, so a plain
 * uninitialised local would copy indeterminate data into the remaining
 * chunks of R1.
 */
#define VSEL(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		for_all_chunks(i,sz) { \
			unsigned n = r(R3).C(sz,i) % CHUNKS(sz); \
			r1.C(sz,i) = r(R2).C(sz,n); \
			if (!simd) break; \
		} \
		r(R1) = r1; \
	} while (0)

GEN8(vsel,def_1,VSEL)

/*
 * VSELH: chunk broadcast.  Chunk 0 of R3 (modulo the number of chunks)
 * names one R2 chunk, which is copied into every processed result chunk.
 */
#define VSELH(sz,simd) \
	do { \
		const unsigned pick = r(R3).C(sz,0) % CHUNKS(sz); \
		union reg dst; \
		unsigned k; \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = r(R2).C(sz,pick); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(vselh,def_1,VSELH)

/*
 * VSELI: chunk broadcast with an immediate selector.  UIMM8 (modulo the
 * number of chunks) names the R2 chunk copied into each processed result
 * chunk.
 *
 * The result starts from reginit(R1), like the other operations in this
 * file.  When simd is 0 the loop stops after chunk 0, so a plain
 * uninitialised local would copy indeterminate data into the remaining
 * chunks of R1.
 */
#define VSELI(sz,simd) \
	do { \
		union reg r1 = reginit(R1); \
		unsigned i; \
		const unsigned n = UIMM8 % CHUNKS(sz); \
		for_all_chunks(i,sz) { \
			r1.C(sz,i) = r(R2).C(sz,n); \
			if (!simd) break; \
		} \
		r(R1) = r1; \
	} while (0)

GEN8(vseli,def_1,VSELI)

/*
 * brev -- reverse the order of the low `bits' bits of x.
 * Bit 0 of x becomes bit bits-1 of the result, and so on; bits of x at
 * or above position `bits' do not contribute.  Returns 0 when bits is 0.
 */
U64 brev(U64 x, unsigned bits) {
	U64 rev = 0;

	for (; bits != 0; bits--) {
		rev <<= 1;
		rev |= x & 1;	/* append the current lowest bit of x */
		x >>= 1;
	}
	return rev;
}

/* XXX: bitrev[i]o is unimplemented */
/*
 * BITREV: bit-reverse each chunk of R2 with brev(), then shift the result
 * right by the matching chunk of R3 (modulo the chunk width).
 */
#define BITREV(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bitrev,def_1,BITREV)

/*
 * BITREVH: like BITREV, but the shift count is chunk 0 of R3 (modulo the
 * chunk width) for every chunk.
 */
#define BITREVH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN4S(bitrevh,def_1,BITREVH)

/*
 * BITREVI: like BITREV, but the shift count is the immediate UIMM8
 * (modulo the chunk width).
 */
#define BITREVI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(bitrevi,def_1,BITREVI)

/* XXX: dbitrev[i]o is unimplemented */
/*
 * DBITREV: two-register bit reverse.  Each R2 chunk is reversed with
 * brev(); R1 gets the reversed value shifted right by the matching R3
 * chunk (modulo the chunk width), RA gets it shifted left by the
 * complementary amount (zero when the count is zero).  A destination is
 * stored only if its register number is non-zero.
 */
#define DBITREV(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			const unsigned amt = r(R3).C(sz,k) % BITS(sz); \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (amt) \
				aux.C(sz,k) = rev << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dbitrev,def_2,DBITREV)

/*
 * DBITREVH: like DBITREV, but the shift count is chunk 0 of R3 (modulo
 * the chunk width) for every chunk.
 */
#define DBITREVH(sz,simd) \
	do { \
		const unsigned amt = r(R3).C(sz,0) % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (amt) \
				aux.C(sz,k) = rev << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN4S(dbitrevh,def_2,DBITREVH)

/*
 * DBITREVI: like DBITREV, but the shift count is the immediate UIMM8
 * (modulo the chunk width).
 */
#define DBITREVI(sz,simd) \
	do { \
		const unsigned amt = UIMM8 % BITS(sz); \
		unsigned k; \
		union reg dst = reginit(R1); \
		union reg aux = reginit(RA); \
		for_all_chunks(k,sz) { \
			UT(sz) rev = brev(r(R2).C(sz,k), BITS(sz)); \
			dst.C(sz,k) = rev >> amt; \
			if (amt) \
				aux.C(sz,k) = rev << (BITS(sz) - amt); \
			else \
				aux.C(sz,k) = 0; \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
		if (RA) r(RA) = aux; \
	} while (0)

GEN8(dbitrevi,def_2,DBITREVI)

/* XXX: manual example is wrong! */
/*
 * BYTEREV: reverse the byte order inside each chunk of R2.  Result byte
 * j of a chunk is source byte BYTES(sz)-1-j of the same chunk.
 */
#define BYTEREV(sz,simd) \
	do { \
		const unsigned width = BYTES(sz); \
		union reg dst = reginit(R1); \
		unsigned k; \
		for_all_chunks(k,sz) { \
			unsigned j; \
			for (j = 0; j < width; j++) \
				cbyte(dst,sz,k,j) = cbyte(r(R2),sz,k,width-1-j); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(byterev,def_1,BYTEREV)

#define rop2_def(name,simd,F1,F2) \
	def_rop2(name,b,simd,F1,F2) \
	def_rop2(name,d,simd,F1,F2) \
	def_rop2(name,q,simd,F1,F2) \
	def_rop2(name,o,simd,F1,F2)

#define rop2_name(x)	x
#define rop2_andname(x)	x##_and
#define rop2_orname(x)	x##_or
#define rop2_iname(x)	x##i

#define rop2f_and(a,b)	((a) & (b))
#define rop2f_or(a,b)	((a) | (b))
#define rop2f_xor(a,b)	((a) ^ (b))
#define rop2f_andn(a,b)	((a) & ~(b))
#define rop2f_orn(a,b)	((a) | ~(b))
#define rop2f_nand(a,b)	~((a) & (b))
#define rop2f_nor(a,b)	~((a) | (b))
#define rop2f_xnor(a,b)	~((a) ^ (b))

#define GENROP2(style,name,fun) \
	style(name(and),0,fun,rop2f_and) \
	style(name(or),0,fun,rop2f_or) \
	style(name(xor),0,fun,rop2f_xor) \
	style(name(andn),0,fun,rop2f_andn) \
	style(name(orn),0,fun,rop2f_orn) \
	style(name(nand),0,fun,rop2f_nand) \
	style(name(nor),0,fun,rop2f_nor) \
	style(name(xnor),0,fun,rop2f_xnor) \
	style(name(sand),1,fun,rop2f_and) \
	style(name(sor),1,fun,rop2f_or) \
	style(name(sxor),1,fun,rop2f_xor) \
	style(name(sandn),1,fun,rop2f_andn) \
	style(name(sorn),1,fun,rop2f_orn) \
	style(name(snand),1,fun,rop2f_nand) \
	style(name(snor),1,fun,rop2f_nor) \
	style(name(sxnor),1,fun,rop2f_xnor)

/*
 * ROP2: chunk-wise bitwise operation.  Each result chunk is
 * func(R2 chunk, R3 chunk), where func is one of the rop2f_* macros.
 */
#define ROP2(sz,simd,func) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = func(r(R2).C(sz,k), r(R3).C(sz,k)); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GENROP2(rop2_def,rop2_name,ROP2)

/*
 * ROP2_AND: AND-reduction of a bitwise operation.  A result chunk is all
 * ones when every bit of func(R2 chunk, R3 chunk) is set (its complement
 * is zero), and zero otherwise.
 */
#define ROP2_AND(sz,simd,func) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) inv = ~func(r(R2).C(sz,k), r(R3).C(sz,k)); \
			dst.C(sz,k) = -(inv == 0); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GENROP2(rop2_def,rop2_andname,ROP2_AND)

/*
 * ROP2_OR: OR-reduction of a bitwise operation.  A result chunk is all
 * ones when any bit of func(R2 chunk, R3 chunk) is set, zero otherwise.
 */
#define ROP2_OR(sz,simd,func) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			UT(sz) val = func(r(R2).C(sz,k), r(R3).C(sz,k)); \
			dst.C(sz,k) = -(val != 0); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GENROP2(rop2_def,rop2_orname,ROP2_OR)

/* XXX: verify register usage for MUX */
/*
 * ROP2_MUX: bitwise multiplexer.  R3 is the select mask: where a mask bit
 * is set the result bit comes from R2, elsewhere the current R1 bit is
 * kept.
 */
#define ROP2_MUX(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = (r(R2).C(sz,k) &  r(R3).C(sz,k)) \
						| (r(R1).C(sz,k) &~ r(R3).C(sz,k)); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GEN8(mux,def_1,ROP2_MUX)

/*
 * ROP2I: chunk-wise bitwise operation with an immediate.  SIMM9 is
 * converted once to the signed chunk type ST(sz) and used as the second
 * operand for every processed chunk.
 */
#define ROP2I(sz,simd,func) \
	do { \
		const ST(sz) imm = SIMM9; \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			dst.C(sz,k) = func(r(R2).C(sz,k), imm); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GENROP2(rop2_def,rop2_iname,ROP2I)

/* FP stuff */
/* XXX: -x suffix is currently ignored */

/*
 * GENF4 applies `style' four times: name_f and name_d with simd flag 0,
 * then sname_f and sname_d with simd flag 1.  The third argument passed
 * to `style' is the size selector (F or D).
 */
#define GENF4(name,style,fun) \
	style(name##_f,fun,F,0) \
	style(name##_d,fun,D,0) \
	style(s##name##_f,fun,F,1) \
	style(s##name##_d,fun,D,1)

/*
 * Wrapper styles: def_f1 runs the body only when R1 is non-zero, def_f2
 * always runs it (the body is expected to guard its own register writes).
 */
#define def_f1(name,fun,sz,simd) \
	void name(U32 opcode) { if (R1) fun(sz,simd); }
#define def_f2(name,fun,sz,simd) \
	void name(U32 opcode) { fun(sz,simd); }

/* FP add, subtract and multiply reuse the ADD/SUB/MUL body macros. */
GENF4(fadd,def_f1,ADD)

GENF4(fsub,def_f1,SUB)

GENF4(fmul,def_f1,MUL)

/* XXX: verify semantics */
/* XXX: exception handling */
/* XXX: SIMD version? */
/*
 * F2INT: convert chunk 0 of R2, read through the FP accessor selected by
 * fsz (F or D), to the signed integer type ST(isz) and store it in signed
 * chunk 0 of R1.  The host rounding mode is set to RND(rm) before the
 * conversion and reset to default_rounding afterwards.
 *
 * NOTE(review): a C cast from a floating type to an integer type discards
 * the fractional part regardless of the fenv rounding mode, so the
 * fesetround() bracket may have no effect on the result -- confirm.
 *
 * The wrappers below are named <f|d>2int<rm>_<isz>; each does nothing when
 * R1 is zero.
 */
#define F2INT(fsz,isz,rm) \
	do { \
		fesetround(RND(rm)); \
		r(R1).SC(isz,0) = (ST(isz))r(R2).C(fsz,0); \
		fesetround(default_rounding); \
	} while (0)

void f2intr_b(U32 opcode) { if (R1) F2INT(F,b,r); }
void f2intr_d(U32 opcode) { if (R1) F2INT(F,d,r); }
void f2intr_q(U32 opcode) { if (R1) F2INT(F,q,r); }
void f2intr_o(U32 opcode) { if (R1) F2INT(F,o,r); }
void d2intr_b(U32 opcode) { if (R1) F2INT(D,b,r); }
void d2intr_d(U32 opcode) { if (R1) F2INT(D,d,r); }
void d2intr_q(U32 opcode) { if (R1) F2INT(D,q,r); }
void d2intr_o(U32 opcode) { if (R1) F2INT(D,o,r); }
void f2intt_b(U32 opcode) { if (R1) F2INT(F,b,t); }
void f2intt_d(U32 opcode) { if (R1) F2INT(F,d,t); }
void f2intt_q(U32 opcode) { if (R1) F2INT(F,q,t); }
void f2intt_o(U32 opcode) { if (R1) F2INT(F,o,t); }
void d2intt_b(U32 opcode) { if (R1) F2INT(D,b,t); }
void d2intt_d(U32 opcode) { if (R1) F2INT(D,d,t); }
void d2intt_q(U32 opcode) { if (R1) F2INT(D,q,t); }
void d2intt_o(U32 opcode) { if (R1) F2INT(D,o,t); }
void f2intf_b(U32 opcode) { if (R1) F2INT(F,b,f); }
void f2intf_d(U32 opcode) { if (R1) F2INT(F,d,f); }
void f2intf_q(U32 opcode) { if (R1) F2INT(F,q,f); }
void f2intf_o(U32 opcode) { if (R1) F2INT(F,o,f); }
void d2intf_b(U32 opcode) { if (R1) F2INT(D,b,f); }
void d2intf_d(U32 opcode) { if (R1) F2INT(D,d,f); }
void d2intf_q(U32 opcode) { if (R1) F2INT(D,q,f); }
void d2intf_o(U32 opcode) { if (R1) F2INT(D,o,f); }
void f2intc_b(U32 opcode) { if (R1) F2INT(F,b,c); }
void f2intc_d(U32 opcode) { if (R1) F2INT(F,d,c); }
void f2intc_q(U32 opcode) { if (R1) F2INT(F,q,c); }
void f2intc_o(U32 opcode) { if (R1) F2INT(F,o,c); }
void d2intc_b(U32 opcode) { if (R1) F2INT(D,b,c); }
void d2intc_d(U32 opcode) { if (R1) F2INT(D,d,c); }
void d2intc_q(U32 opcode) { if (R1) F2INT(D,q,c); }
void d2intc_o(U32 opcode) { if (R1) F2INT(D,o,c); }

/* XXX: verify semantics */
/* XXX: exception handling */
/* XXX: SIMD version? */
/*
 * INT2F: convert signed chunk 0 of R2 (integer size isz) to the FP type
 * FT(fsz) and store it in chunk 0 of R1 through the accessor selected by
 * fsz.  The host rounding mode is set to RND(rm) before the conversion and
 * reset to default_rounding afterwards.
 *
 * The wrappers below are named int2<f|d><rm>_<isz>; each does nothing when
 * R1 is zero.
 */
#define INT2F(fsz,isz,rm) \
	do { \
		fesetround(RND(rm)); \
		r(R1).C(fsz,0) = (FT(fsz))r(R2).SC(isz,0); \
		fesetround(default_rounding); \
	} while (0)

void int2fr_b(U32 opcode) { if (R1) INT2F(F,b,r); }
void int2fr_d(U32 opcode) { if (R1) INT2F(F,d,r); }
void int2fr_q(U32 opcode) { if (R1) INT2F(F,q,r); }
void int2fr_o(U32 opcode) { if (R1) INT2F(F,o,r); }
void int2dr_b(U32 opcode) { if (R1) INT2F(D,b,r); }
void int2dr_d(U32 opcode) { if (R1) INT2F(D,d,r); }
void int2dr_q(U32 opcode) { if (R1) INT2F(D,q,r); }
void int2dr_o(U32 opcode) { if (R1) INT2F(D,o,r); }
void int2ft_b(U32 opcode) { if (R1) INT2F(F,b,t); }
void int2ft_d(U32 opcode) { if (R1) INT2F(F,d,t); }
void int2ft_q(U32 opcode) { if (R1) INT2F(F,q,t); }
void int2ft_o(U32 opcode) { if (R1) INT2F(F,o,t); }
void int2dt_b(U32 opcode) { if (R1) INT2F(D,b,t); }
void int2dt_d(U32 opcode) { if (R1) INT2F(D,d,t); }
void int2dt_q(U32 opcode) { if (R1) INT2F(D,q,t); }
void int2dt_o(U32 opcode) { if (R1) INT2F(D,o,t); }
void int2ff_b(U32 opcode) { if (R1) INT2F(F,b,f); }
void int2ff_d(U32 opcode) { if (R1) INT2F(F,d,f); }
void int2ff_q(U32 opcode) { if (R1) INT2F(F,q,f); }
void int2ff_o(U32 opcode) { if (R1) INT2F(F,o,f); }
void int2df_b(U32 opcode) { if (R1) INT2F(D,b,f); }
void int2df_d(U32 opcode) { if (R1) INT2F(D,d,f); }
void int2df_q(U32 opcode) { if (R1) INT2F(D,q,f); }
void int2df_o(U32 opcode) { if (R1) INT2F(D,o,f); }
void int2fc_b(U32 opcode) { if (R1) INT2F(F,b,c); }
void int2fc_d(U32 opcode) { if (R1) INT2F(F,d,c); }
void int2fc_q(U32 opcode) { if (R1) INT2F(F,q,c); }
void int2fc_o(U32 opcode) { if (R1) INT2F(F,o,c); }
void int2dc_b(U32 opcode) { if (R1) INT2F(D,b,c); }
void int2dc_d(U32 opcode) { if (R1) INT2F(D,d,c); }
void int2dc_q(U32 opcode) { if (R1) INT2F(D,q,c); }
void int2dc_o(U32 opcode) { if (R1) INT2F(D,o,c); }

/* simulated approximation table lookup */
/* XXX: depends heavily on host CPU's internal FP representation! */
/*
 * aprx -- coarsen x by editing its 64-bit image: the top 15 bits are
 * kept, the bit just below them is set, and everything else is cleared.
 * Assuming an IEEE-754 binary64 layout, that keeps sign, exponent and the
 * leading 3 mantissa bits and moves the value to the middle of the
 * resulting interval.  Only built for hosts listed in the #if below.
 */
double aprx(double x) {
#if __i386__ || __sparc__ || __fcpu__
	union {
		double d;
		U64 n;
	} image;

	image.d = x;
	/* truncate mantissa to 3 bits -> 16 table entries */
	image.n = (image.n & 0xfffe000000000000ull)
			| 0x0001000000000000ull;	/* move to middle of interval */
	return image.d;
#else
#error unknown FP representation; please modify aprx().
#endif
}

/*
 * FIAPRX: approximate reciprocal.  Each result chunk is 1.0 divided by
 * aprx() of the R2 chunk.  A zero operand raises EX_ZERO and abandons the
 * instruction before any register is written.  R1 is stored only when
 * its register number is non-zero.
 */
#define FIAPRX(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (!r(R2).C(sz,k)) { \
				ex(EX_ZERO); \
				return; \
			} \
			dst.C(sz,k) = 1.0 / aprx(r(R2).C(sz,k)); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
	} while (0)

GENF4(fiaprx,def_f2,FIAPRX)

/*
 * FSQRTIAPRX: approximate reciprocal square root.  Each result chunk is
 * 1.0 / sqrt(aprx(R2 chunk)).  A negative operand raises EX_RANGE and
 * abandons the instruction before any register is written.  R1 is stored
 * only when its register number is non-zero.
 */
#define FSQRTIAPRX(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) < 0.0) { \
				ex(EX_RANGE); \
				return; \
			} \
			dst.C(sz,k) = 1.0 / sqrt(aprx(r(R2).C(sz,k))); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
	} while (0)

GENF4(fsqrtiaprx,def_f2,FSQRTIAPRX)

/* XXX: fcmp* is Broken As Designed (tm), skipping... */

/* fdiv: FP divide built from the DIV body macro; def_f2 adds no R1 guard. */
GENF4(fdiv,def_f2,DIV)

/*
 * FSQRT: square root of each R2 chunk.  A negative operand raises
 * EX_RANGE and abandons the instruction before any register is written.
 * R1 is stored only when its register number is non-zero.
 */
#define FSQRT(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) < 0.0) { \
				ex(EX_RANGE); \
				return; \
			} \
			dst.C(sz,k) = sqrt(r(R2).C(sz,k)); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
	} while (0)

GENF4(fsqrt,def_f2,FSQRT)

#define log2(x) (log(x)/log(2.0))	/* binary logarithm */
/*
 * FLOG: binary logarithm of each R2 chunk.  When the R3 register number
 * is non-zero the logarithm is divided by the matching R3 chunk.
 * Operands <= 0 raise EX_RANGE and a zero divisor raises EX_ZERO; either
 * abandons the instruction before any register is written.  R1 is stored
 * only when its register number is non-zero.
 */
#define FLOG(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (r(R2).C(sz,k) <= 0.0) { \
				ex(EX_RANGE); \
				return; \
			} \
			if (R3) { \
				if (r(R3).C(sz,k) == 0.0) { \
					ex(EX_ZERO); \
					return; \
				} \
				dst.C(sz,k) = log2(r(R2).C(sz,k)) / r(R3).C(sz,k); \
			} \
			else \
				dst.C(sz,k) = log2(r(R2).C(sz,k)); \
			if (!simd) break; \
		} \
		if (R1) r(R1) = dst; \
	} while (0)

GENF4(flog,def_f2,FLOG)

#define exp2(x) (exp((x)*log(2.0))) /* 2.0**x */
/*
 * FEXP: 2 raised to each R2 chunk.  When the R3 register number is
 * non-zero the exponent is first multiplied by the matching R3 chunk.
 */
#define FEXP(sz,simd) \
	do { \
		unsigned k; \
		union reg dst = reginit(R1); \
		for_all_chunks(k,sz) { \
			if (R3) \
				dst.C(sz,k) = exp2(r(R2).C(sz,k) * r(R3).C(sz,k)); \
			else \
				dst.C(sz,k) = exp2(r(R2).C(sz,k)); \
			if (!simd) break; \
		} \
		r(R1) = dst; \
	} while (0)

GENF4(fexp,def_f1,FEXP)

/* fmac: FP variants of the AMAC body macro, guarded by def_f1 (skipped when R1 is zero). */
GENF4(fmac,def_f1,AMAC)

/* faddsub: FP variants of the ADDSUB body macro; def_f2 adds no R1 guard. */
GENF4(faddsub,def_f2,ADDSUB)

/* load/store */

/*
 * LOAD: fill the first BYTES(sz) byte chunks of R1 with lsu_read()
 * results for consecutive addresses starting at chunk o/0 of R2.
 * Returns at once if an exception is already pending.  When the R3 and
 * R2 register numbers are both non-zero, R2 is advanced by R3 before the
 * reads, which still use the address captured beforehand.
 */
/* Note: R1 is updated *after* R2 */
#define LOAD(sz,ignore) \
	do { \
		union reg dst = reginit(R1); \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		if (R3 && R2) r(R2).C(o,0) += r(R3).C(o,0); \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,k) = lsu_read(ea + k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GEN4(load,def_2,LOAD)

/*
 * LOADE: like LOAD, but the bytes are stored in reverse order: the value
 * read at address ea + k lands in byte chunk BYTES(sz)-1-k of R1.
 */
/* Note: R1 is updated *after* R2 */
#define LOADE(sz,ignore) \
	do { \
		union reg dst = reginit(R1); \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		if (R3 && R2) r(R2).C(o,0) += r(R3).C(o,0); \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,BYTES(sz)-1-k) = lsu_read(ea + k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GEN4(loade,def_2,LOADE)

/*
 * LOADI: like LOAD, but R2 is advanced by the immediate SIMM9 (when the
 * R2 register number is non-zero) instead of by R3.
 */
/* Note: R1 is updated *after* R2 */
#define LOADI(sz,ignore) \
	do { \
		union reg dst = reginit(R1); \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		if (R2) r(R2).C(o,0) += SIMM9; \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,k) = lsu_read(ea + k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GEN4(loadi,def_2,LOADI)

/*
 * LOADIE: LOADI with reversed byte placement: the value read at address
 * ea + k lands in byte chunk BYTES(sz)-1-k of R1.
 */
/* Note: R1 is updated *after* R2 */
#define LOADIE(sz,ignore) \
	do { \
		union reg dst = reginit(R1); \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		if (R2) r(R2).C(o,0) += SIMM9; \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,BYTES(sz)-1-k) = lsu_read(ea + k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GEN4(loadie,def_2,LOADIE)

/*
 * STORE: pass the first BYTES(sz) byte chunks of R1 to lsu_write() for
 * consecutive addresses starting at chunk o/0 of R2.  Returns at once if
 * an exception is already pending.  Afterwards R2 is advanced by R3 when
 * both register numbers are non-zero.
 */
/* Note: R2 is updated *after* R1 was written */
#define STORE(sz,ignore) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,k)); \
		if (R3 && R2) r(R2).C(o,0) += r(R3).C(o,0); \
	} while (0)

GEN4(store,def_2,STORE)

/*
 * STOREE: like STORE, but the bytes go out in reverse order: address
 * ea + k receives byte chunk BYTES(sz)-1-k of R1.
 */
/* Note: R2 is updated *after* R1 was written */
#define STOREE(sz,ignore) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,BYTES(sz)-1-k)); \
		if (R3 && R2) r(R2).C(o,0) += r(R3).C(o,0); \
	} while (0)

GEN4(storee,def_2,STOREE)

/*
 * STOREI: like STORE, but afterwards R2 is advanced by the immediate
 * SIMM9 (when the R2 register number is non-zero) instead of by R3.
 */
/* Note: R2 is updated *after* R1 was written */
#define STOREI(sz,ignore) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,k)); \
		if (R2) r(R2).C(o,0) += SIMM9; \
	} while (0)

GEN4(storei,def_2,STOREI)

/*
 * STOREIE: STOREI with reversed byte order: address ea + k receives byte
 * chunk BYTES(sz)-1-k of R1.
 */
/* Note: R2 is updated *after* R1 was written */
#define STOREIE(sz,ignore) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,BYTES(sz)-1-k)); \
		if (R2) r(R2).C(o,0) += SIMM9; \
	} while (0)

GEN4(storeie,def_2,STOREIE)

/* XXX: verify semantics */
/*
 * CSTORE: conditional store.  Nothing happens unless cond(R3) holds.
 * Otherwise, provided no exception is pending, the first BYTES(sz) byte
 * chunks of R1 are passed to lsu_write() for consecutive addresses
 * starting at chunk o/0 of R2.  R2 is not modified.
 */
#define CSTORE(sz,cond) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (!cond(R3)) break; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,k)); \
	} while (0)

/*
 * Wrapper styles for conditional, sized instructions.  Both define a
 * function name_<sz>; def_cc_1 runs the body only when R1 is non-zero,
 * def_cc_2 always runs it.
 */
#define def_cc_1(name,sz,fun,cond) \
	void name##_##sz(U32 opcode) { if (R1) fun(sz,cond); }
#define def_cc_2(name,sz,fun,cond) \
	void name##_##sz(U32 opcode) { fun(sz,cond); }

/* Apply `style' once per size suffix b, d, q and o. */
#define def_cc_sz(name,style,fun,cond) \
	style(name,b,fun,cond) \
	style(name,d,fun,cond) \
	style(name,q,fun,cond) \
	style(name,o,fun,cond)

/*
 * GENCCSZ instantiates all eight condition variants of an instruction:
 * suffixes z, n, l, m use cc_zero, cc_nan, cc_lsb, cc_msb, and the
 * nz, nn, nl, nm suffixes use the negated predicates.
 */
#define GENCCSZ(name,style,fun) \
	def_cc_sz(name##z,style,fun,cc_zero) \
	def_cc_sz(name##n,style,fun,cc_nan) \
	def_cc_sz(name##l,style,fun,cc_lsb) \
	def_cc_sz(name##m,style,fun,cc_msb) \
	def_cc_sz(name##nz,style,fun,!cc_zero) \
	def_cc_sz(name##nn,style,fun,!cc_nan) \
	def_cc_sz(name##nl,style,fun,!cc_lsb) \
	def_cc_sz(name##nm,style,fun,!cc_msb)

GENCCSZ(cstore,def_cc_2,CSTORE)

/* XXX: verify semantics */
/*
 * CSTOREE: conditional store with reversed byte order.  When cond(R3)
 * holds and no exception is pending, address ea + k receives byte chunk
 * BYTES(sz)-1-k of R1.  R2 is not modified.
 */
#define CSTOREE(sz,cond) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		unsigned k; \
		if (!cond(R3)) break; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			lsu_write(ea + k, R2, r(R1).C(b,BYTES(sz)-1-k)); \
	} while (0)

GENCCSZ(cstoree,def_cc_2,CSTOREE)

/* XXX: verify semantics */
/*
 * CLOAD: conditional load.  Nothing happens unless cond(R3) holds.
 * Otherwise, provided no exception is pending, the first BYTES(sz) byte
 * chunks of R1 are filled with lsu_read() results for consecutive
 * addresses starting at chunk o/0 of R2.  R1 is stored only when its
 * register number is non-zero; R2 is not modified.
 */
#define CLOAD(sz,cond) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		union reg dst = reginit(R1); \
		unsigned k; \
		if (!cond(R3)) break; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,k) = lsu_read(ea+k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GENCCSZ(cload,def_cc_2,CLOAD)

/* XXX: verify semantics */
/*
 * CLOADE: conditional load with reversed byte placement: the value read
 * at address ea + k lands in byte chunk BYTES(sz)-1-k of R1.  Otherwise
 * identical to CLOAD.
 */
#define CLOADE(sz,cond) \
	do { \
		const U64 ea = r(R2).C(o,0); \
		union reg dst = reginit(R1); \
		unsigned k; \
		if (!cond(R3)) break; \
		if (excode) return; \
		for (k = 0; k < BYTES(sz); k++) \
			dst.C(b,BYTES(sz)-1-k) = lsu_read(ea+k, R2); \
		if (R1) r(R1) = dst; \
	} while (0)

GENCCSZ(cloade,def_cc_2,CLOADE)

/* XXX: ss/sl is missing */

/* XXX: cachemm is deprecated */

/* XXX: manual differs */
/*
 * MOVE: conditional move.  When cond(R3) holds, chunk 0 (of size sz) of
 * R2 replaces chunk 0 of the reginit(R1) image, which is then stored in
 * R1.  When the condition fails R1 is left untouched.
 */
#define MOVE(sz,cond) \
	do { \
		union reg dst = reginit(R1); \
		if (!cond(R3)) break; \
		dst.C(sz,0) = r(R2).C(sz,0); \
		r(R1) = dst; \
	} while (0)

GENCCSZ(move,def_cc_1,MOVE)

/* XXX: undocumented */
/* XXX: make it conditional (that is, `moves{cc}')? */
/*
 * WIDEN: sign-extend chunk 0 (size sz) of R2 to the whole register.
 * Signed o-chunk 0 receives the value; every further o-chunk receives
 * that chunk shifted right by BITS(o)-1, i.e. copies of its sign.
 */
#define WIDEN(sz,ignore) \
	do { \
		union reg wide; \
		unsigned k; \
		wide.SC(o,0) = r(R2).SC(sz,0); \
		for (k = 1; k < CHUNKS(o); k++) \
			wide.SC(o,k) = wide.SC(o,0) >> (BITS(o)-1); \
		r(R1) = wide; \
	} while (0)

GEN4(widen,def_1,WIDEN)

/* loadcons */

/*
 * LOADCONS: write the immediate UIMM16 into d-chunk number `slot' of R1,
 * leaving the rest of the register as it was.
 */
/* Note: partial write */
#define LOADCONS(slot,ignore) \
	do { \
		r(R1).C(d,slot) = UIMM16; \
	} while (0)

/* Instantiate `fun' through `style' for d-chunk positions 0 to 3. */
#define GENLC(name,style,fun) \
	style(name,0,dummy,fun) \
	style(name,1,dummy,fun) \
	style(name,2,dummy,fun) \
	style(name,3,dummy,fun)

GENLC(loadcons,def_1,LOADCONS)

/* Note: partial write */
/* XXX: sign-extend how much? */
/*
 * LOADCONSX: write SIMM16 into d-chunk number `slot' of R1 and fill the
 * d-chunks above it, up to index 3, with SIMM16 >> 15 (its sign).
 * Chunks below `slot' are left as they were.
 */
#define LOADCONSX(slot,ignore) \
	do { \
		unsigned k; \
		r(R1).C(d,slot) = SIMM16; \
		for (k = slot + 1; k < 4; k++) \
			r(R1).C(d,k) = SIMM16 >> 15; \
	} while (0)

GENLC(loadconsx,def_1,LOADCONSX)

/* destination selector: bit 23 of the opcode */
#define DATA	((opcode >> 23) & 001)

/* XXX: assumes the PC has already been advanced */
/* XXX: handle upper chunks?! */
/*
 * LOADADDR: R1 = PC with chunk o/0 of R2 added to its chunk o/0.  The new
 * address is then reported to lsu_loadaddr() when DATA is set, or to
 * pf_loadaddr() otherwise.
 */
#define LOADADDR() \
	do { \
		union reg target = regs.r_pc; \
		target.C(o,0) += r(R2).C(o,0); \
		r(R1) = target; \
		if (DATA) { \
			lsu_loadaddr(R1, target.C(o,0)); \
		} else { \
			pf_loadaddr(R1, target.C(o,0)); \
		} \
	} while (0)

/* Both entry points share the body; DATA tells them apart at run time. */
void loadaddr (U32 opcode) {
	if (R1)
		LOADADDR();
}
void loadaddrd(U32 opcode) {
	if (R1)
		LOADADDR();
}

/* XXX: assumes the PC has already been advanced */
/* XXX: handle upper chunks?! */
/*
 * LOADADDRI: R1 = PC with the immediate SIMM17 added to its chunk o/0.
 * The new address is then reported to lsu_loadaddr() when DATA is set, or
 * to pf_loadaddr() otherwise.
 */
#define LOADADDRI() \
	do { \
		union reg target = regs.r_pc; \
		target.C(o,0) += SIMM17; \
		r(R1) = target; \
		if (DATA) { \
			lsu_loadaddr(R1, target.C(o,0)); \
		} else { \
			pf_loadaddr(R1, target.C(o,0)); \
		} \
	} while (0)

/* Both entry points share the body; DATA tells them apart at run time. */
void loadaddri (U32 opcode) {
	if (R1)
		LOADADDRI();
}
void loadaddrid(U32 opcode) {
	if (R1)
		LOADADDRI();
}

/* XXX: get[i]/put[i] follow at the end */

/* XXX: differs from manual */
/* XXX: may segfault if it crosses a page boundary */
/* Note: loadm/storem use host default endian! */
/*
 * loadm -- load the register range R3..R1 (inclusive) from memory at the
 * address in chunk o/0 of R2.  Raises EX_INVALID when R3 > R1.  Returns
 * without copying if memmap() left an exception pending.  Register 0 is
 * cleared again if it was part of the range.
 */
void loadm(U32 opcode) {
	size_t nbytes;
	void *src;

	if (R3 > R1) {
		ex(EX_INVALID);
		return;
	}
	nbytes = (R1 - R3 + 1) * sizeof(union reg);
	src = memmap(r(R2).C(o,0), BYTES(o), nbytes, 0);
	if (excode) return;
	memcpy(&r(R3), src, nbytes);
	/* r0 may have changed! */
	if (!R3) memset(&r(0), 0, sizeof(union reg));
}

/* XXX: differs from manual */
/* XXX: may segfault if it crosses a page boundary */
/*
 * storem -- store the register range R3..R1 (inclusive) to memory at the
 * address in chunk o/0 of R2.  Raises EX_INVALID when R3 > R1.  Returns
 * without copying if memmap() left an exception pending.
 */
void storem(U32 opcode) {
	size_t nbytes;
	void *dst;

	if (R3 > R1) {
		ex(EX_INVALID);
		return;
	}
	nbytes = (R1 - R3 + 1) * sizeof(union reg);
	dst = memmap(r(R2).C(o,0), BYTES(o), nbytes, 1);
	if (excode) return;
	memcpy(dst, &r(R3), nbytes);
}

/* XXX: left-shift immediate operand? */
/* nop -- adds the low 24 bits of the opcode to chunk o/0 of the PC. */
void nop(U32 opcode) {
	const U32 imm24 = opcode & 0xffffff;

	regs.r_pc.C(o,0) += imm24;
}

/* XXX: assumes the PC has already been advanced */
/*
 * JMP: conditional jump with link.  When cond(R3) holds, the current PC
 * is saved in R1 (if its register number is non-zero), the PC is replaced
 * by the value R2 had on entry, and pf_doJmp(R2) is called.
 */
/* Note: R1 and R2 might be the same! */
#define JMP(cond) \
	do { \
		union reg target = r(R2);	/* snapshot before R1 is written */ \
		if (!cond(R3)) break; \
		if (R1) r(R1) = regs.r_pc; \
		regs.r_pc = target; \
		pf_doJmp(R2); \
	} while (0)

/* Wrapper style for conditional instructions without a size suffix. */
#define def_cc_3(name,fun,cond) \
	void name(U32 opcode) { fun(cond); }

/*
 * GENCC instantiates all eight condition variants of an instruction:
 * suffixes z, n, l, m use cc_zero, cc_nan, cc_lsb, cc_msb, and the
 * nz, nn, nl, nm suffixes use the negated predicates.
 */
#define GENCC(name,style,fun) \
	style(name##z,fun,cc_zero) \
	style(name##n,fun,cc_nan) \
	style(name##l,fun,cc_lsb) \
	style(name##m,fun,cc_msb) \
	style(name##nz,fun,!cc_zero) \
	style(name##nn,fun,!cc_nan) \
	style(name##nl,fun,!cc_lsb) \
	style(name##nm,fun,!cc_msb)

GENCC(jmp,def_cc_3,JMP)

/* XXX: no size flags? */
/* XXX: handle upper bits?! */
/*
 * loop -- counted branch.  If R1 is not zero (per cc_zero), the PC is set
 * to R2 and pf_doJmp(R2) is called.  Afterwards chunk o/0 of R1 is
 * decremented whenever the R1 register number is non-zero, whether or not
 * the branch was taken.
 */
void loop(U32 opcode) {
	const int taken = !cc_zero(R1);

	if (taken) {
		regs.r_pc = r(R2);
		pf_doJmp(R2);
	}
	if (R1)
		r(R1).C(o,0) -= 1;
}

/*
 * hooks for emulator main program
 *
 * Both start out NULL.  emu_syscall() and trap() call the matching hook
 * first when it is set; a non-zero return suppresses their built-in
 * dispatch through the SR_*_BASE/SR_*_SIZE special registers.
 */
int (*syscall_hook)(U32 opcode) = NULL;
int (*trap_hook)(U32 opcode) = NULL;

/* provide simple I/O via syscall */
/*
 * emu_syscall -- give syscall_hook the first chance.  If there is no hook
 * or it returns 0, the opcode bits selected by mask 017777700 form an
 * offset: offsets at or beyond SR_SYSCALL_SIZE raise EX_NOHAND, otherwise
 * the PC becomes SR_SYSCALL_BASE + offset.  Finally, if no exception is
 * pending and R1 is non-zero, register R1 is cleared.
 */
void emu_syscall(U32 opcode) {
	const int handled = syscall_hook && syscall_hook(opcode);

	if (!handled) {
		const U64 off = opcode & 017777700;
		const U64 limit = sregs[SR_SYSCALL_SIZE].u.v;
		const U64 origin = sregs[SR_SYSCALL_BASE].u.v;

		if (off >= limit) {
			ex(EX_NOHAND);
			return;
		}
		regs.r_pc.C(o,0) = origin + off;
		pf_updatePC();
		/* XXX: how can we return? */
	}
	if (excode == EX_NONE && R1) {
		memset(&r(R1), 0, sizeof(union reg));
	}
}

/*
 * trap -- give trap_hook the first chance.  If there is no hook or it
 * returns 0, the opcode bits selected by mask 017777700 form an offset:
 * offsets at or beyond SR_TRAP_SIZE raise EX_NOHAND, otherwise the PC
 * becomes SR_TRAP_BASE + offset.  Finally, if no exception is pending and
 * R1 is non-zero, register R1 is cleared.
 */
void trap(U32 opcode) {
	const int handled = trap_hook && trap_hook(opcode);

	if (!handled) {
		const U64 off = opcode & 017777700;
		const U64 limit = sregs[SR_TRAP_SIZE].u.v;
		const U64 origin = sregs[SR_TRAP_BASE].u.v;

		if (off >= limit) {
			ex(EX_NOHAND);
			return;
		}
		regs.r_pc.C(o,0) = origin + off;
		pf_updatePC();
		/* XXX: perform SRB! */
	}
	if (excode == EX_NONE && R1) {
		memset(&r(R1), 0, sizeof(union reg));
	}
}

/* halt -- raise EX_HALT. */
void halt(U32 opcode) {
	ex(EX_HALT);
}

/* XXX: not implemented yet */
/* rfe -- placeholder: always raises EX_INVALID. */
void rfe(U32 opcode)
{
	ex(EX_INVALID);
}

/* srb_save -- no-op during emulation */
void srb_save(U32 opcode) {
}

/* srb_restore -- no-op during emulation */
void srb_restore(U32 opcode) {
}

/* serialize -- no-op during emulation */
void serialize(U32 opcode) {
}

/*
 * Special register file, indexed by the SR_* constants.
 *
 * Each entry is initialised with two access-permission values followed by
 * the initial contents (a numeric value, or a string fragment via .s).
 * get()/put() below check the p_super field against _SR_RD/_SR_WR and
 * transfer the numeric view u.v.
 * NOTE(review): which of the two permission values is p_super is not
 * visible here -- confirm against the struct sreg declaration.
 *
 * The SR_URL text occupies SR_URL and the seven entries after it, in
 * pieces of at most eight characters each.
 */
struct sreg sregs[SR_LAST_SR] = {
	[SR_NUMBERS]		= { _SR_RD, _SR_RD, { SR_LAST_SR } },
	[SR_FAMILY]			= { _SR_RD, _SR_RD, { 0xfc0 } },
	[SR_STEPPING]		= { _SR_RD, _SR_RD, { 0 } },
	[SR_MAX_SIZE]		= { _SR_RD, _SR_RD, { LOG_MAXSIZE } },
	[SR_SIZE_0]			= { _SR_RD, _SR_RD, { 0 } },
	[SR_SIZE_1]			= { _SR_RD, _SR_RD, { 1 } },
	[SR_SIZE_2]			= { _SR_RD, _SR_RD, { 2 } },
	[SR_SIZE_3]			= { _SR_RD, _SR_RD, { 3 } },
	[SR_MAX_CHUNK_SIZE]	= { _SR_RD, _SR_RD, { 3 } },
	[SR_CYCLE]			= { _SR_RD, _SR_RD, { 0 } },	/* volatile */
	[SR_PAGING]			= { _SR_RW, _SR_RD, { 0 } },
	[SR_CONTROL]		= { _SR_RW, _SR_RD, { 0 } },
	[SR_IRQ_BASE]		= { _SR_RW, _SR_RD, { 0 } },
	[SR_IRQ_SIZE]		= { _SR_RW, _SR_RD, { 0 } },
	[SR_TRAP_BASE]		= { _SR_RW, _SR_RD, { 0 } },
	[SR_TRAP_SIZE]		= { _SR_RW, _SR_RD, { 0 } },
	[SR_SYSCALL_BASE]	= { _SR_RW, _SR_RD, { 0 } },
	[SR_SYSCALL_SIZE]	= { _SR_RW, _SR_RD, { 0 } },
	[SR_TLBMISS_BASE]	= { _SR_RW, _SR_RD, { 0 } },
	[SR_URL]			= { _SR_RD, _SR_RD, { .s = "http://w" } },
						  { _SR_RD, _SR_RD, { .s = "ww.f-cpu" } },
						  { _SR_RD, _SR_RD, { .s = ".org" } },
						  { _SR_RD, _SR_RD, { .s = "" } },
						  { _SR_RD, _SR_RD, { .s = "" } },
						  { _SR_RD, _SR_RD, { .s = "" } },
						  { _SR_RD, _SR_RD, { .s = "" } },
						  { _SR_RD, _SR_RD, { .s = "" } },
};

/* XXX: currently, we always are superuser */

/*
 * get -- read a special register into R1.
 * The register index is chunk o/0 of R2.  Raises EX_INVALID when the
 * index is not below SR_LAST_SR and EX_ACCESS when the p_super field of
 * the entry lacks _SR_RD; in both cases no register is modified.
 * On success the value goes into chunk o/0 of the reginit(R1) image,
 * which is stored in R1 unless the R1 register number is zero.
 */
void get(U32 opcode) {
	const U64 n = r(R2).C(o,0);
	union reg r1 = reginit(R1);

	if (n >= SR_LAST_SR) {
		ex(EX_INVALID);
		return;
	}
	if (!(sregs[n].p_super & _SR_RD)) {
		ex(EX_ACCESS);
		return;
	}
	/*
	 * Stage the value in the local image.  Writing r(1) here would
	 * clobber architectural register 1 and leave the value out of R1.
	 */
	r1.C(o,0) = sregs[n].u.v;
	if (R1) r(R1) = r1;
}

/*
 * geti -- read a special register into R1, index given as immediate.
 * The register index is UIMM16.  Raises EX_INVALID when the index is not
 * below SR_LAST_SR and EX_ACCESS when the p_super field of the entry
 * lacks _SR_RD; in both cases no register is modified.
 * On success the value goes into chunk o/0 of the reginit(R1) image,
 * which is stored in R1 unless the R1 register number is zero.
 */
void geti(U32 opcode) {
	const U64 n = UIMM16;
	union reg r1 = reginit(R1);

	if (n >= SR_LAST_SR) {
		ex(EX_INVALID);
		return;
	}
	if (!(sregs[n].p_super & _SR_RD)) {
		ex(EX_ACCESS);
		return;
	}
	/*
	 * Stage the value in the local image.  Writing r(1) here would
	 * clobber architectural register 1 and leave the value out of R1.
	 */
	r1.C(o,0) = sregs[n].u.v;
	if (R1) r(R1) = r1;
}

/*
 * put -- write chunk o/0 of R1 to the special register whose index is in
 * chunk o/0 of R2.  Raises EX_INVALID when the index is not below
 * SR_LAST_SR and EX_ACCESS when the p_super field of the entry lacks
 * _SR_WR.
 */
void put(U32 opcode) {
	const U64 idx = r(R2).C(o,0);

	if (idx >= SR_LAST_SR) {
		ex(EX_INVALID);
	}
	else if (!(sregs[idx].p_super & _SR_WR)) {
		ex(EX_ACCESS);
	}
	else {
		sregs[idx].u.v = r(R1).C(o,0);
	}
}

/*
 * puti -- write chunk o/0 of R1 to the special register numbered UIMM16.
 * Raises EX_INVALID when the index is not below SR_LAST_SR and EX_ACCESS
 * when the p_super field of the entry lacks _SR_WR.
 */
void puti(U32 opcode) {
	const U64 idx = UIMM16;

	if (idx >= SR_LAST_SR) {
		ex(EX_INVALID);
	}
	else if (!(sregs[idx].p_super & _SR_WR)) {
		ex(EX_ACCESS);
	}
	else {
		sregs[idx].u.v = r(R1).C(o,0);
	}
}

/* emulator main */

/*
 * One row of the instruction dispatch table.
 */
struct insn {
	U32 code;	/* opcode pattern; also the key compared by insncmp() */
	U32 mask;	/* NOTE(review): presumably the operand bits to ignore when matching -- confirm */
	void (*func[LOG_MAXSIZE + 1])(U32);	/* handlers; filled via F1/F2/F4 (one, _f/_d, or _b/_d/_q/_o) */
};

/*
 * qsort() comparison callback: orders dispatch table entries by their
 * `code' field.
 *
 * The parameters are declared as `const void *' so that the function
 * has exactly the type qsort() calls it through; calling a function via
 * a pointer to an incompatible function type is undefined behaviour.
 *
 * Two distinct entries with the same code indicate a broken table; both
 * are reported on stderr and the program is aborted.
 */
static int
insncmp(const void *a, const void *b) {
	const struct insn *lhs = a;
	const struct insn *rhs = b;

	/* an element compared with itself is not a duplicate */
	if (lhs == rhs) return 0;
	if (lhs->code < rhs->code) return -1;
	if (lhs->code > rhs->code) return 1;
	/* This should never happen... */
	fprintf(stderr, "duplicate insn table entry:\n");
	fprintf(stderr, "  0x%08x : 0x%08x\n", lhs->code, lhs->mask);
	fprintf(stderr, "  0x%08x : 0x%08x\n", rhs->code, rhs->mask);
	abort();
}

/*
 * Shorthands used only while building insn_table; they are #undef'd
 * again right after the table.
 *
 *  X(op)  opcode number OP_<op> placed in bits 24..31 of the code word.
 *         The shift is done in U32: shifting a plain int left by 24
 *         overflows for any opcode number >= 128.
 *  F1(x)  handler list for an instruction without size flags
 *  F2(x)  handler list { x_f, x_d } (size indices 0 and 1)
 *  F4(x)  handler list { x_b, x_d, x_q, x_o } (size indices 0..3)
 */
#define X(op)	((U32)(OP_##op)<<24)
#define F1(x)	{ x }
#define F2(x)	{ x##_f, x##_d }
#define F4(x)	{ x##_b, x##_d, x##_q, x##_o }

/*
 * Instruction dispatch table
 *
 * Each entry is { code, mask, handlers }:
 *   code      opcode number (X()) ORed with the flag bits that select
 *             this particular variant of the instruction
 *   mask      (octal) the low-order operand bits; emulate1() accepts
 *             every opcode in the range code .. (code | mask)
 *   handlers  F1(): a single function; F2()/F4(): one function per
 *             operand size, chosen through opcode bits 22..23
 *
 * The order of the entries below is not significant: sort_table() sorts
 * the table by `code' before first use (which is why it is not const).
 * Every entry must have a unique code -- insncmp() aborts the program
 * on duplicates.
 */
static struct insn insn_table[] = {
	/* 24-bit immediate arg */
	{ X(NOP),                                           077777777, F1(nop) },
	{ X(SERIALIZE),                                     077777777, F1(serialize) },
	/* imm17, r1 */
	{ X(LOADADDRI),                                     037777777, F1(loadaddri) },
	{ X(LOADADDRI) | LOADADDR_DATA,                     037777777, F1(loadaddrid) },
	/* imm16, r1 */
	{ X(LOADCONS) | (0 << 22),                          017777777, F1(loadcons_0) },
	{ X(LOADCONS) | (1 << 22),                          017777777, F1(loadcons_1) },
	{ X(LOADCONS) | (2 << 22),                          017777777, F1(loadcons_2) },
	{ X(LOADCONS) | (3 << 22),                          017777777, F1(loadcons_3) },
	{ X(LOADCONSX) | (0 << 22),                         017777777, F1(loadconsx_0) },
	{ X(LOADCONSX) | (1 << 22),                         017777777, F1(loadconsx_1) },
	{ X(LOADCONSX) | (2 << 22),                         017777777, F1(loadconsx_2) },
	{ X(LOADCONSX) | (3 << 22),                         017777777, F1(loadconsx_3) },
	{ X(GETI),                                          017777777, F1(geti) },
	{ X(PUTI),                                          017777777, F1(puti) },
	{ X(SYSCALL),                                       017777777, F1(emu_syscall) },
	{ X(SYSCALL) | SYSCALL_TRAP,                        017777777, F1(trap) },
	/* imm9, r2, r1 */
	{ X(ANDI),                                          07777777, F4(andi) },
	{ X(ANDI) | SIMD_FLAG,                              07777777, F4(sandi) },
	{ X(ANDNI),                                         07777777, F4(andni) },
	{ X(ANDNI) | SIMD_FLAG,                             07777777, F4(sandni) },
	{ X(NANDI),                                         07777777, F4(nandi) },
	{ X(NANDI) | SIMD_FLAG,                             07777777, F4(snandi) },
	{ X(NORI),                                          07777777, F4(nori) },
	{ X(NORI) | SIMD_FLAG,                              07777777, F4(snori) },
	{ X(ORI),                                           07777777, F4(ori) },
	{ X(ORI) | SIMD_FLAG,                               07777777, F4(sori) },
	{ X(ORNI),                                          07777777, F4(orni) },
	{ X(ORNI) | SIMD_FLAG,                              07777777, F4(sorni) },
	{ X(XNORI),                                         07777777, F4(xnori) },
	{ X(XNORI) | SIMD_FLAG,                             07777777, F4(sxnori) },
	{ X(XORI),                                          07777777, F4(xori) },
	{ X(XORI) | SIMD_FLAG,                              07777777, F4(sxori) },
	{ X(LOADI),                                         07777777, F4(loadi) },
	{ X(LOADI) | LS_BIG_ENDIAN,                         07777777, F4(loadie) },
	{ X(LOADIF),                                        07777777, F4(loadi) },
	{ X(LOADIF) | LS_BIG_ENDIAN,                        07777777, F4(loadie) },
	{ X(STOREI),                                        07777777, F4(storei) },
	{ X(STOREI) | LS_BIG_ENDIAN,                        07777777, F4(storeie) },
	{ X(STOREIF),                                       07777777, F4(storei) },
	{ X(STOREIF) | LS_BIG_ENDIAN,                       07777777, F4(storeie) },
	/* load/store r3, r2, r1 + 3 bit stream hints */
	{ X(LOAD),                                          07777777, F4(load) },
	{ X(LOAD) | LS_BIG_ENDIAN,                          07777777, F4(loade) },
	{ X(LOADF),                                         07777777, F4(load) },
	{ X(LOADF) | LS_BIG_ENDIAN,                         07777777, F4(loade) },
	{ X(STORE),                                         07777777, F4(store) },
	{ X(STORE) | LS_BIG_ENDIAN,                         07777777, F4(storee) },
	{ X(STOREF),                                        07777777, F4(store) },
	{ X(STOREF) | LS_BIG_ENDIAN,                        07777777, F4(storee) },
	/* imm8, r2, r1 */
	{ X(ADDI),                                          03777777, F4(addi) },
	{ X(ADDI) | SIMD_FLAG,                              03777777, F4(saddi) },
	{ X(CMPLEI),                                        03777777, F4(cmplei) },
	{ X(CMPLEI) | CMP_SIGNED,                           03777777, F4(cmplesi) },
	{ X(CMPLEI) | SIMD_FLAG,                            03777777, F4(scmplei) },
	{ X(CMPLEI) | SIMD_FLAG | CMP_SIGNED,               03777777, F4(scmplesi) },
	{ X(CMPGI),                                         03777777, F4(cmpgi) },
	{ X(CMPGI) | CMP_SIGNED,                            03777777, F4(cmpgsi) },
	{ X(CMPGI) | SIMD_FLAG,                             03777777, F4(scmpgi) },
	{ X(CMPGI) | SIMD_FLAG | CMP_SIGNED,                03777777, F4(scmpgsi) },
	{ X(DIVI),                                          03777777, F4(divi) },
	{ X(DIVI) | DIV_REMAINDER,                          03777777, F4(divremi) },
	{ X(DIVI) | SIMD_FLAG,                              03777777, F4(sdivi) },
	{ X(DIVI) | SIMD_FLAG | DIV_REMAINDER,              03777777, F4(sdivremi) },
	{ X(MAXI),                                          03777777, F4(maxi) },
	{ X(MAXI) | CMP_SIGNED,                             03777777, F4(maxsi) },
	{ X(MAXI) | SIMD_FLAG,                              03777777, F4(smaxi) },
	{ X(MAXI) | SIMD_FLAG | CMP_SIGNED,                 03777777, F4(smaxsi) },
	{ X(MINI),                                          03777777, F4(mini) },
	{ X(MINI) | CMP_SIGNED,                             03777777, F4(minsi) },
	{ X(MINI) | SIMD_FLAG,                              03777777, F4(smini) },
	{ X(MINI) | SIMD_FLAG | CMP_SIGNED,                 03777777, F4(sminsi) },
	{ X(MINMAXI),                                       03777777, F4(minmaxi) },
	{ X(MINMAXI) | CMP_SIGNED,                          03777777, F4(minmaxsi) },
	{ X(MINMAXI) | SIMD_FLAG,                           03777777, F4(sminmaxi) },
	{ X(MINMAXI) | SIMD_FLAG | CMP_SIGNED,              03777777, F4(sminmaxsi) },
	{ X(MULI),                                          03777777, F4(muli) },
	{ X(MULI) | SIMD_FLAG,                              03777777, F4(smuli) },
	{ X(REMI),                                          03777777, F4(remi) },
	{ X(REMI) | SIMD_FLAG,                              03777777, F4(sremi) },
	{ X(SUBI),                                          03777777, F4(subi) },
	{ X(SUBI) | SIMD_FLAG,                              03777777, F4(ssubi) },
	{ X(VSELI),                                         03777777, F4(vseli) },
	{ X(VSELI) | SIMD_FLAG,                             03777777, F4(svseli) },
	{ X(BCHGI),                                         03777777, F4(bchgi) },
	{ X(BCHGI) | SIMD_FLAG,                             03777777, F4(sbchgi) },
	{ X(BCLRI),                                         03777777, F4(bclri) },
	{ X(BCLRI) | SIMD_FLAG,                             03777777, F4(sbclri) },
	{ X(BITREVI),                                       03777777, F4(bitrevi) },
	{ X(BITREVI) | SIMD_FLAG,                           03777777, F4(sbitrevi) },
	{ X(BSETI),                                         03777777, F4(bseti) },
	{ X(BSETI) | SIMD_FLAG,                             03777777, F4(sbseti) },
	{ X(BTSTI),                                         03777777, F4(btsti) },
	{ X(BTSTI) | SIMD_FLAG,                             03777777, F4(sbtsti) },
	{ X(DBITREVI),                                      03777777, F4(dbitrevi) },
	{ X(DBITREVI) | SIMD_FLAG,                          03777777, F4(sdbitrevi) },
	{ X(DSHIFTLI),                                      03777777, F4(dshiftli) },
	{ X(DSHIFTLI) | SIMD_FLAG,                          03777777, F4(sdshiftli) },
	{ X(DSHIFTRAI),                                     03777777, F4(dshiftrai) },
	{ X(DSHIFTRAI) | SIMD_FLAG,                         03777777, F4(sdshiftrai) },
	{ X(DSHIFTRI),                                      03777777, F4(dshiftri) },
	{ X(DSHIFTRI) | SIMD_FLAG,                          03777777, F4(sdshiftri) },
	{ X(POPCI),                                         03777777, F4(popci) },
	{ X(POPCI) | SIMD_FLAG,                             03777777, F4(spopci) },
	{ X(ROTLI),                                         03777777, F4(rotli) },
	{ X(ROTLI) | SIMD_FLAG,                             03777777, F4(srotli) },
	{ X(ROTRI),                                         03777777, F4(rotri) },
	{ X(ROTRI) | SIMD_FLAG,                             03777777, F4(srotri) },
	{ X(SHIFTLI),                                       03777777, F4(shiftli) },
	{ X(SHIFTLI) | SIMD_FLAG,                           03777777, F4(sshiftli) },
	{ X(SHIFTRAI),                                      03777777, F4(shiftrai) },
	{ X(SHIFTRAI) | SIMD_FLAG,                          03777777, F4(sshiftrai) },
	{ X(SHIFTRI),                                       03777777, F4(shiftri) },
	{ X(SHIFTRI) | SIMD_FLAG,                           03777777, F4(sshiftri) },
	/* r3, r2, r1 */
	{ X(ADD) | ADD_MODE_NORMAL,                         0777777, F4(add) },
	{ X(ADD) | ADD_MODE_CARRY,                          0777777, F4(addc) },
	{ X(ADD) | ADD_MODE_SATURATE,                       0777777, F4(adds) },
	{ X(ADD) | SIMD_FLAG | ADD_MODE_NORMAL,             0777777, F4(sadd) },
	{ X(ADD) | SIMD_FLAG | ADD_MODE_CARRY,              0777777, F4(saddc) },
	{ X(ADD) | SIMD_FLAG | ADD_MODE_SATURATE,           0777777, F4(sadds) },
	{ X(ADDSUB),                                        0777777, F4(addsub) },
	{ X(ADDSUB) | SIMD_FLAG,                            0777777, F4(saddsub) },
	{ X(AMAC),                                          0777777, F4(amac) },
	{ X(AMAC) | MUL_SIGNED,                             0777777, F4(amac) },
	{ X(AMAC) | MUL_HIGH,                               0777777, F4(amach) },
	{ X(AMAC) | MUL_HIGH | MUL_SIGNED,                  0777777, F4(amachs) },
	{ X(AMAC) | SIMD_FLAG,                              0777777, F4(samac) },
	{ X(AMAC) | SIMD_FLAG | MUL_SIGNED,                 0777777, F4(samac) },
	{ X(AMAC) | SIMD_FLAG | MUL_HIGH,                   0777777, F4(samach) },
	{ X(AMAC) | SIMD_FLAG | MUL_HIGH | MUL_SIGNED,      0777777, F4(samachs) },
	{ X(AND) | ROP2_MODE_DIRECT,                        0777777, F4(and) },
	{ X(AND) | ROP2_MODE_AND,                           0777777, F4(and_and) },
	{ X(AND) | ROP2_MODE_OR,                            0777777, F4(and_or) },
	{ X(AND) | SIMD_FLAG | ROP2_MODE_DIRECT,            0777777, F4(sand) },
	{ X(AND) | SIMD_FLAG | ROP2_MODE_AND,               0777777, F4(sand_and) },
	{ X(AND) | SIMD_FLAG | ROP2_MODE_OR,                0777777, F4(sand_or) },
	{ X(ANDN) | ROP2_MODE_DIRECT,                       0777777, F4(andn) },
	{ X(ANDN) | ROP2_MODE_AND,                          0777777, F4(andn_and) },
	{ X(ANDN) | ROP2_MODE_OR,                           0777777, F4(andn_or) },
	{ X(ANDN) | SIMD_FLAG | ROP2_MODE_DIRECT,           0777777, F4(sandn) },
	{ X(ANDN) | SIMD_FLAG | ROP2_MODE_AND,              0777777, F4(sandn_and) },
	{ X(ANDN) | SIMD_FLAG | ROP2_MODE_OR,               0777777, F4(sandn_or) },
	{ X(BCHG),                                          0777777, F4(bchg) },
	{ X(BCHG) | SIMD_FLAG,                              0777777, F4(sbchg) },
	{ X(BCLR),                                          0777777, F4(bclr) },
	{ X(BCLR) | SIMD_FLAG,                              0777777, F4(sbclr) },
	{ X(BITREV),                                        0777777, F4(bitrev) },
	{ X(BITREV) | SIMD_FLAG,                            0777777, F4(sbitrev) },
	{ X(BITREV) | SIMD_FLAG | SHIFT_HALF_SIMD,          0777777, F4(sbitrevh) },
	{ X(BSET),                                          0777777, F4(bset) },
	{ X(BSET) | SIMD_FLAG,                              0777777, F4(sbset) },
	{ X(BTST),                                          0777777, F4(btst) },
	{ X(BTST) | SIMD_FLAG,                              0777777, F4(sbtst) },
	{ X(CMPG),                                          0777777, F4(cmpg) },
	{ X(CMPG) | CMP_SIGNED,                             0777777, F4(cmpgs) },
	{ X(CMPG) | SIMD_FLAG,                              0777777, F4(scmpg) },
	{ X(CMPG) | SIMD_FLAG | CMP_SIGNED,                 0777777, F4(scmpgs) },
	{ X(CMPLE),                                         0777777, F4(cmple) },
	{ X(CMPLE) | CMP_SIGNED,                            0777777, F4(cmples) },
	{ X(CMPLE) | SIMD_FLAG,                             0777777, F4(scmple) },
	{ X(CMPLE) | SIMD_FLAG | CMP_SIGNED,                0777777, F4(scmples) },
	{ X(CSHIFT) | CSHIFT_LEFT,                          0777777, F4(cshiftl) },
	{ X(CSHIFT) | CSHIFT_RIGHT,                         0777777, F4(cshiftr) },
	{ X(DBITREV),                                       0777777, F4(dbitrev) },
	{ X(DBITREV) | SIMD_FLAG,                           0777777, F4(sdbitrev) },
	{ X(DBITREV) | SIMD_FLAG | SHIFT_HALF_SIMD,         0777777, F4(sdbitrevh) },
	{ X(DIV),                                           0777777, F4(div) },
	{ X(DIV) | DIV_SIGNED,                              0777777, F4(divs) },
	{ X(DIV) | DIV_REMAINDER,                           0777777, F4(divrem) },
	{ X(DIV) | DIV_REMAINDER | DIV_SIGNED,              0777777, F4(divrems) },
	{ X(DIV) | SIMD_FLAG,                               0777777, F4(sdiv) },
	{ X(DIV) | SIMD_FLAG | DIV_SIGNED,                  0777777, F4(sdivs) },
	{ X(DIV) | SIMD_FLAG | DIV_REMAINDER,               0777777, F4(sdivrem) },
	{ X(DIV) | SIMD_FLAG | DIV_REMAINDER | DIV_SIGNED,  0777777, F4(sdivrems) },
	{ X(DSHIFTL),                                       0777777, F4(dshiftl) },
	{ X(DSHIFTL) | SIMD_FLAG,                           0777777, F4(sdshiftl) },
	{ X(DSHIFTL) | SIMD_FLAG | SHIFT_HALF_SIMD,         0777777, F4(sdshiftlh) },
	{ X(DSHIFTR),                                       0777777, F4(dshiftr) },
	{ X(DSHIFTR) | SIMD_FLAG,                           0777777, F4(sdshiftr) },
	{ X(DSHIFTR) | SIMD_FLAG | SHIFT_HALF_SIMD,         0777777, F4(sdshiftrh) },
	{ X(DSHIFTRA),                                      0777777, F4(dshiftra) },
	{ X(DSHIFTRA) | SIMD_FLAG,                          0777777, F4(sdshiftra) },
	{ X(DSHIFTRA) | SIMD_FLAG | SHIFT_HALF_SIMD,        0777777, F4(sdshiftrah) },
	{ X(EXPAND) | EXPAND_BOTH,                          0777777, F4(expand) },
	{ X(EXPAND) | EXPAND_LOW,                           0777777, F4(expandl) },
	{ X(EXPAND) | EXPAND_HIGH,                          0777777, F4(expandh) },
	{ X(FADD),                                          0777777, F2(fadd) },
	{ X(FADD) | SIMD_FLAG,                              0777777, F2(sfadd) },
	{ X(FADDSUB),                                       0777777, F2(faddsub) },
	{ X(FADDSUB) | SIMD_FLAG,                           0777777, F2(sfaddsub) },
	{ X(FDIV),                                          0777777, F2(fdiv) },
	{ X(FDIV) | SIMD_FLAG,                              0777777, F2(sfdiv) },
	{ X(FEXP),                                          0777777, F2(fexp) },
	{ X(FEXP) | SIMD_FLAG,                              0777777, F2(sfexp) },
	{ X(FLOG),                                          0777777, F2(flog) },
	{ X(FLOG) | SIMD_FLAG,                              0777777, F2(sflog) },
	{ X(FMAC),                                          0777777, F2(fmac) },
	{ X(FMAC) | SIMD_FLAG,                              0777777, F2(sfmac) },
	{ X(FMUL),                                          0777777, F2(fmul) },
	{ X(FMUL) | SIMD_FLAG,                              0777777, F2(sfmul) },
	{ X(FSUB),                                          0777777, F2(fsub) },
	{ X(FSUB) | SIMD_FLAG,                              0777777, F2(sfsub) },
	{ X(JMP) | CC_ZERO,                                 0777777, F1(jmpz) },
	{ X(JMP) | CC_NAN,                                  0777777, F1(jmpn) },
	{ X(JMP) | CC_MSB,                                  0777777, F1(jmpm) },
	{ X(JMP) | CC_LSB,                                  0777777, F1(jmpl) },
	{ X(JMP) | CC_NOT_ZERO,                             0777777, F1(jmpnz) },
	{ X(JMP) | CC_NOT_NAN,                              0777777, F1(jmpnn) },
	{ X(JMP) | CC_NOT_MSB,                              0777777, F1(jmpnm) },
	{ X(JMP) | CC_NOT_LSB,                              0777777, F1(jmpnl) },
	{ X(LOADM),                                         0777777, F1(loadm) },
	{ X(MAC),                                           0777777, F4(macl) },
	{ X(MAC) | MAC_SIGNED,                              0777777, F4(macls) },
	{ X(MAC) | MAC_HIGH,                                0777777, F4(mach) },
	{ X(MAC) | MAC_HIGH | MAC_SIGNED,                   0777777, F4(machs) },
	{ X(MAC) | SIMD_FLAG,                               0777777, F4(smacl) },
	{ X(MAC) | SIMD_FLAG | MAC_SIGNED,                  0777777, F4(smacls) },
	{ X(MAC) | SIMD_FLAG | MAC_HIGH,                    0777777, F4(smach) },
	{ X(MAC) | SIMD_FLAG | MAC_HIGH | MAC_SIGNED,       0777777, F4(smachs) },
	{ X(MAX),                                           0777777, F4(max) },
	{ X(MAX) | CMP_SIGNED,                              0777777, F4(maxs) },
	{ X(MAX) | SIMD_FLAG,                               0777777, F4(smax) },
	{ X(MAX) | SIMD_FLAG | CMP_SIGNED,                  0777777, F4(smaxs) },
	{ X(MIN),                                           0777777, F4(min) },
	{ X(MIN) | CMP_SIGNED,                              0777777, F4(mins) },
	{ X(MIN) | SIMD_FLAG,                               0777777, F4(smin) },
	{ X(MIN) | SIMD_FLAG | CMP_SIGNED,                  0777777, F4(smins) },
	{ X(MINMAX),                                        0777777, F4(minmax) },
	{ X(MINMAX) | CMP_SIGNED,                           0777777, F4(minmaxs) },
	{ X(MINMAX) | SIMD_FLAG,                            0777777, F4(sminmax) },
	{ X(MINMAX) | SIMD_FLAG | CMP_SIGNED,               0777777, F4(sminmaxs) },
	{ X(MIX) | MIX_BOTH,                                0777777, F4(mix) },
	{ X(MIX) | MIX_LOW,                                 0777777, F4(mixl) },
	{ X(MIX) | MIX_HIGH,                                0777777, F4(mixh) },
	{ X(MOVE) | CC_ZERO,                                0777777, F4(movez) },
	{ X(MOVE) | CC_NAN,                                 0777777, F4(moven) },
	{ X(MOVE) | CC_MSB,                                 0777777, F4(movem) },
	{ X(MOVE) | CC_LSB,                                 0777777, F4(movel) },
	{ X(MOVE) | CC_NOT_ZERO,                            0777777, F4(movenz) },
	{ X(MOVE) | CC_NOT_NAN,                             0777777, F4(movenn) },
	{ X(MOVE) | CC_NOT_MSB,                             0777777, F4(movenm) },
	{ X(MOVE) | CC_NOT_LSB,                             0777777, F4(movenl) },
	{ X(MUL),                                           0777777, F4(mul) },
	{ X(MUL) | MUL_SIGNED,                              0777777, F4(mul) },
	{ X(MUL) | MUL_HIGH,                                0777777, F4(mulh) },
	{ X(MUL) | MUL_HIGH | MUL_SIGNED,                   0777777, F4(mulhs) },
	{ X(MUL) | SIMD_FLAG,                               0777777, F4(smul) },
	{ X(MUL) | SIMD_FLAG | MUL_SIGNED,                  0777777, F4(smul) },
	{ X(MUL) | SIMD_FLAG | MUL_HIGH,                    0777777, F4(smulh) },
	{ X(MUL) | SIMD_FLAG | MUL_HIGH | MUL_SIGNED,       0777777, F4(smulhs) },
	{ X(NAND) | ROP2_MODE_DIRECT,                       0777777, F4(nand) },
	{ X(NAND) | ROP2_MODE_AND,                          0777777, F4(nand_and) },
	{ X(NAND) | ROP2_MODE_OR,                           0777777, F4(nand_or) },
	{ X(NAND) | SIMD_FLAG | ROP2_MODE_DIRECT,           0777777, F4(snand) },
	{ X(NAND) | SIMD_FLAG | ROP2_MODE_AND,              0777777, F4(snand_and) },
	{ X(NAND) | SIMD_FLAG | ROP2_MODE_OR,               0777777, F4(snand_or) },
	{ X(NOR) | ROP2_MODE_DIRECT,                        0777777, F4(nor) },
	{ X(NOR) | ROP2_MODE_AND,                           0777777, F4(nor_and) },
	{ X(NOR) | ROP2_MODE_OR,                            0777777, F4(nor_or) },
	{ X(NOR) | SIMD_FLAG | ROP2_MODE_DIRECT,            0777777, F4(snor) },
	{ X(NOR) | SIMD_FLAG | ROP2_MODE_AND,               0777777, F4(snor_and) },
	{ X(NOR) | SIMD_FLAG | ROP2_MODE_OR,                0777777, F4(snor_or) },
	{ X(OR) | ROP2_MODE_DIRECT,                         0777777, F4(or) },
	{ X(OR) | ROP2_MODE_AND,                            0777777, F4(or_and) },
	{ X(OR) | ROP2_MODE_OR,                             0777777, F4(or_or) },
	{ X(OR) | SIMD_FLAG | ROP2_MODE_DIRECT,             0777777, F4(sor) },
	{ X(OR) | SIMD_FLAG | ROP2_MODE_AND,                0777777, F4(sor_and) },
	{ X(OR) | SIMD_FLAG | ROP2_MODE_OR,                 0777777, F4(sor_or) },
	{ X(ORN) | ROP2_MODE_DIRECT,                        0777777, F4(orn) },
	{ X(ORN) | ROP2_MODE_AND,                           0777777, F4(orn_and) },
	{ X(ORN) | ROP2_MODE_OR,                            0777777, F4(orn_or) },
	{ X(ORN) | SIMD_FLAG | ROP2_MODE_DIRECT,            0777777, F4(sorn) },
	{ X(ORN) | SIMD_FLAG | ROP2_MODE_AND,               0777777, F4(sorn_and) },
	{ X(ORN) | SIMD_FLAG | ROP2_MODE_OR,                0777777, F4(sorn_or) },
	{ X(POPC),                                          0777777, F4(popc) },
	{ X(POPC) | SIMD_FLAG,                              0777777, F4(spopc) },
	{ X(REM),                                           0777777, F4(rem) },
	{ X(REM) | DIV_SIGNED,                              0777777, F4(rems) },
	{ X(REM) | SIMD_FLAG,                               0777777, F4(srem) },
	{ X(REM) | SIMD_FLAG | DIV_SIGNED,                  0777777, F4(srems) },
	{ X(ROP2) | ROP2_MODE_MUX,                          0777777, F4(mux) },
	{ X(ROP2) | SIMD_FLAG | ROP2_MODE_MUX,              0777777, F4(smux) },
	{ X(ROTL),                                          0777777, F4(rotl) },
	{ X(ROTL) | SIMD_FLAG,                              0777777, F4(srotl) },
	{ X(ROTL) | SIMD_FLAG | SHIFT_HALF_SIMD,            0777777, F4(srotlh) },
	{ X(ROTR),                                          0777777, F4(rotr) },
	{ X(ROTR) | SIMD_FLAG,                              0777777, F4(srotr) },
	{ X(ROTR) | SIMD_FLAG | SHIFT_HALF_SIMD,            0777777, F4(srotrh) },
	{ X(SHIFTL),                                        0777777, F4(shiftl) },
	{ X(SHIFTL) | SIMD_FLAG,                            0777777, F4(sshiftl) },
	{ X(SHIFTL) | SIMD_FLAG | SHIFT_HALF_SIMD,          0777777, F4(sshiftlh) },
	{ X(SHIFTR),                                        0777777, F4(shiftr) },
	{ X(SHIFTR) | SIMD_FLAG,                            0777777, F4(sshiftr) },
	{ X(SHIFTR) | SIMD_FLAG | SHIFT_HALF_SIMD,          0777777, F4(sshiftrh) },
	{ X(SHIFTRA),                                       0777777, F4(shiftra) },
	{ X(SHIFTRA) | SIMD_FLAG,                           0777777, F4(sshiftra) },
	{ X(SHIFTRA) | SIMD_FLAG | SHIFT_HALF_SIMD,         0777777, F4(sshiftrah) },
	{ X(STOREM),                                        0777777, F1(storem) },
	{ X(SUB) | SIMD_FLAG | SUB_MODE_NORMAL,             0777777, F4(ssub) },
	{ X(SUB) | SIMD_FLAG | SUB_MODE_BORROW,             0777777, F4(ssubb) },
	{ X(SUB) | SIMD_FLAG | SUB_MODE_FLOOR,              0777777, F4(ssubf) },
	{ X(SUB) | SUB_MODE_NORMAL,                         0777777, F4(sub) },
	{ X(SUB) | SUB_MODE_BORROW,                         0777777, F4(subb) },
	{ X(SUB) | SUB_MODE_FLOOR,                          0777777, F4(subf) },
	{ X(VSEL),                                          0777777, F4(vsel) },
	{ X(VSEL) | SIMD_FLAG,                              0777777, F4(svsel) },
	{ X(VSEL) | SIMD_FLAG | SHIFT_HALF_SIMD,            0777777, F4(svselh) },
	{ X(XNOR) | ROP2_MODE_DIRECT,                       0777777, F4(xnor) },
	{ X(XNOR) | ROP2_MODE_AND,                          0777777, F4(xnor_and) },
	{ X(XNOR) | ROP2_MODE_OR,                           0777777, F4(xnor_or) },
	{ X(XNOR) | SIMD_FLAG | ROP2_MODE_DIRECT,           0777777, F4(sxnor) },
	{ X(XNOR) | SIMD_FLAG | ROP2_MODE_AND,              0777777, F4(sxnor_and) },
	{ X(XNOR) | SIMD_FLAG | ROP2_MODE_OR,               0777777, F4(sxnor_or) },
	{ X(XOR) | ROP2_MODE_DIRECT,                        0777777, F4(xor) },
	{ X(XOR) | ROP2_MODE_AND,                           0777777, F4(xor_and) },
	{ X(XOR) | ROP2_MODE_OR,                            0777777, F4(xor_or) },
	{ X(XOR) | SIMD_FLAG | ROP2_MODE_DIRECT,            0777777, F4(sxor) },
	{ X(XOR) | SIMD_FLAG | ROP2_MODE_AND,               0777777, F4(sxor_and) },
	{ X(XOR) | SIMD_FLAG | ROP2_MODE_OR,                0777777, F4(sxor_or) },
	/* r2, r1 */
	{ X(ABS),                                           07777, F4(abs) },
	{ X(ABS) | SIMD_FLAG,                               07777, F4(sabs) },
	{ X(BYTEREV),                                       07777, F4(byterev) },
	{ X(BYTEREV) | SIMD_FLAG,                           07777, F4(sbyterev) },
	{ X(D2INT) | ROUND_NEAREST,                         07777, F4(d2intr) },
	{ X(D2INT) | ROUND_TRUNC,                           07777, F4(d2intt) },
	{ X(D2INT) | ROUND_FLOOR,                           07777, F4(d2intf) },
	{ X(D2INT) | ROUND_CEIL,                            07777, F4(d2intc) },
	{ X(DEC),                                           07777, F4(dec) },
	{ X(DEC) | SIMD_FLAG,                               07777, F4(sdec) },
	{ X(F2INT) | ROUND_NEAREST,                         07777, F4(f2intr) },
	{ X(F2INT) | ROUND_TRUNC,                           07777, F4(f2intt) },
	{ X(F2INT) | ROUND_FLOOR,                           07777, F4(f2intf) },
	{ X(F2INT) | ROUND_CEIL,                            07777, F4(f2intc) },
	{ X(FIAPRX),                                        07777, F2(fiaprx) },
	{ X(FIAPRX) | SIMD_FLAG,                            07777, F2(sfiaprx) },
	{ X(FSQRT),                                         07777, F2(fsqrt) },
	{ X(FSQRT) | SIMD_FLAG,                             07777, F2(sfsqrt) },
	{ X(FSQRTIAPRX),                                    07777, F2(fsqrtiaprx) },
	{ X(FSQRTIAPRX) | SIMD_FLAG,                        07777, F2(sfsqrtiaprx) },
	{ X(GET),                                           07777, F1(get) },
	{ X(INC),                                           07777, F4(inc) },
	{ X(INC) | SIMD_FLAG,                               07777, F4(sinc) },
	{ X(INT2D) | ROUND_NEAREST,                         07777, F4(int2dr) },
	{ X(INT2D) | ROUND_TRUNC,                           07777, F4(int2dt) },
	{ X(INT2D) | ROUND_FLOOR,                           07777, F4(int2df) },
	{ X(INT2D) | ROUND_CEIL,                            07777, F4(int2dc) },
	{ X(INT2F) | ROUND_NEAREST,                         07777, F4(int2fr) },
	{ X(INT2F) | ROUND_TRUNC,                           07777, F4(int2ft) },
	{ X(INT2F) | ROUND_FLOOR,                           07777, F4(int2ff) },
	{ X(INT2F) | ROUND_CEIL,                            07777, F4(int2fc) },
	{ X(LOADADDR),                                      07777, F1(loadaddr) },
	{ X(LOADADDR) | LOADADDR_DATA,                      07777, F1(loadaddrd) },
	{ X(LOOP),                                          07777, F1(loop) },
	{ X(NABS),                                          07777, F4(nabs) },
	{ X(NABS) | SIMD_FLAG,                              07777, F4(snabs) },
	{ X(NEG),                                           07777, F4(neg) },
	{ X(NEG) | SIMD_FLAG,                               07777, F4(sneg) },
	{ X(PUT),                                           07777, F1(put) },
	{ X(SCAN),                                          07777, F4(lsb1) },
	{ X(SCAN) | SCAN_NEGATE,                            07777, F4(lsb0) },
	{ X(SCAN) | SCAN_REVERSE,                           07777, F4(msb1) },
	{ X(SCAN) | SCAN_REVERSE | SCAN_NEGATE,             07777, F4(msb0) },
	{ X(SCAN) | SIMD_FLAG,                              07777, F4(slsb1) },
	{ X(SCAN) | SIMD_FLAG | SCAN_NEGATE,                07777, F4(slsb0) },
	{ X(SCAN) | SIMD_FLAG | SCAN_REVERSE,               07777, F4(smsb1) },
	{ X(SCAN) | SIMD_FLAG | SCAN_REVERSE | SCAN_NEGATE, 07777, F4(smsb0) },
	{ X(WIDEN),                                         07777, F4(widen) },
	{ X(CLOAD) | CC_ZERO,                               07777, F4(cloadz) },
	{ X(CLOAD) | CC_NAN,                                07777, F4(cloadn) },
	{ X(CLOAD) | CC_MSB,                                07777, F4(cloadm) },
	{ X(CLOAD) | CC_LSB,                                07777, F4(cloadl) },
	{ X(CLOAD) | CC_NOT_ZERO,                           07777, F4(cloadnz) },
	{ X(CLOAD) | CC_NOT_NAN,                            07777, F4(cloadnn) },
	{ X(CLOAD) | CC_NOT_MSB,                            07777, F4(cloadnm) },
	{ X(CLOAD) | CC_NOT_LSB,                            07777, F4(cloadnl) },
	{ X(CSTORE) | CC_ZERO,                              07777, F4(cstorez) },
	{ X(CSTORE) | CC_NAN,                               07777, F4(cstoren) },
	{ X(CSTORE) | CC_MSB,                               07777, F4(cstorem) },
	{ X(CSTORE) | CC_LSB,                               07777, F4(cstorel) },
	{ X(CSTORE) | CC_NOT_ZERO,                          07777, F4(cstorenz) },
	{ X(CSTORE) | CC_NOT_NAN,                           07777, F4(cstorenn) },
	{ X(CSTORE) | CC_NOT_MSB,                           07777, F4(cstorenm) },
	{ X(CSTORE) | CC_NOT_LSB,                           07777, F4(cstorenl) },
	/* no args at all */
	{ X(HALT),                                          0, F1(halt) },
	{ X(RFE),                                           0, F1(rfe) },
	{ X(SRB_SAVE),                                      0, F1(srb_save) },
	{ X(SRB_RESTORE),                                   0, F1(srb_restore) },
};

#undef X
#undef F1
#undef F2
#undef F4

/*
 * Per-opcode index into the sorted insn_table, filled in by sort_table():
 * the entries whose top byte (code >> 24) equals `op' occupy positions
 * insn_index[op] .. insn_index[op + 1] - 1.  Element 256 holds the total
 * number of table entries.
 */
static unsigned short insn_index[257];

/*
 * Prepare the dispatch table for lookups: sort insn_table by code and
 * record in insn_index where the entries of each opcode byte start.
 * The work is done on the first call only; later calls return at once.
 */
static void
sort_table(void) {
	static int table_sorted = 0;
	const unsigned count = sizeof(insn_table)/sizeof(*insn_table);
	unsigned op, pos;

	if (!table_sorted) {
		qsort(insn_table, count, sizeof(*insn_table),
			(int (*)(const void*, const void*))insncmp);
		/* one pass over the sorted table, noting where each opcode begins */
		pos = 0;
		for (op = 0; op < 256; op++) {
			insn_index[op] = pos;
			while (pos < count && (insn_table[pos].code >> 24) == op) {
				pos++;
			}
		}
		/* sentinel: end of the last opcode's range */
		insn_index[256] = count;
		table_sorted = 1;
	}
}

/*
 * Initialise the emulator: make sure the dispatch table is sorted and
 * indexed, zero the whole register set, and reset the exception state
 * to EX_NONE.
 */
void
initemu(void) {
	/* dispatch table first; a no-op after the first call */
	sort_table();
	/* all registers start out as zero */
	memset(&regs, 0, sizeof regs);
	/* no exception pending */
	ex(EX_NONE);
}

/*
 * Decode and execute a single instruction word.
 *
 * The cycle counter (SR_CYCLE) is advanced, then the table entries for
 * the opcode's top byte are binary-searched for one whose range
 * code .. (code | mask) contains the opcode.  For entries with more
 * than one handler the size bits (22..23) are ignored during the search
 * and afterwards translated through SR_SIZE_n to pick the handler.
 *
 * After the handler has run, any of the registers R1, R2, R3 whose
 * chunk 0 changed in the bits covered by LSU_HI_ADDR_MASK has its
 * register/line mapping broken.
 *
 * Returns 0 once a handler has been called, or 1 after raising
 * EX_INVALID when no handler exists for the opcode.
 */
int
emulate1(U32 opcode) {
	const struct insn *entry;
	void (*handler)(U32);
	unsigned lo, hi, mid;
	U32 key;
	/* register contents before execution, for the comparison below */
	const U64 old_r1 = r(R1).C(o,0);
	const U64 old_r2 = r(R2).C(o,0);
	const U64 old_r3 = r(R3).C(o,0);

	sregs[SR_CYCLE].u.v += 1;
	mid = opcode >> 24;
	if (mid > 255) abort();
	/* search only among the entries that share this opcode byte */
	lo = insn_index[mid];
	hi = insn_index[mid + 1];
	while (lo < hi) {
		mid = lo + ((hi - lo) >> 1);
		entry = &insn_table[mid];
		/* ignore size bits if there's more than one function */
		key = opcode;
		if (entry->func[1]) {
			key &= ~(3u << 22);
		}
		if (key < entry->code) {
			hi = mid;
			continue;
		}
		if (key > (entry->code | entry->mask)) {
			lo = mid + 1;
			continue;
		}
		/* the opcode falls into this entry's range */
		if (entry->func[1]) {
			/* instruction with size flags: translate the size bits */
			key = (opcode >> 22) & 3u;
			key = sregs[SR_SIZE_0 + key].u.v;
			if (key > LOG_MAXSIZE) abort();
			handler = entry->func[key];
			if (!handler) {
				/* no handler for this size */
				break;
			}
		}
		else {
			/* instruction without size flags */
			handler = entry->func[0];
			if (!handler) abort();
		}
		/* execute instruction */
		handler(opcode);
		if ((r(R1).C(o,0) ^ old_r1) & LSU_HI_ADDR_MASK) {
			lsu_break_reg_line_mapping(R1);
		}
		if ((r(R2).C(o,0) ^ old_r2) & LSU_HI_ADDR_MASK) {
			lsu_break_reg_line_mapping(R2);
		}
		if ((r(R3).C(o,0) ^ old_r3) & LSU_HI_ADDR_MASK) {
			lsu_break_reg_line_mapping(R3);
		}
		return 0;
	}
	ex(EX_INVALID);
	return 1;
}
