/*	ars-swar.h

	This header file is a minimal version of the GCC inline assembler
	macros used in the 3DNow! target support for the SWARC compiler, Scc.

	To use this library, simply include this header file and compile
	with GCC.  Of course, we assume that you "just know" that the
	processor you're running on supports MMX, 3DNow!, and the Athlon
	3DNow! enhancements....  ;-)

	Usage:  let's say you want to add the 64-bit vector of two 32-bit
	floats at x into register 7.  Unless you have declared x as
	"mmx_t x;" you need to cast x to the correct type.  Register 7 is
	called "mm7".  Thus:

		pfadd_m2r(*((mmx_t *) &x), mm7);

	I know that looks evil, but it works fine.  Unless x requires nasty
	addressing computations, the above will literally generate a single
	assembly instruction.

	You also should be aware that misaligned memory references work --
	they just take longer than aligned ones.  Thus, try to make your
	data structures 64-bit aligned.  However, often it is faster to
	reference misaligned data rather than making a copy to align it
	first.  (This is the biggest advantage of MMX/3DNow! over AltiVec,
	which REQUIRES alignment.)

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
	IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY PARTICULAR PURPOSE.

	June 11, 1998 by H. Dietz and R. Fisher
	April 1, 1999 by H. Dietz
	June 21, 2000 modified by H. Dietz and T. Mattox
*/

/*	The type of a value that fits in an MMX register (note that long
	long constant values MUST be suffixed by LL and unsigned long long
	values by ULL, lest they be truncated by the compiler)

	The typedef is guarded so that including this header more than once
	does not redefine mmx_t; the macros below may be re-defined safely
	because every repeated definition is token-for-token identical.
*/
#ifndef ARS_SWAR_MMX_T_DEFINED
#define ARS_SWAR_MMX_T_DEFINED
typedef	union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	float			f[2];	/* 2 Float (32-bit) values */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
} mmx_t;
#endif /* ARS_SWAR_MMX_T_DEFINED */

/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	as efficient as register-to-register, r2r, instructions)

	op    - instruction mnemonic (stringified into the asm template)
	mem   - an lvalue naming the memory operand
	reg   - MMX register name without the '%' prefix, e.g. mm7
	immed - literal immediate value (stringified, so it must be a
		constant token, not a run-time expression)

	Memory operands use the "m" constraint so that GCC always emits
	an addressable memory reference.  (The "X" constraint accepts any
	operand at all, which lets the compiler substitute a register or
	an immediate that these instructions cannot encode.)

	The r2r and i2r forms have no operand list, so their template is
	emitted literally and a single '%' prefixes the register name;
	the forms with operands need '%%' to produce a literal '%'.
*/
#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )

#define	mmx_i2r(op, immed, regd) \
	__asm__ __volatile__ (#op " $" #immed ", %" #regd)

#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

#define	mmx_m(op, mem) \
	__asm__ __volatile__ (#op " %0" \
			      : /* nothing */ \
			      : "m" (mem))


/*	Prefetch stuff... RTFM for use...
*/
#define	prefetch_m(var)		mmx_m(prefetch, var)
#define	prefetchw_m(var)	mmx_m(prefetchw, var)
#define	prefetchnta_m(var)	mmx_m(prefetchnta, var)
#define	prefetcht0_m(var)	mmx_m(prefetcht0, var)
#define	prefetcht1_m(var)	mmx_m(prefetcht1, var)
#define	prefetcht2_m(var)	mmx_m(prefetcht2, var)


/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)


/*	1x64 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)


/*	2x32, 4x16, and 8x8 Parallel ADDs
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(src, dst)	mmx_r2r(psubusw, src, dst)
/*	Remaining unsigned-saturating subtract forms (8x8)
*/
#define	psubusb_m2r(mem, dst)	mmx_m2r(psubusb, mem, dst)
#define	psubusb_r2r(src, dst)	mmx_r2r(psubusb, src, dst)


/*	4x16 parallel multiply, keeping the Low 16 bits of each product
*/
#define	pmullw_m2r(mem, dst)	mmx_m2r(pmullw, mem, dst)
#define	pmullw_r2r(src, dst)	mmx_r2r(pmullw, src, dst)


/*	4x16 parallel multiply, keeping the High 16 bits of each product
*/
#define	pmulhw_m2r(mem, dst)	mmx_m2r(pmulhw, mem, dst)
#define	pmulhw_r2r(src, dst)	mmx_r2r(pmulhw, src, dst)


/*	4x16->2x32 parallel multiply-add
	(multiplies the four 16-bit field pairs, then sums adjacent
	 products to form the final 2x32 result)
*/
#define	pmaddwd_m2r(mem, dst)	mmx_m2r(pmaddwd, mem, dst)
#define	pmaddwd_r2r(src, dst)	mmx_r2r(pmaddwd, src, dst)


/*	1x64 bitwise AND
*/
#define	pand_m2r(mem, dst)	mmx_m2r(pand, mem, dst)
#define	pand_r2r(src, dst)	mmx_r2r(pand, src, dst)


/*	1x64 bitwise AND with the destination inverted (NOT) first
*/
#define	pandn_m2r(mem, dst)	mmx_m2r(pandn, mem, dst)
#define	pandn_r2r(src, dst)	mmx_r2r(pandn, src, dst)


/*	1x64 bitwise OR
*/
#define	por_m2r(mem, dst)	mmx_m2r(por, mem, dst)
#define	por_r2r(src, dst)	mmx_r2r(por, src, dst)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(mem, dst)	mmx_m2r(pxor, mem, dst)
#define	pxor_r2r(src, dst)	mmx_r2r(pxor, src, dst)


/*	2x32, 4x16, and 8x8 parallel compare for EQuality
	(each result field is either 0 or -1)
*/
#define	pcmpeqd_m2r(mem, dst)	mmx_m2r(pcmpeqd, mem, dst)
#define	pcmpeqd_r2r(src, dst)	mmx_r2r(pcmpeqd, src, dst)
#define	pcmpeqw_m2r(mem, dst)	mmx_m2r(pcmpeqw, mem, dst)
#define	pcmpeqw_r2r(src, dst)	mmx_r2r(pcmpeqw, src, dst)
#define	pcmpeqb_m2r(mem, dst)	mmx_m2r(pcmpeqb, mem, dst)
#define	pcmpeqb_r2r(src, dst)	mmx_r2r(pcmpeqb, src, dst)


/*	2x32, 4x16, and 8x8 parallel compare for Greater Than
	(each result field is either 0 or -1)
*/
#define	pcmpgtd_m2r(mem, dst)	mmx_m2r(pcmpgtd, mem, dst)
#define	pcmpgtd_r2r(src, dst)	mmx_r2r(pcmpgtd, src, dst)
#define	pcmpgtw_m2r(mem, dst)	mmx_m2r(pcmpgtw, mem, dst)
#define	pcmpgtw_r2r(src, dst)	mmx_r2r(pcmpgtw, src, dst)
#define	pcmpgtb_m2r(mem, dst)	mmx_m2r(pcmpgtb, mem, dst)
#define	pcmpgtb_r2r(src, dst)	mmx_r2r(pcmpgtb, src, dst)


/*	1x64, 2x32, and 4x16 parallel Shift Left Logical
	(the shift count comes from memory, an immediate, or a register)
*/
#define	psllq_m2r(mem, dst)	mmx_m2r(psllq, mem, dst)
#define	psllq_i2r(imm, dst)	mmx_i2r(psllq, imm, dst)
#define	psllq_r2r(src, dst)	mmx_r2r(psllq, src, dst)
#define	pslld_m2r(mem, dst)	mmx_m2r(pslld, mem, dst)
#define	pslld_i2r(imm, dst)	mmx_i2r(pslld, imm, dst)
#define	pslld_r2r(src, dst)	mmx_r2r(pslld, src, dst)
#define	psllw_m2r(mem, dst)	mmx_m2r(psllw, mem, dst)
#define	psllw_i2r(imm, dst)	mmx_i2r(psllw, imm, dst)
#define	psllw_r2r(src, dst)	mmx_r2r(psllw, src, dst)


/*	1x64, 2x32, and 4x16 parallel Shift Right Logical
*/
#define	psrlq_m2r(mem, dst)	mmx_m2r(psrlq, mem, dst)
#define	psrlq_i2r(imm, dst)	mmx_i2r(psrlq, imm, dst)
#define	psrlq_r2r(src, dst)	mmx_r2r(psrlq, src, dst)
#define	psrld_m2r(mem, dst)	mmx_m2r(psrld, mem, dst)
#define	psrld_i2r(imm, dst)	mmx_i2r(psrld, imm, dst)
#define	psrld_r2r(src, dst)	mmx_r2r(psrld, src, dst)
#define	psrlw_m2r(mem, dst)	mmx_m2r(psrlw, mem, dst)
#define	psrlw_i2r(imm, dst)	mmx_i2r(psrlw, imm, dst)
#define	psrlw_r2r(src, dst)	mmx_r2r(psrlw, src, dst)


/*	2x32 and 4x16 parallel Shift Right Arithmetic
*/
#define	psrad_m2r(mem, dst)	mmx_m2r(psrad, mem, dst)
#define	psrad_i2r(imm, dst)	mmx_i2r(psrad, imm, dst)
#define	psrad_r2r(src, dst)	mmx_r2r(psrad, src, dst)
#define	psraw_m2r(mem, dst)	mmx_m2r(psraw, mem, dst)
#define	psraw_i2r(imm, dst)	mmx_i2r(psraw, imm, dst)
#define	psraw_r2r(src, dst)	mmx_r2r(psraw, src, dst)


/*	2x32->4x16 and 4x16->8x8 PACK with Signed Saturation
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(mem, dst)	mmx_m2r(packssdw, mem, dst)
#define	packssdw_r2r(src, dst)	mmx_r2r(packssdw, src, dst)
#define	packsswb_m2r(mem, dst)	mmx_m2r(packsswb, mem, dst)
#define	packsswb_r2r(src, dst)	mmx_r2r(packsswb, src, dst)


/*	4x16->8x8 PACK with Unsigned Saturation
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(mem, dst)	mmx_m2r(packuswb, mem, dst)
#define	packuswb_r2r(src, dst)	mmx_r2r(packuswb, src, dst)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(mem, dst)	mmx_m2r(punpckldq, mem, dst)
#define	punpckldq_r2r(src, dst)	mmx_r2r(punpckldq, src, dst)
#define	punpcklwd_m2r(mem, dst)	mmx_m2r(punpcklwd, mem, dst)
#define	punpcklwd_r2r(src, dst)	mmx_r2r(punpcklwd, src, dst)
#define	punpcklbw_m2r(mem, dst)	mmx_m2r(punpcklbw, mem, dst)
#define	punpcklbw_r2r(src, dst)	mmx_r2r(punpcklbw, src, dst)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(mem, dst)	mmx_m2r(punpckhdq, mem, dst)
#define	punpckhdq_r2r(src, dst)	mmx_r2r(punpckhdq, src, dst)
#define	punpckhwd_m2r(mem, dst)	mmx_m2r(punpckhwd, mem, dst)
#define	punpckhwd_r2r(src, dst)	mmx_r2r(punpckhwd, src, dst)
#define	punpckhbw_m2r(mem, dst)	mmx_m2r(punpckhbw, mem, dst)
#define	punpckhbw_r2r(src, dst)	mmx_r2r(punpckhbw, src, dst)


/*	Empty MMx State
	(used to clean up when going from mmx to float use
	 of the registers that are shared by both; note that
	 no float-to-mmx operation is needed, because
	 only the float tag word info is corruptible)
*/
#define	emms()			__asm__ __volatile__ ("emms")
#define	femms()			__asm__ __volatile__ ("femms")


/*	2x32 parallel Float-to-Int conversion
*/
#define	pf2id_m2r(mem, dst)	mmx_m2r(pf2id, mem, dst)
#define	pf2id_r2r(src, dst)	mmx_r2r(pf2id, src, dst)


/*	2x32 parallel Int-to-Float conversion
*/
#define	pi2fd_m2r(mem, dst)	mmx_m2r(pi2fd, mem, dst)
#define	pi2fd_r2r(src, dst)	mmx_r2r(pi2fd, src, dst)


/*	32-bit Float Reciprocal
	(register-to-register forms only)
*/
#define	pfrcp_r2r(src, dst)	mmx_r2r(pfrcp, src, dst)
#define	pfrcpit1_r2r(src, dst)	mmx_r2r(pfrcpit1, src, dst)
#define	pfrcpit2_r2r(src, dst)	mmx_r2r(pfrcpit2, src, dst)


/*	2x32 parallel Float Accumulate
*/
#define	pfacc_m2r(mem, dst)	mmx_m2r(pfacc, mem, dst)
#define	pfacc_r2r(src, dst)	mmx_r2r(pfacc, src, dst)


/*	2x32 parallel Float Add
*/
#define	pfadd_m2r(mem, dst)	mmx_m2r(pfadd, mem, dst)
#define	pfadd_r2r(src, dst)	mmx_r2r(pfadd, src, dst)


/*	2x32 parallel Float Compares (equal, greater-or-equal, greater)
*/
#define	pfcmpeq_m2r(mem, dst)	mmx_m2r(pfcmpeq, mem, dst)
#define	pfcmpeq_r2r(src, dst)	mmx_r2r(pfcmpeq, src, dst)
#define	pfcmpge_m2r(mem, dst)	mmx_m2r(pfcmpge, mem, dst)
#define	pfcmpge_r2r(src, dst)	mmx_r2r(pfcmpge, src, dst)
#define	pfcmpgt_m2r(mem, dst)	mmx_m2r(pfcmpgt, mem, dst)
#define	pfcmpgt_r2r(src, dst)	mmx_r2r(pfcmpgt, src, dst)


/*	2x32 parallel Float Multiply
*/
#define	pfmul_m2r(mem, dst)	mmx_m2r(pfmul, mem, dst)
#define	pfmul_r2r(src, dst)	mmx_r2r(pfmul, src, dst)


/*	2x32 parallel Float Subtract
*/
#define	pfsub_m2r(mem, dst)	mmx_m2r(pfsub, mem, dst)
#define	pfsub_r2r(src, dst)	mmx_r2r(pfsub, src, dst)


/*	2x32 parallel Float Min and Max
*/
#define	pfmin_m2r(mem, dst)	mmx_m2r(pfmin, mem, dst)
#define	pfmin_r2r(src, dst)	mmx_r2r(pfmin, src, dst)
#define	pfmax_m2r(mem, dst)	mmx_m2r(pfmax, mem, dst)
#define	pfmax_r2r(src, dst)	mmx_r2r(pfmax, src, dst)