revision 903:1337c7a7dd6b
summary |
tree |
shortlog |
changelog |
graph |
changeset |
raw | bz2 | zip | gz changeset | 903:1337c7a7dd6b |
parent | 902:408568dc97d5 |
child | 904:5b92e51ac06b |
author | nkeynes |
date | Wed Oct 29 23:32:28 2008 +0000 (15 years ago) |
Add SSE3 versions of FIPR and FTRV - the latter is about a 4.5% improvement
src/sh4/sh4.c | view | annotate | diff | log | ||
src/sh4/sh4.h | view | annotate | diff | log | ||
src/sh4/sh4x86.in | view | annotate | diff | log | ||
src/sh4/x86op.h | view | annotate | diff | log |
1.1 --- a/src/sh4/sh4.c Mon Oct 27 07:26:49 2008 +0000
1.2 +++ b/src/sh4/sh4.c Wed Oct 29 23:32:28 2008 +0000
1.3 @@ -49,7 +49,7 @@
1.4 sh4_start, sh4_run_slice, sh4_stop,
1.5 sh4_save_state, sh4_load_state };
1.7 -struct sh4_registers sh4r;
1.8 +struct sh4_registers sh4r __attribute__((aligned(16)));
1.9 struct breakpoint_struct sh4_breakpoints[MAX_BREAKPOINTS];
1.10 int sh4_breakpoint_count = 0;
1.11 sh4ptr_t sh4_main_ram;
2.1 --- a/src/sh4/sh4.h Mon Oct 27 07:26:49 2008 +0000
2.2 +++ b/src/sh4/sh4.h Wed Oct 29 23:32:28 2008 +0000
2.3 @@ -64,14 +64,15 @@
2.4 */
2.5 struct sh4_registers {
2.6 uint32_t r[16];
2.7 - uint32_t sr, pr, pc, fpscr;
2.8 - uint32_t t, m, q, s; /* really boolean - 0 or 1 */
2.9 + uint32_t sr, pr, pc;
2.10 union {
2.11 int32_t i;
2.12 float f;
2.13 } fpul;
2.14 + uint32_t t, m, q, s; /* really boolean - 0 or 1 */
2.15 + float fr[2][16];
2.16 + uint32_t fpscr;
2.17 uint32_t pad; /* Pad up to 64-bit boundaries */
2.18 - float fr[2][16];
2.19 uint64_t mac;
2.20 uint32_t gbr, ssr, spc, sgr, dbr, vbr;
3.1 --- a/src/sh4/sh4x86.in Mon Oct 27 07:26:49 2008 +0000
3.2 +++ b/src/sh4/sh4x86.in Wed Oct 29 23:32:28 2008 +0000
3.3 @@ -56,7 +56,8 @@
3.4 gboolean fpuen_checked; /* true if we've already checked fpu enabled. */
3.5 gboolean branch_taken; /* true if we branched unconditionally */
3.6 gboolean double_prec; /* true if FPU is in double-precision mode */
3.7 - gboolean double_size; /* true if FPU is in double-size mode */
3.8 + gboolean double_size; /* true if FPU is in double-size mode */
3.9 + gboolean sse3_enabled; /* true if host supports SSE3 instructions */
3.10 uint32_t block_start_pc;
3.11 uint32_t stack_posn; /* Trace stack height for alignment purposes */
3.12 int tstate;
3.13 @@ -103,10 +104,25 @@
3.14 static uint32_t save_fcw; /* save value for fpu control word */
3.15 static uint32_t trunc_fcw = 0x0F7F; /* fcw value for truncation mode */
3.17 +gboolean is_sse3_supported()
3.18 +{
3.19 + uint32_t features;
3.20 +
3.21 + // Note: Include the push/pop ebx sequence in case of PIC builds. This
3.22 + // isn't exactly on a critical path anyway
3.23 + __asm__ __volatile__(
3.24 + "pushl %%ebx\n\t"
3.25 + "mov $0x01, %%eax\n\t"
3.26 + "cpuid\n\t"
3.27 + "popl %%ebx" : "=c" (features) : : "eax", "edx");
3.28 + return (features & 1) ? TRUE : FALSE;
3.29 +}
3.30 +
3.31 void sh4_translate_init(void)
3.32 {
3.33 sh4_x86.backpatch_list = malloc(DEFAULT_BACKPATCH_SIZE);
3.34 sh4_x86.backpatch_size = DEFAULT_BACKPATCH_SIZE / sizeof(struct backpatch_record);
3.35 + sh4_x86.sse3_enabled = is_sse3_supported();
3.36 }
3.39 @@ -325,7 +341,7 @@
3.40 sh4_x86.tlb_on = IS_MMU_ENABLED();
3.41 sh4_x86.tstate = TSTATE_NONE;
3.42 sh4_x86.double_prec = sh4r.fpscr & FPSCR_PR;
3.43 - sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
3.44 + sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
3.45 }
3.48 @@ -2222,30 +2238,62 @@
3.49 COUNT_INST(I_FIPR);
3.50 check_fpuen();
3.51 if( sh4_x86.double_prec == 0 ) {
3.52 - push_fr( FVm<<2 );
3.53 - push_fr( FVn<<2 );
3.54 - FMULP_st(1);
3.55 - push_fr( (FVm<<2)+1);
3.56 - push_fr( (FVn<<2)+1);
3.57 - FMULP_st(1);
3.58 - FADDP_st(1);
3.59 - push_fr( (FVm<<2)+2);
3.60 - push_fr( (FVn<<2)+2);
3.61 - FMULP_st(1);
3.62 - FADDP_st(1);
3.63 - push_fr( (FVm<<2)+3);
3.64 - push_fr( (FVn<<2)+3);
3.65 - FMULP_st(1);
3.66 - FADDP_st(1);
3.67 - pop_fr( (FVn<<2)+3);
3.68 +/* if( sh4_x86.sse3_enabled ) {
3.69 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[0][FVm<<2]), 4 );
3.70 + MULPS_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 );
3.71 + HADDPS_xmm_xmm( 4, 4 );
3.72 + HADDPS_xmm_xmm( 4, 4 );
3.73 + MOVSS_xmm_sh4r( 4, REG_OFFSET(fr[0][(FVn<<2)+2]) );
3.74 + } else {
3.75 +*/ push_fr( FVm<<2 );
3.76 + push_fr( FVn<<2 );
3.77 + FMULP_st(1);
3.78 + push_fr( (FVm<<2)+1);
3.79 + push_fr( (FVn<<2)+1);
3.80 + FMULP_st(1);
3.81 + FADDP_st(1);
3.82 + push_fr( (FVm<<2)+2);
3.83 + push_fr( (FVn<<2)+2);
3.84 + FMULP_st(1);
3.85 + FADDP_st(1);
3.86 + push_fr( (FVm<<2)+3);
3.87 + push_fr( (FVn<<2)+3);
3.88 + FMULP_st(1);
3.89 + FADDP_st(1);
3.90 + pop_fr( (FVn<<2)+3);
3.91 +// }
3.92 }
3.93 :}
3.94 FTRV XMTRX, FVn {:
3.95 COUNT_INST(I_FTRV);
3.96 check_fpuen();
3.97 if( sh4_x86.double_prec == 0 ) {
3.98 - LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EDX );
3.99 - call_func1( sh4_ftrv, R_EDX );
3.100 + if( sh4_x86.sse3_enabled ) {
3.101 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][0]), 1 ); // M1 M0 M3 M2
3.102 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][4]), 0 ); // M5 M4 M7 M6
3.103 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][8]), 3 ); // M9 M8 M11 M10
3.104 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][12]), 2 );// M13 M12 M15 M14
3.105 +
3.106 + MOVSLDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 ); // V1 V1 V3 V3
3.107 + MOVSHDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 5 ); // V0 V0 V2 V2
3.108 + MOVAPS_xmm_xmm( 4, 6 );
3.109 + MOVAPS_xmm_xmm( 5, 7 );
3.110 + MOVLHPS_xmm_xmm( 4, 4 ); // V1 V1 V1 V1
3.111 + MOVHLPS_xmm_xmm( 6, 6 ); // V3 V3 V3 V3
3.112 + MOVLHPS_xmm_xmm( 5, 5 ); // V0 V0 V0 V0
3.113 + MOVHLPS_xmm_xmm( 7, 7 ); // V2 V2 V2 V2
3.114 + MULPS_xmm_xmm( 0, 4 );
3.115 + MULPS_xmm_xmm( 1, 5 );
3.116 + MULPS_xmm_xmm( 2, 6 );
3.117 + MULPS_xmm_xmm( 3, 7 );
3.118 + ADDPS_xmm_xmm( 5, 4 );
3.119 + ADDPS_xmm_xmm( 7, 6 );
3.120 + ADDPS_xmm_xmm( 6, 4 );
3.121 + MOVAPS_xmm_sh4r( 4, REG_OFFSET(fr[0][FVn<<2]) );
3.122 + } else {
3.123 + LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EAX );
3.124 + call_func1( sh4_ftrv, R_EAX );
3.125 + }
3.126 }
3.127 sh4_x86.tstate = TSTATE_NONE;
3.128 :}
4.1 --- a/src/sh4/x86op.h Mon Oct 27 07:26:49 2008 +0000
4.2 +++ b/src/sh4/x86op.h Wed Oct 29 23:32:28 2008 +0000
4.3 @@ -312,6 +312,24 @@
4.4 /* Pseudo-op Load carry from T: CMP [EBP+t], #01 ; CMC */
4.5 #define LDC_t() OP(0x83); MODRM_r32_sh4r(7,R_T); OP(0x01); CMC()
4.7 +/* SSE instructions */
4.8 +#define ADDPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x58); MODRM_rm32_r32(xmm1,xmm2)
4.9 +#define HADDPS_xmm_xmm(xmm1,xmm2) OP(0xF2); OP(0x0F); OP(0x7C); MODRM_rm32_r32(xmm1,xmm2)
4.10 +#define MOVHLPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x12); MODRM_rm32_r32(xmm1,xmm2)
4.11 +#define MOVLHPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x16); MODRM_rm32_r32(xmm1,xmm2)
4.12 +#define MOVSHDUP_sh4r_xmm(disp,xmm) OP(0xF3); OP(0x0F); OP(0x16); MODRM_r32_sh4r(xmm,disp)
4.13 +#define MOVSLDUP_sh4r_xmm(disp,xmm) OP(0xF3); OP(0x0F); OP(0x12); MODRM_r32_sh4r(xmm,disp)
4.14 +#define MOVAPS_sh4r_xmm(disp, xmm) OP(0x0F); OP(0x28); MODRM_r32_sh4r(xmm,disp)
4.15 +#define MOVAPS_xmm_sh4r(xmm,disp) OP(0x0F); OP(0x29); MODRM_r32_sh4r(xmm,disp)
4.16 +#define MOVAPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x28); MODRM_rm32_r32(xmm1,xmm2)
4.17 +#define MOVSS_xmm_sh4r(xmm,disp) OP(0xF3); OP(0x0F); OP(0x11); MODRM_r32_sh4r(xmm,disp)
4.18 +#define MULPS_sh4r_xmm(disp, xmm) OP(0x0F); OP(0x59); MODRM_r32_sh4r(xmm,disp)
4.19 +#define MULPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x59); MODRM_rm32_r32(xmm1,xmm2)
4.20 +#define SHUFPS_sh4r_xmm(disp,xmm,imm8) OP(0x0F); OP(0xC6); MODRM_r32_sh4r(xmm, disp); OP(imm8)
4.21 +#define SHUFPS_xmm_xmm(xmm1,xmm2,imm8) OP(0x0F); OP(0xC6); MODRM_rm32_r32(xmm1,xmm2); OP(imm8)
4.22 +#define UNPCKHPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x15); MODRM_rm32_r32(xmm1,xmm2)
4.23 +#define UNPCKLPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x14); MODRM_rm32_r32(xmm1,xmm2)
4.24 +
4.25 #ifdef __cplusplus
4.26 }
4.27 #endif
.