Search
lxdream.org :: lxdream :: r903:1337c7a7dd6b
lxdream 0.9.1
released Jun 29
Download Now
changeset 903:1337c7a7dd6b
parent 902:408568dc97d5
child 904:5b92e51ac06b
author nkeynes
date Wed Oct 29 23:32:28 2008 +0000 (15 years ago)
Add SSE3 versions of FIPR and FTRV - the latter is about a 4.5% improvement
src/sh4/sh4.c
src/sh4/sh4.h
src/sh4/sh4x86.in
src/sh4/x86op.h
1.1 --- a/src/sh4/sh4.c Mon Oct 27 07:26:49 2008 +0000
1.2 +++ b/src/sh4/sh4.c Wed Oct 29 23:32:28 2008 +0000
1.3 @@ -49,7 +49,7 @@
1.4 sh4_start, sh4_run_slice, sh4_stop,
1.5 sh4_save_state, sh4_load_state };
1.6
1.7 -struct sh4_registers sh4r;
1.8 +struct sh4_registers sh4r __attribute__((aligned(16)));
1.9 struct breakpoint_struct sh4_breakpoints[MAX_BREAKPOINTS];
1.10 int sh4_breakpoint_count = 0;
1.11 sh4ptr_t sh4_main_ram;
2.1 --- a/src/sh4/sh4.h Mon Oct 27 07:26:49 2008 +0000
2.2 +++ b/src/sh4/sh4.h Wed Oct 29 23:32:28 2008 +0000
2.3 @@ -64,14 +64,15 @@
2.4 */
2.5 struct sh4_registers {
2.6 uint32_t r[16];
2.7 - uint32_t sr, pr, pc, fpscr;
2.8 - uint32_t t, m, q, s; /* really boolean - 0 or 1 */
2.9 + uint32_t sr, pr, pc;
2.10 union {
2.11 int32_t i;
2.12 float f;
2.13 } fpul;
2.14 + uint32_t t, m, q, s; /* really boolean - 0 or 1 */
2.15 + float fr[2][16];
2.16 + uint32_t fpscr;
2.17 uint32_t pad; /* Pad up to 64-bit boundaries */
2.18 - float fr[2][16];
2.19 uint64_t mac;
2.20 uint32_t gbr, ssr, spc, sgr, dbr, vbr;
2.21
3.1 --- a/src/sh4/sh4x86.in Mon Oct 27 07:26:49 2008 +0000
3.2 +++ b/src/sh4/sh4x86.in Wed Oct 29 23:32:28 2008 +0000
3.3 @@ -56,7 +56,8 @@
3.4 gboolean fpuen_checked; /* true if we've already checked fpu enabled. */
3.5 gboolean branch_taken; /* true if we branched unconditionally */
3.6 gboolean double_prec; /* true if FPU is in double-precision mode */
3.7 - gboolean double_size; /* true if FPU is in double-size mode */
3.8 + gboolean double_size; /* true if FPU is in double-size mode */
3.9 + gboolean sse3_enabled; /* true if host supports SSE3 instructions */
3.10 uint32_t block_start_pc;
3.11 uint32_t stack_posn; /* Trace stack height for alignment purposes */
3.12 int tstate;
3.13 @@ -103,10 +104,25 @@
3.14 static uint32_t save_fcw; /* save value for fpu control word */
3.15 static uint32_t trunc_fcw = 0x0F7F; /* fcw value for truncation mode */
3.16
3.17 +gboolean is_sse3_supported()
3.18 +{
3.19 + uint32_t features;
3.20 +
3.21 + // Note: Include the push/pop ebx sequence in case of PIC builds. This
3.22 + // isn't exactly on a critical path anyway
3.23 + __asm__ __volatile__(
3.24 + "pushl %%ebx\n\t"
3.25 + "mov $0x01, %%eax\n\t"
3.26 + "cpuid\n\t"
3.27 + "popl %%ebx" : "=c" (features) : : "eax", "edx");
3.28 + return (features & 1) ? TRUE : FALSE;
3.29 +}
3.30 +
3.31 void sh4_translate_init(void)
3.32 {
3.33 sh4_x86.backpatch_list = malloc(DEFAULT_BACKPATCH_SIZE);
3.34 sh4_x86.backpatch_size = DEFAULT_BACKPATCH_SIZE / sizeof(struct backpatch_record);
3.35 + sh4_x86.sse3_enabled = is_sse3_supported();
3.36 }
3.37
3.38
3.39 @@ -325,7 +341,7 @@
3.40 sh4_x86.tlb_on = IS_MMU_ENABLED();
3.41 sh4_x86.tstate = TSTATE_NONE;
3.42 sh4_x86.double_prec = sh4r.fpscr & FPSCR_PR;
3.43 - sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
3.44 + sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
3.45 }
3.46
3.47
3.48 @@ -2222,30 +2238,62 @@
3.49 COUNT_INST(I_FIPR);
3.50 check_fpuen();
3.51 if( sh4_x86.double_prec == 0 ) {
3.52 - push_fr( FVm<<2 );
3.53 - push_fr( FVn<<2 );
3.54 - FMULP_st(1);
3.55 - push_fr( (FVm<<2)+1);
3.56 - push_fr( (FVn<<2)+1);
3.57 - FMULP_st(1);
3.58 - FADDP_st(1);
3.59 - push_fr( (FVm<<2)+2);
3.60 - push_fr( (FVn<<2)+2);
3.61 - FMULP_st(1);
3.62 - FADDP_st(1);
3.63 - push_fr( (FVm<<2)+3);
3.64 - push_fr( (FVn<<2)+3);
3.65 - FMULP_st(1);
3.66 - FADDP_st(1);
3.67 - pop_fr( (FVn<<2)+3);
3.68 +/* if( sh4_x86.sse3_enabled ) {
3.69 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[0][FVm<<2]), 4 );
3.70 + MULPS_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 );
3.71 + HADDPS_xmm_xmm( 4, 4 );
3.72 + HADDPS_xmm_xmm( 4, 4 );
3.73 + MOVSS_xmm_sh4r( 4, REG_OFFSET(fr[0][(FVn<<2)+2]) );
3.74 + } else {
3.75 +*/ push_fr( FVm<<2 );
3.76 + push_fr( FVn<<2 );
3.77 + FMULP_st(1);
3.78 + push_fr( (FVm<<2)+1);
3.79 + push_fr( (FVn<<2)+1);
3.80 + FMULP_st(1);
3.81 + FADDP_st(1);
3.82 + push_fr( (FVm<<2)+2);
3.83 + push_fr( (FVn<<2)+2);
3.84 + FMULP_st(1);
3.85 + FADDP_st(1);
3.86 + push_fr( (FVm<<2)+3);
3.87 + push_fr( (FVn<<2)+3);
3.88 + FMULP_st(1);
3.89 + FADDP_st(1);
3.90 + pop_fr( (FVn<<2)+3);
3.91 +// }
3.92 }
3.93 :}
3.94 FTRV XMTRX, FVn {:
3.95 COUNT_INST(I_FTRV);
3.96 check_fpuen();
3.97 if( sh4_x86.double_prec == 0 ) {
3.98 - LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EDX );
3.99 - call_func1( sh4_ftrv, R_EDX );
3.100 + if( sh4_x86.sse3_enabled ) {
3.101 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][0]), 1 ); // M1 M0 M3 M2
3.102 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][4]), 0 ); // M5 M4 M7 M6
3.103 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][8]), 3 ); // M9 M8 M11 M10
3.104 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][12]), 2 );// M13 M12 M15 M14
3.105 +
3.106 + MOVSLDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 ); // V1 V1 V3 V3
3.107 + MOVSHDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 5 ); // V0 V0 V2 V2
3.108 + MOVAPS_xmm_xmm( 4, 6 );
3.109 + MOVAPS_xmm_xmm( 5, 7 );
3.110 + MOVLHPS_xmm_xmm( 4, 4 ); // V1 V1 V1 V1
3.111 + MOVHLPS_xmm_xmm( 6, 6 ); // V3 V3 V3 V3
3.112 + MOVLHPS_xmm_xmm( 5, 5 ); // V0 V0 V0 V0
3.113 + MOVHLPS_xmm_xmm( 7, 7 ); // V2 V2 V2 V2
3.114 + MULPS_xmm_xmm( 0, 4 );
3.115 + MULPS_xmm_xmm( 1, 5 );
3.116 + MULPS_xmm_xmm( 2, 6 );
3.117 + MULPS_xmm_xmm( 3, 7 );
3.118 + ADDPS_xmm_xmm( 5, 4 );
3.119 + ADDPS_xmm_xmm( 7, 6 );
3.120 + ADDPS_xmm_xmm( 6, 4 );
3.121 + MOVAPS_xmm_sh4r( 4, REG_OFFSET(fr[0][FVn<<2]) );
3.122 + } else {
3.123 + LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EAX );
3.124 + call_func1( sh4_ftrv, R_EAX );
3.125 + }
3.126 }
3.127 sh4_x86.tstate = TSTATE_NONE;
3.128 :}
4.1 --- a/src/sh4/x86op.h Mon Oct 27 07:26:49 2008 +0000
4.2 +++ b/src/sh4/x86op.h Wed Oct 29 23:32:28 2008 +0000
4.3 @@ -312,6 +312,24 @@
4.4 /* Pseudo-op Load carry from T: CMP [EBP+t], #01 ; CMC */
4.5 #define LDC_t() OP(0x83); MODRM_r32_sh4r(7,R_T); OP(0x01); CMC()
4.6
4.7 +/* SSE instructions */
4.8 +#define ADDPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x58); MODRM_rm32_r32(xmm1,xmm2)
4.9 +#define HADDPS_xmm_xmm(xmm1,xmm2) OP(0xF2); OP(0x0F); OP(0x7C); MODRM_rm32_r32(xmm1,xmm2)
4.10 +#define MOVHLPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x12); MODRM_rm32_r32(xmm1,xmm2)
4.11 +#define MOVLHPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x16); MODRM_rm32_r32(xmm1,xmm2)
4.12 +#define MOVSHDUP_sh4r_xmm(disp,xmm) OP(0xF3); OP(0x0F); OP(0x16); MODRM_r32_sh4r(xmm,disp)
4.13 +#define MOVSLDUP_sh4r_xmm(disp,xmm) OP(0xF3); OP(0x0F); OP(0x12); MODRM_r32_sh4r(xmm,disp)
4.14 +#define MOVAPS_sh4r_xmm(disp, xmm) OP(0x0F); OP(0x28); MODRM_r32_sh4r(xmm,disp)
4.15 +#define MOVAPS_xmm_sh4r(xmm,disp) OP(0x0F); OP(0x29); MODRM_r32_sh4r(xmm,disp)
4.16 +#define MOVAPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x28); MODRM_rm32_r32(xmm1,xmm2)
4.17 +#define MOVSS_xmm_sh4r(xmm,disp) OP(0xF3); OP(0x0F); OP(0x11); MODRM_r32_sh4r(xmm,disp)
4.18 +#define MULPS_sh4r_xmm(disp, xmm) OP(0x0F); OP(0x59); MODRM_r32_sh4r(xmm,disp)
4.19 +#define MULPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x59); MODRM_rm32_r32(xmm1,xmm2)
4.20 +#define SHUFPS_sh4r_xmm(disp,xmm,imm8) OP(0x0F); OP(0xC6); MODRM_r32_sh4r(xmm, disp); OP(imm8)
4.21 +#define SHUFPS_xmm_xmm(xmm1,xmm2,imm8) OP(0x0F); OP(0xC6); MODRM_rm32_r32(xmm1,xmm2); OP(imm8)
4.22 +#define UNPCKHPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x15); MODRM_rm32_r32(xmm1,xmm2)
4.23 +#define UNPCKLPS_xmm_xmm(xmm1,xmm2) OP(0x0F); OP(0x14); MODRM_rm32_r32(xmm1,xmm2)
4.24 +
4.25 #ifdef __cplusplus
4.26 }
4.27 #endif
.