Search
lxdream.org :: lxdream/src/sh4/sh4x86.in :: diff
lxdream 0.9.1
released Jun 29
Download Now
filename src/sh4/sh4x86.in
changeset 903:1337c7a7dd6b
prev 901:32c5cf5e206f
next 904:5b92e51ac06b
author nkeynes
date Wed Oct 29 23:32:28 2008 +0000 (12 years ago)
permissions -rw-r--r--
last change Add SSE3 versions of FIPR and FTRV - the latter is about a 4.5% improvement
file annotate diff log raw
1.1 --- a/src/sh4/sh4x86.in Sun Oct 26 02:28:29 2008 +0000
1.2 +++ b/src/sh4/sh4x86.in Wed Oct 29 23:32:28 2008 +0000
1.3 @@ -56,7 +56,8 @@
1.4 gboolean fpuen_checked; /* true if we've already checked fpu enabled. */
1.5 gboolean branch_taken; /* true if we branched unconditionally */
1.6 gboolean double_prec; /* true if FPU is in double-precision mode */
1.7 - gboolean double_size; /* true if FPU is in double-size mode */
1.8 + gboolean double_size; /* true if FPU is in double-size mode */
1.9 + gboolean sse3_enabled; /* true if host supports SSE3 instructions */
1.10 uint32_t block_start_pc;
1.11 uint32_t stack_posn; /* Trace stack height for alignment purposes */
1.12 int tstate;
1.13 @@ -103,10 +104,25 @@
1.14 static uint32_t save_fcw; /* save value for fpu control word */
1.15 static uint32_t trunc_fcw = 0x0F7F; /* fcw value for truncation mode */
1.16
1.17 +gboolean is_sse3_supported()
1.18 +{
1.19 + uint32_t features;
1.20 +
1.21 + // Note: cpuid overwrites ebx, which 32-bit PIC builds reserve for the GOT pointer, so
1.22 + // save and restore it by hand. The extra push/pop is harmless - this isn't on a critical path
1.23 + __asm__ __volatile__(
1.24 + "pushl %%ebx\n\t"
1.25 + "mov $0x01, %%eax\n\t"
1.26 + "cpuid\n\t"
1.27 + "popl %%ebx" : "=c" (features) : : "eax", "edx");
1.28 + return (features & 1) ? TRUE : FALSE;
1.29 +}
1.30 +
1.31 void sh4_translate_init(void)
1.32 {
1.33 sh4_x86.backpatch_list = malloc(DEFAULT_BACKPATCH_SIZE);
1.34 sh4_x86.backpatch_size = DEFAULT_BACKPATCH_SIZE / sizeof(struct backpatch_record);
1.35 + sh4_x86.sse3_enabled = is_sse3_supported();
1.36 }
1.37
1.38
1.39 @@ -325,7 +341,7 @@
1.40 sh4_x86.tlb_on = IS_MMU_ENABLED();
1.41 sh4_x86.tstate = TSTATE_NONE;
1.42 sh4_x86.double_prec = sh4r.fpscr & FPSCR_PR;
1.43 - sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
1.44 + sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
1.45 }
1.46
1.47
1.48 @@ -2222,30 +2238,62 @@
1.49 COUNT_INST(I_FIPR);
1.50 check_fpuen();
1.51 if( sh4_x86.double_prec == 0 ) {
1.52 - push_fr( FVm<<2 );
1.53 - push_fr( FVn<<2 );
1.54 - FMULP_st(1);
1.55 - push_fr( (FVm<<2)+1);
1.56 - push_fr( (FVn<<2)+1);
1.57 - FMULP_st(1);
1.58 - FADDP_st(1);
1.59 - push_fr( (FVm<<2)+2);
1.60 - push_fr( (FVn<<2)+2);
1.61 - FMULP_st(1);
1.62 - FADDP_st(1);
1.63 - push_fr( (FVm<<2)+3);
1.64 - push_fr( (FVn<<2)+3);
1.65 - FMULP_st(1);
1.66 - FADDP_st(1);
1.67 - pop_fr( (FVn<<2)+3);
1.68 +/* if( sh4_x86.sse3_enabled ) {
1.69 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[0][FVm<<2]), 4 );
1.70 + MULPS_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 );
1.71 + HADDPS_xmm_xmm( 4, 4 );
1.72 + HADDPS_xmm_xmm( 4, 4 );
1.73 + MOVSS_xmm_sh4r( 4, REG_OFFSET(fr[0][(FVn<<2)+2]) );
1.74 + } else {
1.75 +*/ push_fr( FVm<<2 );
1.76 + push_fr( FVn<<2 );
1.77 + FMULP_st(1);
1.78 + push_fr( (FVm<<2)+1);
1.79 + push_fr( (FVn<<2)+1);
1.80 + FMULP_st(1);
1.81 + FADDP_st(1);
1.82 + push_fr( (FVm<<2)+2);
1.83 + push_fr( (FVn<<2)+2);
1.84 + FMULP_st(1);
1.85 + FADDP_st(1);
1.86 + push_fr( (FVm<<2)+3);
1.87 + push_fr( (FVn<<2)+3);
1.88 + FMULP_st(1);
1.89 + FADDP_st(1);
1.90 + pop_fr( (FVn<<2)+3);
1.91 +// }
1.92 }
1.93 :}
1.94 FTRV XMTRX, FVn {:
1.95 COUNT_INST(I_FTRV);
1.96 check_fpuen();
1.97 if( sh4_x86.double_prec == 0 ) {
1.98 - LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EDX );
1.99 - call_func1( sh4_ftrv, R_EDX );
1.100 + if( sh4_x86.sse3_enabled ) {
1.101 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][0]), 1 ); // M1 M0 M3 M2
1.102 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][4]), 0 ); // M5 M4 M7 M6
1.103 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][8]), 3 ); // M9 M8 M11 M10
1.104 + MOVAPS_sh4r_xmm( REG_OFFSET(fr[1][12]), 2 );// M13 M12 M15 M14
1.105 +
1.106 + MOVSLDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 4 ); // V1 V1 V3 V3
1.107 + MOVSHDUP_sh4r_xmm( REG_OFFSET(fr[0][FVn<<2]), 5 ); // V0 V0 V2 V2
1.108 + MOVAPS_xmm_xmm( 4, 6 );
1.109 + MOVAPS_xmm_xmm( 5, 7 );
1.110 + MOVLHPS_xmm_xmm( 4, 4 ); // V1 V1 V1 V1
1.111 + MOVHLPS_xmm_xmm( 6, 6 ); // V3 V3 V3 V3
1.112 + MOVLHPS_xmm_xmm( 5, 5 ); // V0 V0 V0 V0
1.113 + MOVHLPS_xmm_xmm( 7, 7 ); // V2 V2 V2 V2
1.114 + MULPS_xmm_xmm( 0, 4 );
1.115 + MULPS_xmm_xmm( 1, 5 );
1.116 + MULPS_xmm_xmm( 2, 6 );
1.117 + MULPS_xmm_xmm( 3, 7 );
1.118 + ADDPS_xmm_xmm( 5, 4 );
1.119 + ADDPS_xmm_xmm( 7, 6 );
1.120 + ADDPS_xmm_xmm( 6, 4 );
1.121 + MOVAPS_xmm_sh4r( 4, REG_OFFSET(fr[0][FVn<<2]) );
1.122 + } else {
1.123 + LEA_sh4r_rptr( REG_OFFSET(fr[0][FVn<<2]), R_EAX );
1.124 + call_func1( sh4_ftrv, R_EAX );
1.125 + }
1.126 }
1.127 sh4_x86.tstate = TSTATE_NONE;
1.128 :}
.