Search
lxdream.org :: lxdream/src/sh4/sh4x86.in :: diff
lxdream 0.9.1
released Jun 29
Download Now
filename src/sh4/sh4x86.in
changeset 1112:4cac5e474d4c
prev 1092:7c4ffe27e7b5
next 1120:7c40a0f687b3
author nkeynes
date Tue Jul 13 18:23:16 2010 +1000 (10 years ago)
permissions -rw-r--r--
last change Rearrange the main translation loop to allow translated blocks to jump
directly to their successors without needing to return to the main loop
in between. Shaves about 6% off the core runtime.
file annotate diff log raw
1.1 --- a/src/sh4/sh4x86.in Sun Dec 20 21:01:03 2009 +1000
1.2 +++ b/src/sh4/sh4x86.in Tue Jul 13 18:23:16 2010 +1000
1.3 @@ -71,6 +71,8 @@
1.4 #define DELAY_PC 1
1.5 #define DELAY_PC_PR 2
1.6
1.7 +#define SH4_MODE_UNKNOWN -1
1.8 +
1.9 struct backpatch_record {
1.10 uint32_t fixup_offset;
1.11 uint32_t fixup_icount;
1.12 @@ -84,6 +86,7 @@
1.13 */
1.14 struct sh4_x86_state {
1.15 int in_delay_slot;
1.16 + uint8_t *code;
1.17 gboolean fpuen_checked; /* true if we've already checked fpu enabled. */
1.18 gboolean branch_taken; /* true if we branched unconditionally */
1.19 gboolean double_prec; /* true if FPU is in double-precision mode */
1.20 @@ -91,6 +94,7 @@
1.21 gboolean sse3_enabled; /* true if host supports SSE3 instructions */
1.22 uint32_t block_start_pc;
1.23 uint32_t stack_posn; /* Trace stack height for alignment purposes */
1.24 + uint32_t sh4_mode; /* Mirror of sh4r.xlat_sh4_mode */
1.25 int tstate;
1.26
1.27 /* mode flags */
1.28 @@ -171,7 +175,7 @@
1.29 fprintf( out, "%c%016lx: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),
1.30 target_pc, op, buf );
1.31 #else
1.32 - fprintf( out, "%c%08x: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),
1.33 + fprintf( out, "%c%08lx: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),
1.34 target_pc, op, buf );
1.35 #endif
1.36 if( source_recov_table < source_recov_end &&
1.37 @@ -249,6 +253,7 @@
1.38 #define SETNE_t() SETCCB_cc_rbpdisp(X86_COND_NE,R_T)
1.39 #define SETC_r8(r1) SETCCB_cc_r8(X86_COND_C, r1)
1.40 #define JAE_label(label) JCC_cc_rel8(X86_COND_AE,-1); MARK_JMP8(label)
1.41 +#define JBE_label(label) JCC_cc_rel8(X86_COND_BE,-1); MARK_JMP8(label)
1.42 #define JE_label(label) JCC_cc_rel8(X86_COND_E,-1); MARK_JMP8(label)
1.43 #define JGE_label(label) JCC_cc_rel8(X86_COND_GE,-1); MARK_JMP8(label)
1.44 #define JNA_label(label) JCC_cc_rel8(X86_COND_NA,-1); MARK_JMP8(label)
1.45 @@ -317,7 +322,7 @@
1.46 /* Exception checks - Note that all exception checks will clobber EAX */
1.47
1.48 #define check_priv( ) \
1.49 - if( (sh4r.xlat_sh4_mode & SR_MD) == 0 ) { \
1.50 + if( (sh4_x86.sh4_mode & SR_MD) == 0 ) { \
1.51 if( sh4_x86.in_delay_slot ) { \
1.52 exit_block_exc(EXC_SLOT_ILLEGAL, (pc-2) ); \
1.53 } else { \
1.54 @@ -365,7 +370,7 @@
1.55 TESTL_imms_r32( 0x00000007, x86reg ); \
1.56 JNE_exc(EXC_DATA_ADDR_WRITE);
1.57
1.58 -#define address_space() ((sh4r.xlat_sh4_mode&SR_MD) ? (uintptr_t)sh4_address_space : (uintptr_t)sh4_user_address_space)
1.59 +#define address_space() ((sh4_x86.sh4_mode&SR_MD) ? (uintptr_t)sh4_address_space : (uintptr_t)sh4_user_address_space)
1.60
1.61 #define UNDEF(ir)
1.62 /* Note: For SR.MD == 1 && MMUCR.AT == 0, there are no memory exceptions, so
1.63 @@ -375,7 +380,7 @@
1.64 static void call_read_func(int addr_reg, int value_reg, int offset, int pc)
1.65 {
1.66 decode_address(address_space(), addr_reg);
1.67 - if( !sh4_x86.tlb_on && (sh4r.xlat_sh4_mode & SR_MD) ) {
1.68 + if( !sh4_x86.tlb_on && (sh4_x86.sh4_mode & SR_MD) ) {
1.69 CALL1_r32disp_r32(REG_ECX, offset, addr_reg);
1.70 } else {
1.71 if( addr_reg != REG_ARG1 ) {
1.72 @@ -393,7 +398,7 @@
1.73 static void call_write_func(int addr_reg, int value_reg, int offset, int pc)
1.74 {
1.75 decode_address(address_space(), addr_reg);
1.76 - if( !sh4_x86.tlb_on && (sh4r.xlat_sh4_mode & SR_MD) ) {
1.77 + if( !sh4_x86.tlb_on && (sh4_x86.sh4_mode & SR_MD) ) {
1.78 CALL2_r32disp_r32_r32(REG_ECX, offset, addr_reg, value_reg);
1.79 } else {
1.80 if( value_reg != REG_ARG2 ) {
1.81 @@ -444,8 +449,7 @@
1.82
1.83 void sh4_translate_begin_block( sh4addr_t pc )
1.84 {
1.85 - enter_block();
1.86 - MOVP_immptr_rptr( ((uint8_t *)&sh4r) + 128, REG_EBP );
1.87 + sh4_x86.code = xlat_output;
1.88 sh4_x86.in_delay_slot = FALSE;
1.89 sh4_x86.fpuen_checked = FALSE;
1.90 sh4_x86.branch_taken = FALSE;
1.91 @@ -455,6 +459,8 @@
1.92 sh4_x86.tstate = TSTATE_NONE;
1.93 sh4_x86.double_prec = sh4r.fpscr & FPSCR_PR;
1.94 sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;
1.95 + sh4_x86.sh4_mode = sh4r.xlat_sh4_mode;
1.96 + enter_block();
1.97 }
1.98
1.99
1.100 @@ -481,19 +487,49 @@
1.101
1.102 #define UNTRANSLATABLE(pc) !IS_IN_ICACHE(pc)
1.103
1.104 +/** Offset of xlat_sh4_mode field relative to the code pointer */
1.105 +#define XLAT_SH4_MODE_CODE_OFFSET (uint32_t)(offsetof(struct xlat_cache_block, xlat_sh4_mode) - offsetof(struct xlat_cache_block,code) )
1.106 +
1.107 +/**
1.108 + * Test if the loaded target code pointer in %eax is valid, and if so jump
1.109 + * directly into it, bypassing the normal exit.
1.110 + */
1.111 +static void jump_next_block()
1.112 +{
1.113 + TESTP_rptr_rptr(REG_EAX, REG_EAX);
1.114 + JE_label(nocode);
1.115 + if( sh4_x86.sh4_mode == SH4_MODE_UNKNOWN ) {
1.116 + /* sr/fpscr was changed, possibly updated xlat_sh4_mode, so reload it */
1.117 + MOVL_rbpdisp_r32( REG_OFFSET(xlat_sh4_mode), REG_ECX );
1.118 + CMPL_r32_r32disp( REG_ECX, REG_EAX, XLAT_SH4_MODE_CODE_OFFSET );
1.119 + } else {
1.120 + CMPL_imms_r32disp( sh4_x86.sh4_mode, REG_EAX, XLAT_SH4_MODE_CODE_OFFSET );
1.121 + }
1.122 + JNE_label(wrongmode);
1.123 + LEAP_rptrdisp_rptr(REG_EAX, PROLOGUE_SIZE,REG_EAX);
1.124 + JMP_rptr(REG_EAX);
1.125 + JMP_TARGET(nocode); JMP_TARGET(wrongmode);
1.126 +}
1.127 +
1.128 /**
1.129 * Exit the block with sh4r.pc already written
1.130 */
1.131 void exit_block_pcset( sh4addr_t pc )
1.132 {
1.133 MOVL_imm32_r32( ((pc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.134 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.135 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );
1.136 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.137 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );
1.138 + JBE_label(exitloop);
1.139 MOVL_rbpdisp_r32( R_PC, REG_ARG1 );
1.140 if( sh4_x86.tlb_on ) {
1.141 CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);
1.142 } else {
1.143 CALL1_ptr_r32(xlat_get_code,REG_ARG1);
1.144 }
1.145 +
1.146 + jump_next_block();
1.147 + JMP_TARGET(exitloop);
1.148 exit_block();
1.149 }
1.150
1.151 @@ -503,14 +539,20 @@
1.152 void exit_block_newpcset( sh4addr_t pc )
1.153 {
1.154 MOVL_imm32_r32( ((pc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.155 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.156 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );
1.157 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.158 MOVL_rbpdisp_r32( R_NEW_PC, REG_ARG1 );
1.159 MOVL_r32_rbpdisp( REG_ARG1, R_PC );
1.160 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );
1.161 + JBE_label(exitloop);
1.162 if( sh4_x86.tlb_on ) {
1.163 CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);
1.164 } else {
1.165 CALL1_ptr_r32(xlat_get_code,REG_ARG1);
1.166 }
1.167 +
1.168 + jump_next_block();
1.169 + JMP_TARGET(exitloop);
1.170 exit_block();
1.171 }
1.172
1.173 @@ -520,18 +562,25 @@
1.174 */
1.175 void exit_block_abs( sh4addr_t pc, sh4addr_t endpc )
1.176 {
1.177 - MOVL_imm32_r32( pc, REG_ECX );
1.178 - MOVL_r32_rbpdisp( REG_ECX, R_PC );
1.179 + MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.180 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );
1.181 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.182 +
1.183 + MOVL_imm32_r32( pc, REG_ARG1 );
1.184 + MOVL_r32_rbpdisp( REG_ARG1, R_PC );
1.185 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );
1.186 + JBE_label(exitloop);
1.187 +
1.188 if( IS_IN_ICACHE(pc) ) {
1.189 MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );
1.190 ANDP_imms_rptr( -4, REG_EAX );
1.191 } else if( sh4_x86.tlb_on ) {
1.192 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ECX);
1.193 + CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);
1.194 } else {
1.195 - CALL1_ptr_r32(xlat_get_code, REG_ECX);
1.196 + CALL1_ptr_r32(xlat_get_code, REG_ARG1);
1.197 }
1.198 - MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.199 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.200 + jump_next_block();
1.201 + JMP_TARGET(exitloop);
1.202 exit_block();
1.203 }
1.204
1.205 @@ -540,19 +589,36 @@
1.206 */
1.207 void exit_block_rel( sh4addr_t pc, sh4addr_t endpc )
1.208 {
1.209 - MOVL_imm32_r32( pc - sh4_x86.block_start_pc, REG_ECX );
1.210 - ADDL_rbpdisp_r32( R_PC, REG_ECX );
1.211 - MOVL_r32_rbpdisp( REG_ECX, R_PC );
1.212 - if( IS_IN_ICACHE(pc) ) {
1.213 - MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );
1.214 - ANDP_imms_rptr( -4, REG_EAX );
1.215 - } else if( sh4_x86.tlb_on ) {
1.216 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ECX);
1.217 - } else {
1.218 - CALL1_ptr_r32(xlat_get_code, REG_ECX);
1.219 + MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.220 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );
1.221 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.222 +
1.223 + if( pc == sh4_x86.block_start_pc && sh4_x86.sh4_mode == sh4r.xlat_sh4_mode ) {
1.224 + /* Special case for tight loops - the PC doesn't change, and
1.225 + * we already know the target address. Just check events pending before
1.226 + * looping.
1.227 + */
1.228 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );
1.229 + uint32_t backdisp = ((uintptr_t)(sh4_x86.code - xlat_output)) + PROLOGUE_SIZE;
1.230 + JCC_cc_prerel(X86_COND_A, backdisp);
1.231 + } else {
1.232 + MOVL_imm32_r32( pc - sh4_x86.block_start_pc, REG_ARG1 );
1.233 + ADDL_rbpdisp_r32( R_PC, REG_ARG1 );
1.234 + MOVL_r32_rbpdisp( REG_ARG1, R_PC );
1.235 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );
1.236 + JBE_label(exitloop2);
1.237 +
1.238 + if( IS_IN_ICACHE(pc) ) {
1.239 + MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );
1.240 + ANDP_imms_rptr( -4, REG_EAX );
1.241 + } else if( sh4_x86.tlb_on ) {
1.242 + CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);
1.243 + } else {
1.244 + CALL1_ptr_r32(xlat_get_code, REG_ARG1);
1.245 + }
1.246 + jump_next_block();
1.247 + JMP_TARGET(exitloop2);
1.248 }
1.249 - MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );
1.250 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.251 exit_block();
1.252 }
1.253
1.254 @@ -567,13 +633,6 @@
1.255 ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );
1.256 MOVL_imm32_r32( code, REG_ARG1 );
1.257 CALL1_ptr_r32( sh4_raise_exception, REG_ARG1 );
1.258 - MOVL_rbpdisp_r32( R_PC, REG_ARG1 );
1.259 - if( sh4_x86.tlb_on ) {
1.260 - CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);
1.261 - } else {
1.262 - CALL1_ptr_r32(xlat_get_code,REG_ARG1);
1.263 - }
1.264 -
1.265 exit_block();
1.266 }
1.267
1.268 @@ -599,13 +658,7 @@
1.269 MOVL_imm32_r32( sh4_x86.in_delay_slot ? 1 : 0, REG_ECX );
1.270 MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(in_delay_slot) );
1.271
1.272 - CALL_ptr( sh4_execute_instruction );
1.273 - MOVL_rbpdisp_r32( R_PC, REG_EAX );
1.274 - if( sh4_x86.tlb_on ) {
1.275 - CALL1_ptr_r32(xlat_get_code_by_vma,REG_EAX);
1.276 - } else {
1.277 - CALL1_ptr_r32(xlat_get_code,REG_EAX);
1.278 - }
1.279 + CALL_ptr( sh4_execute_instruction );
1.280 exit_block();
1.281 }
1.282
1.283 @@ -627,12 +680,6 @@
1.284 MOVL_moffptr_eax( &sh4_cpu_period );
1.285 MULL_r32( REG_EDX );
1.286 ADDL_r32_rbpdisp( REG_EAX, REG_OFFSET(slice_cycle) );
1.287 - MOVL_rbpdisp_r32( R_PC, REG_ARG1 );
1.288 - if( sh4_x86.tlb_on ) {
1.289 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);
1.290 - } else {
1.291 - CALL1_ptr_r32(xlat_get_code, REG_ARG1);
1.292 - }
1.293 exit_block();
1.294
1.295 for( i=0; i< sh4_x86.backpatch_posn; i++ ) {
1.296 @@ -1975,6 +2022,7 @@
1.297 sh4_x86.fpuen_checked = FALSE;
1.298 sh4_x86.tstate = TSTATE_NONE;
1.299 sh4_x86.branch_taken = TRUE;
1.300 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;
1.301 if( UNTRANSLATABLE(pc+2) ) {
1.302 exit_block_emu(pc+2);
1.303 return 2;
1.304 @@ -2539,6 +2587,7 @@
1.305 XORL_imms_rbpdisp( FPSCR_SZ, REG_OFFSET(xlat_sh4_mode) );
1.306 sh4_x86.tstate = TSTATE_NONE;
1.307 sh4_x86.double_size = !sh4_x86.double_size;
1.308 + sh4_x86.sh4_mode = sh4_x86.sh4_mode ^ FPSCR_SZ;
1.309 :}
1.310
1.311 /* Processor control instructions */
1.312 @@ -2552,6 +2601,7 @@
1.313 CALL1_ptr_r32( sh4_write_sr, REG_EAX );
1.314 sh4_x86.fpuen_checked = FALSE;
1.315 sh4_x86.tstate = TSTATE_NONE;
1.316 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;
1.317 return 2;
1.318 }
1.319 :}
1.320 @@ -2624,6 +2674,7 @@
1.321 CALL1_ptr_r32( sh4_write_sr, REG_EAX );
1.322 sh4_x86.fpuen_checked = FALSE;
1.323 sh4_x86.tstate = TSTATE_NONE;
1.324 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;
1.325 return 2;
1.326 }
1.327 :}
1.328 @@ -2693,6 +2744,7 @@
1.329 load_reg( REG_EAX, Rm );
1.330 CALL1_ptr_r32( sh4_write_fpscr, REG_EAX );
1.331 sh4_x86.tstate = TSTATE_NONE;
1.332 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;
1.333 return 2;
1.334 :}
1.335 LDS.L @Rm+, FPSCR {:
1.336 @@ -2704,6 +2756,7 @@
1.337 ADDL_imms_rbpdisp( 4, REG_OFFSET(r[Rm]) );
1.338 CALL1_ptr_r32( sh4_write_fpscr, REG_EAX );
1.339 sh4_x86.tstate = TSTATE_NONE;
1.340 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;
1.341 return 2;
1.342 :}
1.343 LDS Rm, FPUL {:
.