[patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

classic Classic list List threaded Threaded
11 messages Options
Reply | Threaded
Open this post in threaded view
|

[patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vic Wang(BJ-RD)
Hi,
Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is http://hg.openjdk.java.net/jdk/hs.
Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.

The following is the changeset by the diff command.

diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
--- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08 03:33:00.000000000 +0800
+++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28 11:31:30.000000000 +0800
@@ -3167,6 +3167,89 @@
     return;
   }

+  if (UseAddressNop && VM_Version::is_zx()) {
+    //
+    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
+    //  1: 0x90
+    //  2: 0x66 0x90
+    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
+    //  4: 0x0F 0x1F 0x40 0x00
+    //  5: 0x0F 0x1F 0x44 0x00 0x00
+    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
+    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+
+    // The rest coding is ZX specific - don't use consecutive address nops
+
+    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+
+    while(i >= 15) {
+      // For ZX don't generate consecutive addess nops (mix with regular nops)
+      i -= 15;
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      addr_nop_8();
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8((unsigned char)0x90);
+                         // nop
+    }
+    switch (i) {
+      case 14:
+        emit_int8(0x66); // size prefix
+      case 13:
+        emit_int8(0x66); // size prefix
+      case 12:
+        addr_nop_8();
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      case 11:
+        emit_int8(0x66); // size prefix
+      case 10:
+        emit_int8(0x66); // size prefix
+      case 9:
+        emit_int8(0x66); // size prefix
+      case 8:
+        addr_nop_8();
+        break;
+      case 7:
+        addr_nop_7();
+        break;
+      case 6:
+        emit_int8(0x66); // size prefix
+      case 5:
+        addr_nop_5();
+        break;
+      case 4:
+        addr_nop_4();
+        break;
+      case 3:
+        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
+        emit_int8(0x66); // size prefix
+      case 2:
+        emit_int8(0x66); // size prefix
+      case 1:
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      default:
+        assert(i == 0, " ");
+    }
+    return;
+  }
+
   // Using nops with size prefixes "0x66 0x90".
   // From AMD Optimization Guide:
   //  1: 0x90
diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
--- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08 03:33:00.000000000 +0800
+++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28 11:30:40.000000000 +0800
@@ -628,6 +628,11 @@
   if (UseSSE < 1)
     _features &= ~CPU_SSE;

+  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
+  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
+UseAVX = 0;
+  }
+
   // first try initial setting and detect what we can support
   int use_avx_limit = 0;
   if (UseAVX > 0) {
@@ -1078,6 +1083,66 @@
   // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
   // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).

+
+  if( is_zx() ) { // ZX cpus specific settings
+    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
+      UseStoreImmI16 = false; // don't use it on ZX cpus
+    }
+    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
+      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
+        // Use it on all ZX cpus
+        UseAddressNop = true;
+      }
+    }
+    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
+      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
+    }
+    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
+      if( supports_sse3() ) {
+        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
+      } else {
+        UseXmmRegToRegMoveAll = false;
+      }
+    }
+    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
+#ifdef COMPILER2
+      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
+        // For new ZX cpus do the next optimization:
+        // don't align the beginning of a loop if there are enough instructions
+        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
+        // in current fetch line (OptoLoopAlignment) or the padding
+        // is big (> MaxLoopPad).
+        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
+        // generated NOP instructions. 11 is the largest size of one
+        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
+        MaxLoopPad = 11;
+      }
+#endif // COMPILER2
+      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
+      }
+      if (supports_sse4_2()) { // new ZX cpus
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
+        }
+      }
+      if (supports_sse4_2()) {
+        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
+        }
+      } else {
+        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
+        }
+        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
+      }
+    }
+
+    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
+    }
+  }
+
   if( is_amd() ) { // AMD cpus specific settings
     if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
       // Use it on new AMD cpus starting from Opteron.
@@ -1370,6 +1435,14 @@
 #ifdef COMPILER2
     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+    }
+#endif
+  }
+
+    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
+#ifdef COMPILER2
+    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
+      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
     }
 #endif
   }
diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
--- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08 03:33:00.000000000 +0800
+++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28 14:15:18.000000000 +0800
@@ -305,6 +305,9 @@
   enum Extended_Family {
     // AMD
     CPU_FAMILY_AMD_11H       = 0x11,
+    // ZX
+    CPU_FAMILY_ZX_CORE_F6    = 6,
+    CPU_FAMILY_ZX_CORE_F7    = 7,
     // Intel
     CPU_FAMILY_INTEL_CORE    = 6,
     CPU_MODEL_NEHALEM        = 0x1e,
@@ -549,6 +552,16 @@
       }
     }

+// ZX features.
+    if(is_zx()) {
+      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
+        result |= CPU_LZCNT;
+      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+        result |= CPU_3DNOW_PREFETCH;
+      }
+    }
+
     return result;
   }

@@ -657,6 +670,7 @@
   static bool is_P6()             { return cpu_family() >= 6; }
   static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
   static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
+  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
   static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
   static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi

@@ -680,7 +694,16 @@
       }
     } else if (is_amd()) {
       result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
-    }
+    } else if (is_zx()) {
+  bool supports_topology = supports_processor_topology();
+      if (supports_topology) {
+        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
+                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+      }
+      if (!supports_topology || result == 0) {
+        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
+      }
+}
     return result;
   }

@@ -688,7 +711,9 @@
     uint result = 1;
     if (is_intel() && supports_processor_topology()) {
       result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
-    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
+    } else if (is_zx() && supports_processor_topology()) {
+  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
       if (cpu_family() >= 0x17) {
         result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
       } else {
@@ -705,7 +730,9 @@
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
     } else if (is_amd()) {
       result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
-    }
+    } else if (is_zx()) {
+  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
+}
     if (result < 32) // not defined ?
       result = 32;   // 32 bytes by default on x86 and other x64
     return result;

Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
The results are as follows.
==============================
Test summary
==============================
   TEST                                              TOTAL  PASS  FAIL ERROR
   jtreg:test/jdk:tier1                               1660  1660     0     0
   jtreg:test/langtools:tier1                          3828  3828     0     0
   jtreg:test/nashorn:tier1                              0     0     0     0
   jtreg:test/jaxp:tier1                                 0     0     0     0
==============================
TEST SUCCESS

Best Regards!
VicWang | R&D
Telephone:+86-01082695388-892477


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

David Holmes
Hi Vic,

Have you, or your company signed the OCA? [1] Unfortunately there seems
to be a problem accessing the signatory list at the moment so I can't
check myself.

Thanks,
David
[1] http://openjdk.java.net/contribute/


On 28/12/2017 4:59 PM, Vic Wang(BJ-RD) wrote:

> Hi,
> Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is http://hg.openjdk.java.net/jdk/hs.
> Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.
>
> The following is the changeset by the diff command.
>
> diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28 11:31:30.000000000 +0800
> @@ -3167,6 +3167,89 @@
>       return;
>     }
>
> +  if (UseAddressNop && VM_Version::is_zx()) {
> +    //
> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
> +    //  1: 0x90
> +    //  2: 0x66 0x90
> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
> +    //  4: 0x0F 0x1F 0x40 0x00
> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +
> +    // The rest coding is ZX specific - don't use consecutive address nops
> +
> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +
> +    while(i >= 15) {
> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
> +      i -= 15;
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      addr_nop_8();
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8((unsigned char)0x90);
> +                         // nop
> +    }
> +    switch (i) {
> +      case 14:
> +        emit_int8(0x66); // size prefix
> +      case 13:
> +        emit_int8(0x66); // size prefix
> +      case 12:
> +        addr_nop_8();
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      case 11:
> +        emit_int8(0x66); // size prefix
> +      case 10:
> +        emit_int8(0x66); // size prefix
> +      case 9:
> +        emit_int8(0x66); // size prefix
> +      case 8:
> +        addr_nop_8();
> +        break;
> +      case 7:
> +        addr_nop_7();
> +        break;
> +      case 6:
> +        emit_int8(0x66); // size prefix
> +      case 5:
> +        addr_nop_5();
> +        break;
> +      case 4:
> +        addr_nop_4();
> +        break;
> +      case 3:
> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
> +        emit_int8(0x66); // size prefix
> +      case 2:
> +        emit_int8(0x66); // size prefix
> +      case 1:
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      default:
> +        assert(i == 0, " ");
> +    }
> +    return;
> +  }
> +
>     // Using nops with size prefixes "0x66 0x90".
>     // From AMD Optimization Guide:
>     //  1: 0x90
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28 11:30:40.000000000 +0800
> @@ -628,6 +628,11 @@
>     if (UseSSE < 1)
>       _features &= ~CPU_SSE;
>
> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
> +  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
> +UseAVX = 0;
> +  }
> +
>     // first try initial setting and detect what we can support
>     int use_avx_limit = 0;
>     if (UseAVX > 0) {
> @@ -1078,6 +1083,66 @@
>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>
> +
> +  if( is_zx() ) { // ZX cpus specific settings
> +    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
> +      UseStoreImmI16 = false; // don't use it on ZX cpus
> +    }
> +    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
> +      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
> +        // Use it on all ZX cpus
> +        UseAddressNop = true;
> +      }
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
> +      if( supports_sse3() ) {
> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
> +      } else {
> +        UseXmmRegToRegMoveAll = false;
> +      }
> +    }
> +    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
> +#ifdef COMPILER2
> +      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
> +        // For new ZX cpus do the next optimization:
> +        // don't align the beginning of a loop if there are enough instructions
> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
> +        // in current fetch line (OptoLoopAlignment) or the padding
> +        // is big (> MaxLoopPad).
> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
> +        // generated NOP instructions. 11 is the largest size of one
> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
> +        MaxLoopPad = 11;
> +      }
> +#endif // COMPILER2
> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
> +      }
> +      if (supports_sse4_2()) { // new ZX cpus
> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
> +        }
> +      }
> +      if (supports_sse4_2()) {
> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
> +        }
> +      } else {
> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
> +        }
> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
> +      }
> +    }
> +
> +    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
> +    }
> +  }
> +
>     if( is_amd() ) { // AMD cpus specific settings
>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>         // Use it on new AMD cpus starting from Opteron.
> @@ -1370,6 +1435,14 @@
>   #ifdef COMPILER2
>       if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +    }
> +#endif
> +  }
> +
> +    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
> +#ifdef COMPILER2
> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>       }
>   #endif
>     }
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28 14:15:18.000000000 +0800
> @@ -305,6 +305,9 @@
>     enum Extended_Family {
>       // AMD
>       CPU_FAMILY_AMD_11H       = 0x11,
> +    // ZX
> +    CPU_FAMILY_ZX_CORE_F6    = 6,
> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>       // Intel
>       CPU_FAMILY_INTEL_CORE    = 6,
>       CPU_MODEL_NEHALEM        = 0x1e,
> @@ -549,6 +552,16 @@
>         }
>       }
>
> +// ZX features.
> +    if(is_zx()) {
> +      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
> +        result |= CPU_LZCNT;
> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
> +        result |= CPU_3DNOW_PREFETCH;
> +      }
> +    }
> +
>       return result;
>     }
>
> @@ -657,6 +670,7 @@
>     static bool is_P6()             { return cpu_family() >= 6; }
>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>
> @@ -680,7 +694,16 @@
>         }
>       } else if (is_amd()) {
>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
> -    }
> +    } else if (is_zx()) {
> +  bool supports_topology = supports_processor_topology();
> +      if (supports_topology) {
> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +      }
> +      if (!supports_topology || result == 0) {
> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
> +      }
> +}
>       return result;
>     }
>
> @@ -688,7 +711,9 @@
>       uint result = 1;
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> -    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
> +    } else if (is_zx() && supports_processor_topology()) {
> +  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>         if (cpu_family() >= 0x17) {
>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>         } else {
> @@ -705,7 +730,9 @@
>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       } else if (is_amd()) {
>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
> -    }
> +    } else if (is_zx()) {
> +  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
> +}
>       if (result < 32) // not defined ?
>         result = 32;   // 32 bytes by default on x86 and other x64
>       return result;
>
> Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
> The results are as follows.
> ==============================
> Test summary
> ==============================
>     TEST                                              TOTAL  PASS  FAIL ERROR
>     jtreg:test/jdk:tier1                               1660  1660     0     0
>     jtreg:test/langtools:tier1                          3828  3828     0     0
>     jtreg:test/nashorn:tier1                              0     0     0     0
>     jtreg:test/jaxp:tier1                                 0     0     0     0
> ==============================
> TEST SUCCESS
>
> Best Regards!
> VicWang | R&D
> Telephone:+86-01082695388-892477
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
> CONFIDENTIAL NOTE:
> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>
Reply | Threaded
Open this post in threaded view
|

答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vic Wang(BJ-RD)
Hi David:


My company has already signed the OCA, and I can find my company's name(Shanghai Zhaoxin Semiconductor Co. Ltd.) in the signatories list.

Please check it again.

Thanks very much.

Best Regards!
Vic
________________________________
发件人: David Holmes <[hidden email]>
发送时间: 2018年1月1日 15:48
收件人: Vic Wang(BJ-RD); [hidden email]
抄送: Cobe Chen(BJ-RD)
主题: Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Hi Vic,

Have you, or your company signed the OCA? [1] Unfortunately there seems
to be a problem accessing the signatory list at the moment so I can't
check myself.

Thanks,
David
[1] http://openjdk.java.net/contribute/


On 28/12/2017 4:59 PM, Vic Wang(BJ-RD) wrote:

> Hi,
> Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is http://hg.openjdk.java.net/jdk/hs.
> Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.
>
> The following is the changeset by the diff command.
>
> diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28 11:31:30.000000000 +0800
> @@ -3167,6 +3167,89 @@
>       return;
>     }
>
> +  if (UseAddressNop && VM_Version::is_zx()) {
> +    //
> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
> +    //  1: 0x90
> +    //  2: 0x66 0x90
> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
> +    //  4: 0x0F 0x1F 0x40 0x00
> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +
> +    // The rest coding is ZX specific - don't use consecutive address nops
> +
> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +
> +    while(i >= 15) {
> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
> +      i -= 15;
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      addr_nop_8();
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8((unsigned char)0x90);
> +                         // nop
> +    }
> +    switch (i) {
> +      case 14:
> +        emit_int8(0x66); // size prefix
> +      case 13:
> +        emit_int8(0x66); // size prefix
> +      case 12:
> +        addr_nop_8();
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      case 11:
> +        emit_int8(0x66); // size prefix
> +      case 10:
> +        emit_int8(0x66); // size prefix
> +      case 9:
> +        emit_int8(0x66); // size prefix
> +      case 8:
> +        addr_nop_8();
> +        break;
> +      case 7:
> +        addr_nop_7();
> +        break;
> +      case 6:
> +        emit_int8(0x66); // size prefix
> +      case 5:
> +        addr_nop_5();
> +        break;
> +      case 4:
> +        addr_nop_4();
> +        break;
> +      case 3:
> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
> +        emit_int8(0x66); // size prefix
> +      case 2:
> +        emit_int8(0x66); // size prefix
> +      case 1:
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      default:
> +        assert(i == 0, " ");
> +    }
> +    return;
> +  }
> +
>     // Using nops with size prefixes "0x66 0x90".
>     // From AMD Optimization Guide:
>     //  1: 0x90
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28 11:30:40.000000000 +0800
> @@ -628,6 +628,11 @@
>     if (UseSSE < 1)
>       _features &= ~CPU_SSE;
>
> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
> +  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
> +UseAVX = 0;
> +  }
> +
>     // first try initial setting and detect what we can support
>     int use_avx_limit = 0;
>     if (UseAVX > 0) {
> @@ -1078,6 +1083,66 @@
>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>
> +
> +  if( is_zx() ) { // ZX cpus specific settings
> +    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
> +      UseStoreImmI16 = false; // don't use it on ZX cpus
> +    }
> +    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
> +      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
> +        // Use it on all ZX cpus
> +        UseAddressNop = true;
> +      }
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
> +      if( supports_sse3() ) {
> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
> +      } else {
> +        UseXmmRegToRegMoveAll = false;
> +      }
> +    }
> +    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
> +#ifdef COMPILER2
> +      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
> +        // For new ZX cpus do the next optimization:
> +        // don't align the beginning of a loop if there are enough instructions
> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
> +        // in current fetch line (OptoLoopAlignment) or the padding
> +        // is big (> MaxLoopPad).
> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
> +        // generated NOP instructions. 11 is the largest size of one
> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
> +        MaxLoopPad = 11;
> +      }
> +#endif // COMPILER2
> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
> +      }
> +      if (supports_sse4_2()) { // new ZX cpus
> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
> +        }
> +      }
> +      if (supports_sse4_2()) {
> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
> +        }
> +      } else {
> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
> +        }
> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
> +      }
> +    }
> +
> +    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
> +    }
> +  }
> +
>     if( is_amd() ) { // AMD cpus specific settings
>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>         // Use it on new AMD cpus starting from Opteron.
> @@ -1370,6 +1435,14 @@
>   #ifdef COMPILER2
>       if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +    }
> +#endif
> +  }
> +
> +    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
> +#ifdef COMPILER2
> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>       }
>   #endif
>     }
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28 14:15:18.000000000 +0800
> @@ -305,6 +305,9 @@
>     enum Extended_Family {
>       // AMD
>       CPU_FAMILY_AMD_11H       = 0x11,
> +    // ZX
> +    CPU_FAMILY_ZX_CORE_F6    = 6,
> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>       // Intel
>       CPU_FAMILY_INTEL_CORE    = 6,
>       CPU_MODEL_NEHALEM        = 0x1e,
> @@ -549,6 +552,16 @@
>         }
>       }
>
> +// ZX features.
> +    if(is_zx()) {
> +      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
> +        result |= CPU_LZCNT;
> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
> +        result |= CPU_3DNOW_PREFETCH;
> +      }
> +    }
> +
>       return result;
>     }
>
> @@ -657,6 +670,7 @@
>     static bool is_P6()             { return cpu_family() >= 6; }
>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>
> @@ -680,7 +694,16 @@
>         }
>       } else if (is_amd()) {
>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
> -    }
> +    } else if (is_zx()) {
> +  bool supports_topology = supports_processor_topology();
> +      if (supports_topology) {
> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +      }
> +      if (!supports_topology || result == 0) {
> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
> +      }
> +}
>       return result;
>     }
>
> @@ -688,7 +711,9 @@
>       uint result = 1;
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> -    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
> +    } else if (is_zx() && supports_processor_topology()) {
> +  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>         if (cpu_family() >= 0x17) {
>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>         } else {
> @@ -705,7 +730,9 @@
>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       } else if (is_amd()) {
>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
> -    }
> +    } else if (is_zx()) {
> +  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
> +}
>       if (result < 32) // not defined ?
>         result = 32;   // 32 bytes by default on x86 and other x64
>       return result;
>
> Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
> The results are as follows.
> ==============================
> Test summary
> ==============================
>     TEST                                              TOTAL  PASS  FAIL ERROR
>     jtreg:test/jdk:tier1                               1660  1660     0     0
>     jtreg:test/langtools:tier1                          3828  3828     0     0
>     jtreg:test/nashorn:tier1                              0     0     0     0
>     jtreg:test/jaxp:tier1                                 0     0     0     0
> ==============================
> TEST SUCCESS
>
> Best Regards!
> VicWang | R&D
> Telephone:+86-01082695388-892477
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
> CONFIDENTIAL NOTE:
> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

David Holmes
On 1/01/2018 6:48 PM, Vic Wang(BJ-RD) wrote:
> Hi David:
>
>
> My company has already signed the OCA, and I can find my company's
> name(Shanghai Zhaoxin Semiconductor Co. Ltd.) in the signatories list.

Thanks for confirming that.

> Please check it again.

I can't access it - I just get one of those "the server is redirecting
the request for this address in a way that will never complete" errors.

I'll take a look at the patch itself tomorrow once I'm back at work.

David

> Thanks very much.
>
>
> Best Regards!
> Vic
> ------------------------------------------------------------------------
> *发件人:* David Holmes <[hidden email]>
> *发送时间:* 2018年1月1日 15:48
> *收件人:* Vic Wang(BJ-RD); [hidden email]
> *抄送:* Cobe Chen(BJ-RD)
> *主题:* Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and
> Shanghai
> Hi Vic,
>
> Have you, or your company signed the OCA? [1] Unfortunately there seems
> to be a problem accessing the signatory list at the moment so I can't
> check myself.
>
> Thanks,
> David
> [1] http://openjdk.java.net/contribute/
>
>
> On 28/12/2017 4:59 PM, Vic Wang(BJ-RD) wrote:
>> Hi,
>> Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is  http://hg.openjdk.java.net/jdk/hs.
>> Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.
>>
>> The following is the changeset by the diff command.
>>
>> diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
>> --- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08 03:33:00.000000000 +0800
>> +++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28 11:31:30.000000000 +0800
>> @@ -3167,6 +3167,89 @@
>>       return;
>>     }
>>
>> +  if (UseAddressNop && VM_Version::is_zx()) {
>> +    //
>> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
>> +    //  1: 0x90
>> +    //  2: 0x66 0x90
>> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
>> +    //  4: 0x0F 0x1F 0x40 0x00
>> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
>> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
>> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
>> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +
>> +    // The rest coding is ZX specific - don't use consecutive address nops
>> +
>> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +
>> +    while(i >= 15) {
>> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
>> +      i -= 15;
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      addr_nop_8();
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8((unsigned char)0x90);
>> +                         // nop
>> +    }
>> +    switch (i) {
>> +      case 14:
>> +        emit_int8(0x66); // size prefix
>> +      case 13:
>> +        emit_int8(0x66); // size prefix
>> +      case 12:
>> +        addr_nop_8();
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      case 11:
>> +        emit_int8(0x66); // size prefix
>> +      case 10:
>> +        emit_int8(0x66); // size prefix
>> +      case 9:
>> +        emit_int8(0x66); // size prefix
>> +      case 8:
>> +        addr_nop_8();
>> +        break;
>> +      case 7:
>> +        addr_nop_7();
>> +        break;
>> +      case 6:
>> +        emit_int8(0x66); // size prefix
>> +      case 5:
>> +        addr_nop_5();
>> +        break;
>> +      case 4:
>> +        addr_nop_4();
>> +        break;
>> +      case 3:
>> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
>> +        emit_int8(0x66); // size prefix
>> +      case 2:
>> +        emit_int8(0x66); // size prefix
>> +      case 1:
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      default:
>> +        assert(i == 0, " ");
>> +    }
>> +    return;
>> +  }
>> +
>>     // Using nops with size prefixes "0x66 0x90".
>>     // From AMD Optimization Guide:
>>     //  1: 0x90
>> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
>> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08 03:33:00.000000000 +0800
>> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28 11:30:40.000000000 +0800
>> @@ -628,6 +628,11 @@
>>     if (UseSSE < 1)
>>       _features &= ~CPU_SSE;
>>
>> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
>> +  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
>> +UseAVX = 0;
>> +  }
>> +
>>     // first try initial setting and detect what we can support
>>     int use_avx_limit = 0;
>>     if (UseAVX > 0) {
>> @@ -1078,6 +1083,66 @@
>>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>>
>> +
>> +  if( is_zx() ) { // ZX cpus specific settings
>> +    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
>> +      UseStoreImmI16 = false; // don't use it on ZX cpus
>> +    }
>> +    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
>> +      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
>> +        // Use it on all ZX cpus
>> +        UseAddressNop = true;
>> +      }
>> +    }
>> +    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
>> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
>> +    }
>> +    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
>> +      if( supports_sse3() ) {
>> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
>> +      } else {
>> +        UseXmmRegToRegMoveAll = false;
>> +      }
>> +    }
>> +    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
>> +#ifdef COMPILER2
>> +      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
>> +        // For new ZX cpus do the next optimization:
>> +        // don't align the beginning of a loop if there are enough instructions
>> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
>> +        // in current fetch line (OptoLoopAlignment) or the padding
>> +        // is big (> MaxLoopPad).
>> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
>> +        // generated NOP instructions. 11 is the largest size of one
>> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
>> +        MaxLoopPad = 11;
>> +      }
>> +#endif // COMPILER2
>> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
>> +      }
>> +      if (supports_sse4_2()) { // new ZX cpus
>> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
>> +        }
>> +      }
>> +      if (supports_sse4_2()) {
>> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
>> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>> +        }
>> +      } else {
>> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
>> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
>> +        }
>> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
>> +      }
>> +    }
>> +
>> +    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
>> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
>> +    }
>> +  }
>> +
>>     if( is_amd() ) { // AMD cpus specific settings
>>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>>         // Use it on new AMD cpus starting from Opteron.
>> @@ -1370,6 +1435,14 @@
>>   #ifdef COMPILER2
>>       if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>> +    }
>> +#endif
>> +  }
>> +
>> +    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
>> +#ifdef COMPILER2
>> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>       }
>>   #endif
>>     }
>> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
>> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08 03:33:00.000000000 +0800
>> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28 14:15:18.000000000 +0800
>> @@ -305,6 +305,9 @@
>>     enum Extended_Family {
>>       // AMD
>>       CPU_FAMILY_AMD_11H       = 0x11,
>> +    // ZX
>> +    CPU_FAMILY_ZX_CORE_F6    = 6,
>> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>>       // Intel
>>       CPU_FAMILY_INTEL_CORE    = 6,
>>       CPU_MODEL_NEHALEM        = 0x1e,
>> @@ -549,6 +552,16 @@
>>         }
>>       }
>>
>> +// ZX features.
>> +    if(is_zx()) {
>> +      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>> +        result |= CPU_LZCNT;
>> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
>> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>> +        result |= CPU_3DNOW_PREFETCH;
>> +      }
>> +    }
>> +
>>       return result;
>>     }
>>
>> @@ -657,6 +670,7 @@
>>     static bool is_P6()             { return cpu_family() >= 6; }
>>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
>> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>>
>> @@ -680,7 +694,16 @@
>>         }
>>       } else if (is_amd()) {
>>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
>> -    }
>> +    } else if (is_zx()) {
>> +  bool supports_topology = supports_processor_topology();
>> +      if (supports_topology) {
>> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
>> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +      }
>> +      if (!supports_topology || result == 0) {
>> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
>> +      }
>> +}
>>       return result;
>>     }
>>
>> @@ -688,7 +711,9 @@
>>       uint result = 1;
>>       if (is_intel() && supports_processor_topology()) {
>>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> -    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>> +    } else if (is_zx() && supports_processor_topology()) {
>> +  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>         if (cpu_family() >= 0x17) {
>>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>>         } else {
>> @@ -705,7 +730,9 @@
>>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>>       } else if (is_amd()) {
>>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
>> -    }
>> +    } else if (is_zx()) {
>> +  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>> +}
>>       if (result < 32) // not defined ?
>>         result = 32;   // 32 bytes by default on x86 and other x64
>>       return result;
>>
>> Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
>> The results are as follows.
>> ==============================
>> Test summary
>> ==============================
>>     TEST                                              TOTAL  PASS  FAIL ERROR
>>     jtreg:test/jdk:tier1                               1660  1660     0     0
>>     jtreg:test/langtools:tier1                          3828  3828     0     0
>>     jtreg:test/nashorn:tier1                              0     0     0     0
>>     jtreg:test/jaxp:tier1                                 0     0     0     0
>> ==============================
>> TEST SUCCESS
>>
>> Best Regards!
>> VicWang | R&D
>> Telephone:+86-01082695388-892477
>>
>>
>> 保密声明:
>> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
>> CONFIDENTIAL NOTE:
>> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>>
>
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何
> 未经授权的查阅、使用、复制或转发。
> /CONFIDENTIAL NOTE: /
> /This email contains confidential or legally privileged information and
> is for the sole use of its intended recipient. Any unauthorized review,
> use, copying or forwarding of this email or the content of this email is
> strictly prohibited./
Reply | Threaded
Open this post in threaded view
|

Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

David Holmes
In reply to this post by Vic Wang(BJ-RD)
Hi,

I've created a JBS issue for this enhancement request:

https://bugs.openjdk.java.net/browse/JDK-8194279

On 28/12/2017 4:59 PM, Vic Wang(BJ-RD) wrote:
> Hi,
> Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is http://hg.openjdk.java.net/jdk/hs.
> Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.
>
> The following is the changeset by the diff command.

Can you generate a patch using "hg diff" or "hg export" please. The raw
diff can't be applied directly to an existing repo without manual editing.

Thanks,
David

> diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28 11:31:30.000000000 +0800
> @@ -3167,6 +3167,89 @@
>       return;
>     }
>
> +  if (UseAddressNop && VM_Version::is_zx()) {
> +    //
> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
> +    //  1: 0x90
> +    //  2: 0x66 0x90
> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
> +    //  4: 0x0F 0x1F 0x40 0x00
> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +
> +    // The rest coding is ZX specific - don't use consecutive address nops
> +
> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +
> +    while(i >= 15) {
> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
> +      i -= 15;
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      addr_nop_8();
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8((unsigned char)0x90);
> +                         // nop
> +    }
> +    switch (i) {
> +      case 14:
> +        emit_int8(0x66); // size prefix
> +      case 13:
> +        emit_int8(0x66); // size prefix
> +      case 12:
> +        addr_nop_8();
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      case 11:
> +        emit_int8(0x66); // size prefix
> +      case 10:
> +        emit_int8(0x66); // size prefix
> +      case 9:
> +        emit_int8(0x66); // size prefix
> +      case 8:
> +        addr_nop_8();
> +        break;
> +      case 7:
> +        addr_nop_7();
> +        break;
> +      case 6:
> +        emit_int8(0x66); // size prefix
> +      case 5:
> +        addr_nop_5();
> +        break;
> +      case 4:
> +        addr_nop_4();
> +        break;
> +      case 3:
> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
> +        emit_int8(0x66); // size prefix
> +      case 2:
> +        emit_int8(0x66); // size prefix
> +      case 1:
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      default:
> +        assert(i == 0, " ");
> +    }
> +    return;
> +  }
> +
>     // Using nops with size prefixes "0x66 0x90".
>     // From AMD Optimization Guide:
>     //  1: 0x90
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28 11:30:40.000000000 +0800
> @@ -628,6 +628,11 @@
>     if (UseSSE < 1)
>       _features &= ~CPU_SSE;
>
> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
> +  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
> +UseAVX = 0;
> +  }
> +
>     // first try initial setting and detect what we can support
>     int use_avx_limit = 0;
>     if (UseAVX > 0) {
> @@ -1078,6 +1083,66 @@
>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>
> +
> +  if( is_zx() ) { // ZX cpus specific settings
> +    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
> +      UseStoreImmI16 = false; // don't use it on ZX cpus
> +    }
> +    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
> +      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
> +        // Use it on all ZX cpus
> +        UseAddressNop = true;
> +      }
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
> +      if( supports_sse3() ) {
> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
> +      } else {
> +        UseXmmRegToRegMoveAll = false;
> +      }
> +    }
> +    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
> +#ifdef COMPILER2
> +      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
> +        // For new ZX cpus do the next optimization:
> +        // don't align the beginning of a loop if there are enough instructions
> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
> +        // in current fetch line (OptoLoopAlignment) or the padding
> +        // is big (> MaxLoopPad).
> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
> +        // generated NOP instructions. 11 is the largest size of one
> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
> +        MaxLoopPad = 11;
> +      }
> +#endif // COMPILER2
> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
> +      }
> +      if (supports_sse4_2()) { // new ZX cpus
> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
> +        }
> +      }
> +      if (supports_sse4_2()) {
> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
> +        }
> +      } else {
> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
> +        }
> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
> +      }
> +    }
> +
> +    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
> +    }
> +  }
> +
>     if( is_amd() ) { // AMD cpus specific settings
>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>         // Use it on new AMD cpus starting from Opteron.
> @@ -1370,6 +1435,14 @@
>   #ifdef COMPILER2
>       if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +    }
> +#endif
> +  }
> +
> +    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
> +#ifdef COMPILER2
> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>       }
>   #endif
>     }
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28 14:15:18.000000000 +0800
> @@ -305,6 +305,9 @@
>     enum Extended_Family {
>       // AMD
>       CPU_FAMILY_AMD_11H       = 0x11,
> +    // ZX
> +    CPU_FAMILY_ZX_CORE_F6    = 6,
> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>       // Intel
>       CPU_FAMILY_INTEL_CORE    = 6,
>       CPU_MODEL_NEHALEM        = 0x1e,
> @@ -549,6 +552,16 @@
>         }
>       }
>
> +// ZX features.
> +    if(is_zx()) {
> +      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
> +        result |= CPU_LZCNT;
> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
> +        result |= CPU_3DNOW_PREFETCH;
> +      }
> +    }
> +
>       return result;
>     }
>
> @@ -657,6 +670,7 @@
>     static bool is_P6()             { return cpu_family() >= 6; }
>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>
> @@ -680,7 +694,16 @@
>         }
>       } else if (is_amd()) {
>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
> -    }
> +    } else if (is_zx()) {
> +  bool supports_topology = supports_processor_topology();
> +      if (supports_topology) {
> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +      }
> +      if (!supports_topology || result == 0) {
> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
> +      }
> +}
>       return result;
>     }
>
> @@ -688,7 +711,9 @@
>       uint result = 1;
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> -    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
> +    } else if (is_zx() && supports_processor_topology()) {
> +  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>         if (cpu_family() >= 0x17) {
>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>         } else {
> @@ -705,7 +730,9 @@
>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       } else if (is_amd()) {
>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
> -    }
> +    } else if (is_zx()) {
> +  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
> +}
>       if (result < 32) // not defined ?
>         result = 32;   // 32 bytes by default on x86 and other x64
>       return result;
>
> Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
> The results are as follows.
> ==============================
> Test summary
> ==============================
>     TEST                                              TOTAL  PASS  FAIL ERROR
>     jtreg:test/jdk:tier1                               1660  1660     0     0
>     jtreg:test/langtools:tier1                          3828  3828     0     0
>     jtreg:test/nashorn:tier1                              0     0     0     0
>     jtreg:test/jaxp:tier1                                 0     0     0     0
> ==============================
> TEST SUCCESS
>
> Best Regards!
> VicWang | R&D
> Telephone:+86-01082695388-892477
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
> CONFIDENTIAL NOTE:
> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>
Reply | Threaded
Open this post in threaded view
|

答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vic Wang(BJ-RD)
Hi David:
I generate a patch using "hg diff". Please check it again. Thanks very much.

diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.cppWed Jan 03 16:04:30 2018 +0800
@@ -3167,6 +3167,89 @@
     return;
   }

+  if (UseAddressNop && VM_Version::is_zx()) {
+    //
+    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
+    //  1: 0x90
+    //  2: 0x66 0x90
+    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
+    //  4: 0x0F 0x1F 0x40 0x00
+    //  5: 0x0F 0x1F 0x44 0x00 0x00
+    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
+    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+
+    // The rest coding is ZX specific - don't use consecutive address nops
+
+    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+
+    while(i >= 15) {
+      // For ZX don't generate consecutive addess nops (mix with regular nops)
+      i -= 15;
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      addr_nop_8();
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8((unsigned char)0x90);
+                         // nop
+    }
+    switch (i) {
+      case 14:
+        emit_int8(0x66); // size prefix
+      case 13:
+        emit_int8(0x66); // size prefix
+      case 12:
+        addr_nop_8();
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      case 11:
+        emit_int8(0x66); // size prefix
+      case 10:
+        emit_int8(0x66); // size prefix
+      case 9:
+        emit_int8(0x66); // size prefix
+      case 8:
+        addr_nop_8();
+        break;
+      case 7:
+        addr_nop_7();
+        break;
+      case 6:
+        emit_int8(0x66); // size prefix
+      case 5:
+        addr_nop_5();
+        break;
+      case 4:
+        addr_nop_4();
+        break;
+      case 3:
+        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
+        emit_int8(0x66); // size prefix
+      case 2:
+        emit_int8(0x66); // size prefix
+      case 1:
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      default:
+        assert(i == 0, " ");
+    }
+    return;
+  }
+
   // Using nops with size prefixes "0x66 0x90".
   // From AMD Optimization Guide:
   //  1: 0x90
diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.cpp
--- a/src/hotspot/cpu/x86/vm_version_x86.cppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.cppWed Jan 03 16:04:30 2018 +0800
@@ -628,6 +628,11 @@
   if (UseSSE < 1)
     _features &= ~CPU_SSE;

+  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
+  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
+UseAVX = 0;
+  }
+
   // first try initial setting and detect what we can support
   int use_avx_limit = 0;
   if (UseAVX > 0) {
@@ -1078,6 +1083,66 @@
   // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
   // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).

+
+  if( is_zx() ) { // ZX cpus specific settings
+    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
+      UseStoreImmI16 = false; // don't use it on ZX cpus
+    }
+    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
+      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
+        // Use it on all ZX cpus
+        UseAddressNop = true;
+      }
+    }
+    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
+      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
+    }
+    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
+      if( supports_sse3() ) {
+        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
+      } else {
+        UseXmmRegToRegMoveAll = false;
+      }
+    }
+    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3() ) { // new ZX cpus
+#ifdef COMPILER2
+      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
+        // For new ZX cpus do the next optimization:
+        // don't align the beginning of a loop if there are enough instructions
+        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
+        // in current fetch line (OptoLoopAlignment) or the padding
+        // is big (> MaxLoopPad).
+        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
+        // generated NOP instructions. 11 is the largest size of one
+        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
+        MaxLoopPad = 11;
+      }
+#endif // COMPILER2
+      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
+      }
+      if (supports_sse4_2()) { // new ZX cpus
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
+        }
+      }
+      if (supports_sse4_2()) {
+        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
+        }
+      } else {
+        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
+        }
+        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
+      }
+    }
+
+    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
+    }
+  }
+
   if( is_amd() ) { // AMD cpus specific settings
     if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
       // Use it on new AMD cpus starting from Opteron.
@@ -1374,6 +1439,14 @@
 #endif
   }

+    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()) {
+#ifdef COMPILER2
+    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
+      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+    }
+#endif
+  }
+
 #ifdef _LP64
   // Prefetch settings

diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.hpp
--- a/src/hotspot/cpu/x86/vm_version_x86.hppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.hppWed Jan 03 16:04:30 2018 +0800
@@ -305,6 +305,9 @@
   enum Extended_Family {
     // AMD
     CPU_FAMILY_AMD_11H       = 0x11,
+    // ZX
+    CPU_FAMILY_ZX_CORE_F6    = 6,
+CPU_FAMILY_ZX_CORE_F7    = 7,
     // Intel
     CPU_FAMILY_INTEL_CORE    = 6,
     CPU_MODEL_NEHALEM        = 0x1e,
@@ -549,6 +552,16 @@
       }
     }

+// ZX features.
+    if(is_zx()) {
+      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
+        result |= CPU_LZCNT;
+      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+        result |= CPU_3DNOW_PREFETCH;
+      }
+    }
+
     return result;
   }

@@ -657,6 +670,7 @@
   static bool is_P6()             { return cpu_family() >= 6; }
   static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
   static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
+  static bool is_zx()            { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
   static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
   static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi

@@ -680,7 +694,16 @@
       }
     } else if (is_amd()) {
       result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
-    }
+    } else if (is_zx()) {
+  bool supports_topology = supports_processor_topology();
+      if (supports_topology) {
+        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
+                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+      }
+      if (!supports_topology || result == 0) {
+        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
+      }
+}
     return result;
   }

@@ -688,7 +711,9 @@
     uint result = 1;
     if (is_intel() && supports_processor_topology()) {
       result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
-    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
+    } else if (is_zx() && supports_processor_topology()) {
+  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
       if (cpu_family() >= 0x17) {
         result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
       } else {
@@ -705,7 +730,9 @@
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
     } else if (is_amd()) {
       result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
-    }
+    } else if (is_zx()) {
+  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
+}
     if (result < 32) // not defined ?
       result = 32;   // 32 bytes by default on x86 and other x64
     return result;


Best Regards!
VicWang | R&D
Telephone:+86-01082695388-892477

-----邮件原件-----
发件人: David Holmes [mailto:[hidden email]]
发送时间: 2018年1月2日 5:18
收件人: Vic Wang(BJ-RD) <[hidden email]>; [hidden email]
抄送: Cobe Chen(BJ-RD) <[hidden email]>
主题: Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Hi,

I've created a JBS issue for this enhancement request:

https://bugs.openjdk.java.net/browse/JDK-8194279

On 28/12/2017 4:59 PM, Vic Wang(BJ-RD) wrote:
> Hi,
> Here is a hotspot patch for supporting zhaoxin x86 cpu vendor ids (CentaurHauls id and Shanghai id). The original repository is http://hg.openjdk.java.net/jdk/hs.
> Please help me to submit the changes . If there is anything incorrectly, please let me know. Thank you very much.
>
> The following is the changeset by the diff command.

Can you generate a patch using "hg diff" or "hg export" please. The raw diff can't be applied directly to an existing repo without manual editing.

Thanks,
David

> diff -Nur org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
> zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-08
> 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/assembler_x86.cpp2017-12-28
> +++ 11:31:30.000000000 +0800
> @@ -3167,6 +3167,89 @@
>       return;
>     }
>
> +  if (UseAddressNop && VM_Version::is_zx()) {
> +    //
> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
> +    //  1: 0x90
> +    //  2: 0x66 0x90
> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
> +    //  4: 0x0F 0x1F 0x40 0x00
> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +
> +    // The rest coding is ZX specific - don't use consecutive address
> + nops
> +
> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> + 0x66 0x66 0x66 0x90
> +
> +    while(i >= 15) {
> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
> +      i -= 15;
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      addr_nop_8();
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8((unsigned char)0x90);
> +                         // nop
> +    }
> +    switch (i) {
> +      case 14:
> +        emit_int8(0x66); // size prefix
> +      case 13:
> +        emit_int8(0x66); // size prefix
> +      case 12:
> +        addr_nop_8();
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      case 11:
> +        emit_int8(0x66); // size prefix
> +      case 10:
> +        emit_int8(0x66); // size prefix
> +      case 9:
> +        emit_int8(0x66); // size prefix
> +      case 8:
> +        addr_nop_8();
> +        break;
> +      case 7:
> +        addr_nop_7();
> +        break;
> +      case 6:
> +        emit_int8(0x66); // size prefix
> +      case 5:
> +        addr_nop_5();
> +        break;
> +      case 4:
> +        addr_nop_4();
> +        break;
> +      case 3:
> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
> +        emit_int8(0x66); // size prefix
> +      case 2:
> +        emit_int8(0x66); // size prefix
> +      case 1:
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      default:
> +        assert(i == 0, " ");
> +    }
> +    return;
> +  }
> +
>     // Using nops with size prefixes "0x66 0x90".
>     // From AMD Optimization Guide:
>     //  1: 0x90
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
> zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-08
> 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.cpp2017-12-28
> +++ 11:30:40.000000000 +0800
> @@ -628,6 +628,11 @@
>     if (UseSSE < 1)
>       _features &= ~CPU_SSE;
>
> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
> +  if(is_zx() && ((cpu_family()==6)||(cpu_family()==7))){
> +UseAVX = 0;
> +  }
> +
>     // first try initial setting and detect what we can support
>     int use_avx_limit = 0;
>     if (UseAVX > 0) {
> @@ -1078,6 +1083,66 @@
>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>
> +
> +  if( is_zx() ) { // ZX cpus specific settings
> +    if( FLAG_IS_DEFAULT(UseStoreImmI16) ) {
> +      UseStoreImmI16 = false; // don't use it on ZX cpus
> +    }
> +    if( (cpu_family() == 6)|| (cpu_family() == 7) ) {
> +      if( FLAG_IS_DEFAULT(UseAddressNop) ) {
> +        // Use it on all ZX cpus
> +        UseAddressNop = true;
> +      }
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
> +    }
> +    if( FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll) ) {
> +      if( supports_sse3() ) {
> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
> +      } else {
> +        UseXmmRegToRegMoveAll = false;
> +      }
> +    }
> +    if( ((cpu_family() == 6)||(cpu_family() == 7)) && supports_sse3()
> +) { // new ZX cpus #ifdef COMPILER2
> +      if( FLAG_IS_DEFAULT(MaxLoopPad) ) {
> +        // For new ZX cpus do the next optimization:
> +        // don't align the beginning of a loop if there are enough instructions
> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
> +        // in current fetch line (OptoLoopAlignment) or the padding
> +        // is big (> MaxLoopPad).
> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
> +        // generated NOP instructions. 11 is the largest size of one
> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
> +        MaxLoopPad = 11;
> +      }
> +#endif // COMPILER2
> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
> +      }
> +      if (supports_sse4_2()) { // new ZX cpus
> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
> +        }
> +      }
> +      if (supports_sse4_2()) {
> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
> +        }
> +      } else {
> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
> +        }
> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
> +      }
> +    }
> +
> +    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
> +    }
> +  }
> +
>     if( is_amd() ) { // AMD cpus specific settings
>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>         // Use it on new AMD cpus starting from Opteron.
> @@ -1370,6 +1435,14 @@
>   #ifdef COMPILER2
>       if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +    }
> +#endif
> +  }
> +
> +    if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
> +supports_sse3()) { #ifdef COMPILER2
> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>       }
>   #endif
>     }
> diff -Nur org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
> zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp
> --- org/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-08
> 03:33:00.000000000 +0800
> +++ zx/jdk/src/hotspot/cpu/x86/vm_version_x86.hpp2017-12-28
> +++ 14:15:18.000000000 +0800
> @@ -305,6 +305,9 @@
>     enum Extended_Family {
>       // AMD
>       CPU_FAMILY_AMD_11H       = 0x11,
> +    // ZX
> +    CPU_FAMILY_ZX_CORE_F6    = 6,
> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>       // Intel
>       CPU_FAMILY_INTEL_CORE    = 6,
>       CPU_MODEL_NEHALEM        = 0x1e,
> @@ -549,6 +552,16 @@
>         }
>       }
>
> +// ZX features.
> +    if(is_zx()) {
> +      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
> +        result |= CPU_LZCNT;
> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
> +        result |= CPU_3DNOW_PREFETCH;
> +      }
> +    }
> +
>       return result;
>     }
>
> @@ -657,6 +670,7 @@
>     static bool is_P6()             { return cpu_family() >= 6; }
>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543)||(_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>     static bool is_knights_family() { return ((cpu_family() == 0x06)
> && ((extended_cpu_model() == 0x57) || (extended_cpu_model() ==
> 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>
> @@ -680,7 +694,16 @@
>         }
>       } else if (is_amd()) {
>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
> -    }
> +    } else if (is_zx()) {
> +  bool supports_topology = supports_processor_topology();
> +      if (supports_topology) {
> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +      }
> +      if (!supports_topology || result == 0) {
> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
> +      }
> +}
>       return result;
>     }
>
> @@ -688,7 +711,9 @@
>       uint result = 1;
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> -    } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
> +    } else if (is_zx() && supports_processor_topology()) {
> +  result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +} else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>         if (cpu_family() >= 0x17) {
>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>         } else {
> @@ -705,7 +730,9 @@
>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       } else if (is_amd()) {
>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
> -    }
> +    } else if (is_zx()) {
> +  result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1); }
>       if (result < 32) // not defined ?
>         result = 32;   // 32 bytes by default on x86 and other x64
>       return result;
>
> Finally, running the jtreg test. I run 2 jtreg tests(jtreg:test/jdk:tier1 and jtreg:test/langtools:tier1) and both of them are pass.
> The results are as follows.
> ==============================
> Test summary
> ==============================
>     TEST                                              TOTAL  PASS  FAIL ERROR
>     jtreg:test/jdk:tier1                               1660  1660     0     0
>     jtreg:test/langtools:tier1                          3828  3828     0     0
>     jtreg:test/nashorn:tier1                              0     0     0     0
>     jtreg:test/jaxp:tier1                                 0     0     0     0
> ==============================
> TEST SUCCESS
>
> Best Regards!
> VicWang | R&D
> Telephone:+86-01082695388-892477
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
> CONFIDENTIAL NOTE:
> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

David Holmes
On 3/01/2018 6:32 PM, Vic Wang(BJ-RD) wrote:
> Hi David:
> I generate a patch using "hg diff". Please check it again. Thanks very much.

Thanks.

> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017 +0100

Something stripped out the spaces between the file name and timestamp.

I've hosted a webrev for the patch at:

http://cr.openjdk.java.net/~dholmes/8194279/webrev


A few minor comments. I can't comment on the technical accuracy of course :)

All copyright notices need the second year updated to 2018.

- src/hotspot/cpu/x86/assembler_x86.cpp

Seems fine - you copied the existing pattern so I won't call out any
style bugs. :)

---

- src/hotspot/cpu/x86/vm_version_x86.cpp

633 UseAVX = 0;

Indention is missing.

General style nit: your if statements are very inconsistently formatted.
A statement like:

if( is_zx() )

should be written:

if (is_zx())

- space after the "if" and no space after ( or before )


In this block:

+     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
supports_sse3()) {
+ #ifdef COMPILER2
+     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
+       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+     }
+ #endif

I see this copies the existing pattern but isn't it simpler just to write:

     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
supports_sse4_2()) {
#ifdef COMPILER2
     if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
     }
#endif

?

---

-  src/hotspot/cpu/x86/vm_version_x86.hpp

310 CPU_FAMILY_ZX_CORE_F7    = 7,

Indentation is missing.

555 // ZX features.

Indentation is missing.

  556     if(is_zx()) {
  557       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
  558         result |= CPU_LZCNT;
  559       // for ZX, ecx.bits.misalignsse bit (bit 8) indicates
support for prefetchw
  560       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
  561         result |= CPU_3DNOW_PREFETCH;
  562       }
  563     }

The "intel" block you copied is full of style issues :( But as it is
exactly the same can't you just adjust:

546     if(is_intel()) {

to

546     if (is_intel() || is_zx()) {

? Similarly for cores_per_cpu(), threads_per_core() and L1_line_size().

Otherwise there are indentation issues in your changes to all those
functions.

Thanks,
David
-----
Reply | Threaded
Open this post in threaded view
|

答复: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vic Wang(BJ-RD)
Hi David:
Thanks for the detailed review.
I've regenerated the patch by "hg diff" for modifying the coding style issues.
Additionally, I don't merge some of the branches into intel's because it may add some codes when the new cpu features are added.

diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.cppThu Jan 04 12:20:31 2018 +0800
@@ -3167,6 +3167,89 @@
     return;
   }

+  if (UseAddressNop && VM_Version::is_zx()) {
+    //
+    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
+    //  1: 0x90
+    //  2: 0x66 0x90
+    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
+    //  4: 0x0F 0x1F 0x40 0x00
+    //  5: 0x0F 0x1F 0x44 0x00 0x00
+    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
+    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+
+    // The rest coding is ZX specific - don't use consecutive address nops
+
+    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
+
+    while(i >= 15) {
+      // For ZX don't generate consecutive addess nops (mix with regular nops)
+      i -= 15;
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      addr_nop_8();
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8(0x66);   // size prefix
+      emit_int8((unsigned char)0x90);
+                         // nop
+    }
+    switch (i) {
+      case 14:
+        emit_int8(0x66); // size prefix
+      case 13:
+        emit_int8(0x66); // size prefix
+      case 12:
+        addr_nop_8();
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8(0x66); // size prefix
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      case 11:
+        emit_int8(0x66); // size prefix
+      case 10:
+        emit_int8(0x66); // size prefix
+      case 9:
+        emit_int8(0x66); // size prefix
+      case 8:
+        addr_nop_8();
+        break;
+      case 7:
+        addr_nop_7();
+        break;
+      case 6:
+        emit_int8(0x66); // size prefix
+      case 5:
+        addr_nop_5();
+        break;
+      case 4:
+        addr_nop_4();
+        break;
+      case 3:
+        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
+        emit_int8(0x66); // size prefix
+      case 2:
+        emit_int8(0x66); // size prefix
+      case 1:
+        emit_int8((unsigned char)0x90);
+                         // nop
+        break;
+      default:
+        assert(i == 0, " ");
+    }
+    return;
+  }
+
   // Using nops with size prefixes "0x66 0x90".
   // From AMD Optimization Guide:
   //  1: 0x90
diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.cpp
--- a/src/hotspot/cpu/x86/vm_version_x86.cppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.cppThu Jan 04 12:20:31 2018 +0800
@@ -628,6 +628,11 @@
   if (UseSSE < 1)
     _features &= ~CPU_SSE;

+  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
+  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7))){
+UseAVX = 0;
+  }
+
   // first try initial setting and detect what we can support
   int use_avx_limit = 0;
   if (UseAVX > 0) {
@@ -1078,6 +1083,66 @@
   // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
   // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).

+
+  if (is_zx()) { // ZX cpus specific settings
+    if (FLAG_IS_DEFAULT(UseStoreImmI16)) {
+      UseStoreImmI16 = false; // don't use it on ZX cpus
+    }
+    if ((cpu_family() == 6) || (cpu_family() == 7)) {
+      if (FLAG_IS_DEFAULT(UseAddressNop)) {
+        // Use it on all ZX cpus
+        UseAddressNop = true;
+      }
+    }
+    if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) {
+      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
+    }
+    if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) {
+      if (supports_sse3()) {
+        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
+      } else {
+        UseXmmRegToRegMoveAll = false;
+      }
+    }
+    if (((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse3()) { // new ZX cpus
+#ifdef COMPILER2
+      if (FLAG_IS_DEFAULT(MaxLoopPad)) {
+        // For new ZX cpus do the next optimization:
+        // don't align the beginning of a loop if there are enough instructions
+        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
+        // in current fetch line (OptoLoopAlignment) or the padding
+        // is big (> MaxLoopPad).
+        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
+        // generated NOP instructions. 11 is the largest size of one
+        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
+        MaxLoopPad = 11;
+      }
+#endif // COMPILER2
+      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
+      }
+      if (supports_sse4_2()) { // new ZX cpus
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
+        }
+      }
+      if (supports_sse4_2()) {
+        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
+        }
+      } else {
+        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
+        }
+        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
+      }
+    }
+
+    if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
+    }
+  }
+
   if( is_amd() ) { // AMD cpus specific settings
     if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
       // Use it on new AMD cpus starting from Opteron.
@@ -1374,6 +1439,14 @@
 #endif
   }

+  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse4_2()) {
+#ifdef COMPILER2
+    if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
+      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+    }
+#endif
+  }
+
 #ifdef _LP64
   // Prefetch settings

diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.hpp
--- a/src/hotspot/cpu/x86/vm_version_x86.hppThu Dec 07 15:52:46 2017 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.hppThu Jan 04 12:20:31 2018 +0800
@@ -305,6 +305,9 @@
   enum Extended_Family {
     // AMD
     CPU_FAMILY_AMD_11H       = 0x11,
+    // ZX
+    CPU_FAMILY_ZX_CORE_F6    = 6,
+    CPU_FAMILY_ZX_CORE_F7    = 7,
     // Intel
     CPU_FAMILY_INTEL_CORE    = 6,
     CPU_MODEL_NEHALEM        = 0x1e,
@@ -549,6 +552,16 @@
       }
     }

+    // ZX features.
+    if (is_zx()) {
+      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
+        result |= CPU_LZCNT;
+      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+        result |= CPU_3DNOW_PREFETCH;
+      }
+    }
+
     return result;
   }

@@ -657,6 +670,7 @@
   static bool is_P6()             { return cpu_family() >= 6; }
   static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
   static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
+  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543) || (_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
   static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
   static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi

@@ -680,6 +694,15 @@
       }
     } else if (is_amd()) {
       result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
+    } else if (is_zx()) {
+      bool supports_topology = supports_processor_topology();
+      if (supports_topology) {
+        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
+                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+      }
+      if (!supports_topology || result == 0) {
+        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
+      }
     }
     return result;
   }
@@ -688,6 +711,8 @@
     uint result = 1;
     if (is_intel() && supports_processor_topology()) {
       result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
+    } else if (is_zx() && supports_processor_topology()) {
+      result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
     } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
       if (cpu_family() >= 0x17) {
         result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
@@ -705,6 +730,8 @@
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
     } else if (is_amd()) {
       result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
+    } else if (is_zx()) {
+      result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
     }
     if (result < 32) // not defined ?
       result = 32;   // 32 bytes by default on x86 and other x64


Best Regards!
VicWang | R&D
Telephone:+86-01082695388-892477
-----邮件原件-----
发件人: David Holmes [mailto:[hidden email]]
发送时间: 2018年1月3日 18:03
收件人: Vic Wang(BJ-RD) <[hidden email]>; [hidden email]
抄送: Cobe Chen(BJ-RD) <[hidden email]>
主题: Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

On 3/01/2018 6:32 PM, Vic Wang(BJ-RD) wrote:
> Hi David:
> I generate a patch using "hg diff". Please check it again. Thanks very much.

Thanks.

> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017
> +0100

Something stripped out the spaces between the file name and timestamp.

I've hosted a webrev for the patch at:

http://cr.openjdk.java.net/~dholmes/8194279/webrev


A few minor comments. I can't comment on the technical accuracy of course :)

All copyright notices need the second year updated to 2018.

- src/hotspot/cpu/x86/assembler_x86.cpp

Seems fine - you copied the existing pattern so I won't call out any style bugs. :)

---

- src/hotspot/cpu/x86/vm_version_x86.cpp

633 UseAVX = 0;

Indention is missing.

General style nit: your if statements are very inconsistently formatted.
A statement like:

if( is_zx() )

should be written:

if (is_zx())

- space after the "if" and no space after ( or before )


In this block:

+     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
supports_sse3()) {
+ #ifdef COMPILER2
+     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
+       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+     }
+ #endif

I see this copies the existing pattern but isn't it simpler just to write:

     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
supports_sse4_2()) {
#ifdef COMPILER2
     if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
     }
#endif

?

---

-  src/hotspot/cpu/x86/vm_version_x86.hpp

310 CPU_FAMILY_ZX_CORE_F7    = 7,

Indentation is missing.

555 // ZX features.

Indentation is missing.

  556     if(is_zx()) {
  557       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
  558         result |= CPU_LZCNT;
  559       // for ZX, ecx.bits.misalignsse bit (bit 8) indicates
support for prefetchw
  560       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
  561         result |= CPU_3DNOW_PREFETCH;
  562       }
  563     }

The "intel" block you copied is full of style issues :( But as it is exactly the same can't you just adjust:

546     if(is_intel()) {

to

546     if (is_intel() || is_zx()) {

? Similarly for cores_per_cpu(), threads_per_core() and L1_line_size().

Otherwise there are indentation issues in your changes to all those functions.

Thanks,
David
-----


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

David Holmes
Hi,

On 4/01/2018 6:35 PM, Vic Wang(BJ-RD) wrote:
> Hi David:
> Thanks for the detailed review.
> I've regenerated the patch by "hg diff" for modifying the coding style issues.
> Additionally, I don't merge some of the branches into intel's because it may add some codes when the new cpu features are added.

Okay this all seems fine. I've updated the webrev in place and fixed the
copyright years and a couple of space/indent issues:

http://cr.openjdk.java.net/~dholmes/8194279/webrev

if you'd like to check it. Also please advise how you wish the
"contributed-by:" line to appear. For now I'm assuming:

contributed-by: Vic Wang <[hidden email]>

We still need a second reviewer.

Thanks,
David

>
> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017 +0100
> +++ b/src/hotspot/cpu/x86/assembler_x86.cppThu Jan 04 12:20:31 2018 +0800
> @@ -3167,6 +3167,89 @@
>       return;
>     }
>
> +  if (UseAddressNop && VM_Version::is_zx()) {
> +    //
> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
> +    //  1: 0x90
> +    //  2: 0x66 0x90
> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
> +    //  4: 0x0F 0x1F 0x40 0x00
> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
> +
> +    // The rest coding is ZX specific - don't use consecutive address nops
> +
> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
> +
> +    while(i >= 15) {
> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
> +      i -= 15;
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      addr_nop_8();
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8(0x66);   // size prefix
> +      emit_int8((unsigned char)0x90);
> +                         // nop
> +    }
> +    switch (i) {
> +      case 14:
> +        emit_int8(0x66); // size prefix
> +      case 13:
> +        emit_int8(0x66); // size prefix
> +      case 12:
> +        addr_nop_8();
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8(0x66); // size prefix
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      case 11:
> +        emit_int8(0x66); // size prefix
> +      case 10:
> +        emit_int8(0x66); // size prefix
> +      case 9:
> +        emit_int8(0x66); // size prefix
> +      case 8:
> +        addr_nop_8();
> +        break;
> +      case 7:
> +        addr_nop_7();
> +        break;
> +      case 6:
> +        emit_int8(0x66); // size prefix
> +      case 5:
> +        addr_nop_5();
> +        break;
> +      case 4:
> +        addr_nop_4();
> +        break;
> +      case 3:
> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
> +        emit_int8(0x66); // size prefix
> +      case 2:
> +        emit_int8(0x66); // size prefix
> +      case 1:
> +        emit_int8((unsigned char)0x90);
> +                         // nop
> +        break;
> +      default:
> +        assert(i == 0, " ");
> +    }
> +    return;
> +  }
> +
>     // Using nops with size prefixes "0x66 0x90".
>     // From AMD Optimization Guide:
>     //  1: 0x90
> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.cpp
> --- a/src/hotspot/cpu/x86/vm_version_x86.cppThu Dec 07 15:52:46 2017 +0100
> +++ b/src/hotspot/cpu/x86/vm_version_x86.cppThu Jan 04 12:20:31 2018 +0800
> @@ -628,6 +628,11 @@
>     if (UseSSE < 1)
>       _features &= ~CPU_SSE;
>
> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7))){
> +UseAVX = 0;
> +  }
> +
>     // first try initial setting and detect what we can support
>     int use_avx_limit = 0;
>     if (UseAVX > 0) {
> @@ -1078,6 +1083,66 @@
>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>
> +
> +  if (is_zx()) { // ZX cpus specific settings
> +    if (FLAG_IS_DEFAULT(UseStoreImmI16)) {
> +      UseStoreImmI16 = false; // don't use it on ZX cpus
> +    }
> +    if ((cpu_family() == 6) || (cpu_family() == 7)) {
> +      if (FLAG_IS_DEFAULT(UseAddressNop)) {
> +        // Use it on all ZX cpus
> +        UseAddressNop = true;
> +      }
> +    }
> +    if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) {
> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
> +    }
> +    if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) {
> +      if (supports_sse3()) {
> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
> +      } else {
> +        UseXmmRegToRegMoveAll = false;
> +      }
> +    }
> +    if (((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse3()) { // new ZX cpus
> +#ifdef COMPILER2
> +      if (FLAG_IS_DEFAULT(MaxLoopPad)) {
> +        // For new ZX cpus do the next optimization:
> +        // don't align the beginning of a loop if there are enough instructions
> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
> +        // in current fetch line (OptoLoopAlignment) or the padding
> +        // is big (> MaxLoopPad).
> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
> +        // generated NOP instructions. 11 is the largest size of one
> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
> +        MaxLoopPad = 11;
> +      }
> +#endif // COMPILER2
> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
> +      }
> +      if (supports_sse4_2()) { // new ZX cpus
> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
> +        }
> +      }
> +      if (supports_sse4_2()) {
> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
> +        }
> +      } else {
> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled.");
> +        }
> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
> +      }
> +    }
> +
> +    if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
> +    }
> +  }
> +
>     if( is_amd() ) { // AMD cpus specific settings
>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>         // Use it on new AMD cpus starting from Opteron.
> @@ -1374,6 +1439,14 @@
>   #endif
>     }
>
> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse4_2()) {
> +#ifdef COMPILER2
> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +    }
> +#endif
> +  }
> +
>   #ifdef _LP64
>     // Prefetch settings
>
> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.hpp
> --- a/src/hotspot/cpu/x86/vm_version_x86.hppThu Dec 07 15:52:46 2017 +0100
> +++ b/src/hotspot/cpu/x86/vm_version_x86.hppThu Jan 04 12:20:31 2018 +0800
> @@ -305,6 +305,9 @@
>     enum Extended_Family {
>       // AMD
>       CPU_FAMILY_AMD_11H       = 0x11,
> +    // ZX
> +    CPU_FAMILY_ZX_CORE_F6    = 6,
> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>       // Intel
>       CPU_FAMILY_INTEL_CORE    = 6,
>       CPU_MODEL_NEHALEM        = 0x1e,
> @@ -549,6 +552,16 @@
>         }
>       }
>
> +    // ZX features.
> +    if (is_zx()) {
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
> +        result |= CPU_LZCNT;
> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
> +        result |= CPU_3DNOW_PREFETCH;
> +      }
> +    }
> +
>       return result;
>     }
>
> @@ -657,6 +670,7 @@
>     static bool is_P6()             { return cpu_family() >= 6; }
>     static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>     static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
> +  static bool is_zx()             { assert_is_initialized(); return (_cpuid_info.std_vendor_name_0 == 0x746e6543) || (_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and Centerton
>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() == 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>
> @@ -680,6 +694,15 @@
>         }
>       } else if (is_amd()) {
>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
> +    } else if (is_zx()) {
> +      bool supports_topology = supports_processor_topology();
> +      if (supports_topology) {
> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +      }
> +      if (!supports_topology || result == 0) {
> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
> +      }
>       }
>       return result;
>     }
> @@ -688,6 +711,8 @@
>       uint result = 1;
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
> +    } else if (is_zx() && supports_processor_topology()) {
> +      result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>       } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>         if (cpu_family() >= 0x17) {
>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
> @@ -705,6 +730,8 @@
>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       } else if (is_amd()) {
>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
> +    } else if (is_zx()) {
> +      result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>       }
>       if (result < 32) // not defined ?
>         result = 32;   // 32 bytes by default on x86 and other x64
>
>
> Best Regards!
> VicWang | R&D
> Telephone:+86-01082695388-892477
> -----邮件原件-----
> 发件人: David Holmes [mailto:[hidden email]]
> 发送时间: 2018年1月3日 18:03
> 收件人: Vic Wang(BJ-RD) <[hidden email]>; [hidden email]
> 抄送: Cobe Chen(BJ-RD) <[hidden email]>
> 主题: Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai
>
> On 3/01/2018 6:32 PM, Vic Wang(BJ-RD) wrote:
>> Hi David:
>> I generate a patch using "hg diff". Please check it again. Thanks very much.
>
> Thanks.
>
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
>> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017
>> +0100
>
> Something stripped out the spaces between the file name and timestamp.
>
> I've hosted a webrev for the patch at:
>
> http://cr.openjdk.java.net/~dholmes/8194279/webrev
>
>
> A few minor comments. I can't comment on the technical accuracy of course :)
>
> All copyright notices need the second year updated to 2018.
>
> - src/hotspot/cpu/x86/assembler_x86.cpp
>
> Seems fine - you copied the existing pattern so I won't call out any style bugs. :)
>
> ---
>
> - src/hotspot/cpu/x86/vm_version_x86.cpp
>
> 633 UseAVX = 0;
>
> Indention is missing.
>
> General style nit: your if statements are very inconsistently formatted.
> A statement like:
>
> if( is_zx() )
>
> should be written:
>
> if (is_zx())
>
> - space after the "if" and no space after ( or before )
>
>
> In this block:
>
> +     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
> supports_sse3()) {
> + #ifdef COMPILER2
> +     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
> +       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +     }
> + #endif
>
> I see this copies the existing pattern but isn't it simpler just to write:
>
>       if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
> supports_sse4_2()) {
> #ifdef COMPILER2
>       if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>       }
> #endif
>
> ?
>
> ---
>
> -  src/hotspot/cpu/x86/vm_version_x86.hpp
>
> 310 CPU_FAMILY_ZX_CORE_F7    = 7,
>
> Indentation is missing.
>
> 555 // ZX features.
>
> Indentation is missing.
>
>    556     if(is_zx()) {
>    557       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>    558         result |= CPU_LZCNT;
>    559       // for ZX, ecx.bits.misalignsse bit (bit 8) indicates
> support for prefetchw
>    560       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>    561         result |= CPU_3DNOW_PREFETCH;
>    562       }
>    563     }
>
> The "intel" block you copied is full of style issues :( But as it is exactly the same can't you just adjust:
>
> 546     if(is_intel()) {
>
> to
>
> 546     if (is_intel() || is_zx()) {
>
> ? Similarly for cores_per_cpu(), threads_per_core() and L1_line_size().
>
> Otherwise there are indentation issues in your changes to all those functions.
>
> Thanks,
> David
> -----
>
>
> 保密声明:
> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
> CONFIDENTIAL NOTE:
> This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>
Reply | Threaded
Open this post in threaded view
|

Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vladimir Kozlov
Looks good to me.

Thanks,
Vladimir

On 1/4/18 4:20 AM, David Holmes wrote:

> Hi,
>
> On 4/01/2018 6:35 PM, Vic Wang(BJ-RD) wrote:
>> Hi David:
>> Thanks for the detailed review.
>> I've regenerated the patch by "hg diff" for modifying the coding style issues.
>> Additionally, I don't merge some of the branches into intel's because it may add some codes when
>> the new cpu features are added.
>
> Okay this all seems fine. I've updated the webrev in place and fixed the copyright years and a
> couple of space/indent issues:
>
> http://cr.openjdk.java.net/~dholmes/8194279/webrev
>
> if you'd like to check it. Also please advise how you wish the "contributed-by:" line to appear. For
> now I'm assuming:
>
> contributed-by: Vic Wang <[hidden email]>
>
> We still need a second reviewer.
>
> Thanks,
> David
>
>>
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
>> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017 +0100
>> +++ b/src/hotspot/cpu/x86/assembler_x86.cppThu Jan 04 12:20:31 2018 +0800
>> @@ -3167,6 +3167,89 @@
>>       return;
>>     }
>>
>> +  if (UseAddressNop && VM_Version::is_zx()) {
>> +    //
>> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
>> +    //  1: 0x90
>> +    //  2: 0x66 0x90
>> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
>> +    //  4: 0x0F 0x1F 0x40 0x00
>> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
>> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
>> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
>> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +
>> +    // The rest coding is ZX specific - don't use consecutive address nops
>> +
>> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
>> +
>> +    while(i >= 15) {
>> +      // For ZX don't generate consecutive addess nops (mix with regular nops)
>> +      i -= 15;
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      addr_nop_8();
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8((unsigned char)0x90);
>> +                         // nop
>> +    }
>> +    switch (i) {
>> +      case 14:
>> +        emit_int8(0x66); // size prefix
>> +      case 13:
>> +        emit_int8(0x66); // size prefix
>> +      case 12:
>> +        addr_nop_8();
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      case 11:
>> +        emit_int8(0x66); // size prefix
>> +      case 10:
>> +        emit_int8(0x66); // size prefix
>> +      case 9:
>> +        emit_int8(0x66); // size prefix
>> +      case 8:
>> +        addr_nop_8();
>> +        break;
>> +      case 7:
>> +        addr_nop_7();
>> +        break;
>> +      case 6:
>> +        emit_int8(0x66); // size prefix
>> +      case 5:
>> +        addr_nop_5();
>> +        break;
>> +      case 4:
>> +        addr_nop_4();
>> +        break;
>> +      case 3:
>> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
>> +        emit_int8(0x66); // size prefix
>> +      case 2:
>> +        emit_int8(0x66); // size prefix
>> +      case 1:
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      default:
>> +        assert(i == 0, " ");
>> +    }
>> +    return;
>> +  }
>> +
>>     // Using nops with size prefixes "0x66 0x90".
>>     // From AMD Optimization Guide:
>>     //  1: 0x90
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.cpp
>> --- a/src/hotspot/cpu/x86/vm_version_x86.cppThu Dec 07 15:52:46 2017 +0100
>> +++ b/src/hotspot/cpu/x86/vm_version_x86.cppThu Jan 04 12:20:31 2018 +0800
>> @@ -628,6 +628,11 @@
>>     if (UseSSE < 1)
>>       _features &= ~CPU_SSE;
>>
>> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
>> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7))){
>> +UseAVX = 0;
>> +  }
>> +
>>     // first try initial setting and detect what we can support
>>     int use_avx_limit = 0;
>>     if (UseAVX > 0) {
>> @@ -1078,6 +1083,66 @@
>>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>>
>> +
>> +  if (is_zx()) { // ZX cpus specific settings
>> +    if (FLAG_IS_DEFAULT(UseStoreImmI16)) {
>> +      UseStoreImmI16 = false; // don't use it on ZX cpus
>> +    }
>> +    if ((cpu_family() == 6) || (cpu_family() == 7)) {
>> +      if (FLAG_IS_DEFAULT(UseAddressNop)) {
>> +        // Use it on all ZX cpus
>> +        UseAddressNop = true;
>> +      }
>> +    }
>> +    if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) {
>> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
>> +    }
>> +    if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) {
>> +      if (supports_sse3()) {
>> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus
>> +      } else {
>> +        UseXmmRegToRegMoveAll = false;
>> +      }
>> +    }
>> +    if (((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse3()) { // new ZX cpus
>> +#ifdef COMPILER2
>> +      if (FLAG_IS_DEFAULT(MaxLoopPad)) {
>> +        // For new ZX cpus do the next optimization:
>> +        // don't align the beginning of a loop if there are enough instructions
>> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
>> +        // in current fetch line (OptoLoopAlignment) or the padding
>> +        // is big (> MaxLoopPad).
>> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
>> +        // generated NOP instructions. 11 is the largest size of one
>> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
>> +        MaxLoopPad = 11;
>> +      }
>> +#endif // COMPILER2
>> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
>> +      }
>> +      if (supports_sse4_2()) { // new ZX cpus
>> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus
>> +        }
>> +      }
>> +      if (supports_sse4_2()) {
>> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
>> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>> +        }
>> +      } else {
>> +        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
>> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be
>> disabled.");
>> +        }
>> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
>> +      }
>> +    }
>> +
>> +    if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
>> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
>> +    }
>> +  }
>> +
>>     if( is_amd() ) { // AMD cpus specific settings
>>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>>         // Use it on new AMD cpus starting from Opteron.
>> @@ -1374,6 +1439,14 @@
>>   #endif
>>     }
>>
>> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse4_2()) {
>> +#ifdef COMPILER2
>> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>> +    }
>> +#endif
>> +  }
>> +
>>   #ifdef _LP64
>>     // Prefetch settings
>>
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.hpp
>> --- a/src/hotspot/cpu/x86/vm_version_x86.hppThu Dec 07 15:52:46 2017 +0100
>> +++ b/src/hotspot/cpu/x86/vm_version_x86.hppThu Jan 04 12:20:31 2018 +0800
>> @@ -305,6 +305,9 @@
>>     enum Extended_Family {
>>       // AMD
>>       CPU_FAMILY_AMD_11H       = 0x11,
>> +    // ZX
>> +    CPU_FAMILY_ZX_CORE_F6    = 6,
>> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>>       // Intel
>>       CPU_FAMILY_INTEL_CORE    = 6,
>>       CPU_MODEL_NEHALEM        = 0x1e,
>> @@ -549,6 +552,16 @@
>>         }
>>       }
>>
>> +    // ZX features.
>> +    if (is_zx()) {
>> +      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>> +        result |= CPU_LZCNT;
>> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
>> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>> +        result |= CPU_3DNOW_PREFETCH;
>> +      }
>> +    }
>> +
>>       return result;
>>     }
>>
>> @@ -657,6 +670,7 @@
>>     static bool is_P6()             { return cpu_family() >= 6; }
>>     static bool is_amd()            { assert_is_initialized(); return
>> _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>>     static bool is_intel()          { assert_is_initialized(); return
>> _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
>> +  static bool is_zx()             { assert_is_initialized(); return
>> (_cpuid_info.std_vendor_name_0 == 0x746e6543) || (_cpuid_info.std_vendor_name_0 == 0x68532020); }
>> // 'tneC'||'hS  '
>>     static bool is_atom_family()    { return ((cpu_family() == 0x06) && ((extended_cpu_model() ==
>> 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() == 0x4D))); } //Silvermont and
>> Centerton
>>     static bool is_knights_family() { return ((cpu_family() == 0x06) && ((extended_cpu_model() ==
>> 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi 3200/5200/7200 and Future Xeon Phi
>>
>> @@ -680,6 +694,15 @@
>>         }
>>       } else if (is_amd()) {
>>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
>> +    } else if (is_zx()) {
>> +      bool supports_topology = supports_processor_topology();
>> +      if (supports_topology) {
>> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
>> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +      }
>> +      if (!supports_topology || result == 0) {
>> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1);
>> +      }
>>       }
>>       return result;
>>     }
>> @@ -688,6 +711,8 @@
>>       uint result = 1;
>>       if (is_intel() && supports_processor_topology()) {
>>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +    } else if (is_zx() && supports_processor_topology()) {
>> +      result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>>       } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>         if (cpu_family() >= 0x17) {
>>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>> @@ -705,6 +730,8 @@
>>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>>       } else if (is_amd()) {
>>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
>> +    } else if (is_zx()) {
>> +      result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>>       }
>>       if (result < 32) // not defined ?
>>         result = 32;   // 32 bytes by default on x86 and other x64
>>
>>
>> Best Regards!
>> VicWang | R&D
>> Telephone:+86-01082695388-892477
>> -----邮件原件-----
>> 发件人: David Holmes [mailto:[hidden email]]
>> 发送时间: 2018年1月3日 18:03
>> 收件人: Vic Wang(BJ-RD) <[hidden email]>; [hidden email]
>> 抄送: Cobe Chen(BJ-RD) <[hidden email]>
>> 主题: Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai
>>
>> On 3/01/2018 6:32 PM, Vic Wang(BJ-RD) wrote:
>>> Hi David:
>>> I generate a patch using "hg diff". Please check it again. Thanks very much.
>>
>> Thanks.
>>
>>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
>>> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017
>>> +0100
>>
>> Something stripped out the spaces between the file name and timestamp.
>>
>> I've hosted a webrev for the patch at:
>>
>> http://cr.openjdk.java.net/~dholmes/8194279/webrev
>>
>>
>> A few minor comments. I can't comment on the technical accuracy of course :)
>>
>> All copyright notices need the second year updated to 2018.
>>
>> - src/hotspot/cpu/x86/assembler_x86.cpp
>>
>> Seems fine - you copied the existing pattern so I won't call out any style bugs. :)
>>
>> ---
>>
>> - src/hotspot/cpu/x86/vm_version_x86.cpp
>>
>> 633 UseAVX = 0;
>>
>> Indention is missing.
>>
>> General style nit: your if statements are very inconsistently formatted.
>> A statement like:
>>
>> if( is_zx() )
>>
>> should be written:
>>
>> if (is_zx())
>>
>> - space after the "if" and no space after ( or before )
>>
>>
>> In this block:
>>
>> +     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
>> supports_sse3()) {
>> + #ifdef COMPILER2
>> +     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>> +       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>> +     }
>> + #endif
>>
>> I see this copies the existing pattern but isn't it simpler just to write:
>>
>>       if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
>> supports_sse4_2()) {
>> #ifdef COMPILER2
>>       if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>       }
>> #endif
>>
>> ?
>>
>> ---
>>
>> -  src/hotspot/cpu/x86/vm_version_x86.hpp
>>
>> 310 CPU_FAMILY_ZX_CORE_F7    = 7,
>>
>> Indentation is missing.
>>
>> 555 // ZX features.
>>
>> Indentation is missing.
>>
>>    556     if(is_zx()) {
>>    557       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>>    558         result |= CPU_LZCNT;
>>    559       // for ZX, ecx.bits.misalignsse bit (bit 8) indicates
>> support for prefetchw
>>    560       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>>    561         result |= CPU_3DNOW_PREFETCH;
>>    562       }
>>    563     }
>>
>> The "intel" block you copied is full of style issues :( But as it is exactly the same can't you
>> just adjust:
>>
>> 546     if(is_intel()) {
>>
>> to
>>
>> 546     if (is_intel() || is_zx()) {
>>
>> ? Similarly for cores_per_cpu(), threads_per_core() and L1_line_size().
>>
>> Otherwise there are indentation issues in your changes to all those functions.
>>
>> Thanks,
>> David
>> -----
>>
>>
>> 保密声明:
>> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制
>> 或转发。
>> CONFIDENTIAL NOTE:
>> This email contains confidential or legally privileged information and is for the sole use of its
>> intended recipient. Any unauthorized review, use, copying or forwarding of this email or the
>> content of this email is strictly prohibited.
>>
Reply | Threaded
Open this post in threaded view
|

答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Vic Wang(BJ-RD)
Hi Vladimir and David:
Thanks for the review.

>if you'd like to check it. Also please advise how you wish the "contributed-by:" line to appear. For now I'm assuming:
>contributed-by: Vic Wang <[hidden email]>

You're right.

Thanks.
Vic
-----邮件原件-----
发件人: Vladimir Kozlov [mailto:[hidden email]]
发送时间: 2018年1月5日 3:48
收件人: David Holmes <[hidden email]>; Vic Wang(BJ-RD) <[hidden email]>; [hidden email]
抄送: Cobe Chen(BJ-RD) <[hidden email]>
主题: Re: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls and Shanghai

Looks good to me.

Thanks,
Vladimir

On 1/4/18 4:20 AM, David Holmes wrote:

> Hi,
>
> On 4/01/2018 6:35 PM, Vic Wang(BJ-RD) wrote:
>> Hi David:
>> Thanks for the detailed review.
>> I've regenerated the patch by "hg diff" for modifying the coding style issues.
>> Additionally, I don't merge some of the branches into intel's because
>> it may add some codes when the new cpu features are added.
>
> Okay this all seems fine. I've updated the webrev in place and fixed
> the copyright years and a couple of space/indent issues:
>
> http://cr.openjdk.java.net/~dholmes/8194279/webrev
>
> if you'd like to check it. Also please advise how you wish the
> "contributed-by:" line to appear. For now I'm assuming:
>
> contributed-by: Vic Wang <[hidden email]>
>
> We still need a second reviewer.
>
> Thanks,
> David
>
>>
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
>> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017
>> +0100
>> +++ b/src/hotspot/cpu/x86/assembler_x86.cppThu Jan 04 12:20:31 2018
>> +++ +0800
>> @@ -3167,6 +3167,89 @@
>>       return;
>>     }
>>
>> +  if (UseAddressNop && VM_Version::is_zx()) {
>> +    //
>> +    // Using multi-bytes nops "0x0F 0x1F [address]" for ZX
>> +    //  1: 0x90
>> +    //  2: 0x66 0x90
>> +    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need
>> +patching safe padding)
>> +    //  4: 0x0F 0x1F 0x40 0x00
>> +    //  5: 0x0F 0x1F 0x44 0x00 0x00
>> +    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
>> +    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
>> +    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +
>> +    // The rest coding is ZX specific - don't use consecutive
>> +address nops
>> +
>> +    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66
>> +0x90
>> +    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66
>> +0x66 0x90
>> +    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66
>> +0x66 0x66 0x90
>> +    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
>> +0x66 0x66 0x66 0x90
>> +
>> +    while(i >= 15) {
>> +      // For ZX don't generate consecutive addess nops (mix with
>> +regular nops)
>> +      i -= 15;
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      addr_nop_8();
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8(0x66);   // size prefix
>> +      emit_int8((unsigned char)0x90);
>> +                         // nop
>> +    }
>> +    switch (i) {
>> +      case 14:
>> +        emit_int8(0x66); // size prefix
>> +      case 13:
>> +        emit_int8(0x66); // size prefix
>> +      case 12:
>> +        addr_nop_8();
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8(0x66); // size prefix
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      case 11:
>> +        emit_int8(0x66); // size prefix
>> +      case 10:
>> +        emit_int8(0x66); // size prefix
>> +      case 9:
>> +        emit_int8(0x66); // size prefix
>> +      case 8:
>> +        addr_nop_8();
>> +        break;
>> +      case 7:
>> +        addr_nop_7();
>> +        break;
>> +      case 6:
>> +        emit_int8(0x66); // size prefix
>> +      case 5:
>> +        addr_nop_5();
>> +        break;
>> +      case 4:
>> +        addr_nop_4();
>> +        break;
>> +      case 3:
>> +        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
>> +        emit_int8(0x66); // size prefix
>> +      case 2:
>> +        emit_int8(0x66); // size prefix
>> +      case 1:
>> +        emit_int8((unsigned char)0x90);
>> +                         // nop
>> +        break;
>> +      default:
>> +        assert(i == 0, " ");
>> +    }
>> +    return;
>> +  }
>> +
>>     // Using nops with size prefixes "0x66 0x90".
>>     // From AMD Optimization Guide:
>>     //  1: 0x90
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.cpp
>> --- a/src/hotspot/cpu/x86/vm_version_x86.cppThu Dec 07 15:52:46 2017
>> +0100
>> +++ b/src/hotspot/cpu/x86/vm_version_x86.cppThu Jan 04 12:20:31 2018
>> +++ +0800
>> @@ -628,6 +628,11 @@
>>     if (UseSSE < 1)
>>       _features &= ~CPU_SSE;
>>
>> +  //since AVX instructions is slower than SSE in some ZX cpus, force USEAVX=0.
>> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7))){
>> +UseAVX = 0;
>> +  }
>> +
>>     // first try initial setting and detect what we can support
>>     int use_avx_limit = 0;
>>     if (UseAVX > 0) {
>> @@ -1078,6 +1083,66 @@
>>     // UseXmmRegToRegMoveAll == true  --> movaps(xmm, xmm), movapd(xmm, xmm).
>>     // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm),  movsd(xmm, xmm).
>>
>> +
>> +  if (is_zx()) { // ZX cpus specific settings
>> +    if (FLAG_IS_DEFAULT(UseStoreImmI16)) {
>> +      UseStoreImmI16 = false; // don't use it on ZX cpus
>> +    }
>> +    if ((cpu_family() == 6) || (cpu_family() == 7)) {
>> +      if (FLAG_IS_DEFAULT(UseAddressNop)) {
>> +        // Use it on all ZX cpus
>> +        UseAddressNop = true;
>> +      }
>> +    }
>> +    if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) {
>> +      UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus
>> +    }
>> +    if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) {
>> +      if (supports_sse3()) {
>> +        UseXmmRegToRegMoveAll = true; // use movaps, movapd on new
>> +ZX cpus
>> +      } else {
>> +        UseXmmRegToRegMoveAll = false;
>> +      }
>> +    }
>> +    if (((cpu_family() == 6) || (cpu_family() == 7)) &&
>> +supports_sse3()) { // new ZX cpus #ifdef COMPILER2
>> +      if (FLAG_IS_DEFAULT(MaxLoopPad)) {
>> +        // For new ZX cpus do the next optimization:
>> +        // don't align the beginning of a loop if there are enough
>> +instructions
>> +        // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp)
>> +        // in current fetch line (OptoLoopAlignment) or the padding
>> +        // is big (> MaxLoopPad).
>> +        // Set MaxLoopPad to 11 for new ZX cpus to reduce number of
>> +        // generated NOP instructions. 11 is the largest size of one
>> +        // address NOP instruction '0F 1F' (see Assembler::nop(i)).
>> +        MaxLoopPad = 11;
>> +      }
>> +#endif // COMPILER2
>> +      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>> +        UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus
>> +      }
>> +      if (supports_sse4_2()) { // new ZX cpus
>> +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>> +          UseUnalignedLoadStores = true; // use movdqu on newest ZX
>> +cpus
>> +        }
>> +      }
>> +      if (supports_sse4_2()) {
>> +        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
>> +          FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>> +        }
>> +      } else {
>> +        if (UseSSE42Intrinsics &&
>> +!FLAG_IS_DEFAULT(UseAESIntrinsics)) {
>> +          warning("SSE4.2 intrinsics require SSE4.2 instructions or
>> +higher. Intrinsics will be
>> disabled.");
>> +        }
>> +        FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
>> +      }
>> +    }
>> +
>> +    if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) &&
>> +supports_3dnow_prefetch()) {
>> +      FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
>> +    }
>> +  }
>> +
>>     if( is_amd() ) { // AMD cpus specific settings
>>       if( supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop) ) {
>>         // Use it on new AMD cpus starting from Opteron.
>> @@ -1374,6 +1439,14 @@
>>   #endif
>>     }
>>
>> +  if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7)) &&
>> +supports_sse4_2()) { #ifdef COMPILER2
>> +    if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>> +      FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>> +    }
>> +#endif
>> +  }
>> +
>>   #ifdef _LP64
>>     // Prefetch settings
>>
>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/vm_version_x86.hpp
>> --- a/src/hotspot/cpu/x86/vm_version_x86.hppThu Dec 07 15:52:46 2017
>> +0100
>> +++ b/src/hotspot/cpu/x86/vm_version_x86.hppThu Jan 04 12:20:31 2018
>> +++ +0800
>> @@ -305,6 +305,9 @@
>>     enum Extended_Family {
>>       // AMD
>>       CPU_FAMILY_AMD_11H       = 0x11,
>> +    // ZX
>> +    CPU_FAMILY_ZX_CORE_F6    = 6,
>> +    CPU_FAMILY_ZX_CORE_F7    = 7,
>>       // Intel
>>       CPU_FAMILY_INTEL_CORE    = 6,
>>       CPU_MODEL_NEHALEM        = 0x1e, @@ -549,6 +552,16 @@
>>         }
>>       }
>>
>> +    // ZX features.
>> +    if (is_zx()) {
>> +      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>> +        result |= CPU_LZCNT;
>> +      // for ZX, ecx.bits.misalignsse bit (bit 8) indicates support
>> +for prefetchw
>> +      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>> +        result |= CPU_3DNOW_PREFETCH;
>> +      }
>> +    }
>> +
>>       return result;
>>     }
>>
>> @@ -657,6 +670,7 @@
>>     static bool is_P6()             { return cpu_family() >= 6; }
>>     static bool is_amd()            { assert_is_initialized(); return
>> _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
>>     static bool is_intel()          { assert_is_initialized(); return
>> _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
>> +  static bool is_zx()             { assert_is_initialized(); return
>> (_cpuid_info.std_vendor_name_0 == 0x746e6543) ||
>> (_cpuid_info.std_vendor_name_0 == 0x68532020); } // 'tneC'||'hS  '
>>     static bool is_atom_family()    { return ((cpu_family() == 0x06)
>> && ((extended_cpu_model() ==
>> 0x36) || (extended_cpu_model() == 0x37) || (extended_cpu_model() ==
>> 0x4D))); } //Silvermont and Centerton
>>     static bool is_knights_family() { return ((cpu_family() == 0x06)
>> && ((extended_cpu_model() ==
>> 0x57) || (extended_cpu_model() == 0x85))); } // Xeon Phi
>> 3200/5200/7200 and Future Xeon Phi
>>
>> @@ -680,6 +694,15 @@
>>         }
>>       } else if (is_amd()) {
>>         result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1);
>> +    } else if (is_zx()) {
>> +      bool supports_topology = supports_processor_topology();
>> +      if (supports_topology) {
>> +        result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus /
>> +                 _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +      }
>> +      if (!supports_topology || result == 0) {
>> +        result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu +
>> +1);
>> +      }
>>       }
>>       return result;
>>     }
>> @@ -688,6 +711,8 @@
>>       uint result = 1;
>>       if (is_intel() && supports_processor_topology()) {
>>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>> +    } else if (is_zx() && supports_processor_topology()) {
>> +      result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>>       } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>         if (cpu_family() >= 0x17) {
>>           result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core
>> + 1; @@ -705,6 +730,8 @@
>>         result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>>       } else if (is_amd()) {
>>         result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size;
>> +    } else if (is_zx()) {
>> +      result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
>>       }
>>       if (result < 32) // not defined ?
>>         result = 32;   // 32 bytes by default on x86 and other x64
>>
>>
>> Best Regards!
>> VicWang | R&D
>> Telephone:+86-01082695388-892477
>> -----邮件原件-----
>> 发件人: David Holmes [mailto:[hidden email]]
>> 发送时间: 2018年1月3日 18:03
>> 收件人: Vic Wang(BJ-RD) <[hidden email]>;
>> [hidden email]
>> 抄送: Cobe Chen(BJ-RD) <[hidden email]>
>> 主题: Re: 答复: [patch] support zhaoxin x86 cpu vendor ids CentaurHauls
>> and Shanghai
>>
>> On 3/01/2018 6:32 PM, Vic Wang(BJ-RD) wrote:
>>> Hi David:
>>> I generate a patch using "hg diff". Please check it again. Thanks very much.
>>
>> Thanks.
>>
>>> diff -r 4d28288c9f9e src/hotspot/cpu/x86/assembler_x86.cpp
>>> --- a/src/hotspot/cpu/x86/assembler_x86.cppThu Dec 07 15:52:46 2017
>>> +0100
>>
>> Something stripped out the spaces between the file name and timestamp.
>>
>> I've hosted a webrev for the patch at:
>>
>> http://cr.openjdk.java.net/~dholmes/8194279/webrev
>>
>>
>> A few minor comments. I can't comment on the technical accuracy of
>> course :)
>>
>> All copyright notices need the second year updated to 2018.
>>
>> - src/hotspot/cpu/x86/assembler_x86.cpp
>>
>> Seems fine - you copied the existing pattern so I won't call out any
>> style bugs. :)
>>
>> ---
>>
>> - src/hotspot/cpu/x86/vm_version_x86.cpp
>>
>> 633 UseAVX = 0;
>>
>> Indention is missing.
>>
>> General style nit: your if statements are very inconsistently formatted.
>> A statement like:
>>
>> if( is_zx() )
>>
>> should be written:
>>
>> if (is_zx())
>>
>> - space after the "if" and no space after ( or before )
>>
>>
>> In this block:
>>
>> +     if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
>> supports_sse3()) {
>> + #ifdef COMPILER2
>> +     if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) {
>> +       FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>> +     }
>> + #endif
>>
>> I see this copies the existing pattern but isn't it simpler just to write:
>>
>>       if (is_zx() && ((cpu_family() == 6)||(cpu_family() == 7)) &&
>> supports_sse4_2()) {
>> #ifdef COMPILER2
>>       if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>       }
>> #endif
>>
>> ?
>>
>> ---
>>
>> -  src/hotspot/cpu/x86/vm_version_x86.hpp
>>
>> 310 CPU_FAMILY_ZX_CORE_F7    = 7,
>>
>> Indentation is missing.
>>
>> 555 // ZX features.
>>
>> Indentation is missing.
>>
>>    556     if(is_zx()) {
>>    557       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>>    558         result |= CPU_LZCNT;
>>    559       // for ZX, ecx.bits.misalignsse bit (bit 8) indicates
>> support for prefetchw
>>    560       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>>    561         result |= CPU_3DNOW_PREFETCH;
>>    562       }
>>    563     }
>>
>> The "intel" block you copied is full of style issues :( But as it is
>> exactly the same can't you just adjust:
>>
>> 546     if(is_intel()) {
>>
>> to
>>
>> 546     if (is_intel() || is_zx()) {
>>
>> ? Similarly for cores_per_cpu(), threads_per_core() and L1_line_size().
>>
>> Otherwise there are indentation issues in your changes to all those functions.
>>
>> Thanks,
>> David
>> -----
>>
>>
>> 保密声明:
>> 本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制
>> 或转发。
>> CONFIDENTIAL NOTE:
>> This email contains confidential or legally privileged information
>> and is for the sole use of its intended recipient. Any unauthorized
>> review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
>>


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.