RFR: JDK-8261302: NMT: Improve malloc site table hashing

classic Classic list List threaded Threaded
8 messages Options
Reply | Threaded
Open this post in threaded view
|

RFR: JDK-8261302: NMT: Improve malloc site table hashing

Thomas Stuefe
While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.

The current hash algorithm uses the 32bit masked sum of all stack entries as hash.

I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.

The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:

https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118

wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.

---

With that out of the way, there are still small things we can improve about the hash function:

Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.

When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.

See difference (linux x86):

Before:

Dump of assembler code for function NativeCallStack::hash() const:
=> 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
   0x00007ffff68092f3 <+3>:     push   %rbp
   0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
   0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
   0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
   0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
   0x00007ffff68092fe <+14>:    test   %rdx,%rdx
   0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
   0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
   0x00007ffff6809307 <+23>:    test   %rax,%rax
   0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
   0x00007ffff680930c <+28>:    add    %rax,%rdx
   0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
   0x00007ffff6809313 <+35>:    test   %rax,%rax
   0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
   0x00007ffff6809318 <+40>:    add    %rax,%rdx
   0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
   0x00007ffff680931f <+47>:    mov    %edx,%eax
   0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
   0x00007ffff6809324 <+52>:    pop    %rbp                   <X
   0x00007ffff6809325 <+53>:    retq  
   0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
   0x00007ffff6809330 <+64>:    xor    %edx,%edx
   0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>

hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.

With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
The hash calculation gets now inlined into the constructor and uses 128bit xmm registers and packed integer adds:

Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
=> 0x00007ffff67cbba0 <+0>:     push   %rbp
   0x00007ffff67cbba1 <+1>:     mov    %rsp,%rbp
   0x00007ffff67cbba4 <+4>:     push   %rbx
   0x00007ffff67cbba5 <+5>:     mov    %rdi,%rbx
   0x00007ffff67cbba8 <+8>:     sub    $0x8,%rsp
   0x00007ffff67cbbac <+12>:    test   %dl,%dl
   0x00007ffff67cbbae <+14>:    movl   $0x0,0x20(%rdi)
   0x00007ffff67cbbb5 <+21>:    jne    0x7ffff67cbc10 <NativeCallStack::NativeCallStack(int, bool)+112>
   0x00007ffff67cbbb7 <+23>:    movq   $0x0,(%rdi)
   0x00007ffff67cbbbe <+30>:    movq   $0x0,0x8(%rdi)
   0x00007ffff67cbbc6 <+38>:    movq   $0x0,0x10(%rdi)
   0x00007ffff67cbbce <+46>:    movq   $0x0,0x18(%rdi)
   0x00007ffff67cbbd6 <+54>:    movdqu (%rbx),%xmm0          <<<<
   0x00007ffff67cbbda <+58>:    movdqu 0x10(%rbx),%xmm2      <<<<
   0x00007ffff67cbbdf <+63>:    shufps $0x88,%xmm2,%xmm0     <<<<
   0x00007ffff67cbbe3 <+67>:    movdqa %xmm0,%xmm1           <<<<
   0x00007ffff67cbbe7 <+71>:    psrldq $0x8,%xmm1            <<<<
   0x00007ffff67cbbec <+76>:    paddd  %xmm1,%xmm0           <<<<
   0x00007ffff67cbbf0 <+80>:    movdqa %xmm0,%xmm1           <<<<
   0x00007ffff67cbbf4 <+84>:    psrldq $0x4,%xmm1            <<<<
   0x00007ffff67cbbf9 <+89>:    paddd  %xmm1,%xmm0           <<<<
   0x00007ffff67cbbfd <+93>:    movd   %xmm0,0x20(%rbx)      <<<<
   0x00007ffff67cbc02 <+98>:    add    $0x8,%rsp
   0x00007ffff67cbc06 <+102>:   pop    %rbx
   0x00007ffff67cbc07 <+103>:   pop    %rbp
   0x00007ffff67cbc08 <+104>:   retq  
   0x00007ffff67cbc09 <+105>:   nopl   0x0(%rax)
   0x00007ffff67cbc10 <+112>:   mov    %esi,%edx
   0x00007ffff67cbc12 <+114>:   mov    $0x4,%esi
   0x00007ffff67cbc17 <+119>:   callq  0x7ffff6824960 <os::get_native_stack(unsigned char**, int, int)>
   0x00007ffff67cbc1c <+124>:   jmp    0x7ffff67cbbd6 <NativeCallStack::NativeCallStack(int, bool)+54>

Thanks, Thomas

-------------

Commit messages:
 - Remove stray include
 - Start

Changes: https://git.openjdk.java.net/jdk/pull/2473/files
 Webrev: https://webrevs.openjdk.java.net/?repo=jdk&pr=2473&range=00
  Issue: https://bugs.openjdk.java.net/browse/JDK-8261302
  Stats: 29 lines in 2 files changed: 9 ins; 16 del; 4 mod
  Patch: https://git.openjdk.java.net/jdk/pull/2473.diff
  Fetch: git fetch https://git.openjdk.java.net/jdk pull/2473/head:pull/2473

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v2]

Thomas Stuefe
> While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.
>
> The current hash algorithm uses the 32bit masked sum of all stack entries as hash.
>
> I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.
>
> The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:
>
> https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118
>
> wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.
>
> ---
>
> With that out of the way, there are still small things we can improve about the hash function:
>
> Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.
>
> When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.
>
> See difference (linux x86):
>
> Before:
>
> Dump of assembler code for function NativeCallStack::hash() const:
> => 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
>    0x00007ffff68092f3 <+3>:     push   %rbp
>    0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
>    0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
>    0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
>    0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
>    0x00007ffff68092fe <+14>:    test   %rdx,%rdx
>    0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
>    0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
>    0x00007ffff6809307 <+23>:    test   %rax,%rax
>    0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff680930c <+28>:    add    %rax,%rdx
>    0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
>    0x00007ffff6809313 <+35>:    test   %rax,%rax
>    0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff6809318 <+40>:    add    %rax,%rdx
>    0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
>    0x00007ffff680931f <+47>:    mov    %edx,%eax
>    0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
>    0x00007ffff6809324 <+52>:    pop    %rbp                   <X
>    0x00007ffff6809325 <+53>:    retq  
>    0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
>    0x00007ffff6809330 <+64>:    xor    %edx,%edx
>    0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>
>
> hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.
>
> With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
> The hash calculation gets now inlined into the constructor and uses a series of simple adds now:
>
> Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
> => 0x00007ffff67cc9a0 <+0>:     push   %rbp
>    0x00007ffff67cc9a1 <+1>:     mov    %rsp,%rbp
>    0x00007ffff67cc9a4 <+4>:     push   %rbx
>    0x00007ffff67cc9a5 <+5>:     mov    %rdi,%rbx
>    0x00007ffff67cc9a8 <+8>:     sub    $0x8,%rsp
>    0x00007ffff67cc9ac <+12>:    test   %dl,%dl
>    0x00007ffff67cc9ae <+14>:    movl   $0x0,0x20(%rdi)
>    0x00007ffff67cc9b5 <+21>:    jne    0x7ffff67cc9f0 <NativeCallStack::NativeCallStack(int, bool)+80>
>    0x00007ffff67cc9b7 <+23>:    movq   $0x0,(%rdi)
>    0x00007ffff67cc9be <+30>:    movq   $0x0,0x8(%rdi)
>    0x00007ffff67cc9c6 <+38>:    movq   $0x0,0x10(%rdi)
>    0x00007ffff67cc9ce <+46>:    movq   $0x0,0x18(%rdi)
>    0x00007ffff67cc9d6 <+54>:    mov    0x10(%rbx),%rax   <<<
>    0x00007ffff67cc9da <+58>:    add    0x8(%rbx),%rax    <<<    
>    0x00007ffff67cc9de <+62>:    add    0x18(%rbx),%rax   <<<
>    0x00007ffff67cc9e2 <+66>:    add    (%rbx),%rax       <<<
>    0x00007ffff67cc9e5 <+69>:    mov    %eax,0x20(%rbx)
>    0x00007ffff67cc9e8 <+72>:    add    $0x8,%rsp
>    0x00007ffff67cc9ec <+76>:    pop    %rbx
>    0x00007ffff67cc9ed <+77>:    pop    %rbp
>    0x00007ffff67cc9ee <+78>:    retq  
>    0x00007ffff67cc9ef <+79>:    nop
>    0x00007ffff67cc9f0 <+80>:    mov    %esi,%edx
>    0x00007ffff67cc9f2 <+82>:    mov    $0x4,%esi
>    0x00007ffff67cc9f7 <+87>:    callq  0x7ffff68250d0 <os::get_native_stack(unsigned char**, int, int)>
>    0x00007ffff67cc9fc <+92>:    jmp    0x7ffff67cc9d6 <NativeCallStack::NativeCallStack(int, bool)+54>
> End of assembler dump.
>
> Thanks, Thomas

Thomas Stuefe has refreshed the contents of this pull request, and previous commits have been removed. The incremental views will show differences compared to the previous content of the PR. The pull request contains one new commit since the last revision:

  Start

-------------

Changes:
  - all: https://git.openjdk.java.net/jdk/pull/2473/files
  - new: https://git.openjdk.java.net/jdk/pull/2473/files/b8e11afb..309d6c0a

Webrevs:
 - full: https://webrevs.openjdk.java.net/?repo=jdk&pr=2473&range=01
 - incr: https://webrevs.openjdk.java.net/?repo=jdk&pr=2473&range=00-01

  Stats: 2 lines in 1 file changed: 0 ins; 0 del; 2 mod
  Patch: https://git.openjdk.java.net/jdk/pull/2473.diff
  Fetch: git fetch https://git.openjdk.java.net/jdk pull/2473/head:pull/2473

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v2]

Zhengyu Gu-3
On Tue, 9 Feb 2021 08:05:59 GMT, Thomas Stuefe <[hidden email]> wrote:

>> While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.
>>
>> The current hash algorithm uses the 32bit masked sum of all stack entries as hash.
>>
>> I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.
>>
>> The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:
>>
>> https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118
>>
>> wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.
>>
>> ---
>>
>> With that out of the way, there are still small things we can improve about the hash function:
>>
>> Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.
>>
>> When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.
>>
>> See difference (linux x86):
>>
>> Before:
>>
>> Dump of assembler code for function NativeCallStack::hash() const:
>> => 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
>>    0x00007ffff68092f3 <+3>:     push   %rbp
>>    0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
>>    0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
>>    0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
>>    0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
>>    0x00007ffff68092fe <+14>:    test   %rdx,%rdx
>>    0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
>>    0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
>>    0x00007ffff6809307 <+23>:    test   %rax,%rax
>>    0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>>    0x00007ffff680930c <+28>:    add    %rax,%rdx
>>    0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
>>    0x00007ffff6809313 <+35>:    test   %rax,%rax
>>    0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>>    0x00007ffff6809318 <+40>:    add    %rax,%rdx
>>    0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
>>    0x00007ffff680931f <+47>:    mov    %edx,%eax
>>    0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
>>    0x00007ffff6809324 <+52>:    pop    %rbp                   <X
>>    0x00007ffff6809325 <+53>:    retq  
>>    0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
>>    0x00007ffff6809330 <+64>:    xor    %edx,%edx
>>    0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>
>>
>> hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.
>>
>> With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
>> The hash calculation gets now inlined into the constructor and uses a series of simple adds now:
>>
>> Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
>> => 0x00007ffff67cc9a0 <+0>:     push   %rbp
>>    0x00007ffff67cc9a1 <+1>:     mov    %rsp,%rbp
>>    0x00007ffff67cc9a4 <+4>:     push   %rbx
>>    0x00007ffff67cc9a5 <+5>:     mov    %rdi,%rbx
>>    0x00007ffff67cc9a8 <+8>:     sub    $0x8,%rsp
>>    0x00007ffff67cc9ac <+12>:    test   %dl,%dl
>>    0x00007ffff67cc9ae <+14>:    movl   $0x0,0x20(%rdi)
>>    0x00007ffff67cc9b5 <+21>:    jne    0x7ffff67cc9f0 <NativeCallStack::NativeCallStack(int, bool)+80>
>>    0x00007ffff67cc9b7 <+23>:    movq   $0x0,(%rdi)
>>    0x00007ffff67cc9be <+30>:    movq   $0x0,0x8(%rdi)
>>    0x00007ffff67cc9c6 <+38>:    movq   $0x0,0x10(%rdi)
>>    0x00007ffff67cc9ce <+46>:    movq   $0x0,0x18(%rdi)
>>    0x00007ffff67cc9d6 <+54>:    mov    0x10(%rbx),%rax   <<<
>>    0x00007ffff67cc9da <+58>:    add    0x8(%rbx),%rax    <<<    
>>    0x00007ffff67cc9de <+62>:    add    0x18(%rbx),%rax   <<<
>>    0x00007ffff67cc9e2 <+66>:    add    (%rbx),%rax       <<<
>>    0x00007ffff67cc9e5 <+69>:    mov    %eax,0x20(%rbx)
>>    0x00007ffff67cc9e8 <+72>:    add    $0x8,%rsp
>>    0x00007ffff67cc9ec <+76>:    pop    %rbx
>>    0x00007ffff67cc9ed <+77>:    pop    %rbp
>>    0x00007ffff67cc9ee <+78>:    retq  
>>    0x00007ffff67cc9ef <+79>:    nop
>>    0x00007ffff67cc9f0 <+80>:    mov    %esi,%edx
>>    0x00007ffff67cc9f2 <+82>:    mov    $0x4,%esi
>>    0x00007ffff67cc9f7 <+87>:    callq  0x7ffff68250d0 <os::get_native_stack(unsigned char**, int, int)>
>>    0x00007ffff67cc9fc <+92>:    jmp    0x7ffff67cc9d6 <NativeCallStack::NativeCallStack(int, bool)+54>
>> End of assembler dump.
>>
>> Thanks, Thomas
>
> Thomas Stuefe has refreshed the contents of this pull request, and previous commits have been removed. The incremental views will show differences compared to the previous content of the PR.

Looks good to me.

-------------

Marked as reviewed by zgu (Reviewer).

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v2]

Thomas Stuefe
On Tue, 9 Feb 2021 13:21:11 GMT, Zhengyu Gu <[hidden email]> wrote:

> Looks good to me.

Thanks Zhengyu.

-------------

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v2]

Lutz Schmidt
In reply to this post by Thomas Stuefe
On Tue, 9 Feb 2021 08:05:59 GMT, Thomas Stuefe <[hidden email]> wrote:

>> While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.
>>
>> The current hash algorithm uses the 32bit masked sum of all stack entries as hash.
>>
>> I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.
>>
>> The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:
>>
>> https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118
>>
>> wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.
>>
>> ---
>>
>> With that out of the way, there are still small things we can improve about the hash function:
>>
>> Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.
>>
>> When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.
>>
>> See difference (linux x86):
>>
>> Before:
>>
>> Dump of assembler code for function NativeCallStack::hash() const:
>> => 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
>>    0x00007ffff68092f3 <+3>:     push   %rbp
>>    0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
>>    0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
>>    0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
>>    0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
>>    0x00007ffff68092fe <+14>:    test   %rdx,%rdx
>>    0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
>>    0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
>>    0x00007ffff6809307 <+23>:    test   %rax,%rax
>>    0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>>    0x00007ffff680930c <+28>:    add    %rax,%rdx
>>    0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
>>    0x00007ffff6809313 <+35>:    test   %rax,%rax
>>    0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>>    0x00007ffff6809318 <+40>:    add    %rax,%rdx
>>    0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
>>    0x00007ffff680931f <+47>:    mov    %edx,%eax
>>    0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
>>    0x00007ffff6809324 <+52>:    pop    %rbp                   <X
>>    0x00007ffff6809325 <+53>:    retq  
>>    0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
>>    0x00007ffff6809330 <+64>:    xor    %edx,%edx
>>    0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>
>>
>> hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.
>>
>> With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
>> The hash calculation gets now inlined into the constructor and uses a series of simple adds now:
>>
>> Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
>> => 0x00007ffff67cc9a0 <+0>:     push   %rbp
>>    0x00007ffff67cc9a1 <+1>:     mov    %rsp,%rbp
>>    0x00007ffff67cc9a4 <+4>:     push   %rbx
>>    0x00007ffff67cc9a5 <+5>:     mov    %rdi,%rbx
>>    0x00007ffff67cc9a8 <+8>:     sub    $0x8,%rsp
>>    0x00007ffff67cc9ac <+12>:    test   %dl,%dl
>>    0x00007ffff67cc9ae <+14>:    movl   $0x0,0x20(%rdi)
>>    0x00007ffff67cc9b5 <+21>:    jne    0x7ffff67cc9f0 <NativeCallStack::NativeCallStack(int, bool)+80>
>>    0x00007ffff67cc9b7 <+23>:    movq   $0x0,(%rdi)
>>    0x00007ffff67cc9be <+30>:    movq   $0x0,0x8(%rdi)
>>    0x00007ffff67cc9c6 <+38>:    movq   $0x0,0x10(%rdi)
>>    0x00007ffff67cc9ce <+46>:    movq   $0x0,0x18(%rdi)
>>    0x00007ffff67cc9d6 <+54>:    mov    0x10(%rbx),%rax   <<<
>>    0x00007ffff67cc9da <+58>:    add    0x8(%rbx),%rax    <<<    
>>    0x00007ffff67cc9de <+62>:    add    0x18(%rbx),%rax   <<<
>>    0x00007ffff67cc9e2 <+66>:    add    (%rbx),%rax       <<<
>>    0x00007ffff67cc9e5 <+69>:    mov    %eax,0x20(%rbx)
>>    0x00007ffff67cc9e8 <+72>:    add    $0x8,%rsp
>>    0x00007ffff67cc9ec <+76>:    pop    %rbx
>>    0x00007ffff67cc9ed <+77>:    pop    %rbp
>>    0x00007ffff67cc9ee <+78>:    retq  
>>    0x00007ffff67cc9ef <+79>:    nop
>>    0x00007ffff67cc9f0 <+80>:    mov    %esi,%edx
>>    0x00007ffff67cc9f2 <+82>:    mov    $0x4,%esi
>>    0x00007ffff67cc9f7 <+87>:    callq  0x7ffff68250d0 <os::get_native_stack(unsigned char**, int, int)>
>>    0x00007ffff67cc9fc <+92>:    jmp    0x7ffff67cc9d6 <NativeCallStack::NativeCallStack(int, bool)+54>
>> End of assembler dump.
>>
>> Thanks, Thomas
>
> Thomas Stuefe has refreshed the contents of this pull request, and previous commits have been removed. The incremental views will show differences compared to the previous content of the PR.

LGTM, except for the one comment I added (it's not a "must", it's a "please").

src/hotspot/share/utilities/nativeCallStack.cpp line 31:

> 29: #include "utilities/nativeCallStack.hpp"
> 30:
> 31: static unsigned calculate_hash(address stack[NMT_TrackingStackDepth]) {

I know it doesn't change anything semantically, but I'd like to see the int type specifier.

-------------

Marked as reviewed by lucy (Reviewer).

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v3]

Thomas Stuefe
In reply to this post by Thomas Stuefe
> While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.
>
> The current hash algorithm uses the 32bit masked sum of all stack entries as hash.
>
> I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.
>
> The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:
>
> https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118
>
> wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.
>
> ---
>
> With that out of the way, there are still small things we can improve about the hash function:
>
> Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.
>
> When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.
>
> See difference (linux x86):
>
> Before:
>
> Dump of assembler code for function NativeCallStack::hash() const:
> => 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
>    0x00007ffff68092f3 <+3>:     push   %rbp
>    0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
>    0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
>    0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
>    0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
>    0x00007ffff68092fe <+14>:    test   %rdx,%rdx
>    0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
>    0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
>    0x00007ffff6809307 <+23>:    test   %rax,%rax
>    0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff680930c <+28>:    add    %rax,%rdx
>    0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
>    0x00007ffff6809313 <+35>:    test   %rax,%rax
>    0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff6809318 <+40>:    add    %rax,%rdx
>    0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
>    0x00007ffff680931f <+47>:    mov    %edx,%eax
>    0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
>    0x00007ffff6809324 <+52>:    pop    %rbp                   <X
>    0x00007ffff6809325 <+53>:    retq  
>    0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
>    0x00007ffff6809330 <+64>:    xor    %edx,%edx
>    0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>
>
> hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.
>
> With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
> The hash calculation gets now inlined into the constructor and uses a series of simple adds now:
>
> Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
> => 0x00007ffff67cc9a0 <+0>:     push   %rbp
>    0x00007ffff67cc9a1 <+1>:     mov    %rsp,%rbp
>    0x00007ffff67cc9a4 <+4>:     push   %rbx
>    0x00007ffff67cc9a5 <+5>:     mov    %rdi,%rbx
>    0x00007ffff67cc9a8 <+8>:     sub    $0x8,%rsp
>    0x00007ffff67cc9ac <+12>:    test   %dl,%dl
>    0x00007ffff67cc9ae <+14>:    movl   $0x0,0x20(%rdi)
>    0x00007ffff67cc9b5 <+21>:    jne    0x7ffff67cc9f0 <NativeCallStack::NativeCallStack(int, bool)+80>
>    0x00007ffff67cc9b7 <+23>:    movq   $0x0,(%rdi)
>    0x00007ffff67cc9be <+30>:    movq   $0x0,0x8(%rdi)
>    0x00007ffff67cc9c6 <+38>:    movq   $0x0,0x10(%rdi)
>    0x00007ffff67cc9ce <+46>:    movq   $0x0,0x18(%rdi)
>    0x00007ffff67cc9d6 <+54>:    mov    0x10(%rbx),%rax   <<<
>    0x00007ffff67cc9da <+58>:    add    0x8(%rbx),%rax    <<<    
>    0x00007ffff67cc9de <+62>:    add    0x18(%rbx),%rax   <<<
>    0x00007ffff67cc9e2 <+66>:    add    (%rbx),%rax       <<<
>    0x00007ffff67cc9e5 <+69>:    mov    %eax,0x20(%rbx)
>    0x00007ffff67cc9e8 <+72>:    add    $0x8,%rsp
>    0x00007ffff67cc9ec <+76>:    pop    %rbx
>    0x00007ffff67cc9ed <+77>:    pop    %rbp
>    0x00007ffff67cc9ee <+78>:    retq  
>    0x00007ffff67cc9ef <+79>:    nop
>    0x00007ffff67cc9f0 <+80>:    mov    %esi,%edx
>    0x00007ffff67cc9f2 <+82>:    mov    $0x4,%esi
>    0x00007ffff67cc9f7 <+87>:    callq  0x7ffff68250d0 <os::get_native_stack(unsigned char**, int, int)>
>    0x00007ffff67cc9fc <+92>:    jmp    0x7ffff67cc9d6 <NativeCallStack::NativeCallStack(int, bool)+54>
> End of assembler dump.
>
> Thanks, Thomas

Thomas Stuefe has updated the pull request incrementally with one additional commit since the last revision:

  unsigned -> unsigned int

-------------

Changes:
  - all: https://git.openjdk.java.net/jdk/pull/2473/files
  - new: https://git.openjdk.java.net/jdk/pull/2473/files/309d6c0a..e56b85d3

Webrevs:
 - full: https://webrevs.openjdk.java.net/?repo=jdk&pr=2473&range=02
 - incr: https://webrevs.openjdk.java.net/?repo=jdk&pr=2473&range=01-02

  Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod
  Patch: https://git.openjdk.java.net/jdk/pull/2473.diff
  Fetch: git fetch https://git.openjdk.java.net/jdk pull/2473/head:pull/2473

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Re: RFR: JDK-8261302: NMT: Improve malloc site table hashing [v2]

Thomas Stuefe
In reply to this post by Lutz Schmidt
On Wed, 10 Feb 2021 07:28:33 GMT, Lutz Schmidt <[hidden email]> wrote:

> LGTM, except for the one comment I added (it's not a "must", it's a "please").

Thanks Lucy!

> src/hotspot/share/utilities/nativeCallStack.cpp line 31:
>
>> 29: #include "utilities/nativeCallStack.hpp"
>> 30:
>> 31: static unsigned calculate_hash(address stack[NMT_TrackingStackDepth]) {
>
> I know it doesn't change anything semantically, but I'd like to see the int type specifier.

Sure, I'll change it before pushing.

-------------

PR: https://git.openjdk.java.net/jdk/pull/2473
Reply | Threaded
Open this post in threaded view
|

Integrated: JDK-8261302: NMT: Improve malloc site table hashing

Thomas Stuefe
In reply to this post by Thomas Stuefe
On Tue, 9 Feb 2021 07:17:00 GMT, Thomas Stuefe <[hidden email]> wrote:

> While looking at NMT tuning statistics, I saw longish bucket chains in the malloc site table and looked whether this can be improved.
>
> The current hash algorithm uses the 32bit masked sum of all stack entries as hash.
>
> I first experimented with different hash algorithms on different platforms (x86 and ppc, the latter because it has uniform op sizes) and did actually not find a noticeable improvement over what NMT does now. It seems that using the raw code pointers as base for the hash gives us already enough entropy. Avg load factor of the table always hovered around what was expected. Regardless of the hash I tried I was not able to get rid of the few longer chains.
>
> The biggest improvement brought an experimental table size increase: currently the table size is 511 pointer slots (~4K). Quadrupling the size would bring the load factor down to 1-2. However, there is this comment in mallocSiteTable.hpp:
>
> https://github.com/openjdk/jdk/blob/5183d8ae1eec86202eace2c4770f81edbc73cb68/src/hotspot/share/services/mallocSiteTable.hpp#L118
>
> wich claims that a load factor of 6 is what is aimed for and deemed acceptable. So I am not going to touch that here (even though 4 or 12K more may be an okay price to pay for more efficient lookups.
>
> ---
>
> With that out of the way, there are still small things we can improve about the hash function:
>
> Since the vast majority of `NativeCallStack` objects will always need a hash code, it makes no sense to delay its calculation. By doing the hash code calculation in the constructor we can make `NativeCallStack::hash()` a simple inline getter.
>
> When calculating the hash code, I also omitted the "if stack address is 0 stop" logic. The vast majority of call stacks have the full size and nothing much is gained from omitting those 0 values from the hash code calculation.
>
> See difference (linux x86):
>
> Before:
>
> Dump of assembler code for function NativeCallStack::hash() const:
> => 0x00007ffff68092f0 <+0>:     mov    0x20(%rdi),%eax        <load hash
>    0x00007ffff68092f3 <+3>:     push   %rbp
>    0x00007ffff68092f4 <+4>:     mov    %rsp,%rbp              
>    0x00007ffff68092f7 <+7>:     test   %eax,%eax              <0?->X
>    0x00007ffff68092f9 <+9>:     jne    0x7ffff6809324 <NativeCallStack::hash() const+52>
>    0x00007ffff68092fb <+11>:    mov    (%rdi),%rdx
>    0x00007ffff68092fe <+14>:    test   %rdx,%rdx
>    0x00007ffff6809301 <+17>:    je     0x7ffff6809330 <NativeCallStack::hash() const+64>
>    0x00007ffff6809303 <+19>:    mov    0x8(%rdi),%rax
>    0x00007ffff6809307 <+23>:    test   %rax,%rax
>    0x00007ffff680930a <+26>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff680930c <+28>:    add    %rax,%rdx
>    0x00007ffff680930f <+31>:    mov    0x10(%rdi),%rax
>    0x00007ffff6809313 <+35>:    test   %rax,%rax
>    0x00007ffff6809316 <+38>:    je     0x7ffff680931f <NativeCallStack::hash() const+47>
>    0x00007ffff6809318 <+40>:    add    %rax,%rdx
>    0x00007ffff680931b <+43>:    add    0x18(%rdi),%rdx
>    0x00007ffff680931f <+47>:    mov    %edx,%eax
>    0x00007ffff6809321 <+49>:    mov    %edx,0x20(%rdi)
>    0x00007ffff6809324 <+52>:    pop    %rbp                   <X
>    0x00007ffff6809325 <+53>:    retq  
>    0x00007ffff6809326 <+54>:    nopw   %cs:0x0(%rax,%rax,1)
>    0x00007ffff6809330 <+64>:    xor    %edx,%edx
>    0x00007ffff6809332 <+66>:    jmp    0x7ffff680931f <NativeCallStack::hash() const+47>
>
> hash() getter is not inlined; it queries the hash code each time and, when calculating it, uses simple adds interspersed with conditional jumps because of the "if stack address is 0 stop" logic.
>
> With this patch, the `NativeCallStack::hash()` gets inlined at the call sites to a simple load.
> The hash calculation gets now inlined into the constructor and uses a series of simple adds now:
>
> Dump of assembler code for function NativeCallStack::NativeCallStack(int, bool):
> => 0x00007ffff67cc9a0 <+0>:     push   %rbp
>    0x00007ffff67cc9a1 <+1>:     mov    %rsp,%rbp
>    0x00007ffff67cc9a4 <+4>:     push   %rbx
>    0x00007ffff67cc9a5 <+5>:     mov    %rdi,%rbx
>    0x00007ffff67cc9a8 <+8>:     sub    $0x8,%rsp
>    0x00007ffff67cc9ac <+12>:    test   %dl,%dl
>    0x00007ffff67cc9ae <+14>:    movl   $0x0,0x20(%rdi)
>    0x00007ffff67cc9b5 <+21>:    jne    0x7ffff67cc9f0 <NativeCallStack::NativeCallStack(int, bool)+80>
>    0x00007ffff67cc9b7 <+23>:    movq   $0x0,(%rdi)
>    0x00007ffff67cc9be <+30>:    movq   $0x0,0x8(%rdi)
>    0x00007ffff67cc9c6 <+38>:    movq   $0x0,0x10(%rdi)
>    0x00007ffff67cc9ce <+46>:    movq   $0x0,0x18(%rdi)
>    0x00007ffff67cc9d6 <+54>:    mov    0x10(%rbx),%rax   <<<
>    0x00007ffff67cc9da <+58>:    add    0x8(%rbx),%rax    <<<    
>    0x00007ffff67cc9de <+62>:    add    0x18(%rbx),%rax   <<<
>    0x00007ffff67cc9e2 <+66>:    add    (%rbx),%rax       <<<
>    0x00007ffff67cc9e5 <+69>:    mov    %eax,0x20(%rbx)
>    0x00007ffff67cc9e8 <+72>:    add    $0x8,%rsp
>    0x00007ffff67cc9ec <+76>:    pop    %rbx
>    0x00007ffff67cc9ed <+77>:    pop    %rbp
>    0x00007ffff67cc9ee <+78>:    retq  
>    0x00007ffff67cc9ef <+79>:    nop
>    0x00007ffff67cc9f0 <+80>:    mov    %esi,%edx
>    0x00007ffff67cc9f2 <+82>:    mov    $0x4,%esi
>    0x00007ffff67cc9f7 <+87>:    callq  0x7ffff68250d0 <os::get_native_stack(unsigned char**, int, int)>
>    0x00007ffff67cc9fc <+92>:    jmp    0x7ffff67cc9d6 <NativeCallStack::NativeCallStack(int, bool)+54>
> End of assembler dump.
>
> Thanks, Thomas

This pull request has now been integrated.

Changeset: a3d6e371
Author:    Thomas Stuefe <[hidden email]>
URL:       https://git.openjdk.java.net/jdk/commit/a3d6e371
Stats:     29 lines in 2 files changed: 9 ins; 16 del; 4 mod

8261302: NMT: Improve malloc site table hashing

Reviewed-by: zgu, lucy

-------------

PR: https://git.openjdk.java.net/jdk/pull/2473