stdcxx-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Martin Sebor <se...@roguewave.com>
Subject Re: [PATCH] Use __rw_atomic_xxx() on Windows
Date Thu, 06 Sep 2007 02:48:50 GMT
Travis Vitek wrote:
> Oh, yeah. That is the other thing that I did Friday. I wrote a test case
> to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
> There is a performance penalty...

I'd be curious to know if the performance penalty is due to the
function call overhead or something else.

In any case though, I think we could tweak the patch and change
the __rw_atomic_pre{de,in}crement() overloads for int and long
to call the appropriate Interlocked{De,In}crement() intrinsics
and have the other overloads use the new ones.

Farid, what do you think about this approach?

Martin

> 
>   C:\Temp>t 2 && t 4 && t 8
>   ---------- locked inc ---- atomic_add ---- 2 threads
>   ms               4266            4469
>   ms/op      0.00003178      0.00003330      -4.7586%
>   thr ms          18117           18437
>   thr ms/op  0.00013498      0.00013737      -1.7663%
>   ---------- locked inc ---- atomic_add ---- 4 threads
>   ms               7969            8609
>   ms/op      0.00005937      0.00006414      -8.0311%
>   thr ms          36359           37019
>   thr ms/op  0.00027090      0.00027581      -1.8152%
>   ---------- locked inc ---- atomic_add ---- 8 threads
>   ms               5016            5484
>   ms/op      0.00003737      0.00004086      -9.3301%
>   thr ms          60846           66130
>   thr ms/op  0.00045334      0.00049271      -8.6842%
> 
>   C:\Temp>t 2 && t 4 && t 8
>   ---------- locked inc ---- atomic_add ---- 2 threads
>   ms               2781            2906
>   ms/op      0.00002072      0.00002165      -4.4948%
>   thr ms          14961           16093
>   thr ms/op  0.00011147      0.00011990      -7.5663%
>   ---------- locked inc ---- atomic_add ---- 4 threads
>   ms               2781            2891
>   ms/op      0.00002072      0.00002154      -3.9554%
>   thr ms          30867           31328
>   thr ms/op  0.00022998      0.00023341      -1.4935%
>   ---------- locked inc ---- atomic_add ---- 8 threads
>   ms               2782            2890
>   ms/op      0.00002073      0.00002153      -3.8821%
>   thr ms          64318           64341
>   thr ms/op  0.00047921      0.00047938      -0.0358%
> 
> I will do a quick run using the string performance test after lunch.
> I'll report the results on that later. I've pasted the source for the
> bulk of my test below. If someone wants the entire thing, let me know
> and I'll provide everything.
> 
> Travis
> 
> 
> Martin Sebor wrote:
>> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>>
>> What's the status of this? We need to decide if we can put this
>> in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>> sure the new functions don't cause a performance regression in
>> basic_string. I.e., we need to see the before and after numbers.
>>
>> Martin
>>
>> Martin Sebor wrote:
>>> One concern I have is performance. Does replacing the intrinsics with
>>> an out-of-line function call whose semantics the compiler has no idea
>>> about have any impact on the runtime efficiency of the generated code?
>>> I would be especially interested in "real life" scenarios such as the
>>> usage of the atomic operations in basic_string.
>>>
>>> It would be good to see some before and after numbers. If you don't
>>> have all the platforms to run the test, post your benchmark and Travis
>>> can help you put them together.
> 
> #include <stdio.h>
> #include <stdlib.h>
> 
> #define WIN32_LEAN_AND_MEAN
> #include <windows.h>
> #include <process.h>
> 
> #include "lib.h"
> 
> #define MIN_THREADS 2
> #define MAX_THREADS 16
> 
> unsigned long locked_inc(long* val, long iters)
> {
>     const unsigned long t0 = GetTickCount ();
> 
>     long n;
>     for (n = 0; n < iters; ++n)
>     {
>         InterlockedIncrement(val);
>     }
> 
>     const unsigned long t1 = GetTickCount ();
> 
>     return (t1 - t0);
> }
> 
> unsigned long atomic_add(long* val, long iters)
> {
>     const unsigned long t0 = GetTickCount ();
> 
>     long n;
>     for (n = 0; n < iters; ++n)
>     {
>         __rw_atomic_add32(val, 1);
>     }
> 
>     const unsigned long t1 = GetTickCount ();
> 
>     return (t1 - t0);
> }
> 
> struct thread_param {
> 
>     // atomic variable
>     long* variable;
> 
>     // number of iterations
>     long iters;
> 
>     // function to invoke
>     unsigned long (*fun)(long*, long);
> 
>     // result of function
>     unsigned long result;
> 
>     // thread handle used by main thread
>     HANDLE thread;
> };
> 
> extern "C" {
> 
>     void thread_func(void* p)
>     {
>         thread_param* param = (thread_param*)p;
>         param->result = (param->fun)(param->variable, param->iters);
>     }
> 
> } // extern "C"
> 
> 
> unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
> long), long iters)
> {
>     thread_param params[MAX_THREADS];
>     long thread_var = 0;
> 
>     int i;
>     for (i = 0; i < nthreads; ++i) {
>         params[i].variable = &thread_var;
>         params[i].result   = 0;
>         params[i].fun      = fun;
>         params[i].iters    = iters;
>     }
> 
>     int n;
>     for (n = 0; n < nthreads; ++n) {
>         params[n].thread = (HANDLE)_beginthread(thread_func, 0,
> &params[n]);
>     }
> 
>     unsigned long thread_time = 0;
> 
>     for (n = 0; n < nthreads; ++n) {
>         WaitForSingleObject (params[n].thread, INFINITE);
>         thread_time += params[n].result;
>     }
> 
>     return thread_time;
> }
> 
> 
> int main(int argc, char* argv[])
> {
>     int nthreads = MIN_THREADS;
>     if (1 < argc)
>         nthreads = atoi(argv[1]);
> 
>     // cap thread count
>     if (nthreads < MIN_THREADS)
>         nthreads = MIN_THREADS;
>     else if (MAX_THREADS < nthreads)
>         nthreads = MAX_THREADS;
> 
>     const long ops = 0x7ffffff;
>     long thread_var;
>     
>     thread_var = 0;
>     unsigned long locked_inc_ms = locked_inc (&thread_var, ops);
>     
>     thread_var = 0;
>     unsigned long atomic_add_ms = atomic_add (&thread_var, ops);
> 
>     printf("---------- locked inc ---- atomic_add ---- %d threads\n",
> nthreads);
>     printf("ms           %8.u        %8.u\n", locked_inc_ms,
> atomic_add_ms);
> 
>     float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
>     float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
> 
>     printf("ms/op      %8.8f      %8.8f      %.4f%%\n", 
>         locked_inc_ops_p_ms, atomic_add_ops_p_ms,
>         100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
> 
>     // do it with threads
> 
>     locked_inc_ms = run_threads(nthreads, locked_inc, ops);
>     atomic_add_ms = run_threads(nthreads, atomic_add, ops);
> 
>     locked_inc_ms /= nthreads;
>     atomic_add_ms /= nthreads;
> 
>     printf("thr ms       %8.u        %8.u\n", locked_inc_ms,
> atomic_add_ms);
> 
>     locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
>     atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
> 
>      printf("thr ms/op  %8.8f      %8.8f      %.4f%%\n", 
>         locked_inc_ops_p_ms, atomic_add_ops_p_ms,
>         100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
> 
>     return 0;
> }


Mime
View raw message