FC3 x86_64 math functions slowdown

D. Angelis dangelis at beta-cae.gr
Sun Jul 10 04:35:18 UTC 2005


Hi all

At work I have several x86_64 based (Opteron, Athlon 64, Xeon 64 CPUs) 
machines running FC3 and SuSe 9.

I have noticed a significant (up to 5 times !!!) slowdown of some math 
functions in /lib64/libm.so.6 (glibc-2.3.5) shipped with FC3
compared to the version (glibc-2.3.3) shipped with SuSe 9.1. (I do not 
have any machines running FC1 or FC2 to test them too.)

To make it clear I used a simple benchmarking program (see the source at the 
bottom of this message)
on two identically configured machines running SuSe 9.1 and FC3.

It basically tries to measure the time a call to a math function takes 
by calling it many times.

Both 32 and 64 bit executables were used to show that the problem is in 
the 64 bit library.
Note that there is no notable change in the performance of the 32 bit 
library.

The results are 100% reproducible and invariant to compiler version and 
options used.

Both double and float versions of each function were used to show the 
huge difference in performance
of tan() and tanf().

Another strange thing is that in both versions of the library the 64bit 
sin() function family is two times slower than the 32bit
while for all other functions the 64bit ones are faster or close to the 
speed of the 32bit ones.

The following table summarizes the results .
 
                              glibc 2.3.3    glibc 2.3.5
                                                   
  Math function    32 bit  64 bit  32 bit  64 bit
 
  f1 = log(f2)        0.10    0.03    0.11    0.08 !!   (3 times slower) 
  f1 = logf(f2)       0.10    0.03    0.11    0.09 !!   (same here)

  f1 = tan(f2)        0.07    0.08    0.07    0.35 !!!  (even worse, 5 
times slower)
  f1 = tanf(f2)       0.07    0.06    0.07    0.09 !    (this is crazy 
:1.5 times slower than glibc-2.3.3 but 4 times faster than "tan()" of 
glibc-2.3.5)

  f1 = exp(f2)        0.07    0.03    0.07    0.27 !!! (almost 4 times 
slower)
  f1 = expf(f2)       0.07    0.03    0.07    0.27 !!! (same here)

  f1 = sin(f2)        0.03    0.05   0.03    0.06 * (this is strange : 
64bit version is 2 times slower than 32bit)  
  f1 = sinf(f2)       0.03    0.05   0.03    0.06 * (same here)

  f1 = cos(f2)        0.05    0.06    0.05    0.06
  f1 = cosf(f2)       0.05    0.06    0.05    0.06
  f1 = cosl(f2)       0.05    0.06    0.06    0.06

  f1 = sqrt(f2)       0.02    0.01    0.02    0.01
  f1 = sqrtf(f2)      0.02    0.01    0.02    0.01





SYSTEM INFO OF THE TEST MACHINES :
 
System 1 : Suse 9.1 (x86-64)
 
galactix # uname -a
Linux galactix 2.6.4-54.5-default #1 Fri May 7 16:47:49 UTC 2004 x86_64 
x86_64 x86_64 GNU/Linux
 
galactix # rpm -q glibc
glibc-2.3.3-63
 
galactix # cat /proc/cpuinfo
 
processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 5
model name      : AMD Opteron(tm) Processor 246
stepping        : 8
cpu MHz         : 1992.158
cache size      : 1024 KB
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge 
mca cmov
pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext lm 3dnowext 3dnow
bogomips        : 3915.77
TLB size        : 1088 4K pages
clflush size    : 64
address sizes   : 40 bits physical, 48 bits virtual
power management: ts ttp
 
System 2 : FC3 (x86-64)
 
neron # uname -a
Linux neron.localdomain 2.6.10-1.770_FC3 #1 Thu Feb 24 18:09:38 EST 2005 
x86_64
x86_64 x86_64 GNU/Linux
 
neron # rpm -q glibc
glibc-2.3.5-0.fc3.1
 
neron # cat /proc/cpuinfo
 
processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 5
model name      : AMD Opteron(tm) Processor 246
stepping        : 8
cpu MHz         : 1994.595
cache size      : 1024 KB
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge 
mca cmov
pat pse36 clflush mmx fxsr sse sse2 pni syscall nx mmxext lm 3dnowext 3dnow
bogomips        : 3923.96
TLB size        : 1088 4K pages
clflush size    : 64
cache_alignment : 64
address sizes   : 40 bits physical, 48 bits virtual
power management: ts ttp
 


SOURCE CODE OF BENCHMARKING PROGRAM

galactix # cat metro.c

----------------- CUT HERE --------------------
#include <stdio.h>
#include <limits.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
 
/* Iteration count used for the null-loop baseline; main() runs the math
 * tests with a tenth of this value. */
#define BASEN 50000000
/* Fraction of the summed clicks used by loop(): a row is flagged with "?"
 * when loop1ctr * (maxclicks - minclicks) exceeds WARNRANGE * sumclicks. */
#define WARNRANGE 0.4
 
/* Singly linked stack of ints: one heap-allocated node per pushed value. */
struct stacknode {
        int val;                        /* value stored by push() */
        struct stacknode *next;         /* node that was on top before this one */
};
typedef struct stacknode *Stackp;
Stackp  stackroot;                      /* top of the stack; NULL while empty */

/*
 * Push i on top of the stack.
 * Terminates the program with an error message if the node cannot be
 * allocated, instead of writing through a null pointer.
 */
void push(int i)
{
        Stackp p = malloc(sizeof *p);

        if (p == NULL) {
                fprintf(stderr, "push: out of memory\n");
                exit(EXIT_FAILURE);
        }
        p->val = i;
        p->next = stackroot;
        stackroot = p;
}

/*
 * Remove the top node, release it and return the value it held.
 * Popping an empty stack terminates the program with an error message
 * instead of dereferencing a null stackroot.
 */
int pop(void)
{
        Stackp p = stackroot;
        int i;

        if (p == NULL) {
                fprintf(stderr, "pop: stack is empty\n");
                exit(EXIT_FAILURE);
        }
        i = p->val;
        stackroot = p->next;
        free(p);
        return i;
}
 
#include <time.h>
 
/* Return the current clock() reading, converted to int. */
int jobclicks()
{
        clock_t ticks = clock();

        return (int) ticks;
}
 
/* Turn the macro argument into a string literal; loop() uses it to print the
 * timed statement as the label of its result row. */
#define quoted(TEXT) #TEXT
/*#define quoted(TEXT) "TEXT"*/
 
/*
 * One timed trial: execute CODE n times, read the clock, and take the clicks
 * elapsed since loop1start as thisclicks. The trial is added to sumclicks,
 * folded into minclicks/maxclicks, printed, and the new clock reading becomes
 * loop1start for the following trial.
 *
 * Expands to bare statements that read and write the caller's locals
 * (i, n, loop1ctr, loop1start, loop1next, thisclicks, sumclicks, minclicks,
 * maxclicks). The expansion already ends in ';', so loop() invokes it
 * without a trailing semicolon.
 */
#define loop1(CODE) loop1ctr++; \
        for (i = 0; i < n; i++) { CODE; } \
        loop1next = jobclicks(); \
        thisclicks = loop1next - loop1start; \
        sumclicks += thisclicks; \
        if (thisclicks < minclicks) minclicks = thisclicks; \
        if (thisclicks > maxclicks) maxclicks = thisclicks; \
        printf("%7d ", loop1next - loop1start); \
        loop1start = loop1next;
 
/*
 * Time CODE and print one result row: the stringified statement, the clicks
 * of each of five loop1() trials, and the average microseconds per iteration
 * minus basemics. A "?" is appended when
 * loop1ctr * (maxclicks - minclicks) exceeds WARNRANGE * sumclicks.
 *
 * Leaves the uncorrected per-iteration time in lastmics. Like loop1(), it
 * expands to bare statements that use the caller's locals and already ends
 * in ';'.
 *
 * Every line of the definition except the last must end in a backslash; a
 * wrapped line without one cuts the macro short and breaks compilation.
 *
 * NOTE(review): the i0 = ... statements between trials look like deliberate
 * filler between measurements -- confirm the intent before removing them.
 */
#define loop(CODE) printf("  %-30s", quoted(CODE)); \
        minclicks = INT_MAX; maxclicks = -1; sumclicks = 0; \
        loop1ctr = 0; \
        loop1start = jobclicks(); \
        loop1(CODE) \
        loop1(CODE) \
        i0 = i1 + i2 + i3; \
        loop1(CODE) \
        i0 = i1 + i2 + i3 - i1 - i2 - i3; \
        loop1(CODE) \
        i0 = i1 + i2 + i3 + i1*i2 + i2*i3 + i1*i3; \
        loop1(CODE) \
        queststr = ""; \
        if (loop1ctr * (maxclicks - minclicks) > WARNRANGE *  sumclicks) \
                queststr = "?"; \
        lastmics = sumclicks * 1000000.0 \
                / ((double) CLOCKS_PER_SEC * n * loop1ctr); \
        printf("%10.2f%s\n", lastmics - basemics, queststr);
 
/* Print a section heading followed by the current iteration count n, which is
 * taken from the caller's scope. The expansion already ends in ';'. */
#define title(TEXT) printf("%s (n=%d)\n", TEXT, n);
 
/* The experiment */
 
/* Return the single argument unchanged. */
int sum1(int a)
{
        int result = a;

        return result;
}
/* Return the sum of the two arguments. */
int sum2(int a, int b)
{
        int total = a;

        total += b;
        return total;
}
/* Return the sum of the three arguments, added left to right. */
int sum3(int a, int b, int c)
{
        int total = a + b;

        total += c;
        return total;
}
 
int main()
{
        int     loop1start, loop1next, loop1ctr;
        double  lastmics, basemics;
        int     minclicks, maxclicks, sumclicks, thisclicks, startclicks;
        int     i, n, basen;
        volatile int    i0, i1, i2, i3, i4;
        volatile float  f0, f1, f2, f3;
        int     *v;
        char    *queststr;
        char    s[100];
        char    fname[20];
        FILE    *fp;
        char    s0123456789[] = "0123456789";
        char    sa123456789[] = "a123456789";
        char    s12345[] = "12345";
        char    s123_45[] = "123.45";
        char    sd[] = "%d";
        char    sdn[] = "%d\n";
        char    sf[] = "%f";
        char    sf62[] = "%f6.2";
 
        setbuf(stdout, (char *) 0);     /* No buffering to watch output */
        printf("  Operation                         Clicks for each 
trial ");
        printf("   Mics/N\n");
        startclicks = jobclicks();
 
        basen = BASEN;
        n = basen;
        title("Null Loop")
        i0 = i1 = i2 = i3 = 5;
        f0 = f1 = f2 = f3 = 5.0;
        basemics = 0.0;
        loop({})
        basemics = lastmics;
 
        n = basen/10;
/*      n = basen;*/
        title("Math Functions");
        f2 = 5.0;
        loop(f1 = log(f2))
        loop(f1 = logf(f2))
        loop(f1 = tan(f2))
        loop(f1 = tanf(f2))
        loop(f1 = exp(f2))
        loop(f1 = expf(f2))
        loop(f1 = sin(f2))
        loop(f1 = sinf(f2))
        loop(f1 = cos(f2))
        loop(f1 = cosf(f2))
        loop(f1 = cosl(f2))
        loop(f1 = sqrt(f2))
        loop(f1 = sqrtf(f2))
 
        printf("Total Seconds:%10.2f\n", ((float) 
jobclicks()-startclicks) / CLOCKS_PER_SEC);
        return 0;
}
--------- CUT HERE --------


Hope it's helpful

   D. Angelis




More information about the fedora-list mailing list