/* Compilation : gcc -S -O4 -fomit-frame-pointer -fprefetch-loop-arrays -msse2 -ftree-vectorize -ftree-loop-linear e6.c */

/* Size of the bitsets, in bits */
#define N 1048576
//#define N 16384

/*
 * Copy c bytes from s to d, one byte at a time, in ascending address order.
 *
 *   s  source buffer (at least c bytes)
 *   d  destination buffer (at least c bytes)
 *   c  number of bytes to copy; nothing is copied when c <= 0
 *
 * Written in plain C rather than as a "rep movsb" inline-asm sequence: that
 * instruction modifies ESI/EDI/ECX and memory, which must all be declared to
 * the compiler, and a plain loop sidesteps the issue while keeping the same
 * forward byte-by-byte copy order.
 */
void
recopie8by8(unsigned char *s, unsigned char *d, int c)
{
	int i;

	for (i = 0; i < c; i++)
		d[i] = s[i];
}


/*
 * Bitwise AND of two bitsets, in place: d = d & s.
 *
 *   s       source bitset (only read)
 *   d       destination bitset, overwritten with the result
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
pand(unsigned char *s, unsigned char *d, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		d[k] &= s[k];
}

/*
 * Bitwise AND of two bitsets with the result stored in a third one:
 * d = t & s.
 *
 *   s       first operand (only read)
 *   t       second operand (only read)
 *   d       destination bitset, receives the result
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * Note the parameter order (s, t, d): the destination is the THIRD pointer.
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
pandCopy(unsigned char *s, unsigned char *t, unsigned char *d, long int nb_bit)
{
	long int nbytes = nb_bit / 8;
	long int k;

	for (k = 0; k < nbytes; k++)
		d[k] = t[k] & s[k];
}

/*
 * Bitwise OR of two bitsets, in place: d = d | s.
 *
 *   s       source bitset (only read)
 *   d       destination bitset, overwritten with the result
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
por(unsigned char *s, unsigned char *d, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		d[k] |= s[k];
}

/*
 * Bitwise OR of two bitsets with the result stored in a third one:
 * d = t | s.
 *
 *   s       first operand (only read)
 *   d       destination bitset, receives the result
 *   t       second operand (only read)
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * Note the parameter order (s, d, t): the destination is the SECOND pointer.
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
porCopy(unsigned char *s, unsigned char *d, unsigned char *t, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		d[k] = t[k] | s[k];
}


/*
 * Bitwise XOR of two bitsets, in place: d = d ^ s.
 *
 *   s       source bitset (only read)
 *   d       destination bitset, overwritten with the result
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
pxor(unsigned char *s, unsigned char *d, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		d[k] ^= s[k];
}

/*
 * Bitwise XOR of two bitsets with the result stored in a third one:
 * d = t ^ s.
 *
 *   s       first operand (only read)
 *   d       destination bitset, receives the result
 *   t       second operand (only read)
 *   nb_bit  number of bits in each bitset; the nb_bit / 8 whole bytes are
 *           processed, nothing is done when nb_bit < 8
 *
 * Note the parameter order (s, d, t): the destination is the SECOND pointer.
 * The buffers are accessed byte by byte, so they need no particular
 * alignment and nb_bit does not have to be a multiple of 32.
 */
void
pxorCopy(unsigned char *s, unsigned char *d, unsigned char *t, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		d[k] = t[k] ^ s[k];
}


/*
 * Set bit number pos of bitset d to 1.
 *
 * The bitset is read "left to right": bit 0 is the most significant bit of
 * d[0], bit 7 its least significant bit, bit 8 the most significant bit of
 * d[1], and so on.
 *
 *   d    bitset storage; must hold at least pos / 8 + 1 bytes
 *   pos  index of the bit to set; a negative index is ignored
 *
 * Only the byte that contains the bit is read and written.
 */
void
SetBitTo1(unsigned char *d, int pos)
{
	if (pos < 0)
		return;

	/* 0x80 >> (pos % 8) selects the bit counted from the MSB side */
	d[pos / 8] |= (unsigned char)(0x80u >> (pos % 8));
}

/*
 * Clear bit number pos of bitset d (set it to 0).
 *
 * The bitset is read "left to right": bit 0 is the most significant bit of
 * d[0], bit 7 its least significant bit, bit 8 the most significant bit of
 * d[1], and so on.
 *
 *   d    bitset storage; must hold at least pos / 8 + 1 bytes
 *   pos  index of the bit to clear; a negative index is ignored
 *
 * Only the byte that contains the bit is read and written.
 */
void
SetBitTo0(unsigned char *d, int pos)
{
	if (pos < 0)
		return;

	/* 0x80 >> (pos % 8) selects the bit counted from the MSB side */
	d[pos / 8] &= (unsigned char)~(0x80u >> (pos % 8));
}

/*
 * Return the value (0 or 1) of bit number pos of bitset d.
 *
 * The bitset is read "left to right": bit 0 is the most significant bit of
 * d[0], bit 7 its least significant bit, bit 8 the most significant bit of
 * d[1], and so on.
 *
 *   d    bitset storage; must hold at least pos / 8 + 1 bytes
 *   pos  index of the bit to test; a negative index yields 0
 *
 * The bitset is only read, never modified.
 */
unsigned char TestBit(unsigned char *d, int pos)
{
	if (pos < 0)
		return 0;

	/* shift the wanted bit down to position 0, then isolate it */
	return (unsigned char)((d[pos / 8] >> (7 - pos % 8)) & 1u);
}

/*
 * Return the number of bits set to 1 in bitset s.
 *
 *   s       bitset storage (only read)
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           examined, so the result is 0 when nb_bit < 8
 *
 * The bitset is walked byte by byte and the per-byte population counts are
 * summed using a 256-entry lookup table.
 */
int
NumberBit1(unsigned char *s, int nb_bit)
{
	/*
	 * Tab[v] is the number of 1 bits in the byte value v.  The table is
	 * static const so that it is built once at compile time instead of
	 * being copied onto the stack at every call.
	 */
	static const unsigned char Tab[256] = {
		0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
		4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
	};
	/* The table can be generated with a C++ program such as: */
	/*   for(i=0;i<256;i++){         */
	/*    bitset<8> a(i);            */
	/*    cout << a.count() << ", "; */
	/*   }                           */
	int nbytes = nb_bit / 8;
	int total = 0;
	int k;

	for (k = 0; k < nbytes; k++)
		total += Tab[s[k]];

	return total;
}

/*
 * Return the number of bits set to 0 in bitset s of nb_bit bits.
 * Computed as nb_bit minus the number of 1 bits reported by NumberBit1().
 */
int
NumberBit0(unsigned char *s, int nb_bit)
{
	int ones = NumberBit1(s, nb_bit);

	return nb_bit - ones;
}

/*
 * Test whether any bit of the bitset is on.
 *
 *   s       bitset storage (only read)
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           examined
 *
 * Returns 1 if at least one bit is set to 1, 0 otherwise (including for an
 * empty bitset).  The scan stops at the first non-zero byte.
 */
int
Any(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		if (s[k] != 0)
			return 1;

	return 0;
}

/*
 * Test whether no bit of the bitset is on, i.e. every bit is 0.
 *
 *   s       bitset storage (only read)
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           examined
 *
 * Returns 1 if none of the examined bits is set to 1 (this includes an
 * empty bitset), 0 as soon as a set bit is found.
 */
int
None(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		if (s[k] != 0)
			return 0;

	return 1;
}

/*
 * Set every bit of bitset s to 1.
 *
 *   s       bitset storage
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           filled with 0xff
 *
 * The byte index is a plain int: narrowing it to short would wrap to a
 * negative index after 32767 bytes and write outside the buffer.
 */
void
Reset1(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		s[k] = 0xff;
}

/*
 * Set every bit of bitset s to 1 (bulk variant of Reset1).
 *
 *   s       bitset storage
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           filled with 0xff, and nothing is written when nb_bit < 8
 *
 * Exactly nb_bit / 8 bytes are written, never anything past the end of the
 * bitset.  The loop is a plain byte fill that the compiler is free to
 * vectorise; no alignment is required.
 */
void
Reset1SIMD(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		s[k] = 0xff;
}

/*
 * Set every bit of bitset s to 0.
 *
 *   s       bitset storage
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           cleared
 *
 * The byte index is a plain int: narrowing it to short would wrap to a
 * negative index after 32767 bytes and write outside the buffer.
 */
void
Reset0(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		s[k] = 0;
}

/*
 * Set every bit of bitset s to 0 (bulk variant of Reset0).
 *
 *   s       bitset storage
 *   nb_bit  size of the bitset in bits; the nb_bit / 8 whole bytes are
 *           cleared, and nothing is written when nb_bit < 8
 *
 * Exactly nb_bit / 8 bytes are written, never anything past the end of the
 * bitset.  The loop is a plain byte fill that the compiler is free to
 * vectorise; no alignment is required.
 */
void
Reset0SIMD(unsigned char *s, int nb_bit)
{
	int nbytes = nb_bit / 8;
	int k;

	for (k = 0; k < nbytes; k++)
		s[k] = 0;
}


/* Find_first (bitset): finds the index of the first "on" bit when the      */
/* bitset is read from left to right. Returns 255 (-1 stored in an unsigned */
/* char) if no bit set to 1 is found.                                        */
/* Note: x86 asm has the bsf and bsr instructions to do this, but it seems  */
/* that these two instructions are not available with GCC!                  */

#include <byteswap.h>

/*
 * Return the index of the first bit set to 1 among bits [0, size) of b.
 *
 * Bits are numbered left to right: bit 0 is the most significant bit of
 * b[0], bit 8 the most significant bit of b[1], and so on.
 *
 *   b     bitset storage (only read)
 *   size  number of bits in the bitset; a value <= 0 means an empty bitset
 *
 * Returns the bit index, or 255 when no bit is set.
 * NOTE(review): 255 is also a valid bit index for bitsets larger than 255
 * bits, so callers cannot distinguish "bit 255 is the first one set" from
 * "nothing set"; the value is kept because existing callers may test for it.
 *
 * The scan is done byte by byte, so it does not depend on the alignment of
 * b or on the byte order of the machine.
 */
int Find_first (unsigned char *b, int size){
	int nbytes = size / 8;	/* whole bytes that belong to the bitset */
	int tail = size % 8;	/* bits used in the trailing partial byte */
	int idx, k;

	for (idx = 0; idx < nbytes; idx++) {
		if (b[idx] == 0)
			continue;	/* nothing set in this byte */
		for (k = 0; k < 8; k++)
			if (b[idx] & (0x80u >> k))
				return idx * 8 + k;
	}

	/* leftover bits when size is not a multiple of 8 */
	for (k = 0; k < tail; k++)
		if (b[nbytes] & (0x80u >> k))
			return nbytes * 8 + k;

	return 255;	/* no bit set */
}

/*
 * Return the index of the last bit set to 1 among bits [0, size) of b.
 *
 * Bits are numbered left to right: bit 0 is the most significant bit of
 * b[0], bit 8 the most significant bit of b[1], and so on.
 *
 *   b     bitset storage (only read)
 *   size  number of bits in the bitset; a value <= 0 means an empty bitset
 *
 * Returns the bit index, or 255 when no bit is set.
 * NOTE(review): 255 is also a valid bit index for bitsets larger than 255
 * bits, so callers cannot distinguish "bit 255 is the last one set" from
 * "nothing set"; the value is kept because existing callers may test for it.
 *
 * The scan is done byte by byte from the end, so it does not depend on the
 * alignment of b or on the byte order of the machine.
 */
int Find_last (unsigned char *b, int size){
	int nbytes = size / 8;	/* whole bytes that belong to the bitset */
	int tail = size % 8;	/* bits used in the trailing partial byte */
	int idx, k;

	/* leftover bits when size is not a multiple of 8 come last */
	for (k = tail - 1; k >= 0; k--)
		if (b[nbytes] & (0x80u >> k))
			return nbytes * 8 + k;

	for (idx = nbytes - 1; idx >= 0; idx--) {
		if (b[idx] == 0)
			continue;	/* nothing set in this byte */
		for (k = 7; k >= 0; k--)
			if (b[idx] & (0x80u >> k))
				return idx * 8 + k;
	}

	return 255;	/* no bit set */
}


/*
 * Return the index of the first bit set to 1 in the closed range [l, r] of
 * bitset b, or -1 when no bit is set in that range.
 *
 * Bits are numbered left to right: bit 0 is the most significant bit of
 * b[0], bit 8 the most significant bit of b[1], and so on.
 *
 *   b  bitset storage (only read); must hold at least r / 8 + 1 bytes
 *   l  first bit index to examine; a negative value is treated as 0
 *   r  last bit index to examine (inclusive); r < l yields -1
 *
 * Bits are tested one by one, except that a byte which lies entirely inside
 * the range and is zero is skipped in one step.  The scan does not depend
 * on the alignment of b or on the byte order of the machine.
 */
int Find_next (unsigned char *b, int l, int r){
	int pos;

	if (l < 0)
		l = 0;

	for (pos = l; pos <= r; pos++) {
		/* fast path: skip bytes that are fully in range and empty */
		while ((pos % 8) == 0 && pos <= r - 8 && b[pos / 8] == 0)
			pos += 8;

		if (b[pos / 8] & (0x80u >> (pos % 8)))
			return pos;

		if (pos == r)
			break;	/* avoid incrementing past r (overflow-safe) */
	}

	/* no bit set to 1 was found */
	return -1;
}


/*	Function to test if multimedia instructions are supported...
inline extern 
*/

int
mm_support(void)
{
	/* Returns 1 if MMX instructions are supported,
	   3 if Cyrix MMX and Extended MMX instructions are supported
	   5 if AMD MMX and 3DNow! instructions are supported
	   0 if hardware does not support any of these
	*/
	int rval = 0;
	__asm__ __volatile__ (/* See if CPUID instruction is supported ... */
		/* ... Get copies of EFLAGS into eax and ecx */
		"pushf\n\t"
		"popl %%eax\n\t"
		"movl %%eax, %%ecx\n\t"
		/* ... Toggle the ID bit in one copy and store */
		/*     to the EFLAGS reg */
		"xorl $0x200000, %%eax\n\t"
		"push %%eax\n\t"
		"popf\n\t"

		/* ... Get the (hopefully modified) EFLAGS */
		"pushf\n\t"
		"popl %%eax\n\t"
		/* ... Compare and test result */
		"xorl %%eax, %%ecx\n\t"
		"testl $0x200000, %%ecx\n\t"
		"jz NotSupported1\n\t"		/* CPUID not supported */
		/* Get standard CPUID information, and
		       go to a specific vendor section */
		"movl $0, %%eax\n\t"
		"cpuid\n\t"
		/* Check for Intel */
		"cmpl $0x756e6547, %%ebx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x49656e69, %%edx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x6c65746e, %%ecx\n"
		"jne TryAMD\n\t"
		"jmp Intel\n\t"
		/* Check for AMD */
		"\nTryAMD:\n\t"
		"cmpl $0x68747541, %%ebx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x69746e65, %%edx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x444d4163, %%ecx\n"
		"jne TryCyrix\n\t"
		"jmp AMD\n\t"
		/* Check for Cyrix */
		"\nTryCyrix:\n\t"
		"cmpl $0x69727943, %%ebx\n\t"
		"jne NotSupported2\n\t"
		"cmpl $0x736e4978, %%edx\n\t"
		"jne NotSupported3\n\t"
		"cmpl $0x64616574, %%ecx\n\t"
		"jne NotSupported4\n\t"
		/* Drop through to Cyrix... */
		/* Cyrix Section */
		/* See if extended CPUID level 80000001 is supported */
		/* The value of CPUID/80000001 for the 6x86MX is undefined
		   according to the Cyrix CPU Detection Guide (Preliminary
		   Rev. 1.01 table 1), so we'll check the value of eax for
		   CPUID/0 to see if standard CPUID level 2 is supported.
		   According to the table, the only CPU which supports level
		   2 is also the only one which supports extended CPUID levels.
		*/
		"cmpl $0x2, %%eax\n\t"
		"jne MMXtest\n\t"	/* Use standard CPUID instead */
		/* Extended CPUID supported (in theory), so get extended
		   features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%eax\n\t"	/* Test for MMX */
		"jz NotSupported5\n\t"		/* MMX not supported */
		"testl $0x01000000, %%eax\n\t"	/* Test for Ext'd MMX */
		"jnz EMMXSupported\n\t"
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"EMMXSupported:\n\t"
		"movl $3, %0\n\n\t"		/* EMMX and MMX Supported */
		"jmp Return\n\t"
		/* AMD Section */
		"AMD:\n\t"
		/* See if extended CPUID is supported */
		"movl $0x80000000, %%eax\n\t"
		"cpuid\n\t"
		"cmpl $0x80000000, %%eax\n\t"
		"jl MMXtest\n\t"	/* Use standard CPUID instead */
		/* Extended CPUID supported, so get extended features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported6\n\t"		/* MMX not supported */
		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
		"jnz ThreeDNowSupported\n\t"
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"ThreeDNowSupported:\n\t"
		"movl $5, %0\n\n\t"		/* 3DNow! and MMX Supported */
		"jmp Return\n\t"
		/* Intel Section */
		"Intel:\n\t"
		/* Check for MMX */
		"MMXtest:\n\t"
		"movl $1, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported7\n\t"		/* MMX Not supported */
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\t"
		/* Nothing supported */
		"\nNotSupported1:\n\t"
		"#movl $101, %0\n\n\t"
		"\nNotSupported2:\n\t"
		"#movl $102, %0\n\n\t"
		"\nNotSupported3:\n\t"
		"#movl $103, %0\n\n\t"
		"\nNotSupported4:\n\t"
		"#movl $104, %0\n\n\t"
		"\nNotSupported5:\n\t"
		"#movl $105, %0\n\n\t"
		"\nNotSupported6:\n\t"
		"#movl $106, %0\n\n\t"
		"\nNotSupported7:\n\t"
		"#movl $107, %0\n\n\t"
		"movl $0, %0\n\n\t"
		"Return:\n\t"
		: "=m" (rval)
		: /* no input */
		: "eax", "ebx", "ecx", "edx"
	);
	/* Return */
	return(rval);
}

/*	Function to test if mmx instructions are supported...
inline extern 
int
mmx_ok(void)
{
        // Returns 1 if MMX instructions are supported, 0 otherwise 
	return ( mm_support() & 0x1 );
}
*/


