/* $Id: x86.c,v 1.2.2.6 2008/01/26 12:05:25 mikpe Exp $
 * x86-specific code.
 *
 * Copyright (C) 2000-2008  Mikael Pettersson
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libperfctr.h"
#include "arch.h"
#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "libperfctr.h"
#include "arch.h"

//static struct vperfctr_control control;

/* Handle obtained from gperfctr_open() in do_init(); passed to every
   gperfctr_* call in this file. */
static struct gperfctr *gperfctr;
/* Filled in by gperfctr_info() in do_init(); consulted by setup_control()
   (cpu_type) and do_print() (cpu_khz, tsc_to_cpu_mult). */
static struct perfctr_info info;
/* Number of CPUs that are present and not forbidden; computed by
   setup_cpu_logical_map_and_nrcpus(). */
static unsigned int nrcpus;
/* cpu_logical_map[logical index] == kernel CPU number; holds 'nrcpus'
   entries, allocated by setup_cpu_logical_map_and_nrcpus(). */
static unsigned short *cpu_logical_map;
/*
 * Per-CPU sample storage.  do_init() allocates it with
 * offsetof(cpu_state[0]) + nrcpus * sizeof(cpu_state[0]) bytes, so the
 * declared array length of 2 is only a placeholder.
 */
struct gperfctr_state
{				/* no longer defined in or used by the kernel */
  unsigned int nrcpus;
  struct gperfctr_cpu_state cpu_state[2];	/* actually 'nrcpus' */
};
/* Current samples; each entry is refreshed by gperfctr_read() in do_read(). */
static struct gperfctr_state *state;
/* Allocated and formatted like 'state' in do_init(); not otherwise used in
   the code visible here. */
static struct gperfctr_state *prev_state;
/* Set to 1 by setup_control() for CPU types where it selects an
   instruction-count event because FLOPS cannot be counted. */
int counting_mips;
/* Passed to gperfctr_start() in do_enable().
   NOTE(review): units are not stated here - confirm against libperfctr. */
static unsigned long sampling_interval = 10000;	/* XXX: reduce for >4GHz CPUs */
/* Used by do_print() as the elapsed time in seconds when the TSC is off. */
static unsigned int sleep_interval = 5;

/*
 * Data used in the user program
 */

/*
 * Vector of size N of input sizes.
 *
 * Alternative settings tried previously, kept for reference (they are
 * commented out; only the definition below this comment is compiled):
 *
 * int TAB[5] = {512, 1024, 2048, 4096, 8192 };
 * int TAB[5] = {2048, 2048, 2048, 2048, 2048 };
 * int TAB[5] = {8192 ,8192 ,8192 ,8192 ,8192 };
 * int TAB[5] = {65536 ,65536 ,65536 ,65536 ,65536 };
 */

int TAB[5] = { 268435451, 1, 1, 1, 1 };

/*******************************************************/

/*
 * Count the 1 bits in the low 32 bits of 'w'.
 *
 * Adjacent bit fields are added pairwise, doubling the field width on
 * every round (1, 2, 4, 8, 16 bits), so after five rounds the whole
 * word holds the total.
 */
static unsigned int
hweight32 (unsigned int w)
{
  /* lane_mask[k] selects the lower field of every pair of 2^k-bit fields */
  static const unsigned int lane_mask[] = {
    0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
  };
  unsigned int round;

  for (round = 0; round < sizeof lane_mask / sizeof lane_mask[0]; ++round)
    {
      const unsigned int width = 1u << round;

      w = (w & lane_mask[round]) + ((w >> width) & lane_mask[round]);
    }
  return w;
}

/*
 * Derive 'nrcpus' and 'cpu_logical_map' from the CPU masks in 'cpus_info'.
 *
 * A CPU is used when its bit is set in cpus->mask and clear in
 * cpus_forbidden->mask.  'nrcpus' becomes the number of such CPUs and
 * cpu_logical_map[k] the kernel CPU number of the k-th one, in ascending
 * order.  The map is heap-allocated and stored in the file-scope pointer.
 *
 * Exits the process if the map cannot be allocated, and aborts if the
 * number of CPUs found while filling the map disagrees with the count.
 */
static void
setup_cpu_logical_map_and_nrcpus (const struct perfctr_cpus_info *cpus_info)
{
  const unsigned int *cpus, *cpus_forbidden;
  unsigned int nrwords, i, cpumask, bitmask;
  unsigned int logical_cpu_nr, kernel_cpu_nr;

  cpus = cpus_info->cpus->mask;
  cpus_forbidden = cpus_info->cpus_forbidden->mask;
  nrwords = cpus_info->cpus->nrwords;

  /* first pass: count the usable CPUs so the map can be sized */
  nrcpus = 0;
  for (i = 0; i < nrwords; ++i)
    nrcpus += hweight32 (cpus[i] & ~cpus_forbidden[i]);

  /* both values are unsigned, so print them with %u */
  printf ("nrcpus=%u nrwords=%u\n", nrcpus, nrwords);

  cpu_logical_map = malloc (nrcpus * sizeof (cpu_logical_map[0]));
  /* malloc(0) is allowed to return NULL; that is only a failure when a
     non-empty map was requested */
  if (!cpu_logical_map && nrcpus != 0)
    {
      perror ("malloc");
      exit (1);
    }

  /* second pass: record the kernel CPU number of every usable CPU */
  logical_cpu_nr = 0;
  for (i = 0; i < nrwords; ++i)
    {
      cpumask = cpus[i] & ~cpus_forbidden[i];
      /* bit 0 of mask word i corresponds to this kernel CPU number */
      kernel_cpu_nr = i * 8 * sizeof (int);
      /* walk the bits upwards, stopping as soon as none remain set */
      for (bitmask = 1; cpumask != 0; ++kernel_cpu_nr, bitmask <<= 1)
	{
	  if (cpumask & bitmask)
	    {
	      cpumask &= ~bitmask;
	      cpu_logical_map[logical_cpu_nr] = kernel_cpu_nr;
	      ++logical_cpu_nr;
	    }
	}
    }

  /* sanity check: both passes must agree on the number of CPUs */
  if (logical_cpu_nr != nrcpus)
    abort ();
}

/*
 * One-time setup of the global-mode perfctr session.
 *
 * Opens the gperfctr handle, fetches and prints the driver and CPU
 * information, builds the logical CPU map, and allocates zero-filled
 * 'state' and 'prev_state' buffers with one cpu_state entry per usable
 * CPU, each entry tagged with the kernel CPU number it belongs to.
 *
 * Any failure is reported with perror() and terminates the process.
 */
static void
do_init (void)
{
  struct perfctr_cpus_info *cpus_info;
  size_t nbytes;
  unsigned int i;

  gperfctr = gperfctr_open ();
  if (!gperfctr)
    {
      perror ("gperfctr_open");
      exit (1);
    }
  if (gperfctr_info (gperfctr, &info) < 0)
    {
      perror ("gperfctr_info");
      exit (1);
    }
  cpus_info = gperfctr_cpus_info (gperfctr);
  if (!cpus_info)
    {
      /* name the call that actually failed */
      perror ("gperfctr_cpus_info");
      exit (1);
    }
  printf ("\nPerfCtr Info:\n");
  perfctr_info_print (&info);
  perfctr_cpus_info_print (cpus_info);

  /* use all non-forbidden CPUs */

  setup_cpu_logical_map_and_nrcpus (cpus_info);
  free (cpus_info);

  /* now alloc state memory based on nrcpus: the struct header up to
     cpu_state[0], followed by 'nrcpus' cpu_state entries */

  nbytes = offsetof (struct gperfctr_state, cpu_state[0])
    + nrcpus * sizeof (state->cpu_state[0]);
  state = malloc (nbytes);
  prev_state = malloc (nbytes);
  if (!state || !prev_state)
    {
      perror ("malloc");
      exit (1);
    }
  memset (state, 0, nbytes);
  memset (prev_state, 0, nbytes);

  /* format state to indicate which CPUs we want to sample */

  for (i = 0; i < nrcpus; ++i)
    {
      state->cpu_state[i].cpu = cpu_logical_map[i];
      prev_state->cpu_state[i].cpu = cpu_logical_map[i];
    }
  state->nrcpus = nrcpus;
  prev_state->nrcpus = nrcpus;

}

/*
 * Refresh every entry of state->cpu_state[] through gperfctr_read().
 *
 * Returns 0 when all CPUs were read.  On the first failing read the
 * error is reported with perror() and -1 is returned; later entries
 * are left untouched.
 */
static int
do_read ()
{
  unsigned int slot;

  for (slot = 0; slot != state->nrcpus; slot++)
    {
      int rc = gperfctr_read (gperfctr, &state->cpu_state[slot]);

      if (rc >= 0)
	continue;

      perror ("gperfctr_read error on state");
      return -1;
    }

  return 0;
}

/*
 * Write a "Control used:" heading to stdout, then let
 * perfctr_cpu_control_print() print the settings in 'control'.
 */
static void
print_control (const struct perfctr_cpu_control *control)
{
  fputs ("\nControl used:\n", stdout);
  perfctr_cpu_control_print (control);
}

static void
do_enable ()
{
  struct perfctr_cpu_control cpu_control;
  unsigned int i;

  setup_control (&info, &cpu_control);
  print_control (&cpu_control);

  for (i = 0; i < nrcpus; ++i)
    {
      struct gperfctr_cpu_control control;
      control.cpu = cpu_logical_map[i];
      control.cpu_control = cpu_control;
      if (gperfctr_control (gperfctr, &control) < 0)
	{
	  perror ("gperfctr_control");
	  exit (1);
	}
    }
  if (gperfctr_start (gperfctr, sampling_interval) < 0)
    {
      perror ("gperfctr_start");
      exit (1);
    }
}

/*
 * Print the contents of 'state' for every sampled CPU.
 *
 * For each CPU this prints the kernel CPU number, the TSC sum when the
 * TSC is enabled in that CPU's control, and each of its 'nractrs' PMC
 * sums.  When at least one PMC is active it also prints the elapsed
 * time in seconds.
 */
void
do_print ()
{
  unsigned int i;		/* unsigned, like state->nrcpus */
  int cpu, ctr;

  printf ("\nFinal Sample(s):\n");
  for (i = 0; i < state->nrcpus; ++i)
    {
      cpu = state->cpu_state[i].cpu;
      printf ("\nCPU %d:\n", cpu);
      if (state->cpu_state[i].cpu_control.tsc_on)
	{
	  printf ("\ttsc\t\t%lld\n", state->cpu_state[i].sum.tsc);
	}
      for (ctr = 0; ctr < state->cpu_state[i].cpu_control.nractrs; ++ctr)
	{
	  printf ("\tpmc[%d]\t\t%lld\n",
		  ctr, state->cpu_state[i].sum.pmc[ctr]);
	}
      /* after the loop 'ctr' equals the number of active PMCs */
      if (ctr >= 1)
	{
	  /* display the elapsed time; no MFLOP/s or MIP/s rate is
	     computed here */
	  unsigned long long tsc = state->cpu_state[i].sum.tsc;
	  /* a zero multiplier is treated as 1 (spelled out instead of
	     the GNU-only "x ? : y" form) */
	  double tsc_mult = (double) (info.tsc_to_cpu_mult
				      ? info.tsc_to_cpu_mult : 1);
	  double seconds = state->cpu_state[i].cpu_control.tsc_on
	    ? ((double) tsc * tsc_mult / (double) info.cpu_khz) / 1000.0
	    : (double) sleep_interval;	/* don't div-by-0 on WinChip ... */

	  printf ("\tSECONDS\t\t%.15g\n", seconds);
	}
    }
}

 
/*
 * Fill in '*control' for the CPU described by 'info'.
 *
 * 'control' is zeroed first.  The switch picks, per CPU type, whether the
 * TSC is used, how many PMCs are active, and the event programmed into
 * PMC0 (a floating-point event where one is selected, otherwise an
 * instruction-count event, in which case 'counting_mips' is set to 1).
 * Unsupported CPU types print a message to stderr and exit(1).
 *
 * NOTE(review): after the switch, pmc_map[0], evntsel[0], p4.escr[0] and
 * the p4.pebs_* fields are overwritten unconditionally with fixed P4
 * values, so the PMC0 selection made in the switch does not survive for
 * any CPU type.  Confirm this is intended on non-P4 CPUs.
 */
void setup_control(const struct perfctr_info *info,
		   struct perfctr_cpu_control *control)
{
    unsigned int tsc_on = 1;
    unsigned int nractrs = 1;
    unsigned int pmc_map0 = 0;
    unsigned int evntsel0 = 0;

    memset(control, 0, sizeof *control);

    /* Attempt to set up control to count clocks via the TSC
       and FLOPS via PMC0. */
    switch (info->cpu_type) {
      case PERFCTR_X86_GENERIC:
	nractrs = 0;		/* no PMCs available */
	break;
      case PERFCTR_X86_AMD_K8:
      case PERFCTR_X86_AMD_K8C:
      case PERFCTR_X86_AMD_FAM10H:
	/* RETIRED_FPU_INSTRS, Unit Mask "x87 instrs", any CPL, Enable */
	evntsel0 = 0xCB | (0x01 << 8) | (3 << 16) | (1 << 22);
	break;
#if !defined(__x86_64__)
      case PERFCTR_X86_INTEL_P5:
      case PERFCTR_X86_INTEL_P5MMX:
      case PERFCTR_X86_CYRIX_MII:
	/* event 0x22 (FLOPS), any CPL */
	evntsel0 = 0x22 | (3 << 6);
	break;
      case PERFCTR_X86_INTEL_P6:
      case PERFCTR_X86_INTEL_PII:
      case PERFCTR_X86_INTEL_PIII:
      case PERFCTR_X86_INTEL_PENTM:
      case PERFCTR_X86_INTEL_CORE:
	/* note: FLOPS is only available in PERFCTR0 */
	/* event 0xC1 (FLOPS), any CPL, Enable */
	evntsel0 = 0xC1 | (3 << 16) | (1 << 22);
	break;
#endif
      case PERFCTR_X86_INTEL_CORE2:
	/* event 0xC1 umask 0xFE (X87_OPS_RETIRED_ANY), any CPL, Enable */
	evntsel0 = 0xC1 | (0xFE << 8) | (3 << 16) | (1 << 22);
	break;
#if !defined(__x86_64__)
      case PERFCTR_X86_AMD_K7:
	/* K7 apparently can't count FLOPS. */
	counting_mips = 1;
	/* event 0xC0 (RETIRED_INSTRUCTIONS), any CPL, Enable */
	evntsel0 = 0xC0 | (3 << 16) | (1 << 22);
	break;
      case PERFCTR_X86_WINCHIP_C6:
	counting_mips = 1;	/* can't count FLOPS */
	tsc_on = 0;		/* no working TSC available */
	evntsel0 = 0x02;	/* X86_INSTRUCTIONS */
	break;
      case PERFCTR_X86_WINCHIP_2:
	counting_mips = 1;	/* can't count FLOPS */
	tsc_on = 0;		/* no working TSC available */
	evntsel0 = 0x16;	/* INSTRUCTIONS_EXECUTED */
	break;
      case PERFCTR_X86_VIA_C3:
	counting_mips = 1;	/* can't count FLOPS */
	pmc_map0 = 1;		/* redirect PMC0 to PERFCTR1 */
	evntsel0 = 0xC0;	/* INSTRUCTIONS_EXECUTED */
	break;
      case PERFCTR_X86_INTEL_P4:
      case PERFCTR_X86_INTEL_P4M2:
#endif
      case PERFCTR_X86_INTEL_P4M3:
	nractrs = 2;
	/* Bit 31 is set with 1u << 31: shifting a signed 1 into the sign
	   bit of int is undefined behaviour. */
	/* set up PMC(1) to produce tagged x87_FP_uop:s */
	control->pmc_map[1] = 0x8 | (1u << 31);
	control->evntsel[1] = (0x3 << 16) | (1 << 13) | (1 << 12);
	control->p4.escr[1] = (4 << 25) | (1 << 24) | (1 << 5) | (1 << 4) | (1 << 2);
        /* set up PMC(0) to count execution_event(X87_FP_retired) */
	pmc_map0 = 0xC | (1u << 31);
	evntsel0 = (0x3 << 16) | (5 << 13) | (1 << 12);
	control->p4.escr[0] = (0xC << 25) | (1 << 9) | (1 << 2);

	break;
      default:
	fprintf(stderr, "cpu_type %u (%s) not supported\n",
		info->cpu_type, perfctr_info_cpu_name(info));
	exit(1);
    }
    control->tsc_on = tsc_on;
    control->nractrs = nractrs;
    control->pmc_map[0] = pmc_map0;
    control->evntsel[0] = evntsel0;

    /* Fixed override of PMC0, applied for every CPU type (see the review
       note in the header comment).  According to the note further down,
       these values select "L1 cache read misses on P4". */
    control->pmc_map[0] = 0x8000000C;
    control->evntsel[0] = 0x0003B000;
    control->p4.escr[0] = 0x12000204;
    control->p4.pebs_enable = 0x01000001;
    control->p4.pebs_matrix_vert = 0x00000001;

// Reference table of P4 events, kept from the original source.
// Name:EvntSel:CounterSet:DefaultUnitMask
      //P4_TC_deliver_mode:0x01:0x13:0x1
      //P4_BPU_fetch_request:0x03:0x1:0x0
      //P4_ITLB_reference:0x18:0xC:0x7
      //P4_memory_cancel:0x02:0x7:0xC
      //P4_memory_complete:0x08:0x11:0x3
      //P4_load_port_replay:0x04:0x11:0x2
      //P4_store_port_replay:0x05:0x11:0x2
      //P4_MOB_load_replay:0x03:0xD:0x3A
      //P4_page_walk_type:0x01:0xF:0x3
      //P4_BSQ_cache_reference:0x0C:0x2:0x73F
      //P4_IOQ_allocation:0x03:0x9:0xEFE1
      //P4_IOQ_active_entries:0x1A:0xB:0xEFE1
      //P4_FSB_data_activity:0x17:0x9:0x1B
      //P4_BSQ_allocation:0x05:0x3:0x21
      //P4_bsq_active_entries:0x06:0x4:0x21
      //P4_SSE_input_assist:0x34:0x8:0x8000
      //P4_packed_SP_uop:0x08:0x8:0x8000
      //P4_packed_DP_uop:0x0C:0x8:0x8000
      //P4_scalar_SP_uop:0x0A:0x8:0x8000
      //P4_scalar_DP_uop:0x0E:0x8:0x8000
      //P4_64bit_MMX_uop:0x02:0x8:0x8000
      //P4_128bit_MMX_uop:0x1A:0x8:0x8000
      //P4_x87_FP_uop:0x04:0x8:0x8000
      //P4_x87_SIMD_moves_uop:0x2E:0x8:0x18
      //P4_TC_misc:0x06:0x13:0x10
      //P4_global_power_events:0x13:0x9:0x1
      //P4_tc_ms_xfer:0x05:0xE:0x1
      //P4_uop_queue_writes:0x09:0xE:0x7
      //P4_retired_mispred_branch_type:0x05:0x12:0x1E
      //P4_retired_branch_type:0x04:0x12:0x1E
      //P4_resource_stall:0x01:0x0:0x20
      //P4_WC_Buffer:0x05:0x7:0x1
      //P4_b2b_cycles:0x16:0x9:0x7E
      //P4_bnr:0x08:0x9:0x7
      //P4_snoop:0x06:0x9:0xC4
      //P4_response:0x04:0x9:0x306
      //P4_front_end_event:0x08:0x6:0x1
      //P4_execution_event:0x0C:0x6:0x1
      //P4_replay_event:0x09:0x6:0x1
      //P4_instr_retired:0x02:0x5:0x1
      //P4_uops_retired:0x01:0x5:0x1
      //P4_uop_type:0x02:0x10:0x6
      //P4_branch_retired:0x06:0x6:0xC
      //P4_mispred_branch_retired:0x03:0x5:0x1
      //P4_x87_assist:0x03:0x6:0x1F
      //P4_machine_clear:0x02:0x6:0x1
      //P4M3_instr_completed:0x07:0x5:0x1
    //control->pmc_map[0] = (unsigned int)0x5;
    //control->evntsel[0] = (unsigned int)0x07;
    //control->p4.escr[0] = (unsigned int)0x1;
    /********************************************************/
    // THIS IS WHERE EVERYTHING IS DECIDED.
    // See the data structure in perfctr.h, which is located under
    // linux/include/asm-x86 or asm-x8664
    /* (L1 cache read misses on P4) */
    /* 0x0003B000/0x12000204@0x8000000C */

    //control->pmc_map[1] = 0x8000000C;
    //control->evntsel[1] = 0x0003B000;
    //control->p4.escr[1] = 0x12000204;
    //control->p4.pebs_enable = 0x01000001;
    //control->p4.pebs_matrix_vert = 0x00000001;

}
