[PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

From: Jaswinder Singh Rajput
Date: Thu Jul 02 2009 - 05:45:50 EST



This output is from an AMD box:

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

4218 vec-adds (scaled from 66.60%)
7426 vec-muls (scaled from 66.67%)
5441 vec-divs (scaled from 66.29%)
821982187 vec-idle-cycles (scaled from 66.45%)
2681 vec-stall-cycles (scaled from 67.11%)
7887 vec-ops (scaled from 66.88%)

0.417614573 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3

Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':

17552264 vec-adds (scaled from 66.28%)
19715258 vec-muls (scaled from 66.63%)
15862733 vec-divs (scaled from 66.82%)
23735187095 vec-idle-cycles (scaled from 66.89%)
11353159 vec-stall-cycles (scaled from 66.90%)
36628571 vec-ops (scaled from 66.48%)

298.350012843 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv

Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':

20177177044 vec-adds (scaled from 66.63%)
34101687027 vec-muls (scaled from 66.64%)
3984060862 vec-divs (scaled from 66.71%)
26349684710 vec-idle-cycles (scaled from 66.65%)
9052001905 vec-stall-cycles (scaled from 66.66%)
76440734242 vec-ops (scaled from 66.71%)

272.523058097 seconds time elapsed

$ ./perf list shows vector events like:

vec-adds OR add [Hardware vector event]
vec-muls OR multiply [Hardware vector event]
vec-divs OR divide [Hardware vector event]
vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
vec-ops OR vec-operations [Hardware vector event]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@xxxxxxxxx>
---
arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++
include/linux/perf_counter.h | 15 ++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 55 ++++++++++++++++++++++++++++++++++++
4 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 36c3dc7..48f28b7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,22 @@ static const u64 atom_hw_cache_event_ids
},
};

+/*
+ * Generalized hw vectored co-processor event table
+ */
+
+static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX]; /* active table: filled by memcpy() from a per-CPU table in the *_pmu_init() paths */
+
+static const u64 nehalem_hw_vector_event_ids[] = /* indexed by PERF_COUNT_HW_VECTOR_* */
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0; NOTE(review): by name this looks like all port-0 uops, not only adds - confirm */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0214, /* ARITH.MUL */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY; NOTE(review): by name a cycle count, not a divide count - confirm */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0, /* 0: no event mapped here */
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
+};
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +497,17 @@ static const u64 amd_hw_cache_event_ids
},
};

+static const u64 amd_hw_vector_event_ids[] = /* indexed by PERF_COUNT_HW_VECTOR_* */
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0400, /* Dispatched FPU Store; NOTE(review): a store event standing in for divides - confirm this is intended */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0FCB, /* Retired x87|(MMX & 3Dnow)
+ |(SSE & SSE2) Instructions */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +686,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+/* Add the raw event code for generic vector id attr->config to hwc->config; returns 0, -EINVAL (id out of range) or -ENOENT (no event mapped). */
+static inline int
+set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_VECTOR_MAX)
+ return -EINVAL;
+ /* A zero table entry means no event is mapped on this CPU; OR-ing it in is a no-op, so hwc->config is untouched on failure. */
+ hwc->config |= hw_vector_event_ids[attr->config];
+ return hw_vector_event_ids[attr->config] ? 0 : -ENOENT;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +754,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_VECTOR)
+ return set_hw_vector_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1444,6 +1485,8 @@ static int intel_pmu_init(void)
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
@@ -1468,6 +1511,8 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7..e91b712 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_VECTOR = 5,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +90,20 @@ enum perf_hw_cache_op_result_id {
};

/*
+ * Generalized hardware vectored co-processor counters:
+ */
+enum perf_hw_vector_id { /* attr->config values for PERF_TYPE_HW_VECTOR */
+ PERF_COUNT_HW_VECTOR_ADD = 0, /* perf tool name: vec-adds */
+ PERF_COUNT_HW_VECTOR_MULTIPLY = 1, /* perf tool name: vec-muls */
+ PERF_COUNT_HW_VECTOR_DIVIDE = 2, /* perf tool name: vec-divs */
+ PERF_COUNT_HW_VECTOR_IDLE_CYCLES = 3, /* perf tool name: vec-idle-cycles */
+ PERF_COUNT_HW_VECTOR_STALL_CYCLES = 4, /* perf tool name: vec-stall-cycles */
+ PERF_COUNT_HW_VECTOR_OPS = 5, /* perf tool name: vec-ops */
+
+ PERF_COUNT_HW_VECTOR_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50d..dd3848a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3838,6 +3838,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_VECTOR:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5184959..8213dfb 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,17 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};

+#define CHVECTOR(x) .type = PERF_TYPE_HW_VECTOR, .config = PERF_COUNT_HW_VECTOR_##x /* .type/.config initializers for one vector entry */
+
+static struct event_symbol vector_event_symbols[] = { /* event_name() indexes this by config: keep entries in PERF_COUNT_HW_VECTOR_* order */
+ { CHVECTOR(ADD), "vec-adds", "add" },
+ { CHVECTOR(MULTIPLY), "vec-muls", "multiply" },
+ { CHVECTOR(DIVIDE), "vec-divs", "divide" },
+ { CHVECTOR(IDLE_CYCLES), "vec-idle-cycles", "vec-empty-cycles"},
+ { CHVECTOR(STALL_CYCLES), "vec-stall-cycles", "vec-busy-cycles"},
+ { CHVECTOR(OPS), "vec-ops", "vec-operations"},
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -172,6 +183,11 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}

+ case PERF_TYPE_HW_VECTOR:
+ if (config < PERF_COUNT_HW_VECTOR_MAX)
+ return vector_event_symbols[config].symbol;
+ return "unknown-vector";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -280,6 +296,21 @@ static int check_events(const char *str, unsigned int i)
return 0;
}

+/* Return the length of entry i's symbol (tried first) or alias if it is a prefix of str, else 0. */
+static int check_vector_events(const char *str, unsigned int i)
+{
+ const char *sym = vector_event_symbols[i].symbol;
+ const char *alt = vector_event_symbols[i].alias;
+ int len = strlen(sym);
+
+ if (strncmp(str, sym, len) == 0)
+ return len;
+ len = strlen(alt); /* an empty alias never matches */
+ if (len && strncmp(str, alt, len) == 0)
+ return len;
+ return 0;
+}
+
static int
parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
{
@@ -296,6 +327,17 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
return 1;
}
}
+
+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++) {
+ n = check_vector_events(str, i);
+ if (n > 0) {
+ attr->type = vector_event_symbols[i].type;
+ attr->config = vector_event_symbols[i].config;
+ *strp = str + n;
+ return 1;
+ }
+ }
+
return 0;
}

@@ -420,6 +462,7 @@ static const char * const event_type_descriptors[] = {
"Software event",
"Tracepoint event",
"Hardware cache event",
+ "Hardware vector event",
};

/*
@@ -468,6 +511,18 @@ void print_events(void)
}

fprintf(stderr, "\n");
+ syms = vector_event_symbols;
+ type = syms->type;
+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++, syms++) {
+ if (strlen(syms->alias))
+ sprintf(name, "%s OR %s", syms->symbol, syms->alias);
+ else
+ strcpy(name, syms->symbol);
+ fprintf(stderr, " %-40s [%s]\n", name,
+ event_type_descriptors[type]);
+ }
+
+ fprintf(stderr, "\n");
fprintf(stderr, " %-40s [raw hardware event descriptor]\n",
"rNNN");
fprintf(stderr, "\n");
--
1.6.0.6



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/