1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
|
/* SPDX-License-Identifier: GPL-2.0 */
/*
* If TRACE_SYSTEM is defined, that will be the directory created
* in the ftrace directory under /sys/kernel/tracing/events/<system>
*
* The define_trace.h below will also look for a file name of
* TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
* In this case, it would look for sample-trace.h
*
* If the header name will be different than the system name
* (as in this case), then you can override the header name that
* define_trace.h will look up by defining TRACE_INCLUDE_FILE
*
* This file is called trace-events-sample.h but we want the system
* to be called "sample-trace". Therefore we must define the name of this
* file:
*
* #define TRACE_INCLUDE_FILE trace-events-sample
*
* As we do an the bottom of this file.
*
* Notice that TRACE_SYSTEM should be defined outside of #if
* protection, just like TRACE_INCLUDE_FILE.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sample-trace
/*
* TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric
* and underscore), although it may start with numbers. If for some
* reason it is not, you need to add the following lines:
*/
#undef TRACE_SYSTEM_VAR
#define TRACE_SYSTEM_VAR sample_trace
/*
* But the above is only needed if TRACE_SYSTEM is not alpha-numeric
* and underscored. By default, TRACE_SYSTEM_VAR will be equal to
* TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
* TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
* only alpha-numeric and underscores.
*
* The TRACE_SYSTEM_VAR is only used internally and not visible to
* user space.
*/
/*
* Notice that this file is not protected like a normal header.
* We also must allow for rereading of this file. The
*
* || defined(TRACE_HEADER_MULTI_READ)
*
* serves this purpose.
*/
#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EVENT_SAMPLE_H
/*
* All trace headers should include tracepoint.h, until we finally
* make it into a standard header.
*/
#include <linux/tracepoint.h>
/*
* The TRACE_EVENT macro is broken up into 5 parts.
*
* name: name of the trace point. This is also how to enable the tracepoint.
* A function called trace_foo_bar() will be created.
*
* proto: the prototype of the function trace_foo_bar()
* Here it is trace_foo_bar(char *foo, int bar).
*
* args: must match the arguments in the prototype.
* Here it is simply "foo, bar".
*
* struct: This defines the way the data will be stored in the ring buffer.
* The items declared here become part of a special structure
* called "__entry", which can be used in the fast_assign part of the
* TRACE_EVENT macro.
*
* Here are the currently defined types you can use:
*
* __field : Is broken up into type and name. Where type can be any
* primitive type (integer, long or pointer).
*
* __field(int, foo)
*
* __entry->foo = 5;
*
* __field_struct : This can be any static complex data type (struct, union
* but not an array). Be careful using complex types, as each
* event is limited in size, and copying large amounts of data
* into the ring buffer can slow things down.
*
* __field_struct(struct bar, foo)
*
* __entry->bar.x = y;
* __array: There are three fields (type, name, size). The type is the
* type of elements in the array, the name is the name of the array.
* size is the number of items in the array (not the total size).
*
* __array( char, foo, 10) is the same as saying: char foo[10];
*
* Assigning arrays can be done like any array:
*
* __entry->foo[0] = 'a';
*
* memcpy(__entry->foo, bar, 10);
*
* __dynamic_array: This is similar to array, but can vary its size from
* instance to instance of the tracepoint being called.
* Like __array, this too has three elements (type, name, size);
* type is the type of the element, name is the name of the array.
* The size is different than __array. It is not a static number,
* but the algorithm to figure out the length of the array for the
* specific instance of tracepoint. Again, size is the number of
* items in the array, not the total length in bytes.
*
* __dynamic_array( int, foo, bar) is similar to: int foo[bar];
*
* Note, unlike arrays, you must use the __get_dynamic_array() macro
* to access the array.
*
* memcpy(__get_dynamic_array(foo), bar, 10);
*
* Notice, that "__entry" is not needed here.
*
* __string: This is a special kind of __dynamic_array. It expects to
* have a null terminated character array passed to it (it allows
* for NULL too, which would be converted into "(null)"). __string
* takes two parameter (name, src), where name is the name of
* the string saved, and src is the string to copy into the
* ring buffer.
*
* __string(foo, bar) is similar to: strcpy(foo, bar)
*
* To assign a string, use the helper macro __assign_str().
*
* __assign_str(foo);
*
* The __string() macro saves off the string that is passed into
* the second parameter, and the __assign_str() will store than
* saved string into the "foo" field.
*
* __vstring: This is similar to __string() but instead of taking a
* dynamic length, it takes a variable list va_list 'va' variable.
* Some event callers already have a message from parameters saved
* in a va_list. Passing in the format and the va_list variable
* will save just enough on the ring buffer for that string.
* Note, the va variable used is a pointer to a va_list, not
* to the va_list directly.
*
* (va_list *va)
*
* __vstring(foo, fmt, va) is similar to: vsnprintf(foo, fmt, va)
*
* To assign the string, use the helper macro __assign_vstr().
*
* __assign_vstr(foo, fmt, va);
*
* In most cases, the __assign_vstr() macro will take the same
* parameters as the __vstring() macro had to declare the string.
* Use __get_str() to retrieve the __vstring() just like it would for
* __string().
*
* __string_len: This is a helper to a __dynamic_array, but it understands
* that the array has characters in it, it will allocate 'len' + 1 bytes
* in the ring buffer and add a '\0' to the string. This is
* useful if the string being saved has no terminating '\0' byte.
* It requires that the length of the string is known as it acts
* like a memcpy().
*
* Declared with:
*
* __string_len(foo, bar, len)
*
* To assign this string, use the helper macro __assign_str().
* The length is saved via the __string_len() and is retrieved in
* __assign_str().
*
* __assign_str(foo);
*
* Then len + 1 is allocated to the ring buffer, and a nul terminating
* byte is added. This is similar to:
*
* memcpy(__get_str(foo), bar, len);
* __get_str(foo)[len] = 0;
*
* The advantage of using this over __dynamic_array, is that it
* takes care of allocating the extra byte on the ring buffer
* for the '\0' terminating byte, and __get_str(foo) can be used
* in the TP_printk().
*
* __bitmask: This is another kind of __dynamic_array, but it expects
* an array of longs, and the number of bits to parse. It takes
* two parameters (name, nr_bits), where name is the name of the
* bitmask to save, and the nr_bits is the number of bits to record.
*
* __bitmask(target_cpu, nr_cpumask_bits)
*
* To assign a bitmask, use the __assign_bitmask() helper macro.
*
* __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits);
*
* __cpumask: This is pretty much the same as __bitmask but is specific for
* CPU masks. The type displayed to the user via the format files will
* be "cpumaks_t" such that user space may deal with them differently
* if they choose to do so, and the bits is always set to nr_cpumask_bits.
*
* __cpumask(target_cpu)
*
* To assign a cpumask, use the __assign_cpumask() helper macro.
*
* __assign_cpumask(target_cpus, cpumask_bits(bar));
*
* fast_assign: This is a C like function that is used to store the items
* into the ring buffer. A special variable called "__entry" will be the
* structure that points into the ring buffer and has the same fields as
* described by the struct part of TRACE_EVENT above.
*
* printk: This is a way to print out the data in pretty print. This is
* useful if the system crashes and you are logging via a serial line,
* the data can be printed to the console using this "printk" method.
* This is also used to print out the data from the trace files.
* Again, the __entry macro is used to access the data from the ring buffer.
*
* Note, __dynamic_array, __string, __bitmask and __cpumask require special
* helpers to access the data.
*
* For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo)
* Use __get_dynamic_array_len(foo) to get the length of the array
* saved. Note, __get_dynamic_array_len() returns the total allocated
* length of the dynamic array; __print_array() expects the second
* parameter to be the number of elements. To get that, the array length
* needs to be divided by the element size.
*
* For __string(foo, bar) use __get_str(foo)
*
* For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus)
*
* For __cpumask(target_cpus) use __get_cpumask(target_cpus)
*
*
* Note, that for both the assign and the printk, __entry is the handler
* to the data structure in the ring buffer, and is defined by the
* TP_STRUCT__entry.
*/
/*
* It is OK to have helper functions in the file, but they need to be protected
* from being defined more than once. Remember, this file gets included more
* than once.
*/
#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
static inline int __length_of(const int *list)
{
int i;
if (!list)
return 0;
for (i = 0; list[i]; i++)
;
return i;
}
enum {
TRACE_SAMPLE_FOO = 2,
TRACE_SAMPLE_BAR = 4,
TRACE_SAMPLE_ZOO = 8,
};
#endif
/*
* If enums are used in the TP_printk(), their names will be shown in
* format files and not their values. This can cause problems with user
* space programs that parse the format files to know how to translate
* the raw binary trace output into human readable text.
*
* To help out user space programs, any enum that is used in the TP_printk()
* should be defined by TRACE_DEFINE_ENUM() macro. All that is needed to
* be done is to add this macro with the enum within it in the trace
* header file, and it will be converted in the output.
*/
TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO);
TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR);
TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO);
TRACE_EVENT(foo_bar,
TP_PROTO(const char *foo, int bar, const int *lst,
const char *string, const struct cpumask *mask,
const char *fmt, va_list *va),
TP_ARGS(foo, bar, lst, string, mask, fmt, va),
TP_STRUCT__entry(
__array( char, foo, 10 )
__field( int, bar )
__dynamic_array(int, list, __length_of(lst))
__string( str, string )
__bitmask( cpus, num_possible_cpus() )
__cpumask( cpum )
__vstring( vstr, fmt, va )
__string_len( lstr, foo, bar / 2 < strlen(foo) ? bar / 2 : strlen(foo) )
),
TP_fast_assign(
strscpy(__entry->foo, foo, 10);
__entry->bar = bar;
memcpy(__get_dynamic_array(list), lst,
__length_of(lst) * sizeof(int));
__assign_str(str);
__assign_str(lstr);
__assign_vstr(vstr, fmt, va);
__assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
__assign_cpumask(cpum, cpumask_bits(mask));
),
TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
/*
* Notice here the use of some helper functions. This includes:
*
* __print_symbolic( variable, { value, "string" }, ... ),
*
* The variable is tested against each value of the { } pair. If
* the variable matches one of the values, then it will print the
* string in that pair. If non are matched, it returns a string
* version of the number (if __entry->bar == 7 then "7" is returned).
*/
__print_symbolic(__entry->bar,
{ 0, "zero" },
{ TRACE_SAMPLE_FOO, "TWO" },
{ TRACE_SAMPLE_BAR, "FOUR" },
{ TRACE_SAMPLE_ZOO, "EIGHT" },
{ 10, "TEN" }
),
/*
* __print_flags( variable, "delim", { value, "flag" }, ... ),
*
* This is similar to __print_symbolic, except that it tests the bits
* of the value. If ((FLAG & variable) == FLAG) then the string is
* printed. If more than one flag matches, then each one that does is
* also printed with delim in between them.
* If not all bits are accounted for, then the not found bits will be
* added in hex format: 0x506 will show BIT2|BIT4|0x500
*/
__print_flags(__entry->bar, "|",
{ 1, "BIT1" },
{ 2, "BIT2" },
{ 4, "BIT3" },
{ 8, "BIT4" }
),
/*
* __print_array( array, len, element_size )
*
* This prints out the array that is defined by __array in a nice format.
*/
__print_array(__get_dynamic_array(list),
__get_dynamic_array_len(list) / sizeof(int),
sizeof(int)),
/* A shortcut is to use __print_dynamic_array for dynamic arrays */
__print_dynamic_array(list, sizeof(int)),
__get_str(str), __get_str(lstr),
__get_bitmask(cpus), __get_cpumask(cpum),
__get_str(vstr))
);
/*
* There may be a case where a tracepoint should only be called if
* some condition is set. Otherwise the tracepoint should not be called.
* But to do something like:
*
* if (cond)
* trace_foo();
*
* Would cause a little overhead when tracing is not enabled, and that
* overhead, even if small, is not something we want. As tracepoints
* use static branch (aka jump_labels), where no branch is taken to
* skip the tracepoint when not enabled, and a jmp is placed to jump
* to the tracepoint code when it is enabled, having a if statement
* nullifies that optimization. It would be nice to place that
* condition within the static branch. This is where TRACE_EVENT_CONDITION
* comes in.
*
* TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another
* parameter just after args. Where TRACE_EVENT has:
*
* TRACE_EVENT(name, proto, args, struct, assign, printk)
*
* the CONDITION version has:
*
* TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk)
*
* Everything is the same as TRACE_EVENT except for the new cond. Think
* of the cond variable as:
*
* if (cond)
* trace_foo_bar_with_cond();
*
* Except that the logic for the if branch is placed after the static branch.
* That is, the if statement that processes the condition will not be
* executed unless that traecpoint is enabled. Otherwise it still remains
* a nop.
*/
TRACE_EVENT_CONDITION(foo_bar_with_cond,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
TP_CONDITION(!(bar % 10)),
TP_STRUCT__entry(
__string( foo, foo )
__field( int, bar )
),
TP_fast_assign(
__assign_str(foo);
__entry->bar = bar;
),
TP_printk("foo %s %d", __get_str(foo), __entry->bar)
);
int foo_bar_reg(void);
void foo_bar_unreg(void);
/*
* Now in the case that some function needs to be called when the
* tracepoint is enabled and/or when it is disabled, the
* TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT()
* but adds two more parameters at the end:
*
* TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg)
*
* reg and unreg are functions with the prototype of:
*
* void reg(void)
*
* The reg function gets called before the tracepoint is enabled, and
* the unreg function gets called after the tracepoint is disabled.
*
* Note, reg and unreg are allowed to be NULL. If you only need to
* call a function before enabling, or after disabling, just set one
* function and pass in NULL for the other parameter.
*/
TRACE_EVENT_FN(foo_bar_with_fn,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
TP_STRUCT__entry(
__string( foo, foo )
__field( int, bar )
),
TP_fast_assign(
__assign_str(foo);
__entry->bar = bar;
),
TP_printk("foo %s %d", __get_str(foo), __entry->bar),
foo_bar_reg, foo_bar_unreg
);
/*
* Each TRACE_EVENT macro creates several helper functions to produce
* the code to add the tracepoint, create the files in the trace
* directory, hook it to perf, assign the values and to print out
* the raw data from the ring buffer. To prevent too much bloat,
* if there are more than one tracepoint that uses the same format
* for the proto, args, struct, assign and printk, and only the name
* is different, it is highly recommended to use the DECLARE_EVENT_CLASS
*
* DECLARE_EVENT_CLASS() macro creates most of the functions for the
* tracepoint. Then DEFINE_EVENT() is use to hook a tracepoint to those
* functions. This DEFINE_EVENT() is an instance of the class and can
* be enabled and disabled separately from other events (either TRACE_EVENT
* or other DEFINE_EVENT()s).
*
* Note, TRACE_EVENT() itself is simply defined as:
*
* #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \
* DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \
* DEFINE_EVENT(name, name, proto, args)
*
* The DEFINE_EVENT() also can be declared with conditions and reg functions:
*
* DEFINE_EVENT_CONDITION(template, name, proto, args, cond);
* DEFINE_EVENT_FN(template, name, proto, args, reg, unreg);
*/
DECLARE_EVENT_CLASS(foo_template,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
TP_STRUCT__entry(
__string( foo, foo )
__field( int, bar )
),
TP_fast_assign(
__assign_str(foo);
__entry->bar = bar;
),
TP_printk("foo %s %d", __get_str(foo), __entry->bar)
);
/*
* Here's a better way for the previous samples (except, the first
* example had more fields and could not be used here).
*/
DEFINE_EVENT(foo_template, foo_with_template_simple,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar));
DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
TP_CONDITION(!(bar % 8)));
DEFINE_EVENT_FN(foo_template, foo_with_template_fn,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
foo_bar_reg, foo_bar_unreg);
/*
* Anytime two events share basically the same values and have
* the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT()
* when ever possible.
*/
/*
* If the event is similar to the DECLARE_EVENT_CLASS, but you need
* to have a different output, then use DEFINE_EVENT_PRINT() which
* lets you override the TP_printk() of the class.
*/
DEFINE_EVENT_PRINT(foo_template, foo_with_template_print,
TP_PROTO(const char *foo, int bar),
TP_ARGS(foo, bar),
TP_printk("bar %s %d", __get_str(foo), __entry->bar));
/*
* There are yet another __rel_loc dynamic data attribute. If you
* use __rel_dynamic_array() and __rel_string() etc. macros, you
* can use this attribute. There is no difference from the viewpoint
* of functionality with/without 'rel' but the encoding is a bit
* different. This is expected to be used with user-space event,
* there is no reason that the kernel event use this, but only for
* testing.
*/
TRACE_EVENT(foo_rel_loc,
TP_PROTO(const char *foo, int bar, unsigned long *mask, const cpumask_t *cpus),
TP_ARGS(foo, bar, mask, cpus),
TP_STRUCT__entry(
__rel_string( foo, foo )
__field( int, bar )
__rel_bitmask( bitmask,
BITS_PER_BYTE * sizeof(unsigned long) )
__rel_cpumask( cpumask )
),
TP_fast_assign(
__assign_rel_str(foo);
__entry->bar = bar;
__assign_rel_bitmask(bitmask, mask,
BITS_PER_BYTE * sizeof(unsigned long));
__assign_rel_cpumask(cpumask, cpus);
),
TP_printk("foo_rel_loc %s, %d, %s, %s", __get_rel_str(foo), __entry->bar,
__get_rel_bitmask(bitmask),
__get_rel_cpumask(cpumask))
);
#endif
/***** NOTICE! The #if protection ends here. *****/
/*
* There are several ways I could have done this. If I left out the
* TRACE_INCLUDE_PATH, then it would default to the kernel source
* include/trace/events directory.
*
* I could specify a path from the define_trace.h file back to this
* file.
*
* #define TRACE_INCLUDE_PATH ../../samples/trace_events
*
* But the safest and easiest way to simply make it use the directory
* that the file is in is to add in the Makefile:
*
* CFLAGS_trace-events-sample.o := -I$(src)
*
* This will make sure the current path is part of the include
* structure for our file so that define_trace.h can find it.
*
* I could have made only the top level directory the include:
*
* CFLAGS_trace-events-sample.o := -I$(PWD)
*
* And then let the path to this directory be the TRACE_INCLUDE_PATH:
*
* #define TRACE_INCLUDE_PATH samples/trace_events
*
* But then if something defines "samples" or "trace_events" as a macro
* then we could risk that being converted too, and give us an unexpected
* result.
*/
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
/*
* TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
*/
#define TRACE_INCLUDE_FILE trace-events-sample
#include <trace/define_trace.h>
|