source: trunk/kStuff/kProfiler2/prfamd64msc.asm@ 3703

Last change on this file since 3703 was 3609, checked in by bird, 18 years ago

keywords

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 13.0 KB
Line 
1; $Id: prfamd64msc.asm 3609 2007-10-29 01:11:39Z bird $;
2;; @file
3; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
4;
5
6;
7; Copyright (c) 2006-2007 knut st. osmundsen <bird-src-spam@anduin.net>
8;
9; This file is part of kProfiler.
10;
11; kProfiler is free software; you can redistribute it and/or
12; modify it under the terms of the GNU Lesser General Public
13; License as published by the Free Software Foundation; either
14; version 2.1 of the License, or (at your option) any later version.
15;
16; kProfiler is distributed in the hope that it will be useful,
17; but WITHOUT ANY WARRANTY; without even the implied warranty of
18; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19; Lesser General Public License for more details.
20;
21; You should have received a copy of the GNU Lesser General Public
22; License along with kProfiler; if not, write to the Free Software
23; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;
25
26
;
; Calibration state shared by the hooks (both start out as zero).
;
[section .data]

; Tested by common_return_path before it calls calibrate; set to 1 by calibrate.
g_fCalibrated:  dd      0

; Tick adjustment that common_overhead applies to the saved entry timestamp;
; tuned by calibrate.
g_OverheadAdj:  dd      0
33
[section .text]

; Routines called by the hooks but not defined in this file.
extern  KPRF_ENTER
extern  KPRF_LEAVE

; The two hook entry points.
global  _penter
global  _pexit

; Internal labels of the return path and the calibration code. They are
; exported unconditionally because the guard around the list is commented out.
;ifdef UNDEFINED
global  common_return_path
global  common_overhead
global  common_no_overhead
global  calibrate
global  calib_inner_update_minimum
global  calib_inner_next
global  calib_outer_dec
global  calib_outer_inc
global  calib_done
global  calib_nullproc
;endif
54
55
;;
; _penter - function entry hook for code built with the Microsoft compiler
; (AMD64).
;
; Saves rax, rdx, rflags, rcx and r8-r11, samples the TSC, calls KPRF_ENTER
; and leaves through common_return_path, which restores everything pushed
; here and returns to the instrumented function.
; NOTE(review): no XMM register is saved on this path - presumably KPRF_ENTER
; never touches them; confirm.
;
; Stack layout once the frame has been set up:
;   rsp + 68h   return address into the instrumented function
;   rsp + 60h   rax
;   rsp + 58h   rdx
;   rsp + 50h   rflags
;   rsp + 48h   rcx
;   rsp + 40h   r8
;   rsp + 38h   r9
;   rsp + 30h   r10
;   rsp + 28h   r11
;   rsp + 20h   entry timestamp (read again by common_return_path)
;   rsp + 00h   20h bytes of spill area for the callee
;
; Arguments handed to KPRF_ENTER:
;   rcx   the return address, used as the function address
;   rdx   frame pointer estimate (see the adjustment below)
;   r8    64-bit timestamp
;

; Size of the call instruction that brought us here.
%define PENTER_CALL_SIZE        5
; First two bytes (read as a little endian word) of 'sub rsp, imm8' / 'sub rsp, imm32'.
%define PENTER_OPC_IMM8         08348h
%define PENTER_OPC_IMM32        08148h
; Third byte of both encodings when the operation is 'sub' and the register is rsp.
%define PENTER_MODRM_SUB_RSP    0ech

align 16
_penter:
        ; Save the volatile registers; the TSC is sampled as early as possible.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; 20h spill area + 8 bytes for the timestamp.
                                                ; The 8 pushes left rsp off by 8, this evens it out.

        ; Assemble the 64-bit timestamp and the default arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = param 3, the timestamp
        mov     [rsp + 20h], r8                 ; kept for the overhead calculation
        lea     rdx, [rsp + 8*8 + 28h]          ; rdx = param 2, address of our return address
        mov     rcx, [rdx]                      ; rcx = param 1, the return address

        ; MSC emits the call to _penter both before and after the usual
        ; 'sub rsp, xxh' of the prologue. When such an instruction sits right
        ; in front of the call, its immediate is added to the frame pointer
        ; estimate to make unwinding (longjmp/throw) a bit more accurate. Pushes
        ; around the hook calls are not accounted for, so it stays an estimate.
        cmp     word [rcx - PENTER_CALL_SIZE - 4], PENTER_OPC_IMM8      ; sub rsp, imm8?
        jne     .try_imm32
        cmp     byte [rcx - PENTER_CALL_SIZE - 2], PENTER_MODRM_SUB_RSP
        jne     .try_imm32
        movzx   eax, byte [rcx - PENTER_CALL_SIZE - 1]                  ; the imm8
        add     rdx, rax
        jmp     .enter

.try_imm32:
        cmp     word [rcx - PENTER_CALL_SIZE - 7], PENTER_OPC_IMM32     ; sub rsp, imm32?
        jne     .enter
        cmp     byte [rcx - PENTER_CALL_SIZE - 5], PENTER_MODRM_SUB_RSP
        jne     .enter
        mov     eax, [rcx - PENTER_CALL_SIZE - 4]                       ; the imm32
        add     rdx, rax

.enter:
        call    KPRF_ENTER
        jmp     common_return_path
120
121
;;
; _pexit - function exit hook for code built with the Microsoft compiler
; (AMD64).
;
; Mirrors _penter: saves rax, rdx, rflags, rcx and r8-r11, samples the TSC,
; calls KPRF_LEAVE and leaves through common_return_path. The frame pointer
; value passed on has to match the one _penter calculated for the same
; function, hence the instruction probing below.
; NOTE(review): no XMM register is saved on this path - presumably KPRF_LEAVE
; never touches them; confirm.
;
; Stack layout once the frame has been set up:
;   rsp + 68h   return address into the instrumented function
;   rsp + 60h   rax
;   rsp + 58h   rdx
;   rsp + 50h   rflags
;   rsp + 48h   rcx
;   rsp + 40h   r8
;   rsp + 38h   r9
;   rsp + 30h   r10
;   rsp + 28h   r11
;   rsp + 20h   exit timestamp (read again by common_return_path)
;   rsp + 00h   20h bytes of spill area for the callee
;
; Arguments handed to KPRF_LEAVE:
;   rcx   the return address, used as the function address
;   rdx   frame pointer estimate (see the adjustment below)
;   r8    64-bit timestamp
;

; First two bytes (read as a little endian word) of 'add rsp, imm8' / 'add rsp, imm32'.
%define PEXIT_OPC_IMM8          08348h
%define PEXIT_OPC_IMM32         08148h
; Third byte of both encodings when the operation is 'add' and the register is rsp.
%define PEXIT_MODRM_ADD_RSP     0c4h

align 16
_pexit:
        ; Save the volatile registers; the TSC is sampled as early as possible.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; 20h spill area + 8 bytes for the timestamp.
                                                ; The 8 pushes left rsp off by 8, this evens it out.

        ; Assemble the 64-bit timestamp and the default arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = param 3, the timestamp
        mov     [rsp + 20h], r8                 ; kept for the overhead calculation
        lea     rdx, [rsp + 8*8 + 28h]          ; rdx = param 2, address of our return address
        mov     rcx, [rdx]                      ; rcx = param 1, the return address

        ; MSC sometimes places the call to _pexit in front of the epilogue's
        ; 'add rsp, xxh'. If that is the instruction we are returning to, add
        ; its immediate so the value lines up with the adjustment _penter makes.
        cmp     word [rcx], PEXIT_OPC_IMM8      ; add rsp, imm8?
        jne     .try_imm32
        cmp     byte [rcx + 2], PEXIT_MODRM_ADD_RSP
        jne     .try_imm32
        movzx   eax, byte [rcx + 3]             ; the imm8
        add     rdx, rax
        jmp     .leave

.try_imm32:
        cmp     word [rcx], PEXIT_OPC_IMM32     ; add rsp, imm32?
        jne     .leave
        cmp     byte [rcx + 2], PEXIT_MODRM_ADD_RSP
        jne     .leave
        mov     eax, [rcx + 3]                  ; the imm32
        add     rdx, rax

.leave:
        call    KPRF_LEAVE
        jmp     common_return_path
184
185
;;
; Shared tail of _penter and _pexit; calib_penter and calib_pexit join in at
; common_overhead. Sharing it means a single overhead adjustment covers both
; hooks, which saves calibration work (and space).
;
; On entry:
;   rax         pointer to the 64-bit overhead counter to update, or zero
;               when no overhead is to be recorded
;   [rsp + 20h] the timestamp taken when the hook was entered
;   the stack frame is the one built by the hook prologues
;
; Does not return to the code that jumped here: it unwinds the hook frame
; and returns to the instrumented function with the saved registers and
; rflags restored.
;
align 16
common_return_path:
        test    rax, rax
        jz      common_no_overhead              ; nothing to account for
        cmp     byte [g_fCalibrated wrt rip], 0
        jnz     common_overhead
        call    calibrate                       ; first time around; saves and restores all it uses, rax included

common_overhead:
        mov     rcx, rax                        ; rcx = pointer to the overhead counter
        mov     eax, [g_OverheadAdj wrt rip]    ; apply the adjustment before reading the tsc:
        sub     [rsp + 20h], rax                ; moving the start back makes the elapsed time larger

        rdtsc
        shl     rdx, 32
        or      rdx, rax                        ; rdx = 64-bit timestamp
        sub     rdx, [rsp + 20h]                ; rdx = elapsed ticks, adjustment included
        lock add [rcx], rdx                     ; update the counter

common_no_overhead:
        ; Unwind the frame in reverse order of the hook prologue.
        add     rsp, 28h
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rcx
        popfq
        pop     rdx
        pop     rax
        ret
221
;;
; Scratch data used during calibration; rsi points to it the whole time.
;
struc CALIBDATA
        .Overhead:      resq 1  ; overhead ticks accumulated through calib_penter
        .Profiled:      resq 1  ; sum of (exit timestamp - enter timestamp), added up by calib_pexit
        .EnterTS:       resq 1  ; timestamp stored by the most recent calib_penter
        .Min:           resq 1  ; smallest (Profiled - Overhead) seen in the current inner loop
endstruc
230
231
232
align 16
;;
; Do necessary calibrations.
;
; Tunes g_OverheadAdj: runs calib_nullproc (16 calibration enter/exit pairs)
; 200h times per round, takes the smallest (Profiled - Overhead) value of the
; round and nudges the adjustment up or down by one, for at most 200h rounds.
;
; Takes no arguments and returns nothing. Every general purpose register and
; rflags are saved on entry and restored on exit, so the caller
; (common_return_path) keeps its rax.
;
; Register use inside:
;   rsi   pointer to the CALIBDATA on the stack (also used by calib_penter
;         and calib_pexit)
;   rbx   outer loop counter
;   rcx   inner loop counter
;   rbp   stack pointer to restore in the epilog
;
calibrate:
        ; prolog - save everything
        push    rbp
        pushfq
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    rdi
        push    rsi
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp

        sub     rsp, CALIBDATA_size
        mov     rsi, rsp                        ; rsi points to the CALIBDATA

        and     rsp, -16                        ; rsp can only move down, so CALIBDATA stays above it

        ;
        ; Set the calibrated flag up front: common_return_path only calls us
        ; while it is zero, so hooks taken from now on skip the calibration.
        ; The previous value returned in eax is not examined.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated wrt rip], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h                       ; loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function a number of times to establish
        ; a good minimum value.
        ;
        mov     ecx, 200h
        mov     dword [rsi + CALIBDATA.Min], 0ffffffffh
        mov     dword [rsi + CALIBDATA.Min + 4], 07fffffffh     ; Min = largest positive 64-bit value
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [rsi + CALIBDATA.Overhead], rax
        mov     [rsi + CALIBDATA.Profiled], rax
        call    calib_nullproc

        ; subtract the overhead
        mov     rax, [rsi + CALIBDATA.Profiled]
        sub     rax, [rsi + CALIBDATA.Overhead]

        ; update the minimum value.
        bt      rax, 63
        jc      near calib_outer_dec            ; if negative, just simplify and shortcut
        cmp     rax, [rsi + CALIBDATA.Min]
        jge     calib_inner_next
calib_inner_update_minimum:
        mov     [rsi + CALIBDATA.Min], rax
calib_inner_next:
        loop    calib_inner_loop

        ; Is the minimum value acceptable?
        test    dword [rsi + CALIBDATA.Min + 4], 80000000h
        jnz     calib_outer_dec                 ; simplify if negative.
        cmp     dword [rsi + CALIBDATA.Min + 4], 0
        jnz     calib_outer_inc                 ; this shouldn't be possible
        cmp     dword [rsi + CALIBDATA.Min], 1fh
        jbe     calib_outer_dec                 ; too low - 2 ticks per pair is the minimum!
        ;cmp     dword [rsi + CALIBDATA.Min], 30h
        ;jbe     calib_done                     ; this is fine!
        cmp     dword [rsi + CALIBDATA.Min], 70h ; - a bit weird...
        jbe     calib_outer_next                ; do the full 200h*200h iteration
calib_outer_inc:
        inc     dword [g_OverheadAdj wrt rip]
        jmp     calib_outer_next
calib_outer_dec:
        ; Stop at the floor instead of decrementing past it. The adjustment
        ; starts out as 0, so an equality test against 1 would let the dec
        ; below wrap it to 0ffffffffh when the very first rounds ask for a
        ; decrement; hence the unsigned below-or-equal test.
        cmp     dword [g_OverheadAdj wrt rip], 1
        jbe     calib_done
        dec     dword [g_OverheadAdj wrt rip]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog - restore it all.
        mov     rsp, rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rsi
        pop     rdi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        popfq
        pop     rbp
        ret
345
346
347
348
;;
; Calibration stand-in for _penter. The frame setup is kept instruction for
; instruction the same as in the real hook so the timing matches; only the
; KPRF_ENTER call is replaced.
;
; Expects rsi to point to the CALIBDATA (set up by calibrate).
;
align 16
calib_penter:
        ; Must match _penter from here and past the rdtsc.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; same frame as _penter: 20h spill + 8 bytes timestamp

        ; Assemble the timestamp and keep it where common_overhead expects it.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        mov     [rsi + CALIBDATA.EnterTS], r8   ; remembered for calib_pexit

        ; In place of the KPRF_ENTER return value: the counter to charge.
        lea     rax, [rsi + CALIBDATA.Overhead]
        jmp     common_overhead
376
377
;;
; Calibration stand-in for _pexit. The frame setup is kept instruction for
; instruction the same as in the real hook so the timing matches; only the
; KPRF_LEAVE call is replaced.
;
; Expects rsi to point to the CALIBDATA (set up by calibrate).
;
align 16
calib_pexit:
        ; Must match _pexit from here and past the rdtsc.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; same frame as _pexit: 20h spill + 8 bytes timestamp

        ; Assemble the timestamp and keep it where common_overhead expects it.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        ; Profiled += exit timestamp - enter timestamp.
        sub     r8, [rsi + CALIBDATA.EnterTS]
        add     [rsi + CALIBDATA.Profiled], r8

        ; The counter handed to common_overhead is EnterTS, not Overhead, so
        ; the exit side overhead lands in EnterTS, which the next calib_penter
        ; overwrites.
        ; NOTE(review): presumably EnterTS is used as a throw-away sink on
        ; purpose - confirm.
        lea     rax, [rsi + CALIBDATA.EnterTS]
        jmp     common_overhead
406
407
;;
; The 'function' being profiled during calibration: nothing but 16
; back-to-back calibration enter/exit pairs.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (Btw. with just a single pair the result comes out wrong, hence the
; repetition.)
;
align 16
calib_nullproc:
%rep 16
        call    calib_penter
        call    calib_pexit
%endrep
        ret
463
464
;
; Dummy stack check function - returns right away without touching anything.
;
global  __chkstk
__chkstk:
        ret
Note: See TracBrowser for help on using the repository browser.