source: trunk/kStuff/kProfiler2/prfamd64msc.asm@ 3587

Last change on this file since 3587 was 3586, checked in by bird, 18 years ago

Made it build on 64-bit windows.

File size: 11.8 KB
Line 
1; $Id: $;
2;; @file
3; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
4;
5
6;
7; Copyright (c) 2006-2007 knut st. osmundsen <bird-src-spam@anduin.net>
8;
9; This file is part of kProfiler.
10;
11; kProfiler is free software; you can redistribute it and/or
12; modify it under the terms of the GNU Lesser General Public
13; License as published by the Free Software Foundation; either
14; version 2.1 of the License, or (at your option) any later version.
15;
16; kProfiler is distributed in the hope that it will be useful,
17; but WITHOUT ANY WARRANTY; without even the implied warranty of
18; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19; Lesser General Public License for more details.
20;
21; You should have received a copy of the GNU Lesser General Public
22; License along with kProfiler; if not, write to the Free Software
23; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;
25
26
[section .data]
;
; Calibration state shared by the hooks.
;
g_fCalibrated:  dd      0               ; set to 1 by calibrate; common_return_path skips calibrate when non-zero.
g_OverheadAdj:  dd      0               ; adjustment tuned by calibrate; subtracted from the saved time stamp in common_overhead.
33
34[section .text]
35
36extern KPRF_ENTER
37extern KPRF_LEAVE
38
39global _penter
40global _pexit
41
42;ifdef UNDEFINED
43global common_return_path
44global common_overhead
45global common_no_overhead
46global calibrate
47global calib_inner_update_minimum
48global calib_inner_next
49global calib_outer_dec
50global calib_outer_inc
51global calib_done
52global calib_nullproc
53;endif
54
55
;;
; Function entry hook.
;
; Preserves rax, rdx, rflags, rcx and r8-r11, samples the time stamp counter
; and calls KPRF_ENTER with:
;       rcx - param 1: the qword found at the frame pointer, i.e. our own
;                      return address inside the hooked function.
;       rdx - param 2: the frame pointer - the address of our return address.
;       r8  - param 3: the 64-bit time stamp.
; The value KPRF_ENTER leaves in rax is consumed by common_return_path, which
; also tears down the frame built here and returns to the hooked function.
;
; Stack layout at the KPRF_ENTER call (offsets from rsp):
;       70h     our return address (into the hooked function)
;       68h     rax
;       60h     rdx
;       58h     rflags
;       50h     rcx
;       48h     r8
;       40h     r9
;       38h     r10
;       30h     r11
;       28h     unused
;       20h     time stamp (read back by common_return_path)
;       00h     20h bytes of spill area for the callee
;
; NOTE(review): 8 pushes + 30h = 70h, so rsp at the call has the same 16 byte
; alignment as it had on entry to this hook. Whether that is the alignment the
; callee expects depends on where the compiler places the hook call - confirm.
; The 30h must stay in sync with the 'add rsp, 30h' in common_return_path and
; with calib_penter / calib_pexit.
;
align 16
_penter:
        ; save volatile registers and get the time stamp.
        push    rax
        push    rdx
        rdtsc                           ; edx:eax = tsc (flags are untouched).
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 30h                ; 8 pushes so far; reserve 20h for spill
                                        ; and 8 bytes for the time stamp (+8 unused).

        ; setting up the enter call frame
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; param 3 - the timestamp
        mov     [rsp + 20h], r8         ; save the tsc for later use.
        lea     rdx, [rsp + 8*8 + 30h]  ; Param 2 - frame pointer: skip the 8 saved qwords
                                        ; and the 30h scratch area to reach our return address.
        mov     rcx, [rdx]              ; Param 1 - The function address
        call    KPRF_ENTER
        jmp     common_return_path
97
98
;;
; Function exit hook.
;
; Mirrors _penter: preserves rax, rdx, rflags, rcx and r8-r11, samples the
; time stamp counter and calls KPRF_LEAVE with:
;       rcx - param 1: the qword found at the frame pointer, i.e. our own
;                      return address inside the hooked function.
;       rdx - param 2: the frame pointer - the address of our return address.
;       r8  - param 3: the 64-bit time stamp.
; The value KPRF_LEAVE leaves in rax is consumed by common_return_path, which
; also tears down the frame built here and returns to the hooked function.
;
; Stack layout at the KPRF_LEAVE call (offsets from rsp):
;       70h     our return address (into the hooked function)
;       68h     rax
;       60h     rdx
;       58h     rflags
;       50h     rcx
;       48h     r8
;       40h     r9
;       38h     r10
;       30h     r11
;       28h     unused
;       20h     time stamp (read back by common_return_path)
;       00h     20h bytes of spill area for the callee
;
; NOTE(review): 8 pushes + 30h = 70h, so rsp at the call has the same 16 byte
; alignment as it had on entry to this hook. Whether that is the alignment the
; callee expects depends on where the compiler places the hook call - confirm.
; The 30h must stay in sync with the 'add rsp, 30h' in common_return_path and
; with calib_penter / calib_pexit.
;
align 16
_pexit:
        ; save volatile registers and get the time stamp.
        push    rax
        push    rdx
        rdtsc                           ; edx:eax = tsc (flags are untouched).
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 30h                ; 8 pushes so far; reserve 20h for spill
                                        ; and 8 bytes for the time stamp (+8 unused).

        ; setting up the leave call frame.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; param 3 - the timestamp
        mov     [rsp + 20h], r8         ; save the tsc for later use.
        lea     rdx, [rsp + 8*8 + 30h]  ; Param 2 - frame pointer: skip the 8 saved qwords
                                        ; and the 30h scratch area to reach our return address.
        mov     rcx, [rdx]              ; Param 1 - The function address
        call    KPRF_LEAVE
        jmp     common_return_path
141
142
;;
; This is the common return path for both the enter and exit hooks.
; It's kept common because we can then use the same overhead adjustment
; and save some calibration efforts. It also saves space :-)
;
; Input:
;       rax             - pointer to the 64-bit overhead counter to update, or
;                         zero when there is nothing to update.
;       [rsp + 20h]     - the time stamp taken when the hook was entered.
;       The stack holds the frame built by the hook: 30h bytes of scratch
;       followed by r11, r10, r9, r8, rcx, rflags, rdx and rax.
;
; Entry points:
;       common_return_path  - used by _penter / _pexit after the KPRF call.
;       common_overhead     - used by calib_penter / calib_pexit, which supply
;                             rax themselves and must not trigger calibrate.
;       common_no_overhead  - restore the saved registers and return only.
;
align 16
common_return_path:
        ; Update overhead
        test    rax, rax                ; the pointer is 64 bits wide, so test all of it.
        jz      common_no_overhead
        cmp     byte [g_fCalibrated wrt rip], 0
        jnz     common_overhead
        call    calibrate               ; first time only; calibrate sets g_fCalibrated.
common_overhead:
        mov     rcx, rax                ; rcx <- pointer to overhead counter.
        mov     eax, [g_OverheadAdj wrt rip]; apply the adjustment before reading tsc
        sub     [rsp + 20h], rax        ; start -= adjustment (eax load zero extended into rax).

        rdtsc
        shl     rdx, 32
        or      rdx, rax                ; rdx = 64-bit timestamp
        sub     rdx, [rsp + 20h]        ; rdx = elapsed (including the adjustment)
        lock add [rcx], rdx             ; update counter.
common_no_overhead:

        ; restore volatile registers - the reverse of the hook's save sequence.
        add     rsp, 30h
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rcx
        popfq
        pop     rdx
        pop     rax
        ret
178
;;
; Scratch record rsi points to while calibrate is running.
; Each Lo/Hi pair is accessed as a single qword through its Lo member.
struc CALIBDATA
    .OverheadLo:    resd 1              ; 00h - overhead accumulated via common_overhead.
    .OverheadHi:    resd 1              ; 04h
    .ProfiledLo:    resd 1              ; 08h - time accumulated between calib_penter and calib_pexit.
    .ProfiledHi:    resd 1              ; 0ch
    .EnterTSLo:     resd 1              ; 10h - time stamp stored by the last calib_penter.
    .EnterTSHi:     resd 1              ; 14h
    .MinLo:         resd 1              ; 18h - minimum tracked by the inner calibration loop.
    .MinHi:         resd 1              ; 1ch
endstruc
191
192
193
align 16
;;
; Do necessary calibrations.
;
; Tunes g_OverheadAdj by repeatedly running calib_nullproc (16 calib_penter /
; calib_pexit pairs) and checking the smallest 'profiled - overhead' value seen
; over 200h runs: the adjustment is incremented while that minimum is above
; 30h, decremented while it is 1fh or below (or negative), and the search stops
; once it lands in between or after 200h outer iterations.
;
; All general purpose registers and rflags are preserved. While running, rsi
; points to a CALIBDATA record on the stack and rbp to the saved register area.
;
calibrate:
        ; prolog - save everything
        push    rbp
        pushfq
        push    rax                     ; pushaq
        push    rbx
        push    rcx
        push    rdx
        push    rdi
        push    rsi
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp                ; rbp -> saved r15; used by the epilog to find the saved registers.

        sub     rsp, CALIBDATA_size
        mov     rsi, rsp                ; rsi points to the CALIBDATA

        and     rsp, -16                ; 16 byte align the stack below the CALIBDATA.

        ;
        ; Flag the calibration as done up front, so common_return_path
        ; will not call us again.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated wrt rip], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h               ; loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function number of times to establish a
        ; good minimum value
        ;
        mov     ecx, 200h
        mov     dword [rsi + CALIBDATA.MinLo], 0ffffffffh
        mov     dword [rsi + CALIBDATA.MinHi], 07fffffffh ; Min = largest positive 64-bit value.
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [rsi + CALIBDATA.OverheadLo], rax
        mov     [rsi + CALIBDATA.ProfiledLo], rax
        call    calib_nullproc

        ; subtract the overhead
        mov     rax, [rsi + CALIBDATA.ProfiledLo]
        sub     rax, [rsi + CALIBDATA.OverheadLo]

        ; update the minimum value.
        bt      rax, 63
        jc near calib_outer_dec         ; if negative, just simplify and shortcut
        cmp     rax, [rsi + CALIBDATA.MinLo] ; compare with the whole 64-bit minimum (MinLo:MinHi).
        jge     calib_inner_next
calib_inner_update_minimum:
        mov     [rsi + CALIBDATA.MinLo], rax
calib_inner_next:
        loop    calib_inner_loop

        ; Is the minimum value acceptable?
        test    dword [rsi + CALIBDATA.MinHi], 80000000h
        jnz     calib_outer_dec         ; simplify if negative.
        cmp     dword [rsi + CALIBDATA.MinHi], 0
        jnz     calib_outer_inc         ; this shouldn't be possible
        cmp     dword [rsi + CALIBDATA.MinLo], 1fh
        jbe     calib_outer_dec         ; too low - 2 ticks per pair is the minimum!
        cmp     dword [rsi + CALIBDATA.MinLo], 30h
        jbe     calib_done              ; this is fine!
calib_outer_inc:
        inc     dword [g_OverheadAdj wrt rip]
        jmp     calib_outer_next
calib_outer_dec:
        cmp     dword [g_OverheadAdj wrt rip], 1
        jbe     calib_done              ; never go below 1, and never wrap around from 0.
        dec     dword [g_OverheadAdj wrt rip]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog - restore it all, in the exact reverse order of the prolog.
        mov     rsp, rbp                ; drop the CALIBDATA and the alignment padding.
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rsi
        pop     rdi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        popfq
        pop     rbp
        ret
303
304
305
306
;;
; Calibration stand-in for _penter.
;
; Executes the same save / rdtsc / frame setup sequence as _penter, but instead
; of calling KPRF_ENTER it records the time stamp in CALIBDATA.EnterTS and
; enters common_overhead with rax pointing at CALIBDATA.Overhead.
;
; Expects rsi to point to the CALIBDATA (set up by calibrate).
align 16
calib_penter:
        ; Keep this sequence in step with _penter - the calibration relies on it.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 30h                ; 20h of spill area + 8 bytes for the time stamp,
                                        ; same frame as the real hook builds.

        ; assemble the 64-bit time stamp and park it in the frame.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        ; remember when this pair was entered; calib_pexit subtracts it.
        mov     [rsi + CALIBDATA.EnterTSLo], r8

        ; common_overhead adds the elapsed time to the qword rax points to.
        lea     rax, [rsi + CALIBDATA.OverheadLo]
        jmp     common_overhead
334
335
;;
; Calibration stand-in for _pexit.
;
; Executes the same save / rdtsc / frame setup sequence as _pexit, but instead
; of calling KPRF_LEAVE it adds 'now - CALIBDATA.EnterTS' to CALIBDATA.Profiled
; and enters common_overhead with rax pointing at CALIBDATA.EnterTS.
;
; Expects rsi to point to the CALIBDATA (set up by calibrate).
align 16
calib_pexit:
        ; Keep this sequence in step with _pexit - the calibration relies on it.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 30h                ; 20h of spill area + 8 bytes for the time stamp,
                                        ; same frame as the real hook builds.

        ; assemble the 64-bit time stamp and park it in the frame.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        ; account the time since the matching calib_penter as profiled time.
        sub     r8, [rsi + CALIBDATA.EnterTSLo]
        add     [rsi + CALIBDATA.ProfiledLo], r8

        ; common_overhead adds the elapsed time to the qword rax points to.
        ; NOTE(review): this targets EnterTS rather than Overhead - presumably a
        ; deliberate scratch target, since the next calib_penter overwrites
        ; EnterTS. Confirm before changing.
        lea     rax, [rsi + CALIBDATA.EnterTSLo]
        jmp     common_overhead
364
365
;;
; The 'function' we're profiling during calibration.
;
; Runs 16 back-to-back calib_penter / calib_pexit pairs and returns.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (Btw. If we don't use multiple pairs here, we end up with the wrong result.)
align 16
calib_nullproc:
%rep 16                                 ; pairs 0 thru f, emitted as straight-line calls.
        call    calib_penter
        call    calib_pexit

%endrep
        ret
421
422
;
; Dummy stack check function - returns to the caller straight away
; without doing any work.
;
global __chkstk
__chkstk:
        ret
Note: See TracBrowser for help on using the repository browser.