| 1 | ; $Id: prfamd64msc.asm 29 2009-07-01 20:30:29Z bird $;
 | 
|---|
| 2 | ;; @file
 | 
|---|
| 3 | ; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
 | 
|---|
| 4 | ;
 | 
|---|
| 5 | 
 | 
|---|
| 6 | ;
 | 
|---|
| 7 | ; Copyright (c) 2006-2007 Knut St. Osmundsen <bird-kStuff-spamix@anduin.net>
 | 
|---|
| 8 | ;
 | 
|---|
| 9 | ; Permission is hereby granted, free of charge, to any person
 | 
|---|
| 10 | ; obtaining a copy of this software and associated documentation
 | 
|---|
| 11 | ; files (the "Software"), to deal in the Software without
 | 
|---|
| 12 | ; restriction, including without limitation the rights to use,
 | 
|---|
| 13 | ; copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
|---|
| 14 | ; copies of the Software, and to permit persons to whom the
 | 
|---|
| 15 | ; Software is furnished to do so, subject to the following
 | 
|---|
| 16 | ; conditions:
 | 
|---|
| 17 | ;
 | 
|---|
| 18 | ; The above copyright notice and this permission notice shall be
 | 
|---|
| 19 | ; included in all copies or substantial portions of the Software.
 | 
|---|
| 20 | ;
 | 
|---|
| 21 | ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
|---|
| 22 | ; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 | 
|---|
| 23 | ; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 | 
|---|
| 24 | ; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 | 
|---|
| 25 | ; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 | 
|---|
| 26 | ; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 | 
|---|
| 27 | ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 | 
|---|
| 28 | ; OTHER DEALINGS IN THE SOFTWARE.
 | 
|---|
| 29 | ;
 | 
|---|
| 30 | 
 | 
|---|
[section .data]

; Calibration state flag.  common_return_path tests its low byte and calls
; calibrate while it is still zero; calibrate stores 1 here.
g_fCalibrated:  dd      0

; Tick adjustment that common_overhead subtracts from the saved entry
; timestamp.  calibrate increments/decrements it while tuning.
g_OverheadAdj:  dd      0
| 37 | 
 | 
|---|
[section .text]

; Profiler entry points called by the hooks in this file.
extern  KPRF_ENTER
extern  KPRF_LEAVE

; The enter/exit hooks themselves.
global  _penter
global  _pexit

; Internal labels.  The conditional that used to wrap these exports is
; commented out, so they are always declared global.
;ifdef  UNDEFINED
global  common_return_path
global  common_overhead
global  common_no_overhead
global  calibrate
global  calib_inner_update_minimum
global  calib_inner_next
global  calib_outer_dec
global  calib_outer_inc
global  calib_done
global  calib_nullproc
;endif
| 58 | 
 | 
|---|
| 59 | 
 | 
|---|
;;
; _penter - function entry hook (AMD64).
;
; Saves the volatile integer registers and RFLAGS, samples the TSC and calls
; KPRF_ENTER with:
;       rcx = our return address (an address inside the hooked function)
;       rdx = frame pointer value (see the adjustment below)
;       r8  = 64-bit timestamp
; Whatever KPRF_ENTER leaves in rax is passed on to common_return_path, which
; also undoes the stack frame built here.
;
; Stack layout after the prologue (offsets from rsp):
;       68h     our return address
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     timestamp (read again by common_return_path)
;       00h     20h bytes of spill area for the callee
;
align 16
_penter:
        ; Save state.  rax/rdx go first because rdtsc overwrites them.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 20h spill area + 8 bytes for the timestamp.
                                        ; NOTE(review): 8 pushes + 28h only yields a
                                        ; 16-byte aligned rsp if rsp was aligned at the
                                        ; call to _penter - confirm for all call sites.

        ; Build the KPRF_ENTER arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; arg 3: edx:eax merged into one 64-bit value.
        mov     [rsp + 20h], r8         ; kept for the overhead calculation later.
        lea     rdx, [rsp + 8*8 + 28h]  ; arg 2: address of our return address slot.
        mov     rcx, [rdx]              ; arg 1: our return address.

        ; The compiler emits the call to _penter sometimes before and sometimes
        ; after the function's 'sub rsp, xx'.  If the bytes just ahead of the
        ; 5 byte call are such a sub, add its immediate to arg 2 so the frame
        ; value does not depend on where the call ended up.  Uneven push/pop
        ; counts around the hooks mean this can never be exact.
        cmp     word [rcx - 5 - 4], 08348h  ; 48 83: sub rsp, imm8?
        jne     .try_imm32
        cmp     byte [rcx - 5 - 2], 0ech
        jne     .try_imm32
        movzx   eax, byte [rcx - 5 - 1]     ; the imm8
        add     rdx, rax
        jmp     .call_enter
.try_imm32:
        cmp     word [rcx - 5 - 7], 08148h  ; 48 81: sub rsp, imm32?
        jne     .call_enter
        cmp     byte [rcx - 5 - 5], 0ech
        jne     .call_enter
        mov     eax, [rcx - 5 - 4]          ; the imm32
        add     rdx, rax
.call_enter:
        call    KPRF_ENTER
        jmp     common_return_path
| 124 | 
 | 
|---|
| 125 | 
 | 
|---|
;;
; _pexit - function exit hook (AMD64).
;
; Mirror image of _penter: saves the volatile integer registers and RFLAGS,
; samples the TSC and calls KPRF_LEAVE with:
;       rcx = our return address (an address inside the hooked function)
;       rdx = frame pointer value, computed so it can match the one _penter
;             produced for the same function
;       r8  = 64-bit timestamp
; Whatever KPRF_LEAVE leaves in rax is passed on to common_return_path, which
; also undoes the stack frame built here.
;
; Stack layout after the prologue (offsets from rsp):
;       68h     our return address
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     timestamp (read again by common_return_path)
;       00h     20h bytes of spill area for the callee
;
align 16
_pexit:
        ; Save state.  rax/rdx go first because rdtsc overwrites them.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 20h spill area + 8 bytes for the timestamp.
                                        ; NOTE(review): 8 pushes + 28h only yields a
                                        ; 16-byte aligned rsp if rsp was aligned at the
                                        ; call to _pexit - confirm for all call sites.

        ; Build the KPRF_LEAVE arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; arg 3: edx:eax merged into one 64-bit value.
        mov     [rsp + 20h], r8         ; kept for the overhead calculation later.
        lea     rdx, [rsp + 8*8 + 28h]  ; arg 2: address of our return address slot.
        mov     rcx, [rdx]              ; arg 1: our return address.

        ; The compiler sometimes places the call to _pexit ahead of the
        ; function's 'add rsp, xx'.  If the instruction at our return address
        ; is such an add, apply its immediate to arg 2 so the value lines up
        ; with any adjustment _penter made.
        cmp     word [rcx], 08348h      ; 48 83: add rsp, imm8?
        jne     .try_imm32
        cmp     byte [rcx + 2], 0c4h
        jne     .try_imm32
        movzx   eax, byte [rcx + 3]     ; the imm8
        add     rdx, rax
        jmp     .call_leave
.try_imm32:
        cmp     word [rcx], 08148h      ; 48 81: add rsp, imm32?
        jne     .call_leave
        cmp     byte [rcx + 2], 0c4h
        jne     .call_leave
        mov     eax, [rcx + 3]          ; the imm32
        add     rdx, rax
.call_leave:
        call    KPRF_LEAVE
        jmp     common_return_path
| 188 | 
 | 
|---|
| 189 | 
 | 
|---|
;;
; Shared tail of _penter and _pexit; calib_penter and calib_pexit join at
; common_overhead.  Sharing it means a single overhead adjustment covers both
; hooks, and it saves space.
;
; On entry:
;       rax         value left by KPRF_ENTER / KPRF_LEAVE: pointer to the
;                   overhead counter to update, or 0 to skip the accounting.
;       [rsp + 20h] timestamp taken at hook entry.
;       rsp         frame as built by the hook prologue (28h bytes + 8 pushes).
;
; Returns to the hook's caller with the saved registers and RFLAGS restored.
;
align 16
common_return_path:
        test    rax, rax
        jz      common_no_overhead      ; no counter - just unwind.
        cmp     byte [g_fCalibrated wrt rip], 0
        jnz     common_overhead
        call    calibrate               ; first time through: tune g_OverheadAdj.

common_overhead:
        mov     rcx, rax                ; rcx = overhead counter, rax is needed below.
        mov     eax, [g_OverheadAdj wrt rip]
        sub     [rsp + 20h], rax        ; entry ts -= adjustment, which makes the
                                        ; elapsed value below larger by that amount.

        rdtsc
        shl     rdx, 32
        or      rdx, rax                ; rdx = current 64-bit timestamp.
        sub     rdx, [rsp + 20h]        ; rdx = ticks since the adjusted entry ts.
        lock add [rcx], rdx             ; add to the counter.

common_no_overhead:
        ; Undo the hook prologue in reverse order.
        add     rsp, 28h
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rcx
        popfq
        pop     rdx
        pop     rax
        ret
| 225 | 
 | 
|---|
;;
; Scratch data used during calibration.  calibrate allocates it on the stack
; and keeps rsi pointing at it; calib_penter and calib_pexit access it via rsi.
struc CALIBDATA
    .Overhead:  resq 1      ; counter calib_penter hands to common_overhead.
    .Profiled:  resq 1      ; sum of (exit timestamp - EnterTS), added by calib_pexit.
    .EnterTS:   resq 1      ; timestamp stored by the latest calib_penter.
    .Min:       resq 1      ; smallest Profiled - Overhead seen in the current inner loop.
endstruc
| 234 | 
 | 
|---|
| 235 | 
 | 
|---|
| 236 | 
 | 
|---|
align 16
;;
; Tune g_OverheadAdj.
;
; Called from common_return_path while g_fCalibrated is still zero.  Runs the
; calibration hooks (via calib_nullproc) repeatedly and nudges g_OverheadAdj up
; or down depending on the smallest Profiled - Overhead value observed.
;
; Every general purpose register and RFLAGS is saved and restored, so the
; caller's rax (the overhead counter pointer) survives the call.  rsi points
; to a CALIBDATA on the stack for the duration.
;
calibrate:
        ; prolog - save everything
        push    rbp
        pushfq
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    rdi
        push    rsi
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp                ; the epilog restores rsp from here.

        sub     rsp, CALIBDATA_size
        mov     rsi, rsp                ; rsi points to the CALIBDATA

        and     rsp, -16                ; 16-byte align the stack for the calls below.

        ;
        ; Set the calibrated flag right away, before the calibration runs, so
        ; common_return_path stops calling us from here on.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated wrt rip], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h               ; outer loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function a number of times to establish
        ;                  a good minimum value.
        ;
        mov     ecx, 200h               ; inner loop counter.
        mov     dword [rsi + CALIBDATA.Min], 0ffffffffh
        mov     dword [rsi + CALIBDATA.Min + 4], 07fffffffh ; Min = largest positive value.
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [rsi + CALIBDATA.Overhead], rax
        mov     [rsi + CALIBDATA.Profiled], rax
        call    calib_nullproc

        ; subtract the overhead
        mov     rax, [rsi + CALIBDATA.Profiled]
        sub     rax, [rsi + CALIBDATA.Overhead]

        ; update the minimum value.
        bt      rax, 63
        jc near calib_outer_dec         ; if negative, just simplify and shortcut
        cmp     rax, [rsi + CALIBDATA.Min]
        jge     calib_inner_next
calib_inner_update_minimum:
        mov     [rsi + CALIBDATA.Min], rax
calib_inner_next:
        dec     ecx                     ; replaces the legacy 'loop'; the flags are
        jnz     calib_inner_loop        ; not used after the loop.

        ; Is the minimum value acceptable?
        test    dword [rsi + CALIBDATA.Min + 4], 80000000h
        jnz     calib_outer_dec         ; simplify if negative.
        cmp     dword [rsi + CALIBDATA.Min + 4], 0
        jnz     calib_outer_inc         ; this shouldn't be possible
        cmp     dword [rsi + CALIBDATA.Min], 1fh
        jbe     calib_outer_dec         ; too low - 2 ticks per pair is the minimum!
        ;cmp     dword [rsi + CALIBDATA.Min], 30h
        ;jbe     calib_done              ; this is fine!
        cmp     dword [rsi + CALIBDATA.Min], 70h ; - a bit weird...
        jbe     calib_outer_next        ; do the full 200h*200h iteration
calib_outer_inc:
        inc     dword [g_OverheadAdj wrt rip]
        jmp     calib_outer_next
calib_outer_dec:
        ; Stop at 1 *or 0*.  g_OverheadAdj starts out as 0 and is read
        ; zero-extended by common_overhead, so decrementing it from 0 would
        ; wrap to 0ffffffffh and be applied as a huge positive adjustment.
        cmp     dword [g_OverheadAdj wrt rip], 1
        jbe     calib_done
        dec     dword [g_OverheadAdj wrt rip]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog - restore it all.
        mov     rsp, rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rsi
        pop     rdi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        popfq
        pop     rbp
        ret
| 349 | 
 | 
|---|
| 350 | 
 | 
|---|
| 351 | 
 | 
|---|
| 352 | 
 | 
|---|
;;
; Calibration stand-in for _penter.
;
; Has to execute the same instructions as _penter from the rdtsc onwards,
; apart from the KPRF call, so the calibration measures what the real hook
; costs.  Expects rsi to point to the CALIBDATA (set up by calibrate).
align 16
calib_penter:
        ; Same prologue as _penter.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 20h spill area + 8 bytes for the timestamp.

        ; Merge edx:eax into a 64-bit timestamp and save it in the frame.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        ; Remember the entry time for calib_pexit.
        mov     [rsi + CALIBDATA.EnterTS], r8

        ; Take the place of the KPRF_ENTER result: rax = counter to update.
        lea     rax, [rsi + CALIBDATA.Overhead]
        jmp     common_overhead
| 380 | 
 | 
|---|
| 381 | 
 | 
|---|
;;
; Calibration stand-in for _pexit.
;
; Has to execute the same instructions as _pexit from the rdtsc onwards,
; apart from the KPRF call, so the calibration measures what the real hook
; costs.  Expects rsi to point to the CALIBDATA (set up by calibrate).
align 16
calib_pexit:
        ; Same prologue as _pexit.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 20h spill area + 8 bytes for the timestamp.

        ; Merge edx:eax into a 64-bit timestamp and save it in the frame.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax
        mov     [rsp + 20h], r8

        ; Profiled += this timestamp - the one calib_penter stored.
        sub     r8, [rsi + CALIBDATA.EnterTS]
        add     [rsi + CALIBDATA.Profiled], r8

        ; common_overhead adds its elapsed value to EnterTS here, not to
        ; Overhead.  NOTE(review): looks deliberate, as EnterTS is rewritten
        ; by the next calib_penter before it is read again - confirm.
        lea     rax, [rsi + CALIBDATA.EnterTS]
        jmp     common_overhead
| 410 | 
 | 
|---|
| 411 | 
 | 
|---|
;;
; The 'function' profiled during calibration: 16 back-to-back
; calib_penter / calib_pexit pairs and nothing else.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (A single pair is not enough - it gives the wrong result.)
align 16
calib_nullproc:
%rep 16
        call    calib_penter
        call    calib_pexit
%endrep
        ret
| 467 | 
 | 
|---|
| 468 | 
 | 
|---|
;
; Dummy stack check function: exported under the name __chkstk and returns
; immediately without touching the stack or any register.
;
global  __chkstk
__chkstk:
        ret