source: contrib/API/lib/fastmemcpy.asm@ 724

Last change on this file since 724 was 541, checked in by David Azarewicz, 15 years ago

Initial import

File size: 8.5 KB
Line 
; * This file is part of uniaud.dll.
; *
; * Copyright (c) 2010 Mensys BV
; * Copyright (c) 2007 Vlad Stelmahovsky aka Vladest
; *
; * This library is free software: you can redistribute it and/or modify
; * it under the terms of the GNU Lesser General Public License as
; * published by the Free Software Foundation, either version 3 of
; * the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License and the GNU General Public License along with this library.
; * If not, see <http://www.gnu.org/licenses/>.
; 32-bit code segment that holds every copy routine in this file.
CODE32  segment public use32 'CODE'

        ; Entry points made visible to the linker.
        public  _memcpy
        public  sse_memcpy
        public  mmx_memcpy
        public  mmx2_memcpy

; Offsets, relative to EBP, of the three stack arguments once a routine
; has executed "push ebp / mov ebp, esp" (near call: saved EBP at +0,
; return address at +4, first argument at +8).
arg0    EQU     8
arg1    EQU     12
arg2    EQU     16

        .686
        .XMM3
32
;-----------------------------------------------------------------------
; _memcpy(dest, src, n)
;
; Copies n bytes from src to dest with string instructions. Called
; directly and also used by sse_memcpy, mmx_memcpy and mmx2_memcpy to
; copy whatever their block loops leave over.
;
; In:    [ebp+arg0] = dest, [ebp+arg1] = src, [ebp+arg2] = n (stack args)
; Out:   eax = dest exactly as passed in, for every n (including n <= 3)
; Saves: ebp, edi, esi
; Clobb: ecx, flags
; Note:  no cld is issued, so the direction flag must be clear on entry.
;-----------------------------------------------------------------------
_memcpy proc near

        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     edi, [ebp+arg0]         ; edi = dest cursor
        mov     esi, [ebp+arg1]         ; esi = src cursor
        mov     ecx, [ebp+arg2]         ; ecx = n
        cmp     ecx, 3
        ja      memcpy_dwords

        rep movsb                       ; n <= 3: plain byte copy (nothing moved when n == 0)
        jmp     memcpy_done

        align 4

memcpy_dwords:

        mov     eax, ecx                ; keep n: its two low bits select the tail moves
        shr     ecx, 2                  ; ecx = number of whole dwords
        rep movsd
        test    al, 2
        jz      memcpy_tail_byte
        movsw                           ; bit 1 of n set: two more bytes

memcpy_tail_byte:

        test    al, 1
        jz      memcpy_done
        movsb                           ; bit 0 of n set: one more byte

memcpy_done:

        mov     eax, [ebp+arg0]         ; the arg0 slot is never written, so this is the caller's dest
        pop     esi
        pop     edi
        pop     ebp

        ret

_memcpy endp
90
;-----------------------------------------------------------------------
; sse_memcpy(dest, src, n)
;
; Copies n bytes from src to dest. When n is at least 3Fh the bulk is
; moved in 64-byte blocks through xmm0-xmm3 with non-temporal stores,
; after dest has been brought to a 16-byte boundary; shorter requests and
; the bytes left after the last whole block are handed to _memcpy.
;
; In:    [ebp+arg0] = dest, [ebp+arg1] = src, [ebp+arg2] = n (stack args)
; Out:   eax = dest as passed in
; Saves: ebp, edi, esi
; Clobb: ecx, edx, xmm0-xmm3, flags; the block path ends with sfence + emms
;-----------------------------------------------------------------------
sse_memcpy proc near

        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     esi, [ebp+arg1]         ; esi = src cursor
        prefetchnta [esi]
        prefetchnta [esi+40h]
        prefetchnta [esi+80h]
        prefetchnta [esi+0C0h]
        prefetchnta [esi+100h]

        mov     edi, [ebp+arg0]         ; edi = dest cursor
        mov     edx, [ebp+arg2]         ; edx = bytes still to copy
        cmp     edx, 3Fh                ; MIN_LEN
        jae     sse_align_head
        jmp     sse_tail                ; short request: _memcpy does all of it

sse_align_head:

        mov     eax, edi
        and     eax, 0Fh                ; eax = dest offset within its 16-byte line
        jz      sse_head_done
        mov     ecx, 10h
        sub     ecx, eax                ; ecx = bytes up to the next 16-byte boundary
        sub     edx, ecx
        rep movsb                       ; advances esi and edi past the head

sse_head_done:

        mov     ecx, edx
        and     edx, 3Fh                ; edx = bytes left once the blocks are done
        shr     ecx, 6                  ; ecx = whole 64-byte blocks; ZF set when there are none
        jnz     sse_pick_loop
        jmp     sse_flush

sse_pick_loop:

        test    esi, 0Fh                ; source on a 16-byte boundary as well?
        jz      sse_aligned_loop

sse_unaligned_loop:

        prefetchnta [esi+140h]
        movups  xmm0, [esi]
        movups  xmm1, [esi+10h]
        movups  xmm2, [esi+20h]
        movups  xmm3, [esi+30h]
        movntps [edi], xmm0
        movntps [edi+10h], xmm1
        movntps [edi+20h], xmm2
        movntps [edi+30h], xmm3
        add     esi, 40h
        add     edi, 40h
        dec     ecx
        jnz     sse_unaligned_loop
        jmp     sse_flush

sse_aligned_loop:

        prefetchnta [esi+140h]
        movaps  xmm0, [esi]
        movaps  xmm1, [esi+10h]
        movaps  xmm2, [esi+20h]
        movaps  xmm3, [esi+30h]
        movntps [edi], xmm0
        movntps [edi+10h], xmm1
        movntps [edi+20h], xmm2
        movntps [edi+30h], xmm3
        add     esi, 40h
        add     edi, 40h
        dec     ecx
        jnz     sse_aligned_loop

sse_flush:

        sfence                          ; order the non-temporal stores before returning
        emms

sse_tail:

        test    edx, edx
        jz      sse_exit
        push    edx                     ; n    = remaining bytes
        push    esi                     ; src  = current source position
        push    edi                     ; dest = current destination position
        call    _memcpy
        add     esp, 0Ch

sse_exit:

        mov     eax, [ebp+arg0]         ; argument slot is never modified here
        pop     esi
        pop     edi
        pop     ebp

        ret
sse_memcpy endp
234
;-----------------------------------------------------------------------
; mmx_memcpy(dest, src, n)
;
; Copies n bytes from src to dest. When n is at least 7FFh the bulk is
; moved in 64-byte blocks through mm0-mm7, after dest has been brought to
; an 8-byte boundary; shorter requests and the bytes left after the last
; whole block are handed to _memcpy.
;
; In:    [ebp+arg0] = dest, [ebp+arg1] = src, [ebp+arg2] = n (stack args)
; Out:   eax = dest as passed in
; Saves: ebp, edi, esi
; Clobb: ecx, edx, mm0-mm7, flags; the block path ends with emms
;-----------------------------------------------------------------------
mmx_memcpy proc near

        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     edi, [ebp+arg0]         ; edi = dest cursor
        mov     esi, [ebp+arg1]         ; esi = src cursor
        mov     edx, [ebp+arg2]         ; edx = bytes still to copy
        cmp     edx, 7FFh               ; MMX1_MIN_LEN
        jae     mmx_align_head
        jmp     mmx_tail                ; short request: _memcpy does all of it

mmx_align_head:

        mov     eax, edi
        and     eax, 7                  ; eax = dest offset within its 8-byte unit
        jz      mmx_head_done
        mov     ecx, 8
        sub     ecx, eax                ; ecx = bytes up to the next 8-byte boundary
        sub     edx, ecx
        rep movsb                       ; advances esi and edi past the head

mmx_head_done:

        mov     ecx, edx
        and     edx, 3Fh                ; edx = bytes left once the blocks are done
        shr     ecx, 6                  ; ecx = whole 64-byte blocks; ZF set when there are none
        jz      mmx_blocks_done

mmx_block_loop:

        movq    mm0, qword ptr [esi]
        movq    mm1, qword ptr [esi+8]
        movq    mm2, qword ptr [esi+10h]
        movq    mm3, qword ptr [esi+18h]
        movq    mm4, qword ptr [esi+20h]
        movq    mm5, qword ptr [esi+28h]
        movq    mm6, qword ptr [esi+30h]
        movq    mm7, qword ptr [esi+38h]
        movq    qword ptr [edi], mm0
        movq    qword ptr [edi+8], mm1
        movq    qword ptr [edi+10h], mm2
        movq    qword ptr [edi+18h], mm3
        movq    qword ptr [edi+20h], mm4
        movq    qword ptr [edi+28h], mm5
        movq    qword ptr [edi+30h], mm6
        movq    qword ptr [edi+38h], mm7
        add     esi, 40h
        add     edi, 40h
        dec     ecx
        jnz     mmx_block_loop

mmx_blocks_done:

        emms                            ; leave MMX state so x87 code can run again

mmx_tail:

        test    edx, edx
        jz      mmx_exit
        push    edx                     ; n    = remaining bytes
        push    esi                     ; src  = current source position
        push    edi                     ; dest = current destination position
        call    _memcpy
        add     esp, 0Ch

mmx_exit:

        mov     eax, [ebp+arg0]         ; argument slot is never modified here
        pop     esi
        pop     edi
        pop     ebp

        ret
mmx_memcpy endp
341
;-----------------------------------------------------------------------
; mmx2_memcpy(dest, src, n)
;
; Copies n bytes from src to dest. When n is at least 3Fh the bulk is
; moved in 64-byte blocks: loaded through mm0-mm7 and written with
; non-temporal movntq stores, after dest has been brought to an 8-byte
; boundary. Shorter requests and the bytes left after the last whole
; block are handed to _memcpy.
;
; In:    [ebp+arg0] = dest, [ebp+arg1] = src, [ebp+arg2] = n (stack args)
; Out:   eax = dest as passed in
; Saves: ebp, edi, esi
; Clobb: ecx, edx, mm0-mm7, flags; the block path ends with sfence + emms
;-----------------------------------------------------------------------
mmx2_memcpy proc near

        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     esi, [ebp+arg1]         ; esi = src cursor
        prefetchnta [esi]
        prefetchnta [esi+40h]
        prefetchnta [esi+80h]
        prefetchnta [esi+0C0h]
        prefetchnta [esi+100h]

        mov     edi, [ebp+arg0]         ; edi = dest cursor
        mov     edx, [ebp+arg2]         ; edx = bytes still to copy
        cmp     edx, 3Fh                ; MIN_LEN
        jae     mmx2_align_head
        jmp     mmx2_tail               ; short request: _memcpy does all of it

mmx2_align_head:

        mov     eax, edi
        and     eax, 7                  ; eax = dest offset within its 8-byte unit
        jz      mmx2_head_done
        mov     ecx, 8
        sub     ecx, eax                ; ecx = bytes up to the next 8-byte boundary
        sub     edx, ecx
        rep movsb                       ; advances esi and edi past the head

mmx2_head_done:

        mov     ecx, edx
        and     edx, 3Fh                ; edx = bytes left once the blocks are done
        shr     ecx, 6                  ; ecx = whole 64-byte blocks; ZF set when there are none
        jz      mmx2_flush

mmx2_block_loop:

        prefetchnta [esi+140h]
        movq    mm0, qword ptr [esi]
        movq    mm1, qword ptr [esi+8]
        movq    mm2, qword ptr [esi+10h]
        movq    mm3, qword ptr [esi+18h]
        movq    mm4, qword ptr [esi+20h]
        movq    mm5, qword ptr [esi+28h]
        movq    mm6, qword ptr [esi+30h]
        movq    mm7, qword ptr [esi+38h]
        movntq  qword ptr [edi], mm0
        movntq  qword ptr [edi+8], mm1
        movntq  qword ptr [edi+10h], mm2
        movntq  qword ptr [edi+18h], mm3
        movntq  qword ptr [edi+20h], mm4
        movntq  qword ptr [edi+28h], mm5
        movntq  qword ptr [edi+30h], mm6
        movntq  qword ptr [edi+38h], mm7
        add     esi, 40h
        add     edi, 40h
        dec     ecx
        jnz     mmx2_block_loop

mmx2_flush:

        sfence                          ; order the non-temporal stores before returning
        emms

mmx2_tail:

        test    edx, edx
        jz      mmx2_exit
        push    edx                     ; n    = remaining bytes
        push    esi                     ; src  = current source position
        push    edi                     ; dest = current destination position
        call    _memcpy
        add     esp, 0Ch

mmx2_exit:

        mov     eax, [ebp+arg0]         ; argument slot is never modified here
        pop     esi
        pop     edi
        pop     ebp

        ret
mmx2_memcpy endp
456
CODE32  ends                            ; closes the code segment that holds the copy routines

        END
Note: See TracBrowser for help on using the repository browser.