#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>

#ifdef HAVE_SYS_TIMES_H
#include <sys/times.h>
#endif

#ifdef TEST /* guard name reconstructed: these fallbacks belong to a standalone test build */
#define xprintf printf
extern void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);
#else
#define LOG_MODULE "memcpy"

#include <xine/xine_internal.h>
#include "../xine-engine/xine_private.h"
#endif

void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);
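/*
 * Every implementation below has plain memcpy() semantics (overlapping
 * buffers are not supported).  The engine selects one at startup and
 * publishes it through the xine_fast_memcpy pointer, so a caller uses it
 * exactly like memcpy(); e.g. (hypothetical buffer names, illustration only):
 *
 *   xine_fast_memcpy (frame->base[0], yuv_src, frame_size);
 */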
#if defined(ARCH_X86) || defined(ARCH_X86_64)

#define small_memcpy(to,from,n)\
{\
register uintptr_t dummy;\
__asm__ __volatile__(\
  "rep; movsb"\
  :"=&D"(to), "=&S"(from), "=&c"(dummy)\
  :"0" (to), "1" (from), "2" (n)\
  : "memory");\
}

/* linux kernel __memcpy (from: /include/asm/string.h) */
static __inline__ void * linux_kernel_memcpy_impl (
                               void * to,
                               const void * from,
                               size_t n)
{
  void *retval = to; /* small_memcpy() advances its pointer arguments */
  uintptr_t d0, d1, d2;

  if (n < 4) {
    small_memcpy (to, from, n);
  }
  else
    __asm__ __volatile__(
    "rep ; movsl\n\t"
    "testb $2,%b4\n\t"
    "je 1f\n\t"
    "movsw\n"
    "1:\ttestb $1,%b4\n\t"
    "je 2f\n\t"
    "movsb\n"
    "2:"
    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
    : "0" (n/4), "q" (n), "1" ((uintptr_t) to), "2" ((uintptr_t) from)
    : "memory");

  return retval;
}
#define AVX_MMREG_SIZE 32
#define SSE_MMREG_SIZE 16
#define MMX_MMREG_SIZE 8

#define MMX1_MIN_LEN 0x800 /* 2K blocks */
#define MIN_LEN      0x40  /* 64-byte blocks */

static void * sse_memcpy (void * to, const void * from, size_t len)
{
  void *retval = to;
  size_t i;
  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from));

  if (len >= MIN_LEN) {
    register uintptr_t delta;

    /* align destination to SSE_MMREG_SIZE boundary */
    delta = ((uintptr_t)to) & (SSE_MMREG_SIZE - 1);
    if (delta) {
      delta = SSE_MMREG_SIZE - delta;
      len -= delta;
      small_memcpy (to, from, delta);
    }
    i = len >> 6; /* len / 64 */
    len &= 63;

    if (((uintptr_t)from) & 15) {
      /* SRC is misaligned: unaligned loads via movups */
      for (; i > 0; i--) {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "movups (%0), %%xmm0\n"
        "movups 16(%0), %%xmm1\n"
        "movups 32(%0), %%xmm2\n"
        "movups 48(%0), %%xmm3\n"
        "movntps %%xmm0, (%1)\n"
        "movntps %%xmm1, 16(%1)\n"
        "movntps %%xmm2, 32(%1)\n"
        "movntps %%xmm3, 48(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to   = ((unsigned char *)to) + 64;
      }
    } else {
      /* SRC is 16-byte aligned: movaps is safe here (it faults on
         misaligned data, which is why the branch above exists) */
      for (; i > 0; i--) {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "movaps (%0), %%xmm0\n"
        "movaps 16(%0), %%xmm1\n"
        "movaps 32(%0), %%xmm2\n"
        "movaps 48(%0), %%xmm3\n"
        "movntps %%xmm0, (%1)\n"
        "movntps %%xmm1, 16(%1)\n"
        "movntps %%xmm2, 32(%1)\n"
        "movntps %%xmm3, 48(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to   = ((unsigned char *)to) + 64;
      }
    }
    /* movntps is weakly ordered; sfence restores ordering */
    __asm__ __volatile__ ("sfence":::"memory");
  }
  /* now do the tail of the block */
  if (len) linux_kernel_memcpy_impl (to, from, len);
  return retval;
}
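/*
 * avx_memcpy is the same scheme widened to 32-byte ymm registers and
 * 128-byte blocks.  The trailing vzeroupper matters: leaving the upper
 * ymm halves dirty would impose AVX/SSE transition penalties on any
 * legacy SSE code (e.g. in libc) that runs afterwards.
 */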
#ifdef HAVE_AVX
static void * avx_memcpy (void * to, const void * from, size_t len)
{
  void *retval = to;
  size_t i;
  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from));

  if (len >= MIN_LEN) {
    register uintptr_t delta;

    /* align destination to AVX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to) & (AVX_MMREG_SIZE - 1);
    if (delta) {
      delta = AVX_MMREG_SIZE - delta;
      len -= delta;
      small_memcpy (to, from, delta);
    }
    i = len >> 7; /* len / 128 */
    len &= 127;

    if (((uintptr_t)from) & 31) {
      /* SRC is misaligned: unaligned loads via vmovups */
      for (; i > 0; i--) {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "prefetchnta 384(%0)\n"
        "prefetchnta 416(%0)\n"
        "vmovups (%0), %%ymm0\n"
        "vmovups 32(%0), %%ymm1\n"
        "vmovups 64(%0), %%ymm2\n"
        "vmovups 96(%0), %%ymm3\n"
        "vmovntps %%ymm0, (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to   = ((unsigned char *)to) + 128;
      }
    } else {
      /* SRC is 32-byte aligned: aligned loads via vmovaps */
      for (; i > 0; i--) {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "prefetchnta 384(%0)\n"
        "prefetchnta 416(%0)\n"
        "vmovaps (%0), %%ymm0\n"
        "vmovaps 32(%0), %%ymm1\n"
        "vmovaps 64(%0), %%ymm2\n"
        "vmovaps 96(%0), %%ymm3\n"
        "vmovntps %%ymm0, (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to   = ((unsigned char *)to) + 128;
      }
    }
    /* vmovntps is weakly ordered; sfence restores ordering */
    __asm__ __volatile__ ("sfence":::"memory");
    /* clear the upper ymm halves before returning to non-AVX code */
    __asm__ __volatile__ ("vzeroupper");
  }
  /* now do the tail of the block */
  if (len) linux_kernel_memcpy_impl (to, from, len);
  return retval;
}
#endif /* HAVE_AVX */
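/*
 * mmx_memcpy predates prefetch and non-temporal stores: it is a plain
 * 64-bytes-per-iteration register copy.  With no prefetch to hide the
 * setup cost, it only pays off for large blocks, hence the bigger
 * MMX1_MIN_LEN threshold.
 */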
static void * mmx_memcpy (void * to, const void * from, size_t len)
{
  void *retval = to;
  size_t i;
  if (len >= MMX1_MIN_LEN) {
    register uintptr_t delta;

    /* align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to) & (MMX_MMREG_SIZE - 1);
    if (delta) {
      delta = MMX_MMREG_SIZE - delta;
      len -= delta;
      small_memcpy (to, from, delta);
    }
    i = len >> 6; /* len / 64 */
    len &= 63;

    for (; i > 0; i--) {
      __asm__ __volatile__ (
      "movq (%0), %%mm0\n"
      "movq 8(%0), %%mm1\n"
      "movq 16(%0), %%mm2\n"
      "movq 24(%0), %%mm3\n"
      "movq 32(%0), %%mm4\n"
      "movq 40(%0), %%mm5\n"
      "movq 48(%0), %%mm6\n"
      "movq 56(%0), %%mm7\n"
      "movq %%mm0, (%1)\n"
      "movq %%mm1, 8(%1)\n"
      "movq %%mm2, 16(%1)\n"
      "movq %%mm3, 24(%1)\n"
      "movq %%mm4, 32(%1)\n"
      "movq %%mm5, 40(%1)\n"
      "movq %%mm6, 48(%1)\n"
      "movq %%mm7, 56(%1)\n"
      :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to   = ((unsigned char *)to) + 64;
    }
    /* release the MMX/FPU register state */
    __asm__ __volatile__ ("emms":::"memory");
  }
  /* now do the tail of the block */
  if (len) linux_kernel_memcpy_impl (to, from, len);
  return retval;
}
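/*
 * mmx2_memcpy (extended MMX, as found on SSE1-era CPUs) adds prefetchnta
 * and the non-temporal movntq store.  Because movntq is weakly ordered it
 * needs a closing sfence, and like all MMX code an emms to release the FPU.
 */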
static void * mmx2_memcpy (void * to, const void * from, size_t len)
{
  void *retval = to;
  size_t i;
  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from));

  if (len >= MIN_LEN) {
    register uintptr_t delta;

    /* align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to) & (MMX_MMREG_SIZE - 1);
    if (delta) {
      delta = MMX_MMREG_SIZE - delta;
      len -= delta;
      small_memcpy (to, from, delta);
    }
    i = len >> 6; /* len / 64 */
    len &= 63;

    for (; i > 0; i--) {
      __asm__ __volatile__ (
      "prefetchnta 320(%0)\n"
      "prefetchnta 352(%0)\n"
      "movq (%0), %%mm0\n"
      "movq 8(%0), %%mm1\n"
      "movq 16(%0), %%mm2\n"
      "movq 24(%0), %%mm3\n"
      "movq 32(%0), %%mm4\n"
      "movq 40(%0), %%mm5\n"
      "movq 48(%0), %%mm6\n"
      "movq 56(%0), %%mm7\n"
      "movntq %%mm0, (%1)\n"
      "movntq %%mm1, 8(%1)\n"
      "movntq %%mm2, 16(%1)\n"
      "movntq %%mm3, 24(%1)\n"
      "movntq %%mm4, 32(%1)\n"
      "movntq %%mm5, 40(%1)\n"
      "movntq %%mm6, 48(%1)\n"
      "movntq %%mm7, 56(%1)\n"
      :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to   = ((unsigned char *)to) + 64;
    }
    /* movntq is weakly ordered; sfence restores ordering */
    __asm__ __volatile__ ("sfence":::"memory");
    /* release the MMX/FPU register state */
    __asm__ __volatile__ ("emms":::"memory");
  }
  /* now do the tail of the block */
  if (len) linux_kernel_memcpy_impl (to, from, len);
  return retval;
}
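/*
 * The inline helper cannot be placed in the dispatch table directly, so
 * wrap it in a real function with the common signature.
 */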
static void *linux_kernel_memcpy (void *to, const void *from, size_t len)
{
  return linux_kernel_memcpy_impl (to, from, len);
}
#endif /* ARCH_X86 || ARCH_X86_64 */
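/*
 * Dispatch table.  Index 0 is a dummy so that indices line up with the
 * "memcpy_method" config enum, where 0 means "probe at startup"; the
 * all-empty entry at the end terminates iteration.  cpu_require holds
 * the xine_mm_accel() feature bits a method needs to run.
 */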
static const struct {
  const char name[16];
  void *(* const function)(void *to, const void *from, size_t len);
  uint32_t cpu_require;
} memcpy_method[] =
{
  { "", NULL, 0 },
  { "libc", memcpy, 0 },
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
  { "linux kernel", linux_kernel_memcpy, 0 },
  { "MMX ", mmx_memcpy, MM_MMX },
  { "MMXEXT", mmx2_memcpy, MM_MMXEXT },
  { "SSE", sse_memcpy, MM_MMXEXT|MM_SSE },
# ifdef HAVE_AVX
  { "AVX", avx_memcpy, MM_ACCEL_X86_AVX },
# endif
#endif
  { "", NULL, 0 }
};

static uint64_t memcpy_timing[sizeof(memcpy_method)/sizeof(memcpy_method[0])] = { 0, };
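/*
 * Benchmark clock: prefer a per-thread POSIX clock, fall back to the CPU
 * timestamp counter on x86, and to times()/clock() elsewhere.  Only
 * relative readings matter for ranking, so the unit is irrelevant.
 */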
#ifdef HAVE_POSIX_TIMERS

/* not every platform defines the per-thread clock; fall back to a monotonic one */
# ifndef CLOCK_THREAD_CPUTIME_ID
#  define CLOCK_THREAD_CPUTIME_ID CLOCK_MONOTONIC
# endif

static int64_t _x_gettime(void)
{
  struct timespec tm;
  return (clock_gettime (CLOCK_THREAD_CPUTIME_ID, &tm) == -1)
       ? times (NULL)
       : (int64_t)tm.tv_sec * INT64_C(1000000000) + tm.tv_nsec;
}
# define rdtsc(x) _x_gettime()

#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)

static int64_t rdtsc(int config_flags)
{
  /* rdtsc arrived alongside MMX; checking MM_MMX avoids the opcode on old CPUs */
  if (config_flags & MM_MMX) {
    int64_t x;
# ifdef ARCH_X86_64
    /* the "=A" constraint does not span rdx:rax on x86-64; combine halves manually */
    uint32_t lo, hi;
    __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
    x = ((int64_t)hi << 32) | lo;
# else
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); /* rdtsc */
# endif
    return x;
  } else {
    return times (NULL);
  }
}

#else

static uint64_t rdtsc(int config_flags)
{
  (void)config_flags;
  /* no timestamp counter; process times are coarse but sufficient for ranking */
#ifdef HAVE_SYS_TIMES_H
  struct tms tp;
  return times (&tp);
#else
  return clock();
#endif /* HAVE_SYS_TIMES_H */
}
#endif
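/*
 * Config callback: fired whenever "engine.performance.memcpy_method"
 * changes.  A concrete method that this CPU supports is installed
 * immediately; the "probe" setting (0) defers the choice to startup.
 */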
static void update_fast_memcpy(void *user_data, xine_cfg_entry_t *entry) {
  static int config_flags = -1;
  xine_t *xine = (xine_t *) user_data;
  int method;

  config_flags = xine_mm_accel();
  method = entry->num_value;

  if (method != 0
      && (config_flags & memcpy_method[method].cpu_require) ==
         memcpy_method[method].cpu_require) {
    lprintf ("using %s memcpy()\n", memcpy_method[method].name);
    xine_fast_memcpy = memcpy_method[method].function;
  } else {
    xprintf (xine, XINE_VERBOSITY_DEBUG, "xine: will probe memcpy on startup\n");
  }
}
#define BUFSIZE (1024*1024)

void xine_probe_fast_memcpy (xine_t *xine)
{
  uint64_t           t;
  char              *buf1, *buf2;
  int                i, j, best;
  int                config_flags = -1;
  static const char *const memcpy_methods[] = {
    "probe", "libc",
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
    "kernel", "mmx", "mmxext", "sse",
# ifdef HAVE_AVX
    "avx",
# endif
#endif
    NULL
  };

  config_flags = xine_mm_accel();

  best = xine->config->register_enum (xine->config, "engine.performance.memcpy_method", 0,
                                      (char **)memcpy_methods,
                                      _("memcopy method used by xine"),
                                      _("The copying of large memory blocks is one of the most "
                                        "expensive operations on today's computers. Therefore xine "
                                        "provides various tuned methods to do this copying. "
                                        "Usually, the best method is detected automatically."),
                                      20, update_fast_memcpy, (void *) xine);

  /* an explicit user choice that this machine supports wins over probing */
  if (best != 0 &&
      (config_flags & memcpy_method[best].cpu_require) ==
       memcpy_method[best].cpu_require) {
    xine_fast_memcpy = memcpy_method[best].function;
    return;
  }

  best = 0;
  xine_fast_memcpy = memcpy; /* safe default while probing */

  if ((buf1 = malloc (BUFSIZE)) == NULL)
    return;

  if ((buf2 = malloc (BUFSIZE)) == NULL) {
    free (buf1);
    return;
  }

  xprintf (xine, XINE_VERBOSITY_LOG, _("Benchmarking memcpy methods (smaller is better):\n"));

  /* fault in both buffers so the timings below measure copying, not paging */
  memset (buf1, 0, BUFSIZE);
  memset (buf2, 0, BUFSIZE);

  for (i = 1; memcpy_method[i].name[0]; i++) {
    /* skip methods this CPU cannot execute */
    if ((config_flags & memcpy_method[i].cpu_require) !=
        memcpy_method[i].cpu_require)
      continue;

    t = rdtsc (config_flags);
    for (j = 0; j < 50; j++) {
      memcpy_method[i].function (buf2, buf1, BUFSIZE);
      memcpy_method[i].function (buf1, buf2, BUFSIZE);
    }
    t = rdtsc (config_flags) - t;
    memcpy_timing[i] = t;

    xprintf (xine, XINE_VERBOSITY_LOG, "\t%s : %" PRIu64 "\n", memcpy_method[i].name, t);

    if (best == 0 || t < memcpy_timing[best])
      best = i;
  }

  /* update_num() fires update_fast_memcpy(), which installs the winner */
  xine->config->update_num (xine->config, "engine.performance.memcpy_method", best);

  free (buf1);
  free (buf2);
}