/*
 * Copyright (C) 2004 Thomas Hellström, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "via_driver.h"
#include "compiler.h"


#define BSIZ 2048  /* size of /proc/cpuinfo buffer */
#define BSIZW 720  /* typical copy width (YUV420) */
#define BSIZA 736  /* multiple of 32 bytes */
#define BSIZH 576  /* typical copy height */
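/*
 * The benchmark below simulates one YUV420 frame copy: BSIZA is BSIZW
 * rounded up to a multiple of 32 (720 -> 736), so the source holds
 * testSize = 576 * (720 + 720/2) = 622,080 bytes (the Y plane plus two
 * half-size chroma planes) and the destination
 * alignSize = 576 * (736 + 736/2) = 635,904 bytes.
 */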

#define SSE_PREFETCH "  prefetchnta "
#define FENCE __asm__ __volatile__ ("sfence":::"memory");
#define FENCEMMS __asm__ __volatile__ ("\t"		\
				       "sfence\n\t"	\
				       "emms\n\t"	\
				       :::"memory");
#define FEMMS __asm__ __volatile__("femms":::"memory");
#define EMMS __asm__ __volatile__("emms":::"memory");

#define NOW_PREFETCH "  prefetch "


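/*
 * The copy loops prefetch 320-352 bytes ahead of the read pointer.
 * PREFETCH1 primes that window (ten 32-byte lines at offsets 0-288)
 * before a copy starts; PREFETCH2 and PREFETCH3 keep it primed around
 * each row's small_memcpy tail, which does not prefetch on its own.
 */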
#define PREFETCH1(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  "1:  " arch_prefetch "(%0)\n"	\
			  arch_prefetch "32(%0)\n"	\
			  arch_prefetch "64(%0)\n"	\
			  arch_prefetch "96(%0)\n"	\
			  arch_prefetch "128(%0)\n"	\
			  arch_prefetch "160(%0)\n"	\
			  arch_prefetch "192(%0)\n"	\
			  arch_prefetch "224(%0)\n"	\
			  arch_prefetch "256(%0)\n"	\
			  arch_prefetch "288(%0)\n"	\
			  "2:\n"			\
			  : : "r" (from) );

#define PREFETCH2(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "320(%0)\n"	\
			  : : "r" (from) );
#define PREFETCH3(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "288(%0)\n"	\
			  : : "r" (from) );


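/*
 * Copy the sub-64-byte remainder of a row (rest = w & 63): n/4 dwords
 * with rep movsl, then an optional word and byte.  "to" and "from" are
 * left advanced past the copied bytes.
 */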
#define small_memcpy(to, from, n)					\
    {									\
	__asm__ __volatile__(						\
			     "movl %2,%%ecx\n\t"			\
			     "sarl $2,%%ecx\n\t"			\
			     "rep ; movsl\n\t"				\
			     "testb $2,%b2\n\t"				\
			     "je 1f\n\t"				\
			     "movsw\n"					\
			     "1:\ttestb $1,%b2\n\t"			\
			     "je 2f\n\t"				\
			     "movsb\n"					\
			     "2:"					\
			     :"=&D" (to), "=&S" (from)			\
			     :"q" (n),"0" ((long) to),"1" ((long) from) \
			     : "%ecx","memory");			\
    }


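/*
 * Copy lcnt 64-byte chunks with SSE.  Stores are non-temporal
 * (movntps), which bypasses the cache but requires a 16-byte-aligned
 * destination; source alignment is tested at run time so the aligned
 * case can load with movaps instead of the slower movups.  "prefetch"
 * is a prefetch mnemonic, or "#" to comment the prefetches out.
 */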
#define SSE_CPY(prefetch, from, to, dummy, lcnt)			\
    if ((unsigned long) from & 15) {					\
	__asm__ __volatile__ (						\
			      "1:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movups (%1), %%xmm0\n"			\
			      "  movups 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
                              prefetch "352(%1)\n"			\
			      "  movups 32(%1), %%xmm2\n"		\
			      "  movups 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 1b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    } else {								\
	__asm__ __volatile__ (						\
			      "2:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movaps (%1), %%xmm0\n"			\
			      "  movaps 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
			      prefetch "352(%1)\n"			\
			      "  movaps 32(%1), %%xmm2\n"		\
			      "  movaps 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 2b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    }

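/*
 * Plain MMX copy of lcnt 64-byte chunks using cached movq stores, for
 * CPUs without non-temporal (movntq) stores, such as plain MMX and
 * 3DNow! parts.
 */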
#define MMX_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "2:  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movq %%mm0, (%0)\n"			\
			  "  movq %%mm1, 8(%0)\n"			\
			  "  movq %%mm2, 16(%0)\n"			\
			  "  movq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movq %%mm0, 32(%0)\n"			\
			  "  movq %%mm1, 40(%0)\n"			\
			  "  movq %%mm2, 48(%0)\n"			\
			  "  movq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");

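/*
 * MMX copy of lcnt 64-byte chunks using movntq non-temporal stores,
 * available with the AMD MMX extensions and with SSE.
 */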
#define MMXEXT_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  ".p2align 4,,7\n"				\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movntq %%mm0, (%0)\n"			\
			  "  movntq %%mm1, 8(%0)\n"			\
			  "  movntq %%mm2, 16(%0)\n"			\
			  "  movntq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movntq %%mm0, 32(%0)\n"			\
			  "  movntq %%mm1, 40(%0)\n"			\
			  "  movntq %%mm2, 48(%0)\n"			\
			  "  movntq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");


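/*
 * Generate a prefetching YUV42X copy function.  "itype" selects the
 * inner copy macro and "ptype" the prefetch mnemonic, so that, for
 * example,
 *
 *     PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
 *
 * defines sse_YUV42X(), which copies with SSE_CPY/prefetchnta and ends
 * with an sfence.  "count" is the number of row-loop passes: two for
 * planar YUV420 (the Y plane, then the stacked U/V rows at half
 * width), one for packed YUV422, and zero when the destination pitch
 * equals the width and the whole frame is one contiguous run.  The
 * last row (or the contiguous run) is handled by the code after the
 * loop, which copies the final five 64-byte chunks without prefetching
 * so that no prefetch reads past the end of the source buffer.
 */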
#define PREFETCH_FUNC(prefix, itype, ptype, begin, fence)		\
									\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	PREFETCH1(ptype##_PREFETCH, from);				\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
		count = 0;						\
	    } else {							\
		h -= 1;							\
		count = 1;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 0;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	while (count--) {						\
	    hc = h;							\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    dadd = dstPitch - w;					\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
		}							\
		if (rest) {						\
		    PREFETCH2(ptype##_PREFETCH, from);			\
		    small_memcpy(to, from, rest);			\
		    PREFETCH3(ptype##_PREFETCH, from);			\
		}							\
		to += dadd;						\
	    }								\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	    h -= 1;							\
	}								\
	if (lcnt > 5) {							\
	    lcnt -= 5;							\
	    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt);	\
	    lcnt = 5;							\
	}								\
	if (lcnt) {							\
	    itype##_CPY("#", from, to, dummy, lcnt);			\
	}								\
	if (rest) small_memcpy(to, from, rest);				\
	fence;								\
    }

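/*
 * As PREFETCH_FUNC, but with the prefetches commented out by passing
 * "#" to the inner copy macro; with no prefetching there is no risk of
 * reading past the source buffer, so the last row needs no special
 * handling.
 */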
#define NOPREFETCH_FUNC(prefix, itype, begin, fence)			\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
									\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    count = 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 1;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	while (count--) {						\
	    hc = h;							\
	    dadd = dstPitch - w;					\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY("#", from, to, dummy, lcnt);		\
		}							\
		if (rest) small_memcpy(to, from, rest);			\
		to += dadd;						\
	    }								\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	}								\
	fence;								\
    }


static void
libc_YUV42X(unsigned char *dst, const unsigned char *src,
            int dstPitch, int w, int h, int yuv422)
{
    if (yuv422)
        w <<= 1;
    if (dstPitch == w) {
        int size = h * ((yuv422) ? w : (w + (w >> 1)));

        memcpy(dst, src, size);
        return;
    } else {
        int count;

        /* Copy Y component to video memory. */
        count = h;
        while (count--) {
            memcpy(dst, src, w);
            src += w;
            dst += dstPitch;
        }

        /* UV component is 1/2 of Y. */
        if (!yuv422) {
            w >>= 1;
            dstPitch >>= 1;

            /* Copy V(Cr),U(Cb) components to video memory. */
            count = h;
            while (count--) {
                memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
        }
    }
}

#ifdef __i386__

/* Linux kernel __memcpy. */
static __inline void *
__memcpy(void *to, const void *from, size_t n)
{
    int d1, d2, d3;

    __asm__ __volatile__(
                         "rep ; movsl\n\t"
                         "testb $2,%b4\n\t"
                         "je 1f\n\t"
                         "movsw\n"
                         "1:\ttestb $1,%b4\n\t"
                         "je 2f\n\t"
                         "movsb\n"
                         "2:"
                         :"=&c"(d1), "=&D"(d2), "=&S"(d3)
                         :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from)
                         :"memory");

    return (to);
}


static void
kernel_YUV42X(unsigned char *dst, const unsigned char *src,
              int dstPitch, int w, int h, int yuv422)
{
    if (yuv422)
        w <<= 1;
    if (dstPitch == w) {
        int size = h * ((yuv422) ? w : (w + (w >> 1)));

        __memcpy(dst, src, size);
        return;
    } else {
        int count;

        /* Copy Y component to video memory. */
        count = h;
        while (count--) {
            __memcpy(dst, src, w);
            src += w;
            dst += dstPitch;
        }

        /* UV component is 1/2 of Y. */
        if (!yuv422) {

            w >>= 1;
            dstPitch >>= 1;

            /* Copy V(Cr),U(Cb) components to video memory. */
            count = h;
            while (count--) {
                __memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
        }
    }
}

PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
PREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS)
PREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS)
NOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS)

static void *
kernel_memcpy(void *to, const void *from, size_t len)
{
    return __memcpy(to, from, len);
}

/*
 * Read the CPU time-stamp counter.  CPUID is executed first as a
 * serializing instruction so out-of-order execution does not skew the
 * reading; ".byte 0x0f, 0x31" is the RDTSC opcode, spelled out for old
 * assemblers.  EBX is saved around CPUID because it may be the PIC
 * register.
 */
static unsigned
fastrdtsc(void)
{
    unsigned eax;

    __asm__ volatile ("\t"
                      "pushl %%ebx\n\t"
                      "cpuid\n\t"
                      ".byte 0x0f, 0x31\n\t"
                      "popl %%ebx\n"
                      :"=a" (eax)
                      :"0"(0)
                      :"ecx", "edx", "cc");

    return eax;
}


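/*
 * Time one simulated YUV420 frame copy (BSIZW x BSIZH into a
 * BSIZA-pitch destination) in TSC ticks.  If the counter wraps between
 * the two reads, the elapsed count is 2^32 - (t - t2), written as
 * 0xFFFFFFFFU - (t - t2 - 1) to keep the arithmetic unsigned.
 */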
static unsigned
time_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2)
{
    unsigned t, t2;

    t = fastrdtsc();

    (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0);

    t2 = fastrdtsc();
    return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
}

enum
{ libc = 0, kernel, sse, mmx, now, mmxext, totNum };

typedef struct
{
    vidCopyFunc mFunc;
    char *mName, **cpuFlag;
} McFuncData;

static char *libc_cpuflags[] = { " ", 0 };
static char *kernel_cpuflags[] = { " ", 0 };
static char *sse_cpuflags[] = { " sse ", 0 };
static char *mmx_cpuflags[] = { " mmx ", 0 };
static char *now_cpuflags[] = { " 3dnow ", 0 };
static char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 };

static McFuncData mcFunctions[totNum] = {
{libc_YUV42X, "libc", libc_cpuflags},
{kernel_YUV42X, "kernel", kernel_cpuflags},
{sse_YUV42X, "SSE", sse_cpuflags},
{mmx_YUV42X, "MMX", mmx_cpuflags},
{now_YUV42X, "3DNow!", now_cpuflags},
{mmxext_YUV42X, "MMX2", mmx2_cpuflags}
};


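/*
 * Return nonzero if "flag" occurs within every processor section of
 * the (newline-stripped) /proc/cpuinfo text.  Each section starts with
 * "processor\t:"; a flag found only after the next such marker belongs
 * to a different CPU and must not count for the current one.
 */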
static int
flagValid(const char *cpuinfo, char *flag)
{
    const char *flagLoc, *nextProc;
    int located = 0;

    while ((cpuinfo = strstr(cpuinfo, "processor\t:"))) {
        located = 1;
        cpuinfo += 11;
        if ((flagLoc = strstr(cpuinfo, flag))) {
            if ((nextProc = strstr(cpuinfo, "processor\t:"))) {
                if (nextProc < flagLoc)
                    return 0;
            }
        } else {
            return 0;
        }
    }
    return located;
}


static int
cpuValid(const char *cpuinfo, char **flags)
{
    for (; *flags != 0; flags++) {
        if (flagValid(cpuinfo, *flags))
            return 1;
    }
    return 0;
}

/*
 * Benchmark the video copy routines and choose the fastest.
 */
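/*
 * A sketch of typical use, with hypothetical variable names: the
 * returned pointer is stored once at Xv setup time and called once per
 * frame:
 *
 *     vidCopyFunc copyFunc = viaVidCopyInit("video", pScreen);
 *     ...
 *     (*copyFunc)(dst, src, dstPitch, width, height, isYUV422);
 */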
vidCopyFunc
viaVidCopyInit(char *copyType, ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);

    char buf[BSIZ];
    unsigned char *buf1, *buf2, *buf3;
    char *tmpBuf, *endBuf;
    int count, j, bestSoFar;
    unsigned best, tmp, testSize, alignSize, tmp2;
    struct buffer_object *tmpFbBuffer;
    McFuncData *curData;
    FILE *cpuInfoFile;
    double cpuFreq;

    if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) {
        return libc_YUV42X;
    }
    count = fread(buf, 1, BSIZ, cpuInfoFile);
    if (ferror(cpuInfoFile)) {
        fclose(cpuInfoFile);
        return libc_YUV42X;
    }
    fclose(cpuInfoFile);
    if (BSIZ == count) {
        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
                   "\"/proc/cpuinfo\" file too long. "
                   "Using Linux kernel memcpy.\n");
        return kernel_YUV42X;
    }
    buf[count] = 0;

    while (count--)
        if ('\n' == buf[count])
            buf[count] = ' ';

    /* Extract the CPU frequency. */
    cpuFreq = 0.;
    if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) {
        if (NULL != (tmpBuf = strstr(tmpBuf, ":"))) {
            tmpBuf += 1;        /* Skip the ':' before the number. */
            cpuFreq = strtod(tmpBuf, &endBuf);
            if (endBuf == tmpBuf)
                tmpBuf = NULL;
        }
    }

    alignSize = BSIZH * (BSIZA + (BSIZA >> 1));
    testSize = BSIZH * (BSIZW + (BSIZW >> 1));
    /*
     * Allocate an area of offscreen FB memory, (buf1), a simulated video
     * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
     */
    tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_FLAG_VRAM);
    if (!tmpFbBuffer)
        return libc_YUV42X;
    if (NULL == (buf2 = (unsigned char *)malloc(testSize))) {
        drm_bo_free(pScrn, tmpFbBuffer);
        return libc_YUV42X;
    }
    if (NULL == (buf3 = (unsigned char *)malloc(testSize))) {
        free(buf2);
        drm_bo_free(pScrn, tmpFbBuffer);
        return libc_YUV42X;
    }
    buf1 = drm_bo_map(pScrn, tmpFbBuffer);
    if (NULL == buf1) {
        free(buf3);
        free(buf2);
        drm_bo_free(pScrn, tmpFbBuffer);
        return libc_YUV42X;
    }
    bestSoFar = 0;
    best = 0xFFFFFFFFU;

    /* Touch buf1 and buf2 once so both are paged in and mapped before timing. */
    libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
               "Benchmarking %s copy.  Less time is better.\n", copyType);
    for (j = 0; j < totNum; ++j) {
        curData = mcFunctions + j;

        if (cpuValid(buf, curData->cpuFlag)) {

            /* Simulate setup of the video buffer. */
            kernel_memcpy(buf2, buf3, testSize);

            /* Copy the video buffer to frame-buffer memory. */
            tmp = time_function(curData->mFunc, buf1, buf2);

            /* Do it again to avoid context-switch effects. */
            kernel_memcpy(buf2, buf3, testSize);
            tmp2 = time_function(curData->mFunc, buf1, buf2);
            tmp = (tmp2 < tmp) ? tmp2 : tmp;

            if (NULL == tmpBuf) {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                           "Timed %6s YUV420 copy... %u.\n",
                           curData->mName, tmp);
            } else {
                /* MiB/s = testSize / ((tmp / (cpuFreq * 1e6)) * 2^20). */
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                           "Timed %6s YUV420 copy... %u. "
                           "Throughput: %.1f MiB/s.\n",
                           curData->mName, tmp,
                           cpuFreq * 1.e6 * (double)testSize /
                           ((double)(tmp) * (double)(0x100000)));
            }
            if (tmp < best) {
                best = tmp;
                bestSoFar = j;
            }
        } else {
            xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                       "Ditching %6s YUV420 copy. Not supported by CPU.\n",
                       curData->mName);
        }
    }
    free(buf3);
    free(buf2);
    drm_bo_unmap(pScrn, tmpFbBuffer);
    drm_bo_free(pScrn, tmpFbBuffer);
    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
               "Using %s YUV42X copy for %s.\n",
               mcFunctions[bestSoFar].mName, copyType);
    return mcFunctions[bestSoFar].mFunc;
}

#else

vidCopyFunc
viaVidCopyInit(char *copyType, ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
               "Using default xfree86 memcpy for video.\n");
    return libc_YUV42X;
}

#endif /* __i386__ */