/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <[email protected]>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

/*
  AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
  to preserve proper dst alignment.
*/
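/*
  gmc1 is a bilinear interpolation between four neighbouring source
  pixels, with 1/16-pel weights derived from the fractional offsets
  (x16, y16):
    A = (16-x16)*(16-y16)   weight of src[0]        (top-left)
    B = (   x16)*(16-y16)   weight of src[1]        (top-right)
    C = (16-x16)*(   y16)   weight of src[stride]   (bottom-left)
    D = (   x16)*(   y16)   weight of src[stride+1] (bottom-right)
  Since A+B+C+D == 16*16 == 256, the weighted sum plus rounder is
  brought back to 8 bits by the final >>8.
*/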
#define GMC1_PERF_COND (h==8)
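/*
  A sketch of typical use, assuming the usual dsputil wiring (the exact
  init function depends on the tree): the PPC init code points the
  DSPContext gmc1 hook at this routine, e.g.
    c->gmc1 = gmc1_altivec;
  and the MPEG-4 decoder then calls it once per 8x8 GMC block.
*/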
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, int stride, int h, int x16, int y16, int rounder)
{
    POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
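    /* Plain C reference implementation, kept for validation and
       benchmarking against the AltiVec path below. */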
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
    const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
    {
        (16-x16)*(16-y16), /* A */
        (   x16)*(16-y16), /* B */
        (16-x16)*(   y16), /* C */
        (   x16)*(   y16), /* D */
        0, 0, 0, 0         /* padding */
    };
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

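    // load the four 16-bit weights and replicate each one across
    // all eight lanes of its own vector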
    tempA = vec_ld(0, (unsigned short*)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_ld(0, (unsigned short*)rounder_a);

    // we'll be able to pick up our 9 char elements at src
    // from those 32 bytes. We load the first batch here, as
    // inside the loop we can reuse 'src+stride' from one
    // iteration as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F)
    {
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    }
    else
    {
        // if src & 0xF == 0xF, then (src+1) is 16-byte aligned:
        // the second vector already holds exactly the bytes we need,
        // and vec_lvsl(1, src) would wrap back to a shift of 0.
        srcvB = src_1;
    }
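    // merging with the zero vector zero-extends each of the first
    // 8 unsigned chars into a 16-bit lane (PPC is big-endian, so the
    // zero byte lands in the high half), ready for vec_mladd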
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++)
    {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

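        // load the current dst vector: the store below writes 16 bytes
        // but we only produce 8, so the other half must be preserved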
        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements at src + stride
        // from those 32 bytes, then reuse the resulting 2 vectors
        // srcvC and srcvD as the next srcvA and srcvB
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F)
        {
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        }
        else
        {
            // same special case as above: (src + stride) is 15 bytes
            // into its 16-byte block, so the second vector is already
            // the one we want
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        // OK, now we (finally) do the math :-)
        // these four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice?
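        // each vec_mladd computes lane-wise src*weight + accumulator in
        // 16 bits; the lanes cannot overflow since A+B+C+D == 256 and
        // the samples are 8-bit (255*256 == 65280, assuming rounder < 256)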
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

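        // merge the 8 result bytes into the loaded dst vector: since
        // stride is a multiple of 8, dst is either at offset 0 or 8
        // within its 16-byte block, so dst_odd selects which half
        // of dstv survives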
        if (dst_odd)
        {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        }
        else
        {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}