/*
 * idct for sh4
 *
 * Copyright (c) 2001-2003 BERO <[email protected]>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 |
|
---|
21 | #include "../dsputil.h"
|
---|
/* Cosine constants, local to the two table definitions below. */
#define c1      1.38703984532214752434  /* sqrt(2)*cos(1*pi/16) */
#define c2      1.30656296487637657577  /* sqrt(2)*cos(2*pi/16) */
#define c3      1.17587560241935884520  /* sqrt(2)*cos(3*pi/16) */
#define c4      1.00000000000000000000  /* sqrt(2)*cos(4*pi/16) */
#define c5      0.78569495838710234903  /* sqrt(2)*cos(5*pi/16) */
#define c6      0.54119610014619712324  /* sqrt(2)*cos(6*pi/16) */
#define c7      0.27589937928294311353  /* sqrt(2)*cos(7*pi/16) */

/*
 * 4x4 matrix applied to the even-indexed coefficients (0, 2, 4, 6).
 *
 * Layout: each group of four values is one matrix column, i.e. the weights
 * with which one input coefficient contributes to the four outputs (this is
 * how the generic ftrv_() helper in this file indexes the matrix).
 * The 8-byte alignment is kept from the original declaration; the SH-4
 * load_matrix() macro reads the table two floats at a time.
 */
static const float even_table[16] __attribute__ ((aligned(8))) = {
         c4, c4, c4, c4,
         c2, c6,-c6,-c2,
         c4,-c4,-c4, c4,
         c6,-c2, c2,-c6
};

/*
 * 4x4 matrix applied to the odd-indexed coefficients (1, 3, 5, 7).
 * Same column-group layout and alignment as even_table.
 */
static const float odd_table[16] __attribute__ ((aligned(8))) = {
         c1, c3, c5, c7,
         c3,-c7,-c1,-c5,
         c5,-c1, c7, c3,
         c7,-c5, c3,-c1
};

#undef  c1
#undef  c2
#undef  c3
#undef  c4
#undef  c5
#undef  c6
#undef  c7
51 |
|
---|
52 | #if defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
|
---|
53 |
|
---|
/*
 * Load the 16 floats at `table` into the SH-4 extended register bank
 * (xd0..xd14), which the ftrv instruction uses as its 4x4 matrix (xmtrx).
 *
 * The two fschg instructions bracket the loads so that each fmov moves a
 * register pair; the transfer-size mode is toggled back afterwards.
 *
 * The asm post-increments its address register (@%0+), so the pointer is
 * copied into a local and passed as a read-write ("+r") operand.  Passing
 * `table` as an input-only operand would let the compiler assume the
 * register still holds the original address after the asm.
 */
#define load_matrix(table) \
    do { \
        const float *load_matrix_src_ = (table); \
        __asm__ volatile( \
        "       fschg\n" \
        "       fmov    @%0+,xd0\n" \
        "       fmov    @%0+,xd2\n" \
        "       fmov    @%0+,xd4\n" \
        "       fmov    @%0+,xd6\n" \
        "       fmov    @%0+,xd8\n" \
        "       fmov    @%0+,xd10\n" \
        "       fmov    @%0+,xd12\n" \
        "       fmov    @%0+,xd14\n" \
        "       fschg\n" \
        : "+r"(load_matrix_src_) \
        ); \
    } while (0)
70 |
|
---|
/*
 * Multiply the vector held in fr0..fr3 (fv0) by the matrix previously loaded
 * with load_matrix(), leaving the result in fr0..fr3.
 *
 * Each of the four variables is both an output ("=f") and, through the
 * matching-number constraints "0".."3", an input in the same register.
 *
 * The expansion deliberately has no trailing semicolon: the caller writes
 * `ftrv();`, so the macro behaves as exactly one statement.
 */
#define ftrv() \
        __asm__ volatile("ftrv xmtrx,fv0" \
        : "=f"(fr0),"=f"(fr1),"=f"(fr2),"=f"(fr3) \
        :  "0"(fr0), "1"(fr1), "2"(fr2), "3"(fr3) )

/*
 * Declare the working variables fr0..fr3 bound to the hardware registers of
 * the same name, so that ftrv() operates directly on them.
 * Must appear in the declaration part of a function using ftrv().
 */
#define DEFREG \
        register float fr0 __asm__("fr0"); \
        register float fr1 __asm__("fr1"); \
        register float fr2 __asm__("fr2"); \
        register float fr3 __asm__("fr3")
81 |
|
---|
82 | #else
|
---|
83 |
|
---|
84 | /* generic C code for check */
|
---|
85 |
|
---|
/*
 * Portable stand-in for the SH-4 ftrv instruction: fv = M * fv.
 *
 * xf holds the 4x4 matrix M with element (row r, column c) at xf[r + 4*c];
 * fv is the 4-element vector, overwritten with the product.
 * The input vector is copied first so the in-place update never reads a
 * value it has already replaced.
 */
static void ftrv_(const float xf[],float fv[])
{
        const float in[4] = { fv[0], fv[1], fv[2], fv[3] };
        int row;

        for (row = 0; row < 4; row++) {
                fv[row] = xf[row     ]*in[0] + xf[row +  4]*in[1]
                        + xf[row +  8]*in[2] + xf[row + 12]*in[3];
        }
}
98 |
|
---|
/*
 * Portable stand-in for the SH-4 matrix load: copy the 16 floats of
 * `table` into the working matrix `xf`.
 */
static void load_matrix_(float xf[],const float table[])
{
        const float *src = table;
        float *dst = xf;
        float *const end = xf + 16;

        while (dst != end)
                *dst++ = *src++;
}
104 |
|
---|
/*
 * Generic build: map the SH-4 primitives onto the C helpers above.
 * Both macros rely on the locals `xf` and `fv` that DEFREG declares.
 */
#define load_matrix(table)      load_matrix_(xf,table)
#define ftrv()                  ftrv_(xf,fv)

/* Working storage: fv is the 4-element vector, xf the 4x4 matrix. */
#define DEFREG  \
        float fv[4],xf[16]

/* The "registers" fr0..fr3 are simply the elements of fv. */
#define fr0     fv[0]
#define fr1     fv[1]
#define fr2     fv[2]
#define fr3     fv[3]
115 |
|
---|
116 | #endif
|
---|
117 |
|
---|
/*
 * DESCALE(x,n): scale x down by 2^n.
 *
 * Active variant: floating-point multiply by 1/2^n; any rounding or
 * truncation happens only when the caller converts the result.
 * Disabled variant: integer shift with rounding.
 *
 * The whole expansion and every argument are parenthesized so the macro
 * can be used inside larger expressions (e.g. `a / DESCALE(x,n)`) without
 * precedence surprises.
 */
#if 1
#define         DESCALE(x,n)    ((x)*(1.0f/(1<<(n))))
#else
#define         DESCALE(x,n)    (((int)(x)+(1<<((n)-1)))>>(n))
#endif
123 |
|
---|
/* This code works worse when compiled with GCC from CVS; GCC 3.2.3 works fine. */
125 |
|
---|
126 |
|
---|
127 | #if 1
|
---|
128 | //optimized
|
---|
129 |
|
---|
130 | void idct_sh4(DCTELEM *block)
|
---|
131 | {
|
---|
132 | DEFREG;
|
---|
133 |
|
---|
134 | int i;
|
---|
135 | float tblock[8*8],*fblock;
|
---|
136 | int ofs1,ofs2,ofs3;
|
---|
137 |
|
---|
138 | #if defined(__SH4__)
|
---|
139 | #error "FIXME!! change to single float"
|
---|
140 | #endif
|
---|
141 |
|
---|
142 | /* row */
|
---|
143 |
|
---|
144 | /* even part */
|
---|
145 | load_matrix(even_table);
|
---|
146 |
|
---|
147 | fblock = tblock+4;
|
---|
148 | i = 8;
|
---|
149 | do {
|
---|
150 | fr0 = block[0];
|
---|
151 | fr1 = block[2];
|
---|
152 | fr2 = block[4];
|
---|
153 | fr3 = block[6];
|
---|
154 | block+=8;
|
---|
155 | ftrv();
|
---|
156 | *--fblock = fr3;
|
---|
157 | *--fblock = fr2;
|
---|
158 | *--fblock = fr1;
|
---|
159 | *--fblock = fr0;
|
---|
160 | fblock+=8+4;
|
---|
161 | } while(--i);
|
---|
162 | block-=8*8;
|
---|
163 | fblock-=8*8+4;
|
---|
164 |
|
---|
165 | load_matrix(odd_table);
|
---|
166 |
|
---|
167 | i = 8;
|
---|
168 |
|
---|
169 | // ofs1 = sizeof(float)*1;
|
---|
170 | // ofs2 = sizeof(float)*2;
|
---|
171 | // ofs3 = sizeof(float)*3;
|
---|
172 |
|
---|
173 | do {
|
---|
174 | float t0,t1,t2,t3;
|
---|
175 | fr0 = block[1];
|
---|
176 | fr1 = block[3];
|
---|
177 | fr2 = block[5];
|
---|
178 | fr3 = block[7];
|
---|
179 | block+=8;
|
---|
180 | ftrv();
|
---|
181 | t0 = *fblock++;
|
---|
182 | t1 = *fblock++;
|
---|
183 | t2 = *fblock++;
|
---|
184 | t3 = *fblock++;
|
---|
185 | fblock+=4;
|
---|
186 | *--fblock = t0 - fr0;
|
---|
187 | *--fblock = t1 - fr1;
|
---|
188 | *--fblock = t2 - fr2;
|
---|
189 | *--fblock = t3 - fr3;
|
---|
190 | *--fblock = t3 + fr3;
|
---|
191 | *--fblock = t2 + fr2;
|
---|
192 | *--fblock = t1 + fr1;
|
---|
193 | *--fblock = t0 + fr0;
|
---|
194 | fblock+=8;
|
---|
195 | } while(--i);
|
---|
196 | block-=8*8;
|
---|
197 | fblock-=8*8;
|
---|
198 |
|
---|
199 | /* col */
|
---|
200 |
|
---|
201 | /* even part */
|
---|
202 | load_matrix(even_table);
|
---|
203 |
|
---|
204 | ofs1 = sizeof(float)*2*8;
|
---|
205 | ofs2 = sizeof(float)*4*8;
|
---|
206 | ofs3 = sizeof(float)*6*8;
|
---|
207 |
|
---|
208 | i = 8;
|
---|
209 |
|
---|
210 | #define OA(fblock,ofs) *(float*)((char*)fblock + ofs)
|
---|
211 |
|
---|
212 | do {
|
---|
213 | fr0 = OA(fblock, 0);
|
---|
214 | fr1 = OA(fblock,ofs1);
|
---|
215 | fr2 = OA(fblock,ofs2);
|
---|
216 | fr3 = OA(fblock,ofs3);
|
---|
217 | ftrv();
|
---|
218 | OA(fblock,0 ) = fr0;
|
---|
219 | OA(fblock,ofs1) = fr1;
|
---|
220 | OA(fblock,ofs2) = fr2;
|
---|
221 | OA(fblock,ofs3) = fr3;
|
---|
222 | fblock++;
|
---|
223 | } while(--i);
|
---|
224 | fblock-=8;
|
---|
225 |
|
---|
226 | load_matrix(odd_table);
|
---|
227 |
|
---|
228 | i=8;
|
---|
229 | do {
|
---|
230 | float t0,t1,t2,t3;
|
---|
231 | t0 = OA(fblock, 0); /* [8*0] */
|
---|
232 | t1 = OA(fblock,ofs1); /* [8*2] */
|
---|
233 | t2 = OA(fblock,ofs2); /* [8*4] */
|
---|
234 | t3 = OA(fblock,ofs3); /* [8*6] */
|
---|
235 | fblock+=8;
|
---|
236 | fr0 = OA(fblock, 0); /* [8*1] */
|
---|
237 | fr1 = OA(fblock,ofs1); /* [8*3] */
|
---|
238 | fr2 = OA(fblock,ofs2); /* [8*5] */
|
---|
239 | fr3 = OA(fblock,ofs3); /* [8*7] */
|
---|
240 | fblock+=-8+1;
|
---|
241 | ftrv();
|
---|
242 | block[8*0] = DESCALE(t0 + fr0,3);
|
---|
243 | block[8*7] = DESCALE(t0 - fr0,3);
|
---|
244 | block[8*1] = DESCALE(t1 + fr1,3);
|
---|
245 | block[8*6] = DESCALE(t1 - fr1,3);
|
---|
246 | block[8*2] = DESCALE(t2 + fr2,3);
|
---|
247 | block[8*5] = DESCALE(t2 - fr2,3);
|
---|
248 | block[8*3] = DESCALE(t3 + fr3,3);
|
---|
249 | block[8*4] = DESCALE(t3 - fr3,3);
|
---|
250 | block++;
|
---|
251 | } while(--i);
|
---|
252 |
|
---|
253 | #if defined(__SH4__)
|
---|
254 | #error "FIXME!! change to double"
|
---|
255 | #endif
|
---|
256 | }
|
---|
257 | #else
|
---|
258 | void idct_sh4(DCTELEM *block)
|
---|
259 | {
|
---|
260 | DEFREG;
|
---|
261 |
|
---|
262 | int i;
|
---|
263 | float tblock[8*8],*fblock;
|
---|
264 |
|
---|
265 | /* row */
|
---|
266 |
|
---|
267 | /* even part */
|
---|
268 | load_matrix(even_table);
|
---|
269 |
|
---|
270 | fblock = tblock;
|
---|
271 | i = 8;
|
---|
272 | do {
|
---|
273 | fr0 = block[0];
|
---|
274 | fr1 = block[2];
|
---|
275 | fr2 = block[4];
|
---|
276 | fr3 = block[6];
|
---|
277 | block+=8;
|
---|
278 | ftrv();
|
---|
279 | fblock[0] = fr0;
|
---|
280 | fblock[2] = fr1;
|
---|
281 | fblock[4] = fr2;
|
---|
282 | fblock[6] = fr3;
|
---|
283 | fblock+=8;
|
---|
284 | } while(--i);
|
---|
285 | block-=8*8;
|
---|
286 | fblock-=8*8;
|
---|
287 |
|
---|
288 | load_matrix(odd_table);
|
---|
289 |
|
---|
290 | i = 8;
|
---|
291 |
|
---|
292 | do {
|
---|
293 | float t0,t1,t2,t3;
|
---|
294 | fr0 = block[1];
|
---|
295 | fr1 = block[3];
|
---|
296 | fr2 = block[5];
|
---|
297 | fr3 = block[7];
|
---|
298 | block+=8;
|
---|
299 | ftrv();
|
---|
300 | t0 = fblock[0];
|
---|
301 | t1 = fblock[2];
|
---|
302 | t2 = fblock[4];
|
---|
303 | t3 = fblock[6];
|
---|
304 | fblock[0] = t0 + fr0;
|
---|
305 | fblock[7] = t0 - fr0;
|
---|
306 | fblock[1] = t1 + fr1;
|
---|
307 | fblock[6] = t1 - fr1;
|
---|
308 | fblock[2] = t2 + fr2;
|
---|
309 | fblock[5] = t2 - fr2;
|
---|
310 | fblock[3] = t3 + fr3;
|
---|
311 | fblock[4] = t3 - fr3;
|
---|
312 | fblock+=8;
|
---|
313 | } while(--i);
|
---|
314 | block-=8*8;
|
---|
315 | fblock-=8*8;
|
---|
316 |
|
---|
317 | /* col */
|
---|
318 |
|
---|
319 | /* even part */
|
---|
320 | load_matrix(even_table);
|
---|
321 |
|
---|
322 | i = 8;
|
---|
323 |
|
---|
324 | do {
|
---|
325 | fr0 = fblock[8*0];
|
---|
326 | fr1 = fblock[8*2];
|
---|
327 | fr2 = fblock[8*4];
|
---|
328 | fr3 = fblock[8*6];
|
---|
329 | ftrv();
|
---|
330 | fblock[8*0] = fr0;
|
---|
331 | fblock[8*2] = fr1;
|
---|
332 | fblock[8*4] = fr2;
|
---|
333 | fblock[8*6] = fr3;
|
---|
334 | fblock++;
|
---|
335 | } while(--i);
|
---|
336 | fblock-=8;
|
---|
337 |
|
---|
338 | load_matrix(odd_table);
|
---|
339 |
|
---|
340 | i=8;
|
---|
341 | do {
|
---|
342 | float t0,t1,t2,t3;
|
---|
343 | fr0 = fblock[8*1];
|
---|
344 | fr1 = fblock[8*3];
|
---|
345 | fr2 = fblock[8*5];
|
---|
346 | fr3 = fblock[8*7];
|
---|
347 | ftrv();
|
---|
348 | t0 = fblock[8*0];
|
---|
349 | t1 = fblock[8*2];
|
---|
350 | t2 = fblock[8*4];
|
---|
351 | t3 = fblock[8*6];
|
---|
352 | fblock++;
|
---|
353 | block[8*0] = DESCALE(t0 + fr0,3);
|
---|
354 | block[8*7] = DESCALE(t0 - fr0,3);
|
---|
355 | block[8*1] = DESCALE(t1 + fr1,3);
|
---|
356 | block[8*6] = DESCALE(t1 - fr1,3);
|
---|
357 | block[8*2] = DESCALE(t2 + fr2,3);
|
---|
358 | block[8*5] = DESCALE(t2 - fr2,3);
|
---|
359 | block[8*3] = DESCALE(t3 + fr3,3);
|
---|
360 | block[8*4] = DESCALE(t3 - fr3,3);
|
---|
361 | block++;
|
---|
362 | } while(--i);
|
---|
363 | }
|
---|
364 | #endif
|
---|