/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <[email protected]>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

/*
  AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
  to preserve proper dst alignment.
*/
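/*
  gmc1 is a bilinear interpolation between four neighbouring source
  pixels, with 1/16-pel weights derived from the fractional offsets
  (x16, y16):
    A = (16-x16)*(16-y16)   weight of src[0]        (top-left)
    B = (   x16)*(16-y16)   weight of src[1]        (top-right)
    C = (16-x16)*(   y16)   weight of src[stride]   (bottom-left)
    D = (   x16)*(   y16)   weight of src[stride+1] (bottom-right)
  Since A+B+C+D == 16*16 == 256, the weighted sum plus rounder is
  brought back to 8 bits by the final >>8.
*/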
#define GMC1_PERF_COND (h==8)
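/*
  A sketch of typical use, assuming the usual dsputil wiring (the exact
  init function depends on the tree): the PPC init code points the
  DSPContext gmc1 hook at this routine, e.g.
    c->gmc1 = gmc1_altivec;
  and the MPEG-4 decoder then calls it once per 8x8 GMC block.
*/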
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, int stride, int h, int x16, int y16, int rounder)
{
    POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
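    /* Plain C reference implementation, kept for validation and
       benchmarking against the AltiVec path below. */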
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
    const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
    {
        (16-x16)*(16-y16), /* A */
        (   x16)*(16-y16), /* B */
        (16-x16)*(   y16), /* C */
        (   x16)*(   y16), /* D */
        0, 0, 0, 0         /* padding */
    };
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

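    // load the four 16-bit weights and replicate each one across
    // all eight lanes of its own vector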
    tempA = vec_ld(0, (unsigned short*)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_ld(0, (unsigned short*)rounder_a);

    // we'll be able to pick up our 9 char elements at src
    // from those 32 bytes. We load the first batch here, as
    // inside the loop we can reuse 'src+stride' from one
    // iteration as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F)
    {
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    }
    else
    {
        // if src & 0xF == 0xF, then (src+1) is 16-byte aligned:
        // the second vector already holds exactly the bytes we need,
        // and vec_lvsl(1, src) would wrap back to a shift of 0.
        srcvB = src_1;
    }
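    // merging with the zero vector zero-extends each of the first
    // 8 unsigned chars into a 16-bit lane (PPC is big-endian, so the
    // zero byte lands in the high half), ready for vec_mladd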
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++)
    {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

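        // load the current dst vector: the store below writes 16 bytes
        // but we only produce 8, so the other half must be preserved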
        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements at src + stride
        // from those 32 bytes, then reuse the resulting 2 vectors
        // srcvC and srcvD as the next srcvA and srcvB
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F)
        {
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        }
        else
        {
            // same special case as above: (src + stride) is 15 bytes
            // into its 16-byte block, so the second vector is already
            // the one we want
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        // OK, now we (finally) do the math :-)
        // these four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice?
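        // each vec_mladd computes lane-wise src*weight + accumulator in
        // 16 bits; the lanes cannot overflow since A+B+C+D == 256 and
        // the samples are 8-bit (255*256 == 65280, assuming rounder < 256)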
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

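        // merge the 8 result bytes into the loaded dst vector: since
        // stride is a multiple of 8, dst is either at offset 0 or 8
        // within its 16-byte block, so dst_odd selects which half
        // of dstv survives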
        if (dst_odd)
        {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        }
        else
        {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}