1 | #ifndef MMX_X64_H_INCLUDED
|
---|
2 | #define MMX_X64_H_INCLUDED
|
---|
3 |
|
---|
4 | /* Implementation of x64 MMX substitution functions, before
|
---|
5 | * pixman is reimplemented not to use __m64 type on Visual C++
|
---|
6 | *
|
---|
7 | * Copyright (C)2009 by George Yohng
|
---|
8 | * Released in public domain.
|
---|
9 | */
|
---|
10 |
|
---|
11 | #include <intrin.h>
|
---|
12 |
|
---|
/* M64C(a): read the storage of the lvalue `a` through a const __m64 pointer.
 * `a` must be addressable, since the macro takes its address. */
#define M64C(a) (*(const __m64 *)(&(a)))

/* M64U(a): read the storage of the lvalue `a` through a const
 * unsigned long long pointer.  `a` must be addressable. */
#define M64U(a) (*(const unsigned long long *)(&(a)))
15 |
|
---|
16 | __inline __m64
|
---|
17 | _m_from_int (int a)
|
---|
18 | {
|
---|
19 | long long i64 = a;
|
---|
20 |
|
---|
21 | return M64C (i64);
|
---|
22 | }
|
---|
23 |
|
---|
24 | __inline __m64
|
---|
25 | _mm_setzero_si64 ()
|
---|
26 | {
|
---|
27 | long long i64 = 0;
|
---|
28 |
|
---|
29 | return M64C (i64);
|
---|
30 | }
|
---|
31 |
|
---|
32 | __inline __m64
|
---|
33 | _mm_set_pi32 (int i1, int i0)
|
---|
34 | {
|
---|
35 | unsigned long long i64 = ((unsigned)i0) + (((unsigned long long)(unsigned)i1) << 32);
|
---|
36 |
|
---|
37 | return M64C (i64);
|
---|
38 | }
|
---|
39 |
|
---|
/* Stand-in for _m_empty.  Takes no arguments, returns nothing and
 * performs no work. */
__inline void
_m_empty ()
{
    /* intentionally a no-op */
}
44 |
|
---|
45 | __inline __m64
|
---|
46 | _mm_set1_pi16 (short w)
|
---|
47 | {
|
---|
48 | unsigned long long i64 = ((unsigned long long)(unsigned short)(w)) * 0x0001000100010001ULL;
|
---|
49 |
|
---|
50 | return M64C (i64);
|
---|
51 | }
|
---|
52 |
|
---|
53 | __inline int
|
---|
54 | _m_to_int (__m64 m)
|
---|
55 | {
|
---|
56 | return m.m64_i32[0];
|
---|
57 | }
|
---|
58 |
|
---|
59 | __inline __m64
|
---|
60 | _mm_movepi64_pi64 (__m128i a)
|
---|
61 | {
|
---|
62 | return M64C (a.m128i_i64[0]);
|
---|
63 | }
|
---|
64 |
|
---|
65 | __inline __m64
|
---|
66 | _m_pand (__m64 a, __m64 b)
|
---|
67 | {
|
---|
68 | unsigned long long i64 = M64U (a) & M64U (b);
|
---|
69 |
|
---|
70 | return M64C (i64);
|
---|
71 | }
|
---|
72 |
|
---|
73 | __inline __m64
|
---|
74 | _m_por (__m64 a, __m64 b)
|
---|
75 | {
|
---|
76 | unsigned long long i64 = M64U (a) | M64U (b);
|
---|
77 |
|
---|
78 | return M64C (i64);
|
---|
79 | }
|
---|
80 |
|
---|
81 | __inline __m64
|
---|
82 | _m_pxor (__m64 a, __m64 b)
|
---|
83 | {
|
---|
84 | unsigned long long i64 = M64U (a) ^ M64U (b);
|
---|
85 |
|
---|
86 | return M64C (i64);
|
---|
87 | }
|
---|
88 |
|
---|
89 | __inline __m64
|
---|
90 | _m_pmulhuw (__m64 a, __m64 b) /* unoptimized */
|
---|
91 | {
|
---|
92 | unsigned short d[4] =
|
---|
93 | {
|
---|
94 | (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]) >> 16),
|
---|
95 | (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]) >> 16),
|
---|
96 | (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]) >> 16),
|
---|
97 | (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]) >> 16)
|
---|
98 | };
|
---|
99 |
|
---|
100 | return M64C (d[0]);
|
---|
101 | }
|
---|
102 |
|
---|
103 | __inline __m64
|
---|
104 | _m_pmullw2 (__m64 a, __m64 b) /* unoptimized */
|
---|
105 | {
|
---|
106 | unsigned short d[4] =
|
---|
107 | {
|
---|
108 | (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])),
|
---|
109 | (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1])),
|
---|
110 | (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2])),
|
---|
111 | (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))
|
---|
112 | };
|
---|
113 |
|
---|
114 | return M64C (d[0]);
|
---|
115 | }
|
---|
116 |
|
---|
117 | __inline __m64
|
---|
118 | _m_pmullw (__m64 a, __m64 b) /* unoptimized */
|
---|
119 | {
|
---|
120 | unsigned long long x =
|
---|
121 | ((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]))) +
|
---|
122 | (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]))) << 16) +
|
---|
123 | (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]))) << 32) +
|
---|
124 | (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))) << 48);
|
---|
125 |
|
---|
126 | return M64C (x);
|
---|
127 | }
|
---|
128 |
|
---|
129 | __inline __m64
|
---|
130 | _m_paddusb (__m64 a, __m64 b) /* unoptimized */
|
---|
131 | {
|
---|
132 | unsigned long long x = (M64U (a) & 0x00FF00FF00FF00FFULL) +
|
---|
133 | (M64U (b) & 0x00FF00FF00FF00FFULL);
|
---|
134 |
|
---|
135 | unsigned long long y = ((M64U (a) >> 8) & 0x00FF00FF00FF00FFULL) +
|
---|
136 | ((M64U (b) >> 8) & 0x00FF00FF00FF00FFULL);
|
---|
137 |
|
---|
138 | x |= ((x & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
|
---|
139 | y |= ((y & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
|
---|
140 |
|
---|
141 | x = (x & 0x00FF00FF00FF00FFULL) | ((y & 0x00FF00FF00FF00FFULL) << 8);
|
---|
142 |
|
---|
143 | return M64C (x);
|
---|
144 | }
|
---|
145 |
|
---|
146 | __inline __m64
|
---|
147 | _m_paddusw (__m64 a, __m64 b) /* unoptimized */
|
---|
148 | {
|
---|
149 | unsigned long long x = (M64U (a) & 0x0000FFFF0000FFFFULL) +
|
---|
150 | (M64U (b) & 0x0000FFFF0000FFFFULL);
|
---|
151 |
|
---|
152 | unsigned long long y = ((M64U (a) >> 16) & 0x0000FFFF0000FFFFULL) +
|
---|
153 | ((M64U (b) >> 16) & 0x0000FFFF0000FFFFULL);
|
---|
154 |
|
---|
155 | x |= ((x & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF;
|
---|
156 | y |= ((y & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF;
|
---|
157 |
|
---|
158 | x = (x & 0x0000FFFF0000FFFFULL) | ((y & 0x0000FFFF0000FFFFULL) << 16);
|
---|
159 |
|
---|
160 | return M64C (x);
|
---|
161 | }
|
---|
162 |
|
---|
163 | __inline __m64
|
---|
164 | _m_pshufw (__m64 a, int n) /* unoptimized */
|
---|
165 | {
|
---|
166 | unsigned short d[4] =
|
---|
167 | {
|
---|
168 | a.m64_u16[n & 3],
|
---|
169 | a.m64_u16[(n >> 2) & 3],
|
---|
170 | a.m64_u16[(n >> 4) & 3],
|
---|
171 | a.m64_u16[(n >> 6) & 3]
|
---|
172 | };
|
---|
173 |
|
---|
174 | return M64C (d[0]);
|
---|
175 | }
|
---|
176 |
|
---|
/* Clamps one 16-bit word to an unsigned byte, for use when packing words
 * into bytes.
 *
 * The word is interpreted as a signed 16-bit quantity, which is how the
 * PACKUSWB operation is documented to treat its source words:
 *   - sign bit set (0x8000..0xFFFF, i.e. negative) -> 0
 *   - above 0xFF                                   -> 0xFF
 *   - otherwise                                    -> the value itself
 */
__inline unsigned char
sat16 (unsigned short d)
{
    /* Test the sign bit directly instead of converting to short. */
    if (d & 0x8000)
        return 0;

    if (d > 0xFF)
        return 0xFF;

    return (unsigned char)d;
}
183 |
|
---|
184 | __inline __m64
|
---|
185 | _m_packuswb (__m64 m1, __m64 m2) /* unoptimized */
|
---|
186 | {
|
---|
187 | unsigned char d[8] =
|
---|
188 | {
|
---|
189 | sat16 (m1.m64_u16[0]),
|
---|
190 | sat16 (m1.m64_u16[1]),
|
---|
191 | sat16 (m1.m64_u16[2]),
|
---|
192 | sat16 (m1.m64_u16[3]),
|
---|
193 | sat16 (m2.m64_u16[0]),
|
---|
194 | sat16 (m2.m64_u16[1]),
|
---|
195 | sat16 (m2.m64_u16[2]),
|
---|
196 | sat16 (m2.m64_u16[3])
|
---|
197 | };
|
---|
198 |
|
---|
199 | return M64C (d[0]);
|
---|
200 | }
|
---|
201 |
|
---|
202 | __inline __m64 _m_punpcklbw (__m64 m1, __m64 m2) /* unoptimized */
|
---|
203 | {
|
---|
204 | unsigned char d[8] =
|
---|
205 | {
|
---|
206 | m1.m64_u8[0],
|
---|
207 | m2.m64_u8[0],
|
---|
208 | m1.m64_u8[1],
|
---|
209 | m2.m64_u8[1],
|
---|
210 | m1.m64_u8[2],
|
---|
211 | m2.m64_u8[2],
|
---|
212 | m1.m64_u8[3],
|
---|
213 | m2.m64_u8[3],
|
---|
214 | };
|
---|
215 |
|
---|
216 | return M64C (d[0]);
|
---|
217 | }
|
---|
218 |
|
---|
219 | __inline __m64 _m_punpckhbw (__m64 m1, __m64 m2) /* unoptimized */
|
---|
220 | {
|
---|
221 | unsigned char d[8] =
|
---|
222 | {
|
---|
223 | m1.m64_u8[4],
|
---|
224 | m2.m64_u8[4],
|
---|
225 | m1.m64_u8[5],
|
---|
226 | m2.m64_u8[5],
|
---|
227 | m1.m64_u8[6],
|
---|
228 | m2.m64_u8[6],
|
---|
229 | m1.m64_u8[7],
|
---|
230 | m2.m64_u8[7],
|
---|
231 | };
|
---|
232 |
|
---|
233 | return M64C (d[0]);
|
---|
234 | }
|
---|
235 |
|
---|
236 | __inline __m64 _m_psrlwi (__m64 a, int n) /* unoptimized */
|
---|
237 | {
|
---|
238 | unsigned short d[4] =
|
---|
239 | {
|
---|
240 | a.m64_u16[0] >> n,
|
---|
241 | a.m64_u16[1] >> n,
|
---|
242 | a.m64_u16[2] >> n,
|
---|
243 | a.m64_u16[3] >> n
|
---|
244 | };
|
---|
245 |
|
---|
246 | return M64C (d[0]);
|
---|
247 | }
|
---|
248 |
|
---|
249 | __inline __m64 _m_psrlqi (__m64 m, int n)
|
---|
250 | {
|
---|
251 | unsigned long long x = M64U (m) >> n;
|
---|
252 |
|
---|
253 | return M64C (x);
|
---|
254 | }
|
---|
255 |
|
---|
256 | __inline __m64 _m_psllqi (__m64 m, int n)
|
---|
257 | {
|
---|
258 | unsigned long long x = M64U (m) << n;
|
---|
259 |
|
---|
260 | return M64C (x);
|
---|
261 | }
|
---|
262 |
|
---|
263 | #endif /* MMX_X64_H_INCLUDED */
|
---|