VirtualBox

source: vbox/trunk/src/recompiler/target-i386/ops_sse.h@ 36140

最後變更 在這個檔案從36140是 36140,由 vboxsync 提交於 14 年 前

rem: Re-synced to svn://svn.savannah.nongnu.org/qemu/trunk@5495 (repo UUID c046a42c-6fe2-441c-8c8c-71466251a162).

  • 屬性 svn:eol-style 設為 native
檔案大小: 58.1 KB
 
1/*
2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 *
4 * Copyright (c) 2005 Fabrice Bellard
5 * Copyright (c) 2008 Intel Corporation <[email protected]>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22/*
23 * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice
24 * other than GPL or LGPL is available it will apply instead, Oracle elects to use only
25 * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where
26 * a choice of LGPL license versions is made available with the language indicating
27 * that LGPLv2 or any later version may be used, or where a choice of which version
28 * of the LGPL is applied is otherwise unspecified.
29 */
30
/*
 * Width selection.  SHIFT == 0 maps the generic names below onto the MMX
 * register type and accessors; any other value (the rest of this file tests
 * for SHIFT == 1) maps them onto the XMM ones.  SUFFIX is what glue()
 * appends to the helper names defined further down.
 */
#if SHIFT == 0
#define Reg MMXReg
/* MMX build: code wrapped in XMM_ONLY() is dropped entirely. */
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
/* The MMX register is addressed through its single member q; n is ignored. */
#define Q(n) q
#define SUFFIX _mmx
#else
#define Reg XMMReg
/* XMM build: code wrapped in XMM_ONLY() is emitted unchanged. */
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) XMM_B(n)
#define W(n) XMM_W(n)
#define L(n) XMM_L(n)
#define Q(n) XMM_Q(n)
#define SUFFIX _xmm
#endif
48
49void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
50{
51 int shift;
52
53 if (s->Q(0) > 15) {
54 d->Q(0) = 0;
55#if SHIFT == 1
56 d->Q(1) = 0;
57#endif
58 } else {
59 shift = s->B(0);
60 d->W(0) >>= shift;
61 d->W(1) >>= shift;
62 d->W(2) >>= shift;
63 d->W(3) >>= shift;
64#if SHIFT == 1
65 d->W(4) >>= shift;
66 d->W(5) >>= shift;
67 d->W(6) >>= shift;
68 d->W(7) >>= shift;
69#endif
70 }
71 FORCE_RET();
72}
73
74void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
75{
76 int shift;
77
78 if (s->Q(0) > 15) {
79 shift = 15;
80 } else {
81 shift = s->B(0);
82 }
83 d->W(0) = (int16_t)d->W(0) >> shift;
84 d->W(1) = (int16_t)d->W(1) >> shift;
85 d->W(2) = (int16_t)d->W(2) >> shift;
86 d->W(3) = (int16_t)d->W(3) >> shift;
87#if SHIFT == 1
88 d->W(4) = (int16_t)d->W(4) >> shift;
89 d->W(5) = (int16_t)d->W(5) >> shift;
90 d->W(6) = (int16_t)d->W(6) >> shift;
91 d->W(7) = (int16_t)d->W(7) >> shift;
92#endif
93}
94
95void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
96{
97 int shift;
98
99 if (s->Q(0) > 15) {
100 d->Q(0) = 0;
101#if SHIFT == 1
102 d->Q(1) = 0;
103#endif
104 } else {
105 shift = s->B(0);
106 d->W(0) <<= shift;
107 d->W(1) <<= shift;
108 d->W(2) <<= shift;
109 d->W(3) <<= shift;
110#if SHIFT == 1
111 d->W(4) <<= shift;
112 d->W(5) <<= shift;
113 d->W(6) <<= shift;
114 d->W(7) <<= shift;
115#endif
116 }
117 FORCE_RET();
118}
119
120void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
121{
122 int shift;
123
124 if (s->Q(0) > 31) {
125 d->Q(0) = 0;
126#if SHIFT == 1
127 d->Q(1) = 0;
128#endif
129 } else {
130 shift = s->B(0);
131 d->L(0) >>= shift;
132 d->L(1) >>= shift;
133#if SHIFT == 1
134 d->L(2) >>= shift;
135 d->L(3) >>= shift;
136#endif
137 }
138 FORCE_RET();
139}
140
141void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
142{
143 int shift;
144
145 if (s->Q(0) > 31) {
146 shift = 31;
147 } else {
148 shift = s->B(0);
149 }
150 d->L(0) = (int32_t)d->L(0) >> shift;
151 d->L(1) = (int32_t)d->L(1) >> shift;
152#if SHIFT == 1
153 d->L(2) = (int32_t)d->L(2) >> shift;
154 d->L(3) = (int32_t)d->L(3) >> shift;
155#endif
156}
157
158void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
159{
160 int shift;
161
162 if (s->Q(0) > 31) {
163 d->Q(0) = 0;
164#if SHIFT == 1
165 d->Q(1) = 0;
166#endif
167 } else {
168 shift = s->B(0);
169 d->L(0) <<= shift;
170 d->L(1) <<= shift;
171#if SHIFT == 1
172 d->L(2) <<= shift;
173 d->L(3) <<= shift;
174#endif
175 }
176 FORCE_RET();
177}
178
179void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
180{
181 int shift;
182
183 if (s->Q(0) > 63) {
184 d->Q(0) = 0;
185#if SHIFT == 1
186 d->Q(1) = 0;
187#endif
188 } else {
189 shift = s->B(0);
190 d->Q(0) >>= shift;
191#if SHIFT == 1
192 d->Q(1) >>= shift;
193#endif
194 }
195 FORCE_RET();
196}
197
198void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
199{
200 int shift;
201
202 if (s->Q(0) > 63) {
203 d->Q(0) = 0;
204#if SHIFT == 1
205 d->Q(1) = 0;
206#endif
207 } else {
208 shift = s->B(0);
209 d->Q(0) <<= shift;
210#if SHIFT == 1
211 d->Q(1) <<= shift;
212#endif
213 }
214 FORCE_RET();
215}
216
#if SHIFT == 1
/*
 * Shift the whole 16-byte register *d right by a number of bytes, filling
 * with zeros from the top.  The byte count is read from s->L(0); anything
 * above 16 is treated as 16 (the register ends up all zero).
 *
 * The count is clamped while it is still unsigned.  Copying it straight
 * into a signed int would turn values >= 2^31 into negative numbers that
 * bypass the "> 16" test and index outside the register.
 * NOTE(review): L() lanes appear to be unsigned 32-bit (other helpers cast
 * them to int32_t for signed use) - confirm against the register typedefs.
 */
void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
{
    uint32_t count;
    int shift, i;

    count = s->L(0);
    shift = (count > 16) ? 16 : (int)count;
    for (i = 0; i < 16 - shift; i++) {
        d->B(i) = d->B(i + shift);
    }
    for (i = 16 - shift; i < 16; i++) {
        d->B(i) = 0;
    }
    FORCE_RET();
}

/*
 * Shift the whole 16-byte register *d left by a number of bytes, filling
 * with zeros from the bottom.  The byte count is read from s->L(0) and
 * clamped to 16 while still unsigned (see helper_psrldq for the reason).
 */
void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
{
    uint32_t count;
    int shift, i;

    count = s->L(0);
    shift = (count > 16) ? 16 : (int)count;
    /* Walk downwards so that source bytes are read before being overwritten. */
    for (i = 15; i >= shift; i--) {
        d->B(i) = d->B(i - shift);
    }
    for (i = 0; i < shift; i++) {
        d->B(i) = 0;
    }
    FORCE_RET();
}
#endif
246
/*
 * Generators for lane-wise two-operand helpers.  Each macro expands to
 *     void <name><SUFFIX>(Reg *d, Reg *s)
 * which replaces every lane of *d with F(lane of *d, lane of *s).  The lane
 * count is derived from SHIFT, so the XMM build processes twice as many
 * lanes as the MMX build.
 */

/* 8-bit lanes. */
#define SSE_HELPER_B(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (8 << SHIFT); lane++) {\
        d->B(lane) = F(d->B(lane), s->B(lane));\
    }\
}

/* 16-bit lanes. */
#define SSE_HELPER_W(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (4 << SHIFT); lane++) {\
        d->W(lane) = F(d->W(lane), s->W(lane));\
    }\
}

/* 32-bit lanes. */
#define SSE_HELPER_L(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (2 << SHIFT); lane++) {\
        d->L(lane) = F(d->L(lane), s->L(lane));\
    }\
}

/* 64-bit lanes (a single one in the MMX build, where Q() ignores its index). */
#define SSE_HELPER_Q(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (1 << SHIFT); lane++) {\
        d->Q(lane) = F(d->Q(lane), s->Q(lane));\
    }\
}
304
305#if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    return (x < 0) ? 0 : ((x > 255) ? 255 : x);
}

/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    return (x < 0) ? 0 : ((x > 65535) ? 65535 : x);
}

/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    return (x < -128) ? -128 : ((x > 127) ? 127 : x);
}

/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    return (x < -32768) ? -32768 : ((x > 32767) ? 32767 : x);
}
345
/*
 * Per-lane operations plugged into the SSE_HELPER_* generators as "F".
 * Every expansion is fully parenthesised so that it stays a single
 * expression wherever it is used.
 */

/* Wrapping add, and saturating variants (unsigned / signed, byte / word). */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping subtract, and saturating variants. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/* Minimum / maximum: operands compared as given (UB) or as int16_t (SW). */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic; FANDN inverts the first (destination) operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Compares: -1 (all bits set once stored in the lane) when true, else 0. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two 16-bit
 * operands promoted to int can overflow (0xFFFF * 0xFFFF), which is
 * undefined for signed arithmetic.  The low and high 16 bits of the
 * product are unchanged by doing the multiplication unsigned.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) (((int16_t)(a) * (int16_t)(b)) >> 16)

/* Average rounded up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
378#endif
379
/* Wrapping lane-wise addition for 8-, 16-, 32- and 64-bit lanes. */
SSE_HELPER_B(helper_paddb, FADD)
SSE_HELPER_W(helper_paddw, FADD)
SSE_HELPER_L(helper_paddl, FADD)
SSE_HELPER_Q(helper_paddq, FADD)

/* Wrapping lane-wise subtraction (destination lane minus source lane). */
SSE_HELPER_B(helper_psubb, FSUB)
SSE_HELPER_W(helper_psubw, FSUB)
SSE_HELPER_L(helper_psubl, FSUB)
SSE_HELPER_Q(helper_psubq, FSUB)

/* Saturating byte add/subtract: "us" clamps with satub, "s" with satsb. */
SSE_HELPER_B(helper_paddusb, FADDUB)
SSE_HELPER_B(helper_paddsb, FADDSB)
SSE_HELPER_B(helper_psubusb, FSUBUB)
SSE_HELPER_B(helper_psubsb, FSUBSB)

/* Saturating word add/subtract: "us" clamps with satuw, "s" with satsw. */
SSE_HELPER_W(helper_paddusw, FADDUW)
SSE_HELPER_W(helper_paddsw, FADDSW)
SSE_HELPER_W(helper_psubusw, FSUBUW)
SSE_HELPER_W(helper_psubsw, FSUBSW)

/* Byte minimum / maximum, lanes compared as stored. */
SSE_HELPER_B(helper_pminub, FMINUB)
SSE_HELPER_B(helper_pmaxub, FMAXUB)

/* Word minimum / maximum, lanes compared as int16_t. */
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)

/* Bitwise logic on 64-bit lanes; pandn computes (~destination) & source. */
SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)

/* Signed greater-than compares; a true lane is written as -1, else 0. */
SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)

/* Equality compares; a true lane is written as -1, else 0. */
SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)

/* Word multiplies: low half of the product ... */
SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
/* ... rounded high half (only generated in the SHIFT == 0 build) ... */
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
#endif
/* ... and the high half of the unsigned / signed product. */
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

/* Lane-wise average, rounded up: (a + b + 1) >> 1. */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
428
429void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
430{
431 d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
432#if SHIFT == 1
433 d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
434#endif
435}
436
437void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
438{
439 int i;
440
441 for(i = 0; i < (2 << SHIFT); i++) {
442 d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
443 (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
444 }
445 FORCE_RET();
446}
447
#if SHIFT == 0
/* Absolute value of a; only defined in the SHIFT == 0 build. */
static inline int abs1(int a)
{
    return (a < 0) ? -a : a;
}
#endif
457void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
458{
459 unsigned int val;
460
461 val = 0;
462 val += abs1(d->B(0) - s->B(0));
463 val += abs1(d->B(1) - s->B(1));
464 val += abs1(d->B(2) - s->B(2));
465 val += abs1(d->B(3) - s->B(3));
466 val += abs1(d->B(4) - s->B(4));
467 val += abs1(d->B(5) - s->B(5));
468 val += abs1(d->B(6) - s->B(6));
469 val += abs1(d->B(7) - s->B(7));
470 d->Q(0) = val;
471#if SHIFT == 1
472 val = 0;
473 val += abs1(d->B(8) - s->B(8));
474 val += abs1(d->B(9) - s->B(9));
475 val += abs1(d->B(10) - s->B(10));
476 val += abs1(d->B(11) - s->B(11));
477 val += abs1(d->B(12) - s->B(12));
478 val += abs1(d->B(13) - s->B(13));
479 val += abs1(d->B(14) - s->B(14));
480 val += abs1(d->B(15) - s->B(15));
481 d->Q(1) = val;
482#endif
483}
484
485void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
486{
487 int i;
488 for(i = 0; i < (8 << SHIFT); i++) {
489 if (s->B(i) & 0x80)
490 stb(a0 + i, d->B(i));
491 }
492 FORCE_RET();
493}
494
495void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)
496{
497 d->L(0) = val;
498 d->L(1) = 0;
499#if SHIFT == 1
500 d->Q(1) = 0;
501#endif
502}
503
504#ifdef TARGET_X86_64
505void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
506{
507 d->Q(0) = val;
508#if SHIFT == 1
509 d->Q(1) = 0;
510#endif
511}
512#endif
513
514#if SHIFT == 0
515void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
516{
517 Reg r;
518 r.W(0) = s->W(order & 3);
519 r.W(1) = s->W((order >> 2) & 3);
520 r.W(2) = s->W((order >> 4) & 3);
521 r.W(3) = s->W((order >> 6) & 3);
522 *d = r;
523}
524#else
525void helper_shufps(Reg *d, Reg *s, int order)
526{
527 Reg r;
528 r.L(0) = d->L(order & 3);
529 r.L(1) = d->L((order >> 2) & 3);
530 r.L(2) = s->L((order >> 4) & 3);
531 r.L(3) = s->L((order >> 6) & 3);
532 *d = r;
533}
534
535void helper_shufpd(Reg *d, Reg *s, int order)
536{
537 Reg r;
538 r.Q(0) = d->Q(order & 1);
539 r.Q(1) = s->Q((order >> 1) & 1);
540 *d = r;
541}
542
543void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
544{
545 Reg r;
546 r.L(0) = s->L(order & 3);
547 r.L(1) = s->L((order >> 2) & 3);
548 r.L(2) = s->L((order >> 4) & 3);
549 r.L(3) = s->L((order >> 6) & 3);
550 *d = r;
551}
552
553void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
554{
555 Reg r;
556 r.W(0) = s->W(order & 3);
557 r.W(1) = s->W((order >> 2) & 3);
558 r.W(2) = s->W((order >> 4) & 3);
559 r.W(3) = s->W((order >> 6) & 3);
560 r.Q(1) = s->Q(1);
561 *d = r;
562}
563
564void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
565{
566 Reg r;
567 r.Q(0) = s->Q(0);
568 r.W(4) = s->W(4 + (order & 3));
569 r.W(5) = s->W(4 + ((order >> 2) & 3));
570 r.W(6) = s->W(4 + ((order >> 4) & 3));
571 r.W(7) = s->W(4 + ((order >> 6) & 3));
572 *d = r;
573}
574#endif
575
576#if SHIFT == 1
577/* FPU ops */
578/* XXX: not accurate */
579
/*
 * SSE_HELPER_S(name, F) emits four helpers built on the element operation
 * F(size, a, b):
 *   helper_<name>ps - all four single-precision elements
 *   helper_<name>ss - single-precision element 0 only
 *   helper_<name>pd - both double-precision elements
 *   helper_<name>sd - double-precision element 0 only
 * In each case the destination element is replaced by F(size, dest, src).
 */
#define SSE_HELPER_S(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < 4; lane++) {\
        d->XMM_S(lane) = F(32, d->XMM_S(lane), s->XMM_S(lane));\
    }\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < 2; lane++) {\
        d->XMM_D(lane) = F(64, d->XMM_D(lane), s->XMM_D(lane));\
    }\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
603
604#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
605#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
606#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
607#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
608#define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
609#define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
610#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
611
612SSE_HELPER_S(add, FPU_ADD)
613SSE_HELPER_S(sub, FPU_SUB)
614SSE_HELPER_S(mul, FPU_MUL)
615SSE_HELPER_S(div, FPU_DIV)
616SSE_HELPER_S(min, FPU_MIN)
617SSE_HELPER_S(max, FPU_MAX)
618SSE_HELPER_S(sqrt, FPU_SQRT)
619
620
621/* float to float conversions */
622void helper_cvtps2pd(Reg *d, Reg *s)
623{
624 float32 s0, s1;
625 s0 = s->XMM_S(0);
626 s1 = s->XMM_S(1);
627 d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
628 d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
629}
630
631void helper_cvtpd2ps(Reg *d, Reg *s)
632{
633 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
634 d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
635 d->Q(1) = 0;
636}
637
638void helper_cvtss2sd(Reg *d, Reg *s)
639{
640 d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
641}
642
643void helper_cvtsd2ss(Reg *d, Reg *s)
644{
645 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
646}
647
648/* integer to float */
649void helper_cvtdq2ps(Reg *d, Reg *s)
650{
651 d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
652 d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
653 d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
654 d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
655}
656
657void helper_cvtdq2pd(Reg *d, Reg *s)
658{
659 int32_t l0, l1;
660 l0 = (int32_t)s->XMM_L(0);
661 l1 = (int32_t)s->XMM_L(1);
662 d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
663 d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
664}
665
666void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
667{
668 d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
669 d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
670}
671
672void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
673{
674 d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
675 d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
676}
677
678void helper_cvtsi2ss(XMMReg *d, uint32_t val)
679{
680 d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
681}
682
683void helper_cvtsi2sd(XMMReg *d, uint32_t val)
684{
685 d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
686}
687
688#ifdef TARGET_X86_64
689void helper_cvtsq2ss(XMMReg *d, uint64_t val)
690{
691 d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
692}
693
694void helper_cvtsq2sd(XMMReg *d, uint64_t val)
695{
696 d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
697}
698#endif
699
700/* float to integer */
701void helper_cvtps2dq(XMMReg *d, XMMReg *s)
702{
703 d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
704 d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
705 d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
706 d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
707}
708
709void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
710{
711 d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
712 d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
713 d->XMM_Q(1) = 0;
714}
715
716void helper_cvtps2pi(MMXReg *d, XMMReg *s)
717{
718 d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
719 d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
720}
721
722void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
723{
724 d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
725 d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
726}
727
728int32_t helper_cvtss2si(XMMReg *s)
729{
730 return float32_to_int32(s->XMM_S(0), &env->sse_status);
731}
732
733int32_t helper_cvtsd2si(XMMReg *s)
734{
735 return float64_to_int32(s->XMM_D(0), &env->sse_status);
736}
737
738#ifdef TARGET_X86_64
739int64_t helper_cvtss2sq(XMMReg *s)
740{
741 return float32_to_int64(s->XMM_S(0), &env->sse_status);
742}
743
744int64_t helper_cvtsd2sq(XMMReg *s)
745{
746 return float64_to_int64(s->XMM_D(0), &env->sse_status);
747}
748#endif
749
750/* float to integer truncated */
751void helper_cvttps2dq(XMMReg *d, XMMReg *s)
752{
753 d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
754 d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
755 d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
756 d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
757}
758
759void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
760{
761 d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
762 d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
763 d->XMM_Q(1) = 0;
764}
765
766void helper_cvttps2pi(MMXReg *d, XMMReg *s)
767{
768 d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
769 d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
770}
771
772void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
773{
774 d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
775 d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
776}
777
778int32_t helper_cvttss2si(XMMReg *s)
779{
780 return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
781}
782
783int32_t helper_cvttsd2si(XMMReg *s)
784{
785 return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
786}
787
788#ifdef TARGET_X86_64
789int64_t helper_cvttss2sq(XMMReg *s)
790{
791 return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
792}
793
794int64_t helper_cvttsd2sq(XMMReg *s)
795{
796 return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
797}
798#endif
799
800void helper_rsqrtps(XMMReg *d, XMMReg *s)
801{
802 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
803 d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
804 d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
805 d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
806}
807
808void helper_rsqrtss(XMMReg *d, XMMReg *s)
809{
810 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
811}
812
813void helper_rcpps(XMMReg *d, XMMReg *s)
814{
815 d->XMM_S(0) = approx_rcp(s->XMM_S(0));
816 d->XMM_S(1) = approx_rcp(s->XMM_S(1));
817 d->XMM_S(2) = approx_rcp(s->XMM_S(2));
818 d->XMM_S(3) = approx_rcp(s->XMM_S(3));
819}
820
821void helper_rcpss(XMMReg *d, XMMReg *s)
822{
823 d->XMM_S(0) = approx_rcp(s->XMM_S(0));
824}
825
826void helper_haddps(XMMReg *d, XMMReg *s)
827{
828 XMMReg r;
829 r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1);
830 r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3);
831 r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1);
832 r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3);
833 *d = r;
834}
835
836void helper_haddpd(XMMReg *d, XMMReg *s)
837{
838 XMMReg r;
839 r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1);
840 r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1);
841 *d = r;
842}
843
844void helper_hsubps(XMMReg *d, XMMReg *s)
845{
846 XMMReg r;
847 r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1);
848 r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3);
849 r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1);
850 r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3);
851 *d = r;
852}
853
854void helper_hsubpd(XMMReg *d, XMMReg *s)
855{
856 XMMReg r;
857 r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1);
858 r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1);
859 *d = r;
860}
861
862void helper_addsubps(XMMReg *d, XMMReg *s)
863{
864 d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0);
865 d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1);
866 d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2);
867 d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3);
868}
869
870void helper_addsubpd(XMMReg *d, XMMReg *s)
871{
872 d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0);
873 d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1);
874}
875
876/* XXX: unordered */
/*
 * SSE_HELPER_CMP(name, F) emits four compare helpers built on the element
 * predicate F(size, a, b):
 *   helper_<name>ps - all four single-precision elements
 *   helper_<name>ss - single-precision element 0 only
 *   helper_<name>pd - both double-precision elements
 *   helper_<name>sd - double-precision element 0 only
 * The value of F is stored into the integer lane (XMM_L for singles, XMM_Q
 * for doubles) at the same index as the compared element of *d.
 */
#define SSE_HELPER_CMP(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < 4; lane++) {\
        d->XMM_L(lane) = F(32, d->XMM_S(lane), s->XMM_S(lane));\
    }\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < 2; lane++) {\
        d->XMM_Q(lane) = F(64, d->XMM_D(lane), s->XMM_D(lane));\
    }\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
900
901#define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0
902#define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
903#define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
904#define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0
905#define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1
906#define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
907#define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
908#define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1
909
910SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
911SSE_HELPER_CMP(cmplt, FPU_CMPLT)
912SSE_HELPER_CMP(cmple, FPU_CMPLE)
913SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
914SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
915SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
916SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
917SSE_HELPER_CMP(cmpord, FPU_CMPORD)
918
919const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
920
921void helper_ucomiss(Reg *d, Reg *s)
922{
923 int ret;
924 float32 s0, s1;
925
926 s0 = d->XMM_S(0);
927 s1 = s->XMM_S(0);
928 ret = float32_compare_quiet(s0, s1, &env->sse_status);
929 CC_SRC = comis_eflags[ret + 1];
930 FORCE_RET();
931}
932
933void helper_comiss(Reg *d, Reg *s)
934{
935 int ret;
936 float32 s0, s1;
937
938 s0 = d->XMM_S(0);
939 s1 = s->XMM_S(0);
940 ret = float32_compare(s0, s1, &env->sse_status);
941 CC_SRC = comis_eflags[ret + 1];
942 FORCE_RET();
943}
944
945void helper_ucomisd(Reg *d, Reg *s)
946{
947 int ret;
948 float64 d0, d1;
949
950 d0 = d->XMM_D(0);
951 d1 = s->XMM_D(0);
952 ret = float64_compare_quiet(d0, d1, &env->sse_status);
953 CC_SRC = comis_eflags[ret + 1];
954 FORCE_RET();
955}
956
957void helper_comisd(Reg *d, Reg *s)
958{
959 int ret;
960 float64 d0, d1;
961
962 d0 = d->XMM_D(0);
963 d1 = s->XMM_D(0);
964 ret = float64_compare(d0, d1, &env->sse_status);
965 CC_SRC = comis_eflags[ret + 1];
966 FORCE_RET();
967}
968
969uint32_t helper_movmskps(Reg *s)
970{
971 int b0, b1, b2, b3;
972 b0 = s->XMM_L(0) >> 31;
973 b1 = s->XMM_L(1) >> 31;
974 b2 = s->XMM_L(2) >> 31;
975 b3 = s->XMM_L(3) >> 31;
976 return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
977}
978
979uint32_t helper_movmskpd(Reg *s)
980{
981 int b0, b1;
982 b0 = s->XMM_L(1) >> 31;
983 b1 = s->XMM_L(3) >> 31;
984 return b0 | (b1 << 1);
985}
986
987#endif
988
989uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
990{
991 uint32_t val;
992 val = 0;
993 val |= (s->XMM_B(0) >> 7);
994 val |= (s->XMM_B(1) >> 6) & 0x02;
995 val |= (s->XMM_B(2) >> 5) & 0x04;
996 val |= (s->XMM_B(3) >> 4) & 0x08;
997 val |= (s->XMM_B(4) >> 3) & 0x10;
998 val |= (s->XMM_B(5) >> 2) & 0x20;
999 val |= (s->XMM_B(6) >> 1) & 0x40;
1000 val |= (s->XMM_B(7)) & 0x80;
1001#if SHIFT == 1
1002 val |= (s->XMM_B(8) << 1) & 0x0100;
1003 val |= (s->XMM_B(9) << 2) & 0x0200;
1004 val |= (s->XMM_B(10) << 3) & 0x0400;
1005 val |= (s->XMM_B(11) << 4) & 0x0800;
1006 val |= (s->XMM_B(12) << 5) & 0x1000;
1007 val |= (s->XMM_B(13) << 6) & 0x2000;
1008 val |= (s->XMM_B(14) << 7) & 0x4000;
1009 val |= (s->XMM_B(15) << 8) & 0x8000;
1010#endif
1011 return val;
1012}
1013
1014void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
1015{
1016 Reg r;
1017
1018 r.B(0) = satsb((int16_t)d->W(0));
1019 r.B(1) = satsb((int16_t)d->W(1));
1020 r.B(2) = satsb((int16_t)d->W(2));
1021 r.B(3) = satsb((int16_t)d->W(3));
1022#if SHIFT == 1
1023 r.B(4) = satsb((int16_t)d->W(4));
1024 r.B(5) = satsb((int16_t)d->W(5));
1025 r.B(6) = satsb((int16_t)d->W(6));
1026 r.B(7) = satsb((int16_t)d->W(7));
1027#endif
1028 r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
1029 r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
1030 r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
1031 r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
1032#if SHIFT == 1
1033 r.B(12) = satsb((int16_t)s->W(4));
1034 r.B(13) = satsb((int16_t)s->W(5));
1035 r.B(14) = satsb((int16_t)s->W(6));
1036 r.B(15) = satsb((int16_t)s->W(7));
1037#endif
1038 *d = r;
1039}
1040
1041void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
1042{
1043 Reg r;
1044
1045 r.B(0) = satub((int16_t)d->W(0));
1046 r.B(1) = satub((int16_t)d->W(1));
1047 r.B(2) = satub((int16_t)d->W(2));
1048 r.B(3) = satub((int16_t)d->W(3));
1049#if SHIFT == 1
1050 r.B(4) = satub((int16_t)d->W(4));
1051 r.B(5) = satub((int16_t)d->W(5));
1052 r.B(6) = satub((int16_t)d->W(6));
1053 r.B(7) = satub((int16_t)d->W(7));
1054#endif
1055 r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
1056 r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
1057 r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
1058 r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
1059#if SHIFT == 1
1060 r.B(12) = satub((int16_t)s->W(4));
1061 r.B(13) = satub((int16_t)s->W(5));
1062 r.B(14) = satub((int16_t)s->W(6));
1063 r.B(15) = satub((int16_t)s->W(7));
1064#endif
1065 *d = r;
1066}
1067
1068void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
1069{
1070 Reg r;
1071
1072 r.W(0) = satsw(d->L(0));
1073 r.W(1) = satsw(d->L(1));
1074#if SHIFT == 1
1075 r.W(2) = satsw(d->L(2));
1076 r.W(3) = satsw(d->L(3));
1077#endif
1078 r.W((2 << SHIFT) + 0) = satsw(s->L(0));
1079 r.W((2 << SHIFT) + 1) = satsw(s->L(1));
1080#if SHIFT == 1
1081 r.W(6) = satsw(s->L(2));
1082 r.W(7) = satsw(s->L(3));
1083#endif
1084 *d = r;
1085}
1086
1087#define UNPCK_OP(base_name, base) \
1088 \
1089void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s) \
1090{ \
1091 Reg r; \
1092 \
1093 r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
1094 r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
1095 r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
1096 r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
1097 r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
1098 r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
1099 r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
1100 r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
1101XMM_ONLY( \
1102 r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
1103 r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
1104 r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
1105 r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
1106 r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
1107 r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
1108 r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
1109 r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
1110) \
1111 *d = r; \
1112} \
1113 \
1114void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s) \
1115{ \
1116 Reg r; \
1117 \
1118 r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
1119 r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
1120 r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
1121 r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
1122XMM_ONLY( \
1123 r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
1124 r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
1125 r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
1126 r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
1127) \
1128 *d = r; \
1129} \
1130 \
1131void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s) \
1132{ \
1133 Reg r; \
1134 \
1135 r.L(0) = d->L((base << SHIFT) + 0); \
1136 r.L(1) = s->L((base << SHIFT) + 0); \
1137XMM_ONLY( \
1138 r.L(2) = d->L((base << SHIFT) + 1); \
1139 r.L(3) = s->L((base << SHIFT) + 1); \
1140) \
1141 *d = r; \
1142} \
1143 \
1144XMM_ONLY( \
1145void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s) \
1146{ \
1147 Reg r; \
1148 \
1149 r.Q(0) = d->Q(base); \
1150 r.Q(1) = s->Q(base); \
1151 *d = r; \
1152} \
1153)
1154
1155UNPCK_OP(l, 0)
1156UNPCK_OP(h, 1)
1157
1158/* 3DNow! float ops */
1159#if SHIFT == 0
1160void helper_pi2fd(MMXReg *d, MMXReg *s)
1161{
1162 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1163 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1164}
1165
1166void helper_pi2fw(MMXReg *d, MMXReg *s)
1167{
1168 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1169 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1170}
1171
1172void helper_pf2id(MMXReg *d, MMXReg *s)
1173{
1174 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1175 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1176}
1177
1178void helper_pf2iw(MMXReg *d, MMXReg *s)
1179{
1180 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status));
1181 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status));
1182}
1183
1184void helper_pfacc(MMXReg *d, MMXReg *s)
1185{
1186 MMXReg r;
1187 r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1188 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1189 *d = r;
1190}
1191
1192void helper_pfadd(MMXReg *d, MMXReg *s)
1193{
1194 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1195 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1196}
1197
1198void helper_pfcmpeq(MMXReg *d, MMXReg *s)
1199{
1200 d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0;
1201 d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0;
1202}
1203
1204void helper_pfcmpge(MMXReg *d, MMXReg *s)
1205{
1206 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
1207 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
1208}
1209
1210void helper_pfcmpgt(MMXReg *d, MMXReg *s)
1211{
1212 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
1213 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
1214}
1215
1216void helper_pfmax(MMXReg *d, MMXReg *s)
1217{
1218 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status))
1219 d->MMX_S(0) = s->MMX_S(0);
1220 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status))
1221 d->MMX_S(1) = s->MMX_S(1);
1222}
1223
1224void helper_pfmin(MMXReg *d, MMXReg *s)
1225{
1226 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status))
1227 d->MMX_S(0) = s->MMX_S(0);
1228 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status))
1229 d->MMX_S(1) = s->MMX_S(1);
1230}
1231
1232void helper_pfmul(MMXReg *d, MMXReg *s)
1233{
1234 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1235 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1236}
1237
1238void helper_pfnacc(MMXReg *d, MMXReg *s)
1239{
1240 MMXReg r;
1241 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1242 r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1243 *d = r;
1244}
1245
1246void helper_pfpnacc(MMXReg *d, MMXReg *s)
1247{
1248 MMXReg r;
1249 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1250 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1251 *d = r;
1252}
1253
1254void helper_pfrcp(MMXReg *d, MMXReg *s)
1255{
1256 d->MMX_S(0) = approx_rcp(s->MMX_S(0));
1257 d->MMX_S(1) = d->MMX_S(0);
1258}
1259
1260void helper_pfrsqrt(MMXReg *d, MMXReg *s)
1261{
1262 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
1263 d->MMX_S(1) = approx_rsqrt(d->MMX_S(1));
1264 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1265 d->MMX_L(0) = d->MMX_L(1);
1266}
1267
1268void helper_pfsub(MMXReg *d, MMXReg *s)
1269{
1270 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1271 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1272}
1273
1274void helper_pfsubr(MMXReg *d, MMXReg *s)
1275{
1276 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1277 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1278}
1279
1280void helper_pswapd(MMXReg *d, MMXReg *s)
1281{
1282 MMXReg r;
1283 r.MMX_L(0) = s->MMX_L(1);
1284 r.MMX_L(1) = s->MMX_L(0);
1285 *d = r;
1286}
1287#endif
1288
1289/* SSSE3 op helpers */
1290void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
1291{
1292 int i;
1293 Reg r;
1294
1295 for (i = 0; i < (8 << SHIFT); i++)
1296 r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
1297
1298 *d = r;
1299}
1300
1301void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
1302{
1303 d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
1304 d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
1305 XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
1306 XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
1307 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
1308 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
1309 XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
1310 XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
1311}
1312
1313void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
1314{
1315 d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
1316 XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
1317 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
1318 XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
1319}
1320
1321void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
1322{
1323 d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
1324 d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
1325 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
1326 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
1327 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
1328 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
1329 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
1330 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
1331}
1332
1333void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
1334{
1335 d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) +
1336 (int8_t)s->B( 1) * (uint8_t)d->B( 1));
1337 d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) +
1338 (int8_t)s->B( 3) * (uint8_t)d->B( 3));
1339 d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) +
1340 (int8_t)s->B( 5) * (uint8_t)d->B( 5));
1341 d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) +
1342 (int8_t)s->B( 7) * (uint8_t)d->B( 7));
1343#if SHIFT == 1
1344 d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) +
1345 (int8_t)s->B( 9) * (uint8_t)d->B( 9));
1346 d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
1347 (int8_t)s->B(11) * (uint8_t)d->B(11));
1348 d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
1349 (int8_t)s->B(13) * (uint8_t)d->B(13));
1350 d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
1351 (int8_t)s->B(15) * (uint8_t)d->B(15));
1352#endif
1353}
1354
1355void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
1356{
1357 d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
1358 d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
1359 XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
1360 XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
1361 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
1362 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
1363 XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
1364 XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
1365}
1366
1367void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
1368{
1369 d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
1370 XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
1371 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
1372 XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
1373}
1374
1375void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
1376{
1377 d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
1378 d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
1379 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
1380 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
1381 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
1382 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
1383 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
1384 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
1385}
1386
1387#define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x
1388#define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
1389#define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
1390SSE_HELPER_B(helper_pabsb, FABSB)
1391SSE_HELPER_W(helper_pabsw, FABSW)
1392SSE_HELPER_L(helper_pabsd, FABSL)
1393
1394#define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
1395SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1396
1397#define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d
1398#define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
1399#define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
1400SSE_HELPER_B(helper_psignb, FSIGNB)
1401SSE_HELPER_W(helper_psignw, FSIGNW)
1402SSE_HELPER_L(helper_psignd, FSIGNL)
1403
1404void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
1405{
1406 Reg r;
1407
1408 /* XXX could be checked during translation */
1409 if (shift >= (16 << SHIFT)) {
1410 r.Q(0) = 0;
1411 XMM_ONLY(r.Q(1) = 0);
1412 } else {
1413 shift <<= 3;
1414#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1415#if SHIFT == 0
1416 r.Q(0) = SHR(s->Q(0), shift - 0) |
1417 SHR(d->Q(0), shift - 64);
1418#else
1419 r.Q(0) = SHR(s->Q(0), shift - 0) |
1420 SHR(s->Q(1), shift - 64) |
1421 SHR(d->Q(0), shift - 128) |
1422 SHR(d->Q(1), shift - 192);
1423 r.Q(1) = SHR(s->Q(0), shift + 64) |
1424 SHR(s->Q(1), shift - 0) |
1425 SHR(d->Q(0), shift - 64) |
1426 SHR(d->Q(1), shift - 128);
1427#endif
1428#undef SHR
1429 }
1430
1431 *d = r;
1432}
1433
/* Shorthand for the first XMM register of the current CPU state. */
#define XMM0 (env->xmm_regs[0])
1435
1436#if SHIFT == 1
/* Template for element-wise helpers that take XMM0 as an implicit third
 * operand: for each of the `num` elements accessed through `elem`,
 * d[i] = F(d[i], s[i], XMM0[i]).  `num` is expected to be 2, 4, 8 or 16
 * (the values used by the instantiations in this file). */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
\
    for (i = 0; i < (num); i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i));\
    }\
}
1463
/* Template for element-wise helpers controlled by an immediate bit mask: for
 * each of the `num` elements accessed through `elem`,
 * d[i] = F(d[i], s[i], bit i of imm).  `num` is expected to be 2, 4, 8 or 16
 * (the values used by the instantiations in this file). */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    int i;\
\
    for (i = 0; i < (num); i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1));\
    }\
}
1490
1491/* SSE4.1 op helpers */
1492#define FBLENDVB(d, s, m) (m & 0x80) ? s : d
1493#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d
1494#define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d
1495SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
1496SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
1497SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1498
1499void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
1500{
1501 uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1));
1502 uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
1503
1504 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1505}
1506
1507#define SSE_HELPER_F(name, elem, num, F)\
1508void glue(name, SUFFIX) (Reg *d, Reg *s)\
1509{\
1510 d->elem(0) = F(0);\
1511 d->elem(1) = F(1);\
1512 d->elem(2) = F(2);\
1513 d->elem(3) = F(3);\
1514 if (num > 3) {\
1515 d->elem(4) = F(4);\
1516 d->elem(5) = F(5);\
1517 if (num > 5) {\
1518 d->elem(6) = F(6);\
1519 d->elem(7) = F(7);\
1520 }\
1521 }\
1522}
1523
1524SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
1525SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
1526SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
1527SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
1528SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
1529SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
1530SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
1531SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
1532SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
1533SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
1534SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
1535SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
1536
1537void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
1538{
1539 d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
1540 d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
1541}
1542
1543#define FCMPEQQ(d, s) d == s ? -1 : 0
1544SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1545
1546void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
1547{
1548 d->W(0) = satuw((int32_t) d->L(0));
1549 d->W(1) = satuw((int32_t) d->L(1));
1550 d->W(2) = satuw((int32_t) d->L(2));
1551 d->W(3) = satuw((int32_t) d->L(3));
1552 d->W(4) = satuw((int32_t) s->L(0));
1553 d->W(5) = satuw((int32_t) s->L(1));
1554 d->W(6) = satuw((int32_t) s->L(2));
1555 d->W(7) = satuw((int32_t) s->L(3));
1556}
1557
1558#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
1559#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
1560#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
1561#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
1562SSE_HELPER_B(helper_pminsb, FMINSB)
1563SSE_HELPER_L(helper_pminsd, FMINSD)
1564SSE_HELPER_W(helper_pminuw, MIN)
1565SSE_HELPER_L(helper_pminud, MIN)
1566SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1567SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1568SSE_HELPER_W(helper_pmaxuw, MAX)
1569SSE_HELPER_L(helper_pmaxud, MAX)
1570
1571#define FMULLD(d, s) (int32_t) d * (int32_t) s
1572SSE_HELPER_L(helper_pmulld, FMULLD)
1573
1574void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
1575{
1576 int idx = 0;
1577
1578 if (s->W(1) < s->W(idx))
1579 idx = 1;
1580 if (s->W(2) < s->W(idx))
1581 idx = 2;
1582 if (s->W(3) < s->W(idx))
1583 idx = 3;
1584 if (s->W(4) < s->W(idx))
1585 idx = 4;
1586 if (s->W(5) < s->W(idx))
1587 idx = 5;
1588 if (s->W(6) < s->W(idx))
1589 idx = 6;
1590 if (s->W(7) < s->W(idx))
1591 idx = 7;
1592
1593 d->Q(1) = 0;
1594 d->L(1) = 0;
1595 d->W(1) = idx;
1596 d->W(0) = s->W(idx);
1597}
1598
1599void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
1600{
1601 signed char prev_rounding_mode;
1602
1603 prev_rounding_mode = env->sse_status.float_rounding_mode;
1604 if (!(mode & (1 << 2)))
1605 switch (mode & 3) {
1606 case 0:
1607 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1608 break;
1609 case 1:
1610 set_float_rounding_mode(float_round_down, &env->sse_status);
1611 break;
1612 case 2:
1613 set_float_rounding_mode(float_round_up, &env->sse_status);
1614 break;
1615 case 3:
1616 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1617 break;
1618 }
1619
1620 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
1621 d->L(1) = float64_round_to_int(s->L(1), &env->sse_status);
1622 d->L(2) = float64_round_to_int(s->L(2), &env->sse_status);
1623 d->L(3) = float64_round_to_int(s->L(3), &env->sse_status);
1624
1625#if 0 /* TODO */
1626 if (mode & (1 << 3))
1627 set_float_exception_flags(
1628 get_float_exception_flags(&env->sse_status) &
1629 ~float_flag_inexact,
1630 &env->sse_status);
1631#endif
1632 env->sse_status.float_rounding_mode = prev_rounding_mode;
1633}
1634
1635void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
1636{
1637 signed char prev_rounding_mode;
1638
1639 prev_rounding_mode = env->sse_status.float_rounding_mode;
1640 if (!(mode & (1 << 2)))
1641 switch (mode & 3) {
1642 case 0:
1643 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1644 break;
1645 case 1:
1646 set_float_rounding_mode(float_round_down, &env->sse_status);
1647 break;
1648 case 2:
1649 set_float_rounding_mode(float_round_up, &env->sse_status);
1650 break;
1651 case 3:
1652 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1653 break;
1654 }
1655
1656 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
1657 d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status);
1658
1659#if 0 /* TODO */
1660 if (mode & (1 << 3))
1661 set_float_exception_flags(
1662 get_float_exception_flags(&env->sse_status) &
1663 ~float_flag_inexact,
1664 &env->sse_status);
1665#endif
1666 env->sse_status.float_rounding_mode = prev_rounding_mode;
1667}
1668
1669void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
1670{
1671 signed char prev_rounding_mode;
1672
1673 prev_rounding_mode = env->sse_status.float_rounding_mode;
1674 if (!(mode & (1 << 2)))
1675 switch (mode & 3) {
1676 case 0:
1677 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1678 break;
1679 case 1:
1680 set_float_rounding_mode(float_round_down, &env->sse_status);
1681 break;
1682 case 2:
1683 set_float_rounding_mode(float_round_up, &env->sse_status);
1684 break;
1685 case 3:
1686 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1687 break;
1688 }
1689
1690 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
1691
1692#if 0 /* TODO */
1693 if (mode & (1 << 3))
1694 set_float_exception_flags(
1695 get_float_exception_flags(&env->sse_status) &
1696 ~float_flag_inexact,
1697 &env->sse_status);
1698#endif
1699 env->sse_status.float_rounding_mode = prev_rounding_mode;
1700}
1701
1702void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
1703{
1704 signed char prev_rounding_mode;
1705
1706 prev_rounding_mode = env->sse_status.float_rounding_mode;
1707 if (!(mode & (1 << 2)))
1708 switch (mode & 3) {
1709 case 0:
1710 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1711 break;
1712 case 1:
1713 set_float_rounding_mode(float_round_down, &env->sse_status);
1714 break;
1715 case 2:
1716 set_float_rounding_mode(float_round_up, &env->sse_status);
1717 break;
1718 case 3:
1719 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1720 break;
1721 }
1722
1723 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
1724
1725#if 0 /* TODO */
1726 if (mode & (1 << 3))
1727 set_float_exception_flags(
1728 get_float_exception_flags(&env->sse_status) &
1729 ~float_flag_inexact,
1730 &env->sse_status);
1731#endif
1732 env->sse_status.float_rounding_mode = prev_rounding_mode;
1733}
1734
1735#define FBLENDP(d, s, m) m ? s : d
1736SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
1737SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
1738SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
1739
1740void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
1741{
1742 float32 iresult = 0 /*float32_zero*/;
1743
1744 if (mask & (1 << 4))
1745 iresult = float32_add(iresult,
1746 float32_mul(d->L(0), s->L(0), &env->sse_status),
1747 &env->sse_status);
1748 if (mask & (1 << 5))
1749 iresult = float32_add(iresult,
1750 float32_mul(d->L(1), s->L(1), &env->sse_status),
1751 &env->sse_status);
1752 if (mask & (1 << 6))
1753 iresult = float32_add(iresult,
1754 float32_mul(d->L(2), s->L(2), &env->sse_status),
1755 &env->sse_status);
1756 if (mask & (1 << 7))
1757 iresult = float32_add(iresult,
1758 float32_mul(d->L(3), s->L(3), &env->sse_status),
1759 &env->sse_status);
1760 d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/;
1761 d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/;
1762 d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/;
1763 d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/;
1764}
1765
1766void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
1767{
1768 float64 iresult = 0 /*float64_zero*/;
1769
1770 if (mask & (1 << 4))
1771 iresult = float64_add(iresult,
1772 float64_mul(d->Q(0), s->Q(0), &env->sse_status),
1773 &env->sse_status);
1774 if (mask & (1 << 5))
1775 iresult = float64_add(iresult,
1776 float64_mul(d->Q(1), s->Q(1), &env->sse_status),
1777 &env->sse_status);
1778 d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/;
1779 d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/;
1780}
1781
1782void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
1783{
1784 int s0 = (offset & 3) << 2;
1785 int d0 = (offset & 4) << 0;
1786 int i;
1787 Reg r;
1788
1789 for (i = 0; i < 8; i++, d0++) {
1790 r.W(i) = 0;
1791 r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
1792 r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
1793 r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
1794 r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
1795 }
1796
1797 *d = r;
1798}
1799
1800/* SSE4.2 op helpers */
/* PCMPGTQ compares packed quadwords as *signed* integers (Intel SDM), so both
 * operands are reinterpreted as int64_t before the comparison.  The result is
 * all ones when d > s and zero otherwise. */
#define FCMPGTQ(d, s) ((int64_t)(d) > (int64_t)(s) ? -1 : 0)
/* Emit helper_pcmpgtq from the SSE_HELPER_Q template, using FCMPGTQ as the
 * per-element operation (the template is presumably defined earlier in this
 * file, outside the part reviewed here). */
SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
1804
1805static inline int pcmp_elen(int reg, uint32_t ctrl)
1806{
1807 int val;
1808
1809 /* Presence of REX.W is indicated by a bit higher than 7 set */
1810 if (ctrl >> 8)
1811 val = abs1((int64_t) env->regs[reg]);
1812 else
1813 val = abs1((int32_t) env->regs[reg]);
1814
1815 if (ctrl & 1) {
1816 if (val > 8)
1817 return 8;
1818 } else
1819 if (val > 16)
1820 return 16;
1821
1822 return val;
1823}
1824
1825static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
1826{
1827 int val = 0;
1828
1829 if (ctrl & 1) {
1830 while (val < 8 && r->W(val))
1831 val++;
1832 } else
1833 while (val < 16 && r->B(val))
1834 val++;
1835
1836 return val;
1837}
1838
1839static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
1840{
1841 switch ((ctrl >> 0) & 3) {
1842 case 0:
1843 return r->B(i);
1844 case 1:
1845 return r->W(i);
1846 case 2:
1847 return (int8_t) r->B(i);
1848 case 3:
1849 default:
1850 return (int16_t) r->W(i);
1851 }
1852}
1853
1854static inline unsigned pcmpxstrx(Reg *d, Reg *s,
1855 int8_t ctrl, int valids, int validd)
1856{
1857 unsigned int res = 0;
1858 int v;
1859 int j, i;
1860 int upper = (ctrl & 1) ? 7 : 15;
1861
1862 valids--;
1863 validd--;
1864
1865 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
1866
1867 switch ((ctrl >> 2) & 3) {
1868 case 0:
1869 for (j = valids; j >= 0; j--) {
1870 res <<= 1;
1871 v = pcmp_val(s, ctrl, j);
1872 for (i = validd; i >= 0; i--)
1873 res |= (v == pcmp_val(d, ctrl, i));
1874 }
1875 break;
1876 case 1:
1877 for (j = valids; j >= 0; j--) {
1878 res <<= 1;
1879 v = pcmp_val(s, ctrl, j);
1880 for (i = ((validd - 1) | 1); i >= 0; i -= 2)
1881 res |= (pcmp_val(d, ctrl, i - 0) <= v &&
1882 pcmp_val(d, ctrl, i - 1) >= v);
1883 }
1884 break;
1885 case 2:
1886 res = (2 << (upper - MAX(valids, validd))) - 1;
1887 res <<= MAX(valids, validd) - MIN(valids, validd);
1888 for (i = MIN(valids, validd); i >= 0; i--) {
1889 res <<= 1;
1890 v = pcmp_val(s, ctrl, i);
1891 res |= (v == pcmp_val(d, ctrl, i));
1892 }
1893 break;
1894 case 3:
1895 for (j = valids - validd; j >= 0; j--) {
1896 res <<= 1;
1897 res |= 1;
1898 for (i = MIN(upper - j, validd); i >= 0; i--)
1899 res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
1900 }
1901 break;
1902 }
1903
1904 switch ((ctrl >> 4) & 3) {
1905 case 1:
1906 res ^= (2 << upper) - 1;
1907 break;
1908 case 3:
1909 res ^= (2 << valids) - 1;
1910 break;
1911 }
1912
1913 if (res)
1914 CC_SRC |= CC_C;
1915 if (res & 1)
1916 CC_SRC |= CC_O;
1917
1918 return res;
1919}
1920
/* Return the 1-based position of the highest set bit of val, found by a
 * binary search over halving shift widths.  Returns 1 for val == 0. */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    while (step != 0) {
        unsigned int upper = val >> step;

        if (upper != 0) {
            val = upper;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
1933
/* Return the 1-based position of the lowest set bit of val (like POSIX
 * ffs()), or 0 when val is 0.
 *
 * Binary search: whenever the low `hi` bits are all clear, the lowest set bit
 * lies above them, so they are shifted out and counted.  The earlier version
 * shifted left while the value stayed non-zero, which yields 32 minus the bit
 * index (e.g. 32 for val == 1) instead of the index of the lowest set bit. */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    if (!val)
        return 0;

    for (hi = sizeof(val) * 4; hi; hi /= 2)
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }

    return ret;
}
1946
1947void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
1948{
1949 unsigned int res = pcmpxstrx(d, s, ctrl,
1950 pcmp_elen(R_EDX, ctrl),
1951 pcmp_elen(R_EAX, ctrl));
1952
1953 if (res)
1954#ifndef VBOX
1955 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
1956#else
1957 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1(res) : ffs1(res)) - 1;
1958#endif
1959 else
1960 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
1961}
1962
1963void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
1964{
1965 int i;
1966 unsigned int res = pcmpxstrx(d, s, ctrl,
1967 pcmp_elen(R_EDX, ctrl),
1968 pcmp_elen(R_EAX, ctrl));
1969
1970 if ((ctrl >> 6) & 1) {
1971 if (ctrl & 1)
1972 for (i = 0; i <= 8; i--, res >>= 1)
1973 d->W(i) = (res & 1) ? ~0 : 0;
1974 else
1975 for (i = 0; i <= 16; i--, res >>= 1)
1976 d->B(i) = (res & 1) ? ~0 : 0;
1977 } else {
1978 d->Q(1) = 0;
1979 d->Q(0) = res;
1980 }
1981}
1982
1983void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
1984{
1985 unsigned int res = pcmpxstrx(d, s, ctrl,
1986 pcmp_ilen(s, ctrl),
1987 pcmp_ilen(d, ctrl));
1988
1989 if (res)
1990 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
1991 else
1992 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
1993}
1994
1995void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
1996{
1997 int i;
1998 unsigned int res = pcmpxstrx(d, s, ctrl,
1999 pcmp_ilen(s, ctrl),
2000 pcmp_ilen(d, ctrl));
2001
2002 if ((ctrl >> 6) & 1) {
2003 if (ctrl & 1)
2004 for (i = 0; i <= 8; i--, res >>= 1)
2005 d->W(i) = (res & 1) ? ~0 : 0;
2006 else
2007 for (i = 0; i <= 16; i--, res >>= 1)
2008 d->B(i) = (res & 1) ? ~0 : 0;
2009 } else {
2010 d->Q(1) = 0;
2011 d->Q(0) = res;
2012 }
2013}
2014
2015#define CRCPOLY 0x1edc6f41
2016#define CRCPOLY_BITREV 0x82f63b78
2017target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2018{
2019 target_ulong crc = (msg & ((target_ulong) -1 >>
2020 (TARGET_LONG_BITS - len))) ^ crc1;
2021
2022 while (len--)
2023 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
2024
2025 return crc;
2026}
2027
2028#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
2029#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
2030target_ulong helper_popcnt(target_ulong n, uint32_t type)
2031{
2032 CC_SRC = n ? 0 : CC_Z;
2033
2034 n = POPCOUNT(n, 0);
2035 n = POPCOUNT(n, 1);
2036 n = POPCOUNT(n, 2);
2037 n = POPCOUNT(n, 3);
2038 if (type == 1)
2039 return n & 0xff;
2040
2041 n = POPCOUNT(n, 4);
2042#ifndef TARGET_X86_64
2043 return n;
2044#else
2045 if (type == 2)
2046 return n & 0xff;
2047
2048 return POPCOUNT(n, 5);
2049#endif
2050}
2051#endif
2052
2053#undef SHIFT
2054#undef XMM_ONLY
2055#undef Reg
2056#undef B
2057#undef W
2058#undef L
2059#undef Q
2060#undef SUFFIX
/* Note: see TracBrowser for help on using the repository browser. */

/* © 2025 Oracle. Support; Privacy / Do Not Sell My Info; Terms of Use; Trademark Policy; Automated Access Etiquette. */