SSE2技术是SSE技术的扩展,它与SSE一样,使用的也是XMM寄存器,只不过SSE2技术提供了一些新的指令,可以对压缩的双精度浮点数,以及压缩的字节、字、双字(32位)和四字(64位)整数进行操作。此外,与SSE类似的是,SSE2既可以处理压缩浮点数,也可以处理标量浮点数,标量浮点数的含义在上一节已经解释过了...
Instruction 指令 |
Description 描述 |
---|---|
MOVAPD |
Moves two aligned, double-precision values to XMM registers or memory 将两个对齐的双精度浮点数加载到XMM寄存器或内存中,当源或目标操作数为内存位置时,该内存位置必须是16字节对齐的,否则就会产生#GP(通用保护异常) |
MOVUPD |
Moves two unaligned, double-precision values to XMM registers or memory 将两个非对齐的双精度浮点数加载到XMM寄存器或内存中,当源或目标操作数为内存位置时,该内存位置可以不是16字节对齐的。 |
MOVDQA |
Moves two aligned, quadword integer values to XMM registers or memory 将两个对齐的64位整数(即一个128位的双四字,其中也可以存放4个32位整数等其他压缩整数格式)加载到XMM寄存器或内存中,当源或目标操作数为内存位置时,该内存位置必须是16字节对齐的,否则就会产生#GP(通用保护异常)。 |
MOVDQU |
Moves two unaligned, quadword integer values to XMM registers or memory 将两个非对齐的64位整数(即一个128位的双四字,其中也可以存放4个32位整数等其他压缩整数格式)加载到XMM寄存器或内存中,当源或目标操作数为内存位置时,该内存位置可以不是16字节对齐的。 |
MOVSD |
Moves one double-precision value to memory or the low quadword of a register 将单个双精度浮点数(标量双精度浮点数)加载到内存中,或者加载到XMM寄存器的低64位中。 |
MOVHPD |
Moves one double-precision value to memory or the high quadword of a register 将单个双精度浮点数从XMM寄存器的高64位传值到64位内存中,或者从64位内存传值到XMM寄存器的高64位。 |
MOVLPD |
Moves one double-precision value to memory or the low quadword of a register 将单个双精度浮点数从XMM寄存器的低64位传值到64位内存中,或者从64位内存传值到XMM寄存器的低64位。 |
# Load packed SSE2 operands from memory into XMM registers.
# GAS, AT&T syntax; 32-bit Linux (exit via int $0x80).
.section .data
.align 16                           # movapd/movdqa raise #GP on memory operands that are not 16-byte aligned
packedvalue1:
    .double 10.235, 289.1           # two doubles = 16 bytes, so packedvalue2 stays 16-byte aligned
packedvalue2:
    .int 10, 20, 30, 40             # four 32-bit integers = 16 bytes
.section .text
.globl _start
_start:
    movapd packedvalue1, %xmm0      # xmm0 = {10.235, 289.1}
    movdqa packedvalue2, %xmm1      # xmm1 = {10, 20, 30, 40}
    # Terminate explicitly; otherwise execution runs past the end of .text.
    movl $1, %eax                   # exit system call number
    movl $0, %ebx                   # exit status 0
    int $0x80
|
Instruction 指令 |
Description 描述 |
---|---|
ADDPD |
Adds packed double-precision floating-point values 将源与目标操作数里的压缩双精度浮点数相加 |
ADDSD |
Adds scalar double-precision floating-point values 将源与目标操作数里的低64位的标量双精度浮点数相加 |
PADDSB |
Adds packed signed byte integer values 将源与目标操作数里的有符号压缩字节整数相加,结果采用有符号饱和处理(溢出时取该类型能表示的最大值或最小值,而不是回绕) |
PADDSW |
Adds packed signed word integer values 将源与目标操作数里的有符号压缩字整数相加,结果采用有符号饱和处理(溢出时取该类型能表示的最大值或最小值,而不是回绕) |
PADDD |
Adds packed doubleword integer values 将源与目标操作数里的压缩双字整数相加 |
PADDQ |
Adds packed quadword integer values 将源与目标操作数里的压缩四字整数相加 |
# sse2math.s - An example of using SSE2 arithmetic instructions
# GAS, AT&T syntax; 32-bit Linux (exit via int $0x80).
.section .data
.align 16                           # the aligned loads below require 16-byte-aligned operands
value1:  .double 10.42, -5.330      # packed doubles, first factor
value2:  .double 4.25, 2.10         # packed doubles, second factor
value3:  .int 10, 20, 30, 40        # packed 32-bit integers
value4:  .int 5, 15, 25, 35         # packed 32-bit integers
.section .bss
# Reserve the result buffers with an explicit 16-byte alignment: .lcomm does not
# promise it, and movapd/movdqa stores raise #GP on a misaligned destination.
.align 16
result1: .space 16                  # receives value1 * value2 (two doubles)
result2: .space 16                  # receives value3 + value4 (four 32-bit integers)
.section .text
.globl _start
_start:
    nop                             # where "b _start" stops in the gdb session
    movapd value1, %xmm0            # xmm0 = {10.42, -5.33}
    movapd value2, %xmm1            # xmm1 = {4.25, 2.10}
    movdqa value3, %xmm2            # xmm2 = {10, 20, 30, 40}
    movdqa value4, %xmm3            # xmm3 = {5, 15, 25, 35}
    mulpd %xmm1, %xmm0              # xmm0 *= xmm1, element-wise on the two doubles
    paddd %xmm3, %xmm2              # xmm2 += xmm3, element-wise on the four doublewords
    movapd %xmm0, result1           # aligned 16-byte store of the products
    movdqa %xmm2, result2           # aligned 16-byte store of the sums
    movl $1, %eax                   # exit system call number
    movl $0, %ebx                   # exit status 0
    int $0x80
|
$ as -gstabs -o sse2math.o sse2math.s $ ld -o sse2math sse2math.o $ gdb -q sse2math Reading symbols from /root/asm_example/adv/sse2math...done. (gdb) b _start Breakpoint 1 at 0x8048074: file sse2math.s, line 18. (gdb) r Starting program: /root/asm_example/adv/sse2math Breakpoint 1, _start () at sse2math.s:18 18 nop (gdb) n 19 movapd value1, %xmm0 (gdb) n 20 movapd value2, %xmm1 (gdb) n 21 movdqa value3, %xmm2 (gdb) n 22 movdqa value4, %xmm3 (gdb) n 23 mulpd %xmm1, %xmm0 (gdb) print $xmm0 $1 = ............................................ v2_double = {10.42, -5.3300000000000001}, ............................................... (gdb) print $xmm1 $2 = ............................................ v2_double = {4.25, 2.1000000000000001}, ............................................... (gdb) print $xmm2 $3 = ............................................ v4_int32 = {10, 20, 30, 40}, ............................................... (gdb) print $xmm3 $4 = ............................................ v4_int32 = {5, 15, 25, 35}, ............................................... (gdb) |
(gdb) x/2gf &result1 0x8049100 <result1>: 44.284999999999997 -11.193000000000001 |
from http://x86.renejeschke.de/html/file_module_x86_id_199.html MOVSHDUP xmm1, xmm2/m128 (intel汇编语法) if(source == m128) { //load instruction xmm1[0..31] = m128[32..63]; xmm1[32..63] = m128[32..63]; xmm1[64..95] = m128[96..127]; xmm1[96..127] = m128[96..127]; } else { //move instruction xmm1[0..31] = xmm2[32..63]; xmm1[32..63] = xmm2[32..63]; xmm1[64..95] = xmm2[96..127]; xmm1[96..127] = xmm2[96..127]; } |
from http://x86.renejeschke.de/html/file_module_x86_id_200.html MOVSLDUP xmm1, xmm2/m128 (intel汇编语法) if(source == m128) { //load instruction xmm1[0..31] = m128[0..31]; xmm1[32..63] = m128[0..31]; xmm1[64..95] = m128[64..95]; xmm1[96..127] = m128[64..95]; } else { //move instruction xmm1[0..31] = xmm2[0..31]; xmm1[32..63] = xmm2[0..31]; xmm1[64..95] = xmm2[64..95]; xmm1[96..127] = xmm2[64..95]; } |
from http://x86.renejeschke.de/html/file_module_x86_id_182.html MOVDDUP xmm1, xmm2/m64 (intel汇编语法) if(Source == m64) { //Load instruction xmm1[0..63] = m64; xmm1[64..127] = m64; } else { //Move instruction xmm1[0..63] = xmm2[0..63]; xmm1[64..127] = xmm2[0..63]; } |
from http://x86.renejeschke.de/html/file_module_x86_id_11.html ADDSUBPS xmm1, xmm2/m128 (intel汇编语法) xmm1[0..31] = xmm1[0..31] - xmm2/m128[0..31]; xmm1[32..63] = xmm1[32..63] + xmm2/m128[32..63]; xmm1[64..95] = xmm1[64..95] - xmm2/m128[64..95]; xmm1[96..127] = xmm1[96..127] + xmm2/m128[96..127]; |
from http://x86.renejeschke.de/html/file_module_x86_id_10.html ADDSUBPD xmm1, xmm2/m128 (intel汇编语法) xmm1[0..63] = xmm1[0..63] - xmm2/m128[0..63]; xmm1[64..127] = xmm1[64..127] + xmm2/m128[64..127]; |
from http://x86.renejeschke.de/html/file_module_x86_id_133.html HADDPS xmm1, xmm2/m128 (intel汇编语法) xmm1[0..31] = xmm1[0..31] + xmm1[32..63]; xmm1[32..63] = xmm1[64..95] + xmm1[96..127]; xmm1[64..95] = xmm2/m128[0..31] + xmm2/m128[32..63]; xmm1[96..127] = xmm2/m128[64..95] + xmm2/m128[96..127]; |
from http://x86.renejeschke.de/html/file_module_x86_id_132.html HADDPD xmm1, xmm2/m128 (intel汇编语法) xmm1[0..63] = xmm1[0..63] + xmm1[64..127]; xmm1[64..127] = xmm2/m128[0..63] + xmm2/m128[64..127]; |
from http://x86.renejeschke.de/html/file_module_x86_id_136.html HSUBPS xmm1, xmm2/m128 (intel汇编语法) xmm1[0..31] = xmm1[0..31] - xmm1[32..63]; xmm1[32..63] = xmm1[64..95] - xmm1[96..127]; xmm1[64..95] = xmm2/m128[0..31] - xmm2/m128[32..63]; xmm1[96..127] = xmm2/m128[64..95] - xmm2/m128[96..127]; |
from http://x86.renejeschke.de/html/file_module_x86_id_135.html HSUBPD xmm1, xmm2/m128 (intel汇编语法) xmm1[0..63] = xmm1[0..63] - xmm1[64..127]; xmm1[64..127] = xmm2/m128[0..63] - xmm2/m128[64..127]; |