x86 汇编/SSE

x86 汇编
快速链接：寄存器 • 移动 • 跳转 • 计算 • 逻辑 • 重排 • 其他 • FPU

SSE 代表流式 SIMD 扩展。它本质上是MMX 指令的浮点等效指令。SSE 寄存器为 128 位，可用于对各种数据大小和类型执行操作。与 MMX 不同，SSE 寄存器不与浮点栈重叠。

寄存器

SSE 由英特尔于 1999 年在奔腾 III 中推出，创建了八个新的 128 位寄存器

XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7

最初，SSE 寄存器只能用作四个 32 位单精度浮点数（相当于 C 中的 float）。SSE2 扩展了 XMM 寄存器的功能，因此现在可以将它们用作

2 个 64 位浮点数（双精度）
2 个 64 位整数
4 个 32 位浮点数（单精度）
4 个 32 位整数
8 个 16 位整数
16 个 8 位字符（字节）

数据移动示例

以下程序（使用NASM 语法）使用 SIMD 指令执行数据移动。

;
; sseMove.asm: SSE and SSE2 data movement demo (NASM syntax, 32-bit ELF).
; Nothing is printed; single-step it in a debugger and inspect the xmm
; registers and mask1..mask4.
;
; nasm -felf32 -g sseMove.asm
; ld -g sseMove.o
;
global _start

section .data
	align 16
	v1:	dd 1.1, 2.2, 3.3, 4.4	; Four Single precision floats 32 bits each
	v1dp:	dq 1.1, 2.2		; Two Double precision floats 64 bits each
	v2:	dd 5.5, 6.6, 7.7, 8.8
	v2s1:	dd 5.5, 6.6, 7.7, -8.8	; Sign bit set in the highest element only
	v2s2:	dd 5.5, 6.6, -7.7, -8.8	; Sign bit set in the two highest elements
	v2s3:	dd 5.5, -6.6, -7.7, -8.8	; Sign bit set in the three highest elements
	v2s4:	dd -5.5, -6.6, -7.7, -8.8	; Sign bit set in all four elements
	num1:	dd 1.2			; 4 bytes, so what follows is off the 16 byte grid
	v3:	dd 1.2, 2.3, 4.5, 6.7	; No longer 16 byte aligned
	v3dp:	dq 1.2, 2.3		; No longer 16 byte aligned

section .bss
	mask1:	resd 1			; Receives the sign mask of v2s1
	mask2:	resd 1			; Receives the sign mask of v2s2
	mask3:	resd 1			; Receives the sign mask of v2s3
	mask4:	resd 1			; Receives the sign mask of v2s4

section .text
	_start:

;
;	op	dst,  src
;
				;
				; SSE
				;
				; Using movaps since vectors are 16 byte aligned
	movaps	xmm0, [v1]	; Move four 32-bit(single precision) floats to xmm0 
	movaps	xmm1, [v2]
	movups	xmm2, [v3]	; Need to use movups since v3 is not 16 byte aligned
	;movaps	xmm3, [v3]	; This would seg fault if uncommented 
	movss	xmm3, [num1]	; Move 32-bit float num1 to the least significant element of xmm3
	movss	xmm3, [v3]	; Move first 32-bit float of v3 to the least significant element of xmm3
	movlps	xmm4, [v3]	; Move 64-bits(two single precision floats) from memory to the lower 64-bit elements of xmm4
	movhps	xmm4, [v2]	; Move 64-bits(two single precision floats) from memory to the higher 64-bit elements of xmm4

				; Source and destination for movhlps and movlhps must be xmm registers
	movhlps	xmm5, xmm4	; Transfers the higher 64-bits of the source xmm4 to the lower 64-bits of the destination xmm5
	movlhps	xmm5, xmm4	; Transfers the lower 64-bits of the source xmm4 to the higher 64-bits of the destination xmm5


	movaps	xmm6, [v2s1]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bits floats in xmm6 and create 4 bit mask in eax 
	mov	[mask1], eax	; Should be 8  (1000b)
	movaps	xmm6, [v2s2]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bits floats in xmm6 and create 4 bit mask in eax
	mov	[mask2], eax	; Should be 12 (1100b)
	movaps	xmm6, [v2s3]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bits floats in xmm6 and create 4 bit mask in eax
	mov	[mask3], eax	; Should be 14 (1110b)
	movaps	xmm6, [v2s4]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bits floats in xmm6 and create 4 bit mask in eax
	mov	[mask4], eax	; Should be 15 (1111b)


				;
				; SSE2
				;
	movapd	xmm6, [v1dp]	; Move two 64-bit(double precision) floats to xmm6, using movapd since vector is 16 byte aligned 
				; Next two instruction should have equivalent results to movapd xmm6, [v1dp]
	movhpd	xmm6, [v1dp+8]	; Move a 64-bit(double precision) float into the higher 64-bit elements of xmm6 
	movlpd	xmm6, [v1dp]	; Move a 64-bit(double precision) float into the lower 64-bit elements of xmm6
	movupd	xmm6, [v3dp]	; Move two 64-bit floats to xmm6, using movupd since vector is not 16 byte aligned

				;
				; _start has no caller to return to, and running past
				; the last instruction would execute whatever bytes
				; follow .text. Leave through the exit system call
				; (32-bit Linux, int 0x80: eax = 1, ebx = status).
				;
	xor	ebx, ebx	; Exit status 0
	mov	eax, 1		; Syscall number: exit
	int	0x80

使用打包单精度浮点数进行算术运算的示例

以下程序（使用NASM 语法）对一些数字执行一些 SIMD 操作。

global _start

section .data
    v1: dd 1.1, 2.2, 3.3, 4.4    ;first set of 4 numbers
    v2: dd 5.5, 6.6, 7.7, 8.8    ;second set
    
section .bss
    v3: resd 4    ;result: ((v1 + v2) * v2) - v2, element by element
    
section .text
    _start:
    
    movups xmm0, [v1]   ;load v1 into xmm0
    movups xmm1, [v2]   ;load v2 into xmm1
    
    addps xmm0, xmm1    ;add the 4 numbers in xmm1 (from v2) to the 4 numbers in xmm0 (from v1), store in xmm0. for the first float the result will be 5.5+1.1=6.6
    mulps xmm0, xmm1    ;multiply the four numbers in xmm1 (from v2, unchanged) with the results from the previous calculation (in xmm0), store in xmm0. for the first float the result will be 5.5*6.6=36.3
    subps xmm0, xmm1    ;subtract the four numbers in v2 (in xmm1, still unchanged) from result from previous calculation (in xmm0). for the first float, the result will be 36.3-5.5=30.8
    
    movups [v3], xmm0   ;store the result (xmm0) in v3
    
    ;end program
    ;_start is the raw process entry point: there is no return address on the
    ;stack, so "ret" would jump to a garbage address. Use the exit system call
    ;instead (32-bit Linux, int 0x80: eax = 1, ebx = status).
    xor ebx, ebx        ;exit status 0
    mov eax, 1          ;syscall number: exit
    int 0x80

结果值应为

30.800    51.480    77.000    107.360

使用 GNU 工具链，你可以像这样调试和单步执行

 % nasm -felf32 -g ssedemo.asm
 % ld -g ssedemo.o            
 % gdb -q ./a.out                
Reading symbols from a.out...done.
(gdb) break _start
Breakpoint 1 at 0x8048080
(gdb) r
Starting program: a.out 

Breakpoint 1, 0x08048080 in _start ()
(gdb) disass
Dump of assembler code for function _start:
=> 0x08048080 <+0>:	movups 0x80490a0,%xmm0
   0x08048087 <+7>:	movups 0x80490b0,%xmm1
   0x0804808e <+14>:	addps  %xmm1,%xmm0
   0x08048091 <+17>:	mulps  %xmm1,%xmm0
   0x08048094 <+20>:	subps  %xmm1,%xmm0
   0x08048097 <+23>:	movups %xmm0,0x80490c0
End of assembler dump.
(gdb) stepi
0x08048087 in _start ()
(gdb) 
0x0804808e in _start ()
(gdb) p $xmm0
$1 = {v4_float = {1.10000002, 2.20000005, 3.29999995, 4.4000001}, v2_double = {3.6000008549541236, 921.60022034645078}, v16_int8 = {-51, -52, -116, 63, 
    -51, -52, 12, 64, 51, 51, 83, 64, -51, -52, -116, 64}, v8_int16 = {-13107, 16268, -13107, 16396, 13107, 16467, -13107, 16524}, v4_int32 = {1066192077, 
    1074580685, 1079194419, 1082969293}, v2_int64 = {4615288900054469837, 4651317697086436147}, uint128 = 0x408ccccd40533333400ccccd3f8ccccd}
(gdb) x/4f &v1
0x80490a0 <v1>:	1.10000002	2.20000005	3.29999995	4.4000001
(gdb) stepi
0x08048091 in _start ()
(gdb) p $xmm0
$2 = {v4_float = {6.5999999, 8.80000019, 11, 13.2000008}, v2_double = {235929.65665283203, 5033169.0185546875}, v16_int8 = {51, 51, -45, 64, -51, -52, 12, 
    65, 0, 0, 48, 65, 52, 51, 83, 65}, v8_int16 = {13107, 16595, -13107, 16652, 0, 16688, 13108, 16723}, v4_int32 = {1087583027, 1091357901, 1093664768, 
    1095971636}, v2_int64 = {4687346494113788723, 4707162335057281024}, uint128 = 0x4153333441300000410ccccd40d33333}
(gdb)

调试器命令解释

break: 在本例中，在给定标签处设置断点
stepi: 在程序中向前执行一步指令
p: print 的缩写，打印给定寄存器或变量。在 GDB 中，寄存器以 $ 为前缀。
x: examine 的缩写，检查给定内存地址。"/4f" 表示 "4 个浮点数"（GDB 中的浮点数为 32 位）。你可以使用 c 表示字符，x 表示十六进制，当然也可以使用任何其他数字代替 4。"&" 获取 v1 的地址，与 C 中相同。

使用 `shufps` 进行乱序的示例

shufps IMM8, arg1, arg2	GAS 语法
shufps arg2, arg1, IMM8	英特尔语法

shufps 可用于对打包单精度浮点数进行乱序。该指令采用三个参数：arg1（源操作数）为 xmm 寄存器或 128 位内存位置，arg2（目标操作数）为 xmm 寄存器，IMM8 为 8 位立即数控制字节。shufps 将分别从 arg2 和 arg1 各取两个元素，并将结果写入 arg2：结果中较低的两个元素来自 arg2（原值），较高的两个元素来自 arg1。

IMM8 控制字节描述

IMM8 控制字节被分成四个位字段组，它们控制输出到 arg2，如下所示

IMM8[1:0] 指定 arg2（目标操作数的原值）中哪个元素最终位于 arg2 的最低有效元素中

IMM8[1:0] 描述

00b 复制到最低有效元素

01b 复制到第二个元素

10b 复制到第三个元素

11b 复制到最高有效元素
IMM8[3:2] 指定 arg2（目标操作数的原值）中哪个元素最终位于 arg2 的第二个元素中

IMM8[3:2] 描述

00b 复制到最低有效元素

01b 复制到第二个元素

10b 复制到第三个元素

11b 复制到最高有效元素
IMM8[5:4] 指定 arg1（源操作数）中哪个元素最终位于 arg2 的第三个元素中

IMM8[5:4] 描述

00b 复制到最低有效元素

01b 复制到第二个元素

10b 复制到第三个元素

11b 复制到最高有效元素
IMM8[7:6] 指定 arg1（源操作数）中哪个元素最终位于 arg2 的最高有效元素中

IMM8[7:6] 描述

00b 复制到最低有效元素

01b 复制到第二个元素

10b 复制到第三个元素

11b 复制到最高有效元素

IMM8 示例

考虑字节 0x1B

位号（0 为 LSB）	7	6	5	4	3	2	1	0
字节值	0x1B
四位字节值	0x1				0xB
2 位整数（十进制）值	0		1		2		3
位值	0	0	0	1	1	0	1	1

上面显示的 2 位值用于确定哪些元素被复制到 arg2。位 7-4 是 arg1（源操作数）中的 "索引"，位 3-0 是 arg2（目标操作数的原值）中的 "索引"。

由于位 7-6 为 0，因此 arg1 的最低有效元素被复制到 arg2 的最高有效元素中，即位 127-96。
由于位 5-4 为 1，因此 arg1 的第二个元素被复制到 arg2 的第三个元素中，即位 95-64。
由于位 3-2 为 2，因此 arg2 的第三个元素被复制到 arg2 的第二个元素中，即位 63-32。
由于位 1-0 为 3，因此 arg2 的第四个元素被复制到 arg2 的最低有效元素中，即位 31-0。

请注意，由于以下示例中的第一个和第二个参数相等，因此掩码 0x1B 将有效地反转 XMM 寄存器中浮点数的顺序，因为 2 位整数为 0、1、2、3。如果是 3、2、1、0 (0xE4)，它将是一个无操作。如果是 0、0、0、0 (0x00)，它将是最低有效 32 位的广播。

示例

.data
	.align 16
        v1: .float 1.1, 2.2, 3.3, 4.4
        v2: .float 5.5, 6.6, 7.7, 8.8   # not referenced by the code below
        v3: .float 0, 0, 0, 0           # receives the reversed copy of v1
 
.text
.global _start 
_start:   
        movaps  v1,%xmm0        # load v1 into xmm0 to xmm6
        movaps  v1,%xmm1	# using movaps since v1 is 16 byte aligned
        movaps  v1,%xmm2
        movaps  v1,%xmm3
        movaps  v1,%xmm4
        movaps  v1,%xmm5
        movaps  v1,%xmm6
 
        # Source and destination are the same register in every shuffle, so
        # all four 2-bit selectors of the control byte index the same vector.
        shufps $0x1b, %xmm0, %xmm0 # reverse order of the 4 floats
        shufps $0x00, %xmm1, %xmm1 # Broadcast least significant element to all elements
        shufps $0x55, %xmm2, %xmm2 # Broadcast second element to all elements
        shufps $0xAA, %xmm3, %xmm3 # Broadcast third element to all elements
        shufps $0xFF, %xmm4, %xmm4 # Broadcast most significant element to all elements
        shufps $0x39, %xmm5, %xmm5 # Rotate elements right
        shufps $0x93, %xmm6, %xmm6 # Rotate elements left 

        movups  %xmm0,v3        # store the reversed vector (xmm0) in v3

        # _start is the raw process entry point: there is no return address
        # on the stack, so "ret" would jump to a garbage address. Leave via
        # the exit system call instead (int $0x80: eax = 1, ebx = status).
        # NOTE(review): int $0x80 is the 32-bit Linux system call gate; a
        # 64-bit build presumably relies on the kernel's IA-32 emulation -
        # confirm for the target system.
        movl    $1, %eax        # system call number: exit
        xorl    %ebx, %ebx      # exit status 0
        int     $0x80

使用 GAS 构建 ELF 可执行文件

as -g shufps.S -o shufps.o
ld -g shufps.o

文本处理指令

SSE 4.2 添加了四个字符串文本处理指令 PCMPISTRI、PCMPISTRM、PCMPESTRI 和 PCMPESTRM。这些指令采用三个参数，arg1 为 xmm 寄存器，arg2 为 xmm 或 128 位内存位置，IMM8 为 8 位立即数控制字节。这些指令将对 arg1 和 arg2 的打包内容执行算术比较。IMM8 指定输入/输出格式以及两个中间处理阶段的操作。中间处理阶段 1 和阶段 2 的结果将分别称为 IntRes1 和 IntRes2。这些指令还通过对算术标志（AF、CF、OF、PF、SF 和 ZF）的重载使用提供有关结果的附加信息。

这些指令分多个步骤进行

比较 arg1 和 arg2
将聚合操作应用于比较结果，结果流入 IntRes1
执行可选的否定操作，结果流入 IntRes2
生成一个索引（在ECX中）或掩码（在XMM0中）形式的输出

IMM8 控制字节描述

IMM8 控制字节被分成四组位域，控制以下设置

IMM8[1:0] 指定 128 位源数据的格式（arg1 和 arg2）

IMM8[1:0]	描述
00b	无符号字节（16 个打包的无符号字节）
01b	无符号字（8 个打包的无符号字）
10b	有符号字节（16 个打包的有符号字节）
11b	有符号字（8 个打包的有符号字）

IMM8[3:2] 指定聚合操作，其结果将被放置在中间结果 1 中，我们将称之为 IntRes1。IntRes1 的大小将取决于源数据的格式，打包字节为 16 位，打包字为 8 位

IMM8[3:2]	描述
00b	等于任何，arg1 是一个字符集，arg2 是要搜索的字符串。如果 arg2[i] 位于 arg1 表示的集合中，则 IntRes1[i] 设置为 1 arg1 = "aeiou" arg2 = "Example string 1" IntRes1 = 0010001000010000
01b	范围，arg1 是一组字符范围，例如 "09az" 表示从 0 到 9 和从 a 到 z 的所有字符，arg2 是要搜索的字符串。如果 arg2[i] 位于 arg1 表示的任何范围内，则 IntRes1[i] 设置为 1 arg1 = "09az" arg2 = "Testing 1 2 3, T" IntRes1 = 0111111010101000
10b	每个都相等，arg1 是字符串一，arg2 是字符串二。如果 arg1[i] == arg2[i]，则 IntRes1[i] 设置为 1 arg1 = "The quick brown " arg2 = "The quack green " IntRes1 = 1111110111010011
11b	有序相等，arg1 是要搜索的子字符串，arg2 是要搜索的字符串。如果子字符串 arg1 可以在位置 arg2[i] 处找到，则 IntRes1[i] 设置为 1 arg1 = "he" arg2 = ", he helped her " IntRes1 = 0010010000001000

IMM8[5:4] 指定极性，即如何将 IntRes1 处理为中间结果 2（下文称为 IntRes2）

IMM8[5:4]	描述
00b	正极性	IntRes2 = IntRes1
01b	负极性	IntRes2 = -1 XOR IntRes1
10b	掩码正	IntRes2 = IntRes1
11b	掩码负	如果 reg/mem[i] 无效，则 IntRes2 = IntRes1，否则 ~IntRes1

IMM8[6] 指定输出选择，或 IntRes2 如何处理到输出中。对于 PCMPESTRI 和 PCMPISTRI，输出是当前由 arg2 引用的数据的索引

IMM8[6] 描述

0b 最低有效索引 ECX 包含 IntRes2 中最低的已置位（值为 1）的位的索引

1b 最高有效索引 ECX 包含 IntRes2 中最高的已置位（值为 1）的位的索引

对于 PCMPESTRM 和 PCMPISTRM，输出是一个掩码，反映了 IntRes2 中所有设置的位

IMM8[6]	描述
0b	最低有效索引	位掩码，XMM0 的最低有效位包含 IntRes2 16(8) 位掩码。XMM0 被零扩展到 128 位。
1b	最高有效索引	字节/字掩码，XMM0 包含扩展到字节/字掩码的 IntRes2

IMM8[7] 应设置为零，因为它没有设计的含义。

四个指令

pcmpistri IMM8, arg2, arg1	GAS 语法
pcmpistri arg1, arg2, IMM8	英特尔语法

PCMPISTRI，打包比较隐式长度字符串，返回索引。比较隐式长度的字符串并在 ECX 中生成索引。

操作数

arg1

XMM 寄存器

arg2

XMM 寄存器
内存

IMM8

8 位立即值

修改后的标志

如果 IntRes2 为零，则 CF 被重置，否则被设置
如果在 arg2 中找到空终止字符，则 ZF 被设置，否则被重置
如果在 arg1 中找到空终止字符，则 SF 被设置，否则被重置
OF 设置为 IntRes2[0]
AF 被重置
PF 被重置

示例

;
; nasm -felf32 -g sse4_2StrPcmpistri.asm -l sse4_2StrPcmpistri.lst
; gcc -o sse4_2StrPcmpistri sse4_2StrPcmpistri.o
;
global main 

extern printf
extern strlen
extern strcmp

section .data
	align 4
	;
	; Fill buf1 with a repeating pattern of ABCD
	;
	; 10 dwords = 40 bytes; 0x44434241 is stored little-endian as the bytes
	; 'A','B','C','D'. main overwrites the last byte (buf1+39) with a null to
	; turn the buffer into a 39 character C string.
	;
	buf1:		times 10 dd 0x44434241
	;
	; Null terminated test strings: s2 extends s1, s3 is a prefix of s1.
	;
	s1:		db "This is a string", 0
	s2:		db "This is a string slightly different string", 0
	s3:		db "This is a str", 0
	;
	; printf format strings (0x0A is a newline). The "b" variants label the
	; lines produced with the libc routines strlen(3) / strcmp(3).
	;
	fmtStr1:	db "String: %s len: %d", 0x0A, 0
	fmtStr1b:	db "strlen(3): String: %s len: %d", 0x0A, 0
	fmtStr2:	db "s1: =%s= and s2: =%s= compare: %d", 0x0A, 0
	fmtStr2b:	db "strcmp(3): s1: =%s= and s2: =%s= compare: %d", 0x0A, 0

;
; Functions will follow the cdecl call convention
;
section .text
	main:			; Using main since we are using gcc to link

	;
	; Compares the SSE4.2 routines strlenSSE42 / strcmpSSE42 with the libc
	; strlen(3) / strcmp(3) on the same inputs and prints each result.
	;
	; main never returns (it leaves through the local exit routine), so the
	; incoming stack pointer is not preserved. After the two instructions
	; below, [esp] .. [esp+12] are the four outgoing cdecl argument slots.
	;
	and	esp, -16	; 16 byte align the stack (clear the low four bits)
	sub	esp, 16		; space for four 4 byte parameters

	;
	; Null terminate buf1, make it proper C string, length is now 39
	;
	mov	[buf1+39], byte 0x00

	lea	eax, [buf1]
	mov	[esp], eax	; Arg1: pointer of string to calculate the length of
	mov	ebx, eax	; Save pointer in ebx since we will use it again
	call	strlenSSE42
	mov	edx, eax	; Copy length of arg1 into edx
	
	mov	[esp+8], edx	; Arg3: length of string
	mov	[esp+4], ebx	; Arg2: pointer to string
	lea	eax, [fmtStr1]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf		; Call printf(3):
				;	int printf(const char *format, ...);

	lea	eax, [buf1]
	mov	[esp], eax	; Arg1: pointer of string to calculate the length of
	mov	ebx, eax	; Save pointer in ebx since we will use it again
	call	strlen		; Call strlen(3):
				;	size_t strlen(const char *s);
	mov	edx, eax	; Copy length of arg1 into edx
	
	mov	[esp+8], edx	; Arg3: length of string
	mov	[esp+4], ebx	; Arg2: pointer to string
	lea	eax, [fmtStr1b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf		; Call printf(3):
				;	int printf(const char *format, ...);

	lea	eax, [s2]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmpSSE42

	mov	[esp+12], eax	; Arg4: result from strcmpSSE42  
	lea	eax, [s2]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s2]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmp		; Call strcmp(3):
				;	int strcmp(const char *s1, const char *s2);

	mov	[esp+12], eax	; Arg4: result from strcmp(3)
	lea	eax, [s2]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s3]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmpSSE42

	mov	[esp+12], eax	; Arg4: result from strcmpSSE42  
	lea	eax, [s3]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s3]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmp		; Call strcmp(3):
				;	int strcmp(const char *s1, const char *s2);

	mov	[esp+12], eax	; Arg4: result from strcmp(3)
	lea	eax, [s3]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	call	exit		; Local exit routine in this file; does not return


;
; size_t strlen(const char *s);
;
; cdecl: s is read from [ebp+8], the length is returned in eax.
; Clobbers ecx, edx, xmm0 and the flags.
;
; pcmpistri always reads a full 16 bytes. Reading 16 bytes from an arbitrary
; address can run past the terminator into the next page, which faults if that
; page is not mapped. The string is therefore scanned byte by byte until the
; pointer is 16 byte aligned; an aligned 16 byte read never crosses a page
; boundary.
;
strlenSSE42:
	push	ebp
	mov	ebp, esp

	mov	edx, [ebp+8]	; Arg1: copy s(pointer to string) to edx 
	mov	eax, edx	; eax is the scan pointer, edx keeps the start

.alignHead:
	test	eax, 15		; Scan pointer 16 byte aligned yet?
	jz	.aligned
	cmp	byte [eax], 0	; Terminator inside the unaligned head?
	je	.done
	inc	eax
	jmp	.alignHead

.aligned:
	;
	; We are looking for null terminating char, so set xmm0 to zero
	;
	pxor	xmm0, xmm0
	sub	eax, 16		; Avoid extra jump in main loop

.scanChunk:
	add	eax, 16
	;
	; IMM8[1:0]	= 00b
	;	Src data is unsigned bytes(16 packed unsigned bytes)
	; IMM8[3:2]	= 10b
	; 	We are using Equal Each aggregation
	; IMM8[5:4]	= 00b
	;	Positive Polarity, IntRes2	= IntRes1
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
	pcmpistri	xmm0, [eax], 0001000b
	;
	; Loop while ZF == 0, which means none of the 16 bytes pointed to by
	; eax are zero.
	;
	jnz	.scanChunk

	;
	; ecx contains the offset from eax where the first null terminating
	; character was found, so eax + ecx points at the terminator.
	;
	add	eax, ecx

.done:
	sub	eax, edx	; length = terminator address - start address
	pop	ebp
	ret

;
; int strcmp(const char *s1, const char *s2);
;
; cdecl: s1 is read from [ebp+8], s2 from [ebp+12]. Returns 0 in eax when no
; difference is found, otherwise (first differing byte of s1) minus (the
; corresponding byte of s2), both zero extended.
; Overwrites ecx, edx, xmm0 and the flags.
;
; NOTE(review): each iteration reads a full 16 bytes from both strings no
; matter where they end - presumably this can fault when a string ends just
; before an unmapped page; confirm before using on arbitrary buffers.
;
strcmpSSE42:
	push	ebp
	mov	ebp, esp

	mov	eax, [ebp+8]	; Arg1: copy s1(pointer to string) to eax
	mov	edx, [ebp+12]	; Arg2: copy s2(pointer to string) to edx
	;
	; Subtract s2(edx) from s1(eax). This admittedly looks odd, but we
	; can now use edx to index into s1 and s2. As we adjust edx to move
	; forward into s2, we can then add edx to eax and this will give us
	; the comparable offset into s1 i.e. if we take edx + 16 then:
	;
	;	edx 	= edx + 16		= edx + 16
	;	eax+edx	= eax -edx + edx + 16	= eax + 16
	;
	; therefore edx points to s2 + 16 and eax + edx points to s1 + 16.
	; We thus only need one index, convoluted but effective.
	;
	sub	eax, edx
	sub	edx, 16		; Avoid extra jump in main loop

strcmpLoop:
	add	edx, 16
	movdqu	xmm0, [edx]	; xmm0 = next 16 bytes of s2
	;
	; IMM8[1:0]	= 00b
	;	Src data is unsigned bytes(16 packed unsigned bytes)
	; IMM8[3:2]	= 10b
	; 	We are using Equal Each aggregation
	; IMM8[5:4]	= 01b
	;	Negative Polarity, IntRes2	= -1 XOR IntRes1
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
	pcmpistri	xmm0, [edx+eax], 0011000b
	;
	; Loop while ZF=0 and CF=0 (ja), i.e. leave the loop as soon as:
	;
	;	1) We find a null in s1(edx+eax) ZF=1
	;	2) We find a char that does not match CF=1
	;
	ja	strcmpLoop

	;
	; Jump if CF=1, we found a mismatched char
	;
	jc	strcmpDiff

	;
	; We terminated loop due to a null character i.e. CF=0 and ZF=1
	;
	xor	eax, eax	; They are equal so return zero
	jmp	exitStrcmp

strcmpDiff:
	add	eax, edx	; Set offset into s1 to match s2
	;
	; ecx is offset from current position where two strings do not match,
	; so copy the respective non-matching byte into eax and edx and fill
	; in remaining bits w/ zero.
	;
	movzx	eax, byte[eax+ecx]
	movzx	edx, byte[edx+ecx]
	;
	; If s1 is less than s2 return integer less than zero, otherwise return
	; integer greater than zero.
	;
	sub	eax, edx

exitStrcmp:
	pop	ebp
	ret

exit:
				;
				; Terminate the process with status 0. Never
				; returns, so the stack pointer may be changed
				; freely.
				;
				; The raw system call below skips libc's exit(3)
				; processing, so text that printf(3) is still
				; holding in its stdio buffer (stdout redirected
				; to a file or a pipe) would be lost. Flush all
				; output streams first.
				;
	extern	fflush
	and	esp, -16	; 16 byte align the stack for the call
	sub	esp, 16		; room for the 4 byte parameter, keeps alignment
	mov	dword [esp], 0	; Arg1: NULL, flush every open output stream
	call	fflush		; Call fflush(3):
				;	int fflush(FILE *stream);
				;
				; exit system call (32-bit Linux, int 0x80)
				;	void exit(int status)
				;
	xor	ebx, ebx	; Arg one: the status
	mov	eax, 1		; Syscall number:
	int 	0x80

预期输出

String: ABCDABCDABCDABCDABCDABCDABCDABCDABCDABC len: 39
strlen(3): String: ABCDABCDABCDABCDABCDABCDABCDABCDABCDABC len: 39
s1: =This is a string= and s2: =This is a string slightly different string= compare: -32
strcmp(3): s1: =This is a string= and s2: =This is a string slightly different string= compare: -32
s1: =This is a string= and s2: =This is a str= compare: 105
strcmp(3): s1: =This is a string= and s2: =This is a str= compare: 105

pcmpistrm IMM8, arg2, arg1	GAS 语法
pcmpistrm arg1, arg2, IMM8	英特尔语法

PCMPISTRM，打包比较隐式长度字符串，返回掩码。比较隐式长度的字符串并在 XMM0 中生成掩码。

操作数

arg1

XMM 寄存器

arg2

XMM 寄存器
内存

IMM8

8 位立即值

修改后的标志

如果 IntRes2 为零，则 CF 被重置，否则被设置
如果在 arg2 中找到空终止字符，则 ZF 被设置，否则被重置
如果在 arg1 中找到空终止字符，则 SF 被设置，否则被重置
OF 设置为 IntRes2[0]
AF 被重置
PF 被重置

pcmpestri IMM8, arg2, arg1	GAS 语法
pcmpestri arg1, arg2, IMM8	英特尔语法

PCMPESTRI，打包比较显式长度字符串，返回索引。比较显式长度的字符串并在 ECX 中生成索引。

操作数

arg1

XMM 寄存器

arg2

XMM 寄存器
内存

IMM8

8 位立即值

隐式操作数

EAX 保存 arg1 的长度
EDX 保存 arg2 的长度

修改后的标志

如果 IntRes2 为零，则 CF 被重置，否则被设置
如果 EDX < 16（对于字节）或 8（对于字），则 ZF 被设置，否则被重置
如果 EAX < 16（对于字节）或 8（对于字），则 SF 被设置，否则被重置
OF 设置为 IntRes2[0]
AF 被重置
PF 被重置

pcmpestrm IMM8, arg2, arg1	GAS 语法
pcmpestrm arg1, arg2, IMM8	英特尔语法

PCMPESTRM，打包比较显式长度字符串，返回掩码。比较显式长度的字符串并在 XMM0 中生成掩码。

操作数

arg1

XMM 寄存器

arg2

XMM 寄存器
内存

IMM8

8 位立即值

隐式操作数

EAX 保存 arg1 的长度
EDX 保存 arg2 的长度

修改后的标志

如果 IntRes2 为零，则 CF 被重置，否则被设置
如果 EDX < 16（对于字节）或 8（对于字），则 ZF 被设置，否则被重置
如果 EAX < 16（对于字节）或 8（对于字），则 SF 被设置，否则被重置
OF 设置为 IntRes2[0]
AF 被重置
PF 被重置

SSE 指令集

实际上有数百个 SSE 指令，其中一些能够完成比简单的 SIMD 算术运算更复杂的操作。有关更深入的参考资料，请查看本书的资源章节。

您可能会注意到许多浮点 SSE 指令以 PS 或 SD 之类的结尾。这些后缀区分操作的不同版本。第一个字母描述指令应该是Packed（打包）还是Scalar（标量）。打包操作应用于寄存器的每个成员，而标量操作仅应用于第一个值。例如，在伪代码中，打包加法将被执行为

v1[0] = v1[0] + v2[0]
v1[1] = v1[1] + v2[1]
v1[2] = v1[2] + v2[2]
v1[3] = v1[3] + v2[3]

而标量加法将仅为

v1[0] = v1[0] + v2[0]

第二个字母表示数据大小：Single（单精度）或Double（双精度）。这只是告诉处理器分别使用寄存器作为四个 32 位浮点数或两个 64 位双精度数。

SSE：在奔腾 III 中添加

浮点指令

ADDPS，ADDSS，CMPPS，CMPSS，COMISS，CVTPI2PS，CVTPS2PI，CVTSI2SS，CVTSS2SI，CVTTPS2PI，CVTTSS2SI，DIVPS，DIVSS，LDMXCSR，MAXPS，MAXSS，MINPS，MINSS，MOVAPS，MOVHLPS，MOVHPS，MOVLHPS，MOVLPS，MOVMSKPS，MOVNTPS，MOVSS，MOVUPS，MULPS，MULSS，RCPPS，RCPSS，RSQRTPS，RSQRTSS，SHUFPS，SQRTPS，SQRTSS，STMXCSR，SUBPS，SUBSS，UCOMISS，UNPCKHPS，UNPCKLPS

整数指令

ANDNPS，ANDPS，ORPS，PAVGB，PAVGW，PEXTRW，PINSRW，PMAXSW，PMAXUB，PMINSW，PMINUB，PMOVMSKB，PMULHUW，PSADBW，PSHUFW，XORPS

SSE2：在奔腾 4 中添加

浮点指令

ADDPD，ADDSD，ANDNPD，ANDPD，CMPPD，CMPSD*，COMISD，CVTDQ2PD，CVTDQ2PS，CVTPD2DQ，CVTPD2PI，CVTPD2PS，CVTPI2PD，CVTPS2DQ，CVTPS2PD，CVTSD2SI，CVTSD2SS，CVTSI2SD，CVTSS2SD，CVTTPD2DQ，CVTTPD2PI，CVTTPS2DQ，CVTTSD2SI，DIVPD，DIVSD，MAXPD，MAXSD，MINPD，MINSD，MOVAPD，MOVHPD，MOVLPD，MOVMSKPD，MOVSD*，MOVUPD，MULPD，MULSD，ORPD，SHUFPD，SQRTPD，SQRTSD，SUBPD，SUBSD，UCOMISD，UNPCKHPD，UNPCKLPD，XORPD

* CMPSD 和 MOVSD 与字符串指令助记符 CMPSD（CMPS）和 MOVSD（MOVS）具有相同的名称；但是，前者指的是标量双精度浮点数，而后者指的是双字字符串。

整数指令

MOVDQ2Q，MOVDQA，MOVDQU，MOVQ2DQ，PADDQ，PSUBQ，PMULUDQ，PSHUFHW，PSHUFLW，PSHUFD，PSLLDQ，PSRLDQ，PUNPCKHQDQ，PUNPCKLQDQ

SSE3：在后来的奔腾 4 中添加

ADDSUBPD，ADDSUBPS，HADDPD，HADDPS，HSUBPD，HSUBPS，MOVDDUP，MOVSHDUP，MOVSLDUP

SSSE3：在至强 5100 和早期酷睿 2 中添加

PSIGNW，PSIGND，PSIGNB，PSHUFB，PMULHRSW，PMADDUBSW，PHSUBW，PHSUBSW，PHSUBD，PHADDW，PHADDSW，PHADDD，PALIGNR，PABSW，PABSD，PABSB

SSE4

SSE4.1：在后来的酷睿 2 中添加

MPSADBW，PHMINPOSUW，PMULLD，PMULDQ，DPPS，DPPD，BLENDPS，BLENDPD，BLENDVPS，BLENDVPD，PBLENDVB，PBLENDW，PMINSB，PMAXSB，PMINUW，PMAXUW，PMINUD，PMAXUD，PMINSD，PMAXSD，ROUNDPS，ROUNDSS，ROUNDPD，ROUNDSD，INSERTPS，PINSRB，PINSRD，PINSRQ，EXTRACTPS，PEXTRB，PEXTRW，PEXTRD，PEXTRQ，PMOVSXBW，PMOVZXBW，PMOVSXBD，PMOVZXBD，PMOVSXBQ，PMOVZXBQ，PMOVSXWD，PMOVZXWD，PMOVSXWQ，PMOVZXWQ，PMOVSXDQ，PMOVZXDQ，PTEST，PCMPEQQ，PACKUSDW，MOVNTDQA

SSE4a：在羿龙中添加

LZCNT，POPCNT，EXTRQ，INSERTQ，MOVNTSD，MOVNTSS

SSE4.2：在 Nehalem 中添加

CRC32, PCMPESTRI, PCMPESTRM, PCMPISTRI, PCMPISTRM, PCMPGTQ

IMM8[7:6]	描述
00b	复制到最低有效元素
01b	复制到第二个元素
10b	复制到第三个元素
11b	复制到最高有效元素

寄存器

数据移动示例

使用打包单精度浮点数进行算术运算的示例

调试器命令解释

使用进行乱序的示例shufps

IMM8 控制字节描述

文本处理指令

IMM8 控制字节描述

四个指令

SSE 指令集

SSE：在奔腾 III 中添加

SSE2：在奔腾 4 中添加

SSE3：在后来的奔腾 4 中添加

SSSE3：在至强 5100 和早期酷睿 2 中添加

SSE4

SSE4.1：在后来的酷睿 2 中添加

SSE4a：在羿龙中添加

SSE4.2：在 Nehalem 中添加

使用进行乱序的示例`shufps`