使用 x86 汇编实现 C# 的快速内存拷贝 - linux编程基础

ps xmm0,[esi] copy 128 bytes from aligned source address
? 0x0F, 0x28, 0x4E, 0x10, //movaps xmm1,[esi][010] copy more
? 0x0F, 0x28, 0x56, 0x20, //movaps xmm2,[esi][020]
? 0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0] pre-fetch more data
? 0x0F, 0x28, 0x5E, 0x30, //movaps xmm3,[esi][030]
? 0x0F, 0x28, 0x66, 0x40, //movaps xmm4,[esi][040]
? 0x0F, 0x28, 0x6E, 0x50, //movaps xmm5,[esi][050]
? 0x0F, 0x28, 0x76, 0x60, //movaps xmm6,[esi][060]
? 0x0F, 0x28, 0x7E, 0x70, //movaps xmm7,[esi][070] we've copied 128 bytes of source data
? 0x85, 0xDB, //test ebx,ebx check if destination address is 16 byte aligned
? 0x74, 0x21, //jz 000000112 ↓ go to past if aligned
? 0x0F, 0x11, 0x07, //movups [edi],xmm0 past 16 bytes to non-aligned destination address
? 0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1 past more
? 0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2
? 0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3
? 0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4
? 0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5
? 0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6
? 0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7 we've pasted 128 bytes of data
? 0xEB, 0x1F, //jmps 000000131 ↓ continue copy-past
? 0x0F, 0x2B, 0x07, //movntps [edi],xmm0 past 16 bytes to aligned destination address
? 0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1 past more
? 0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2
? 0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3
? 0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4
? 0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5
? 0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6
? 0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7 we've pasted 128 bytes of data
? 0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080 increment source address by 128
? 0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080 increment destination address by 128
? 0x83, 0xE9, 0x01, //sub ecx,1 decrement counter
? 0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 0000000C0 ↑ continue copy-past if non-zero
? 0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A] get number of bytes to copy
? 0x83, 0xE1, 0x7F, //and ecx,07F get rest number of bytes
? 0x85, 0xC9, //test ecx,ecx check if there are bytes
? 0x74, 0x02, //jz 000000155 ↓ exit if there are no more bytes
? 0xF3, 0xA4, //rep movsb copy rest of bytes
? 0x0F, 0xAE, 0xF8, //sfence performs a serializing operation on all store-to-memory instructions
? 0x61, //popad restore flag register
? 0xC3, //retn return from our method to C#
?
? 0x00, 0x00, 0x00, 0x00, //source buffer address
? 0x00, 0x00, 0x00, 0x00,
?
? 0x00, 0x00, 0x00, 0x00, //destination buffer address
? 0x00, 0x00, 0x00, 0x00,
?
? 0x00, 0x00, 0x00, 0x00, //number of bytes to copy-past
? 0x00, 0x00, 0x00, 0x00
};

我们将会通过前面创建的委托（delegate）来调用这段汇编方法。

该方法目前工作在32位模式下，将来我会实现64位模式。

感兴趣的读者可以自行把 64 位模式的实现添加到源代码中（文章中几乎包含了所有的代码）。

在实现及测试该方法期间，我发现 prefetchnta 指令的描述不是很清楚，甚至连 Intel 的手册也是一样。所以我尝试自己动手并借助 Google 来弄明白 :) 。另外请注意 movntps 和 movaps 这两条指令的说明：它们只能在内存地址按 16 字节对齐时工作，否则会触发异常。

使用 x86 汇编实现 C# 的快速内存拷贝(四)