ps xmm0,[esi] copy 128 bytes from aligned source address ? 0x0F, 0x28, 0x4E, 0x10, //movaps xmm1,[esi][010] copy more ? 0x0F, 0x28, 0x56, 0x20, //movaps xmm2,[esi][020] ? 0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0] pre-fetch more data ? 0x0F, 0x28, 0x5E, 0x30, //movaps xmm3,[esi][030] ? 0x0F, 0x28, 0x66, 0x40, //movaps xmm4,[esi][040] ? 0x0F, 0x28, 0x6E, 0x50, //movaps xmm5,[esi][050] ? 0x0F, 0x28, 0x76, 0x60, //movaps xmm6,[esi][060] ? 0x0F, 0x28, 0x7E, 0x70, //movaps xmm7,[esi][070] we've copied 128 bytes of source data ? 0x85, 0xDB, //test ebx,ebx check if destination address is 16 byte aligned ? 0x74, 0x21, //jz 000000112 ↓ go to past if aligned ? 0x0F, 0x11, 0x07, //movups [edi],xmm0 past 16 bytes to non-aligned destination address ? 0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1 past more ? 0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2 ? 0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3 ? 0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4 ? 0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5 ? 0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6 ? 0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7 we've pasted 128 bytes of data ? 0xEB, 0x1F, //jmps 000000131 ↓ continue copy-past ? 0x0F, 0x2B, 0x07, //movntps [edi],xmm0 past 16 bytes to aligned destination address ? 0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1 past more ? 0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2 ? 0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3 ? 0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4 ? 0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5 ? 0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6 ? 0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7 we've pasted 128 bytes of data ? 0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080 increment source address by 128 ? 0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080 increment destination address by 128 ? 0x83, 0xE9, 0x01, //sub ecx,1 decrement counter ? 
0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 0000000C0 ↑ continue copy-past if non-zero ? 0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A] get number of bytes to copy ? 0x83, 0xE1, 0x7F, //and ecx,07F get rest number of bytes ? 0x85, 0xC9, //test ecx,ecx check if there are bytes ? 0x74, 0x02, //jz 000000155 ↓ exit if there are no more bytes ? 0xF3, 0xA4, //rep movsb copy rest of bytes ? 0x0F, 0xAE, 0xF8, //sfence performs a serializing operation on all store-to-memory instructions ? 0x61, //popad restore flag register ? 0xC3, //retn return from our method to C# ? ? 0x00, 0x00, 0x00, 0x00, //source buffer address ? 0x00, 0x00, 0x00, 0x00, ? ? 0x00, 0x00, 0x00, 0x00, //destination buffer address ? 0x00, 0x00, 0x00, 0x00, ? ? 0x00, 0x00, 0x00, 0x00, //number of bytes to copy-past ? 0x00, 0x00, 0x00, 0x00 };
我们将通过前面创建的委托(delegate)来调用这个汇编方法。
该方法目前工作在32位模式下,将来我会实现64位模式。
感兴趣的读者可以自行把64位的实现添加到源代码中(文章中几乎包含了所有的代码)。
在实现及测试该方法期间,我发现 prefetchnta 指令的说明写得不是很清楚,甚至连 Intel 的手册也是如此,所以我只能自己尝试并借助 Google 来弄明白。请注意 movntps 和 movaps 指令的说明:它们只能在内存地址按16字节对齐时使用。
|