![]() | ![]() ![]() ![]() |
As we know, MOVS cannot step SI and DI in opposite directions (e.g. SI=-X, DI=+X, where X is 2, 4 or 8 depending on the size of the element you want to move), because a single direction flag controls both registers. However, here is a solution to this problem (here X=2):
In 16-bit mode:
;
; movs with si=-x, di=+x, x=2,4,8   (16-bit code, x=2)
;
; Copies a block of 16-bit words with the word order reversed: the
; source is read upwards from si, the destination is filled downwards
; from di+cx.
;
; input:
; si = source
; di = destination
; cx = number of bytes (must be even; an odd trailing byte is not copied)
;
; output:
; none (data copied)
;
; destroys:
; ax, bx, cx, dx, bp, si, di
; flags
;
        add     di,cx           ; di = one past the end of the destination
        sub     cx,8            ; cx = bytes left after the next 4-word block
        jb      done4           ; fewer than 4 words in total: skip unrolled loop
next4:
        mov     ax,[si]         ; load 4 consecutive source words ...
        mov     bx,[si+2]
        mov     dx,[si+4]
        mov     bp,[si+6]
        mov     [di-8],bp       ; ... and store them in the opposite order
        mov     [di-6],dx
        mov     [di-4],bx
        mov     [di-2],ax
        add     si,8
        sub     di,8
        sub     cx,8
        jae     next4           ; no borrow: another whole block is available
done4:
; Handle the last 0 to 3 16-bit words.
; Here cx = (bytes still to copy) - 8, so adding 6 leaves
; (bytes still to copy) - 2 and sets carry iff at least one word remains.
        add     cx,6
        jnc     end4            ; nothing (or only an odd byte) left
last4:
        mov     ax,[si]
        add     si,2
        sub     di,2
        mov     [di],ax
        sub     cx,2
        jae     last4           ; no borrow: one more word to copy
end4:
The unrolled main loop of this version pairs perfectly on a Pentium, for a throughput of 6 cycles for each block of 4 words, i.e. 1.5 cycles for each reversal.
In 32-bit mode:
;
; movs with si=-x, di=+x, x=2,4,8   (32-bit code, still reversing 16-bit words)
;
; Copies a block of 16-bit words with the word order reversed, moving
; two words per 32-bit access: each dword is loaded, its two halves are
; swapped with rol 16, and it is stored at the mirrored position in the
; destination.
;
; input:
; esi = source
; edi = destination
; ecx = number of bytes (must be even, preferably divisible by 4!;
;       an odd trailing byte is not copied)
;
; output:
; none (data copied)
;
; destroys:
; eax, ebx, ecx, edx, ebp, esi, edi
; eflags
;
        lea     edi,[edi+ecx-4] ; Point edi at last 2-word block
        sub     ecx,16          ; ecx = bytes left after the next 8-word block
        jb      done8           ; fewer than 8 words in total: skip unrolled loop
next8:
        mov     eax,[esi]       ; load 4 dwords (8 words); rol 16 swaps the
        mov     ebx,[esi+4]     ; two words inside each dword
        rol     ebx,16
        mov     edx,[esi+8]
        rol     edx,16
        mov     ebp,[esi+12]
        rol     ebp,16
        add     esi,16
        rol     eax,16
        mov     [edi-12],ebp    ; store the dwords in the opposite order
        mov     [edi-8],edx
        mov     [edi-4],ebx
        mov     [edi],eax
        sub     edi,16
        sub     ecx,16
        jae     next8           ; no borrow: another whole block is available
done8:
; Handle the last 0 to 7 16-bit words.
; Here ecx = (bytes still to copy) - 16, so adding 12 leaves
; (bytes still to copy) - 4 and sets carry iff at least one dword remains.
        add     ecx,12
        jnc     word8
last8:
        mov     eax,[esi]
        add     esi,4
        rol     eax,16          ; swap the two words of this dword
        mov     [edi],eax
        sub     edi,4
        sub     ecx,4
        jae     last8           ; no borrow: one more dword to copy
word8:
; Now ecx = (bytes still to copy) - 4 with fewer than 4 bytes left;
; adding 2 sets carry iff one final 16-bit word remains.
        add     ecx,2
        jnc     end8
        mov     ax,[esi]
        mov     [edi+2],ax      ; edi is 4 below the end of the unfilled part
end8:
This version would also pair perfectly, using just 8 cycles for the 16-instruction unrolled loop, which would copy/reverse 8 16-bit words at a rate of 1 reversal/cycle.