# FFT on AMD64Last edited on Jun 5, 2012

## Fast Fourier Transform with x86-64 assembly language

This is an old application I did a while ago. I did this in 2005 when I got my first 64bit CPU (AMD).
The first I did after installing my new CPU was to open VI and start coding an FFT using 64 bit registers. This is old news, but 64 bit at that time was awesome. Not only can you store 64 bits in a register, but you get 32 general purpose registers!

The only really annoying thing with this architecture is that they don't provide a bit reveral instruction. I don't understand why a simple RISC processor like the AVR32 (lookup "brev") has one but not a high end CISC like Intel or AMD. I don't actually show the bit reveral part of the FFT in here though.

By the way, I remember doing some tests with this algorithm and, although I don't remember the results exactly (7 years ago), I remember that it was running at least 5 times faster than most other FFTs in other libraries.

//; x8664realfft(float* source,float** spectrum,long size) x8664realifft: mov $1,%eax cvtsi2ss %eax,%xmm10 pshufd $0b00000000,%xmm10,%xmm10 mov $-1,%eax cvtsi2ss %eax,%xmm10 pshufd $0b11000100,%xmm10,%xmm10 jmp fftentry x8664realfft: mov $1,%eax cvtsi2ss %eax,%xmm10 pshufd $0b00000000,%xmm10,%xmm10 fftentry: pushq %rbp movq %rsp,%rbp pushq %rbp subq $0xFF,%rsp movq %rsp,%rbp //; make a 16bytes aligned buffer addq $16,%rbp andq $0xFFFFFFFFFFFFFFF0,%rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %r11 pushq %r10 pushq %r9 pushq %r8 //; rcx = size movq %rdx,%rcx pushq %rcx //; rdx = source mov %rdi,%rdx pushq %rdx //; rdi = spectrum[0] movq (%rsi), %rdi addq $8, %rsi //; rsi = spectrum[1] movq (%rsi), %rsi //; r8 = log2(N), r14= N pushq %rcx fld1 fild (%rsp) xorq %r8,%r8 pushq %r8 fyl2x fistp (%rsp) popq %r8 popq %r14 //; bit reversal has already been done prior to calling this function //; r9 = nLargeSpectrum //; r10 = nPointsLargeSpectrum movq %r14,%r9 movq $1,%r10 movq $1,%r11 mov %rdi,%r14 mov %rsi,%r15 //;load 2PI in st(0) fldpi fldpi faddp %st(0),%st(1) movq %r8,%rcx l1: pushq %rcx shrq $1,%r9 shlq $1,%r10 //;st(0) = theta, st(1) = 2pi fld %st(0) pushq %r10 fidiv (%rsp) popq %r10 //;xmm0 = 2*costheta[0],2*costheta[0],2*costheta[0],2*costheta[0] //; st(0) = theta, st(1) = 2pi pushq %rax fld %st(0) fcos fstp (%rsp) movss (%rsp),%xmm0 pshufd $0b00000000,%xmm0,%xmm0 popq %rax addps %xmm0,%xmm0 movq %r9,%rcx l2: pushq %rcx //; r12 = point1 (index *4bytes) r13 = point2 (index *4bytes) movq %r10,%r12 movq %r9,%rax subq %rcx,%rax pushq %rdx mulq %r12 popq %rdx movq %rax,%r12 movq %r11,%r13 addq %r12,%r13 shlq $2,%r13 shlq $2,%r12 //; xmm2 = costheta[2],sintheta[2],costheta[1],sintheta[1] movq %r12,16(%rbp) decq 16(%rbp) fld %st(0) fimul 16(%rbp) fsincos fstp (%rbp) fstp 4(%rbp) decq 16(%rbp) fld %st(0) fimul 16(%rbp) fsincos fstp 8(%rbp) fstp 12(%rbp) movaps (%rbp),%xmm2 pshufd $0b10110001 ,%xmm2,%xmm2 //;xmm1 = costheta[1],sintheta[1],0,0 movhlps %xmm2,%xmm1 movq %r11,%rcx l3: //; recurrence formula //; xmm3 = w.re,w.im,w.re,w.im movaps %xmm2,%xmm3 mulps %xmm0,%xmm3 subps %xmm1,%xmm3 movlhps %xmm3,%xmm3 movaps %xmm2,%xmm1 movaps %xmm3,%xmm2 mulps %xmm10,%xmm3 //; xmm5 := c.im,c.re,c.re,c.im movq %r14,%rdi movq %r15,%rsi addq %r13,%rdi addq %r13,%rsi movss (%rdi),%xmm5 pshufd $0b00000011,%xmm5,%xmm5 addss (%rsi),%xmm5 pshufd $0b00101000,%xmm5,%xmm5 //; xmm3 := inner product: re,re,im,im mulps %xmm3,%xmm5 pshufd $0b11011101 ,%xmm5,%xmm3 pshufd $0b10001000 ,%xmm5,%xmm5 addsubps %xmm5,%xmm3 pshufd $0b10101111,%xmm3,%xmm3 //;xmm6 := sortedArray[point1].re,sortedArray[point1].re,sortedArray[point1].im,sortedArray[point1].im movq %r14,%rdi movq %r15,%rsi addq %r12,%rdi addq %r12,%rsi movss (%rdi),%xmm6 pshufd $0b00001111,%xmm6,%xmm6 addss (%rsi),%xmm6 pshufd $0b11100000,%xmm6,%xmm6 addsubps %xmm3,%xmm6 pshufd $0b00100111,%xmm6,%xmm6 movss %xmm6,(%rdi) pshufd $0b11100001,%xmm6,%xmm6 movss %xmm6,(%rsi) movq %r14,%rdi movq %r15,%rsi addq %r13,%rdi addq %r13,%rsi pshufd $0b01001110,%xmm6,%xmm6 movss %xmm6,(%rdi) pshufd $0b11100001,%xmm6,%xmm6 movss %xmm6,(%rsi) //; increase point1 and point2 by 4 bytes (each index represent a float) addq $4,%r12 addq $4,%r13 decq %rcx jnz l3 popq %rcx decq %rcx jnz l2 //; remove theta from fpu stack fstp %st(0) shlq $1,%r11 popq %rcx decq %rcx jnz l1 popq %rdx //; rcx is already pushed in stack cvtsi2ss (%rsp),%xmm1 pshufd $0b00000000,%xmm1,%xmm1 popq %rcx shrq $2,%rcx movq %r14,%rdi movq %r15,%rsi //; is this a ifft or a fft? cvtss2si %xmm10,%eax cmp $-1,%eax jne nrm cp: movaps (%rdi),%xmm2 movntdq %xmm2,(%rdx) addq $16,%rdi addq $16,%rdx loop cp jmp cleanexit nrm: movaps (%rdi),%xmm2 movaps (%rsi),%xmm3 divps %xmm1,%xmm2 divps %xmm1,%xmm3 movntdq %xmm2,(%rdi) movntdq %xmm3,(%rsi) addq $16,%rdi addq $16,%rsi loop nrm cleanexit: fstp %st(0) popq %r8 popq %r9 popq %r10 popq %r11 popq %r12 popq %r13 popq %r14 popq %r15 addq $0xFF,%rsp popq %rbp leave ret