WWW.DUMAIS.IO

FFT on AMD64Last edited on Jun 5, 2012

Fast Fourier Transform with x86-64 assembly language

This is an old application I did a while ago. I did this in 2005 when I got my first 64bit CPU (AMD). The first I did after installing my new CPU was to open VI and start coding an FFT using 64 bit registers. This is old news, but 64 bit at that time was awesome. Not only can you store 64 bits in a register, but you get 32 general purpose registers!

The only really annoying thing with this architecture is that they don't provide a bit reveral instruction. I don't understand why a simple RISC processor like the AVR32 (lookup "brev") has one but not a high end CISC like Intel or AMD. I don't actually show the bit reveral part of the FFT in here though.

By the way, I remember doing some tests with this algorithm and, although I don't remember the results exactly (7 years ago), I remember that it was running at least 5 times faster than most other FFTs in other libraries.

//; x8664realfft(float* source,float** spectrum,long size)
x8664realifft:
        mov     	$1,%eax
        cvtsi2ss     %eax,%xmm10
        pshufd  	$0b00000000,%xmm10,%xmm10
        mov     	$-1,%eax
        cvtsi2ss     %eax,%xmm10
        pshufd  	$0b11000100,%xmm10,%xmm10
        jmp     	fftentry
x8664realfft:
	
	mov		$1,%eax
	cvtsi2ss	%eax,%xmm10
	pshufd	$0b00000000,%xmm10,%xmm10
fftentry:
        
        
        pushq   	%rbp
	movq    	%rsp,%rbp
	pushq	%rbp
	subq		$0xFF,%rsp
	movq	%rsp,%rbp
	
	//; make a 16bytes aligned buffer
	addq		$16,%rbp
	andq		$0xFFFFFFFFFFFFFFF0,%rbp

	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%r11
	pushq	%r10
	pushq	%r9
	pushq	%r8


        //; rcx = size
        movq    	%rdx,%rcx  				
        pushq	%rcx
	//; rdx = source 
	mov		%rdi,%rdx				
	pushq		%rdx


	//; rdi = spectrum[0]
	movq	(%rsi), %rdi			
	addq		$8, %rsi
	
	//; rsi = spectrum[1]
	movq	(%rsi), %rsi			

	//; r8 = log2(N), r14= N
	pushq	%rcx
	fld1
	fild		(%rsp)
	xorq		%r8,%r8
	pushq	%r8
	fyl2x
	fistp		(%rsp)
	popq		%r8
	popq		%r14	
	
	//; bit reversal has already been done prior to calling this function
	
	
	//; r9 = nLargeSpectrum
	//; r10 = nPointsLargeSpectrum
	movq	%r14,%r9
	movq	$1,%r10
	movq	$1,%r11
	mov	%rdi,%r14
	mov	%rsi,%r15
	
	
	//;load 2PI in st(0)
	fldpi
	fldpi
	faddp	%st(0),%st(1)
	movq	%r8,%rcx


l1:	pushq	%rcx
	shrq	$1,%r9
	shlq	$1,%r10
	
	//;st(0) = theta, st(1) = 2pi
	fld	%st(0)
	pushq	%r10
	fidiv	(%rsp)
	popq	%r10

	//;xmm0 = 2*costheta[0],2*costheta[0],2*costheta[0],2*costheta[0]
	//;  st(0) = theta, st(1) = 2pi
	pushq	%rax
	fld	%st(0)
	fcos
	fstp	(%rsp)
	movss	(%rsp),%xmm0
	pshufd	$0b00000000,%xmm0,%xmm0
	popq	%rax
	addps	%xmm0,%xmm0
	
	movq	%r9,%rcx
l2:	pushq	%rcx
	
	//; r12 = point1 (index *4bytes)    r13 = point2 (index *4bytes)
	movq	%r10,%r12
	movq	%r9,%rax
	subq	%rcx,%rax
	pushq	%rdx
	mulq	%r12
	popq	%rdx
	movq	%rax,%r12
	movq	%r11,%r13
	addq	%r12,%r13
	shlq	$2,%r13
	shlq	$2,%r12

	//; xmm2 = costheta[2],sintheta[2],costheta[1],sintheta[1]  
	movq	%r12,16(%rbp)
	decq		16(%rbp)
	fld		%st(0)
	fimul		16(%rbp)
	fsincos
	fstp		(%rbp)
	fstp		4(%rbp)
	decq		16(%rbp)
	fld		%st(0)
	fimul		16(%rbp)
	fsincos
	fstp		8(%rbp)
	fstp		12(%rbp)
	movaps	(%rbp),%xmm2
	pshufd	$0b10110001 ,%xmm2,%xmm2
	
	//;xmm1 = costheta[1],sintheta[1],0,0
	movhlps	%xmm2,%xmm1
	
	
	movq	%r11,%rcx
l3:
	
	//; recurrence formula
	//; xmm3 = w.re,w.im,w.re,w.im
	movaps	%xmm2,%xmm3
	mulps	%xmm0,%xmm3
	subps	%xmm1,%xmm3
	movlhps	%xmm3,%xmm3
	movaps	%xmm2,%xmm1
	movaps	%xmm3,%xmm2
	mulps	%xmm10,%xmm3
	
	//; xmm5 := c.im,c.re,c.re,c.im
	movq	%r14,%rdi
	movq	%r15,%rsi
	addq		%r13,%rdi
	addq		%r13,%rsi
	movss	(%rdi),%xmm5
	pshufd	$0b00000011,%xmm5,%xmm5
	addss	(%rsi),%xmm5
	pshufd	$0b00101000,%xmm5,%xmm5
	
	//; xmm3 := inner product: re,re,im,im
	mulps	%xmm3,%xmm5
	pshufd	$0b11011101 ,%xmm5,%xmm3
	pshufd	$0b10001000 ,%xmm5,%xmm5
	addsubps	%xmm5,%xmm3
	pshufd	$0b10101111,%xmm3,%xmm3
	
	//;xmm6 := sortedArray[point1].re,sortedArray[point1].re,sortedArray[point1].im,sortedArray[point1].im
	movq	%r14,%rdi
	movq	%r15,%rsi
	addq	%r12,%rdi
	addq	%r12,%rsi
	movss	(%rdi),%xmm6
	pshufd	$0b00001111,%xmm6,%xmm6
	addss	(%rsi),%xmm6
	pshufd	$0b11100000,%xmm6,%xmm6
	
	addsubps	%xmm3,%xmm6
	pshufd	$0b00100111,%xmm6,%xmm6
	movss	%xmm6,(%rdi)
	pshufd	$0b11100001,%xmm6,%xmm6
	movss	%xmm6,(%rsi)
	
	movq	%r14,%rdi
	movq	%r15,%rsi
	addq	%r13,%rdi
	addq	%r13,%rsi
	pshufd	$0b01001110,%xmm6,%xmm6
	movss	%xmm6,(%rdi)
	pshufd	$0b11100001,%xmm6,%xmm6
	movss	%xmm6,(%rsi)
				
	//; increase point1 and point2 by 4 bytes (each index represent a float)
	addq		$4,%r12
	addq		$4,%r13
	
	decq		%rcx
	jnz		l3
	
	popq		%rcx
	decq		%rcx
	jnz		l2

	//; remove theta from fpu stack
	fstp		%st(0)
	
	shlq		$1,%r11
	popq		%rcx
	decq		%rcx
	jnz		l1

	popq	%rdx
	//; rcx is already pushed in stack
	cvtsi2ss      (%rsp),%xmm1
	pshufd  	$0b00000000,%xmm1,%xmm1
	popq		%rcx
	shrq          $2,%rcx
	movq	%r14,%rdi
	movq	%r15,%rsi

	//; is this a ifft or a fft?
	cvtss2si	%xmm10,%eax
	cmp	$-1,%eax
	jne	nrm

cp:	movaps	(%rdi),%xmm2
	movntdq	%xmm2,(%rdx)
	addq	$16,%rdi
	addq	$16,%rdx
	loop	cp
	jmp	cleanexit



nrm:
	movaps	        (%rdi),%xmm2
	movaps	        (%rsi),%xmm3
	divps		%xmm1,%xmm2
	divps		%xmm1,%xmm3
	movntdq	        %xmm2,(%rdi)
	movntdq	        %xmm3,(%rsi)
	addq		$16,%rdi
	addq		$16,%rsi
	loop		nrm

cleanexit:
	fstp		%st(0)
	popq		%r8
	popq		%r9
	popq		%r10
	popq		%r11
	popq		%r12
	popq		%r13
	popq		%r14
	popq		%r15
	addq		$0xFF,%rsp	
	popq		%rbp
	leave
	ret