FFT ON AMD64

2012-06-05

Fast Fourier Transform with x86-64 assembly language

This is an old application I did a while ago. I did this in 2005 when I got my first 64bit CPU (AMD). The first thing I did after installing my new CPU was to open VI and start coding an FFT using 64 bit registers. This is old news, but 64 bit at that time was awesome. Not only can you store 64 bits in a register, but you get 16 general purpose registers (twice as many as on 32 bit x86)!

The only really annoying thing with this architecture is that they don't provide a bit reversal instruction. I don't understand why a simple RISC processor like the AVR32 (look up "brev") has one but not a high end CISC processor from Intel or AMD. I don't actually show the bit reversal part of the FFT in here though.

By the way, I remember doing some tests with this algorithm and, although I don't remember the results exactly (7 years ago), I remember that it was running at least 5 times faster than most other FFTs in other libraries.


//; x8664realfft (float* source,float** spectrum,long size)
//; x8664realifft(float* source,float** spectrum,long size)
//;
//; In-place radix-2 butterfly stages over a complex signal held in two float arrays:
//; spectrum[0] (real parts) and spectrum[1] (imaginary parts). The group size doubles
//; on every stage (2, 4, ... size). GAS / AT&T syntax.
//;
//; In:    rdi = source, rsi = spectrum, rdx = size (System V AMD64 argument order)
//; Pre:   - spectrum[0]/spectrum[1] already contain the input in bit-reversed order;
//;          this routine never reads from source.
//;        - size is a power of two and at least 4. Any other size returns immediately
//;          without touching the buffers (the loops below would otherwise wrap their
//;          counters and run far past the arrays).
//;        - spectrum[0], spectrum[1] and (inverse entry only) source are 16-byte aligned:
//;          they are accessed with movaps / movntdq.
//; Out:   x8664realfft : spectrum[0] and spectrum[1] are divided by size in place;
//;                       source is not accessed.
//;        x8664realifft: spectrum[0]/spectrum[1] hold the un-normalized result and
//;                       spectrum[0] is copied to source.
//;        No return value is set up in rax.
//; Clobb: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm5, xmm6, xmm10, flags.
//;        r8-r15 and rbp are saved and restored. The x87 stack is used (up to four
//;        entries deep) and left balanced on exit.
x8664realifft:
	//; xmm10 is a sign mask multiplied into every twiddle factor.
	//; Inverse entry: lanes (low to high) = -1, 1, -1, 1, which negates the sine lanes.
	mov	$1,%eax
	cvtsi2ss	%eax,%xmm10
	pshufd	$0b00000000,%xmm10,%xmm10
	mov	$-1,%eax
	cvtsi2ss	%eax,%xmm10
	pshufd	$0b11000100,%xmm10,%xmm10
	jmp	fftentry
x8664realfft:
	//; Forward entry: xmm10 = 1, 1, 1, 1 (twiddles used unchanged).
	mov	$1,%eax
	cvtsi2ss	%eax,%xmm10
	pshufd	$0b00000000,%xmm10,%xmm10
fftentry:
	//; Reject sizes the loops cannot handle, before anything is pushed:
	//; size < 4 (the 4-floats-per-iteration output loops would start with a zero count)
	//; or size not a power of two (size & (size-1) != 0). rax is free scratch here.
	cmpq	$4,%rdx
	jl	fft_bad_size
	leaq	-1(%rdx),%rax
	testq	%rdx,%rax
	jnz	fft_bad_size

	pushq	%rbp
	movq	%rsp,%rbp
	pushq	%rbp
	//; Reserve 256 bytes of scratch. A multiple of 16 is used so that rsp keeps the
	//; alignment it had after the two pushes above (the former 0xFF left it odd).
	subq	$0x100,%rsp
	movq	%rsp,%rbp

	//; make a 16bytes aligned buffer
	//; rbp now points inside the reserved area: 0..15(%rbp) receives four floats for
	//; movaps, 16(%rbp) is a 64-bit integer scratch slot.
	addq	$16,%rbp
	andq	$0xFFFFFFFFFFFFFFF0,%rbp

	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%r11
	pushq	%r10
	pushq	%r9
	pushq	%r8

	//; rcx = size (kept on the stack for the output phase)
	movq	%rdx,%rcx
	pushq	%rcx
	//; rdx = source (kept on the stack for the output phase)
	mov	%rdi,%rdx
	pushq	%rdx

	//; rdi = spectrum[0]
	movq	(%rsi), %rdi
	addq	$8, %rsi

	//; rsi = spectrum[1]
	movq	(%rsi), %rsi

	//; r8 = log2(N), r14= N
	//; fyl2x computes st(1)*log2(st(0)) = 1*log2(N); the result is stored as an integer
	//; into a zeroed stack slot and popped into r8.
	//; NOTE(review): fild/fistp/fidiv/fimul/fstp below carry no size suffix; the memory
	//; operand width is whatever the assembler defaults to - confirm.
	pushq	%rcx
	fld1
	fild	(%rsp)
	xorq	%r8,%r8
	pushq	%r8
	fyl2x
	fistp	(%rsp)
	popq	%r8
	popq	%r14

	//; bit reversal has already been done prior to calling this function

	//; r9 = nLargeSpectrum        (number of groups; starts at N, halved per stage)
	//; r10 = nPointsLargeSpectrum (points per group; starts at 1, doubled per stage)
	//; r11 = distance between the two points of a butterfly, also the number of
	//;       butterflies per group (starts at 1, doubled at the end of each stage)
	//; r14/r15 = base of spectrum[0] / spectrum[1] for the rest of the routine
	movq	%r14,%r9
	movq	$1,%r10
	movq	$1,%r11
	mov	%rdi,%r14
	mov	%rsi,%r15

	//;load 2PI in st(0)
	fldpi
	fldpi
	faddp	%st(0),%st(1)
	//; outer loop runs log2(N) times, one iteration per stage
	movq	%r8,%rcx

l1:	pushq	%rcx
	shrq	$1,%r9
	shlq	$1,%r10

	//;st(0) = theta, st(1) = 2pi
	//; theta = 2pi / r10 (r10 is passed through a stack slot for fidiv)
	fld	%st(0)
	pushq	%r10
	fidiv	(%rsp)
	popq	%r10

	//;xmm0 = 2*costheta[0],2*costheta[0],2*costheta[0],2*costheta[0]
	//;  st(0) = theta, st(1) = 2pi
	//; the push/pop of rax only provides a scratch stack slot for the float
	pushq	%rax
	fld	%st(0)
	fcos
	fstp	(%rsp)
	movss	(%rsp),%xmm0
	pshufd	$0b00000000,%xmm0,%xmm0
	popq	%rax
	addps	%xmm0,%xmm0

	//; middle loop: one iteration per group, rcx counts down from r9
	movq	%r9,%rcx
l2:	pushq	%rcx

	//; r12 = point1 (index *4bytes)    r13 = point2 (index *4bytes)
	//; point1 = (r9 - rcx) * r10 = first point of this group, point2 = point1 + r11.
	//; rdx is saved around mulq because mulq writes rdx:rax.
	movq	%r10,%r12
	movq	%r9,%rax
	subq	%rcx,%rax
	pushq	%rdx
	mulq	%r12
	popq	%rdx
	movq	%rax,%r12
	movq	%r11,%r13
	addq	%r12,%r13
	shlq	$2,%r13
	shlq	$2,%r12

	//; xmm2 = costheta[2],sintheta[2],costheta[1],sintheta[1]   (high lane to low lane)
	//; Seeds the recurrence w[k] = 2*cos(theta)*w[k-1] - w[k-2] used in l3.
	//; The integer at 16(%rbp) is decremented before each fsincos, so the two angles
	//; are theta*(r12-1) and theta*(r12-2).
	//; NOTE(review): r12 is already a byte offset (index*4) here. The group start is a
	//; multiple of r10 and theta = 2pi/r10, so these angles look equivalent to -theta
	//; and -2*theta modulo 2pi, at the price of large fsincos arguments - confirm.
	movq	%r12,16(%rbp)
	decq	16(%rbp)
	fld	%st(0)
	fimul	16(%rbp)
	fsincos
	fstp	(%rbp)
	fstp	4(%rbp)
	decq	16(%rbp)
	fld	%st(0)
	fimul	16(%rbp)
	fsincos
	fstp	8(%rbp)
	fstp	12(%rbp)
	movaps	(%rbp),%xmm2
	//; swap within each pair: lanes (low to high) become sin, cos, sin, cos
	pshufd	$0b10110001 ,%xmm2,%xmm2

	//; xmm1 low half = high half of xmm2 (the theta*(r12-2) pair). The upper half of
	//; xmm1 keeps whatever it held; the lanes it influences are overwritten by the
	//; movlhps at the top of l3.
	movhlps	%xmm2,%xmm1

	//; inner loop: one butterfly per iteration, r11 butterflies per group
	movq	%r11,%rcx
l3:

	//; recurrence formula
	//; xmm3 = w.re,w.im,w.re,w.im   (high lane to low lane)
	//; new w = 2*cos(theta)*xmm2 - xmm1, then xmm1 <- previous w, xmm2 <- new w.
	//; xmm3 is finally multiplied by the sign mask in xmm10.
	movaps	%xmm2,%xmm3
	mulps	%xmm0,%xmm3
	subps	%xmm1,%xmm3
	movlhps	%xmm3,%xmm3
	movaps	%xmm2,%xmm1
	movaps	%xmm3,%xmm2
	mulps	%xmm10,%xmm3

	//; xmm5 := c.im,c.re,c.re,c.im   (c = element at point2; high lane to low lane)
	movq	%r14,%rdi
	movq	%r15,%rsi
	addq	%r13,%rdi
	addq	%r13,%rsi
	movss	(%rdi),%xmm5
	pshufd	$0b00000011,%xmm5,%xmm5
	addss	(%rsi),%xmm5
	pshufd	$0b00101000,%xmm5,%xmm5

	//; xmm3 := inner product: re,re,im,im   (high lane to low lane)
	//; complex product c*w: re = c.re*w.re - c.im*w.im, im = c.im*w.re + c.re*w.im
	mulps	%xmm3,%xmm5
	pshufd	$0b11011101 ,%xmm5,%xmm3
	pshufd	$0b10001000 ,%xmm5,%xmm5
	addsubps	%xmm5,%xmm3
	pshufd	$0b10101111,%xmm3,%xmm3

	//;xmm6 := sortedArray[point1].re,sortedArray[point1].re,sortedArray[point1].im,sortedArray[point1].im
	movq	%r14,%rdi
	movq	%r15,%rsi
	addq	%r12,%rdi
	addq	%r12,%rsi
	movss	(%rdi),%xmm6
	pshufd	$0b00001111,%xmm6,%xmm6
	addss	(%rsi),%xmm6
	pshufd	$0b11100000,%xmm6,%xmm6

	//; butterfly: addsubps yields point1 + product and point1 - product in one register;
	//; the shuffles rotate each of the four results into lane 0 for the scalar stores.
	addsubps	%xmm3,%xmm6
	pshufd	$0b00100111,%xmm6,%xmm6
	//; point1.re = re + product.re
	movss	%xmm6,(%rdi)
	pshufd	$0b11100001,%xmm6,%xmm6
	//; point1.im = im + product.im
	movss	%xmm6,(%rsi)

	movq	%r14,%rdi
	movq	%r15,%rsi
	addq	%r13,%rdi
	addq	%r13,%rsi
	pshufd	$0b01001110,%xmm6,%xmm6
	//; point2.re = re - product.re
	movss	%xmm6,(%rdi)
	pshufd	$0b11100001,%xmm6,%xmm6
	//; point2.im = im - product.im
	movss	%xmm6,(%rsi)

	//; increase point1 and point2 by 4 bytes (each index represent a float)
	addq	$4,%r12
	addq	$4,%r13

	decq	%rcx
	jnz	l3

	popq	%rcx
	decq	%rcx
	jnz	l2

	//; remove theta from fpu stack
	fstp	%st(0)

	shlq	$1,%r11
	popq	%rcx
	decq	%rcx
	jnz	l1

	//; rdx = source (pushed at entry)
	popq	%rdx
	//; rcx is already pushed in stack
	//; xmm1 = size as float in all four lanes; rcx = size/4 = number of 16-byte chunks
	cvtsi2ss	(%rsp),%xmm1
	pshufd	$0b00000000,%xmm1,%xmm1
	popq	%rcx
	shrq	$2,%rcx
	movq	%r14,%rdi
	movq	%r15,%rsi

	//; is this a ifft or a fft?
	//; lane 0 of xmm10 is -1 only for the inverse entry point
	cvtss2si	%xmm10,%eax
	cmp	$-1,%eax
	jne	nrm

	//; inverse: copy spectrum[0] to source, four floats per iteration.
	//; rcx >= 1 here because size >= 4 was checked at fftentry.
cp:	movaps	(%rdi),%xmm2
	movntdq	%xmm2,(%rdx)
	addq	$16,%rdi
	addq	$16,%rdx
	decq	%rcx
	jnz	cp
	jmp	cleanexit

	//; forward: divide spectrum[0] and spectrum[1] by size in place, four floats per
	//; iteration.
nrm:
	movaps	(%rdi),%xmm2
	movaps	(%rsi),%xmm3
	divps	%xmm1,%xmm2
	divps	%xmm1,%xmm3
	movntdq	%xmm2,(%rdi)
	movntdq	%xmm3,(%rsi)
	addq	$16,%rdi
	addq	$16,%rsi
	decq	%rcx
	jnz	nrm

cleanexit:
	//; drop 2pi from the x87 stack
	fstp	%st(0)
	popq	%r8
	popq	%r9
	popq	%r10
	popq	%r11
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	//; release the scratch area (must match the subq in the prologue)
	addq	$0x100,%rsp
	popq	%rbp
	leave
fft_bad_size:
	ret

CLONING A HARD DRIVE

2012-05-17

Cloning a hard drive

In one of my computers, I have one hard drive that contains 2 partitions: 1 for the root filesystem and one for my /home partition. When I bought a new hard drive, I needed to clone the old one on the new one. This can be easily done with "dd" as long as your partitions are the same size. So I decided to keep the root filesystem with the same size, but wanted to grow the /home partition.

Create the partitions

First, you need to create the partitions on the new drive using fdisk. Remember to keep the same size for the partitions you want to clone. If you create them smaller, you will end up with a corrupted filesystem. If you create them larger, you will not be able to access the extra space so it will be wasted. After creating the partitions, you don't need to create a filesystem (mkfs) on the ones you are going to clone, since "dd" copies the old partition's filesystem along with its data. But of course, you will need to create a filesystem on the other partitions that won't be cloned.

Clone

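You need to clone the boot loader (lilo/grub) stored in your master boot record. The MBR is the first 512-byte sector of the disk, but only its first 446 bytes are boot code; the rest of the sector holds the partition table, which must not be copied or it would overwrite the partitions you just created with fdisk. So we copy only the first 446 bytes:


dd if=/dev/sda of=/dev/sdb bs=446 count=1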

Then, we can clone the partition:


dd if=/dev/sda1 of=/dev/sdb1 bs=4096 conv=noerror

At this point, my root partition was cloned successfully. For the other partition (/dev/sdb2), I had to create a new filesystem (mkfs) because my partition needed to be larger. After that, I copied the files manually using "cp".

CONFIGURING AND USING KVM-QEMU

2012-02-28

KVM Qemu

I was tired of Vmware Server's sloooooow web interface that only works half of the time. I just couldn't take it anymore. So I started looking for other virtualization solutions. I found KVM. KVM/QEmu is, by far, easier to use than VMWARE Server. The thing I like about qemu is that there are no virtual machine files. You only create a virtual disk file but the machine itself is built from the command line when invoking qemu. That means you have to "rebuild" the machine every time you reload it. It looks painful but you just have to save your command in a script and invoke it. In other words, a shell script is to qemu what a VMX file is to vmware. Don't ask me why, but this is a strong point for me.

Installing and preparing KVM Qemu

Compile kernel using KVM (see flags VIRTUALIZATION,KVM,KVM_AMD,KVM_INTEL)
Download and Install qemu-kvm
Install "tunctl"
Make a network bridge script. You will need to create a script that is run after every reboot (put it in rc.local):

#load tun driver and create a TAP interface
modprobe tun
tunctl -t tap0
# bring eth0 down, we will set it as promiscuous and it will be part of a bridge
ifconfig eth0 down
brctl addbr br0
ifconfig eth0 0.0.0.0 promisc up
ifconfig tap0 0.0.0.0 promisc up
# set the IP address of the bridge interface. This is the interface that we will use from now on. So use
# an IP address on your LAN. This is the address of the host computer, not the guest.
ifconfig br0 192.168.1.2 netmask 255.255.255.0 broadcast 192.168.1.255 up
# add tap0 and eth0 as members of the bridge and bring it up.
brctl stp br0 off
brctl setfd br0 1
brctl sethello br0 1
brctl addif br0 eth0
brctl addif br0 tap0
# setup default gateway.
route add default gw 192.168.1.1

Note that you will need to run that on every reboot. So you might want to save this in a boot script.

Create a VM

Create a 10g disk: qemu-img create -f qcow2 vdisk.img 10G.
install OS: qemu-system-x86_64 -hda vdisk.img -cdrom /path/to/boot-media.iso -boot d -m 512 -vnc :1. Let's analyze that command:
- "-hda vdisk.img": use vdisk.img as primary disk
- "-cdrom /path/to/boot-media.iso": use boot-media.iso as the image mounted in the cdrom drive
- "-boot d": Boot from D drive, the cdrom
- "-m 512": 512 mb of RAM
- "-vnc :1" : The display will be on VNC port index number 1. Depending on your settings, if your base port is 5900, then the TCP port used in that case will be (5900 + 1).

So you can now use a VNC client to connect to port 5901 on your host to have access to the display. The VM will boot from the OS install CD you have provided so you will be able to install the OS like you would on a real computer.

Use a VM

Run: qemu-system-x86_64 -usbdevice tablet -daemonize -enable-kvm --hda /virtual-machines/vdisk.img -boot c -m 512 -vnc :1 -monitor telnet:127.0.0.1:3010,server,nowait,ipv4 -net tap,ifname=tap0,script=no -net nic Let's analyze that command:
- -usbdevice tablet: I had problems with my mouse cursor when using VNC if I didn't use that option.
- -daemonize: Run as background process
- -enable-kvm: Enable the use of kernel-based virtualization.
- "-hda vdisk.img": use vdisk.img as primary disk
- "-boot c": Boot from C drive, the primary disk
- "-m 512": 512 mb of RAM
- "-vnc :1" : The display will be on VNC port index number 1.
- -monitor telnet:127.0.0.1:3010,server,nowait,ipv4: Listen on 127.0.0.1:3010 for the telnet configuration.
- -net tap,ifname=tap0,script=no: Use tap0, and don't run network setup script.
Install a vnc viewer on some other computer (TightVNC). Connect to host on port 5901
Configure network on guest (If windows, enable remote desktop and disable firewall or poke a hole in it)

You should now have access to your VM through remote desktop or SSH or whatever you configured in that last step.

Managing the VM

You can telnet in the VM console to manage it. use the port you have setup with option "-monitor telnet". To exit the monitor, use 'ctrl-]' and press 'q'. If you type 'q' without 'ctrl-]', you will kill the VM.

Change CD in cdrom

telnet in management console and: change ide1-cd0 /shared/newimg.iso

Changing specs

Of course, if you want to add more RAM or change other system specs, you can do it from the command line when invoking qemu.

USING COUCHDB

2012-02-27

Introduction

Before using this information, you need to know how the JSON format works. JSON is kind of like XML, it is a way of representing data. I won't go into more details in here.

Concepts

If you are switching from a SQL database like MySQL to couchdb, then chances are you will be wondering where are the tables and how do I query them? Well there is no table. To make things simple, try to think of it this way:

CouchDB is like a database that contains only one table and one column. Each row is filled with a JSON document. You could easily do that with MySQL, except that the server doesn't understand JSON, so it can't do any special processing based on what your JSON document contains.
Everything is done through a web interface using a REST API. This doesn't mean that you query the DB directly from your website (you still make the queries from the server side). And for that matter, it doesn't mean that CouchDB is only made for websites.
If you are searching for "stored procedures", you wanna use "views" with couchDB.

So consider this: If you are building a simple blog where each post contains a timestamp, a title and a content, then you will probably create a table like this in MySQL:

ID	TimeStamp	Title	Content
1	330133439	A Post	oh yeah
2	330133439	Another post	blah blah blah
...

What happens if you wanna add a "tag" column at one point? You'd have to modify your schema. So instead, for flexibility, you will decide to use one column only and store each post with a format you like, maybe you'll choose XML:

Data
<post> <id>1</id> <title>A post</title> <timestamp>330133439</timestamp> <content>oh yeah</content> </post>
<post> <id>2</id> <title>Another post</title> <timestamp>330133439</timestamp> <content>blah blah blah</content> </post>
...

This is exactly what couchDB is used for. Except that instead of a row, it calls it a document. Instead of using XML, it uses the JSON format. You might be wondering what's the point of using couchdb over mysql if both can do the same thing then. Couch DB adds more functionalities, like adding attachments to a document, create views with javascript and so much more. You will find a lot of blogs with people debating SQL vs NoSQL, so I won't cover this here. I just wanted to explain what CouchDB is.

Cheatsheet

Check if DB exists: curl -X GET http://127.0.0.1:5984/database1/
where 'database1' is the name of your database
Will return an error if DB does not exist
Create a database: curl -X PUT http://127.0.0.1:5984/database1/
where 'database1' is the name of your database
Create a document: curl -X PUT http://127.0.0.1:5984/database1/document1 -H "Content-Type: application/json" -d '{"field1":"value1","field2":"value2"}'
where 'database1' is the name of your database
where 'document1' is the ID of the document to create
Retrieve a document: curl -X GET http://127.0.0.1:5984/database1/document1
where 'database1' is the name of your database
where 'document1' is the ID of the document to retrieve
Create a view: curl -X PUT http://127.0.0.1:5984/database1/_design/designdocument1 -H "Content-Type: application/json" -d {JSON_REPRESENTATION_OF_VIEW}/
where 'designdocument1' is the name of your designdocument
Note that a design document can contain more than one view. A view contains a map function and a reduce function. The following is an example of what could be included as the "JSON_REPRESENTATION_OF_VIEW" { "language": "javascript", "views": { "view1": { "map": "function(doc){emit(doc._id,doc);}" }, "view2": { "map": "function(doc){emit(doc._id,doc);}", "reduce": "function (key, values){return null;}" } } }
Query a view: http://127.0.0.1:5984/database1/_design/designdocument1/_view/view2?reduce=true&group=true&skip=2&limit=5
where 'database1' is the name of your database
This will return the results of the view "view2" in "designdocument1". We have also provided parameters in the URL that say: we want the reduce function to be executed, we want results grouped, we want to skip the first 2 documents returned by the view, we want a maximum of 5 documents in total.

using the results in php

If we query curl -X GET http://127.0.0.1:5984/database1/document1 and we get the result


{
   "_id": "document1",
   "_rev": "1-a227e6b8d34d14fbc59c4dde72e53848",
   "field1": "value1",
   "field2": {"sub1":"val1","sub2":"val2"},
   "field3": ["val1","val2","val3"]
}

Then we can take that result and decode it using json_decode


$obj = json_decode($jsonString);

We get:

$obj->field1="value1"
$obj->field2->sub2="val2" ($obj->field2 is an object)
$obj->field3[1]="val2" ($obj->field3 is an array)

Text Search

Consider this SQL query: SELECT * FROM posts WHERE content LIKE '%test%'. With CouchDB, it gets a little more complicated. First, you need to create a view that emits a map of ALL the words in your documents.


function(doc) {
    var tokens;
    if (doc.content) {
        var st = doc.content.replace(/<(?:.|\n)*?>/gm, '');
        tokens = st.split(/[^A-Z0-9\-_]+/i);
        var uniqueTokens = {};
        for (var i=0;i<tokens.length;i++)
        {
            var key = (tokens[i]);
            if (key!="") uniqueTokens[key] = key;
        }
        for (var token in uniqueTokens){
            emit(token,doc.title);
        }
    }
}

So if you have the following documents in your database:


{"title":"doc1","content":"hello this is a test"}
{"title":"doc2","content":"another document"}

Your view would output the following:


"hello",doc1
"this",doc1
"is",doc1
"a",doc1
"test",doc1
"another",doc2
"document",doc2

So if you want to retrieve only the documents that contains the word "test", then you could invoke the following: http://127.0.0.1:5984/database1/_design/designdocument1/_view/view1?keys=["test"]