WKdmCompress.s   [plain text]


// $Id: WKdmCompress.intel.s,v 1.1 2010/01/28 22:33:24 cclee Exp cclee $
//
// This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Compressor. The function prototype is
// 
// unsigned int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, unsigned int num_input_words);
// 
// The implementation assumes the input buffer is a memory page (4096 bytes or 1024 words), or something less than 4KB.
//
// WKdm Compression algorithm is briefly stated as follows:
// 
// There is a dynamically updated dictionary of 16 words, each initialized with "1".
//
// the dictionary is indexed as follows, 
//	0, x = input_word
//  1, hash_index = (x>>10)&255
//  2, dict_location = &dictionary[hash_index]
//  3, dict_word = *dict_location
//
// Sequentially for each input word, it is classified/tagged into 4 classes
//	0 : if the input word is 0
//  1 : the higher 22 bits of the input word is identically to the higher bits from the dictionary (hash table indexed)
//  2 : the above condition (partially 22 higher bits matched) is not met, a dictionary miss condition
//  3 : the input word is exactly matched to the word from the dictionary (hash table index)
//
// after each input word is classified, each tag is represented by 2 bits. Furthermore, for each class
//	0 : no further info is needed
//  1 : the hash_index is represented by 4-bits (8 packed into a word),
//		the lower 10-bits is sent to the decompressor (3 packed into a word)
//  2 : the 32-bit word is sent to the decompressor
//  3 : the hash_index is represented by 4-bits (8 packed into a word)
//
// for classes 1 and 2, the input word is used to update the dictionary after it is classified/tagged
//
// the following implementation was started from compiling (gcc -O3) the original C code (WKdmCompress.c)
// and then subsequentially improved and documented.
// For i386, it speeds up ~ 1.5 times
// For x86_64, it speeds up ~ 1.3 times
//
// cclee, 1/28/10

#if !(defined __i386__ || defined __x86_64__)

typedef char DummyDefinition;

#else		// i386 or x86_64 architectures

#if defined	__i386__			// 32-bit implementation

	.text
	.align 4,0x90

.globl _WKdm_compress
_WKdm_compress:

	pushl	%ebp
	movl	%esp, %ebp

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	// allocate stack memory for local variables

	subl	$6316, %esp

	leal	_hashLookupTable, %ebx			        // hashTable

	movl	8(%ebp), %edx					// %edx = src_buf
	movl	12(%ebp), %esi					// %esi = dest_buf
	movl	16(%ebp), %eax					// %eax = num_input_words

	leal	-1112(%ebp), %ecx				// tempTagsArray
	movl	%ecx, -6272(%ebp)				// a copy of char* next_tag = (char *) tempTagsArray;

	leal	-2136(%ebp), %ecx				// tempQPosArray
	movl	%ecx, -6264(%ebp)				// char* next_qp = (char *) tempQPosArray;
	movl	%ecx, -6252(%ebp)

	leal	(%edx,%eax,4), %ecx				// src_buf + num_input_words*4
	movl	%ecx, -6244(%ebp)				// end_of_input = src_buf + num_input_words;

	// PRELOAD_DICTIONARY;
	movl	$1, -88(%ebp)
	movl	$1, -84(%ebp)
	movl	$1, -80(%ebp)
	movl	$1, -76(%ebp)
	movl	$1, -72(%ebp)
	movl	$1, -68(%ebp)
	movl	$1, -64(%ebp)
	movl	$1, -60(%ebp)
	movl	$1, -56(%ebp)
	movl	$1, -52(%ebp)
	movl	$1, -48(%ebp)
	movl	$1, -44(%ebp)
	movl	$1, -40(%ebp)
	movl	$1, -36(%ebp)
	movl	$1, -32(%ebp)
	movl	$1, -28(%ebp)

	shrl	$4, %eax						// (num_input_words / 16)
	leal	16(%esi,%eax,4), %eax			// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
	movl	%eax, -6256(%ebp)				// next_full_patt = dest_buf + TAGS_AREA_OFFSET + (num_input_words / 16);

	leal	-6232(%ebp), %eax				// &tempLowBitsArray[0]
	movl	%eax, -6260(%ebp)				// save a copy of &tempLowBitsArray[0]
	movl	%eax, -6248(%ebp)				// save a copy of &tempLowBitsArray[0]

	cmpl	%ecx, %edx						// next_input_word (%edx) vs end_of_input (%ecx) 
	jae		L_done_search					// if (next_input_word >= end_of_input) skip the following search loop

	leal	-1111(%ebp), %esi				// &next_tag[1]
	leal	-88(%ebp), %ebp					// dictionary 

	movl	%edx, %edi						// next_input_word

	#define		next_input_word		%edi
	#define		dictionary			%ebp
	#define		next_tag			%esi

	jmp		L5

	.align 4,0x90
L_RECORD_ZERO:
	movb	$0, -1(next_tag)				// *next_tag = ZERO;
L8:
	addl	$4, next_input_word				// next_input_word++; 
	incl	next_tag						// next_tag++
	cmpl	next_input_word, 84(%esp)		// end_of_input vs next_input_word
	jbe		L_done_search					// if (next_input_word>=end_of_input), skip to L_done_search 
L5:
	movl	(next_input_word), %ecx			// input_word = *next_input_word;
	movl	%ecx, %eax						// a copy of input_word
	testl	%ecx, %ecx						// input_word
	je		L_RECORD_ZERO					// if (input_word==0) RECORD_ZERO
	shrl	$10, %eax						// input_high_bits = HIGH_BITS(input_word);
	movl	%eax, (%esp)					// save a copy of input_high_bits;
	andl	$255, %eax						// 8 bits index to Hash Table
	movsbl	(%ebx,%eax),%edx				// HASH_TO_DICT_BYTE_OFFSET(input_word)
	addl	dictionary, %edx				// ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
	movl	(%edx), %eax					// dict_word = *dict_location;
	cmpl	%eax, %ecx						// cmp input_word vs dict_word
	je		L_RECORD_EXACT
	shrl	$10, %eax						// HIGH_BITS(dict_word)
	cmpl	%eax, (%esp)					// input_high_bits vs HIGH_BITS(dict_word)
	je		L_RECORD_PARTIAL				// if (input_high_bits == HIGH_BITS(dict_word)) RECORD_PARTIAL 

L_RECORD_MISS:
	movb	$2, -1(next_tag)				// *next_tag = 2 for miss
	movl	72(%esp), %eax					// next_full_patt
	movl	%ecx, (%eax)					// *next_full_patt = input_word;
	addl	$4, %eax						// next_full_patt++;
	movl	%eax, 72(%esp)					// save next_full_patt
	movl	%ecx, (%edx)					// *dict_location = input_word
	jmp		L8

	.align 4,0x90
L_RECORD_EXACT:
	movb	$3, -1(next_tag)				// *next_tag = 3 for exact
	subl	dictionary, %edx				// dict_location - dictionary 
	sarl	$2, %edx						// divide by 4 for word offset 
	movl	76(%esp), %eax					// next_qp
	movb	%dl, (%eax)						// *next_qp = word offset (4-bit)
	incl	%eax							// next_qp++
	movl	%eax, 76(%esp)					// save next_qp
	jmp		L8

L_done_search:

	// restore %ebp as normal use (was used as dictionary)
	movl	%esp, %ebp						
	addl	$6328, %ebp

	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
	movl	-6256(%ebp), %edi				// next_full_patt
	subl	12(%ebp), %edi					// next_full_patt - dest_buf
	movl	%edi, %eax						// next_full_patt - dest_buf
	sarl	$2, %eax						// in 4-byte words
	movl	%eax, -6240(%ebp)				// save (next_full_patt - dest_buf) in words
	movl	12(%ebp), %edx					// dest_buf
	movl	%eax, 4(%edx)					// dest_buf[1] = next_full_patt - dest_buf

	movl	-6272(%ebp), %ecx				// &tempTagsArray[0]
	decl	next_tag
	cmpl	next_tag, %ecx					// next_tag vs &tempTagsArray[0]
	jae		L13								// if &tempTagsArray[0] >= next_tag, skip the following WK_pack_2bits

	movl	%edx, %ebx						// a copy of dest_buf

	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);	

	.align 4,0x90
L_WK_pack_2bits:
	movl	4(%ecx), %eax					// w1
	sall	$2, %eax						// w1 << 2		
	movl	8(%ecx), %edx					// w2
	sall	$4, %edx						// w2 << 4
	orl		%edx, %eax						// (w1<<2) | (w2<<4)
	orl		(%ecx), %eax					// (w0) | (w1<<2) | (w2<<4)
	movl	12(%ecx), %edx					// w3
	sall	$6, %edx						// (w3<<6)
	orl		%edx, %eax						// (w0) | (w1<<2) | (w2<<4) | (w3<<6)
	movl	%eax, 16(%ebx)					// save at *(dest_buf + HEADER_SIZE_IN_WORDS)
	addl	$16, %ecx						// tempTagsArray += 16;
	addl	$4, %ebx						// dest_buf += 4;
	cmpl    %ecx, next_tag					// cmp next_tag vs dest_buf
	ja		L_WK_pack_2bits					// if (next_tag > dest_buf) repeat L_WK_pack_2bits

	/* Pack the queue positions into the area just after the full words. */
L13:
	movl	-6252(%ebp), %eax				// next_qp
	movl	-6264(%ebp), %ecx				// (char *) tempQPosArray
	movl	%eax, %esi						// next_qp
	subl	%ecx, %eax						// num_bytes_to_pack = next_qp - (char *) tempQPosArray;
	addl	$7, %eax						// num_bytes_to_pack + 7
	andl	$-8, %eax						// clear lower 3 bits, (num_packed_words<<3)
	addl	%eax, %ecx						// endQPosArray = tempQPosArray + num_source_words;
	cmpl	%ecx, %esi						// next_qp vs endQPosArray
	jae		L16
	.align 4,0x90
L30:
	movb	$0, (%esi)						// *next_qp = 0;
	incl	%esi							// next_qp++
	cmpl	%ecx, %esi						// next_qp vs endQPosArray
	jne		L30								// 

L16:
	movl	-6256(%ebp), %ebx				// next_full_patt
	cmpl	-6264(%ebp), %ecx				// endQPosArray vs tempQPosArray
	jbe		L20								// if (endQPosArray<=tempQPosArray) skip L_WK_pack_4bits
	movl	-6264(%ebp), %edx				// tempQPosArray


	// boundary_tmp = WK_pack_4bits(tempQPosArray, endQPosArray, next_full_patt);

	.align 4,0x90
L21:
	movl	4(%edx), %eax					// src_next[1]
	sall	$4, %eax						// (src_next[1] << 4)
	orl		(%edx), %eax					// temp = src_next[0] | (src_next[1] << 4)
	movl	%eax, (%ebx)					// dest_next[0] = temp; 
	addl	$4, %ebx						// dest_next++;
	addl	$8, %edx						// src_next += 2;
	cmpl	%edx, %ecx						// source_end vs src_next
	ja		L21								// while (src_next < source_end) repeat the loop

	movl	%ebx, %edi						// boundary_tmp

	subl	12(%ebp), %edi					// boundary_tmp - dest_buf
	movl	%edi, %eax						// boundary_tmp - dest_buf
	sarl	$2, %eax						// translate into word offset

	movl	%eax, -6240(%ebp)				// save (next_full_patt - dest_buf) in words 

L20:
	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
	movl	-6240(%ebp), %ecx				// boundary_tmp - dest_buf 
	movl	12(%ebp), %edx					// dest_buf
	movl	%ecx, 8(%edx)					// dest_buf[2] = boundary_tmp - dest_buf

	movl	-6260(%ebp), %ecx				// tempLowBitsArray
	movl	-6248(%ebp), %edx				// next_low_bits 
	subl	%ecx, %edx						// next_low_bits - tempLowBitsArray
	sarl	$2, %edx						// num_tenbits_to_pack 

	subl	$3, %edx						// pre-decrement num_tenbits_to_pack by 3
	jl		1f								// if num_tenbits_to_pack < 3, skip the following loop
	.align	4,0x90
0:
	movl	4(%ecx), %eax					// w1
	sall	$10, %eax						// w1<<10
	movl	8(%ecx), %esi					// w2
	sall	$20, %esi						// w2<<20
	orl		%esi, %eax						// (w1<<10) | (w2<<20)
	orl		(%ecx), %eax					// (w0) | (w1<<10) | (w2<<20)
	movl	%eax, (%ebx)					// pack w0,w1,w2 into 1 dest_buf word
	addl	$4, %ebx						// dest_buf++
	addl	$12, %ecx						// next w0/w1/w2 triplet
	subl	$3, %edx						// num_tenbits_to_pack-=3 
	jge		0b								// if no less than 3 elements, back to loop head

1:	addl	$3, %edx						// post-increment num_tenbits_to_pack by 3
	je		3f								// if num_tenbits_to_pack is a multiple of 3, skip the following
	movl	(%ecx), %eax					// w0
	subl	$1, %edx						// num_tenbits_to_pack --
	je		2f								// 
	movl    4(%ecx), %esi					// w1
	sall	$10, %esi						// w1<<10
	orl		%esi, %eax
2:
	movl	%eax, (%ebx)					// write the final dest_buf word
	addl	$4, %ebx						// dest_buf++
3:
	movl	%ebx, %eax						// boundary_tmp
	subl	12(%ebp), %eax					// boundary_tmp - dest_buf
	sarl	$2, %eax						// boundary_tmp - dest_buf in terms of words
	movl	12(%ebp), %esi					// dest_buf
	movl	%eax, 12(%esi)					// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp);
	sall	$2, %eax						// boundary_tmp - dest_buf in terms of bytes
	addl	$6316, %esp						// pop out stack memory
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret

	.align 4,0x90

L_RECORD_PARTIAL:
	movb	$1, -1(next_tag)						// *next_tag = 1 for partial matched
	movl	%edx, %eax								// dict_location
	subl	dictionary, %eax						// %eax = dict_location - dictionary
	movl	%ecx, (%edx)							// *dict_location = input_word;
	sarl	$2, %eax								// offset in 32-bit word
	movl	76(%esp), %edx							// next_qp
	movb	%al, (%edx)								// update *next_qp
	incl	%edx									// next_qp++
	movl	%edx, 76(%esp)							// save next_qp
	movl	%ecx, %eax								// a copy of input_word
	andl	$1023, %eax								// lower 10 bits
	movl	80(%esp), %edx							// next_low_bits
	movl	%eax, (%edx)							// EMIT_WORD(next_low_bits,(low_bits_pattern))
	addl	$4, %edx								// next_low_bits++
	movl	%edx, 80(%esp)							// save next_low_bits
	jmp		L8

#endif		// i386 architectures

#if defined __x86_64__			// 64-bit implementation	
	.text
	.align 4,0x90

.globl _WKdm_compress
_WKdm_compress:
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$6112, %rsp

	#define	tempTagsArray	-6264(%rbp)
	#define	tempLowBitsArray	-6272(%rbp)
	#define	next_tag			%r8
	#define	next_input_word		%rdi
	#define	end_of_input		%r13
	#define	next_full_patt		%rbx
	#define	dict_location		%rcx
	#define	next_qp				%r10
	#define	dictionary			%r11
	#define	dest_buf			%r12
	#define	hashTable			%r14
	#define tempQPosArray		%r15
	#define	next_low_bits		%rsi

	movq	%rsi, %r12						// dest_buf

	leaq	-1136(%rbp), %rax				// &tempTagsArray[0]
	movq	%rax, tempTagsArray 
	leaq	1(%rax), next_tag				// next_tag always points to the one following the current tag 

	leaq	-2160(%rbp), %r15				// &tempQPosArray[0]
	movq	%r15, next_qp					// next_qp

	mov		%edx, %eax						// num_input_words
	leaq	(%rdi,%rax,4), end_of_input		// end_of_input = src_buf + num_input_words

	// PRELOAD_DICTIONARY;
	movl	$1, -112(%rbp)
	movl	$1, -108(%rbp)
	movl	$1, -104(%rbp)
	movl	$1, -100(%rbp)
	movl	$1, -96(%rbp)
	movl	$1, -92(%rbp)
	movl	$1, -88(%rbp)
	movl	$1, -84(%rbp)
	movl	$1, -80(%rbp)
	movl	$1, -76(%rbp)
	movl	$1, -72(%rbp)
	movl	$1, -68(%rbp)
	movl	$1, -64(%rbp)
	movl	$1, -60(%rbp)
	movl	$1, -56(%rbp)
	movl	$1, -52(%rbp)

	shrl	$4, %edx						// (num_input_words / 16)
	mov		%edx, %edx						// sign extension into quad word
	leaq	16(%rsi,%rdx,4), %rbx			// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4

	leaq	-6256(%rbp), %rax				// &tempLowBitsArray[0]
	movq	%rax, tempLowBitsArray			// save for later reference
	movq	%rax, next_low_bits				// next_low_bits	

	cmpq	end_of_input, next_input_word	// next_input_word vs end_of_input
	jae		L_done_search					// if (next_input_word>=end_of_input) no work to do in search
	leaq	-112(%rbp), dictionary			// dictionary
	leaq	_hashLookupTable(%rip), hashTable	// hash look up table
	jmp	L5

	.align 4,0x90
L_RECORD_ZERO:
	movb	$0, -1(next_tag)						// *next_tag = ZERO;
L8:
	addq	$4, next_input_word 					// next_input_word++;
	incq	next_tag								// next_tag++
	cmpq	next_input_word, end_of_input 			// end_of_input vs next_input_word
	jbe		L_done_search
L5:
	movl	(next_input_word), %edx					// input_word = *next_input_word;
	movl	%edx, %r9d								// a copy of input_word
	testl	%edx, %edx								// input_word
	je		L_RECORD_ZERO							// if (input_word==0) RECORD_ZERO
	shrl	$10, %r9d								// input_high_bits = HIGH_BITS(input_word);
	movzbl	%r9b, %eax								// 8-bit index to the Hash Table
	movsbq	(hashTable,%rax),%rax					// HASH_TO_DICT_BYTE_OFFSET(input_word)
	leaq	(dictionary, %rax), dict_location		// ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
	movl	(dict_location), %eax					// dict_word = *dict_location;
	cmpl	%eax, %edx								// dict_word vs input_word
	je		L_RECORD_EXACT							// if identical, RECORD_EXACT
	shrl	$10, %eax								// HIGH_BITS(dict_word)
	cmpl	%eax, %r9d								// input_high_bits vs HIGH_BITS(dict_word)
	je		L_RECORD_PARTIAL						// if identical, RECORD_PARTIAL

L_RECORD_MISS:
	movb	$2, -1(next_tag)						// *next_tag = 2 for miss
	movl	%edx, (next_full_patt)					// *next_full_patt = input_word;
	addq	$4, next_full_patt						// next_full_patt++ 
	movl	%edx, (dict_location)					// *dict_location = input_word
	addq	$4, next_input_word						// next_input_word++
	incq	next_tag								// next_tag++
	cmpq	next_input_word, end_of_input			// end_of_input vs next_input_word	
	ja		L5										// if (end_of_input>next_input_word) repeat from L5

L_done_search:

	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
	//movq	next_full_patt, %r11					// next_full_patt
	movq	next_full_patt, %rax					// next_full_patt
	subq	dest_buf, %rax							// next_full_patt - dest_buf								
	sarq	$2, %rax								// offset in 4-bytes
	movl	%eax, %r13d								// r13d = (next_full_patt - dest_buf)
	movl	%eax, 4(dest_buf)						// dest_buf[1] = next_full_patt - dest_buf

	decq	next_tag
	cmpq	next_tag, tempTagsArray					// &tempTagsArray[0] vs next_tag
	jae		L13										// if (&tempTagsArray[0] >= next_tag), skip the following

	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);

	movq	dest_buf, %rdi							// dest_buf
	movq	tempTagsArray, %rcx						// &tempTagsArray[0]

	.align 4,0x90
L_pack_2bits:
	movl	4(%rcx), %eax							// w1
	sall	$2, %eax								// w1 << 2
	movl	8(%rcx), %edx							// w2
	sall	$4, %edx								// w2 << 4
	orl		%edx, %eax								// (w1<<2) | (w2<<4)
	orl		(%rcx), %eax							// (w0) | (w1<<2) | (w2<<4)
	movl	12(%rcx), %edx							// w3
	sall	$6, %edx								// w3 << 6
	orl		%edx, %eax								// (w0) | (w1<<2) | (w2<<4) | (w3<<6)
	movl	%eax, 16(%rdi)							// save at *(dest_buf + HEADER_SIZE_IN_WORDS)
	addq	$16, %rcx								// tempTagsArray += 16;
	addq	$4, %rdi								// dest_buf += 4;
	cmpq	%rcx, next_tag							// cmp next_tag vs dest_buf
	ja		L_pack_2bits							// if (next_tag > dest_buf) repeat L_pack_2bits

	/* Pack the queue positions into the area just after the full words. */

L13:
	movl	%r10d, %eax								// next_qp
	subl	%r15d, %eax								// num_bytes_to_pack = next_qp - (char *) tempQPosArray; 
	addl	$7, %eax								// num_bytes_to_pack+7
	shrl	$3, %eax								// num_packed_words = (num_bytes_to_pack + 7) >> 3
	addl	%eax, %eax								// num_source_words = num_packed_words * 2;
	mov		%eax, %eax
	leaq	(tempQPosArray,%rax,4), %rcx			// endQPosArray = tempQPosArray + num_source_words
	cmpq	%rcx, %r10								// next_qp vs endQPosArray
	jae		L16										// if (next_qp >= endQPosArray) skip the following zero paddings
	.align 4,0x90
L30:
	movb	$0, (next_qp)							// *next_qp = 0
	incq	next_qp									// next_qp++							
	cmpq	%rcx, next_qp							// next_qp vs endQPosArray								
	jne		L30										// repeat while next_qp < endQPosArray
L16:
	movq	%rbx, %rdi								// next_full_patt
	cmpq	tempQPosArray, %rcx						// endQPosArray vs tempQPosArray
	jbe		L20										// if (endQPosArray <= tempQPosArray) skip the following
	movq	tempQPosArray, %rdx						// tempQPosArray

	.align 4,0x90
L_pack_4bits:
	movl	4(%rdx), %eax							// src_next[1]
	sall	$4, %eax								// (src_next[1] << 4)
	orl		(%rdx), %eax							// temp = src_next[0] | (src_next[1] << 4)
	movl	%eax, (%rdi)							// dest_next[0] = temp;
	addq	$4, %rdi								// dest_next++;
	addq	$8, %rdx								// src_next += 2;
	cmpq	%rdx, %rcx								// source_end vs src_next
	ja		L_pack_4bits							// while (src_next < source_end) repeat the loop

	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
	//movq	%rdi, %r11								// boundary_tmp
	movq	%rdi, %rax								// boundary_tmp
	subq	dest_buf, %rax							// boundary_tmp - dest_buf
	movq	%rax, %r13								// boundary_tmp - dest_buf
	shrq	$2, %r13								// boundary_tmp - dest_buf in words
L20:
	movl	%r13d, 8(dest_buf)						// dest_buf[2] = boundary_tmp - dest_buf

	movq	tempLowBitsArray, %rcx					// tempLowBitsArray
	movq	next_low_bits, %rbx						// next_low_bits
	subq	%rcx, %rbx								// next_low_bits - tempLowBitsArray (in bytes)
	sarq	$2, %rbx								// num_tenbits_to_pack (in words)

	#define	size	%ebx

	subl	$3, size								// pre-decrement num_tenbits_to_pack by 3
	jl		1f										// if num_tenbits_to_pack < 3, skip the following loop

	.align	4,0x90
0:
	movl	4(%rcx), %eax							// w1
	sall	$10, %eax								// w1 << 10
	movl	8(%rcx), %edx							// w2	
	sall	$20, %edx								// w2 << 20
	orl		%edx, %eax								// (w1<<10) | (w2<<20)
	orl		(%rcx), %eax							// (w0) | (w1<<10) | (w2<<20)
	movl	%eax, (%rdi)							// pack w0,w1,w2 into 1 dest_buf word
	addq	$4, %rdi								// dest_buf++
	addq	$12, %rcx								// next w0/w1/w2 triplet
	subl	$3, size								// num_tenbits_to_pack-=3
	jge		0b										// if no less than 3 elements, back to loop head

1: 	addl	$3, size								// post-increment num_tenbits_to_pack by 3
	je		3f										// if num_tenbits_to_pack is a multiple of 3, skip the following
	movl	(%rcx), %eax							// w0
	subl	$1, size								// num_tenbits_to_pack--
	je		2f										//
	movl	4(%rcx), %edx							// w1
	sall	$10, %edx								// w1 << 10
	orl		%edx, %eax								// w0 | (w1<<10)

2:	movl	%eax, (%rdi)							// write the final dest_buf word
	addq	$4, %rdi								// dest_buf++

3:	movq	%rdi, %rax								// boundary_tmp
	subq	dest_buf, %rax							// boundary_tmp - dest_buf
	shrq	$2, %rax								// boundary_tmp - dest_buf in terms of words
	movl	%eax, 12(dest_buf)						// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
	shlq	$2, %rax								// boundary_tmp - dest_buf in terms of bytes

	// restore registers and return
	addq	$6112, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	leave
	ret

	.align 4,0x90
L_RECORD_EXACT:
	movb	$3, -1(next_tag)					// *next_tag = 3 for exact
	subq	dictionary, %rcx					// dict_location - dictionary
	sarq	$2, %rcx							// divide by 4 for word offset
	movb	%cl, (next_qp)						// *next_qp = word offset (4-bit)
	incq	next_qp								// next_qp++
	jmp		L8

	.align 4,0x90
L_RECORD_PARTIAL:
	movb	$1, -1(next_tag)					// *next_tag = 1 for partial matched
	movq	%rcx, %rax							// dict_location
	subq	dictionary, %rax					// dict_location - dictionary
	movl	%edx, (%rcx)						// *dict_location = input_word;
	sarq	$2, %rax							// offset in 32-bit word
	movb	%al, (next_qp)						// update *next_qp
	incq	next_qp								// next_qp++
	andl	$1023, %edx							// lower 10 bits
	movl	%edx, (next_low_bits)				// save next_low_bits
	addq	$4, next_low_bits					// next_low_bits++
	jmp	L8

	// for some reason, keeping the following never executed code yields a better performance
L41:
	leaq	-6256(%rbp), %rax
	movq	%rax, -6272(%rbp)
	movq	%rax, %rsi
	jmp		L_done_search
#endif		// x86_64 architectures
#endif		// i386 or x86_64 architectures