 7ba7db9fa1
			
		
	
	
		7ba7db9fa1
		
	
	
	
	
		
			
			Perform the function selection once, and only if CONFIG_AVX512_OPT is enabled. Centralize the selection to xbzrle.c, instead of spreading the init across 3 files. Remove xbzrle-bench.c. The benefit of being able to benchmark the different implementations is less important than not peeking into the internals of the implementation. Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Juan Quintela <quintela@redhat.com> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
		
			
				
	
	
		
			324 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			324 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Xor Based Zero Run Length Encoding
 | |
|  *
 | |
|  * Copyright 2013 Red Hat, Inc. and/or its affiliates
 | |
|  *
 | |
|  * Authors:
 | |
|  *  Orit Wasserman  <owasserm@redhat.com>
 | |
|  *
 | |
|  * This work is licensed under the terms of the GNU GPL, version 2 or later.
 | |
|  * See the COPYING file in the top-level directory.
 | |
|  *
 | |
|  */
 | |
| #include "qemu/osdep.h"
 | |
| #include "qemu/cutils.h"
 | |
| #include "qemu/host-utils.h"
 | |
| #include "xbzrle.h"
 | |
| 
 | |
| #if defined(CONFIG_AVX512BW_OPT)
 | |
| #include <immintrin.h>
 | |
| #include "host/cpuinfo.h"
 | |
| 
 | |
/*
 * AVX512BW implementation of the XBZRLE encoder.
 *
 * old_buf and new_buf are compared 64 bytes at a time over slen bytes.
 * The output written to dst alternates between zero runs (a length
 * only, for bytes that are equal in both buffers) and non-zero runs
 * (a length followed by that many bytes copied from new_buf).  A zero
 * run that extends to the end of the input is not emitted.
 *
 * Returns the number of bytes written to dst, or -1 if the encoded
 * data does not fit in dlen bytes.
 *
 * Every length field is preceded by a "d + 2 > dlen" test, because one
 * pass of the inner loop can emit up to three length fields (pending
 * nzrun, zrun, next nzrun) and the final flush emits one more; testing
 * only at the top of the loop would let the later ones be written past
 * dst + dlen before the overflow is detected.  Each length field is
 * always followed by at least one more output byte, so these tests
 * never reject an encoding that would otherwise have fit.
 */
static int __attribute__((target("avx512bw")))
xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
                            uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0, num = 0;
    uint8_t *nzrun_start = NULL;
    /* add 1 to include residual part in main loop */
    uint32_t count512s = (slen >> 6) + 1;
    /* count_residual is tail of data, i.e., count_residual = slen % 64 */
    uint32_t count_residual = slen & 0b111111;
    bool never_same = true;
    /* low count_residual bits set: selects only the valid tail bytes */
    uint64_t mask_residual = 1;
    mask_residual <<= count_residual;
    mask_residual -= 1;
    __m512i r = _mm512_set1_epi32(0);

    while (count512s) {
        int bytes_to_check = 64;
        uint64_t mask = 0xffffffffffffffff;
        if (count512s == 1) {
            /* last pass: only the residual bytes are loaded and checked */
            bytes_to_check = count_residual;
            mask = mask_residual;
        }
        /* lanes not selected by mask take their value from r in both loads */
        __m512i old_data = _mm512_mask_loadu_epi8(r,
                                                  mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(r,
                                                  mask, new_buf + i);
        /* bit n of comp is set when byte n is equal in both chunks */
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
        count512s--;

        bool is_same = (comp & 0x1);
        while (bytes_to_check) {
            if (d + 2 > dlen) {
                return -1;
            }
            if (is_same) {
                if (nzrun_len) {
                    /* an equal byte ends the pending non-zero run */
                    d += uleb128_encode_small(dst + d, nzrun_len);
                    if (d + nzrun_len > dlen) {
                        return -1;
                    }
                    nzrun_start = new_buf + i - nzrun_len;
                    memcpy(dst + d, nzrun_start, nzrun_len);
                    d += nzrun_len;
                    nzrun_len = 0;
                }
                /* 64 data at a time for speed */
                if (count512s && (comp == 0xffffffffffffffff)) {
                    i += 64;
                    zrun_len += 64;
                    break;
                }
                never_same = false;
                num = ctz64(~comp);
                num = (num < bytes_to_check) ? num : bytes_to_check;
                zrun_len += num;
                bytes_to_check -= num;
                comp >>= num;
                i += num;
                if (bytes_to_check) {
                    /* still has different data after same data */
                    if (d + 2 > dlen) {
                        /* the nzrun flush above may have used the room */
                        return -1;
                    }
                    d += uleb128_encode_small(dst + d, zrun_len);
                    zrun_len = 0;
                } else {
                    break;
                }
            }
            if (never_same || zrun_len) {
                /*
                 * never_same only acts if
                 * data begins with diff in first count512s
                 */
                d += uleb128_encode_small(dst + d, zrun_len);
                zrun_len = 0;
                never_same = false;
            }
            /* has diff, 64 data at a time for speed */
            if ((bytes_to_check == 64) && (comp == 0x0)) {
                i += 64;
                nzrun_len += 64;
                break;
            }
            num = ctz64(comp);
            num = (num < bytes_to_check) ? num : bytes_to_check;
            nzrun_len += num;
            bytes_to_check -= num;
            comp >>= num;
            i += num;
            if (bytes_to_check) {
                /* mask like 111000 */
                if (d + 2 > dlen) {
                    /* a zrun length written above may have used the room */
                    return -1;
                }
                d += uleb128_encode_small(dst + d, nzrun_len);
                /* overflow */
                if (d + nzrun_len > dlen) {
                    return -1;
                }
                nzrun_start = new_buf + i - nzrun_len;
                memcpy(dst + d, nzrun_start, nzrun_len);
                d += nzrun_len;
                nzrun_len = 0;
                is_same = true;
            }
        }
    }

    if (nzrun_len != 0) {
        /* flush the non-zero run that reaches the end of the input */
        if (d + 2 > dlen) {
            return -1;
        }
        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        nzrun_start = new_buf + i - nzrun_len;
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
    }
    return d;
}
 | |
| 
 | |
| static int xbzrle_encode_buffer_int(uint8_t *old_buf, uint8_t *new_buf,
 | |
|                                     int slen, uint8_t *dst, int dlen);
 | |
| 
 | |
| static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);
 | |
| 
 | |
| static void __attribute__((constructor)) init_accel(void)
 | |
| {
 | |
|     unsigned info = cpuinfo_init();
 | |
|     if (info & CPUINFO_AVX512BW) {
 | |
|         accel_func = xbzrle_encode_buffer_avx512;
 | |
|     } else {
 | |
|         accel_func = xbzrle_encode_buffer_int;
 | |
|     }
 | |
| }
 | |
| 
 | |
| int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
 | |
|                          uint8_t *dst, int dlen)
 | |
| {
 | |
|     return accel_func(old_buf, new_buf, slen, dst, dlen);
 | |
| }
 | |
| 
 | |
| #define xbzrle_encode_buffer xbzrle_encode_buffer_int
 | |
| #endif
 | |
| 
 | |
/*
 * Portable XBZRLE encoder.
 *
 * Encoded format:
 *
 *   page = zrun nzrun
 *        | zrun nzrun page
 *
 *   zrun = length
 *
 *   nzrun = length byte...
 *
 *   length = uleb128 encoded integer
 *
 * old_buf and new_buf are compared over slen bytes and the differences
 * are written to dst, which holds dlen bytes.  Both buffer addresses
 * and slen must be multiples of sizeof(long); this is asserted.
 *
 * Returns -1 if the encoding does not fit in dst, 0 if the whole input
 * turned out to be a single zero run, and otherwise the number of bytes
 * written.  A zero run that reaches the end of the input is not encoded.
 */
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    /* truncation to 32-bit long okay */
    const unsigned long ones = (unsigned long)0x0101010101010101ULL;
    uint32_t same_len = 0, diff_len = 0;
    int out = 0, pos = 0;
    long unaligned;
    uint8_t *diff_start = NULL;

    g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) %
               sizeof(long)));

    while (pos < slen) {
        /* need room for the zrun length */
        if (out + 2 > dlen) {
            return -1;
        }

        /* zero run: bytewise while the remainder is not a whole long */
        for (unaligned = (slen - pos) % sizeof(long);
             unaligned && old_buf[pos] == new_buf[pos];
             unaligned--) {
            same_len++;
            pos++;
        }

        if (!unaligned) {
            /* a word at a time for speed */
            while (pos < slen &&
                   *(long *)(old_buf + pos) == *(long *)(new_buf + pos)) {
                pos += sizeof(long);
                same_len += sizeof(long);
            }

            /* then the equal bytes at the start of the differing word */
            while (pos < slen && old_buf[pos] == new_buf[pos]) {
                same_len++;
                pos++;
            }
        }

        /* nothing changed anywhere in the buffer */
        if (same_len == slen) {
            return 0;
        }

        /* the trailing zero run is left out of the encoding */
        if (pos == slen) {
            return out;
        }

        out += uleb128_encode_small(dst + out, same_len);
        same_len = 0;
        diff_start = new_buf + pos;

        /* need room for the nzrun length */
        if (out + 2 > dlen) {
            return -1;
        }

        /* non-zero run: bytewise while the remainder is not a whole long */
        for (unaligned = (slen - pos) % sizeof(long);
             unaligned && old_buf[pos] != new_buf[pos];
             unaligned--) {
            pos++;
            diff_len++;
        }

        /* word at a time for speed, use of 32-bit long okay */
        if (!unaligned) {
            while (pos < slen) {
                unsigned long diff = *(unsigned long *)(old_buf + pos)
                                   ^ *(unsigned long *)(new_buf + pos);

                /* non-zero only if some byte of diff is zero */
                if (!((diff - ones) & ~diff & (ones << 7))) {
                    /* all bytes of this word differ: keep going */
                    pos += sizeof(long);
                    diff_len += sizeof(long);
                    continue;
                }

                /* the run ends inside this word: walk up to the equal byte */
                while (old_buf[pos] != new_buf[pos]) {
                    diff_len++;
                    pos++;
                }
                break;
            }
        }

        out += uleb128_encode_small(dst + out, diff_len);
        /* the run's bytes must fit as well */
        if (out + diff_len > dlen) {
            return -1;
        }
        memcpy(dst + out, diff_start, diff_len);
        out += diff_len;
        diff_len = 0;
    }

    return out;
}
 | |
| 
 | |
/*
 * Apply an XBZRLE-encoded stream to a buffer.
 *
 * src holds slen bytes of alternating zrun / nzrun records.  A zrun
 * only advances the position in dst, leaving those bytes as they are;
 * an nzrun copies its payload from src into dst at the current
 * position.  dst holds dlen bytes.
 *
 * Returns the position in dst reached after the last record, or -1 if
 * the stream is malformed or a record would run past the end of src
 * or dst.
 */
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
{
    int rd = 0, wr = 0;
    int used;
    uint32_t run = 0;

    while (rd < slen) {
        /* zrun: skip over unchanged bytes */
        if (slen - rd < 2) {
            return -1;
        }
        used = uleb128_decode_small(src + rd, &run);
        if (used < 0) {
            return -1;
        }
        /* only the zrun at the very start of the stream may be empty */
        if (rd != 0 && run == 0) {
            return -1;
        }
        rd += used;
        wr += run;

        /* the skip must stay inside dst */
        if (wr > dlen) {
            return -1;
        }

        /* nzrun: length followed by the replacement bytes */
        if (slen - rd < 2) {
            return -1;
        }
        used = uleb128_decode_small(src + rd, &run);
        if (used < 0) {
            return -1;
        }
        if (run == 0) {
            return -1;
        }
        rd += used;

        /* the payload must fit in dst and be fully present in src */
        if (wr + run > dlen) {
            return -1;
        }
        if (rd + run > slen) {
            return -1;
        }

        memcpy(dst + wr, src + rd, run);
        wr += run;
        rd += run;
    }

    return wr;
}
 |