diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf..ad573957 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,40 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + + if enable_config('generator-use-simd', default=true) + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Try to compile a small program using NEON instructions + if have_header('arm_neon.h') + have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + + if enable_config('generator-use-neon-lut', default=false) + $defs.push('-DUSE_NEON_LUT') + end + end + end + + if have_header('x86intrin.h') && have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse2') + #include + int main() { + __m128i test = _mm_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + + have_header('cpuid.h') + end + + create_header + create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 428f5e21..0f1fff11 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -5,6 +5,8 @@ #include #include +#include "simd.h" + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -109,12 +111,39 @@ typedef struct _search_state { const char *end; const char *cursor; FBuffer *buffer; + +#ifdef ENABLE_SIMD + const char *chunk_base; + uint8_t has_matches; + +#ifdef HAVE_SIMD_NEON + uint64_t matches_mask; +#elif HAVE_SIMD_SSE2 + int matches_mask; +#else +#error "Unknown SIMD Implementation." 
+#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ } search_state; -static inline void search_flush(search_state *search) -{ - fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); - search->cursor = search->ptr; +#if (defined(__GNUC__ ) || defined(__clang__)) +#define FORCE_INLINE __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +static inline FORCE_INLINE void search_flush(search_state *search) +{ + // Do not remove this conditional without profiling, specifically with escape-heavy text. + // escape_UTF8_char_basic will advance search->ptr and search->cursor (effectively a search_flush). + // For back-to-back characters that need to be escaped, specifically for the SIMD code paths, this function + // will be called just before calling escape_UTF8_char_basic. There will be no characters to append for the + // consecutive characters that need to be escaped. While the fbuffer_append is a no-op if + // nothing needs to be flushed, we can save a few memory references with this conditional. 
+ if (search->ptr > search->cursor) { + fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); + search->cursor = search->ptr; + } } static const unsigned char escape_table_basic[256] = { @@ -130,6 +159,8 @@ static const unsigned char escape_table_basic[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +static unsigned char (*search_escape_basic_impl)(search_state *); + static inline unsigned char search_escape_basic(search_state *search) { while (search->ptr < search->end) { @@ -144,7 +175,8 @@ static inline unsigned char search_escape_basic(search_state *search) return 0; } -static inline void escape_UTF8_char_basic(search_state *search) { +static inline void escape_UTF8_char_basic(search_state *search) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch) { case '"': fbuffer_append(search->buffer, "\\\"", 2); break; @@ -186,12 +218,16 @@ static inline void escape_UTF8_char_basic(search_state *search) { */ static inline void convert_UTF8_to_JSON(search_state *search) { - while (search_escape_basic(search)) { - escape_UTF8_char_basic(search); + unsigned char num_chars = 0; + while ((num_chars = search_escape_basic_impl(search))) { + do { + escape_UTF8_char_basic(search); + } while (--num_chars); } } -static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) { +static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch_len) { case 1: { @@ -227,6 +263,408 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) search->cursor = (search->ptr += ch_len); } +#ifdef ENABLE_SIMD + +#ifdef HAVE_SIMD_NEON +#ifdef USE_NEON_LUT +struct _simd_state { + + struct { + uint8x16x4_t escape_table_basic[2]; + } neon; +}; + +static struct _simd_state simd_state; +#endif /* USE_NEON_LUT */ +#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ + +#ifdef ENABLE_SIMD + +static inline FORCE_INLINE char 
*copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) +{ + // Flush the buffer so that only the last 'len' characters remain unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, vec_len); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', vec_len); + + // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + MEMCPY(s, search->ptr, char, len); + + return s; +} + +#ifdef HAVE_SIMD_NEON + +static inline FORCE_INLINE unsigned char neon_next_match(search_state *search) +{ + uint64_t mask = search->matches_mask; + uint32_t index = trailing_zeros64(mask) >> 2; + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; +} + +// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon +static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) +{ + const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); + const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return mask & 0x8888888888888888ull; +} + +#ifdef USE_NEON_LUT +static inline FORCE_INLINE uint8x16_t neon_lut_update(uint8x16_t chunk) +{ + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); + uint8x16_t tmp2 = 
vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t result = vorrq_u8(tmp1, tmp2); + return result; +} + +static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_lut(search_state *search) +{ + while (search->ptr+sizeof(uint8x16_t) <= search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t needs_escape = neon_lut_update(chunk); + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { + search->ptr += sizeof(uint8x16_t); + continue; + } + + if (popcnt >= (int) sizeof(uint8x16_t)/2) { + return sizeof(uint8x16_t); + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); + } + + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= SIMD_MINIMUM_THRESHOLD) { + char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t needs_escape = neon_lut_update(chunk); + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. 
+ search->buffer->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return remaining; + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); + } + + return 0; +} + +#else + +static inline FORCE_INLINE uint8x16_t neon_rules_update(uint8x16_t chunk) +{ + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + return needs_escape; +} + +static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules(search_state *search) +{ + /* + * The code below implements a SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanation will be limited to the first 8 bytes of the string for simplicity. However + * the vector instructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote' into vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * Then we check if any byte in chunk is less than 32 (0x20). 
This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we mask every lane down to 0 or 1 and add all lanes together. The sum + * is the number of bytes that need to be escaped, which is 2 for the needs_escape vector above. + * Therefore, we know there is at least one byte that needs to be escaped. + * + * If the sum is 0, none of the bytes need to be escaped and + * we advance search->ptr by the width of the vector. + * + * If at least half of the bytes in the vector need to be escaped, we return the vector width so the + * caller can process every byte of the chunk. Otherwise we record a bit mask and visit one match at a time. + */ + while (search->ptr+sizeof(uint8x16_t) <= search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t needs_escape = neon_rules_update(chunk); + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { + search->ptr += sizeof(uint8x16_t); + continue; + } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return sizeof(uint8x16_t); + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); + } + + // There are fewer than 16 bytes left. 
+ unsigned long remaining = (search->end - search->ptr); + if (remaining >= SIMD_MINIMUM_THRESHOLD) { + char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t needs_escape = neon_rules_update(chunk); + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + search->buffer->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return remaining; + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); + } + + return 0; +} +#endif /* USE_NEON_LUT */ + +static inline unsigned char search_escape_basic_neon(search_state *search) +{ + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + return neon_next_match(search); + } else { + // neon_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. 
+ search->has_matches = 0; + if (RB_UNLIKELY(search->chunk_base+sizeof(uint8x16_t) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base+sizeof(uint8x16_t); + } + } + } + +#ifdef USE_NEON_LUT + unsigned char num_chars = 0; + if ((num_chars = search_escape_basic_neon_advance_lut(search))) { + return num_chars; + } +#else + unsigned char num_chars = 0; + if ((num_chars = search_escape_basic_neon_advance_rules(search))) { + return num_chars; + } +#endif /* USE_NEON_LUT */ + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} +#endif /* HAVE_SIMD_NEON */ + +#ifdef HAVE_SIMD_SSE2 + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) +{ + int mask = search->matches_mask; + int index = trailing_zeros(mask); + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. 
+ // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; +} + +#if defined(__clang__) || defined(__GNUC__) +#define TARGET_SSE2 __attribute__((target("sse2"))) +#else +#define TARGET_SSE2 +#endif + +static inline TARGET_SSE2 FORCE_INLINE __m128i sse2_update(__m128i chunk) +{ + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + return needs_escape; +} + +static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search) +{ + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + return sse2_next_match(search); + } else { + // sse2_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. 
+ search->has_matches = 0; + if (RB_UNLIKELY(search->chunk_base+sizeof(__m128i) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base+sizeof(__m128i); + } + } + } + + while (search->ptr+sizeof(__m128i) <= search->end) { + __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); + __m128i needs_escape = sse2_update(chunk); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + search->ptr += sizeof(__m128i); + continue; + } + + if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { + return sizeof(__m128i); + } + + search->has_matches = 1; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); + } + + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= SIMD_MINIMUM_THRESHOLD) { + char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining); + + __m128i chunk = _mm_loadu_si128((__m128i const *) s); + __m128i needs_escape = sse2_update(chunk); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. 
+ search->buffer->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + + if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { + return remaining; + } + + search->has_matches = 1; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); + } + + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} + +#endif /* HAVE_SIMD_SSE2 */ + +#endif /* ENABLE_SIMD */ + static const unsigned char script_safe_escape_table[256] = { // ASCII Control Characters 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, @@ -974,6 +1412,12 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat search.cursor = search.ptr; search.end = search.ptr + len; +#ifdef ENABLE_SIMD + search.matches_mask = 0; + search.has_matches = 0; + search.chunk_base = NULL; +#endif /* ENABLE_SIMD */ + switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: case ENC_CODERANGE_VALID: @@ -1181,6 +1625,29 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) return Qundef; } +/* SIMD Utilities (if enabled) */ +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON +#ifdef USE_NEON_LUT +static void initialize_simd_neon(void) { + simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); + simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); + + simd_state.neon.escape_table_basic[0].val[0] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[0], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[1], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[2], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[3], vdupq_n_u8(9)); + + simd_state.neon.escape_table_basic[1].val[0] = 
vceqq_u8(simd_state.neon.escape_table_basic[1].val[0], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[1], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[2], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[3], vdupq_n_u8(9)); +} +#endif /* USE_NEON_LUT */ +#endif /* HAVE_SIMD_NEON */ + +#endif /* ENABLE_SIMD */ + static VALUE cState_partial_generate(VALUE self, VALUE obj, generator_func func, VALUE io) { GET_STATE(self); @@ -1837,4 +2304,27 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); + + + switch(find_simd_implementation()) { +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + /* Initialize ARM Neon SIMD Implementation. */ +#ifdef USE_NEON_LUT + initialize_simd_neon(); +#endif /* USE_NEON_LUT */ + search_escape_basic_impl = search_escape_basic_neon; + break; +#endif /* HAVE_SIMD_NEON */ +#ifdef HAVE_SIMD_SSE2 + case SIMD_SSE2: + search_escape_basic_impl = search_escape_basic_sse2; + break; +#endif /* HAVE_SIMD_SSE2 */ +#endif /* ENABLE_SIMD */ + default: + search_escape_basic_impl = search_escape_basic; + break; + } } diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h new file mode 100644 index 00000000..ca3e40bc --- /dev/null +++ b/ext/json/ext/generator/simd.h @@ -0,0 +1,147 @@ +#include "extconf.h" + +typedef enum { + SIMD_NONE, + SIMD_NEON, + SIMD_SSE2 +} SIMD_Implementation; + +#ifdef ENABLE_SIMD + +#ifdef __clang__ + #if __has_builtin(__builtin_ctzll) + #define HAVE_BUILTIN_CTZLL 1 + #else + #define HAVE_BUILTIN_CTZLL 0 + #endif +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define HAVE_BUILTIN_CTZLL 1 +#else + #define HAVE_BUILTIN_CTZLL 0 +#endif + +static inline uint32_t trailing_zeros64(uint64_t input) { +#if HAVE_BUILTIN_CTZLL + 
return __builtin_ctzll(input); +#else + uint32_t trailing_zeros = 0; + uint64_t temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; +#endif +} + +static inline int trailing_zeros(int input) { + #if HAVE_BUILTIN_CTZLL + return __builtin_ctz(input); + #else + int trailing_zeros = 0; + int temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; + #endif +} + +uint32_t popcount32(uint32_t x) { + #if defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(x); + #elif defined(__ARM_NEON) + #include + return vaddv_u8(vcnt_u8(vcreate_u8((uint64_t)x))) & 0xFF; + #else + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0F0F0F0F; + return (x * 0x01010101) >> 24; + #endif +} + +#define SIMD_MINIMUM_THRESHOLD 6 + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) +#include + +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +static SIMD_Implementation find_simd_implementation() { + return SIMD_NEON; +} + +#define HAVE_SIMD_NEON 1 + +uint8x16x4_t load_uint8x16_4(const unsigned char *table) { + uint8x16x4_t tab; + tab.val[0] = vld1q_u8(table); + tab.val[1] = vld1q_u8(table+16); + tab.val[2] = vld1q_u8(table+32); + tab.val[3] = vld1q_u8(table+48); + return tab; +} + +void print_uint8x16(char *msg, uint8x16_t vec) { + printf("%s\n[ ", msg); + uint8_t store[16] = {0}; + vst1q_u8(store, vec); + for(int i=0; i<16; i++) { + printf("%3d ", store[i]); + } + printf("]\n"); +} + +#endif /* ARM Neon Support.*/ + +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + +#ifdef HAVE_X86INTRIN_H +#include + +#define HAVE_SIMD_SSE2 1 + +void print_m128i(const char *prefix, __m128i vec) { + uint8_t r[16]; + _mm_storeu_si128((__m128i *) r, vec); + + printf("%s = [ ", prefix); + for(int i=0; 
i<16; i++) { + printf("%02x ", r[i]); + } + printf("]\n"); +} + +#ifdef HAVE_CPUID_H +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 + +#include +#endif /* HAVE_CPUID_H */ + +static SIMD_Implementation find_simd_implementation(void) { + +#if defined(__GNUC__ ) || defined(__clang__) +#ifdef __GNUC__ + __builtin_cpu_init(); +#endif /* __GNUC__ */ + + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. + if (__builtin_cpu_supports("sse2")) { + return SIMD_SSE2; + } +#endif /* __GNUC__ || __clang__*/ + + return SIMD_NONE; +} + +#endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ + +#endif /* ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +static SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif \ No newline at end of file diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index f87006ac..b4abcc47 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -410,18 +410,34 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # - data = [ '\\"' ] - json = '["\\\\\""]' + data = [ '\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # + data = [ '\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"' ] + json = '["\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\""]' assert_equal json, generate(data) # data = [ '/' ] json = '["/"]' assert_equal json, generate(data) # + data = [ 
'////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ '///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = '["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -438,6 +454,10 @@ def test_backslash json = '["\""]' assert_equal json, generate(data) # + data = ['"""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal json, generate(data) + # data = ["'"] json = '["\\\'"]' assert_equal '["\'"]', generate(data) @@ -445,6 +465,30 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) + # + data = '\tThis is a test of the emergency broadcast system.' + json = "\"\\\\tThis is a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This\tis a test of the emergency broadcast system.' + json = "\"This\\\\tis a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is\ta test of the emergency broadcast system.' 
+ json = "\"This is\\\\ta test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.\n' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" + assert_equal json, generate(data) end def test_string_subclass