diff --git a/CHANGELOG.md b/CHANGELOG.md index 19a92b90907da34256fc8d0ad8e9860f8d27f373..09b4a95cf50db9e518baf3e5d928e42ea3b4caa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,55 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.4.2] 2023-04-19 +- Roll back bugfix for github issue #350: Besides using scratch for its + corresponding database, Hyperscan also allows users to use a larger scratch + allocated for another database. Users can leverage this property to achieve + safe scratch usage in multi-database scenarios (see the sketch after this + changelog excerpt). Behaviors beyond these are discouraged and results are + undefined. +- Fix hsdump issue due to invalid nfa type. + +## [5.4.1] 2023-02-20 +- The Intel Hyperscan team is pleased to provide a bug fix release to our open source library. + Intel also maintains an upgraded version available through your Intel sales representative. +- Bugfix for issue #184: fix random char value of UTF-8. +- Bugfix for issue #291: bypass logical combination flag in hs_expression_info(). +- Bugfix for issue #292: fix build error due to libc symbol parsing. +- Bugfix for issue #302/304: add empty string check for pure literal API. +- Bugfix for issue #303: fix unknown instruction error in pure literal API. +- Bugfix for issue #303: avoid memory leak in stream close stage. +- Bugfix for issue #305: fix assertion failure in DFA construction. +- Bugfix for issue #317: fix aligned allocator segment faults. +- Bugfix for issue #350: add quick validity check for scratch. +- Bugfix for issue #359: fix glibc-2.34 stack size issue. +- Bugfix for issue #360: fix SKIP flag issue in chimera. +- Bugfix for issue #362: fix a corner case in UTF-8 validation. +- Fix other compile issues. + +## [5.4.0] 2020-12-31 +- Improvement on literal matcher "Fat Teddy" performance, including + support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) + AVX-512 VBMI). +- Introduce a new 32-state shuffle-based DFA engine ("Sheng32"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Introduce a new 64-state shuffle-based DFA engine ("Sheng64"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Introduce a new shuffle-based hybrid DFA engine ("McSheng64"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Improvement on exceptional state handling performance for LimEx NFA, including + support for AVX-512 VBMI. +- Improvement on lookaround performance with new models, including support for + AVX-512. +- Improvement on DFA state space efficiency. +- Optimization on decision of NFA/DFA generation. +- hsbench: add CSV dump support for hsbench. +- Bugfix for cmake error on Icelake under release mode. +- Bugfix in find_vertices_in_cycles() to avoid self-loop checking in SCC. +- Bugfix for issue #270: fix return value handling in chimera. +- Bugfix for issue #284: use correct free function in logical combination. +- Add BUILD_EXAMPLES cmake option to enable example code compilation. (#260) +- Some typo fixing. (#242, #259)
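The scratch-sharing behaviour described in the 5.4.2 entry above is easiest to see in code. Below is a minimal, illustrative C sketch (not part of this diff; the helper name `alloc_shared_scratch` is hypothetical): calling `hs_alloc_scratch()` once per database against the same scratch pointer grows the allocation to the largest requirement, so the single scratch is then valid for scans against either database.

```c
#include <hs.h>

/* Size one scratch space for scans against two databases: each
 * hs_alloc_scratch() call grows *scratch as needed, so afterwards it
 * satisfies the larger of the two databases' requirements. */
static hs_error_t alloc_shared_scratch(const hs_database_t *db1,
                                       const hs_database_t *db2,
                                       hs_scratch_t **scratch) {
    *scratch = NULL;
    hs_error_t err = hs_alloc_scratch(db1, scratch);
    if (err != HS_SUCCESS) {
        return err;
    }
    /* Grows *scratch if db2 needs more than db1 did. */
    return hs_alloc_scratch(db2, scratch);
}
```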
+ ## [5.3.0] 2020-05-15 - Improvement on literal matcher "Teddy" performance, including support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512 diff --git a/CMakeLists.txt b/CMakeLists.txt index cfc1758941b1e089066359d93663996dd23662f0..8d4774d10b460d47c802aa2e5e440785b2240ec9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11) project (hyperscan C CXX) set (HS_MAJOR_VERSION 5) -set (HS_MINOR_VERSION 3) -set (HS_PATCH_VERSION 0) +set (HS_MINOR_VERSION 4) +set (HS_PATCH_VERSION 2) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -131,9 +131,16 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) -option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" +option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF) +option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" + OFF) + +if (BUILD_AVX512VBMI) + set(BUILD_AVX512 ON) +endif () + option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF) # TODO: per platform config files? @@ -182,7 +189,7 @@ else() # generic, which isn't very good in some cases. march=native looks at # cpuid info and then chooses the best microarch it can (and replaces # the flag), so use that for tune. - + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") # arg1 might exist if using ccache string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) @@ -191,7 +198,7 @@ else() OUTPUT_VARIABLE _GCC_OUTPUT) string(FIND "${_GCC_OUTPUT}" "march" POS) string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") # test the parsed flag @@ -294,15 +301,16 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() + if (CMAKE_C_COMPILER_ID MATCHES "Intel") set(SKYLAKE_FLAG "-xCORE-AVX512") else () set(SKYLAKE_FLAG "-march=skylake-avx512") + set(ICELAKE_FLAG "-march=icelake-server") endif () endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) - if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) @@ -314,7 +322,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") CHECK_INCLUDE_FILES(arm_neon.h HAVE_C_ARM_NEON_H) CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_CXX_ARM_NEON_H) endif() - + CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) @@ -423,6 +431,18 @@ if (CXX_UNUSED_CONST_VAR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") endif() +# clang-14 complains about unused-but-set variables. +CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR) +if (CXX_UNUSED_BUT_SET_VAR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable") +endif() + +# clang-14 complains about using bitwise operators instead of logical ones.
+CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL) +if (CXX_BITWISE_INSTEAD_OF_LOGICAL) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical") +endif() + # gcc 6 complains about type attributes that get ignored, like alignment CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR) if (CXX_IGNORED_ATTR) @@ -448,8 +468,10 @@ CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) # gcc 10 complains about this CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW) -if(CC_STRINGOP_OVERFLOW) +CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW) +if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow") endif() endif() @@ -486,9 +508,9 @@ endif() # Test case for neon function. option(UNIT_SIMD "Simd funtion test case, default is OFF" OFF) if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") - if (UNIT_SIMD) - add_subdirectory(unit-simd) - endif() + if (UNIT_SIMD) + add_subdirectory(unit-simd) + endif() endif() add_subdirectory(util) @@ -593,7 +615,6 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") - if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") ragelmaker(src/parser/control_verbs.rl) endif() @@ -620,7 +641,7 @@ set (hs_exec_common_SRCS set (hs_exec_SRCS ${hs_HEADERS} - src/hs_version.h + src/hs_version.h.in src/ue2common.h src/allocator.h src/crc32.c @@ -777,7 +798,7 @@ SET (hs_compile_SRCS src/grey.h src/hs.cpp src/hs_internal.h - src/hs_version.h + src/hs_version.h.in src/scratch.h src/state.h src/ue2common.h @@ -1246,6 +1267,9 @@ else (FAT_RUNTIME) if (NOT BUILD_AVX512) set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH") endif (NOT BUILD_AVX512) + if (NOT BUILD_AVX512VBMI) + set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH") + endif (NOT BUILD_AVX512VBMI) set_source_files_properties(src/dispatcher.c PROPERTIES COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}") @@ -1278,6 +1302,14 @@ else (FAT_RUNTIME) RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" ) endif (BUILD_AVX512) + if (BUILD_AVX512VBMI) + add_library(hs_exec_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx512vbmi PROPERTIES + COMPILE_FLAGS "${ICELAKE_FLAG}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512VBMI) add_library(hs_exec_common OBJECT ${hs_exec_common_SRCS} @@ -1336,6 +1368,15 @@ else (FAT_RUNTIME) RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" ) endif (BUILD_AVX512) + if (BUILD_AVX512VBMI) + add_library(hs_exec_shared_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_avx512vbmi PROPERTIES + COMPILE_FLAGS "${ICELAKE_FLAG}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512VBMI) add_library(hs_exec_common_shared OBJECT ${hs_exec_common_SRCS} src/dispatcher.c @@ -1429,7 +1470,7 @@ if (NOT BUILD_STATIC_LIBS) add_library(hs ALIAS hs_shared) endif () - -if(NOT WIN32) +option(BUILD_EXAMPLES "Build Hyperscan example code (default TRUE)" TRUE) +if(NOT WIN32 AND BUILD_EXAMPLES) add_subdirectory(examples) endif() diff --git a/README.md b/README.md index 
bd796d3639b995ec8a7bed50b307c1877bf87c29..13bc5b6e0cb9e6dfe395b42ee528234c35166e9d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ regular expression syntax of the commonly-used libpcre library, but is a standalone library with its own C API. Hyperscan uses hybrid automata techniques to allow simultaneous matching of -large numbers (up to tens of thousands) of regular expressions and the +large numbers (up to tens of thousands) of regular expressions and for the matching of regular expressions across streams of data. Hyperscan is typically used in a DPI library stack. @@ -27,7 +27,7 @@ release of Intel Hyperscan. The `aarch64` branch on Github/kunpengcompute will always contain the most recent release that supports the AArch64 architecture. The AArch64 branch was developed -based on Intel Hyperscan 5.2.1. Each version released to `aarch64` branch goes through +based on Intel Hyperscan 5.4.2. Each version released to `aarch64` branch goes through QA and testing before it is released; if you're a user of AArch64, rather than a developer, this is the version you should be using. @@ -46,12 +46,12 @@ The official homepage for Hyperscan is at [www.hyperscan.io](https://www.hypersc `master` branch -If you have questions or comments, we encourage you to [join the mailing list] -(https://lists.01.org/mailman/listinfo/hyperscan). To file a bug, you can send an email -to the list, or create an issue on Github. +If you have questions or comments, we encourage you to [join the mailing +list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by +sending email to the list, or by creating an issue on GitHub. If you wish to contact the Hyperscan team at Intel directly, without posting -publicly to the mailing list, send an email to +publicly to the mailing list, send email to [hyperscan@intel.com](mailto:hyperscan@intel.com). `aarch64` branch diff --git a/chimera/ch_common.h b/chimera/ch_common.h index 8caa44407f120a9fe4c8ee21acad5860c7b266d7..bdb0bafa93fd1021019ec53d3cd7b2b91f0e690d 100644 --- a/chimera/ch_common.h +++ b/chimera/ch_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -345,6 +345,16 @@ ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func, */ #define CH_SCRATCH_IN_USE (-10) +/** + * Unexpected internal error from Hyperscan. + * + * This error indicates that there was unexpected matching behavior from + * Hyperscan. This could be related to invalid usage of scratch space or + * invalid memory operations by users. + * + */ +#define CH_UNKNOWN_HS_ERROR (-13) + /** * Returned when pcre_exec (called for some expressions internally from @ref * ch_scan) failed due to a fatal error. 
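Since `CH_UNKNOWN_HS_ERROR` above is a value callers can now receive from `ch_scan()`, a hedged sketch of the corresponding error handling follows. It is illustrative only and not part of this diff; the names `on_match` and `scan_block` are hypothetical.

```c
#include <stdio.h>
#include <ch.h>

/* Trivial match callback: report the match and keep scanning. */
static ch_callback_t on_match(unsigned int id, unsigned long long from,
                              unsigned long long to, unsigned int flags,
                              unsigned int size, const ch_capture_t *captured,
                              void *ctx) {
    printf("match for pattern %u ending at %llu\n", id, to);
    return CH_CALLBACK_CONTINUE;
}

static ch_error_t scan_block(const ch_database_t *db, ch_scratch_t *scratch,
                             const char *data, unsigned int len) {
    ch_error_t err = ch_scan(db, data, len, 0, scratch, on_match, NULL, NULL);
    if (err == CH_SCAN_TERMINATED) {
        return CH_SUCCESS;   /* a callback chose to stop: not a failure */
    } else if (err == CH_UNKNOWN_HS_ERROR) {
        /* unexpected internal behaviour -- often invalid scratch usage */
        fprintf(stderr, "unexpected internal error from Hyperscan\n");
    }
    return err;
}
```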
diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c index 212bbc7bec371bff23196908e737f47472e75577..af7d1f080ed1912042d1784ffcb6a8d80219b9a0 100644 --- a/chimera/ch_runtime.c +++ b/chimera/ch_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id, } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) { DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; + if (top_id == id) { + break; + } + continue; } if (top_id == id) { @@ -419,6 +423,7 @@ int HS_CDECL multiCallback(unsigned int id, unsigned long long from, DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; ret = HS_SUCCESS; + hyctx->scratch->ret = ret; } else if (ret == CH_FAIL_INTERNAL) { return ret; } @@ -590,11 +595,24 @@ ch_error_t ch_scan_i(const ch_database_t *hydb, if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) { ret = scanHyperscan(&hyctx, data, length); - if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) { - DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret); + // Errors from pcre scan. + if (scratch->ret == CH_CALLBACK_TERMINATE) { + DEBUG_PRINTF("Pcre terminates scan\n"); + unmarkScratchInUse(scratch); + return CH_SCAN_TERMINATED; + } else if (scratch->ret != CH_SUCCESS) { + DEBUG_PRINTF("Pcre internal error\n"); unmarkScratchInUse(scratch); return scratch->ret; } + // Errors from Hyperscan scan. Note Chimera could terminate + // Hyperscan callback on purpose so this is not counted as an error. + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + assert(scratch->ret == CH_SUCCESS); + DEBUG_PRINTF("Hyperscan returned error %d\n", ret); + unmarkScratchInUse(scratch); + return ret; + } } DEBUG_PRINTF("Flush priority queue\n"); diff --git a/cmake/arch.cmake b/cmake/arch.cmake index cced49c6978d7bebaddb55de8c8a944cc9acdfbb..eb4791e6b6800c32aa9c7bf7a3fbc971ffede842 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -17,10 +17,21 @@ if (BUILD_AVX512) endif () endif () +if (BUILD_AVX512VBMI) + CHECK_C_COMPILER_FLAG(${ICELAKE_FLAG} HAS_ARCH_ICELAKE) + if (NOT HAS_ARCH_ICELAKE) + message (FATAL_ERROR "AVX512VBMI not supported by compiler") + endif () +endif () + if (FAT_RUNTIME) # test the highest level microarch to make sure everything works if (BUILD_AVX512) - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") + if (BUILD_AVX512VBMI) + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ICELAKE_FLAG}") + else () + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") + endif (BUILD_AVX512VBMI) else () set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2") endif () @@ -80,6 +91,9 @@ if (FAT_RUNTIME) if (BUILD_AVX512 AND NOT HAVE_AVX512) message(FATAL_ERROR "AVX512 support requested but not supported") endif () + if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI) + message(FATAL_ERROR "AVX512VBMI support requested but not supported") + endif () else (NOT FAT_RUNTIME) if (NOT HAVE_AVX2) message(STATUS "Building without AVX2 support") @@ -87,6 +101,9 @@ else (NOT FAT_RUNTIME) if (NOT HAVE_AVX512) message(STATUS "Building without AVX512 support") endif () + if (NOT HAVE_AVX512VBMI) + message(STATUS "Building without AVX512VBMI support") + endif () if (NOT 
HAVE_SSSE3) message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") endif () diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh index 1962813fed6424eb78ce63738abc06ec1bfd4b80..895610c0074aeadec0c92c092f86f27cb7708efd 100755 --- a/cmake/build_wrapper.sh +++ b/cmake/build_wrapper.sh @@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX) LIBC_SO=$("$@" --print-file-name=libc.so.6) cp ${KEEPSYMS_IN} ${KEEPSYMS} # get all symbols from libc and turn them into patterns -nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} +nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS} # build the object "$@" # rename the symbols in the object diff --git a/cmake/config.h.in b/cmake/config.h.in index 561c65fec0d704a1a07c515b6cfd364eac0dc044..336cf19eb8510fe1a0bf86d042ef8092383710e6 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -27,6 +27,9 @@ /* Define if building AVX-512 in the fat runtime. */ #cmakedefine BUILD_AVX512 +/* Define if building AVX512VBMI in the fat runtime. */ +#cmakedefine BUILD_AVX512VBMI + /* Define to 1 if `backtrace' works. */ #cmakedefine HAVE_BACKTRACE diff --git a/cmake/pcre.cmake b/cmake/pcre.cmake index e0acda5e70fe9331d651026f7e052fe982d837ff..876bf6662ec292f37d693ed898ec7c6b1b9c9f26 100644 --- a/cmake/pcre.cmake +++ b/cmake/pcre.cmake @@ -48,6 +48,9 @@ if (PCRE_BUILD_SOURCE) set(PCRE_SUPPORT_UNICODE_PROPERTIES ON CACHE BOOL "Build pcre with unicode") add_subdirectory(${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre EXCLUDE_FROM_ALL) set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre) + if (NOT EXISTS ${PCRE_SOURCE}/config.h) + execute_process(COMMAND cp -rf ${PROJECT_BINARY_DIR}/pcre/config.h ${PCRE_SOURCE}) + endif () set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre) else () # pkgconf should save us diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 213dcc5c8139941d210e4f7319971d82c4220dbb..ce0e5ac73cdcf5da75e0b5386bda9b8ec55bcc57 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -11,4 +11,4 @@ if (ARCH_X86_64 OR ARCH_AARCH64) set(ARCH_64_BIT 1) elseif (ARCH_IA32) set(ARCH_32_BIT 1) -endif() \ No newline at end of file +endif() diff --git a/cmake/ragel.cmake b/cmake/ragel.cmake index 3356cb9f8840b1e862c30bfda7bc536332a39a07..88f02885fb70190b7c87d62017005411cea9e843 100644 --- a/cmake/ragel.cmake +++ b/cmake/ragel.cmake @@ -33,4 +33,4 @@ endfunction(ragelmaker) ) add_custom_target(ragel_${src_file} DEPENDS ${rl_out}) set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE) - endfunction(ragelcopyer) \ No newline at end of file + endfunction(ragelcopyer) diff --git a/doc/dev-reference/chimera.rst b/doc/dev-reference/chimera.rst index 883cb5a0aefc2b0d5ebee0cd234cc8defcb1a01d..d35b116f5b1e461a81bb17d314b8378cb5f3de38 100644 --- a/doc/dev-reference/chimera.rst +++ b/doc/dev-reference/chimera.rst @@ -212,7 +212,7 @@ space is required for that context. In the absence of recursive scanning, only one such space is required per thread and can (and indeed should) be allocated before data scanning is to commence. 
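As a concrete companion to the scratch-per-thread guidance above, here is a minimal illustrative sketch (not part of this diff; `make_worker_scratches` is a hypothetical helper) of the Chimera pattern this section describes: the compiling thread allocates one prototype scratch with `ch_alloc_scratch()`, then clones it once per worker thread with `ch_clone_scratch()` so no scratch is shared between concurrent scans.

```c
#include <ch.h>

/* Allocate a prototype scratch for db, then clone one private copy per
 * worker thread. */
static ch_error_t make_worker_scratches(const ch_database_t *db,
                                        ch_scratch_t **proto,
                                        ch_scratch_t **workers, unsigned n) {
    *proto = NULL;
    ch_error_t err = ch_alloc_scratch(db, proto);
    for (unsigned i = 0; err == CH_SUCCESS && i < n; i++) {
        workers[i] = NULL;
        err = ch_clone_scratch(*proto, &workers[i]);
    }
    return err;
}
```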
-In a scenario where a set of expressions are compiled by a single "master" +In a scenario where a set of expressions are compiled by a single "main" thread and data will be scanned by multiple "worker" threads, the convenience function :c:func:`ch_clone_scratch` allows multiple copies of an existing scratch space to be made for each thread (rather than forcing the caller to pass diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 205b7348b763d1f1296b80f3ec7d647ff3b7e6e1..6f5541ecfe92af7b50cd19632c24f064b2f9e4af 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -64,21 +64,21 @@ interpreted independently. No syntax association happens between any adjacent characters. For example, given an expression written as :regexp:`/bc?/`. We could say it is -a regluar expression, with the meaning that character ``b`` followed by nothing +a regular expression, with the meaning that character ``b`` followed by nothing or by one character ``c``. On the other view, we could also say it is a pure literal expression, with the meaning that this is a character sequence of 3-byte length, containing characters ``b``, ``c`` and ``?``. In regular case, the question mark character ``?`` has a particular syntax role called 0-1 quantifier, -which has an syntax association with the character ahead of it. Similar -characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``, +which has a syntax association with the character ahead of it. Similar +characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``, ``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``. While in the pure literal case, all these meta characters lose their extra meanings and are just common ASCII codes. Hyperscan is initially designed to process common regular expressions. It is -hence embedded with a complex parser to do comprehensive regular grammer -interpretion. Particularly, the identification of above meta characters is the -basic step for the interpretion of far more complex regular grammers. +hence embedded with a complex parser to do comprehensive regular grammar +interpretation. Particularly, the identification of above meta characters is the +basic step for the interpretation of far more complex regular grammars. However in real cases, patterns may not always be regular expressions. They could just be pure literals. Problem will come if the pure literals contain @@ -165,7 +165,7 @@ The following regex constructs are supported by Hyperscan: :regexp:`{n,}` are supported with limitations. * For arbitrary repeated sub-patterns: *n* and *m* should be either small - or infinite, e.g. :regexp:`(a|b}{4}`, :regexp:`(ab?c?d){4,10}` or + or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or :regexp:`(ab(cd)*){6,}`. * For single-character width sub-patterns such as :regexp:`[^\\a]` or diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index b381287339f496274895b27db084f24c8e5f2db6..aaff15ba22152376b764c0af1eab72f45f6cc34a 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -263,17 +263,19 @@ the current platform is supported by Hyperscan. 
As of this release, the variants of the runtime that are built, and the CPU capability that is required, are the following: -+----------+-------------------------------+---------------------------+ -| Variant | CPU Feature Flag(s) Required | gcc arch flag | -+==========+===============================+===========================+ -| Core 2 | ``SSSE3`` | ``-march=core2`` | -+----------+-------------------------------+---------------------------+ -| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` | -+----------+-------------------------------+---------------------------+ -| AVX 2 | ``AVX2`` | ``-march=core-avx2`` | -+----------+-------------------------------+---------------------------+ -| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` | -+----------+-------------------------------+---------------------------+ ++--------------+---------------------------------+---------------------------+ +| Variant | CPU Feature Flag(s) Required | gcc arch flag | ++==============+=================================+===========================+ +| Core 2 | ``SSSE3`` | ``-march=core2`` | ++--------------+---------------------------------+---------------------------+ +| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` | ++--------------+---------------------------------+---------------------------+ +| AVX 2 | ``AVX2`` | ``-march=core-avx2`` | ++--------------+---------------------------------+---------------------------+ +| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` | ++--------------+---------------------------------+---------------------------+ +| AVX 512 VBMI | ``AVX512VBMI`` (see note below) | ``-march=icelake-server`` | ++--------------+---------------------------------+---------------------------+ .. note:: @@ -287,6 +289,16 @@ capability that is required, are the following: cmake -DBUILD_AVX512=on <...> + Hyperscan v5.4 adds support for AVX512VBMI instructions - in particular the + ``AVX512VBMI`` instruction set that was introduced on Intel "Icelake" Xeon + processors - however the AVX512VBMI runtime variant is **not** enabled by + default in fat runtime builds as not all toolchains support the AVX512VBMI + instruction set. To build an AVX512VBMI runtime, the CMake variable + ``BUILD_AVX512VBMI`` must be enabled manually during configuration. For + example: :: + + cmake -DBUILD_AVX512VBMI=on <...> + As the fat runtime requires compiler, libc, and binutils support, at this time it will only be enabled for Linux builds where the compiler supports the `indirect function "ifunc" function attribute diff --git a/doc/dev-reference/runtime.rst b/doc/dev-reference/runtime.rst index d64ec540d993ac37966a0b0769c1cc24001b4060..396521c94cc3ba467a6836d910b09e009834829e 100644 --- a/doc/dev-reference/runtime.rst +++ b/doc/dev-reference/runtime.rst @@ -178,7 +178,7 @@ space is required for that context. In the absence of recursive scanning, only one such space is required per thread and can (and indeed should) be allocated before data scanning is to commence. 
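The same pattern applies to the core Hyperscan API discussed in this runtime.rst passage. Below is a minimal illustrative sketch (not part of this diff; `worker_scan` and `on_match` are hypothetical names) in which each worker clones the scratch prototype built by the compiling thread via `hs_clone_scratch()` and scans with its private copy.

```c
#include <hs.h>

static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    return 0; /* returning non-zero would terminate the scan */
}

/* Each worker thread clones the prototype scratch, scans, then frees
 * its private copy. */
static hs_error_t worker_scan(const hs_database_t *db,
                              const hs_scratch_t *proto,
                              const char *data, unsigned int len) {
    hs_scratch_t *scratch = NULL;
    hs_error_t err = hs_clone_scratch(proto, &scratch);
    if (err != HS_SUCCESS) {
        return err;
    }
    err = hs_scan(db, data, len, 0, scratch, on_match, NULL);
    hs_free_scratch(scratch);
    return err;
}
```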
-In a scenario where a set of expressions are compiled by a single "master" +In a scenario where a set of expressions are compiled by a single "main" thread and data will be scanned by multiple "worker" threads, the convenience function :c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space to be made for each thread (rather than forcing the caller to pass diff --git a/examples/patbench.cc b/examples/patbench.cc index 20de5745ee2890def858cd9433cdbf81923d988c..dac58db99ec12588327bf431598334458a787064 100644 --- a/examples/patbench.cc +++ b/examples/patbench.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -115,6 +115,7 @@ #include #include #include +#include <climits> #include #include #include @@ -657,6 +658,10 @@ int main(int argc, char **argv) { break; case 'n': repeatCount = atoi(optarg); + if (repeatCount <= 0 || repeatCount > UINT_MAX) { + cerr << "Invalid repeatCount." << endl; + exit(-1); + } break; default: usage(argv[0]); diff --git a/examples/pcapscan.cc b/examples/pcapscan.cc index 12b944388e594c7663c068a017dadecb9345e65c..2fd13e5b507f349de090b30af76aadbbe556a1e6 100644 --- a/examples/pcapscan.cc +++ b/examples/pcapscan.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,6 +51,7 @@ #include #include +#include <climits> #include #include #include @@ -489,6 +490,10 @@ int main(int argc, char **argv) { // Streaming mode scans. double secsStreamingScan = 0.0, secsStreamingOpenClose = 0.0; + if (repeatCount <= 0 || repeatCount > UINT_MAX) { + cerr << "Invalid repeatCount." << endl; + exit(-1); + } for (unsigned int i = 0; i < repeatCount; i++) { // Open streams. clock.start(); diff --git a/examples/simplegrep.c b/examples/simplegrep.c index d6bd4b39260b407b35d651530c3aabe4d6ac4dff..30a97b0f058b666fb2364ec53235ecf2c7e7b54a 100644 --- a/examples/simplegrep.c +++ b/examples/simplegrep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,6 +57,7 @@ #include #include #include +#include <unistd.h> #include @@ -152,6 +153,15 @@ int main(int argc, char *argv[]) { char *pattern = argv[1]; char *inputFN = argv[2]; + if (access(inputFN, F_OK) != 0) { + fprintf(stderr, "ERROR: file doesn't exist.\n"); + return -1; + } + if (access(inputFN, R_OK) != 0) { + fprintf(stderr, "ERROR: file can't be read.\n"); + return -1; + } + /* First, we attempt to compile the pattern provided on the command line. * We assume 'DOTALL' semantics, meaning that the '.' meta-character will * match newline characters. 
The compiler will analyse the given pattern and diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 666eefc9c30a2ad98d8a69f3c04b452b9551124e..35f46b3fea2e7af157e6c46302f764c188136289 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). - if (strlen(expression) > cc.grey.limitPatternLength) { + size_t maxlen = cc.grey.limitPatternLength + 1; + if (strnlen(expression, maxlen) >= maxlen) { throw CompileError("Pattern length exceeds limit."); } @@ -416,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } + if (!strcmp(expression, "")) { + throw CompileError("Pure literal API doesn't support empty string."); + } + // This expression must be a pure literal, we can build ue2_literal // directly based on expression text. ParsedLitExpression ple(index, expression, expLength, flags, id); @@ -458,6 +463,9 @@ platform_t target_to_platform(const target_t &target_info) { if (!target_info.has_avx512()) { p |= HS_PLATFORM_NOAVX512; } + if (!target_info.has_avx512vbmi()) { + p |= HS_PLATFORM_NOAVX512VBMI; + } return p; } diff --git a/src/crc32.c b/src/crc32.c index 4609c5dd23598184454cead26e634349377eb71f..c10bb44c30c1ed30034b9afc0b60566da92daf48 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -72,7 +72,6 @@ u32 crc32c_neon(u32 running_crc, const unsigned char * p_buf, const size_t lengt } #endif - #if !defined(HAVE_SSE42) /*** diff --git a/src/database.c b/src/database.c index 1a79800e2d70acf0136b8bd459f19f65f783c2cc..6adf1419ddead910ebd446faa151256d1d5f7e43 100644 --- a/src/database.c +++ b/src/database.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -115,7 +115,8 @@ static hs_error_t db_check_platform(const u64a p) { if (p != hs_current_platform && p != (hs_current_platform | hs_current_platform_no_avx2) - && p != (hs_current_platform | hs_current_platform_no_avx512)) { + && p != (hs_current_platform | hs_current_platform_no_avx512) + && p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) { return HS_DB_PLATFORM_ERROR; } // passed all checks @@ -370,9 +371,11 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, u8 minor = (version >> 16) & 0xff; u8 major = (version >> 24) & 0xff; - const char *features = (plat & HS_PLATFORM_NOAVX512) - ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" - : "AVX512"; + const char *features = (plat & HS_PLATFORM_NOAVX512VBMI) + ? (plat & HS_PLATFORM_NOAVX512) + ? (plat & HS_PLATFORM_NOAVX2) ? 
"" : "AVX2" + : "AVX512" + : "AVX512VBMI"; const char *mode = NULL; diff --git a/src/database.h b/src/database.h index 5715ed677d6ed2a656ada11f91603529795ed657..f122f97be7ca51d73b93e3395bb5e037722d0152 100644 --- a/src/database.h +++ b/src/database.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,6 +55,7 @@ extern "C" #define HS_PLATFORM_NOAVX2 (4<<13) #define HS_PLATFORM_NOAVX512 (8<<13) +#define HS_PLATFORM_NOAVX512VBMI (0x10<<13) /** \brief Platform features bitmask. */ typedef u64a platform_t; @@ -66,6 +67,9 @@ const platform_t hs_current_platform = { #endif #if !defined(HAVE_AVX512) HS_PLATFORM_NOAVX512 | +#endif +#if !defined(HAVE_AVX512VBMI) + HS_PLATFORM_NOAVX512VBMI | #endif 0, }; @@ -74,12 +78,20 @@ static UNUSED const platform_t hs_current_platform_no_avx2 = { HS_PLATFORM_NOAVX2 | HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | 0, }; static UNUSED const platform_t hs_current_platform_no_avx512 = { HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512vbmi = { + HS_PLATFORM_NOAVX512VBMI | 0, }; diff --git a/src/dispatcher.c b/src/dispatcher.c index a786b806d1f2a6025138341b118bb8161ada9e10..9a8afa623f4472f624bff1624985fdbfd577b7ae 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,8 +38,14 @@ #define check_avx512() (0) #endif +#if defined(DISABLE_AVX512VBMI_DISPATCH) +#define avx512vbmi_ disabled_ +#define check_avx512vbmi() (0) +#endif + #define CREATE_DISPATCH(RTYPE, NAME, ...) 
\ /* create defns */ \ + RTYPE JOIN(avx512vbmi_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \ RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \ @@ -52,6 +58,9 @@ \ /* resolver */ \ static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ + if (check_avx512vbmi()) { \ + return JOIN(avx512vbmi_, NAME); \ + } \ if (check_avx512()) { \ return JOIN(avx512_, NAME); \ } \ diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 713023e02e9003f9d3f1b5c27b45970f992834d9..b6b10f98cb0b0969f49dd32ff2979af45a221e7f 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -133,7 +133,7 @@ u64a andn(const u32 a, const u8 *b) { : "=r"(r) : "r"(a), "m"(*(const u32 *)b) : "w0" - ); + ); #else r = unaligned_load_u32(b) & ~a; #endif @@ -345,7 +345,6 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, *conf8 = movq(*s); *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; - #endif } diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index f4cd1f44ed5b791034c1fe52da56495ab13c7cc5..1dda751ace54fdbcb96b5b4e8035f82ba32f867f 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -107,6 +107,25 @@ void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) { } } +static +void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) { + // dump nibble masks + u32 maskWidth = 2; + fprintf(f, " dup nibble masks:\n"); + for (u32 i = 0; i < numMasks * 2; i++) { + fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo"); + for (u32 j = 0; j < 16 * maskWidth * 2; j++) { + u8 val = dmsk[i * 16 * maskWidth * 2 + j]; + for (u32 k = 0; k < 8; k++) { + fprintf(f, "%s", ((val >> k) & 0x1) ? 
"1" : "0"); + } + fprintf(f, " "); + } + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + static void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) { // dump nibble masks @@ -146,12 +165,17 @@ void dumpTeddy(const Teddy *teddy, FILE *f) { u32 maskWidth = des->getNumBuckets() / 8; size_t headerSize = sizeof(Teddy); - size_t maskLen = des->numMasks * 16 * 2 * maskWidth; const u8 *teddy_base = (const u8 *)teddy; const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); - const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen); dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f); - dumpTeddyReinforced(rmsk, maskWidth, f); + size_t maskLen = des->numMasks * 16 * 2 * maskWidth; + const u8 *rdmsk = baseMsk + ROUNDUP_CL(maskLen); + if (maskWidth == 1) { // reinforcement table in Teddy + dumpTeddyReinforced(rdmsk, maskWidth, f); + } else { // dup nibble mask table in Fat Teddy + assert(maskWidth == 2); + dumpTeddyDupMasks(rdmsk, des->numMasks, f); + } dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f); } diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 960e2a4154849d5e5b7143eac876d50fc73fa23d..e6f5476198fd27b7665020dc91152d8d99602253 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -284,14 +284,6 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, #define PREP_CONF_FN(val, n) \ prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) -const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f -}; - #define TEDDY_VBMI_SL1_POS 15 #define TEDDY_VBMI_SL2_POS 14 #define TEDDY_VBMI_SL3_POS 13 diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 20ea938cf594ed96950e05fb875314253e41b7a9..6a6b27a5f2f5cacd63cbd63444820efb1a29cc18 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -109,6 +109,36 @@ const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)) + + ROUNDUP_CL(2 * numMask * sizeof(m256))); +} + +#else + #define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ do { \ if (unlikely(chunk != ones_u64a)) { \ @@ -134,203 +164,200 @@ const m256 *getMaskBase_fat(const struct Teddy *teddy) { return 
(const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } -#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy +#endif -static really_inline -const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) { - return (const u64a *)((const u8 *)getMaskBase_fat(teddy) - + ROUNDUP_CL(2 * numMask * sizeof(m256))); -} +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = { + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +}; #ifdef ARCH_64_BIT -#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ do { \ if (unlikely(diff512(var, ones512()))) { \ - m512 swap = swap256in512(var); \ - m512 r = interleave512lo(var, swap); \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ m128 r0 = extract128from512(r, 0); \ m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ u64a part1 = movq(r0); \ u64a part2 = extract64from128(r0, 1); \ - u64a part5 = movq(r1); \ - u64a part6 = extract64from128(r1, 1); \ - r = interleave512hi(var, swap); \ - r0 = extract128from512(r, 0); \ - r1 = extract128from512(r, 1); \ - u64a part3 = movq(r0); \ - u64a part4 = extract64from128(r0, 1); \ - u64a part7 = movq(r1); \ - u64a part8 = extract64from128(r1, 1); \ - CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \ + u64a part3 = movq(r1); \ + u64a part4 = extract64from128(r1, 1); \ + u64a part5 = movq(r2); \ + u64a part6 = extract64from128(r2, 1); \ + u64a part7 = movq(r3); \ + u64a part8 = extract64from128(r3, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \ } \ } while(0) #else -#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ do { \ if (unlikely(diff512(var, ones512()))) { \ - m512 swap = swap256in512(var); \ - m512 r = interleave512lo(var, swap); \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ m128 r0 = extract128from512(r, 0); \ m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ u32 part1 = movd(r0); \ u32 part2 = extract32from128(r0, 1); \ u32 part3 = 
extract32from128(r0, 2); \ u32 part4 = extract32from128(r0, 3); \ - u32 part9 = movd(r1); \ - u32 part10 = extract32from128(r1, 1); \ - u32 part11 = extract32from128(r1, 2); \ - u32 part12 = extract32from128(r1, 3); \ - r = interleave512hi(var, swap); \ - r0 = extract128from512(r, 0); \ - r1 = extract128from512(r, 1); \ - u32 part5 = movd(r0); \ - u32 part6 = extract32from128(r0, 1); \ - u32 part7 = extract32from128(r0, 2); \ - u32 part8 = extract32from128(r0, 3); \ - u32 part13 = movd(r1); \ - u32 part14 = extract32from128(r1, 1); \ - u32 part15 = extract32from128(r1, 2); \ - u32 part16 = extract32from128(r1, 3); \ - CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \ + u32 part5 = movd(r1); \ + u32 part6 = extract32from128(r1, 1); \ + u32 part7 = extract32from128(r1, 2); \ + u32 part8 = extract32from128(r1, 3); \ + u32 part9 = movd(r2); \ + u32 part10 = extract32from128(r2, 1); \ + u32 part11 = extract32from128(r2, 2); \ + u32 part12 = extract32from128(r2, 3); \ + u32 part13 = movd(r3); \ + u32 part14 = extract32from128(r3, 1); \ + u32 part15 = extract32from128(r3, 2); \ + u32 part16 = extract32from128(r3, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\ } \ } while(0) #endif -static really_inline -m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset, - const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - m256 p_mask256; - m512 
ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi, - buf_history, len_history, nMasks)); - *p_mask = set2x256(p_mask256); - return ret; -} - -#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \ +#define PREP_FAT_SHUF_MASK \ m512 lo = and512(val, *lo_mask); \ m512 hi = and512(rshift64_m512(val, 4), *lo_mask) -#define PREP_FAT_SHUF_MASK \ - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \ - *c_16 = *(ptr + 15); \ - m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \ - 0ULL, r_msk_base_hi[*c_0], \ - 0ULL, r_msk_base_lo[*c_16], \ - 0ULL, r_msk_base_lo[*c_0]); \ - *c_0 = *(ptr + 31) - -#define FAT_SHIFT_OR_M1 \ - or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) - -#define FAT_SHIFT_OR_M2 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ - pshufb_m512(dup_mask[3], hi)), \ - 1), FAT_SHIFT_OR_M1) - -#define FAT_SHIFT_OR_M3 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ - pshufb_m512(dup_mask[5], hi)), \ - 2), FAT_SHIFT_OR_M2) - -#define FAT_SHIFT_OR_M4 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ - pshufb_m512(dup_mask[7], hi)), \ - 3), FAT_SHIFT_OR_M3) +#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M1; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M2; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M3; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M4; -} +#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL +#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL +#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL + +#define FAT_TEDDY_VBMI_SHIFT_M1 + +#define FAT_TEDDY_VBMI_SHIFT_M2 \ + FAT_TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define FAT_TEDDY_VBMI_SHIFT_M3 \ + FAT_TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define FAT_TEDDY_VBMI_SHIFT_M4 \ + FAT_TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define FAT_SHIFT_OR_M1 \ + shuf_or_b0 + +#define FAT_SHIFT_OR_M2 \ + or512(sl1, FAT_SHIFT_OR_M1) + +#define FAT_SHIFT_OR_M3 \ + or512(sl2, FAT_SHIFT_OR_M2) + +#define FAT_SHIFT_OR_M4 \ + or512(sl3, FAT_SHIFT_OR_M3) static really_inline m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, - const u8 
*ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + UNUSED const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M1, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M1; + FAT_TEDDY_VBMI_SHIFT_M1; + return FAT_SHIFT_OR_M1; } static really_inline m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M2, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M2; + FAT_TEDDY_VBMI_SHIFT_M2; + return FAT_SHIFT_OR_M2; } static really_inline m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M3, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M3; + FAT_TEDDY_VBMI_SHIFT_M3; + return FAT_SHIFT_OR_M3; } static really_inline m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M4, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M4; + FAT_TEDDY_VBMI_SHIFT_M4; + return FAT_SHIFT_OR_M4; } -#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \ - prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) +#define PREP_CONF_FAT_FN(val, n) \ + prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) -#define PREP_CONF_FAT_FN(ptr, n) \ - prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \ - r_msk_base_lo, r_msk_base_hi, &c_0, &c_16) +#define FAT_TEDDY_VBMI_SL1_POS 15 +#define FAT_TEDDY_VBMI_SL2_POS 14 +#define FAT_TEDDY_VBMI_SL3_POS 13 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS); /* * In FAT teddy, it needs 2 bytes to represent result of each position, @@ -355,31 +382,15 @@ m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, * then do pshufb_m512(AABB, XYXY). 
*/ -#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a) - -#define PREPARE_FAT_MASKS_1 \ - dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \ - dup_mask[1] = DUP_FAT_MASK(maskBase[1]); - -#define PREPARE_FAT_MASKS_2 \ - PREPARE_FAT_MASKS_1 \ - dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \ - dup_mask[3] = DUP_FAT_MASK(maskBase[3]); - -#define PREPARE_FAT_MASKS_3 \ - PREPARE_FAT_MASKS_2 \ - dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \ - dup_mask[5] = DUP_FAT_MASK(maskBase[5]); - -#define PREPARE_FAT_MASKS_4 \ - PREPARE_FAT_MASKS_3 \ - dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \ - dup_mask[7] = DUP_FAT_MASK(maskBase[7]); - #define PREPARE_FAT_MASKS(n) \ m512 lo_mask = set64x8(0xf); \ - m512 dup_mask[n * 2]; \ - PREPARE_FAT_MASKS_##n + m512 sl_msk[n - 1]; \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh) +#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL) +#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap) +#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh)) #define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ do { \ @@ -389,67 +400,53 @@ do { \ const u8 *tryFloodDetect = a->firstFloodDetect; \ u32 last_match = ones_u32; \ const struct Teddy *teddy = (const struct Teddy *)fdr; \ - const size_t iterBytes = 64; \ + const size_t iterBytes = 32; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 32 - n_sh; \ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ a->buf, a->len, a->start_offset); \ \ - const m256 *maskBase = getMaskBase_fat(teddy); \ + const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \ PREPARE_FAT_MASKS(n_msk); \ const u32 *confBase = getConfBase(teddy); \ \ - const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \ - const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \ - u32 c_0 = 0x100; \ - u32 c_16 = 0x100; \ - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ - if (ptr < mainStart) { \ - ptr = mainStart - 32; \ - m512 p_mask; \ - m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \ - a->buf, buf_end, \ - a->buf_history, a->len_history, n_msk); \ - m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ - r_0 = or512(r_0, p_mask); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ - ptr += 32; \ + u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~((k << 32) | k)); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \ + m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \ } \ \ - if (ptr + 32 <= buf_end) { \ - m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ - ptr += 32; \ - } \ - \ - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ - __builtin_prefetch(ptr + (iterBytes * 4)); \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ CHECK_FLOOD; \ - m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ - m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \ - CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \ - } \ - \ - if (ptr + 32 <= buf_end) { \ - m512 r_0 = 
PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ - ptr += 32; \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ } \ \ - assert(ptr + 32 > buf_end); \ + assert(ptr + loopBytes > buf_end); \ if (ptr < buf_end) { \ - m512 p_mask; \ - m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \ - a->buf_history, a->len_history, n_msk); \ - m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ - r_0 = or512(r_0, p_mask); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \ + m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \ + m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \ } \ \ return HWLM_SUCCESS; \ } while(0) -#else // HAVE_AVX512 +#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ @@ -659,7 +656,7 @@ do { \ return HWLM_SUCCESS; \ } while(0) -#endif // HAVE_AVX512 +#endif // HAVE_AVX512VBMI hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, const struct FDR_Runtime_Args *a, diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 9a1e54a15e0b67436619535f42d2d64c3a3b055a..eae9c2c136b93674606b8b2e145767f70f57e89d 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -353,6 +353,89 @@ void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) { } } +static +void fillDupNibbleMasks(const map> &bucketToLits, + const vector &lits, + u32 numMasks, size_t maskLen, + u8 *baseMsk) { + u32 maskWidth = 2; + memset(baseMsk, 0xff, maskLen); + + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; + const u8 bmsk = 1U << (bucket_id % 8); + + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; + DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); + const u32 sz = verify_u32(l.s.size()); + + // fill in masks + for (u32 j = 0; j < numMasks; j++) { + const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8); + const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8); + const u32 lo_base0 = msk_id_lo * 32; + const u32 lo_base1 = msk_id_lo * 32 + 16; + const u32 hi_base0 = msk_id_hi * 32; + const u32 hi_base1 = msk_id_hi * 32 + 16; + + // if we don't have a char at this position, fill in i + // locations in these masks with '1' + if (j >= sz) { + for (u32 n = 0; n < 16; n++) { + baseMsk[lo_base0 + n] &= ~bmsk; + baseMsk[lo_base1 + n] &= ~bmsk; + baseMsk[hi_base0 + n] &= ~bmsk; + baseMsk[hi_base1 + n] &= ~bmsk; + } + } else { + u8 c = l.s[sz - 1 - j]; + // if we do have a char at this position + const u32 hiShift = 4; + u32 n_hi = (c >> hiShift) & 0xf; + u32 n_lo = c & 0xf; + + if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) { + u8 m = l.msk[l.msk.size() - 1 - j]; + u8 m_hi = (m >> hiShift) & 0xf; + u8 m_lo = m & 0xf; + u8 cmp = l.cmp[l.msk.size() - 1 - j]; + u8 cmp_lo = cmp & 0xf; + u8 
cmp_hi = (cmp >> hiShift) & 0xf; + + for (u8 cm = 0; cm < 0x10; cm++) { + if ((cm & m_lo) == (cmp_lo & m_lo)) { + baseMsk[lo_base0 + cm] &= ~bmsk; + baseMsk[lo_base1 + cm] &= ~bmsk; + } + if ((cm & m_hi) == (cmp_hi & m_hi)) { + baseMsk[hi_base0 + cm] &= ~bmsk; + baseMsk[hi_base1 + cm] &= ~bmsk; + } + } + } else { + if (l.nocase && ourisalpha(c)) { + u32 cmHalfClear = (0xdf >> hiShift) & 0xf; + u32 cmHalfSet = (0x20 >> hiShift) & 0xf; + baseMsk[hi_base0 + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base1 + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base0 + (n_hi | cmHalfSet)] &= ~bmsk; + baseMsk[hi_base1 + (n_hi | cmHalfSet)] &= ~bmsk; + } else { + baseMsk[hi_base0 + n_hi] &= ~bmsk; + baseMsk[hi_base1 + n_hi] &= ~bmsk; + } + baseMsk[lo_base0 + n_lo] &= ~bmsk; + baseMsk[lo_base1 + n_lo] &= ~bmsk; + } + } + } + } +} + static void fillNibbleMasks(const map<BucketIndex, vector<LiteralIndex>> &bucketToLits, @@ -479,14 +562,17 @@ bytecode_ptr<FDR> TeddyCompiler::build() { size_t headerSize = sizeof(Teddy); size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth; + size_t reinforcedDupMaskLen = RTABLE_SIZE * maskWidth; + if (maskWidth == 2) { // dup nibble mask table in Fat Teddy + reinforcedDupMaskLen = maskLen * 2; + } auto floodTable = setupFDRFloodControl(lits, eng, grey); auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); // Note: we place each major structure here on a cacheline boundary. size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + - ROUNDUP_CL(reinforcedMaskLen) + + ROUNDUP_CL(reinforcedDupMaskLen) + ROUNDUP_CL(confirmTable.size()) + floodTable.size(); auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64); @@ -502,7 +588,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() { // Write confirm structures. u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + - ROUNDUP_CL(reinforcedMaskLen); + ROUNDUP_CL(reinforcedDupMaskLen); assert(ISALIGNED_CL(ptr)); teddy->confOffset = verify_u32(ptr - teddy_base); memcpy(ptr, confirmTable.get(), confirmTable.size()); @@ -519,9 +605,16 @@ bytecode_ptr<FDR> TeddyCompiler::build() { fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen, baseMsk); - // Write reinforcement masks. - u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); - fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth); + if (maskWidth == 1) { // reinforcement table in Teddy + // Write reinforcement masks.
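+        // Layout note (illustrative): with maskWidth == 1 the block after
+        // the nibble masks holds the reinforcement table, while for Fat
+        // Teddy (maskWidth == 2) the same region instead holds the
+        // duplicated nibble mask table sized reinforcedDupMaskLen
+        // (2 * maskLen) above; the confirm structures that follow are
+        // located with the same ROUNDUP_CL arithmetic in either case.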
+ u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); + fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth); + } else { // dup nibble mask table in Fat Teddy + assert(maskWidth == 2); + u8 *dupMsk = baseMsk + ROUNDUP_CL(maskLen); + fillDupNibbleMasks(bucketToLits, lits, eng.numMasks, + reinforcedDupMaskLen, dupMsk); + } return fdr; } diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index 730850cb73cbcf6ed973eb1fdbfed5209ee62cf3..b76800eb041129ee362798bba148887d3b850847 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -45,6 +45,16 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; #endif +#if defined(HAVE_AVX512VBMI) +static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; +#endif + #ifdef ARCH_64_BIT #define TEDDY_CONF_TYPE u64a #define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) diff --git a/src/hs.cpp b/src/hs.cpp index ab54105c53675c63d0e62eeb1d83c49cc6feb793..ae9cdf1468d9fff53a61abe2708dd653d392cc14 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -120,9 +120,10 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) { static bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) { - static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM; + static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_ICX; static constexpr u32 HS_CPU_FEATURES_ALL = - HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512; + HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 | + HS_CPU_FEATURES_AVX512VBMI; if (!p) { return true; @@ -513,6 +514,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, return HS_COMPILER_ERROR; } + if (flags & HS_FLAG_COMBINATION) { + *error = generateCompileError("Invalid parameter: unsupported " + "logical combination expression", -1); + return HS_COMPILER_ERROR; + } + *info = nullptr; *error = nullptr; diff --git a/src/hs.h b/src/hs.h index 105919fb8a248427cea890c8e7ba48e74c9153a1..3d3c5cdeac714a80ac9f92a617753285ec044256 100644 --- a/src/hs.h +++ b/src/hs.h @@ -42,8 +42,8 @@ /* The current Hyperscan version information. 
*/ #define HS_MAJOR 5 -#define HS_MINOR 3 -#define HS_PATCH 0 +#define HS_MINOR 4 +#define HS_PATCH 2 #include "hs_compile.h" #include "hs_runtime.h" diff --git a/src/hs_compile.h b/src/hs_compile.h index 081d46387bdc76bad33b96ca18592bb9df8a2abf..5aa24188689e4f10204fe8ebf4ed7612fd053d53 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param info * On success, a pointer to the pattern information will be returned in @@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines @@ -1034,6 +1028,15 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_CPU_FEATURES_AVX512 (1ULL << 3) +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 + * Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI) + * + * Setting this flag indicates that the target platform supports AVX512VBMI + * instructions. Using AVX512VBMI implies the use of AVX512. + */ +#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4) + /** @} */ /** @@ -1114,6 +1117,22 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_TUNE_FAMILY_GLM 8 +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake + * + * This indicates that the compiled database should be tuned for the + * Icelake microarchitecture. + */ +#define HS_TUNE_FAMILY_ICL 9 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake Server + * + * This indicates that the compiled database should be tuned for the + * Icelake Server microarchitecture. 
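+ *
+ * Typically requested by setting tune = HS_TUNE_FAMILY_ICX in
+ * @ref hs_platform_info_t, with @ref HS_CPU_FEATURES_AVX512VBMI
+ * included in cpu_features.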
+ */ +#define HS_TUNE_FAMILY_ICX 10 + /** @} */ /** diff --git a/src/hs_internal.h b/src/hs_internal.h index adf07b22cfc460dad242aafd4ae044e2371d52f6..4eb5e157cbfd8e0c4a14010aa3917df07d94a6b0 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Intel Corporation + * Copyright (c) 2019-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,7 +80,9 @@ extern "C" | HS_FLAG_PREFILTER \ | HS_FLAG_SINGLEMATCH \ | HS_FLAG_ALLOWEMPTY \ - | HS_FLAG_SOM_LEFTMOST) + | HS_FLAG_SOM_LEFTMOST \ + | HS_FLAG_COMBINATION \ + | HS_FLAG_QUIET) #ifdef __cplusplus } /* extern "C" */ diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 7cd53d7cedc9ed45f3c76dc741d16e8c2340d8d5..58ace3b6de1225967d518deea7b7fa268a14032c 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -106,7 +106,7 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - assert(l <= 32); + assert(l <= 16); DEBUG_PRINTF("d %zu\n", d - buf); m128 v = zeroes128(); diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index d41c6f42356875eba9a102b70d8564659ce02631..4479eef1fcb31c04a6db7646e296aa4e1f9fdec6 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -207,6 +207,10 @@ void makeCFG_top_edge(GoughGraph &cfg, const vector<GoughVertex> &vertices, assert(contains(src_slots, slot_id)); shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>(); + if (!vmin) { + assert(0); + throw std::bad_alloc(); + } cfg[e].vars.push_back(vmin); final_var = vmin.get(); @@ -318,6 +322,10 @@ void makeCFG_edge(GoughGraph &cfg, const map &som_creators, DEBUG_PRINTF("bypassing min on join %u\n", slot_id); } else { shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>(); + if (!vmin) { + assert(0); + throw std::bad_alloc(); + } cfg[e].vars.push_back(vmin); final_var = vmin.get(); @@ -1292,7 +1300,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports( *arbReport = MO_INVALID_IDX; assert(!ri->rl.empty()); /* all components should be able to generate reports */ - return move(ri); + return ri; } u32 raw_gough_report_info_impl::getReportListSize() const { diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index bbb266051b5c7b1b0c5d63072a6fcab12bb75b6c..9233ae515efca55536cd0e6c661b5fab3339b3c3 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -85,6 +85,18 @@ namespace ue2 { */ static constexpr u32 NO_STATE = ~0; +/* Maximum number of states taken as a small NFA */ +static constexpr u32 MAX_SMALL_NFA_STATES = 64; + +/* Maximum bounded repeat upper bound to consider as a fast NFA */ +static constexpr u64a MAX_REPEAT_SIZE = 200; + +/* Maximum bounded repeat char reach size to consider as a fast NFA */ +static constexpr u32 MAX_REPEAT_CHAR_REACH = 26; + +/* Minimum bounded repeat trigger distance to consider as a fast NFA */ +static constexpr u8 MIN_REPEAT_TRIGGER_DISTANCE = 6; + namespace { struct precalcAccel { @@ -1910,7 +1922,8 @@ struct Factory { } static - void writeExceptions(const map<ExceptionProto, vector<u32>> &exceptionMap, + void writeExceptions(const build_info &args, + const map<ExceptionProto, vector<u32>> &exceptionMap, const vector<u32> &repeatOffsets, implNFA_t *limex, const u32
exceptionsOffset, const u32 reportListOffset) { @@ -1962,6 +1975,59 @@ struct Factory { limex->exceptionOffset = exceptionsOffset; limex->exceptionCount = ecount; + + if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) { + const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask); + u8 *shufMask = (u8 *)&limex->exceptionShufMask; + u8 *bitMask = (u8 *)&limex->exceptionBitMask; + u8 *andMask = (u8 *)&limex->exceptionAndMask; + + u32 tot_cnt = 0; + u32 pos = 0; + bool valid = true; + size_t tot = sizeof(limex->exceptionMask); + size_t base = 0; + + // We normally have up to 64 exceptions to handle, + // but treat 384-state LimEx models differently to simplify operations + size_t limit = 64; + if (args.num_states > 256 && args.num_states <= 384) { + limit = 48; + } + + for (size_t i = 0; i < tot; i++) { + if (!exceptionMask[i]) { + continue; + } + u32 bit_cnt = popcount32(exceptionMask[i]); + + tot_cnt += bit_cnt; + if (tot_cnt > limit) { + valid = false; + break; + } + + u32 emsk = exceptionMask[i]; + while (emsk) { + u32 t = findAndClearLSB_32(&emsk); + bitMask[pos] = 1U << t; + andMask[pos] = 1U << t; + shufMask[pos++] = i + base; + + if (pos == 32 && + (args.num_states > 128 && args.num_states <= 256)) { + base += 32; + } + } + } + // Avoid matching unused bytes + for (u32 i = pos; i < 64; i++) { + bitMask[i] = 0xff; + } + if (valid) { + setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP); + } + } } static @@ -2287,7 +2353,7 @@ struct Factory { writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset, repeatsOffset); - writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset, + writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset, reportListOffset); writeLimexMasks(args, limex); @@ -2422,6 +2488,68 @@ bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops, } #endif // NDEBUG +static +bool isFast(const build_info &args) { + const NGHolder &h = args.h; + const u32 num_states = args.num_states; + + if (num_states > MAX_SMALL_NFA_STATES) { + return false; + } + + unordered_map<NFAVertex, bool> pos_trigger; + for (u32 i = 0; i < args.repeats.size(); i++) { + const BoundedRepeatData &br = args.repeats[i]; + assert(!contains(pos_trigger, br.pos_trigger)); + pos_trigger[br.pos_trigger] = br.repeatMax <= MAX_REPEAT_SIZE; + } + + // A small NFA without bounded repeats should be fast.
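+    // If no bounded repeats were recorded, the NFA is fast outright.
+    // Otherwise the walk below is a breadth-first pass from the top
+    // vertices: finding a slow repeat (repeatMax > MAX_REPEAT_SIZE) with
+    // char reach wider than MAX_REPEAT_CHAR_REACH within
+    // MIN_REPEAT_TRIGGER_DISTANCE steps disqualifies the NFA; repeats
+    // further away than that are ignored by this heuristic.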
+ if (pos_trigger.empty()) { + return true; + } + + vector<NFAVertex> cur; + unordered_set<NFAVertex> visited; + for (const auto &m : args.tops) { + for (NFAVertex v : m.second) { + cur.push_back(v); + visited.insert(v); + } + } + + u8 pos_dist = 0; + while (!cur.empty()) { + vector<NFAVertex> next; + for (const auto &v : cur) { + if (contains(pos_trigger, v)) { + const CharReach &cr = h[v].char_reach; + if (!pos_trigger[v] && cr.count() > MAX_REPEAT_CHAR_REACH) { + return false; + } + } + for (const auto &w : adjacent_vertices_range(v, h)) { + if (w == v) { + continue; + } + u32 j = args.state_ids.at(w); + if (j == NO_STATE) { + continue; + } + if (!contains(visited, w)) { + next.push_back(w); + visited.insert(w); + } + } + } + if (++pos_dist >= MIN_REPEAT_TRIGGER_DISTANCE) { + break; + } + swap(cur, next); + } + return true; +} + static u32 max_state(const unordered_map<NFAVertex, u32> &state_ids) { u32 rv = 0; @@ -2442,7 +2570,7 @@ bytecode_ptr<NFA> generate(NGHolder &h, const unordered_map<NFAVertex, NFAStateSet> &squashMap, const map<u32, set<NFAVertex>> &tops, const set<NFAVertex> &zombies, bool do_accel, - bool stateCompression, u32 hint, + bool stateCompression, bool &fast, u32 hint, const CompileContext &cc) { const u32 num_states = max_state(states) + 1; DEBUG_PRINTF("total states: %u\n", num_states); @@ -2497,6 +2625,7 @@ bytecode_ptr<NFA> generate(NGHolder &h, if (nfa) { DEBUG_PRINTF("successful build with NFA engine: %s\n", nfa_type_name(limex_model)); + fast = isFast(arg); return nfa; } } diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h index a08e0ae562bba3e3b8d0bc993b2e366a4739d7b5..4afdcdb3e41c528fe2214ddd2f682d215d3c4829 100644 --- a/src/nfa/limex_compile.h +++ b/src/nfa/limex_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,6 +78,7 @@ bytecode_ptr<NFA> generate(NGHolder &g, const std::set<NFAVertex> &zombies, bool do_accel, bool stateCompression, + bool &fast, u32 hint, const CompileContext &cc); diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 57746c9137dd28b076d754af6a475ec2cda9543e..8304215f145ce477c6b42abe3790f9d0bccc1896 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,8 @@ #define AND_STATE JOIN(and_, STATE_T) #define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b))) #define OR_STATE JOIN(or_, STATE_T) +#define EXPAND_STATE JOIN(expand_, STATE_T) +#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T) #define TESTBIT_STATE JOIN(testbit_, STATE_T) #define EXCEPTION_T JOIN(struct NFAException, SIZE) #define CONTEXT_T JOIN(NFAContext, SIZE) @@ -208,7 +210,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, /** \brief Process all of the exceptions associated with the states in the \a * estate.
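 *
 * On AVX512VBMI builds of models wider than 64 states, a LimEx compiled
 * with LIMEX_FLAG_EXTRACT_EXP gathers its active exception bits with a
 * byte expand/shuffle/compare against the precomputed exception masks;
 * all other cases scan the estate in GPR-sized chunks as before.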
*/ static really_inline -int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, +int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { assert(diffmask > 0); // guaranteed by caller macro @@ -233,6 +235,72 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, ctx->local_succ = ZERO_STATE; #endif + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + +#if defined(HAVE_AVX512VBMI) && SIZE > 64 + if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) { + m512 emask = EXPAND_STATE(*STATE_ARG_P); + emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask); + emask = and512(emask, load_m512(&limex->exceptionAndMask)); + u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask)); + + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + const EXCEPTION_T *e = &exceptions[bit]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } else { + // A copy of the estate as an array of GPR-sized chunks. + CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); + } +#else // A copy of the estate as an array of GPR-sized chunks. 
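    // (An identical copy of this chunk-based scan also serves as the
    // runtime fallback inside the AVX512VBMI build whenever
    // LIMEX_FLAG_EXTRACT_EXP is not set for the compiled LimEx.)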
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; @@ -243,9 +311,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #endif memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); - struct proto_cache new_cache = {0, NULL}; - enum CacheResult cacheable = CACHE_RESULT; - u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; base_index[0] = 0; for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { @@ -276,6 +341,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, } } while (word); } while (diffmask); +#endif #ifndef BIG_MODEL *succ = OR_STATE(*succ, local_succ); @@ -307,6 +373,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #undef AND_STATE #undef EQ_STATE #undef OR_STATE +#undef EXPAND_STATE +#undef SHUFFLE_BYTE_STATE #undef TESTBIT_STATE #undef PE_FN #undef RUN_EXCEPTION_FN diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index e3e47e741600f340fa93337bdacf5b5c13486bd3..23b1bd970713176242c14f1e57208a82d2455e0a 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -86,6 +86,7 @@ #define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */ #define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */ #define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */ +#define LIMEX_FLAG_EXTRACT_EXP 8 /**< use limex exception bit extraction */ enum LimExTrigger { LIMEX_TRIGGER_NONE = 0, @@ -118,7 +119,7 @@ struct NFAException##size { \ u32 repeatOffset; /**< offset to NFARepeatInfo, or MO_INVALID_IDX */ \ u8 hasSquash; /**< from enum LimExSquash */ \ u8 trigger; /**< from enum LimExTrigger */ \ -}__attribute__ ((aligned (16))); \ +}; \ \ struct LimExNFA##size { \ u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ @@ -157,6 +158,9 @@ struct LimExNFA##size { \ u_##size shift[MAX_SHIFT_COUNT]; \ u32 shiftCount; /**< number of shift masks used */ \ u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ + m512 exceptionShufMask; /**< exception byte shuffle mask */ \ + m512 exceptionBitMask; /**< exception bit mask */ \ + m512 exceptionAndMask; /**< exception and mask */ \ }; CREATE_NFA_LIMEX(32) diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index c1a4f87fc026db807529bad6a6d603c93d17e643..5bfdf9e117a7d8b3b6b91798518b1ac4b9759e9f 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -463,7 +463,7 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports( *isSingleReport = 0; } - return move(ri); + return ri; } u32 raw_report_info_impl::getReportListSize() const { @@ -1082,7 +1082,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, // Use the daddy already set for this state so long as it isn't already // a Sherman state.
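    // If the existing daddy is a wide state, the rewritten check below
    // returns early rather than hinting it for reuse.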
dstate_id_t daddy = currState.daddy; - if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) { + if (info.is_widestate(daddy)) { + return; + } else if (!info.is_sherman(daddy)) { hinted.insert(currState.daddy); } else { // Fall back to granddaddy, which has already been processed (due @@ -1477,6 +1479,7 @@ bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, bytecode_ptr nfa; if (!using8bit) { + // Wide state optimization if (cc.grey.allowWideStates && strat.getType() == McClellan && !is_triggered(raw.kind)) { find_wide_state(info); @@ -1486,19 +1489,22 @@ bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, bool any_cyclic_near_anchored_state = is_cyclic_near(raw, raw.start_anchored); - for (u32 i = 0; i < info.size(); i++) { - if (info.is_widestate(i)) { - continue; + // Sherman optimization + if (info.impl_alpha_size > 16) { + for (u32 i = 0; i < info.size(); i++) { + if (info.is_widestate(i)) { + continue; + } + find_better_daddy(info, i, using8bit, + any_cyclic_near_anchored_state, + trust_daddy_states, cc.grey); + total_daddy += info.extra[i].daddytaken; } - find_better_daddy(info, i, using8bit, - any_cyclic_near_anchored_state, - trust_daddy_states, cc.grey); - total_daddy += info.extra[i].daddytaken; - } - DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, - info.size() * info.impl_alpha_size, info.size(), - info.impl_alpha_size); + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } nfa = mcclellanCompile16(info, cc, accel_states); } else { diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 4619ff6fdf6a44c166c54ba2d488ac4937af9628..22cac119fb7996022272dfc4d9870e956f867676 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1184,7 +1184,7 @@ char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { static char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, - ReportID report) { + ReportID report) { assert(m && aux); if (!aux->accept) { @@ -1405,3 +1405,1332 @@ char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, *(u16 *)dest = unaligned_load_u16(src); return 0; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) { + const struct mstate_aux *aux = get_aux64(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const 
struct mstate_aux *aux = get_aux64(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m512 s = set64x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + + const m512 *masks = m->sheng_succ_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
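+     *
+     * For example (illustrative): state 3 duplicated across the lanes
+     * reads back as 0x03030303, and a stop limit of 5 scales to
+     * 0x05050505, so a single u32 compare tests every copy at once.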
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m512 simd_stop_limit = set16x32(sheng_stop_limit_x4); + m512 accel_delta = set64x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG64_SINGLE_ITER do { \ + m512 succ_mask = masks[*(c++)]; \ + s = vpermb512(s, succ_mask); \ + u32 s_gpr_x4 = movd512(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + + m512 succ_mask0 = load512((const char *)masks + cc0); + s = vpermb512(s, succ_mask0); + m512 s_max = s; + m512 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s)); + +#define SHENG64_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 6); \ + m512 succ_mask##iter = load512((const char *)masks + cc##iter); \ + s = vpermb512(s, succ_mask##iter); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m512 s_temp = sadd_u8_m512(s, accel_delta); \ + s_max = max_u8_m512(s_max, s_temp); \ + } else { \ + s_max = max_u8_m512(s_max, s); \ + } \ + m512 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \ + movd512(s), movd512(s_max)); + + SHENG64_SINGLE_UNROLL_ITER(1); + SHENG64_SINGLE_UNROLL_ITER(2); + SHENG64_SINGLE_UNROLL_ITER(3); + SHENG64_SINGLE_UNROLL_ITER(4); + SHENG64_SINGLE_UNROLL_ITER(5); + SHENG64_SINGLE_UNROLL_ITER(6); + SHENG64_SINGLE_UNROLL_ITER(7); + + if (movd512(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
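+             * If only the final iteration crossed the limit, there is no
+             * need to blend all eight s_max values below to locate the
+             * earliest stop; the state after the whole chunk answers it.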
*/ + if (movd512(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq512(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m512 blended = rshift64_m512(s_max0, 56); + blended = xor512(blended, rshift64_m512(s_max1, 48)); + blended = xor512(blended, rshift64_m512(s_max2, 40)); + blended = xor512(blended, rshift64_m512(s_max3, 32)); + blended = xor512(blended, rshift64_m512(s_max4, 24)); + blended = xor512(blended, rshift64_m512(s_max5, 16)); + blended = xor512(blended, rshift64_m512(s_max6, 8)); + blended = xor512(blended, s); + blended = xor512(blended, rshift64_m512(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq512(blended)); + + m512 final = min_u8_m512(blended, simd_stop_limit); + m512 cmp = sub_u8_m512(final, simd_stop_limit); + m128 tmp = cast512to128(cmp); + u64a stops = ~movemask128(tmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq512(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG64_SINGLE_ITER; // fallthrough + case 6: + SHENG64_SINGLE_ITER; // fallthrough + case 5: + SHENG64_SINGLE_ITER; // fallthrough + case 4: + SHENG64_SINGLE_ITER; // fallthrough + case 3: + SHENG64_SINGLE_ITER; // fallthrough + case 2: + SHENG64_SINGLE_ITER; // fallthrough + case 1: + SHENG64_SINGLE_ITER; // fallthrough + } + } + + assert(c >= soft_c_end); + + s_gpr = movq512(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState64(UNUSED const struct mcsheng64 *m, + const char *sherman_base_offset, + u32 sherman_base, u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel64(const struct mcsheng64 *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, 
c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng64)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + s &= STATE_MASK; + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState64(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + 
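+        // Accelerated scanning: when the current state allows it, hand
+        // off to run_mcsheng_accel64() to skip ahead, then drop back to
+        // the cautious without_accel loop until the scan catches up with
+        // min_accel_offset again.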
assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal64_16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng64)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). 
As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal64_8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + 
&cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng64 *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux64(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux64(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux64(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a 
offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} +#endif diff --git a/src/nfa/mcsheng.h b/src/nfa/mcsheng.h index 19fd69614d11cb92823b49ca6eb859ceea539231..0329e12128bfe0d9c24ebc5e54865a4fb1edf4f0 100644 --- a/src/nfa/mcsheng.h +++ b/src/nfa/mcsheng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * 
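/* EDITOR'S NOTE: illustrative aside, not part of the patch. For these
 * engines the entire stream state is the current state id (1 byte for the
 * 8-bit variant, 2 for the 16-bit one), so queueCompressState/expandState
 * reduce to a copy; the 16-bit side uses unaligned stores/loads because
 * stream state carries no alignment guarantee. Round-trip sketch with toy
 * names: */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void toy_compress16(uint8_t *stream, const uint16_t *scratch) {
    memcpy(stream, scratch, sizeof(*scratch)); /* unaligned-safe store */
}

static void toy_expand16(uint16_t *scratch, const uint8_t *stream) {
    memcpy(scratch, stream, sizeof(*scratch)); /* unaligned-safe load */
}

static void toy_roundtrip_check(uint16_t s) {
    uint8_t stream[sizeof(uint16_t)];
    uint16_t out;
    toy_compress16(stream, &s);
    toy_expand16(&out, stream);
    assert(out == s); /* compress/expand must be lossless */
}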
Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,5 +80,78 @@ char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, #define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL #define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#if defined(HAVE_AVX512VBMI) +/* 64-8 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 64-16 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#else // !HAVE_AVX512VBMI +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL + +#define 
nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL + +#endif //end of HAVE_AVX512VBMI #endif diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp index 871ca4fb17b229b739f4d5b602af2375f56b62e1..fb75e49a352b4237df5f4ffa47e96b2dac3a8a96 100644 --- a/src/nfa/mcsheng_compile.cpp +++ b/src/nfa/mcsheng_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,7 +64,6 @@ #include #include #include - #include using namespace std; @@ -244,6 +243,106 @@ void populateBasicInfo(size_t state_size, const dfa_info &info, } } +static +mstate_aux *getAux64(NFA *n, dstate_id_t i) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(n); + mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset); + + mstate_aux *aux = aux_base + i; + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void createShuffleMasks64(mcsheng64 *m, const dfa_info &info, + dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end); + assert(sheng_end > DEAD_STATE + 1); + assert(sheng_end <= sizeof(m512) + 1); + vector> masks; + masks.resize(info.alpha_size); + /* -1 to avoid wasting a slot as we do not include dead state */ + vector raw_ids; + raw_ids.resize(sheng_end - 1); + for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) { + assert(info.implId(s)); /* should not map to DEAD_STATE */ + if (info.is_sheng(s)) { + raw_ids[info.extra[s].sheng_id] = s; + } + } + for (u32 i = 0; i < info.alpha_size; i++) { + if (i == info.alpha_remap[TOP]) { + continue; + } + auto &mask = masks[i]; + assert(sizeof(mask) == sizeof(m512)); + mask.fill(0); + + for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) { + dstate_id_t raw_id = raw_ids[sheng_id]; + dstate_id_t next_id = info.implId(info.states[raw_id].next[i]); + if (next_id == DEAD_STATE) { + next_id = sheng_end - 1; + } else if (next_id < sheng_end) { + next_id--; + } + DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id); + mask[sheng_id] = verify_u8(next_id); + } + } + for (u32 i = 0; i < N_CHARS; i++) { + assert(info.alpha_remap[i] != info.alpha_remap[TOP]); + memcpy((u8 *)&m->sheng_succ_masks[i], + (u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512)); + } + m->sheng_end = sheng_end; + m->sheng_accel_limit = sheng_end - 1; + + for (dstate_id_t s : raw_ids) { + if (contains(accel_escape_info, s)) { + LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id); + } + } +} + +static +void populateBasicInfo64(size_t state_size, const dfa_info &info, + u32 total_size, u32 aux_offset, u32 accel_offset, + u32 accel_count, ReportID arb, bool single, NFA *nfa) { + 
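/* EDITOR'S NOTE: illustrative aside, not part of the patch.
 * createShuffleMasks64() above emits, per input character, a 64-entry byte
 * table in which entry s is the next sheng state for current state s;
 * storing each table as an m512 lets the runtime apply it to 64 byte lanes
 * at once with an AVX-512 VBMI byte permute. The scalar equivalent of one
 * step is a plain table lookup (toy names): */
#include <stdint.h>

/* succ[c][s]: next sheng state on character class c from state s */
static uint8_t toy_sheng_step(const uint8_t succ[256][64], uint8_t state,
                              uint8_t c) {
    return succ[c][state & 0x3f]; /* 64 states: only 6 state bits matter */
}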
assert(state_size == sizeof(u16) || state_size == sizeof(u8)); + + nfa->length = total_size; + nfa->nPositions = info.states.size(); + + nfa->scratchStateSize = verify_u32(state_size); + nfa->streamStateSize = verify_u32(state_size); + + if (state_size == sizeof(u8)) { + nfa->type = MCSHENG_64_NFA_8; + } else { + nfa->type = MCSHENG_64_NFA_16; + } + + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + for (u32 i = 0; i < 256; i++) { + m->remap[i] = verify_u8(info.alpha_remap[i]); + } + m->alphaShift = info.getAlphaShift(); + m->length = total_size; + m->aux_offset = aux_offset; + m->accel_offset = accel_offset; + m->arb_report = arb; + m->state_count = verify_u16(info.size()); + m->start_anchored = info.implId(info.raw.start_anchored); + m->start_floating = info.implId(info.raw.start_floating); + m->has_accel = accel_count ? 1 : 0; + + if (single) { + m->flags |= MCSHENG_FLAG_SINGLE; + } +} + static size_t calcShermanRegionSize(const dfa_info &info) { size_t rv = 0; @@ -272,7 +371,7 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, /* returns false on error */ static bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end, - dstate_id_t *sherman_base) { + dstate_id_t *sherman_base) { info.states[0].impl_id = 0; /* dead is always 0 */ vector norm; @@ -382,6 +481,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { } #define MAX_SHENG_STATES 16 +#define MAX_SHENG64_STATES 64 #define MAX_SHENG_LEAKINESS 0.05 using LeakinessCache = ue2_unordered_map, double>; @@ -435,7 +535,8 @@ double leakiness(const RdfaGraph &g, dfa_info &info, static dstate_id_t find_sheng_states(dfa_info &info, - map &accel_escape_info) { + map &accel_escape_info, + size_t max_sheng_states) { RdfaGraph g(info.raw); auto cyclics = find_vertices_in_cycles(g); @@ -470,7 +571,7 @@ dstate_id_t find_sheng_states(dfa_info &info, flat_set considered = { DEAD_STATE }; bool seen_back_edge = false; while (!to_consider.empty() - && sheng_states.size() < MAX_SHENG_STATES) { + && sheng_states.size() < max_sheng_states) { auto v = to_consider.front(); to_consider.pop_front(); if (!considered.insert(g[v].index).second) { @@ -616,6 +717,80 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info, } } +static +void fill_in_aux_info64(NFA *nfa, const dfa_info &info, + const map &accel_escape_info, + u32 accel_offset, UNUSED u32 accel_end_offset, + const vector &reports, + const vector &reports_eod, + u32 report_base_offset, + const raw_report_info &ri) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + + vector reportOffsets; + + ri.fillReportLists(nfa, report_base_offset, reportOffsets); + + for (u32 i = 0; i < info.size(); i++) { + u16 impl_id = info.implId(i); + mstate_aux *this_aux = getAux64(nfa, impl_id); + + fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets); + if (contains(accel_escape_info, i)) { + this_aux->accel_offset = accel_offset; + accel_offset += info.strat.accelSize(); + assert(accel_offset <= accel_end_offset); + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + info.strat.buildAccel(i, accel_escape_info.at(i), + (void *)((char *)m + this_aux->accel_offset)); + } + } +} + +static +u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) { + mstate_aux *aux = getAux64(nfa, target_impl_id); + u16 flags = 0; + + if (aux->accept) { + flags |= ACCEPT_FLAG; + } + + if (aux->accel_offset) { + flags |= ACCEL_FLAG; + } + + return flags; +} + +static +void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info, + dstate_id_t 
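/* EDITOR'S NOTE: illustrative aside, not part of the patch.
 * populateBasicInfo64() above copies remap[] and alphaShift into the engine
 * header; at run time the flat successor table is indexed as
 * (state << alphaShift) + remap[byte], with remap[] folding the 256 input
 * bytes into the character classes the DFA actually distinguishes. Sketch
 * with toy names: */
#include <stdint.h>

static uint16_t toy_next_state(const uint16_t *succ_table,
                               const uint8_t remap[256], uint32_t alpha_shift,
                               uint16_t state, uint8_t byte) {
    return succ_table[((uint32_t)state << alpha_shift) + remap[byte]];
}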
sheng_end, + UNUSED dstate_id_t sherman_base) { + u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64)); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end || info.is_sherman(i)); + continue; + } + + assert(info.implId(i) < sherman_base); + u16 normal_id = verify_u16(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s]; + + entry = info.implId(raw_succ); + entry |= get_edge_flags64(nfa, entry); + } + } +} + #define MAX_SHERMAN_LIST_LEN 8 static @@ -842,17 +1017,20 @@ bytecode_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, assert(info.getAlphaShift() <= 8); - u16 total_daddy = 0; - for (u32 i = 0; i < info.size(); i++) { - find_better_daddy(info, i, - is_cyclic_near(info.raw, info.raw.start_anchored), - grey); - total_daddy += info.extra[i].daddytaken; - } + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } - DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, - info.size() * info.impl_alpha_size, info.size(), - info.impl_alpha_size); + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } u16 sherman_limit; if (!allocateImplId16(info, sheng_end, &sherman_limit)) { @@ -931,6 +1109,160 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info, } } +static +void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) { + char *nfa_base = (char *)nfa; + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + char *sherman_table = nfa_base + m->sherman_offset; + + assert(ISALIGNED_16(sherman_table)); + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_sherman(i)) { + continue; + } + u16 fs = verify_u16(info.implId(i)); + DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs); + + assert(fs >= sherman_limit); + + char *curr_sherman_entry + = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE; + assert(curr_sherman_entry <= nfa_base + m->length); + + u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken); + assert(len <= 9); + dstate_id_t d = info.states[i].daddy; + + *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE; + *(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len; + *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d); + u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET); + + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + *(chars++) = (u8)s; + } + } + + u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len)); + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs, + info.implId(d), + info.implId(info.states[i].next[s])); + u16 entry_val = info.implId(info.states[i].next[s]); + entry_val |= get_edge_flags64(nfa, entry_val); + unaligned_store_u16((u8 *)states++, entry_val); + } + } + } +} + +static +bytecode_ptr mcsheng64Compile16(dfa_info&info, dstate_id_t sheng_end, + const map&accel_escape_info, + 
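/* EDITOR'S NOTE: illustrative aside, not part of the patch.
 * fill_in_succ_table_64_16() above ORs get_edge_flags64() into each 16-bit
 * successor entry so the scan loop can detect accept/accel targets without
 * a second lookup, recovering the state id with a mask. The packing in
 * isolation; the TOY_* bit positions are invented here (the real
 * ACCEPT_FLAG / ACCEL_FLAG / STATE_MASK live in the shared mcclellan
 * headers): */
#include <stdint.h>

#define TOY_ACCEPT_FLAG 0x8000u
#define TOY_ACCEL_FLAG  0x4000u
#define TOY_STATE_MASK  0x3fffu

static uint16_t toy_pack_edge(uint16_t state, int accept, int accel) {
    uint16_t e = state;
    if (accept) e |= TOY_ACCEPT_FLAG;
    if (accel)  e |= TOY_ACCEL_FLAG;
    return e;
}

static uint16_t toy_edge_state(uint16_t entry) {
    return entry & TOY_STATE_MASK; /* strip the flag bits */
}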
const Grey &grey) { + DEBUG_PRINTF("building mcsheng 64-16\n"); + + vector reports; /* index in ri for the appropriate report list */ + vector reports_eod; /* as above */ + ReportID arb; + u8 single; + + assert(info.getAlphaShift() <= 8); + + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } + + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } + + u16 sherman_limit; + if (!allocateImplId16(info, sheng_end, &sherman_limit)) { + DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", + info.size()); + return nullptr; + } + u16 count_real_states = sherman_limit - sheng_end; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) + * count_real_states; + + size_t aux_size = sizeof(mstate_aux) * info.size(); + + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); + size_t sherman_size = calcShermanRegionSize(info); + + size_t total_size = sherman_offset + sherman_size; + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + /* copy in the mc header information */ + m->sherman_offset = sherman_offset; + m->sherman_end = total_size; + m->sherman_limit = sherman_limit; + + DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end, + count_real_states, info.size()); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + sherman_offset - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit); + + fill_in_sherman64(nfa.get(), info, sherman_limit); + + return nfa; +} + +static +void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end) { + u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng64); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + assert(!info.is_sherman(i)); + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end); + continue; + } + u8 normal_id = verify_u8(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + succ_table[((size_t)normal_id << alphaShift) + s] + = info.implId(raw_succ); + } + } +} + static void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, const map &accel_escape_info, @@ -1028,6 +1360,58 @@ bytecode_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, return nfa; } +static +bytecode_ptr mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("building mcsheng 64-8\n"); + + vector 
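/* EDITOR'S NOTE: illustrative aside, not part of the patch.
 * mcsheng64Compile16() above computes one contiguous layout: headers,
 * successor table, aux array, report lists, accel entries, then the sherman
 * region, each region start rounded up to its alignment. A minimal model of
 * the offset arithmetic (toy names; alignments must be powers of two): */
#include <stddef.h>

static size_t toy_roundup(size_t x, size_t a) { /* a: power of two */
    return (x + a - 1) & ~(a - 1);
}

static size_t toy_layout(size_t hdr, size_t tran, size_t aux, size_t reports,
                         size_t accel, size_t sherman,
                         size_t *aux_off, size_t *accel_off,
                         size_t *sherman_off) {
    *aux_off = toy_roundup(hdr + tran, 16);
    *accel_off = toy_roundup(*aux_off + aux + reports, 32);
    *sherman_off = toy_roundup(*accel_off + accel, 16);
    return *sherman_off + sherman; /* total allocation size */
}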
reports; + vector reports_eod; + ReportID arb; + u8 single; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t normal_count = info.size() - sheng_end; + + size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count; + size_t aux_size = sizeof(mstate_aux) * info.size(); + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t total_size = accel_offset + accel_size; + + DEBUG_PRINTF("aux_size %zu\n", aux_size); + DEBUG_PRINTF("aux_offset %zu\n", aux_offset); + DEBUG_PRINTF("rl size %u\n", ri->getReportListSize()); + DEBUG_PRINTF("accel_size %zu\n", accel_size); + DEBUG_PRINTF("accel_offset %zu\n", accel_offset); + DEBUG_PRINTF("total_size %zu\n", total_size); + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8, + &m->accept_limit_8); + + populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + total_size - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_8(nfa.get(), info, sheng_end); + DEBUG_PRINTF("rl size %zu\n", ri->size()); + + return nfa; +} + bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm) { if (!cc.grey.allowMcSheng) { @@ -1047,19 +1431,83 @@ bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, map accel_escape_info = info.strat.getAccelInfo(cc.grey); + auto old_states = info.states; + dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES); - dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info); if (sheng_end <= DEAD_STATE + 1) { + info.states = old_states; return nullptr; } bytecode_ptr nfa; + if (!using8bit) { nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey); } else { nfa = mcshengCompile8(info, sheng_end, accel_escape_info); } + if (!nfa) { + info.states = old_states; + return nfa; + } + + if (has_eod_reports) { + nfa->flags |= NFA_ACCEPTS_EOD; + } + + DEBUG_PRINTF("compile done\n"); + return nfa; +} + +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm) { + if (!cc.grey.allowMcSheng) { + return nullptr; + } + + if (!cc.target_info.has_avx512vbmi()) { + DEBUG_PRINTF("McSheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); + return nullptr; + } + + mcclellan_build_strat mbs(raw, rm, false); + dfa_info info(mbs); + bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + + bool has_eod_reports = raw.hasEodReports(); + + map accel_escape_info + = info.strat.getAccelInfo(cc.grey); + bool using64state = false; /*default flag*/ + dstate_id_t sheng_end64; + sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES); + + if (sheng_end64 <= DEAD_STATE + 1) { + return nullptr; + } else { + using64state 
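/* EDITOR'S NOTE: illustrative aside, not part of the patch.
 * mcshengCompile64() above declines (returns nullptr) when the compile
 * target lacks AVX-512 VBMI, so a caller can attempt the 64-state engine
 * first and fall back to the classic one. The selection pattern with toy
 * stand-ins: */
struct toy_engine { int kind; };

static int toy_compile_wide(struct toy_engine *out, int has_vbmi) {
    if (!has_vbmi) {
        return 0;                 /* mirror the nullptr early-out above */
    }
    out->kind = 64;
    return 1;
}

static int toy_compile_classic(struct toy_engine *out) {
    out->kind = 16;
    return 1;
}

static void toy_build(struct toy_engine *out, int has_vbmi) {
    if (!toy_compile_wide(out, has_vbmi)) {
        toy_compile_classic(out); /* graceful fallback */
    }
}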
= true; + } + + bytecode_ptr nfa; + + if (using64state) { + assert((sheng_end64 > 17) && (sheng_end64 <= 65)); + if (!using8bit) { + nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey); + } else { + assert(using8bit); + nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info); + assert(nfa); + assert(nfa->type == MCSHENG_64_NFA_8); + } + } + if (!nfa) { return nfa; } diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h index 487ab45f4fcd4cd02574d6bd75685b7638012213..3a79b46a23b8a6388a0f19701e0637354d3f7fb3 100644 --- a/src/nfa/mcsheng_compile.h +++ b/src/nfa/mcsheng_compile.h @@ -42,7 +42,8 @@ struct raw_dfa; bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm); - +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm); bool has_accel_mcsheng(const NFA *nfa); } // namespace ue2 diff --git a/src/nfa/mcsheng_data.c b/src/nfa/mcsheng_data.c index eaf3cbbb3ee99bb18c0879c3279e499ee797d54a..0701b4b3130d88cc821648b97cc2248ba030f6f0 100644 --- a/src/nfa/mcsheng_data.c +++ b/src/nfa/mcsheng_data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,3 +41,15 @@ const u64a mcsheng_pext_mask[8] = { 0x00ff00000000000f, 0xff0000000000000f, }; +#if defined(HAVE_AVX512VBMI) +const u64a mcsheng64_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff3f, + 0x0000000000ff003f, + 0x00000000ff00003f, + 0x000000ff0000003f, + 0x0000ff000000003f, + 0x00ff00000000003f, + 0xff0000000000003f, +}; +#endif diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp index 2b56307999d84dfc88224f3ffb64343988f63242..7cef82f4d80a34e204677c45e520d4112c27f85d 100644 --- a/src/nfa/mcsheng_dump.cpp +++ b/src/nfa/mcsheng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -174,6 +174,124 @@ void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) { } } +static +const mstate_aux *getAux64(const NFA *n, dstate_id_t i) { + auto *m = (const mcsheng64 *)getImplNfa(n); + auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset); + + const mstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void next_states64(const NFA *n, u16 s, u16 *t) { + const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n); + const mstate_aux *aux = getAux64(n, s); + const u32 as = m->alphaShift; + assert(s != DEAD_STATE); + + if (s < m->sheng_end) { + for (u16 c = 0; c < N_CHARS; c++) { + u8 sheng_s = s - 1; + auto trans_for_c = (const char *)&m->sheng_succ_masks[c]; + assert(sheng_s < sizeof(m512)); + u8 raw_succ = trans_for_c[sheng_s]; + if (raw_succ == m->sheng_end - 1) { + t[c] = DEAD_STATE; + } else if (raw_succ < m->sheng_end) { + t[c] = raw_succ + 1; + } else { + t[c] = raw_succ; + } + } + } else if (n->type == MCSHENG_64_NFA_8) { + const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + } else { + u16 base_s = s; + const char *winfo_base = (const char *)n + 
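/* EDITOR'S NOTE: illustrative aside, not part of the patch. The assert
 * above pins the band in which the 64-lane engine is expected to be built:
 * more sheng states than the classic 16-state head could hold
 * (sheng_end64 > 17; state 0 is the dead state) while still fitting a
 * 64-lane shuffle (sheng_end64 <= 65). As a plain predicate: */
static int toy_fits_sheng64(unsigned sheng_end) {
    return sheng_end > 17 && sheng_end <= 65;
}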
m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit); + + if (s >= m->sherman_limit) { + base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET); + assert(base_s >= m->sheng_end); + } + + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = base_s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + + if (s >= m->sherman_limit) { + UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base); + const char *chars = state_base + SHERMAN_CHARS_OFFSET; + const u16 *states = (const u16 *)(state_base + + SHERMAN_STATES_OFFSET(len)); + + for (u8 i = 0; i < len; i++) { + for (u16 c = 0; c < N_CHARS; c++) { + if (m->remap[c] == chars[i]) { + t[c] = unaligned_load_u16((const u8*)&states[i]); + } + } + } + } + + for (u16 c = 0; c < N_CHARS; c++) { + t[c] &= STATE_MASK; + } + + } + + t[TOP] = aux->top & STATE_MASK; +} + +static +void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ ", i, t[s]); + if (i < m->sheng_end && t[s] < m->sheng_end) { + fprintf(f, "color = red, fontcolor = red "); + } + fprintf(f, "label = \""); + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} + static void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { switch(accel->accel_type) { @@ -256,6 +374,66 @@ void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) { } +static +void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) { + const mstate_aux *aux = getAux64(n, i); + + bool isSherman = m->sherman_limit && i >= m->sherman_limit; + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u%s\" ]; \n", i, i, isSherman ? 
"w":""); + + if (aux->accel_offset) { + dumpAccelDot(f, i, (const union AccelAux *) + ((const char *)m + aux->accel_offset)); + } + + if (i && i < m->sheng_end) { + fprintf(f, "%u [color = red, fontcolor = red]; \n", i); + } + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && aux->top != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top); + } + + if (i == m->start_anchored) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == m->start_floating) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } + + if (isSherman) { + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit); + assert(state_base < (const char *)m + m->length - sizeof(NFA)); + UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i); + u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET); + if (daddy) { + fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n", + i, daddy); + } + } + + if (i && i < m->sheng_end) { + fprintf(f, "subgraph cluster_sheng { %u } \n", i); + } + +} + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -392,6 +570,131 @@ void dump_text_8(const NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +static +void dump64_dot_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dump64_dot_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) { + fprintf(f, "\n"); + fprintf(f, "Acceleration\n"); + fprintf(f, "------------\n"); + + for (u16 i = 0; i < m->state_count; i++) { + if (!aux[i].accel_offset) { + continue; + } + + auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset); + fprintf(f, "%05hu ", i); + dumpAccelInfo(f, *accel); + } +} + +static +void describeAlphabet64(FILE *f, const mcsheng64 *m) { + map rev; + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].clear(); + } + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].set(i); + } + + map::const_iterator it; + fprintf(f, "\nAlphabet\n"); + for (it = rev.begin(); it != rev.end(); ++it) { + fprintf(f, "%3hhu: ", it->first); + describeClass(f, it->second, 10240, CC_OUT_TEXT); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +static +void dumpCommonHeader64(FILE *f, const mcsheng64 *m) { + fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report, + m->state_count, m->length); + fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored, + m->start_floating); + fprintf(f, "single accept: %d, has_accel: %d\n", + !!(int)m->flags & MCSHENG_FLAG_SINGLE, m->has_accel); + fprintf(f, "sheng_end: %hu\n", m->sheng_end); + fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit); +} + +static +void dump64_text_8(const NFA *nfa, FILE *f) { + 
auto m = (const mcsheng64 *)getImplNfa(nfa); + auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-8\n"); + dumpCommonHeader64(f, m); + fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8, + m->accept_limit_8); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + +static +void dump64_text_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-16\n"); + dumpCommonHeader64(f, m); + fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit, + (int)m->sherman_end); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCSHENG_NFA_16); dump_text_16(nfa, StdioFile(base + ".txt", "w")); @@ -404,4 +707,16 @@ void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) { dump_dot_8(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecMcSheng64_16_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == MCSHENG_64_NFA_16); + dump64_text_16(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_16(nfa, StdioFile(base + ".dot", "w")); +} + +void nfaExecMcSheng64_8_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == MCSHENG_64_NFA_8); + dump64_text_8(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_8(nfa, StdioFile(base + ".dot", "w")); +} + } // namespace ue2 diff --git a/src/nfa/mcsheng_dump.h b/src/nfa/mcsheng_dump.h index 1b69936741d1c04a3c50c0354cb6e5351b2d0e33..26e6cfda71277eb6decad35b3e6c5b788b0af2f5 100644 --- a/src/nfa/mcsheng_dump.h +++ b/src/nfa/mcsheng_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,7 +42,8 @@ namespace ue2 { void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base); void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base); - +void nfaExecMcSheng64_8_dump(const struct NFA *nfa, const std::string &base); +void nfaExecMcSheng64_16_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 #endif // DUMP_SUPPORT diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h index bb45ae23f9263590497d80ec4bc7e213868aff76..d98557462415d8389abf61100a929e31b0aaa707 100644 --- a/src/nfa/mcsheng_internal.h +++ b/src/nfa/mcsheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,4 +92,33 @@ struct mcsheng { * representing the data from a u64a. 
 */
 extern const u64a mcsheng_pext_mask[8];
 
+struct mcsheng64 {
+    u16 state_count; /**< total number of states */
+    u32 length; /**< length of dfa in bytes */
+    u16 start_anchored; /**< anchored start state */
+    u16 start_floating; /**< floating start state */
+    u32 aux_offset; /**< offset of the aux structures relative to the start of
+                     * the nfa structure */
+    u32 sherman_offset; /**< offset of array of sherman state offsets the
+                         * state_info structures relative to the start of the
+                         * nfa structure */
+    u32 sherman_end; /**< offset of the end of the state_info structures
+                      * relative to the start of the nfa structure */
+    u16 sheng_end; /**< first non-sheng state */
+    u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of
+                            * internal sheng ids */
+    u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
+    u16 accept_limit_8; /**< 8 bit, lowest accept state */
+    u16 sherman_limit; /**< lowest sherman state */
+    u8 alphaShift;
+    u8 flags;
+    u8 has_accel; /**< 1 iff there are any accel plans */
+    u8 remap[256]; /**< remaps characters to a smaller alphabet */
+    ReportID arb_report; /**< one of the accepts that this dfa may raise */
+    u32 accel_offset; /**< offset of accel structures from start of McClellan */
+    m512 sheng_succ_masks[N_CHARS];
+};
+
+extern const u64a mcsheng64_pext_mask[8];
+
 #endif
diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c
index f4b7552ef03ab00e987eb1aaec043399a275117e..75cac4b4815db38c16b3efd30cf99675552ca24a 100644
--- a/src/nfa/nfa_api_dispatch.c
+++ b/src/nfa/nfa_api_dispatch.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -76,6 +76,10 @@
         DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func);                 \
         DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func);                \
         DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func);              \
+        DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func);                  \
+        DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func);                  \
+        DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func);          \
+        DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func);        \
     default:                                                              \
         assert(0);                                                        \
     }
diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp
index 9185ccdd7599f116cffba0354c46f53124218cef..47153163e9f3b412b422890beec8329acff0ae58 100644
--- a/src/nfa/nfa_build_util.cpp
+++ b/src/nfa/nfa_build_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -181,7 +181,6 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER};
         static const nfa_dispatch_fn has_repeats_other_than_firsts;       \
         static const u32 stateAlign =                                     \
             MAX(mlt_align, alignof(RepeatControl));                       \
-        static const bool fast = mlt_size <= 64;                          \
     };                                                                    \
     const nfa_dispatch_fn NFATraits<LIMEX_NFA_##mlt_size>::has_accel      \
         = has_accel_limex<LimExNFA##mlt_size>;                            \
@@ -210,7 +209,6 @@ template<> struct NFATraits<MCCLELLAN_NFA_8> {
     UNUSED static const char *name;
     static const NFACategory category = NFA_OTHER;
     static const u32 stateAlign = 1;
-    static const bool fast = true;
     static const nfa_dispatch_fn has_accel;
     static const nfa_dispatch_fn has_repeats;
     static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -226,7 +224,6 @@ template<> struct NFATraits<MCCLELLAN_NFA_16> {
     UNUSED static const char *name;
     static const NFACategory category = NFA_OTHER;
     static
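/* EDITOR'S NOTE: illustrative aside, not part of the patch. The
 * DISPATCH_CASE additions above plug the new engine types into the central
 * type switch: every NFA::type value maps to an engine-specific handler,
 * and wiring in an engine means one enum value plus one case per API entry
 * point. Minimal model of the pattern (toy names): */
#include <assert.h>

enum toy_type { TOY_ENGINE_A, TOY_ENGINE_B };

static int toy_exec_a(const void *eng) { (void)eng; return 1; }
static int toy_exec_b(const void *eng) { (void)eng; return 2; }

static int toy_dispatch(enum toy_type t, const void *eng) {
    switch (t) {
    case TOY_ENGINE_A: return toy_exec_a(eng);
    case TOY_ENGINE_B: return toy_exec_b(eng);
    default: assert(0); return 0; /* unknown engine type */
    }
}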
const u32 stateAlign = 2; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -242,7 +239,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -258,7 +254,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -274,7 +269,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -290,7 +284,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -306,7 +299,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -322,7 +314,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -338,7 +329,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -354,7 +344,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -370,7 +359,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -386,7 +374,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn 
has_repeats_other_than_firsts; @@ -402,7 +389,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 64; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -418,7 +404,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -434,7 +419,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 2; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -446,6 +430,65 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = const char *NFATraits::name = "Shengy McShengFace 16"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng 32"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng 64"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy64 McShengFace 8"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 2; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = 
"Shengy64 McShengFace 16"; +#endif } // namespace #if defined(DUMP_SUPPORT) @@ -473,20 +516,6 @@ u32 state_alignment(const NFA &nfa) { return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, getStateAlign, nullptr); } -namespace { -template -struct getFastness { - static u32 call(void *) { - return NFATraits::fast; - } -}; -} - -bool is_fast(const NFA &nfa) { - NFAEngineType t = (NFAEngineType)nfa.type; - return DISPATCH_BY_NFA_TYPE(t, getFastness, nullptr); -} - namespace { template struct is_limex { diff --git a/src/nfa/nfa_build_util.h b/src/nfa/nfa_build_util.h index 92a1091ecee6980f1d4cc8ac5576e4b66446891c..ee7a3094941e505c0d0f763f67ac3b2bcb14829d 100644 --- a/src/nfa/nfa_build_util.h +++ b/src/nfa/nfa_build_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,10 +47,6 @@ std::string describe(const NFA &nfa); // For a given NFA, retrieve the alignment required by its uncompressed state. u32 state_alignment(const NFA &nfa); -/* returns true if the nfa is considered 'fast'. TODO: work out what we mean by - * fast. */ -bool is_fast(const NFA &n); - bool has_bounded_repeats_other_than_firsts(const NFA &n); bool has_bounded_repeats(const NFA &n); diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 5607ed27ad6e4dc219d2813b044552853f36c20a..bc8c175d37d6abec9f794d0efb852bffa0e314d6 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -81,6 +81,10 @@ namespace ue2 { DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 9d28082250bd0be96a1e38e644c005aaf7ecfcf5..ad27e28b14ddba8cdbf0f8d44ed7b2b5418ff7db 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,6 +72,10 @@ enum NFAEngineType { TAMARAMA_NFA, /**< magic nfa container */ MCSHENG_NFA_8, /**< magic pseudo nfa */ MCSHENG_NFA_16, /**< magic pseudo nfa */ + SHENG_NFA_32, /**< magic pseudo nfa */ + SHENG_NFA_64, /**< magic pseudo nfa */ + MCSHENG_64_NFA_8, /**< magic pseudo nfa */ + MCSHENG_64_NFA_16, /**< magic pseudo nfa */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -148,7 +152,8 @@ static really_inline int isMcClellanType(u8 t) { /** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid * DFA. 
*/ static really_inline int isShengMcClellanType(u8 t) { - return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16; + return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16 || + t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16; } /** \brief True if the given type (from NFA::type) is a Gough DFA. */ @@ -157,10 +162,25 @@ static really_inline int isGoughType(u8 t) { } /** \brief True if the given type (from NFA::type) is a Sheng DFA. */ -static really_inline int isShengType(u8 t) { +static really_inline int isSheng16Type(u8 t) { return t == SHENG_NFA; } +/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */ +static really_inline int isSheng32Type(u8 t) { + return t == SHENG_NFA_32; +} + +/** \brief True if the given type (from NFA::type) is a Sheng64 DFA. */ +static really_inline int isSheng64Type(u8 t) { + return t == SHENG_NFA_64; +} + +/** \brief True if the given type (from NFA::type) is a Sheng16/32/64 DFA. */ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA || t == SHENG_NFA_32 || t == SHENG_NFA_64; +} + /** * \brief True if the given type (from NFA::type) is a McClellan, Gough or * Sheng DFA. diff --git a/src/nfa/repeatcompile.cpp b/src/nfa/repeatcompile.cpp index 934dd29e6b9067ea91e65e0151f0ddbb9259cb4a..d15ae89b569d3ecbf53c518dfa9b648cd24b03c4 100644 --- a/src/nfa/repeatcompile.cpp +++ b/src/nfa/repeatcompile.cpp @@ -124,6 +124,10 @@ RepeatStateInfo::RepeatStateInfo(enum RepeatType type, const depth &repeatMin, const depth &repeatMax, u32 minPeriod) : stateSize(0), packedCtrlSize(0), horizon(0), patchCount(0), patchSize(0), encodingSize(0), patchesOffset(0) { + if (type == REPEAT_SPARSE_OPTIMAL_P && minPeriod == 0) { + assert(0); + throw std::domain_error("SPARSE_OPTIMAL_P must have non-zero minPeriod."); + } assert(repeatMin <= repeatMax); assert(repeatMax.is_reachable()); assert(minPeriod || type != REPEAT_SPARSE_OPTIMAL_P); diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 4f30910b5f865bc3818622133418cf6c0d5a94e2..3f36e21891710819a0f81aa9c66cbb86513b87f4 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -154,6 +154,205 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +const struct sheng32 *get_sheng32(const struct NFA *n) { + return (const struct sheng32 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) { + const struct sstate_aux *saux = get_aux32(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list 
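/* EDITOR'S NOTE: illustrative aside, not part of the patch. get_aux32()
 * above subtracts sizeof(struct NFA) because serialized offsets are stored
 * relative to the outer NFA header while `sh` points at the engine body
 * just behind it. The addressing pattern with a toy layout: */
#include <stdint.h>

struct toy_hdr  { uint32_t length; };     /* stand-in for struct NFA */
struct toy_body { uint32_t aux_offset; }; /* offsets stored header-relative */
struct toy_aux  { uint32_t accept; };

static const struct toy_aux *toy_get_aux(const struct toy_body *body,
                                         uint32_t id) {
    /* body sits sizeof(struct toy_hdr) bytes into the allocation */
    uint32_t off = body->aux_offset - (uint32_t)sizeof(struct toy_hdr)
                 + id * (uint32_t)sizeof(struct toy_aux);
    return (const struct toy_aux *)((const char *)body + off);
}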
at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl32(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux32(sh, state); + const struct report_list *rl = eod ? get_eod_rl32(sh, aux) : + get_rl32(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +// Sheng64 +static really_inline +const struct sheng64 *get_sheng64(const struct NFA *n) { + return (const struct sheng64 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux64(const struct sheng64 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG64_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG64_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const struct report_list *get_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng64HasAccept(const struct sheng64 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const 
struct report_list *rl = get_rl64(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux64(sh, state); + const struct report_list *rl = eod ? get_eod_rl64(sh, aux) : + get_rl64(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} +#endif // end of HAVE_AVX512VBMI + /* include Sheng function definitions */ #include "sheng_defs.h" @@ -671,3 +870,1008 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state, + 
cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } 
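+ /* neither dead nor a pending match: the scan completed with the DFA still alive */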
+ return MO_ALIVE; + } + + static never_inline + char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng32 execution in state %u\n", + state & SHENG32_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ?
"END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng32Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng32Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng32Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + if (can_die) { + return (state & SHENG32_STATE_DEAD) ?
MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux32(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK, + new_state & SHENG32_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng32\n"); + assert(n->type == SHENG_NFA_32); + const struct sheng32 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK); + + const struct sstate_aux *aux = get_aux32(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports32(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG32_STATE_DEAD ? MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng32_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng32HasAccept(sh, aux, report); +} + +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + return !!aux->accept; +} + +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng32 *sh = get_sheng32(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return 
fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux32(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports32(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng32 *sh = get_sheng32(nfa); + u8 *s = (u8 *)state; + *s = offset ? sh->floating: sh->anchored; + return !(*s & SHENG32_STATE_DEAD); +} + +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng32 *sh = get_sheng32(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +// Sheng64 +static really_inline +char runSheng64Cb(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + rv = sheng64_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng64Nm(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), 
offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + /* just scan the buffer */ + if (can_die) { + sheng64_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + sheng64_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng64_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng64_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng64Sam(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan until first match */ + if (can_die) { + rv = sheng64_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng64(const struct sheng64 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng64 execution in state %u\n", + state & SHENG64_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports64(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == 
CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng64Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng64Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng64Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if
(cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + if (can_die) { + return (state & SHENG64_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux64(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG64_STATE_MASK, + new_state & SHENG64_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng64\n"); + assert(n->type == SHENG_NFA_64); + const struct sheng64 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng64Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG64_STATE_MASK); + + const struct sstate_aux *aux = get_aux64(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports64(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG64_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng64_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng64HasAccept(sh, aux, report); +} + +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + return !!aux->accept; +} + +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng64 *sh = get_sheng64(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports64(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng64 *sh = (const struct sheng64 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux64(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports64(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng64 *sh = get_sheng64(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG64_STATE_DEAD); +} + +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng64 *sh = get_sheng64(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng64_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} +#endif // end of HAVE_AVX512VBMI diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 84a2b6b515d36280b6a4e5d5269b3d6e483b237e..7b90e3034f05af7e1d80499b079de4b59323a7b8 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,4 +58,86 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); +#if defined(HAVE_AVX512VBMI) +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng32_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng64_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng64_expandState(const struct NFA 
*nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#else // !HAVE_AVX512VBMI + +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng32_Q NFA_API_NO_IMPL +#define nfaExecSheng32_Q2 NFA_API_NO_IMPL +#define nfaExecSheng32_QR NFA_API_NO_IMPL +#define nfaExecSheng32_inAccept NFA_API_NO_IMPL +#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng32_expandState NFA_API_NO_IMPL +#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng32_testEOD NFA_API_NO_IMPL +#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng32_B NFA_API_NO_IMPL + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng64_Q NFA_API_NO_IMPL +#define nfaExecSheng64_Q2 NFA_API_NO_IMPL +#define nfaExecSheng64_QR NFA_API_NO_IMPL +#define nfaExecSheng64_inAccept NFA_API_NO_IMPL +#define nfaExecSheng64_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng64_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng64_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng64_expandState NFA_API_NO_IMPL +#define nfaExecSheng64_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng64_testEOD NFA_API_NO_IMPL +#define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng64_B NFA_API_NO_IMPL +#endif // end of HAVE_AVX512VBMI + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 26bdbcee234f20e47be31b0ff2127fbb5a4c450e..390af7522122699f0be19e118a5cbd58c63a43e3 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,43 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isDeadState32(const u8 a) { + return a & SHENG32_STATE_DEAD; +} + +static really_inline +u8 isAcceptState32(const u8 a) { + return a & SHENG32_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState32(const u8 a) { + return a & SHENG32_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK); +} + +static really_inline +u8 isDeadState64(const u8 a) { + return a & SHENG64_STATE_DEAD; +} + +static really_inline +u8 isAcceptState64(const u8 a) { + return a & SHENG64_STATE_ACCEPT; +} + +static really_inline +u8 hasInterestingStates64(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG64_STATE_FLAG_MASK); +} +#endif + /* these functions should be optimized out, used by NO_MATCHES mode */ static really_inline u8 
dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, @@ -71,66 +108,162 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_cod +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_cod +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* callback output, can't die */ #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_co +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_co +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can die */ #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_samd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_samd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can't die */ #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_sam +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_sam +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* no match, can die */ #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nmd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nmd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* no match, can't die */ #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nm 
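+/* both the dead-state and accept checks are dummyFunc stubs here, so the compiler folds them away */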
+#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nm +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* @@ -144,6 +277,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -153,6 +296,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* callback output, can die, not accelerated */ @@ -163,6 +316,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_cod +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_cod +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -172,6 +339,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* callback output, can't die, accelerated */ @@ -182,6 +363,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coa +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -191,6 +382,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef 
OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* callback output, can't die, not accelerated */ @@ -201,6 +402,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_co +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_co +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -210,6 +425,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can die, accelerated */ @@ -220,6 +449,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -229,6 +468,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* stop at match, can die, not accelerated */ @@ -239,6 +488,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samd +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_samd +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -248,6 +511,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef 
SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can't die, accelerated */ @@ -258,6 +535,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sama +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -267,6 +554,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* stop at match, can't die, not accelerated */ @@ -277,6 +574,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sam +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_sam +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -286,6 +597,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* no-match have interesting func as dummy, and die/accel checks are outer */ @@ -298,6 +623,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmda +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 isAccelState32 +#define ACCEPT_FUNC32 dummyFunc +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -307,6 +642,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* no match, can die, not accelerated */ @@ -317,6 +662,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc 
#define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmd +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nmd +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -326,6 +685,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* there is no performance benefit in accelerating a no-match case that can't @@ -339,6 +712,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nm +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nm +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -348,6 +735,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH #endif // SHENG_DEFS_H diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 9552fe15d31213599dc0b3c15c124d6df43ac6fa..fb8ee168345a38dd6ff51b3bd21f1c20b34617c3 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -95,3 +95,127 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + 
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, + tmp & SHENG64_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC64(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index 740322010f61f2dec76da2a0877540b7d60ce32b..440e7396e2fdb4bae9445633dfb2afb800fca1fb 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -282,3 +282,430 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = 
movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + 
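+            /* A dead state can never reach an accept state again, so
+             * nothing later in the buffer can match: report the whole
+             * range as scanned and stop. */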
*scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#ifndef NO_SHENG64_IMPL +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, + a1 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? 
c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK, + a2 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK, + a3 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK, + a4 & SHENG64_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) { + if (ACCEPT_FUNC64(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + } + if (OUTER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif // 
!NO_SHENG64_IMPL
+#endif
diff --git a/src/nfa/sheng_internal.h b/src/nfa/sheng_internal.h
index ff843ebee3f37da854ccde1d72eb956b2a8e24a5..98536886c59626beb19297324c420859e376873b 100644
--- a/src/nfa/sheng_internal.h
+++ b/src/nfa/sheng_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -38,6 +38,17 @@
 #define SHENG_STATE_MASK 0xF
 #define SHENG_STATE_FLAG_MASK 0x70
 
+#define SHENG32_STATE_ACCEPT 0x20
+#define SHENG32_STATE_DEAD 0x40
+#define SHENG32_STATE_ACCEL 0x80
+#define SHENG32_STATE_MASK 0x1F
+#define SHENG32_STATE_FLAG_MASK 0xE0
+
+#define SHENG64_STATE_ACCEPT 0x40
+#define SHENG64_STATE_DEAD 0x80
+#define SHENG64_STATE_MASK 0x3F
+#define SHENG64_STATE_FLAG_MASK 0xC0
+
 #define SHENG_FLAG_SINGLE_REPORT 0x1
 #define SHENG_FLAG_CAN_DIE 0x2
 #define SHENG_FLAG_HAS_ACCEL 0x4
@@ -67,4 +78,30 @@ struct sheng {
     ReportID report;
 };
 
+struct sheng32 {
+    m512 succ_masks[256];
+    u32 length;
+    u32 aux_offset;
+    u32 report_offset;
+    u32 accel_offset;
+    u8 n_states;
+    u8 anchored;
+    u8 floating;
+    u8 flags;
+    ReportID report;
+};
+
+struct sheng64 {
+    m512 succ_masks[256];
+    u32 length;
+    u32 aux_offset;
+    u32 report_offset;
+    u32 accel_offset;
+    u8 n_states;
+    u8 anchored;
+    u8 floating;
+    u8 flags;
+    ReportID report;
+};
+
 #endif /* SHENG_INTERNAL_H_ */
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index c4094cedc720968fecc0ed0c9e6dfb58431c807c..321fcf71db71f2a9d23eea410fab6a366be2215f 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -271,7 +271,7 @@ unique_ptr<raw_report_info> sheng_build_strat::gatherReports(
         *isSingleReport = 0;
     }
 
-    return move(ri);
+    return ri;
 }
 
 u32 sheng_build_strat::max_allowed_offset_accel() const {
@@ -301,6 +301,28 @@ void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) {
     }
     DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
 }
+
+static really_inline
+void dumpShuffleMask32(const u8 chr, const u8 *buf, unsigned sz) {
+    stringstream o;
+
+    for (unsigned i = 0; i < sz; i++) {
+        o.width(2);
+        o << (buf[i] & SHENG32_STATE_MASK) << " ";
+    }
+    DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
+}
+
+static really_inline
+void dumpShuffleMask64(const u8 chr, const u8 *buf, unsigned sz) {
+    stringstream o;
+
+    for (unsigned i = 0; i < sz; i++) {
+        o.width(2);
+        o << (buf[i] & SHENG64_STATE_MASK) << " ";
+    }
+    DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
+}
 #endif
 
 static
@@ -311,9 +333,16 @@ void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
     }
 }
 
+template <typename T>
 static
-u8 getShengState(dstate &state, dfa_info &info,
-                 map<dstate_id_t, AccelScheme> &accelInfo) {
+u8 getShengState(UNUSED dstate &state, UNUSED dfa_info &info,
+                 UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
+    return 0;
+}
+
+template <>
+u8 getShengState<sheng>(dstate &state, dfa_info &info,
+                        map<dstate_id_t, AccelScheme> &accelInfo) {
     u8 s = state.impl_id;
     if (!state.reports.empty()) {
         s |= SHENG_STATE_ACCEPT;
@@ -327,11 +356,41 @@ u8 getShengState(dstate &state, dfa_info &info,
     return s;
 }
 
+template <>
+u8 getShengState<sheng32>(dstate &state, dfa_info &info,
+                          map<dstate_id_t, AccelScheme> &accelInfo) {
+    u8 s = state.impl_id;
+    if (!state.reports.empty()) {
+        s |= SHENG32_STATE_ACCEPT;
+    }
+    if (info.isDead(state)) {
+        s |= SHENG32_STATE_DEAD;
+    }
+    if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) {
+        s |= SHENG32_STATE_ACCEL;
+    }
+    return s;
+}
+
+template <>
+u8 getShengState<sheng64>(dstate &state, dfa_info &info,
+                          UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
+    u8 s = state.impl_id;
+    if (!state.reports.empty()) {
+        s |= SHENG64_STATE_ACCEPT;
+    }
+    if (info.isDead(state)) {
+        s |= SHENG64_STATE_DEAD;
+    }
+    return s;
+}
+
+template <typename T>
 static
 void fillAccelAux(struct NFA *n, dfa_info &info,
                   map<dstate_id_t, AccelScheme> &accelInfo) {
     DEBUG_PRINTF("Filling accel aux structures\n");
-    sheng *s = (sheng *)getMutableImplNfa(n);
+    T *s = (T *)getMutableImplNfa(n);
     u32 offset = s->accel_offset;
 
     for (dstate_id_t i = 0; i < info.size(); i++) {
@@ -349,11 +408,21 @@ void fillAccelAux(struct NFA *n, dfa_info &info,
     }
 }
 
+template <typename T>
 static
-void populateBasicInfo(struct NFA *n, dfa_info &info,
-                       map<dstate_id_t, AccelScheme> &accelInfo, u32 aux_offset,
-                       u32 report_offset, u32 accel_offset, u32 total_size,
-                       u32 dfa_size) {
+void populateBasicInfo(UNUSED struct NFA *n, UNUSED dfa_info &info,
+                       UNUSED map<dstate_id_t, AccelScheme> &accelInfo,
+                       UNUSED u32 aux_offset, UNUSED u32 report_offset,
+                       UNUSED u32 accel_offset, UNUSED u32 total_size,
+                       UNUSED u32 dfa_size) {
+}
+
+template <>
+void populateBasicInfo<sheng>(struct NFA *n, dfa_info &info,
+                              map<dstate_id_t, AccelScheme> &accelInfo,
+                              u32 aux_offset, u32 report_offset,
+                              u32 accel_offset, u32 total_size,
+                              u32 dfa_size) {
     n->length = total_size;
     n->scratchStateSize = 1;
     n->streamStateSize = 1;
@@ -369,14 +438,65 @@ void populateBasicInfo(struct NFA *n, dfa_info &info,
     s->length = dfa_size;
     s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
 
-    s->anchored = getShengState(info.anchored, info, accelInfo);
-    s->floating = getShengState(info.floating, info, accelInfo);
+    s->anchored = getShengState<sheng>(info.anchored, info, accelInfo);
+    s->floating = getShengState<sheng>(info.floating, info, accelInfo);
+}
+
+template <>
+void populateBasicInfo<sheng32>(struct NFA *n, dfa_info &info,
+                                map<dstate_id_t, AccelScheme> &accelInfo,
+                                u32 aux_offset, u32 report_offset,
+                                u32 accel_offset, u32 total_size,
+                                u32 dfa_size) {
+    n->length = total_size;
+    n->scratchStateSize = 1;
+    n->streamStateSize = 1;
+    n->nPositions = info.size();
+    n->type = SHENG_NFA_32;
+    n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0;
+
+    sheng32 *s = (sheng32 *)getMutableImplNfa(n);
+    s->aux_offset = aux_offset;
+    s->report_offset = report_offset;
+    s->accel_offset = accel_offset;
+    s->n_states = info.size();
+    s->length = dfa_size;
+    s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
+
+    s->anchored = getShengState<sheng32>(info.anchored, info, accelInfo);
+    s->floating = getShengState<sheng32>(info.floating, info, accelInfo);
 }
 
+template <>
+void populateBasicInfo<sheng64>(struct NFA *n, dfa_info &info,
+                                map<dstate_id_t, AccelScheme> &accelInfo,
+                                u32 aux_offset, u32 report_offset,
+                                u32 accel_offset, u32 total_size,
+                                u32 dfa_size) {
+    n->length = total_size;
+    n->scratchStateSize = 1;
+    n->streamStateSize = 1;
+    n->nPositions = info.size();
+    n->type = SHENG_NFA_64;
+    n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0;
+
+    sheng64 *s = (sheng64 *)getMutableImplNfa(n);
+    s->aux_offset = aux_offset;
+    s->report_offset = report_offset;
+    s->accel_offset = accel_offset;
+    s->n_states = info.size();
+    s->length = dfa_size;
+    s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
+
+    s->anchored = getShengState<sheng64>(info.anchored, info, accelInfo);
+    s->floating = getShengState<sheng64>(info.floating, info, accelInfo);
+}
+
+template <typename T>
 static
 void fillTops(NFA *n, dfa_info &info, dstate_id_t id,
               map<dstate_id_t, AccelScheme> &accelInfo) {
-    sheng *s = (sheng *)getMutableImplNfa(n);
+    T *s = (T *)getMutableImplNfa(n);
     u32 aux_base = s->aux_offset;
 
     DEBUG_PRINTF("Filling tops for state %u\n", id);
@@ -393,13 +513,14 @@ void fillTops(NFA *n, dfa_info &info, dstate_id_t id,
 
     DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id);
 
-    aux->top = getShengState(top_state, info, accelInfo);
+    aux->top = getShengState<T>(top_state, info, accelInfo);
 }
 
+template <typename T>
 static
 void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports,
              vector<u32> &reports_eod, vector<u32> &report_offsets) {
-    sheng *s = (sheng *)getMutableImplNfa(n);
+    T *s = (T *)getMutableImplNfa(n);
     u32 aux_base = s->aux_offset;
     auto raw_id = info.raw_id(id);
 
@@ -419,60 +540,97 @@ void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports,
     DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod);
 }
 
+template <typename T>
 static
 void fillSingleReport(NFA *n, ReportID r_id) {
-    sheng *s = (sheng *)getMutableImplNfa(n);
+    T *s = (T *)getMutableImplNfa(n);
 
     DEBUG_PRINTF("Single report ID: %u\n", r_id);
     s->report = r_id;
     s->flags |= SHENG_FLAG_SINGLE_REPORT;
 }
 
+template <typename T>
 static
-void createShuffleMasks(sheng *s, dfa_info &info,
-                        map<dstate_id_t, AccelScheme> &accelInfo) {
+bool createShuffleMasks(UNUSED T *s, UNUSED dfa_info &info,
+                        UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
+    return true;
+}
+
+template <>
+bool createShuffleMasks<sheng>(sheng *s, dfa_info &info,
+                               map<dstate_id_t, AccelScheme> &accelInfo) {
     for (u16 chr = 0; chr < 256; chr++) {
         u8 buf[16] = {0};
 
         for (dstate_id_t idx = 0; idx < info.size(); idx++) {
             auto &succ_state = info.next(idx, chr);
 
-            buf[idx] = getShengState(succ_state, info, accelInfo);
+            buf[idx] = getShengState<sheng>(succ_state, info, accelInfo);
         }
 #ifdef DEBUG
         dumpShuffleMask(chr, buf, sizeof(buf));
 #endif
         memcpy(&s->shuffle_masks[chr], buf, sizeof(m128));
     }
+    return true;
 }
 
-bool has_accel_sheng(const NFA *) {
-    return true; /* consider the sheng region as accelerated */
-}
+template <>
+bool createShuffleMasks<sheng32>(sheng32 *s, dfa_info &info,
+                                 map<dstate_id_t, AccelScheme> &accelInfo) {
+    for (u16 chr = 0; chr < 256; chr++) {
+        u8 buf[64] = {0};
 
-bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
-                               const ReportManager &rm, bool only_accel_init,
-                               set<dstate_id_t> *accel_states) {
-    if (!cc.grey.allowSheng) {
-        DEBUG_PRINTF("Sheng is not allowed!\n");
-        return nullptr;
-    }
+        assert(info.size() <= 32);
+        for (dstate_id_t idx = 0; idx < info.size(); idx++) {
+            auto &succ_state = info.next(idx, chr);
 
-    sheng_build_strat strat(raw, rm, only_accel_init);
-    dfa_info info(strat);
+            buf[idx] = getShengState<sheng32>(succ_state, info, accelInfo);
+            buf[32 + idx] = buf[idx];
+        }
+#ifdef DEBUG
+        dumpShuffleMask32(chr, buf, sizeof(buf));
+#endif
+        memcpy(&s->succ_masks[chr], buf, sizeof(m512));
+    }
+    return true;
+}
 
-    DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
+template <>
+bool createShuffleMasks<sheng64>(sheng64 *s, dfa_info &info,
+                                 map<dstate_id_t, AccelScheme> &accelInfo) {
+    for (u16 chr = 0; chr < 256; chr++) {
+        u8 buf[64] = {0};
 
-    DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
-                 raw.start_anchored, raw.start_floating);
+        assert(info.size() <= 64);
+        for (dstate_id_t idx = 0; idx < info.size(); idx++) {
+            auto &succ_state = info.next(idx, chr);
 
-    DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
-                 info.can_die ? "can" : "cannot", info.size());
-    if (info.size() > 16) {
-        DEBUG_PRINTF("Too many states\n");
-        return nullptr;
+            if (accelInfo.find(info.raw_id(succ_state.impl_id))
+                != accelInfo.end()) {
+                return false;
+            }
+            buf[idx] = getShengState<sheng64>(succ_state, info, accelInfo);
+        }
+#ifdef DEBUG
+        dumpShuffleMask64(chr, buf, sizeof(buf));
+#endif
+        memcpy(&s->succ_masks[chr], buf, sizeof(m512));
     }
+    return true;
+}
+
+bool has_accel_sheng(const NFA *) {
+    return true; /* consider the sheng region as accelerated */
+}
+template <typename T>
+static
+bytecode_ptr<NFA> shengCompile_int(raw_dfa &raw, const CompileContext &cc,
+                                   set<dstate_id_t> *accel_states,
+                                   sheng_build_strat &strat,
+                                   dfa_info &info) {
     if (!cc.streaming) {
         /* TODO: work out if we can do the strip in streaming
          * mode with our semantics */
         raw.stripExtraEodReports();
@@ -487,7 +645,7 @@ bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
     DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n",
                  info.anchored.impl_id, info.floating.impl_id);
 
-    u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng));
+    u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(T));
     vector<u32> reports, eod_reports, report_offsets;
     u8 isSingle = 0;
     ReportID single_report = 0;
@@ -509,29 +667,128 @@ bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
 
     auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
 
-    populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset,
-                      accel_offset, total_size, total_size - sizeof(NFA));
+    populateBasicInfo<T>(nfa.get(), info, accelInfo, nfa_size,
+                         reports_offset, accel_offset, total_size,
+                         total_size - sizeof(NFA));
 
     DEBUG_PRINTF("Setting up aux and report structures\n");
 
     ri->fillReportLists(nfa.get(), reports_offset, report_offsets);
 
     for (dstate_id_t idx = 0; idx < info.size(); idx++) {
-        fillTops(nfa.get(), info, idx, accelInfo);
-        fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets);
+        fillTops<T>(nfa.get(), info, idx, accelInfo);
+        fillAux<T>(nfa.get(), info, idx, reports, eod_reports,
+                   report_offsets);
     }
     if (isSingle) {
-        fillSingleReport(nfa.get(), single_report);
+        fillSingleReport<T>(nfa.get(), single_report);
     }
 
-    fillAccelAux(nfa.get(), info, accelInfo);
+    fillAccelAux<T>(nfa.get(), info, accelInfo);
 
     if (accel_states) {
         fillAccelOut(accelInfo, accel_states);
     }
 
-    createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo);
+    if (!createShuffleMasks<T>((T *)getMutableImplNfa(nfa.get()), info, accelInfo)) {
+        return nullptr;
+    }
+
+    return nfa;
+}
+
+bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
+                               const ReportManager &rm, bool only_accel_init,
+                               set<dstate_id_t> *accel_states) {
+    if (!cc.grey.allowSheng) {
+        DEBUG_PRINTF("Sheng is not allowed!\n");
+        return nullptr;
+    }
+
+    sheng_build_strat strat(raw, rm, only_accel_init);
+    dfa_info info(strat);
+
+    DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
+
+    DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
+                 raw.start_anchored, raw.start_floating);
+
+    DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
+                 info.can_die ? "can" : "cannot", info.size());
+    if (info.size() > 16) {
+        DEBUG_PRINTF("Too many states\n");
+        return nullptr;
+    }
+    return shengCompile_int<sheng>(raw, cc, accel_states, strat, info);
+}
+
+bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
+                                 const ReportManager &rm, bool only_accel_init,
+                                 set<dstate_id_t> *accel_states) {
+    if (!cc.grey.allowSheng) {
+        DEBUG_PRINTF("Sheng is not allowed!\n");
+        return nullptr;
+    }
+
+    if (!cc.target_info.has_avx512vbmi()) {
+        DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
+        return nullptr;
+    }
+
+    sheng_build_strat strat(raw, rm, only_accel_init);
+    dfa_info info(strat);
+
+    DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
+
+    DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
+                 raw.start_anchored, raw.start_floating);
+
+    DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
+                 info.can_die ? "can" : "cannot", info.size());
+    assert(info.size() > 16);
+    if (info.size() > 32) {
+        DEBUG_PRINTF("Too many states\n");
+        return nullptr;
+    }
+
+    return shengCompile_int<sheng32>(raw, cc, accel_states, strat, info);
+}
+
+bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
+                                 const ReportManager &rm, bool only_accel_init,
+                                 set<dstate_id_t> *accel_states) {
+    if (!cc.grey.allowSheng) {
+        DEBUG_PRINTF("Sheng is not allowed!\n");
+        return nullptr;
+    }
+
+    if (!cc.target_info.has_avx512vbmi()) {
+        DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
+        return nullptr;
+    }
+
+    sheng_build_strat strat(raw, rm, only_accel_init);
+    dfa_info info(strat);
+
+    DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
+
+    DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
+                 raw.start_anchored, raw.start_floating);
+
+    DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
+                 info.can_die ? "can" : "cannot", info.size());
+    assert(info.size() > 32);
+    if (info.size() > 64) {
+        DEBUG_PRINTF("Too many states\n");
+        return nullptr;
+    }
+    vector<dstate> old_states;
+    old_states = info.states;
+    auto nfa = shengCompile_int<sheng64>(raw, cc, accel_states, strat, info);
+    if (!nfa) {
+        info.states = old_states;
+    }
     return nfa;
 }
diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h
index d795b362381b5f778d46ca7fd9fb4d2c99b61dcd..256f4a4e5047e69367dfc459e57b7f2a42976be2 100644
--- a/src/nfa/shengcompile.h
+++ b/src/nfa/shengcompile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -71,6 +71,14 @@ bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
                                const ReportManager &rm, bool only_accel_init,
                                std::set<dstate_id_t> *accel_states = nullptr);
 
+bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
+                                 const ReportManager &rm, bool only_accel_init,
+                                 std::set<dstate_id_t> *accel_states = nullptr);
+
+bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
+                                 const ReportManager &rm, bool only_accel_init,
+                                 std::set<dstate_id_t> *accel_states = nullptr);
+
 struct sheng_escape_info {
     CharReach outs;
     CharReach outs2_single;
diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp
index 99fda76fd87d2ffcd4db3a367f97a1ee9aae1e20..6eb784077345ad2d6e62f55fbff24188a5ffd0e7 100644
--- a/src/nfa/shengdump.cpp
+++ b/src/nfa/shengdump.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -51,7 +51,7 @@ namespace ue2 {
 
 static
 const sstate_aux *get_aux(const NFA *n, dstate_id_t i) {
-    assert(n && isShengType(n->type));
+    assert(n && isSheng16Type(n->type));
 
     const sheng *s = (const sheng *)getImplNfa(n);
     const sstate_aux *aux_base =
@@ -64,6 +64,36 @@ const sstate_aux *get_aux(const NFA *n, dstate_id_t i) {
     return aux;
 }
 
+static
+const sstate_aux *get_aux32(const NFA *n, dstate_id_t i) {
+    assert(n && isSheng32Type(n->type));
+
+    const sheng32 *s = (const sheng32 *)getImplNfa(n);
+    const sstate_aux *aux_base =
+        (const sstate_aux *)((const char *)n + s->aux_offset);
+
+    const sstate_aux *aux = aux_base + i;
+
+    assert((const char *)aux < (const char *)s + s->length);
+
+    return aux;
+}
+
+static
+const sstate_aux *get_aux64(const NFA *n, dstate_id_t i) {
+    assert(n && isSheng64Type(n->type));
+
+    const sheng64 *s = (const sheng64 *)getImplNfa(n);
+    const sstate_aux *aux_base =
+        (const sstate_aux *)((const char *)n + s->aux_offset);
+
+    const sstate_aux *aux = aux_base + i;
+
+    assert((const char *)aux < (const char *)s + s->length);
+
+    return aux;
+}
+
 static
 void dumpHeader(FILE *f, const sheng *s) {
     fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states,
@@ -79,6 +109,36 @@ void dumpHeader(FILE *f, const sheng *s) {
             !!(s->flags & SHENG_FLAG_SINGLE_REPORT));
 }
 
+static
+void dumpHeader32(FILE *f, const sheng32 *s) {
+    fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states,
+            s->length);
+    fprintf(f, "aux base offset: %u, reports base offset: %u, "
+               "accel offset: %u\n",
+            s->aux_offset, s->report_offset, s->accel_offset);
+    fprintf(f, "anchored start state: %u, floating start state: %u\n",
+            s->anchored & SHENG32_STATE_MASK, s->floating & 
SHENG32_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + +static +void dumpHeader64(FILE *f, const sheng64 *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG64_STATE_MASK, s->floating & SHENG64_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + static void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " @@ -87,6 +147,22 @@ void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { aux->top & SHENG_STATE_MASK); } +static +void dumpAux32(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG32_STATE_MASK); +} + +static +void dumpAux64(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG64_STATE_MASK); +} + static void dumpReports(FILE *f, const report_list *rl) { fprintf(f, "reports count: %u\n", rl->count); @@ -115,6 +191,46 @@ void dumpMasks(FILE *f, const sheng *s) { } } +static +void dumpMasks32(FILE *f, const sheng32 *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[chr]; + memcpy(buf, &succ_mask, sizeof(m512)); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 64; pos++) { + u8 c = buf[pos]; + if (c & SHENG32_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG32_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG32_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + +static +void dumpMasks64(FILE *f, const sheng64 *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[chr]; + memcpy(buf, &succ_mask, sizeof(m512)); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 64; pos++) { + u8 c = buf[pos]; + if (c & SHENG64_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG64_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG64_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + static void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -153,6 +269,82 @@ void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { fprintf(f, "\n"); } +static +void nfaExecSheng32_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + fprintf(f, "sheng32 DFA\n"); + dumpHeader32(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux32(nfa, state); + dumpAux32(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if 
(aux->accel) { + fprintf(f, "accel:\n"); + const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks32(f, s); + + fprintf(f, "\n"); +} + +static +void nfaExecSheng64_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_64); + const sheng64 *s = (const sheng64 *)getImplNfa(nfa); + + fprintf(f, "sheng64 DFA\n"); + dumpHeader64(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux64(nfa, state); + dumpAux64(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks64(f, s); + + fprintf(f, "\n"); +} + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -163,8 +355,14 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } +template static -void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { +void describeNode(UNUSED const NFA *n, UNUSED const T *s, UNUSED u16 i, + UNUSED FILE *f) { +} + +template <> +void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { const sstate_aux *aux = get_aux(n, i); fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " @@ -193,6 +391,66 @@ void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { } } +template <> +void describeNode(const NFA *n, const sheng32 *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux32(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG32_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG32_STATE_MASK); + } + + if (i == (s->anchored & SHENG32_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG32_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + +template <> +void describeNode(const NFA *n, const sheng64 *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux64(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG64_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG64_STATE_MASK); + } + + if (i == (s->anchored & SHENG64_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG64_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + static void describeEdge(FILE *f, const u16 *t, u16 i) { for (u16 s = 0; s < N_CHARS; s++) { @@ -228,7 +486,7 @@ void describeEdge(FILE *f, const u16 *t, u16 i) { static void shengGetTransitions(const NFA *n, u16 state, u16 *t) { - assert(isShengType(n->type)); + 
assert(isSheng16Type(n->type)); const sheng *s = (const sheng *)getImplNfa(n); const sstate_aux *aux = get_aux(n, state); @@ -244,6 +502,42 @@ void shengGetTransitions(const NFA *n, u16 state, u16 *t) { t[TOP] = aux->top & SHENG_STATE_MASK; } +static +void sheng32GetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isSheng32Type(n->type)); + const sheng32 *s = (const sheng32 *)getImplNfa(n); + const sstate_aux *aux = get_aux32(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[i]; + + memcpy(buf, &succ_mask, sizeof(m512)); + + t[i] = buf[state] & SHENG32_STATE_MASK; + } + + t[TOP] = aux->top & SHENG32_STATE_MASK; +} + +static +void sheng64GetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isSheng64Type(n->type)); + const sheng64 *s = (const sheng64 *)getImplNfa(n); + const sstate_aux *aux = get_aux64(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[i]; + + memcpy(buf, &succ_mask, sizeof(m512)); + + t[i] = buf[state] & SHENG64_STATE_MASK; + } + + t[TOP] = aux->top & SHENG64_STATE_MASK; +} + static void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -252,7 +546,7 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { dumpDotPreambleDfa(f); for (u16 i = 1; i < s->n_states; i++) { - describeNode(nfa, s, i, f); + describeNode(nfa, s, i, f); u16 t[ALPHABET_SIZE]; @@ -264,10 +558,62 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { fprintf(f, "}\n"); } +static +void nfaExecSheng32_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + sheng32GetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + +static +void nfaExecSheng64_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_64); + const sheng64 *s = (const sheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + sheng64GetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + void nfaExecSheng_dump(const NFA *nfa, const string &base) { assert(nfa->type == SHENG_NFA); nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w")); nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecSheng32_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == SHENG_NFA_32); + nfaExecSheng32_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng32_dumpDot(nfa, StdioFile(base + ".dot", "w")); +} + +void nfaExecSheng64_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == SHENG_NFA_64); + nfaExecSheng64_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng64_dumpDot(nfa, StdioFile(base + ".dot", "w")); +} + } // namespace ue2 diff --git a/src/nfa/shengdump.h b/src/nfa/shengdump.h index 2bdffeb9a3f97b6bc36fec58167a7aaf726f6de4..3215367428f43172c7029dd4d1c7007525522c65 100644 --- a/src/nfa/shengdump.h +++ b/src/nfa/shengdump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,8 @@ struct NFA; namespace ue2 { void nfaExecSheng_dump(const struct 
NFA *nfa, const std::string &base); +void nfaExecSheng32_dump(const struct NFA *nfa, const std::string &base); +void nfaExecSheng64_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 6231e6192156224a8dc17ee8f94bb20812e0068f..2cb74f0f326727bb44cd5a4aff5878955e84c3b7 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -42,36 +42,39 @@ #ifdef DEBUG #include -#define DUMP_MSK(_t) \ - static UNUSED void dumpMsk##_t(m##_t msk) { \ - u8 *mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7 - j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ - } \ - static UNUSED void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 *mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c", c); \ - else \ - printf("."); \ - } \ - } +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} #endif /** \brief Naive byte-by-byte implementation. */ -static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, - const u8 *buf, const u8 *buf_end) { +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { assert(buf < buf_end); for (; buf < buf_end; ++buf) { @@ -84,8 +87,9 @@ static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, } /** \brief Naive byte-by-byte implementation. 
*/ -static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, - const u8 *buf, const u8 *buf_end) { +static really_inline +const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { assert(buf < buf_end); for (buf_end--; buf_end >= buf; buf_end--) { @@ -107,33 +111,25 @@ DUMP_MSK(128) #define GET_LO_4(chars) and128(chars, low4bits) #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) -static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, - const m128 low4bits, const m128 compare) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); +static really_inline +u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, + const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk128AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk128(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk128(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk128(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk128(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); #endif return movemask128(eq128(t, compare)); } -static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); @@ -143,9 +139,9 @@ static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, - const u8 *buf, const m128 low4bits, - const m128 zeroes) { +static really_inline +const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -158,8 +154,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. 
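+    /* The vector path below needs a full 16-byte load, so shorter buffers
+     * take the byte-at-a-time fallback. */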
if (unlikely(buf_end - buf < 16)) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, - buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m128 zeroes = zeroes128(); @@ -207,11 +203,10 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { +static really_inline +const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { #ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); - dumpMsk128(t); - printf("\n"); + DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); #endif u32 z = movemask128(eq128(t, compare)); @@ -225,29 +220,20 @@ static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { } } -static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, - const u8 *buf, const m128 low4bits, - const m128 zeroes) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); + +static really_inline +const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk128AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk128(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk128(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk128(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk128(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); #endif return lastMatch(buf, t, zeroes); @@ -260,8 +246,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. 
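+    /* Reverse search: rshufti returns the last matching byte before
+     * buf_end, so the same 16-byte minimum applies before using SIMD. */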
if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, - buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m128 zeroes = zeroes128(); @@ -302,48 +288,32 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - m128 chars, const u8 *buf, - const m128 low4bits, const m128 ones) { +static really_inline +const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, + m128 chars, const u8 *buf, const m128 low4bits, + const m128 ones) { m128 chars_lo = GET_LO_4(chars); m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb_m128(mask1_lo, chars_lo); - m128 c_hi = pshufb_m128(mask1_hi, chars_hi); - m128 t = or128(c_lo, c_hi); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); + m128 t = or128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk128AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk128(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk128(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk128(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk128(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); #endif - m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); - m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); - m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); - dumpMsk128(c2_lo); - printf("\n"); - DEBUG_PRINTF(" c2_hi: "); - dumpMsk128(c2_hi); - printf("\n"); - DEBUG_PRINTF(" t2: "); - dumpMsk128(t2); - printf("\n"); + DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); #endif u32 z = movemask128(eq128(t2, ones)); @@ -351,8 +321,9 @@ static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, return firstMatch(buf, z); } -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); const m128 low4bits = set16x8(0xf); const u8 *rv; @@ -361,8 +332,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, // Preconditioning: most of the time our buffer won't be aligned. 
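+    /* Scan the (possibly unaligned) first 16 bytes with an unaligned load,
+     * then let the aligned main loop take over. */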
m128 chars = loadu128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, low4bits, - ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf, low4bits, ones); if (rv) { return rv; } @@ -379,8 +350,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); #endif - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, lchars, buf, - low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + lchars, buf, low4bits, ones); if (rv) { return rv; } @@ -390,8 +361,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, // Use an unaligned load to mop up the last 16 bytes and get an accurate // picture to buf_end. chars = loadu128(buf_end - 16); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf_end - 16, - low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf_end - 16, low4bits, ones); if (rv) { return rv; } @@ -409,34 +380,26 @@ DUMP_MSK(256) #define GET_LO_4(chars) and256(chars, low4bits) #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) -static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, - const m256 low4bits, const m256 compare) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); +static really_inline +u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, + const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk256AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk256(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk256(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk256(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk256(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif return movemask256(eq256(t, compare)); } -static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { DEBUG_PRINTF("z 0x%08x\n", z); if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); @@ -448,8 +411,9 @@ static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline const u8 * -fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { +static really_inline +const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -461,9 +425,9 @@ fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { return firstMatch(buf, z); } -static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, - const u8 *buf, const u8 *buf_end, - const m256 low4bits) { +static really_inline +const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, mask_lo); m128 chars = loadu128(buf); @@ -480,9 +444,9 @@ 
static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, return buf_end; } -static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, - const u8 *buf, const m256 low4bits, - const m256 zeroes) { +static really_inline +const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -497,8 +461,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, - buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m256 low4bits = set32x8(0xf); @@ -529,8 +493,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = - fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; } @@ -541,8 +504,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // picture to buf_end. assert(buf <= buf_end && buf >= buf_end - 32); chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, - zeroes); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); if (rv) { return rv; } @@ -550,7 +512,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffffffff)) { u32 pos = clz32(~z); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); @@ -560,37 +523,28 @@ static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { } } -static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, - const u8 *buf, const m256 low4bits, - const m256 zeroes) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); +static really_inline +const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk256AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk256(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk256(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk256(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk256(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif u32 z = movemask256(eq256(t, zeroes)); return lastMatch(buf, z); } -static really_inline const u8 * -revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { +static really_inline +const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = 
and256(c, low4bits); @@ -602,9 +556,9 @@ revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { return lastMatch(buf, z); } -static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, - const u8 *buf, const u8 *buf_end, - const m256 low4bits) { +static really_inline +const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, mask_lo); @@ -622,6 +576,7 @@ static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, return buf - 1; } + /* takes 128 bit masks, but operates on 256 bits of data */ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { @@ -630,8 +585,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, - buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m256 low4bits = set32x8(0xf); @@ -649,8 +604,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Preconditioning: most of the time our buffer won't be aligned. m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, - zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); if (rv) { return rv; } @@ -662,8 +616,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, while (buf_end > last_block) { buf_end -= 32; m256 lchars = load256(buf_end); - rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, - zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); if (rv) { return rv; } @@ -680,58 +633,42 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, - m256 mask2_lo, m256 mask2_hi, - m256 chars, const u8 *buf, - const m256 low4bits, const m256 ones) { +static really_inline +const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, + m256 chars, const u8 *buf, const m256 low4bits, + const m256 ones) { DEBUG_PRINTF("buf %p\n", buf); m256 chars_lo = GET_LO_4(chars); m256 chars_hi = GET_HI_4(chars); - m256 c_lo = pshufb_m256(mask1_lo, chars_lo); - m256 c_hi = pshufb_m256(mask1_hi, chars_hi); - m256 t = or256(c_lo, c_hi); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); + m256 t = or256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk256AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk256(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk256(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk256(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk256(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif - m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); - m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); 
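/* [Editorial sketch, not part of the patch] The kernels in this file classify
 * bytes with two PSHUFB table lookups, one per nibble. A scalar reference of
 * the single-shufti test, with illustrative names:
 *
 *     static unsigned char shufti_classify(const unsigned char lo_tbl[16],
 *                                          const unsigned char hi_tbl[16],
 *                                          unsigned char c) {
 *         // non-zero iff some bucket accepts both nibbles of byte c
 *         return (unsigned char)(lo_tbl[c & 0xf] & hi_tbl[c >> 4]);
 *     }
 *
 * The t2 computation just above is the two-byte variant: it uses an inverted
 * encoding (OR instead of AND), shifts the second character's lookup result
 * down one byte via rshift128_m256(..., 1) so that lane i combines byte i
 * with byte i + 1, and later reports positions whose lane is not all-ones. */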
#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); - dumpMsk256(c2_lo); - printf("\n"); - DEBUG_PRINTF(" c2_hi: "); - dumpMsk256(c2_hi); - printf("\n"); - DEBUG_PRINTF(" t2: "); - dumpMsk256(t2); - printf("\n"); + DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); #endif u32 z = movemask256(eq256(t2, ones)); return firstMatch(buf, z); } -static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, - m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline +const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, + const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -745,10 +682,9 @@ static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, return firstMatch(buf, z); } -static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, - const u8 *buf_end) { +static really_inline +const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); const m256 low4bits = set32x8(0xf); // run shufti over two overlapping 16-byte unaligned reads @@ -769,8 +705,9 @@ static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, } /* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); @@ -804,8 +741,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, lchars, buf, low4bits, ones); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + lchars, buf, low4bits, ones); if (rv) { return rv; } @@ -830,34 +767,26 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, DUMP_MSK(512) #endif -static really_inline u64a block(m512 mask_lo, m512 mask_hi, m512 chars, - const m512 low4bits, const m512 compare) { +static really_inline +u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, + const m512 compare) { m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = - pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk512AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk512(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk512(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk512(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk512(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); 
printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); #endif return eq512mask(t, compare); } -static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { +static really_inline +const u8 *firstMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = ctz64(~z); @@ -869,19 +798,18 @@ static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { } } -static really_inline const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, - m512 chars, const u8 *buf, - const m512 low4bits, - const m512 zeroes) { +static really_inline +const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch64(buf, z); } -static really_inline const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, - const u8 *buf, const u8 *buf_end, - const m512 low4bits, - const m512 zeroes) { +static really_inline +const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -959,7 +887,8 @@ done: return buf_end; } -static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { +static really_inline +const u8 *lastMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = clz64(~z); @@ -970,10 +899,10 @@ static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { } } -static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, - const u8 *buf, const u8 *buf_end, - const m512 low4bits, - const m512 zeroes) { +static really_inline +const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -990,31 +919,20 @@ static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, return lastMatch64(buf, z | ~k); } -static really_inline const u8 *revBlock512(m512 mask_lo, m512 mask_hi, - m512 chars, const u8 *buf, - const m512 low4bits, - const m512 zeroes) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = - pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); +static really_inline +const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk512AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk512(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk512(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk512(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk512(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); #endif u64a z = eq512mask(t, zeroes); @@ -1077,60 +995,43 @@ done: return buf - 1; } -static really_inline const u8 *fwdBlock2(m512 
mask1_lo, m512 mask1_hi, - m512 mask2_lo, m512 mask2_hi, - m512 chars, const u8 *buf, - const m512 low4bits, const m512 ones, - __mmask64 k) { +static really_inline +const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, const m512 low4bits, + const m512 ones, __mmask64 k) { DEBUG_PRINTF("buf %p %.64s\n", buf, buf); m512 chars_lo = and512(chars, low4bits); m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); - m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); - m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); - m512 t = or512(c_lo, c_hi); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); - dumpMsk512AsChars(chars); - printf("\n"); - DEBUG_PRINTF(" char: "); - dumpMsk512(chars); - printf("\n"); - DEBUG_PRINTF(" c_lo: "); - dumpMsk512(c_lo); - printf("\n"); - DEBUG_PRINTF(" c_hi: "); - dumpMsk512(c_hi); - printf("\n"); - DEBUG_PRINTF(" t: "); - dumpMsk512(t); - printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); #endif - m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); - m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); - dumpMsk512(c2_lo); - printf("\n"); - DEBUG_PRINTF(" c2_hi: "); - dumpMsk512(c2_hi); - printf("\n"); - DEBUG_PRINTF(" t2: "); - dumpMsk512(t2); - printf("\n"); + DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); #endif u64a z = eq512mask(t2, ones); return firstMatch64(buf, z | ~k); } -static really_inline const u8 * -shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, - const u8 *buf, const u8 *buf_end, const m512 low4bits, - const m512 ones) { +static really_inline +const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, + m512 mask2_hi, const u8 *buf, const u8 *buf_end, + const m512 low4bits, const m512 ones) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -1147,8 +1048,9 @@ shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, } /* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 922100e7a0a8ff487cf4a985757abf71b05f4540..2f0a55eab9abdbd86ed5bd2ea1894fcadd589aff 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source 
and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -632,8 +632,8 @@ bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, bool do_accel, bool impl_test_only, u32 hint, - const CompileContext &cc) { + bool compress_state, bool do_accel, bool impl_test_only, + bool &fast, u32 hint, const CompileContext &cc) { if (!has_managed_reports(h_in)) { rm = nullptr; } else { @@ -684,19 +684,19 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, } return generate(*h, state_ids, repeats, reportSquashMap, squashMap, tops, - zombies, do_accel, compress_state, hint, cc); + zombies, do_accel, compress_state, fast, hint, cc); } bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, const CompileContext &cc) { + bool compress_state, bool &fast, const CompileContext &cc) { const u32 hint = INVALID_NFA; const bool do_accel = cc.grey.accelerateNFA; const bool impl_test_only = false; return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state, - do_accel, impl_test_only, hint, cc); + do_accel, impl_test_only, fast, hint, cc); } #ifndef RELEASE_BUILD @@ -705,11 +705,11 @@ bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, u32 hint, const CompileContext &cc) { + bool compress_state, bool &fast, u32 hint, const CompileContext &cc) { const bool do_accel = cc.grey.accelerateNFA; const bool impl_test_only = false; - return constructNFA(h_in, rm, fixed_depth_tops, triggers, - compress_state, do_accel, impl_test_only, hint, cc); + return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state, + do_accel, impl_test_only, fast, hint, cc); } #endif // RELEASE_BUILD @@ -739,9 +739,10 @@ bytecode_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, vector repeats; unordered_map reportSquashMap; unordered_map squashMap; + UNUSED bool fast = false; return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops, - zombies, false, false, hint, cc); + zombies, false, false, fast, hint, cc); } bytecode_ptr constructReversedNFA(const NGHolder &h_in, diff --git a/src/nfagraph/ng_limex.h b/src/nfagraph/ng_limex.h index 9bf46d69399509f76dea0b7157291956eefe26fd..7eba2eff065f613f4af4e8f631b370c82cb84134 100644 --- a/src/nfagraph/ng_limex.h +++ b/src/nfagraph/ng_limex.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,7 +100,7 @@ bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, - bool compress_state, const CompileContext &cc); + bool compress_state, bool &fast, const CompileContext &cc); /** * \brief Build a reverse NFA from the graph given, which should have already @@ -129,7 +129,7 @@ bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, - bool compress_state, u32 hint, const CompileContext &cc); + bool compress_state, bool &fast, u32 hint, const CompileContext &cc); /** * \brief Build a reverse NFA (with model type hint) from the graph given, diff --git 
a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index ea0def0218aa94f465514d8048cf10be93f13421..d25ac43e8755cd465ab2f5e9013fdfda981793bf 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -69,14 +69,14 @@ struct LitGraphVertexProps { LitGraphVertexProps() = default; explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {} ue2_literal::elem c; // string element (char + bool) - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph }; struct LitGraphEdgeProps { LitGraphEdgeProps() = default; explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} u64a score = NO_LITERAL_AT_EDGE_SCORE; - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph }; struct LitGraph diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index d23ac408b04dbd1c53c3727173929a3392239c04..47cc82dae860d51c13031b264fc9923b146c185b 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -2446,6 +2446,10 @@ static bool doLitHaigSom(NG &ng, NGHolder &g, som_type som) { ue2_literal lit; shared_ptr rhs = make_shared(); + if (!rhs) { + assert(0); + throw std::bad_alloc(); + } if (!ng.cc.grey.allowLitHaig) { return false; } @@ -2510,6 +2514,11 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, ue2_literal lit; shared_ptr rhs = make_shared(); shared_ptr lhs = make_shared(); + if (!rhs || !lhs) { + assert(0); + throw std::bad_alloc(); + } + if (!splitOffBestLiteral(g, regions, &lit, &*lhs, &*rhs, ng.cc)) { return false; } diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 78d73082a5d36653d1106ec3c1f5be4787efa182..ba6b3501dc8b70a220aaf58bf81dd1e33d6f5fd9 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1036,6 +1036,11 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, shared_ptr lhs = make_shared(); shared_ptr rhs = make_shared(); + if (!lhs || !rhs) { + assert(0); + throw std::bad_alloc(); + } + unordered_map lhs_map; unordered_map rhs_map; @@ -1229,6 +1234,10 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index); unordered_map temp_map; shared_ptr new_lhs = make_shared(); + if (!new_lhs) { + assert(0); + throw std::bad_alloc(); + } splitLHS(h, pivot, new_lhs.get(), &temp_map); /* want to cut off paths to pivot from things other than the pivot - @@ -1310,6 +1319,10 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, if (!contains(done_rhs, adj)) { unordered_map temp_map; shared_ptr new_rhs = make_shared(); + if (!new_rhs) { + assert(0); + throw std::bad_alloc(); + } splitRHS(h, adj, new_rhs.get(), &temp_map); remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); @@ -2281,6 +2294,10 @@ void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, assert(!splitters.empty()); shared_ptr lhs = make_shared(); + if (!lhs) { + assert(0); + throw bad_alloc(); + } unordered_map v_map; cloneHolder(*lhs, base_graph, &v_map); lhs->kind = NFA_INFIX; diff --git a/src/parser/Parser.cpp b/src/parser/Parser.cpp index 36d0e053453d20a8d5afaf0ae2d8fc05f6f1a093..b18ec4efcb884d6a0e7c15bf741791f1cb5f3c31 100644 --- a/src/parser/Parser.cpp +++ b/src/parser/Parser.cpp @@ -6383,7 +6383,7 @@ unique_ptr parse(const char *ptr, ParseMode &globalMode) { // Ensure that all references are valid. 
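/* [Editorial note on the return-statement change below] Returning a local
 * smart pointer by name already moves (or elides the move entirely); wrapping
 * it in std::move defeats NRVO and trips -Wpessimizing-move on newer
 * compilers. Illustrative pattern (hypothetical type and helper):
 *
 *     static std::unique_ptr<Node> build() {
 *         std::unique_ptr<Node> n = make_node();
 *         return n; // implicit move / NRVO; no std::move needed
 *     }
 */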
checkReferences(*rootSeq, groupIndex, groupNames); - return move(rootSeq); + return rootSeq; } catch (LocatedParseError &error) { if (ts >= ptr && ts <= pe) { error.locate(ts - ptr); diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp index 49e060c9813f60166585ea795c6f5a9af52b3cc8..96c3bd89dea5921606564ec7eb520a481b13721e 100644 --- a/src/parser/logical_combination.cpp +++ b/src/parser/logical_combination.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, Intel Corporation + * Copyright (c) 2018-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "parser/parse_error.h" #include "util/container.h" #include "hs_compile.h" +#include "allocator.h" #include @@ -139,7 +140,8 @@ void ParsedLogical::validateSubIDs(const unsigned *ids, } hs_compile_error_t *compile_err = NULL; hs_expr_info_t *info = NULL; - hs_error_t err = hs_expression_info(expressions[i], flags[i], &info, + hs_error_t err = hs_expression_info(expressions[i], + flags ? flags[i] : 0, &info, &compile_err); if (err != HS_SUCCESS) { hs_free_compile_error(compile_err); @@ -151,7 +153,7 @@ void ParsedLogical::validateSubIDs(const unsigned *ids, if (info->unordered_matches) { throw CompileError("Have unordered match in sub-expressions."); } - free(info); + hs_misc_free(info); } } } diff --git a/src/parser/utf8_validate.cpp b/src/parser/utf8_validate.cpp index 50aa06d8e7832cdcb5a7da47f17fd76889dc81e4..54c9755e8a8aee449a2002489e39107b41c06a7f 100644 --- a/src/parser/utf8_validate.cpp +++ b/src/parser/utf8_validate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,7 +72,7 @@ bool isValidUtf8(const char *expression, const size_t len) { while (i < len) { DEBUG_PRINTF("byte %zu: 0x%02x\n", i, s[i]); // One octet. 
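/* [Editorial note on the comparison fix below] A single-octet UTF-8 code
 * point occupies 0x00-0x7f inclusive, so the old strict comparison pushed
 * 0x7f (DEL) into the multi-byte lead-byte checks, which rejected it. A
 * hedged boundary check, illustrative only:
 *
 *     assert(isValidUtf8("\x7f", 1));   // 0x7f is valid single-octet UTF-8
 *     assert(!isValidUtf8("\x80", 1));  // a lone continuation byte is not
 */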
- if (s[i] < 0x7f) { + if (s[i] <= 0x7f) { DEBUG_PRINTF("one octet\n"); i++; continue; diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 0f2d1083b6e4d88a8c5f7696fbccae6a194d09f9..579ce27835a1bf550d7494b613b164238ee18ae6 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -767,10 +767,10 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes((u8 *)&data - offset, ci->buf, c_len); + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == 32); - copy_upto_32_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); } else { if (offset + 32 > (s64a)ci->len) { if (offset >= (s64a)ci->len) { @@ -779,7 +779,7 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, } c_len = ci->len - offset; c_shift = 32 - c_len; - copy_upto_32_bytes((u8 *)&data, ci->buf + offset, c_len); + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); } else { data = loadu256(ci->buf + offset); } @@ -800,12 +800,90 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, return 0; } -// get 128/256 bits data from history and current buffer. +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckMask64(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m512 data = zeroes512(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 64; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift == 64 must hold. */ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 64 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 64 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 64 > 0) { + // part in current buffer. + c_len = offset + 64; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer.
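/* [Editorial sketch] The bookkeeping above stitches the 64-byte window
 * together from up to four regions:
 *
 *     | h_shift zeroes | h_len history bytes | c_len current bytes | c_shift zeroes |
 *
 * with h_shift + h_len + c_len + c_shift == 64 by construction. The
 * valid_data_mask computed below clears the bits for the two zero-filled
 * ends, so bytes outside the stream can never fail the mask check. */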
+ c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 64); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 64 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 64 - c_len; + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu512(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u64a valid_data_mask; + valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift); + + m512 and_mask_m512 = loadu512(and_mask); + m512 cmp_mask_m512 = loadu512(cmp_mask); + + if (validateMask64(data, valid_data_mask, and_mask_m512, + cmp_mask_m512, neg_mask)) { + DEBUG_PRINTF("Mask64 passed\n"); + return 1; + } + return 0; +} +#endif + +// get 128/256/512 bits data from history and current buffer. // return data and valid_data_mask. static rose_inline -u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, +u64a getBufferDataComplex(const struct core_info *ci, const s64a loc, u8 *data, const u32 data_len) { - assert(data_len == 16 || data_len == 32); + assert(data_len == 16 || data_len == 32 || data_len == 64); s32 c_shift = 0; // blank bytes after current. s32 h_shift = 0; // blank bytes before history. s32 h_len = data_len; // number of bytes from history buffer. @@ -831,10 +909,10 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes(data - loc, ci->buf, c_len); + copy_upto_64_bytes(data - loc, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == (s32)data_len); - copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len); } else { if (loc + data_len > (s64a)ci->len) { if (loc >= (s64a)ci->len) { @@ -843,8 +921,14 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, } c_len = ci->len - loc; c_shift = data_len - c_len; - copy_upto_32_bytes(data, ci->buf + loc, c_len); + copy_upto_64_bytes(data, ci->buf + loc, c_len); } else { +#ifdef HAVE_AVX512 + if (data_len == 64) { + storeu512(data, loadu512(ci->buf + loc)); + return ~0ULL; + } +#endif if (data_len == 16) { storeu128(data, loadu128(ci->buf + loc)); return 0xffff; @@ -857,6 +941,11 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); +#ifdef HAVE_AVX512 + if (data_len == 64) { + return (~0ULL) << (h_shift + c_shift) >> c_shift; + } +#endif if (data_len == 16) { return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; } else { @@ -886,6 +975,19 @@ m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { return *(m256 *)data; } +#ifdef HAVE_AVX512 +static rose_inline +m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) { + if (offset > 0 && offset + sizeof(m512) <= ci->len) { + *valid_data_mask = ~0ULL; + return loadu512(ci->buf + offset); + } + ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 64); + return *(m512 *)data; +} +#endif + static rose_inline int roseCheckShufti16x8(const struct 
core_info *ci, const u8 *nib_mask, const u8 *bucket_select_mask, u32 neg_mask, @@ -1025,6 +1127,83 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, } } +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_m512 = loadu512(hi_mask); + m512 lo_mask_m512 = loadu512(lo_mask); + m512 bucket_select_mask_m512 = loadu512(bucket_select_mask); + if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512, + bucket_select_mask_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x8 succeeded\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1, + const u8 *hi_mask_2, const u8 *lo_mask_1, + const u8 *lo_mask_2, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_1_m512 = loadu512(hi_mask_1); + m512 hi_mask_2_m512 = loadu512(hi_mask_2); + m512 lo_mask_1_m512 = loadu512(lo_mask_1); + m512 lo_mask_2_m512 = loadu512(lo_mask_2); + + m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi); + m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo); + if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512, + lo_mask_1_m512, lo_mask_2_m512, + bucket_select_mask_hi_m512, + bucket_select_mask_lo_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x16 succeeded\n"); + return 1; + } else { + return 0; + } +} +#endif + static rose_inline int roseCheckSingleLookaround(const struct RoseEngine *t, const struct hs_scratch *scratch, @@ -2068,6 +2247,12 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, &&LABEL_ROSE_INSTR_SET_EXHAUST, &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION +#ifdef HAVE_AVX512 + , + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-byte and/cmp/neg mask check.
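/* [Editorial note] This computed-goto table is indexed by the opcode values
 * of enum RoseInstructionCode (rose_program.h), so the three AVX-512 entries
 * must be appended in the same order as the new opcodes and under the same
 * HAVE_AVX512 gate; a mismatch would send the interpreter through the wrong
 * label. The enum in this patch keeps them contiguous at the end:
 *
 *     // from rose_program.h below; illustrative consistency check
 *     assert(LAST_ROSE_INSTRUCTION == ROSE_INSTR_CHECK_MASK_64);
 */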
+#endif }; #endif @@ -2258,6 +2443,45 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2, + ri->lo_mask_1, ri->lo_mask_2, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION +#endif + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { @@ -2886,6 +3110,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; const char *pc_base = getByOffset(t, programOffset); const char *pc = pc_base; @@ -2945,6 +3170,19 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + L_PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION +#endif + L_PROGRAM_CASE(CHECK_BYTE) { const struct core_info *ci = &scratch->core_info; if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, @@ -2969,6 +3207,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SOM_FROM_REPORT) { som = handleSomExternal(scratch, &ri->som, end); DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, @@ -2976,6 +3225,15 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(DEDUPE) { updateSeqPoint(tctxt, end, from_mpv); const char do_som = t->hasSom; // TODO: constant propagate diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 5cbb5c8486a8384cabd6a3a49774cafc9ae492a2..df464c2800a9a5ee9888ae8211ef05347196dd6c 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * 
Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -554,7 +554,8 @@ void findFixedDepthTops(const RoseGraph &g, const set &triggers, */ static bytecode_ptr pickImpl(bytecode_ptr dfa_impl, - bytecode_ptr nfa_impl) { + bytecode_ptr nfa_impl, + bool fast_nfa) { assert(nfa_impl); assert(dfa_impl); assert(isDfaType(dfa_impl->type)); @@ -584,7 +585,7 @@ bytecode_ptr pickImpl(bytecode_ptr dfa_impl, return nfa_impl; } } else { - if (n_accel) { + if (n_accel && fast_nfa) { return nfa_impl; } else { return dfa_impl; @@ -632,6 +633,15 @@ bytecode_ptr getDfa(raw_dfa &rdfa, bool is_transient, * bytecode and that they are usually run on small blocks */ dfa = mcshengCompile(rdfa, cc, rm); } + if (!dfa) { + dfa = sheng32Compile(rdfa, cc, rm, false); + } + if (!dfa) { + dfa = sheng64Compile(rdfa, cc, rm, false); + } + if (!dfa && !is_transient) { + dfa = mcshengCompile64(rdfa, cc, rm); + } if (!dfa) { // Sheng wasn't successful, so unleash McClellan! dfa = mcclellanCompile(rdfa, cc, rm, false); @@ -678,20 +688,21 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, } } + bool fast_nfa = false; auto n = constructNFA(holder, &rm, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); assert(n); if (oneTop && cc.grey.roseMcClellanSuffix) { if (cc.grey.roseMcClellanSuffix == 2 || n->nPositions > 128 || - !has_bounded_repeats_other_than_firsts(*n)) { + !has_bounded_repeats_other_than_firsts(*n) || !fast_nfa) { auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0), cc.grey); if (rdfa) { auto d = getDfa(*rdfa, false, cc, rm); assert(d); if (cc.grey.roseMcClellanSuffix != 2) { - n = pickImpl(move(d), move(n)); + n = pickImpl(move(d), move(n), fast_nfa); } else { n = move(d); } @@ -826,23 +837,24 @@ bytecode_ptr makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, n = constructLBR(*left.graph(), triggers.begin()->second, cc, rm); } + bool fast_nfa = false; if (!n && left.graph()) { map>> triggers; if (left.graph()->kind == NFA_INFIX) { findTriggerSequences(tbi, infixTriggers.at(left), &triggers); } n = constructNFA(*left.graph(), nullptr, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); } if (cc.grey.roseMcClellanPrefix == 1 && is_prefix && !left.dfa() && left.graph() - && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) { + && (!n || !has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { auto d = getDfa(*rdfa, is_transient, cc, rm); assert(d); - n = pickImpl(move(d), move(n)); + n = pickImpl(move(d), move(n), fast_nfa); } } @@ -1627,17 +1639,18 @@ public: const map fixed_depth_tops; /* no tops */ const map>> triggers; /* no tops */ bool compress_state = cc.streaming; + bool fast_nfa = false; auto n = constructNFA(h, &rm, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); // Try for a DFA upgrade. 
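/* [Editorial note] The new `fast` out-parameter threaded through generate()
 * and constructNFA() reports whether the compiled LimEx NFA is expected to
 * run quickly; pickImpl() now keeps an accelerated NFA only when it is also
 * flagged fast, preferring the DFA otherwise. The call pattern used
 * throughout this patch:
 *
 *     bool fast_nfa = false;
 *     auto n = constructNFA(h, &rm, fixed_depth_tops, triggers,
 *                           compress_state, fast_nfa, cc);
 *     ...
 *     n = pickImpl(move(d), move(n), fast_nfa); // DFA wins unless NFA is
 *                                               // accelerated and fast
 */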
if (n && cc.grey.roseMcClellanOutfix && - !has_bounded_repeats_other_than_firsts(*n)) { + (!has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) { auto rdfa = buildMcClellan(h, &rm, cc.grey); if (rdfa) { auto d = getDfa(*rdfa, false, cc, rm); if (d) { - n = pickImpl(move(d), move(n)); + n = pickImpl(move(d), move(n), fast_nfa); } } } diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index 33351099f70c84abea196108b274e1820a568386..d5b73cad550e095bac1674074e31e51eb8068119 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -562,6 +562,10 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, DEBUG_PRINTF("woot?\n"); shared_ptr h_new = make_shared(); + if (!h_new) { + assert(0); + throw std::bad_alloc(); + } unordered_map rhs_map; vector exits_vec; insert(&exits_vec, exits_vec.end(), exits); diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 8999daef25d80b7cf4fbfd982e3d6268bc3802fa..dbc938a5ca078cae8757be7fac0601aab7733dc6 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -757,13 +757,12 @@ CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) { static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, - const u8 *bucket_mask, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -779,14 +778,13 @@ void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask, - const u8 *bucket_mask_2, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask_2, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -970,6 +968,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK_64) { + os << " and_mask " + << dumpStrMask(ri->and_mask, sizeof(ri->and_mask)) + << endl; + os << " cmp_mask " + << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_BYTE) { os << " and_mask 0x" << std::hex << std::setw(2) << std::setfill('0') << u32{ri->and_mask} << std::dec @@ -1072,6 +1084,60 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << 
endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, ri->neg_mask, + ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + os << " hi_mask_1 " + << dumpStrMask(ri->hi_mask_1, sizeof(ri->hi_mask_1)) + << endl; + os << " hi_mask_2 " + << dumpStrMask(ri->hi_mask_2, sizeof(ri->hi_mask_2)) + << endl; + os << " lo_mask_1 " + << dumpStrMask(ri->lo_mask_1, sizeof(ri->lo_mask_1)) + << endl; + os << " lo_mask_2 " + << dumpStrMask(ri->lo_mask_2, sizeof(ri->lo_mask_2)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask_1, ri->hi_mask_1, + ri->lo_mask_2, ri->hi_mask_2, + ri->bucket_select_mask_lo, + ri->bucket_select_mask_hi, + ri->neg_mask, ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index c670e60334125919a0c84564b561c58da816f2e4..209889e558bb419b550cddc3caacd3f6abd98b0b 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -96,7 +96,7 @@ bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) { static bool requires_group_assignment(const rose_literal_id &lit, const rose_literal_info &info) { - if (lit.delay) { /* we will check the shadow's master */ + if (lit.delay) { /* we will check the shadow's leader */ return false; } diff --git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index c503f7311a5e6b63d7fca3bd313ca1bb7d47a709..f96221b247982b3e52dd91f0d1eb7b096dfeacf6 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,6 +162,17 @@ void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckMask64::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(and_mask), end(and_mask), inst->and_mask); + copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); @@ -227,6 +238,36 @@ 
void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckShufti64x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti64x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask_1), end(hi_mask_1), inst->hi_mask_1); + copy(begin(hi_mask_2), end(hi_mask_2), inst->hi_mask_2); + copy(begin(lo_mask_1), end(lo_mask_1), inst->lo_mask_1); + copy(begin(lo_mask_2), end(lo_mask_2), inst->lo_mask_2); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index 306a4166cb0e02ab3cc16eff22e63c04443051f8..f18f4a47152c2495ea93088ddb87bede7f360b43 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -519,6 +519,43 @@ public: } }; +class RoseInstrCheckMask64 + : public RoseInstrBaseOneTarget { +public: + std::array and_mask; + std::array cmp_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask64(std::array and_mask_in, + std::array cmp_mask_in, u64a neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + bool operator==(const RoseInstrCheckMask64 &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckMask64 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckByte : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x8(std::array hi_mask_in, + std::array lo_mask_in, + std::array 
bucket_select_mask_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)), + bucket_select_mask(std::move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x8 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti64x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask_1; + std::array hi_mask_2; + std::array lo_mask_1; + std::array lo_mask_2; + std::array bucket_select_mask_hi; + std::array bucket_select_mask_lo; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x16(std::array hi_mask_1_in, + std::array hi_mask_2_in, + std::array lo_mask_1_in, + std::array lo_mask_2_in, + std::array bucket_select_mask_hi_in, + std::array bucket_select_mask_lo_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask_1(std::move(hi_mask_1_in)), hi_mask_2(std::move(hi_mask_2_in)), + lo_mask_1(std::move(lo_mask_1_in)), lo_mask_2(std::move(lo_mask_2_in)), + bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)), + bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x16 &ri) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, + bucket_select_mask_hi, bucket_select_mask_lo, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckInfix : public RoseInstrBaseOneTarget &look, return true; } +static +bool makeRoleMask64(const vector &look, + RoseProgram &program, const target_t &target) { + if (!target.has_avx512()) { + return false; + } + + if (look.back().offset >= look.front().offset + 64) { + return false; + } + s32 base_offset = verify_s32(look.front().offset); + array and_mask, cmp_mask; + 
and_mask.fill(0); + cmp_mask.fill(0); + u64a neg_mask = 0; + for (const auto &entry : look) { + u8 andmask_u8, cmpmask_u8, flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { + return false; + } + u32 shift = entry.offset - base_offset; + assert(shift < 64); + and_mask[shift] = andmask_u8; + cmp_mask[shift] = cmpmask_u8; + if (flip) { + neg_mask |= 1ULL << shift; + } + } + + DEBUG_PRINTF("and_mask %s\n", + convertMaskstoString(and_mask.data(), 64).c_str()); + DEBUG_PRINTF("cmp_mask %s\n", + convertMaskstoString(cmp_mask.data(), 64).c_str()); + DEBUG_PRINTF("neg_mask %llx\n", neg_mask); + DEBUG_PRINTF("base_offset %d\n", base_offset); + + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(and_mask, cmp_mask, neg_mask, + base_offset, end_inst); + program.add_before_end(move(ri)); + return true; +} + // Sorting by the size of every bucket. // Used in map, cmpNibble>. struct cmpNibble { @@ -1084,6 +1127,7 @@ void getAllBuckets(const vector &look, } else { neg_mask ^= 1ULL << (entry.offset - base_offset); } + map lo2hi; // We treat Ascii Table as a 16x16 grid. // Push every row in cr into lo2hi and mark the row number. @@ -1237,6 +1281,7 @@ makeCheckShufti16x16(u32 offset_range, u8 bucket_idx, (hi_mask, lo_mask, bucket_select_mask_32, neg_mask & 0xffff, base_offset, end_inst); } + static unique_ptr makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, @@ -1255,10 +1300,83 @@ makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, } static -bool makeRoleShufti(const vector &look, RoseProgram &program) { +unique_ptr +makeCheckShufti64x8(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 8) { + return nullptr; + } + + array hi_mask_64; + array lo_mask_64; + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 48); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 32); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 48); + + return make_unique + (hi_mask_64, lo_mask_64, bucket_select_mask, + neg_mask, base_offset, end_inst); +} + +static +unique_ptr +makeCheckShufti64x16(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask_lo, + const array &bucket_select_mask_hi, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 16) { + return nullptr; + } + + array hi_mask_1; + array hi_mask_2; + array lo_mask_1; + array lo_mask_2; + + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 48); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin()); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 16); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 32); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, 
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin());
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 16);
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 32);
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 48);
+    copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin());
+    copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 16);
+    copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 32);
+    copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 48);
+
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin());
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 16);
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 32);
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 48);
+    copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin());
+    copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 16);
+    copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 32);
+    copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 48);
+
+    return make_unique<RoseInstrCheckShufti64x16>
+        (hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, bucket_select_mask_hi,
+         bucket_select_mask_lo, neg_mask, base_offset, end_inst);
+}
+static
+bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program,
+                    const target_t &target) {
+    s32 offset_limit;
+    if (target.has_avx512()) {
+        offset_limit = 64;
+    } else {
+        offset_limit = 32;
+    }
     s32 base_offset = verify_s32(look.front().offset);
-    if (look.back().offset >= base_offset + 32) {
+    if (look.back().offset >= base_offset + offset_limit) {
         return false;
     }
@@ -1266,17 +1384,40 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
     u64a neg_mask_64;
     array<u8, 32> hi_mask;
     array<u8, 32> lo_mask;
+    array<u8, 64> bucket_select_hi_64; // for AVX512
+    array<u8, 64> bucket_select_lo_64; // for AVX512
     array<u8, 32> bucket_select_hi;
     array<u8, 32> bucket_select_lo;
     hi_mask.fill(0);
     lo_mask.fill(0);
+    bucket_select_hi_64.fill(0);
+    bucket_select_lo_64.fill(0);
     bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
     bucket_select_lo.fill(0);
 
-    if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
-                        bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) {
-        return false;
+    if (target.has_avx512()) {
+        if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi_64.data(),
+                            bucket_select_lo_64.data(), neg_mask_64, bucket_idx,
+                            32)) {
+            return false;
+        }
+        copy(bucket_select_hi_64.begin(), bucket_select_hi_64.begin() + 32,
+             bucket_select_hi.begin());
+        copy(bucket_select_lo_64.begin(), bucket_select_lo_64.begin() + 32,
+             bucket_select_lo.begin());
+
+        DEBUG_PRINTF("bucket_select_hi_64 %s\n",
+                     convertMaskstoString(bucket_select_hi_64.data(), 64).c_str());
+        DEBUG_PRINTF("bucket_select_lo_64 %s\n",
+                     convertMaskstoString(bucket_select_lo_64.data(), 64).c_str());
+    } else {
+        if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
+                            bucket_select_lo.data(), neg_mask_64, bucket_idx,
+                            32)) {
+            return false;
+        }
     }
+
     u32 neg_mask = (u32)neg_mask_64;
 
     DEBUG_PRINTF("hi_mask %s\n",
@@ -1299,6 +1440,13 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
                                   bucket_select_lo, neg_mask, base_offset,
                                   end_inst);
     }
+    if (target.has_avx512()) {
+        if (!ri) {
+            ri = makeCheckShufti64x8(offset_range, bucket_idx, hi_mask, lo_mask,
+                                     bucket_select_lo_64, neg_mask_64,
+                                     base_offset, end_inst);
+        }
+    }
     if (!ri) {
         ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask,
                                   bucket_select_lo, bucket_select_hi,
@@ -1309,6 +1457,13 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
                                   bucket_select_lo, bucket_select_hi,
                                   neg_mask, base_offset, end_inst);
     }
+    if (target.has_avx512()) {
+        if (!ri) {
+            ri = makeCheckShufti64x16(offset_range, bucket_idx, hi_mask, lo_mask,
+                                      bucket_select_lo_64, bucket_select_hi_64,
+                                      neg_mask_64, base_offset, end_inst);
+        }
+    }
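+    // One of the variants above should have matched; they are tried from
+    // cheapest to most general: the 8-bucket forms before the 16-bucket
+    // forms, and within each, the 16/32-byte widths before the
+    // AVX-512-only 64-byte widths.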
     assert(ri);
     program.add_before_end(move(ri));
@@ -1321,7 +1476,7 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
  */
 static
 void
 makeLookaroundInstruction(const vector<LookEntry> &look,
-                          RoseProgram &program) {
+                          RoseProgram &program, const target_t &target) {
     assert(!look.empty());
 
     if (makeRoleByte(look, program)) {
@@ -1345,7 +1500,11 @@ void makeLookaroundInstruction(const vector<LookEntry> &look,
         return;
     }
 
-    if (makeRoleShufti(look, program)) {
+    if (makeRoleMask64(look, program, target)) {
+        return;
+    }
+
+    if (makeRoleShufti(look, program, target)) {
         return;
     }
 
@@ -1386,7 +1545,7 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id,
         return; // all caseful chars handled by HWLM mask.
     }
 
-    makeLookaroundInstruction(look, program);
+    makeLookaroundInstruction(look, program, build.cc.target_info);
 }
 
 static
@@ -1730,7 +1889,7 @@ void makeRoleLookaround(const RoseBuildImpl &build,
     findLookaroundMasks(build, v, look_more);
     mergeLookaround(look, look_more);
     if (!look.empty()) {
-        makeLookaroundInstruction(look, program);
+        makeLookaroundInstruction(look, program, build.cc.target_info);
     }
     return;
 }
diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h
index e5485476b7fb0f9d2071bad6197a8e39cb006a99..7e21303cb7cf46a432af0bdaa25d572a6f89d99c 100644
--- a/src/rose/rose_program.h
+++ b/src/rose/rose_program.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2019, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -208,7 +211,11 @@ enum RoseInstructionCode {
      */
     ROSE_INSTR_LAST_FLUSH_COMBINATION,
 
-    LAST_ROSE_INSTRUCTION = ROSE_INSTR_LAST_FLUSH_COMBINATION //!< Sentinel.
+    ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti.
+    ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti.
+    ROSE_INSTR_CHECK_MASK_64, //!< 64-byte and/cmp/neg mask check.
+
+    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel.
 };
 
 struct ROSE_STRUCT_END {
@@ -285,6 +289,15 @@ struct ROSE_STRUCT_CHECK_MASK_32 {
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
+struct ROSE_STRUCT_CHECK_MASK_64 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 and_mask[64]; //!< 64-byte and mask.
+    u8 cmp_mask[64]; //!< 64-byte cmp mask.
+    u64a neg_mask; //!< negation mask with 64 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
 struct ROSE_STRUCT_CHECK_BYTE {
     u8 code; //!< From enum RoseInstructionCode.
     u8 and_mask; //!< 8-bits and mask.
@@ -336,6 +349,29 @@ struct ROSE_STRUCT_CHECK_SHUFTI_32x16 {
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
+struct ROSE_STRUCT_CHECK_SHUFTI_64x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[64]; //!< High nibble mask in shufti.
+    u8 lo_mask[64]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[64]; //!< Mask for bucket assigning.
+    u64a neg_mask; //!< 64 bits negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_64x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask.
+    u8 hi_mask_2[64]; //!< 4 copies of 16-31 High nibble mask.
+    u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask.
+    u8 lo_mask_2[64]; //!< 4 copies of 16-31 Low nibble mask.
+    u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets.
+    u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets.
+ u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. diff --git a/src/rose/stream_long_lit.h b/src/rose/stream_long_lit.h index 3486760878db1bda0a54386726779df03badcc8a..df9b57f4e27f51ad32064b72665a250bb657f9c4 100644 --- a/src/rose/stream_long_lit.h +++ b/src/rose/stream_long_lit.h @@ -201,12 +201,12 @@ const u8 *prepScanBuffer(const struct core_info *ci, } else { // Copy: first chunk from history buffer. assert(overhang <= ci->hlen); - copy_upto_32_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, overhang); // Copy: second chunk from current buffer. size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; assert(copy_buf_len <= ci->len); - copy_upto_32_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len); // Read from our temporary buffer for the hash. base = tempbuf; } diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h index ac8cc312e8a543639041883657b6a44618bb30e7..8191db52f8a6c7818808b54bd21407c6522ba896 100644 --- a/src/rose/validate_mask.h +++ b/src/rose/validate_mask.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,17 @@ void validateMask32Print(const u8 *mask) { } printf("\n"); } + +#ifdef HAVE_AVX512 +static +void validateMask64Print(const u8 *mask) { + int i; + for (i = 0; i < 64; i++) { + printf("%02x ", mask[i]); + } + printf("\n"); +} +#endif #endif // check positive bytes in cmp_result. 
@@ -115,4 +126,29 @@ int validateMask32(const m256 data, const u32 valid_data_mask,
     }
 }
 
+#ifdef HAVE_AVX512
+static really_inline
+int validateMask64(const m512 data, const u64a valid_data_mask,
+                   const m512 and_mask, const m512 cmp_mask,
+                   const u64a neg_mask) {
+    u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask);
+#ifdef DEBUG
+    DEBUG_PRINTF("data\n");
+    validateMask64Print((const u8 *)&data);
+    // cmp_result is a 64-bit k-mask, not a 64-byte vector, so it is
+    // printed as hex below rather than dumped with validateMask64Print().
+#endif
+    DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask);
+    DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask);
+
+    if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) {
+        DEBUG_PRINTF("checkCompareResult64 passed\n");
+        return 1;
+    } else {
+        DEBUG_PRINTF("checkCompareResult64 failed\n");
+        return 0;
+    }
+}
+#endif
+
 #endif
diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h
index 1dc855d9933a3b878978da00b796cae9ca283beb..351df36a7664985f2fdc09f976425615399b491d 100644
--- a/src/rose/validate_shufti.h
+++ b/src/rose/validate_shufti.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -175,6 +175,84 @@ int validateShuftiMask32x16(const m256 data,
     return !cmp_result;
 }
 
+#ifdef HAVE_AVX512
+static really_inline
+int validateShuftiMask64x8(const m512 data, const m512 hi_mask,
+                           const m512 lo_mask, const m512 and_mask,
+                           const u64a neg_mask, const u64a valid_data_mask) {
+    m512 low4bits = set64x8(0xf);
+    m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits));
+    m512 c_hi = pshufb_m512(hi_mask,
+                            rshift64_m512(andnot512(low4bits, data), 4));
+    m512 t = and512(c_lo, c_hi);
+    u64a nresult = eq512mask(and512(t, and_mask), zeroes512());
+#ifdef DEBUG
+    DEBUG_PRINTF("data\n");
+    dumpMask(&data, 64);
+    DEBUG_PRINTF("hi_mask\n");
+    dumpMask(&hi_mask, 64);
+    DEBUG_PRINTF("lo_mask\n");
+    dumpMask(&lo_mask, 64);
+    DEBUG_PRINTF("c_lo\n");
+    dumpMask(&c_lo, 64);
+    DEBUG_PRINTF("c_hi\n");
+    dumpMask(&c_hi, 64);
+    DEBUG_PRINTF("nresult %llx\n", nresult);
+    DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask);
+#endif
+    u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask;
+    return !cmp_result;
+}
+
+static really_inline
+int validateShuftiMask64x16(const m512 data,
+                            const m512 hi_mask_1, const m512 hi_mask_2,
+                            const m512 lo_mask_1, const m512 lo_mask_2,
+                            const m512 and_mask_hi, const m512 and_mask_lo,
+                            const u64a neg_mask, const u64a valid_data_mask) {
+    m512 low4bits = set64x8(0xf);
+    m512 data_lo = and512(data, low4bits);
+    m512 data_hi = and512(rshift64_m512(data, 4), low4bits);
+    m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo);
+    m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo);
+    m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi);
+    m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi);
+    m512 t1 = and512(c_lo_1, c_hi_1);
+    m512 t2 = and512(c_lo_2, c_hi_2);
+    m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi));
+    u64a nresult = eq512mask(result, zeroes512());
+#ifdef DEBUG
+    DEBUG_PRINTF("data\n");
+    dumpMask(&data, 64);
+    DEBUG_PRINTF("data_lo\n");
+    dumpMask(&data_lo, 64);
+    DEBUG_PRINTF("data_hi\n");
+    dumpMask(&data_hi, 64);
+    DEBUG_PRINTF("hi_mask_1\n");
+    dumpMask(&hi_mask_1, 64);
+    DEBUG_PRINTF("hi_mask_2\n");
+    dumpMask(&hi_mask_2, 64);
+    DEBUG_PRINTF("lo_mask_1\n");
+    dumpMask(&lo_mask_1, 64);
+
DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 64); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 64); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 64); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 64); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 64); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 64); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif + static really_inline int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) { u32 t = ~(data | hi_bits); diff --git a/src/runtime.c b/src/runtime.c index a3659348c529c4a008457c287bdd9cd71199a6a0..a055e5f4f53899bdd6acb4f2ed2308a9bc6029af 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1013,6 +1013,7 @@ hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, report_eod_matches(id, scratch, onEvent, context); if (unlikely(internal_matching_error(scratch))) { unmarkScratchInUse(scratch); + hs_stream_free(id); return HS_UNKNOWN_ERROR; } unmarkScratchInUse(scratch); diff --git a/src/scratch.c b/src/scratch.c index 25991e2bbad481ba52926742d95b9ce351007887..9f6d77cdc4efc4bd4f18300703a0cf0e41ad0642 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/scratch.h b/src/scratch.h index 1256f7aba82e587b663ff694ae5f3ac599b312d8..e3cd9245218831b3973e68883f0b0339feaa4c2a 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 345edfe95033a16e60684d4108906b10594e633a..ea89669a8bdb64953edfb9aab58264a6409d9a55 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,7 +78,7 @@ namespace ue2 { struct LitTrieVertexProps { LitTrieVertexProps() = default; explicit LitTrieVertexProps(u8 c_in) : c(c_in) {} - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph u8 c = 0; //!< character reached on this vertex flat_set reports; //!< managed reports fired on this vertex }; @@ -793,6 +793,12 @@ bytecode_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, bytecode_ptr dfa = nullptr; if (cc.grey.allowSmallWriteSheng) { dfa = shengCompile(rdfa, cc, rm, only_accel_init, &accel_states); + if (!dfa) { + dfa = sheng32Compile(rdfa, cc, rm, only_accel_init, &accel_states); + } + if (!dfa) { + dfa = sheng64Compile(rdfa, cc, rm, only_accel_init, &accel_states); + } } if (!dfa) { dfa = mcclellanCompile(rdfa, cc, rm, only_accel_init, diff --git 
a/src/state.h b/src/state.h index 9ade59db4bb9ce8e4680246e02a095850c279424..68600a910f63305f8bd90ce69f5dbfada1d100d9 100644 --- a/src/state.h +++ b/src/state.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/stream_compress_impl.h b/src/stream_compress_impl.h index d1ccf5e6d03485c508277a9b418bd9e0c93ddea1..f02543efa59167a09e33b3f162f59022433db554 100644 --- a/src/stream_compress_impl.h +++ b/src/stream_compress_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, Intel Corporation + * Copyright (c) 2017-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/util/alloc.h b/src/util/alloc.h index de20c8d028e4112f377ad8bf5f00d21209b65589..49b4a824d10d639e7310d16e6e450238f37e757e 100644 --- a/src/util/alloc.h +++ b/src/util/alloc.h @@ -76,7 +76,11 @@ public: T *allocate(std::size_t size) const { size_t alloc_size = size * sizeof(T); - return static_cast(aligned_malloc_internal(alloc_size, N)); + T *ptr = static_cast(aligned_malloc_internal(alloc_size, N)); + if (!ptr) { + throw std::bad_alloc(); + } + return ptr; } void deallocate(T *x, std::size_t) const noexcept { diff --git a/src/util/copybytes.h b/src/util/copybytes.h index 872b8d2893cc48c7ede55d377cd828afa37566e8..7f37d96bc5fca4f21dee92cd1bcd03a346ffe90b 100644 --- a/src/util/copybytes.h +++ b/src/util/copybytes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ #include "simd_utils.h" static really_inline -void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { +void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) { switch (len) { case 0: break; @@ -72,14 +72,41 @@ void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { case 16: storeu128(dst, loadu128(src)); break; + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; case 32: storeu256(dst, loadu256(src)); break; +#ifdef HAVE_AVX512 + case 64: + storebytes512(dst, loadu512(src), 64); + break; default: - assert(len < 32); - storeu128(dst + len - 16, loadu128(src + len - 16)); - storeu128(dst, loadu128(src)); + assert(len < 64); + u64a k = (1ULL << len) - 1; + storeu_mask_m512(dst, k, loadu_maskz_m512(k, src)); + break; +#else + default: + assert(0); break; +#endif } } diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index 1ad2ee4cd635743ae3bbb4554298388487864a86..e0f63684deb8699a0c87079f1c7632d17e6182dc 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -50,6 +50,11 @@ u64a cpuid_flags(void) { cap |= HS_CPU_FEATURES_AVX512; } + if (check_avx512vbmi()) { + DEBUG_PRINTF("AVX512VBMI 
enabled\n"); + cap |= HS_CPU_FEATURES_AVX512VBMI; + } + #if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) cap &= ~HS_CPU_FEATURES_AVX2; #endif @@ -58,6 +63,11 @@ u64a cpuid_flags(void) { (defined(FAT_RUNTIME) && !defined(BUILD_AVX512)) cap &= ~HS_CPU_FEATURES_AVX512; #endif + +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) + cap &= ~HS_CPU_FEATURES_AVX512VBMI; +#endif #endif return cap; } @@ -106,6 +116,11 @@ static const struct family_id known_microarch[] = { { 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */ { 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */ + { 0x6, 0x7D, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x7E, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x6A, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon-D */ + { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ + }; #endif @@ -122,6 +137,8 @@ const char *dumpTune(u32 tune) { T_CASE(HS_TUNE_FAMILY_BDW); T_CASE(HS_TUNE_FAMILY_SKL); T_CASE(HS_TUNE_FAMILY_SKX); + T_CASE(HS_TUNE_FAMILY_ICL); + T_CASE(HS_TUNE_FAMILY_ICX); } #undef T_CASE return "unknown"; diff --git a/src/util/cpuid_inline.h b/src/util/cpuid_inline.h index 4af43ed28bb49336bb31bd2f9194b16fc6e4348d..b228c1d6b9a0b0f101d671af5235b93e07970df6 100644 --- a/src/util/cpuid_inline.h +++ b/src/util/cpuid_inline.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,11 +78,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, #define CPUID_HTT (1 << 28) // Structured Extended Feature Flags Enumeration Leaf ECX values +#define CPUID_AVX512VBMI (1 << 1) + +// Structured Extended Feature Flags Enumeration Leaf EBX values #define CPUID_BMI (1 << 3) #define CPUID_AVX2 (1 << 5) #define CPUID_BMI2 (1 << 8) - -// Structured Extended Feature Flags Enumeration Leaf EBX values #define CPUID_AVX512F (1 << 16) #define CPUID_AVX512BW (1 << 30) @@ -96,7 +97,7 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, #define CPUID_XCR0_AVX512 \ (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) -#if defined(__x86_64__) +#if defined(__x86_64__) static inline u64a xgetbv(u32 op) { #if defined(_WIN32) || defined(__INTEL_COMPILER) @@ -191,6 +192,51 @@ int check_avx512(void) { #endif } +static inline +int check_avx512vbmi(void) { +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512VBMI); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (!(ebx & CPUID_AVX512BW)) { + DEBUG_PRINTF("AVX512BW instructions not enabled\n"); + return 0; + } + + if (ecx & CPUID_AVX512VBMI) { + DEBUG_PRINTF("AVX512VBMI instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + static inline int check_ssse3(void) { unsigned int eax, ebx, ecx, edx; diff --git 
a/src/util/graph.h b/src/util/graph.h index 660afd0299583411080b5d96b944539efa4ce6e4..3e18dae55276cae6be5f166b8fa1c44e96135139 100644 --- a/src/util/graph.h +++ b/src/util/graph.h @@ -170,6 +170,7 @@ find_vertices_in_cycles(const Graph &g) { assert(!comp.empty()); if (comp.size() > 1) { insert(&rv, comp); + continue; } vertex_descriptor v = *comp.begin(); if (hasSelfLoop(v, g)) { diff --git a/src/util/graph_undirected.h b/src/util/graph_undirected.h index 049964ab075f141df84bf334c2ed9090154fa718..507172847386454719e84eee63e8b52f90bbdb82 100644 --- a/src/util/graph_undirected.h +++ b/src/util/graph_undirected.h @@ -70,8 +70,8 @@ class undirected_graph_edge_descriptor using base_vertex_type = typename base_graph_traits::vertex_descriptor; base_edge_type underlying_edge; - const base_graph_type *g; - bool reverse; // if true, reverse vertices in source() and target() + const base_graph_type *g = nullptr; + bool reverse = false; // if true, reverse vertices in source() and target() inline std::pair canonical_edge() const { diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 62d39ec2f19352b6f73e7ec7f0820015a36b0a9e..b3f96eab2a5081a84edb5da8eaae91b99b72dce8 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -30,9 +30,9 @@ #define SIMD_TYPES_H #include "config.h" -#include "ue2common.h" #include "util/arch.h" #include "util/intrinsics.h" +#include "ue2common.h" #if defined(HAVE_SSE2) typedef __m128i m128; @@ -54,34 +54,21 @@ typedef float64x2_t __m128d; typedef __m128i m128; #else -typedef struct ALIGN_DIRECTIVE { - u64a hi; - u64a lo; -} m128; - +typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; #endif #if defined(HAVE_AVX2) typedef __m256i m256; #else -typedef struct ALIGN_AVX_DIRECTIVE { - m128 lo; - m128 hi; -} m256; +typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif -typedef struct { - m128 lo; - m128 mid; - m128 hi; -} m384; +typedef struct {m128 lo; m128 mid; m128 hi;} m384; #if defined(HAVE_AVX512) typedef __m512i m512; #else -typedef struct ALIGN_ATTR(64) { - m256 lo; - m256 hi; -} m512; +typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif #endif /* SIMD_TYPES_H */ + diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 8a54ccc74964120fc31c5d2cbc3d545742333c8d..f23e0ed184187e57c16fdcb4e7b344f16b79c237 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h index f01e50277836ca78f53f33cf6b6e2e846f25f8f0..fe53590a2d08c158eeaaebcac0ee1033bfd03864 100644 --- a/src/util/simd_x86.h +++ b/src/util/simd_x86.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -138,6 +138,12 @@ m128 lshift64_m128(m128 a, unsigned b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) +#if defined(HAVE_AVX512) +static really_inline m128 cast512to128(const m512 in) { + return _mm512_castsi512_si128(in); +} +#endif + static really_inline m128 set16x8(u8 c) { return _mm_set1_epi8(c); } @@ -154,14 +160,6 @@ static really_inline u32 movd(const m128 in) { return 
_mm_cvtsi128_si32(in);
 }
 
-#if defined(HAVE_AVX512)
-static really_inline u32 movd512(const m512 in) {
-    // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
-    // so we use 2-step convertions to work around.
-    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
-}
-#endif
-
 static really_inline u64a movq(const m128 in) {
 #if defined(ARCH_X86_64)
     return _mm_cvtsi128_si64(in);
@@ -172,6 +170,20 @@ static really_inline u64a movq(const m128 in) {
 #endif
 }
 
+#if defined(HAVE_AVX512)
+static really_inline u32 movd512(const m512 in) {
+    // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
+    // so we use 2-step conversions to work around it.
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
+}
+
+static really_inline u64a movq512(const m512 in) {
+    // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
+    // so we use 2-step conversions to work around it.
+    return movq(_mm512_castsi512_si128(in));
+}
+#endif
+
 /* another form of movq */
 static really_inline
 m128 load_m128_from_u64a(const u64a *p) {
@@ -215,6 +227,24 @@ static really_inline m128 or128(m128 a, m128 b) {
     return _mm_or_si128(a,b);
 }
 
+#if defined(HAVE_AVX512VBMI)
+static really_inline m512 expand128(m128 a) {
+    return _mm512_broadcast_i32x4(a);
+}
+
+static really_inline m512 expand256(m256 a) {
+    return _mm512_broadcast_i64x4(a);
+}
+
+static really_inline m512 expand384(m384 a) {
+    u64a *lo = (u64a*)&a.lo;
+    u64a *mid = (u64a*)&a.mid;
+    u64a *hi = (u64a*)&a.hi;
+    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
+                            lo[1], lo[0]);
+}
+#endif
+
 static really_inline m128 andnot128(m128 a, m128 b) {
     return _mm_andnot_si128(a, b);
 }
@@ -1009,6 +1039,11 @@ m512 set8x64(u64a a) {
     return _mm512_set1_epi64(a);
 }
 
+static really_inline
+m512 set16x32(u32 a) {
+    return _mm512_set1_epi32(a);
+}
+
 static really_inline
 m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
                u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
@@ -1026,6 +1061,26 @@ static really_inline m512 set4x128(m128 a) {
     return _mm512_broadcast_i32x4(a);
 }
+
+static really_inline
+m512 sadd_u8_m512(m512 a, m512 b) {
+    return _mm512_adds_epu8(a, b);
+}
+
+static really_inline
+m512 max_u8_m512(m512 a, m512 b) {
+    return _mm512_max_epu8(a, b);
+}
+
+static really_inline
+m512 min_u8_m512(m512 a, m512 b) {
+    return _mm512_min_epu8(a, b);
+}
+
+static really_inline
+m512 sub_u8_m512(m512 a, m512 b) {
+    return _mm512_sub_epi8(a, b);
+}
 #endif
 
 static really_inline
@@ -1213,6 +1268,22 @@ m512 loadu512(const void *ptr) {
 #endif
 }
 
+// unaligned store
+static really_inline
+void storeu512(void *ptr, m512 a) {
+#if defined(HAVE_AVX512)
+    _mm512_storeu_si512((m512 *)ptr, a);
+#elif defined(HAVE_AVX2)
+    storeu256(ptr, a.lo);
+    storeu256((char *)ptr + 32, a.hi);
+#else
+    storeu128(ptr, a.lo.lo);
+    storeu128((char *)ptr + 16, a.lo.hi);
+    storeu128((char *)ptr + 32, a.hi.lo);
+    storeu128((char *)ptr + 48, a.hi.hi);
+#endif
+}
+
 #if defined(HAVE_AVX512)
 static really_inline
 m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
@@ -1224,10 +1295,20 @@ m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
     return _mm512_mask_loadu_epi8(src, k, ptr);
 }
 
+static really_inline
+void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
+    _mm512_mask_storeu_epi8(ptr, k, a);
+}
+
 static really_inline
 m512 set_mask_m512(__mmask64 k) {
     return _mm512_movm_epi8(k);
 }
+
+static really_inline
+m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
+    return _mm256_maskz_loadu_epi8(k, ptr);
+}
 #endif
 
 // packed unaligned store of first N bytes
diff --git 
a/src/util/target_info.cpp b/src/util/target_info.cpp index 3a41e02078d367ad0c45e4df0e3ff90466e0bb6b..66ba5f5acc54974f5ce17f82c37d554edb2c8e13 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -50,6 +50,10 @@ bool target_t::can_run_on_code_built_for(const target_t &code_target) const { return false; } + if (!has_avx512vbmi() && code_target.has_avx512vbmi()) { + return false; + } + return true; } @@ -64,6 +68,10 @@ bool target_t::has_avx512(void) const { return cpu_features & HS_CPU_FEATURES_AVX512; } +bool target_t::has_avx512vbmi(void) const { + return cpu_features & HS_CPU_FEATURES_AVX512VBMI; +} + bool target_t::is_atom_class(void) const { return tune == HS_TUNE_FAMILY_SLM || tune == HS_TUNE_FAMILY_GLM; } diff --git a/src/util/target_info.h b/src/util/target_info.h index 794b2985579a409097531f2a252fb4d54b5ee360..f64573aedad9156cd115942fc7ad0e113324f77d 100644 --- a/src/util/target_info.h +++ b/src/util/target_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,8 @@ struct target_t { bool has_avx512(void) const; + bool has_avx512vbmi(void) const; + bool is_atom_class(void) const; // This asks: can this target (the object) run on code that was built for diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 0aa846896efcdc1b9c286c8875857c3972b4cf67..f436936d710b9a7301ba2784d43b2cfe60af3798 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -133,7 +133,7 @@ public: : lit(&lit_in), idx(idx_in) {} const ue2_literal *lit = nullptr; - size_t idx; + size_t idx = 0; }; using const_reverse_iterator = std::reverse_iterator; diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h index 3385e4418b4c04f76f83d2143ed65f39643da976..262104aca2d9af9c775daccc58f7f7575ec30fe3 100644 --- a/src/util/uniform_ops.h +++ b/src/util/uniform_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -101,6 +101,18 @@ #define or_m384(a, b) (or384(a, b)) #define or_m512(a, b) (or512(a, b)) +#if defined(HAVE_AVX512VBMI) +#define expand_m128(a) (expand128(a)) +#define expand_m256(a) (expand256(a)) +#define expand_m384(a) (expand384(a)) +#define expand_m512(a) (a) + +#define shuffle_byte_m128(a, b) (pshufb_m512(b, a)) +#define shuffle_byte_m256(a, b) (vpermb512(a, b)) +#define shuffle_byte_m384(a, b) (vpermb512(a, b)) +#define shuffle_byte_m512(a, b) (vpermb512(a, b)) +#endif + #define and_u8(a, b) ((a) & (b)) #define and_u32(a, b) ((a) & (b)) #define and_u64a(a, b) ((a) & (b)) diff --git a/tools/hsbench/data_corpus.cpp b/tools/hsbench/data_corpus.cpp index 8e761ec34fc503aa2ea82d682935181672fb1121..b23da1fb3b3a6722d7986dff684ad2aee0484d45 100644 --- a/tools/hsbench/data_corpus.cpp +++ b/tools/hsbench/data_corpus.cpp @@ -58,7 +58,10 @@ void readRow(sqlite3_stmt *statement, vector &blocks, } auto internal_stream_index = stream_indices[stream_id]; - assert(blob || bytes > 0); + if 
(!(blob && bytes > 0)) { + assert(0); + throw std::domain_error("Invalid blob or bytes from sqlite3."); + } blocks.emplace_back(id, stream_id, internal_stream_index, string(blob, blob + bytes)); } diff --git a/tools/hsbench/engine.h b/tools/hsbench/engine.h index e41f9948c25f8fb3b3124e36b8428cd5cd88b83a..aea1c816256665db10a24888153d373f6dd078fd 100644 --- a/tools/hsbench/engine.h +++ b/tools/hsbench/engine.h @@ -88,6 +88,8 @@ public: virtual void printStats() const = 0; + virtual void printCsvStats() const = 0; + virtual void sqlStats(SqlDB &db) const = 0; }; diff --git a/tools/hsbench/engine_chimera.cpp b/tools/hsbench/engine_chimera.cpp index 8a15c5bee2480442a1d14d0c2496e1c06cbbbadc..24a99d61f507b72e7580b17df649eb3be6532a2d 100644 --- a/tools/hsbench/engine_chimera.cpp +++ b/tools/hsbench/engine_chimera.cpp @@ -187,6 +187,16 @@ void EngineChimera::printStats() const { #endif } +void EngineChimera::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"0x%x\"", compile_stats.crc32); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); +} + void EngineChimera::sqlStats(SqlDB &sqldb) const { ostringstream crc; crc << "0x" << hex << compile_stats.crc32; diff --git a/tools/hsbench/engine_chimera.h b/tools/hsbench/engine_chimera.h index 8e2cd0f6ca3ea38492d5d6b4069d1a6b298cc51f..187dec8cbd47cfa3c5902d6026f3fa4443223949 100644 --- a/tools/hsbench/engine_chimera.h +++ b/tools/hsbench/engine_chimera.h @@ -89,6 +89,8 @@ public: void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index 79c58f77dc4b7baeccf0c894a041bfb0f948bf8e..734f5334e26527cf957bf8fd9c347f2bad01a43a 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -175,7 +175,7 @@ unique_ptr EngineHyperscan::streamOpen(EngineContext &ectx, return nullptr; } stream->sn = streamId; - return move(stream); + return stream; } void EngineHyperscan::streamClose(unique_ptr stream, @@ -276,6 +276,17 @@ void EngineHyperscan::printStats() const { #endif } +void EngineHyperscan::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"0x%x\"", compile_stats.crc32); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.streamSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); +} + void EngineHyperscan::sqlStats(SqlDB &sqldb) const { ostringstream crc; crc << "0x" << hex << compile_stats.crc32; diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h index a8105d753a675c4b2d92f79ac4d28aad42f8282d..afbdf098d591072437ac0aa8092cd8cd8e1051c0 100644 --- a/tools/hsbench/engine_hyperscan.h +++ b/tools/hsbench/engine_hyperscan.h @@ -65,8 +65,8 @@ public: class EngineHSStream : public EngineStream { public: ~EngineHSStream(); - hs_stream_t *id; - EngineHSContext *ctx; + hs_stream_t *id = nullptr; + EngineHSContext *ctx = nullptr; }; /** Hyperscan Engine for scanning data. 
*/ @@ -98,6 +98,8 @@ public: void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/engine_pcre.cpp b/tools/hsbench/engine_pcre.cpp index 85616e987aa8b52a662bcaf132955f7d03f75c83..23ab9d176c033a666a7653d09b4bbf1d9feaa645 100644 --- a/tools/hsbench/engine_pcre.cpp +++ b/tools/hsbench/engine_pcre.cpp @@ -227,6 +227,15 @@ void EnginePCRE::printStats() const { #endif } +void EnginePCRE::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); +} + void EnginePCRE::sqlStats(SqlDB &sqldb) const { ostringstream crc; diff --git a/tools/hsbench/engine_pcre.h b/tools/hsbench/engine_pcre.h index 2e7dad9c54465c13f37b0fb09af2a9deca706a80..9569bef480dbe9cd1cbf795f37253339fd638d16 100644 --- a/tools/hsbench/engine_pcre.h +++ b/tools/hsbench/engine_pcre.h @@ -62,7 +62,7 @@ public: struct PcreDB { bool highlander = false; bool utf8 = false; - u32 id; + u32 id = 0; pcre *db = nullptr; pcre_extra *extra = nullptr; }; @@ -97,6 +97,8 @@ public: void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 4e65c8e0bc6a02bc63426318246d44802b664e09..22becbd14857a10dd083cc5cd3e2a2db799b9503 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -98,6 +98,7 @@ bool display_per_scan = false; ScanMode scan_mode = ScanMode::STREAMING; bool useHybrid = false; bool usePcre = false; +bool dumpCsvOut = false; unsigned repeats = 20; string exprPath(""); string corpusFile(""); @@ -211,6 +212,7 @@ void usage(const char *error) { printf(" Benchmark with threads on specified CPUs or CPU" " range.\n"); #endif + printf(" -C Dump CSV output for tput matrix.\n"); printf(" -i DIR Don't compile, load from files in DIR" " instead.\n"); printf(" -w DIR After compiling, save to files in DIR.\n"); @@ -275,6 +277,9 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'c': corpusFile.assign(optarg); break; + case 'C': + dumpCsvOut = true; + break; case 'd': { unsigned dist; if (!fromString(optarg, dist)) { @@ -755,6 +760,11 @@ u64a byte_size(const vector &corpus_blocks) { total += block.payload.size(); } + if (total == 0) { + assert(0); + throw std::invalid_argument("Empty corpus."); + } + return total; } @@ -849,6 +859,40 @@ void displayResults(const vector> &threads, } } +/** Dump benchmark results to csv. */ +static +void displayCsvResults(const vector> &threads, + const vector &corpus_blocks) { + u64a bytesPerRun = byte_size(corpus_blocks); + u64a matchesPerRun = threads[0]->results[0].matches; + + // Sanity check: all of our results should have the same match count. 
+    for (const auto &t : threads) {
+        if (!all_of(begin(t->results), end(t->results),
+                    [&matchesPerRun](const ResultEntry &e) {
+                        return e.matches == matchesPerRun;
+                    })) {
+            printf("\nWARNING: PER-SCAN MATCH COUNTS ARE INCONSISTENT!\n\n");
+            break;
+        }
+    }
+
+    // Total scan time across all threads and repeats.
+    double totalSecs = 0;
+    for (const auto &t : threads) {
+        for (const auto &r : t->results) {
+            totalSecs += r.seconds;
+        }
+    }
+
+    u64a totalBytes = bytesPerRun * repeats * threads.size();
+    u64a totalBlocks = corpus_blocks.size() * repeats * threads.size();
+    printf(",\"%0.3f\"", totalSecs);
+    printf(",\"%0.2Lf\"", calc_mbps(totalSecs, totalBytes));
+
+    assert(bytesPerRun);
+    double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun;
+    printf(",\"%llu\"", matchesPerRun);
+    printf(",\"%0.3f\"", matchRate);
+
+    double blockRate = (double)totalBlocks / (double)totalSecs;
+    printf(",\"%0.2f\"", blockRate);
+    printf("\n");
+}
+
+
 /** Dump per-scan throughput data to sql. */
 static
 void sqlPerScanResults(const vector<unique_ptr<ThreadContext>> &threads,
@@ -982,7 +1026,9 @@ void runBenchmark(const Engine &db,
         t->join();
     }
 
-    if (sqloutFile.empty()) {
+    if (dumpCsvOut) {
+        displayCsvResults(threads, corpus_blocks);
+    } else if (sqloutFile.empty()) {
         // Display global results.
         displayResults(threads, corpus_blocks);
     } else {
@@ -1059,7 +1105,9 @@ int HS_CDECL main(int argc, char *argv[]) {
         exit(1);
     }
 
-    if (sqloutFile.empty()) {
+    if (dumpCsvOut) {
+        engine->printCsvStats();
+    } else if (sqloutFile.empty()) {
         // Display global results.
         engine->printStats();
         printf("\n");
diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt
index 777ae9f0fbd18b5e94bd47ca582e9486bce598e6..27c93b5715738c0ec8ded8b2e49ac62279d60082 100644
--- a/tools/hscollider/CMakeLists.txt
+++ b/tools/hscollider/CMakeLists.txt
@@ -21,7 +21,6 @@ set_source_files_properties(
     PROPERTIES
         COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}")
 
-
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386")
     ragelmaker(ColliderCorporaParser.rl)
 endif()
diff --git a/tools/hscollider/DatabaseProxy.h b/tools/hscollider/DatabaseProxy.h
index 831ab1484f5dd8e79ed79082150344d2a6c4af90..f6957d2968aa597e056b5aec052a6a3fa4c9bda7 100644
--- a/tools/hscollider/DatabaseProxy.h
+++ b/tools/hscollider/DatabaseProxy.h
@@ -61,7 +61,7 @@ public:
         std::lock_guard<std::mutex> lock(mutex);
         if (failed) {
             // We have previously failed to compile this database.
- return nullptr; + throw CompileFailed("Unable to compile db previously."); } if (db) { return db; diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp index 66ae270be8af91963afb3dcad5f4655db0f0e92b..4de320e172e0707b2f5bfb2206665c4e8be18227 100644 --- a/tools/hscollider/NfaGeneratedCorpora.cpp +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -101,7 +101,7 @@ void NfaGeneratedCorpora::generate(unsigned id, vector &data) { pl.logicalKeyRenumber(); const auto &m_lkey = pl.getLkeyMap(); assert(!m_lkey.empty()); - u32 a_subid; // arbitrary sub id + u32 a_subid = 0; // arbitrary sub id unordered_map> m_data; for (const auto &it : m_lkey) { a_subid = it.first; diff --git a/tools/hscollider/Thread.cpp b/tools/hscollider/Thread.cpp index 5fff82398ce62f014fb972962ca29dfcf15a97ff..c63793d963dd4093b1c17c4245ca21b6bb1b6dc4 100644 --- a/tools/hscollider/Thread.cpp +++ b/tools/hscollider/Thread.cpp @@ -98,6 +98,6 @@ void *Thread::runThread(void *thr) { } -Thread::Thread(size_t num) : thread_id(num) {} +Thread::Thread(size_t num) : thread_id(num), thread() {} Thread::~Thread() {} diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index 038fbf777d85b5f32ea8fef697dc9a91f1208b59..6fd6051b7967a7c8c7049be320410a2c5e744f05 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ -1080,7 +1080,7 @@ shared_ptr UltimateTruth::compile(const set &ids, } } - return move(db); + return db; } bool UltimateTruth::allocScratch(shared_ptr db) { diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index 2eb510e0020e295b41017c7ff29f68b50de71d64..8a52d99fa28e76289247d42f38d1aa02ce3542d5 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -503,8 +503,8 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, } else if (in_corpora) { corpora->push_back(optarg); in_corpora = 2; - break; } + break; case 0: break; default: diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index afa6ef5a92b36025a6367d2b0655031a58f80f00..57b36f1b7c6c24364951693c40ab2bd19fe4aada 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -1831,11 +1831,11 @@ unique_ptr buildCorpora(const vector &corporaFiles, exit_with_fail(); } } - return move(c); /* move allows unique_ptr conversion */ + return c; /* move allows unique_ptr conversion */ } else { auto c = ue2::make_unique( exprMap, corpus_gen_prop, force_utf8, force_prefilter); - return move(c); + return c; } } diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index 7d580e410f0ab8c628f0674b052f1e0733c074a0..5f4fb5672df9464828af2a86e04aae913d328fed 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -38,6 +38,7 @@ #if defined(HAVE_SIGACTION) || defined(_WIN32) #include +#define STACK_SIZE 8192 // linux kernel default stack size for x86 #endif #ifdef HAVE_BACKTRACE @@ -175,7 +176,7 @@ void installSignalHandler(void) { } #ifdef HAVE_SIGALTSTACK -static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ]; +static TLS_VARIABLE char alt_stack_loc[STACK_SIZE]; #endif void setSignalStack(void) { @@ -187,7 +188,7 @@ void setSignalStack(void) { stack_t alt_stack; memset(&alt_stack, 0, sizeof(alt_stack)); alt_stack.ss_flags = 0; - alt_stack.ss_size = SIGSTKSZ; + alt_stack.ss_size = STACK_SIZE; alt_stack.ss_sp = alt_stack_loc; if (!sigaltstack(&alt_stack, nullptr)) { act.sa_flags |= SA_ONSTACK; diff --git a/unit/hyperscan/single.cpp b/unit/hyperscan/single.cpp index 
01fbfeab56c4c429dbd85f0deb22ba16acd654ff..07269cf002de595840386722cd6b64b641b2e32d 100644 --- a/unit/hyperscan/single.cpp +++ b/unit/hyperscan/single.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -363,8 +363,9 @@ static const unsigned validModes[] = { // Mode bits for switching off various architecture features static const unsigned long long featureMask[] = { ~0ULL, /* native */ - ~(HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512), /* no avx2 */ - ~HS_CPU_FEATURES_AVX512, /* no avx512 */ + ~(HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX512VBMI), /* no avx2 */ + ~(HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX512VBMI), /* no avx512 */ + ~HS_CPU_FEATURES_AVX512VBMI, /* no avx512vbmi */ }; INSTANTIATE_TEST_CASE_P(Single, diff --git a/unit/internal/database.cpp b/unit/internal/database.cpp index 8f0c1a695de9a8ba49ca8c4c11499f7f62694410..0070fbc96565cdcb7bce778eac3ffd6d04ef0798 100644 --- a/unit/internal/database.cpp +++ b/unit/internal/database.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,6 +56,10 @@ TEST(DB, flagsToPlatform) { p.cpu_features |= HS_CPU_FEATURES_AVX512; #endif +#if defined(HAVE_AVX512VBMI) + p.cpu_features |= HS_CPU_FEATURES_AVX512VBMI; +#endif + platform_t pp = target_to_platform(target_t(p)); ASSERT_EQ(pp, hs_current_platform); } diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index c70ceeae1cd2151f690c5e4635420abef7a43e7e..28433c968c0910fde2e41f0ca9b4832b1aa49dc5 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -83,9 +83,10 @@ protected: const map fixed_depth_tops; const map>> triggers; bool compress_state = false; + bool fast_nfa = false; nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state, - type, cc); + fast_nfa, type, cc); ASSERT_TRUE(nfa != nullptr); full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); @@ -376,9 +377,10 @@ protected: const map fixed_depth_tops; const map>> triggers; bool compress_state = false; + bool fast_nfa = false; nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state, - type, cc); + fast_nfa, type, cc); ASSERT_TRUE(nfa != nullptr); full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 7d5177868f9cf5b04a553e7a0b11f70e5fc5dee0..22945d612dbf93398c0790230ac8034df636b34c 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -40,7 +40,7 @@ using namespace ue2; namespace { // Switch one bit on in a bitmask. 
-template +template Mask setbit(unsigned int bit) { union { Mask simd; @@ -148,7 +148,7 @@ m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } -template +template class SimdUtilsTest : public testing::Test { // empty }; @@ -260,9 +260,9 @@ TYPED_TEST(SimdUtilsTest, or2) { for (unsigned j = 0; j < 8; j++) { for (unsigned i = 0; i < 32; i++) { - m256 x = setbit(j * 32 + i); + m256 x = setbit(j*32+i); m256 y = zeroes256(); - ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j * 32 + i << " not happy"; + ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j*32+i << " not happy"; } } @@ -431,8 +431,8 @@ TYPED_TEST(SimdUtilsTest, testbit) { for (unsigned int i = 0; i < total_bits; i++) { TypeParam a = setbit(i); for (unsigned int j = 0; j < total_bits; j++) { - ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) - << "bit " << i << " is wrong"; + ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) << "bit " << i + << " is wrong"; } } } @@ -455,6 +455,7 @@ TYPED_TEST(SimdUtilsTest, setbit) { simd_setbit(&a, i); } ASSERT_FALSE(simd_diff(simd_ones(), a)); + } TYPED_TEST(SimdUtilsTest, diffrich) { @@ -667,6 +668,7 @@ TEST(SimdUtilsTest, movq) { ASSERT_EQ(r, 0x123456789abcdef); } + TEST(SimdUtilsTest, set16x8) { char cmp[sizeof(m128)]; @@ -678,7 +680,7 @@ TEST(SimdUtilsTest, set16x8) { } TEST(SimdUtilsTest, set4x32) { - u32 cmp[4] = {0x12345678, 0x12345678, 0x12345678, 0x12345678}; + u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; m128 simd = set4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } @@ -712,51 +714,51 @@ TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); - EXPECT_TRUE( - !diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); - - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); - 
EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); - EXPECT_TRUE( - !diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), + variable_byte_shift_m128(in, 0))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), + variable_byte_shift_m128(in, -1))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), + variable_byte_shift_m128(in, -2))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), + variable_byte_shift_m128(in, -3))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), + variable_byte_shift_m128(in, -4))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), + variable_byte_shift_m128(in, -5))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), + variable_byte_shift_m128(in, -6))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), + variable_byte_shift_m128(in, -7))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), + variable_byte_shift_m128(in, -8))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), + variable_byte_shift_m128(in, -9))); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), + variable_byte_shift_m128(in, -10))); + + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), + variable_byte_shift_m128(in, 0))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), + variable_byte_shift_m128(in, 1))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), + variable_byte_shift_m128(in, 2))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), + variable_byte_shift_m128(in, 3))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), + variable_byte_shift_m128(in, 4))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), + variable_byte_shift_m128(in, 5))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), + variable_byte_shift_m128(in, 6))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), + variable_byte_shift_m128(in, 7))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), + variable_byte_shift_m128(in, 8))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), + variable_byte_shift_m128(in, 9))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), + variable_byte_shift_m128(in, 10))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); @@ -783,12 +785,12 @@ TEST(SimdUtilsTest, min_u8_m128) { } TEST(SimdUtilsTest, sadd_u8_m128) { - unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; m128 in1 = loadu128(base1); m128 in2 = loadu128(base2); m128 result = sadd_u8_m128(in1, in2); @@ -797,11 +799,11 @@ TEST(SimdUtilsTest, sadd_u8_m128) { TEST(SimdUtilsTest, sub_u8_m128) { unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; - unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char 
expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10,
+                             0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0};
     m128 in1 = loadu128(base1);
     m128 in2 = loadu128(base2);
     m128 result = sub_u8_m128(in1, in2);
diff --git a/unit/internal/utf8_validate.cpp b/unit/internal/utf8_validate.cpp
index 0335794203355c362893602d733a178e2e651e8e..03f529036f4a2299f2ec16b58917495c4dda537f 100644
--- a/unit/internal/utf8_validate.cpp
+++ b/unit/internal/utf8_validate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2022, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -64,8 +64,8 @@ static ValidUtf8TestInfo valid_utf8_tests[] = {
     {"공동경비구역", true},
     {"জলসাঘর", true},
 
-    // Invalid one-byte caseS.
-    {"\x7f", false},
+    // Valid one-byte cases.
+    {"\x7f", true}, // \x7f is valid
 
     // These bytes should never appear in a UTF-8 stream.
     {"\xc0", false},
diff --git a/util/ng_corpus_editor.cpp b/util/ng_corpus_editor.cpp
index ac4f8b65405fef32bb91e7197f68d2ceaf9c6c24..c1149216dee71bc13ee70f8be3633a0381c4b7ff 100644
--- a/util/ng_corpus_editor.cpp
+++ b/util/ng_corpus_editor.cpp
@@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
 unichar CorpusEditorUtf8::chooseCodePoint(void) {
     /* We need to ensure that we don't pick a surrogate cp */
     const u32 range =
-        MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
+        MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
     unichar raw = props.rand(0, range - 1);
     if (raw < UNICODE_SURROGATE_MIN) {
         return raw;
     } else {
-        return raw + UNICODE_SURROGATE_MAX + 1;
+        return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
     }
 }
diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp
index e5e8e06cd95f822ddf78f2577de212ad1163cc64..f796cd45f57ceece9008dc698287041646422960 100644
--- a/util/ng_corpus_generator.cpp
+++ b/util/ng_corpus_generator.cpp
@@ -477,14 +477,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
  * that we've been asked for. */
 unichar CorpusGeneratorUtf8::getRandomChar() {
     u32 range = MAX_UNICODE + 1
-                - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
+                - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
     range = min(cProps.alphabetSize, range);
     assert(range);
 
     unichar c = 'a' + cProps.rand(0, range - 1);
 
     if (c >= UNICODE_SURROGATE_MIN) {
-        c =+ UNICODE_SURROGATE_MAX + 1;
+        c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
     }
 
     return c % (MAX_UNICODE + 1);
diff --git a/util/ng_corpus_properties.cpp b/util/ng_corpus_properties.cpp
index e784e05827a75059abea40aaf35805a7fbc7a9ad..511ad60ac9c618d68cc2cba93cbb080a99ee5d72 100644
--- a/util/ng_corpus_properties.cpp
+++ b/util/ng_corpus_properties.cpp
@@ -42,7 +42,7 @@ CorpusProperties::CorpusProperties()
     : matchness(100), unmatchness(0), randomness(0), prefixRange(0, 0),
       suffixRange(0, 0), cycleMin(1), cycleMax(1),
       corpusLimit(DEFAULT_CORPUS_GENERATOR_LIMIT), editDistance(0),
-      alphabetSize(~0), rngSeed(0) {
     // empty
 }
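
The two corpus-utility hunks above hinge on the same arithmetic: a uniform index in [0, range) must be mapped onto the non-surrogate code points by skipping the 2048 surrogates U+D800..U+DFFF. A minimal standalone C sketch of that corrected mapping (the pick_codepoint() harness is illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define MAX_UNICODE           0x10FFFF
    #define UNICODE_SURROGATE_MIN 0xD800
    #define UNICODE_SURROGATE_MAX 0xDFFF

    /* Map a uniform index in [0, range) to a valid non-surrogate code
     * point, mirroring the corrected expressions in the diff above. */
    static uint32_t pick_codepoint(uint32_t raw) {
        if (raw < UNICODE_SURROGATE_MIN) {
            return raw;
        }
        /* Skip the surrogate block D800..DFFF entirely. */
        return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
    }

    int main(void) {
        const uint32_t range =
            MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
        assert(pick_codepoint(0) == 0);
        assert(pick_codepoint(UNICODE_SURROGATE_MIN - 1) == 0xD7FF);
        assert(pick_codepoint(UNICODE_SURROGATE_MIN) == 0xE000);
        assert(pick_codepoint(range - 1) == MAX_UNICODE);
        return 0;
    }

Because range excludes exactly the size of the surrogate block, adding (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1) to any index at or above UNICODE_SURROGATE_MIN lands past U+DFFF without ever overshooting U+10FFFF; the old "+ UNICODE_SURROGATE_MAX + 1" form both over-shrank the range and skipped too far.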