//! memchr implementation with portable SIMD instructions using packed_simd. Nightly-only.
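//!
//! Assumed build setup (not stated in the original gist): a nightly toolchain
//! with the `packed_simd` crate (e.g. `packed_simd = "0.3"`) in Cargo.toml.
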
// This is derived from BurntSushi's rust-memchr, and is re-released under
// the same licensing conditions as rust-memchr.
//
// Dual-licensed under MIT or the UNLICENSE.
//
// LICENSE
// =======
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
mod private {
    use core::cmp;
    use core::mem::size_of;
    use core::ops::BitOr;
    use core::slice;
    use packed_simd::*;

    const VECTOR_SIZE: usize = size_of::<i8x16>();
    const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

    // The number of bytes scanned in one iteration of memchr/memrchr.
    const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

    // The number of bytes scanned in one iteration of memchr2/memrchr2 and
    // memchr3/memrchr3. There was no observable difference between 64 and 32
    // bytes in benchmarks. memchr3 in particular gets only a very slight
    // speedup from the loop unrolling.
    const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

    pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }
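
        // Search the first VECTOR_SIZE bytes with an unaligned load; the
        // aligned loop below may then re-scan a few of these bytes, which is
        // harmless.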
        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
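
        // Round ptr up to the next VECTOR_SIZE-aligned address. For example,
        // if start_ptr's address ends in 0x5, then (start_ptr & VECTOR_ALIGN)
        // is 5 and ptr advances by 11 bytes to a 16-byte boundary. Every byte
        // skipped here was already covered by the unaligned search above.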
        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let c = i8x16_load(ptr.add(2 * VECTOR_SIZE));
            let d = i8x16_load(ptr.add(3 * VECTOR_SIZE));
            let eqa = vn1.eq(a);
            let eqb = vn1.eq(b);
            let eqc = vn1.eq(c);
            let eqd = vn1.eq(d);
            let or1 = eqa.bitor(eqb);
            let or2 = eqc.bitor(eqd);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa.any() {
                    return Some(at + forward_pos(eqa.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb.any() {
                    return Some(at + forward_pos(eqb.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqc.any() {
                    return Some(at + forward_pos(eqc.bitmask()));
                }

                at += VECTOR_SIZE;
                debug_assert!(eqd.bitmask() != 0);
                return Some(at + forward_pos(eqd.bitmask()));
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
            if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
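
        // Fewer than VECTOR_SIZE bytes remain. Back ptr up so that one final
        // vector load ends exactly at end_ptr; this re-scans some bytes that
        // were already checked, but avoids a scalar tail loop.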
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search1(start_ptr, end_ptr, ptr, vn1);
        }
        None
    }

    pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 || *ptr == n2 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }

        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }

        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa1.any() || eqa2.any() {
                    return Some(at + forward_pos2(eqa1.bitmask(), eqa2.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb1.any() || eqb2.any() {
                    return Some(at + forward_pos2(eqb1.bitmask(), eqb2.bitmask()));
                }
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
        }
        None
    }

    pub unsafe fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let vn3 = i8x16::splat(n3 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }

        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }

        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let eqa3 = vn3.eq(a);
            let eqb3 = vn3.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = eqa3.bitor(eqb3);
            let or4 = or1.bitor(or2);
            let or5 = or3.bitor(or4);
            if or5.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa1.any() || eqa2.any() || eqa3.any() {
                    return Some(at + forward_pos3(eqa1.bitmask(), eqa2.bitmask(), eqa3.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb1.any() || eqb2.any() || eqb3.any() {
                    return Some(at + forward_pos3(eqb1.bitmask(), eqb2.bitmask(), eqb3.bitmask()));
                }
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
        }
        None
    }

    pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }
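
        // Search the last VECTOR_SIZE bytes with an unaligned load, then
        // round ptr down to the previous VECTOR_SIZE-aligned address; bytes
        // between that boundary and end_ptr have just been covered.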
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let c = i8x16_load(ptr.add(2 * VECTOR_SIZE));
            let d = i8x16_load(ptr.add(3 * VECTOR_SIZE));
            let eqa = vn1.eq(a);
            let eqb = vn1.eq(b);
            let eqc = vn1.eq(c);
            let eqd = vn1.eq(d);
            let or1 = eqa.bitor(eqb);
            let or2 = eqc.bitor(eqd);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
                if eqd.any() {
                    return Some(at + reverse_pos(eqd.bitmask()));
                }

                at -= VECTOR_SIZE;
                if eqc.any() {
                    return Some(at + reverse_pos(eqc.bitmask()));
                }

                at -= VECTOR_SIZE;
                if eqb.any() {
                    return Some(at + reverse_pos(eqb.bitmask()));
                }

                at -= VECTOR_SIZE;
                debug_assert!(eqa.bitmask() != 0);
                return Some(at + reverse_pos(eqa.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
        }
        None
    }

    pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 || *ptr == n2 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }

        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        // `loop_size` is derived from LOOP_SIZE2 above, so it must be compared
        // against LOOP_SIZE2 here; otherwise the unrolled loop never runs.
        while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
                if eqb1.any() || eqb2.any() {
                    return Some(at + reverse_pos2(eqb1.bitmask(), eqb2.bitmask()));
                }

                at -= VECTOR_SIZE;
                return Some(at + reverse_pos2(eqa1.bitmask(), eqa2.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
        }
        None
    }

    pub unsafe fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let vn3 = i8x16::splat(n3 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }

        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        // As in memrchr2, compare against LOOP_SIZE2, which `loop_size` was
        // derived from; otherwise the unrolled loop never runs.
        while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let eqa3 = vn3.eq(a);
            let eqb3 = vn3.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = eqa3.bitor(eqb3);
            let or4 = or1.bitor(or2);
            let or5 = or3.bitor(or4);
            if or5.any() {
                let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
                if eqb1.any() || eqb2.any() || eqb3.any() {
                    return Some(at + reverse_pos3(eqb1.bitmask(), eqb2.bitmask(), eqb3.bitmask()));
                }

                at -= VECTOR_SIZE;
                return Some(at + reverse_pos3(eqa1.bitmask(), eqa2.bitmask(), eqa3.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
        }
        None
    }

    pub unsafe fn forward_search1(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq = vn1.eq(chunk);
        if eq.any() {
            Some(sub(ptr, start_ptr) + forward_pos(eq.bitmask()))
        } else {
            None
        }
    }

    unsafe fn forward_search2(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let or = eq1.bitor(eq2);
        if or.any() {
            Some(sub(ptr, start_ptr) + forward_pos2(eq1.bitmask(), eq2.bitmask()))
        } else {
            None
        }
    }

    unsafe fn forward_search3(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
        vn3: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let eq3 = vn3.eq(chunk);
        let or1 = eq1.bitor(eq2);
        let or2 = or1.bitor(eq3);
        if or2.any() {
            Some(sub(ptr, start_ptr) + forward_pos3(eq1.bitmask(), eq2.bitmask(), eq3.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search1(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq = vn1.eq(chunk);
        if eq.any() {
            Some(sub(ptr, start_ptr) + reverse_pos(eq.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search2(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let or = eq1.bitor(eq2);
        if or.any() {
            Some(sub(ptr, start_ptr) + reverse_pos2(eq1.bitmask(), eq2.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search3(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
        vn3: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let eq3 = vn3.eq(chunk);
        let or1 = eq1.bitor(eq2);
        let or2 = or1.bitor(eq3);
        if or2.any() {
            Some(sub(ptr, start_ptr) + reverse_pos3(eq1.bitmask(), eq2.bitmask(), eq3.bitmask()))
        } else {
            None
        }
    }

    /// Compute the position of the first matching byte from the given mask.
    /// The position returned is always in the range [0, 15].
    ///
    /// The mask given is expected to be the result of `bitmask()` on a vector
    /// comparison (the portable analogue of `_mm_movemask_epi8`).
    fn forward_pos(mask: u16) -> usize {
        // We are dealing with little endian here, where the most significant
        // byte is at a higher address. That means the least significant bit
        // that is set corresponds to the position of our first matching byte,
        // and that position equals the number of trailing zeros in the mask.
        mask.trailing_zeros() as usize
    }
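
    // Example: if the loaded chunk begins b"xxzy..." and the needle is b'z',
    // the comparison bitmask is 0b0000_0000_0000_0100; trailing_zeros() == 2,
    // the index of the first match within the chunk.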

    /// Compute the position of the first matching byte from the given masks.
    /// The position returned is always in the range [0, 15]. Each mask
    /// corresponds to the equality comparison against a single needle byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn forward_pos2(mask1: u16, mask2: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0);
        forward_pos(mask1 | mask2)
    }

    /// Compute the position of the first matching byte from the given masks.
    /// The position returned is always in the range [0, 15]. Each mask
    /// corresponds to the equality comparison against a single needle byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn forward_pos3(mask1: u16, mask2: u16, mask3: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
        forward_pos(mask1 | mask2 | mask3)
    }

    /// Compute the position of the last matching byte from the given mask.
    /// The position returned is always in the range [0, 15].
    ///
    /// The mask given is expected to be the result of `bitmask()` on a vector
    /// comparison (the portable analogue of `_mm_movemask_epi8`).
    fn reverse_pos(mask: u16) -> usize {
        // We are dealing with little endian here, where the most significant
        // byte is at a higher address. That means the most significant bit
        // that is set corresponds to the position of our last matching byte.
        // The position from the end of the mask is the number of leading
        // zeros in a 16-bit integer, so the position from the start of the
        // mask is 16 - (leading zeros) - 1.
        VECTOR_SIZE - mask.leading_zeros() as usize - 1
    }
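
    // Example: if lanes 1 and 2 of the chunk match, the bitmask is
    // 0b0000_0000_0000_0110; leading_zeros() == 13, so the last match is at
    // 16 - 13 - 1 == 2.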

    /// Compute the position of the last matching byte from the given masks.
    /// The position returned is always in the range [0, 15]. Each mask
    /// corresponds to the equality comparison against a single needle byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn reverse_pos2(mask1: u16, mask2: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0);
        reverse_pos(mask1 | mask2)
    }

    /// Compute the position of the last matching byte from the given masks.
    /// The position returned is always in the range [0, 15]. Each mask
    /// corresponds to the equality comparison against a single needle byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn reverse_pos3(mask1: u16, mask2: u16, mask3: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
        reverse_pos(mask1 | mask2 | mask3)
    }

    /// Subtract `b` from `a` and return the difference. `a` should be greater
    /// than or equal to `b`.
    fn sub(a: *const u8, b: *const u8) -> usize {
        debug_assert!(a >= b);
        (a as usize) - (b as usize)
    }

    // Load 128 bits from a ptr into i8x16.
    unsafe fn i8x16_load(ptr: *const u8) -> i8x16 {
        let slc = slice::from_raw_parts(ptr as *const i8, 16);
        i8x16::from_slice_unaligned_unchecked(slc)
    }
} // private

#[inline(always)]
pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr(n1, haystack) }
}

#[inline(always)]
pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr2(n1, n2, haystack) }
}

#[inline(always)]
pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr3(n1, n2, n3, haystack) }
}

#[inline(always)]
pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr(n1, haystack) }
}

#[inline(always)]
pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr2(n1, n2, haystack) }
}

#[inline(always)]
pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr3(n1, n2, n3, haystack) }
}
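
// A minimal sanity check, not part of the original gist. It assumes this file
// is compiled as a crate root on nightly with `packed_simd` available; the
// haystack below is an arbitrary illustrative value.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn finds_bytes_forward_and_backward() {
        let haystack = b"the quick brown fox jumps over the lazy dog";
        // Short haystacks (< 16 bytes) take the scalar fallback path.
        assert_eq!(memchr(b'b', b"abc"), Some(1));
        // Longer haystacks exercise the vectorized paths.
        assert_eq!(memchr(b'q', haystack), Some(4));
        assert_eq!(memchr(b'!', haystack), None);
        assert_eq!(memchr2(b'z', b'q', haystack), Some(4));
        assert_eq!(memchr3(b'z', b'q', b'j', haystack), Some(4));
        assert_eq!(memrchr(b'o', haystack), Some(41));
        assert_eq!(memrchr2(b'o', b'g', haystack), Some(42));
        assert_eq!(memrchr3(b'o', b'g', b'z', haystack), Some(42));
    }
}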