|
use clap::{App, Arg}; |
|
use memmap::Mmap; |
|
use serde::Deserialize; |
|
use serde_json::Value; |
|
use serde::de; |
|
use serde::de::{Deserializer, MapAccess, SeqAccess, Visitor}; |
|
|
|
// use std::collections::{HashMap, HashSet}; |
|
use fnv::{FnvHashMap as HashMap, FnvHashSet as HashSet}; |
|
use std::error::Error; |
|
use std::result::Result; |
|
use std::fmt; |
|
|
|
// use smallstr::SmallString; |
|
// type S = SmallString<[u8; 16]>; |
|
// String type used for company names and phone numbers in every record and
// aggregate type in this file; kept behind an alias so it can be swapped out.
type S = String;
|
|
|
// Source data: the shape of a single record parsed from the input JSON.
|
#[derive(Default, Debug, Deserialize)] |
|
struct DebtRec { |
|
company: S, |
|
phones: Vec<S>, |
|
debt: f64, |
|
} |
|
|
|
#[derive(Default, Debug)] |
|
struct ManualRec { |
|
company: S, |
|
phones: Vec<S>, |
|
debt: f64, |
|
} |
|
|
|
// Result data: debtors aggregated from the parsed records.
|
#[derive(Default)] |
|
struct Debtor { |
|
companies: HashSet<S>, |
|
phones: HashSet<S>, |
|
debt: f64, |
|
} |
|
|
|
#[derive(Default)] |
|
struct Debtors { |
|
all: Vec<Debtor>, |
|
by_phone: HashMap<S, usize>, |
|
} |
|
|
|
use std::arch::x86_64::{ |
|
__m128i, _mm_cmpestri, _mm_loadu_si128, _SIDD_CMP_EQUAL_ANY, _SIDD_UBYTE_OPS, |
|
}; |
|
/// Number of bytes held by one 128-bit SSE register, i.e. the size of one
/// search chunk and the maximum number of needle bytes.
const CHUNK_LEN: usize = 16;

/// A set of at most [`CHUNK_LEN`] bytes that can be located in a buffer.
///
/// When the CPU supports SSE4.2 the search compares 16 haystack bytes at a
/// time with `_mm_cmpestri`; otherwise a plain byte-by-byte scan is used.
struct Pattern {
    /// Needle bytes (zero padded) loaded into an SSE register.
    needle: __m128i,
    /// Number of meaningful bytes at the start of `needle` / `bytes`.
    needle_len: usize,
    /// The same zero-padded needle bytes, kept for the scalar fallback.
    bytes: [u8; CHUNK_LEN],
    /// Whether SSE4.2 was detected on the running CPU at construction time.
    has_sse42: bool,
}

impl Pattern {
    /// Builds a pattern matching any one of `bytes`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` holds more than [`CHUNK_LEN`] entries.
    fn new(bytes: &[u8]) -> Self {
        assert!(
            bytes.len() <= CHUNK_LEN,
            "Pattern supports at most {} needle bytes, got {}",
            CHUNK_LEN,
            bytes.len()
        );
        let mut padded = [0u8; CHUNK_LEN];
        padded[..bytes.len()].copy_from_slice(bytes);
        // SAFETY: `padded` is exactly 16 readable bytes and `_mm_loadu_si128`
        // has no alignment requirement; SSE2 is part of the x86_64 baseline.
        let needle = unsafe { _mm_loadu_si128(padded.as_ptr() as *const __m128i) };
        Self {
            needle,
            needle_len: bytes.len(),
            bytes: padded,
            // Detected once so `find` does not need to re-check on every call.
            has_sse42: is_x86_feature_detected!("sse4.2"),
        }
    }

    /// Returns the index of the first byte of `buf` that equals any needle
    /// byte, or `buf.len()` when there is none.
    fn find(&self, buf: &[u8]) -> usize {
        if self.has_sse42 {
            // SAFETY: SSE4.2 availability was verified in `new`.
            unsafe { self.find_sse42(buf) }
        } else {
            self.find_scalar(buf)
        }
    }

    /// Portable fallback for `find` on CPUs without SSE4.2.
    fn find_scalar(&self, buf: &[u8]) -> usize {
        let needles = &self.bytes[..self.needle_len];
        buf.iter()
            .position(|b| needles.contains(b))
            .unwrap_or(buf.len())
    }

    /// SSE4.2 implementation of `find`.
    ///
    /// # Safety
    ///
    /// The caller must ensure the running CPU supports SSE4.2.
    #[target_feature(enable = "sse4.2")]
    unsafe fn find_sse42(&self, buf: &[u8]) -> usize {
        let mut chunks = buf.chunks_exact(CHUNK_LEN);
        let mut offset = 0;
        for chunk in chunks.by_ref() {
            let hit = self.find_in_chunk(chunk);
            if hit < CHUNK_LEN {
                return offset + hit;
            }
            offset += CHUNK_LEN;
        }

        // The tail is shorter than a register: copy it into a zero-padded
        // scratch chunk and only accept hits that fall inside the real data.
        let tail = chunks.remainder();
        let mut padded = [0u8; CHUNK_LEN];
        padded[..tail.len()].copy_from_slice(tail);
        let hit = self.find_in_chunk(&padded);
        if hit < tail.len() {
            offset + hit
        } else {
            buf.len()
        }
    }

    /// Searches the first [`CHUNK_LEN`] bytes of `haystack` and returns the
    /// index of the first byte equal to any needle byte, or [`CHUNK_LEN`]
    /// when none matches.
    ///
    /// # Panics
    ///
    /// Panics if `haystack` is shorter than [`CHUNK_LEN`].
    ///
    /// # Safety
    ///
    /// The caller must ensure the running CPU supports SSE4.2.
    #[inline]
    #[target_feature(enable = "sse4.2")]
    unsafe fn find_in_chunk(&self, haystack: &[u8]) -> usize {
        // The slice expression bounds-checks, so the unaligned 16-byte load
        // below never reads past the end of `haystack`.
        let p = haystack[..CHUNK_LEN].as_ptr();
        let haystack = _mm_loadu_si128(p as *const __m128i);
        _mm_cmpestri(
            self.needle,
            self.needle_len as i32,
            haystack,
            CHUNK_LEN as i32,
            _SIDD_CMP_EQUAL_ANY | _SIDD_UBYTE_OPS,
        ) as usize
    }
}
|
|
|
impl<'de> Deserialize<'de> for ManualRec { |
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> |
|
where |
|
D: Deserializer<'de>, |
|
{ |
|
struct Phone(S); |
|
impl<'de> Deserialize<'de> for Phone { |
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> |
|
where |
|
D: Deserializer<'de>, |
|
{ |
|
struct VisitorImpl; |
|
impl<'de> Visitor<'de> for VisitorImpl { |
|
type Value = S; |
|
|
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |
|
f.write_str("Phone as a number or string") |
|
} |
|
|
|
fn visit_u64<E>(self, n: u64) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
Ok(S::from(n.to_string())) |
|
} |
|
|
|
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
Ok(S::from(s)) |
|
} |
|
} |
|
deserializer.deserialize_any(VisitorImpl).map(|s| Phone(s)) |
|
} |
|
} |
|
|
|
struct Phones(Vec<S>); |
|
impl<'de> Deserialize<'de> for Phones { |
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> |
|
where |
|
D: Deserializer<'de>, |
|
{ |
|
struct VisitorImpl; |
|
impl<'de> Visitor<'de> for VisitorImpl { |
|
type Value = Vec<S>; |
|
|
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |
|
f.write_str("Phones as one Phone or list of Phone") |
|
} |
|
|
|
fn visit_u64<E>(self, n: u64) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
Ok(vec![S::from(n.to_string())]) |
|
} |
|
|
|
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
Ok(vec![S::from(s)]) |
|
} |
|
|
|
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> |
|
where |
|
A: SeqAccess<'de>, |
|
{ |
|
let mut phones = Vec::new(); |
|
while let Some(p) = seq.next_element::<Phone>()? { |
|
phones.push(p.0) |
|
} |
|
Ok(phones) |
|
} |
|
} |
|
deserializer.deserialize_any(VisitorImpl).map(|p| Phones(p)) |
|
} |
|
} |
|
|
|
struct Company(S); |
|
impl<'de> Deserialize<'de> for Company { |
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> |
|
where |
|
D: Deserializer<'de>, |
|
{ |
|
struct CompanyVisitor; |
|
impl<'de> Visitor<'de> for CompanyVisitor { |
|
type Value = S; |
|
|
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |
|
f.write_str("Company") |
|
} |
|
|
|
fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error> |
|
where |
|
V: MapAccess<'de>, |
|
{ |
|
while let Some(k) = map.next_key::<&str>()? { |
|
if k == "name" { |
|
let value = map.next_value::<S>()?; |
|
return Ok(value); |
|
} else { |
|
map.next_value::<de::IgnoredAny>()?; |
|
} |
|
} |
|
Err(de::Error::missing_field("name")) |
|
} |
|
|
|
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
Ok(S::from(s)) |
|
} |
|
} |
|
deserializer |
|
.deserialize_any(CompanyVisitor) |
|
.map(|s| Company(s)) |
|
} |
|
} |
|
|
|
struct Debt(f64); |
|
impl<'de> Deserialize<'de> for Debt { |
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> |
|
where |
|
D: Deserializer<'de>, |
|
{ |
|
struct VisitorImpl; |
|
impl<'de> Visitor<'de> for VisitorImpl { |
|
type Value = Debt; |
|
|
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |
|
f.write_str("Debt") |
|
} |
|
|
|
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> |
|
where |
|
E: de::Error, |
|
{ |
|
s.parse::<f64>().map(Debt).map_err(de::Error::custom) |
|
} |
|
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E> { |
|
Ok(Debt(v as f64)) |
|
} |
|
fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E> { |
|
Ok(Debt(v)) |
|
} |
|
} |
|
deserializer.deserialize_any(VisitorImpl) |
|
} |
|
} |
|
|
|
struct RecordVisitor; |
|
impl<'de> Visitor<'de> for RecordVisitor { |
|
type Value = ManualRec; |
|
|
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |
|
f.write_str("Record") |
|
} |
|
|
|
fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error> |
|
where |
|
V: MapAccess<'de>, |
|
{ |
|
let mut rec = ManualRec::default(); |
|
while let Some(k) = map.next_key()? { |
|
// println!("k={}", k); |
|
match k { |
|
"company" => { |
|
let company = map.next_value::<Company>()?; |
|
rec.company = company.0; |
|
} |
|
"phones" => { |
|
let phones = map.next_value::<Phones>()?; |
|
rec.phones.extend(phones.0); |
|
} |
|
"phone" => { |
|
let phone = map.next_value::<Phone>()?; |
|
rec.phones.push(phone.0); |
|
} |
|
"debt" => { |
|
let debt = map.next_value::<Debt>()?; |
|
rec.debt = debt.0; |
|
} |
|
_ => { |
|
map.next_value::<de::IgnoredAny>()?; |
|
} |
|
} |
|
} |
|
// println!("{:?}", rec); |
|
Ok(rec) |
|
} |
|
} |
|
deserializer.deserialize_map(RecordVisitor) |
|
} |
|
} |
|
|
|
fn main() -> Result<(), Box<Error>> { |
|
let matches = App::new("json-parser") |
|
.arg( |
|
Arg::with_name("file") |
|
.short("f") |
|
.takes_value(true) |
|
.required(true), |
|
) |
|
.get_matches(); |
|
|
|
let file_name = matches.value_of("file").unwrap(); |
|
println!("Reading json from {}", file_name); |
|
|
|
let result = process_file(&file_name)?; |
|
for (di, d) in result.all.iter().enumerate() { |
|
println!("#{}: debt: {}", di, &d.debt); |
|
println!("companies: {:?}\nphones: {:?}", &d.companies, &d.phones); |
|
} |
|
Ok(()) |
|
} |
|
|
|
fn process_file(file_name: &str) -> Result<Debtors, Box<Error>> { |
|
let file = std::fs::File::open(file_name)?; |
|
let buf = unsafe { Mmap::map(&file)? }; |
|
|
|
let mut deb = Debtors::default(); |
|
|
|
let mut count = 0; |
|
let mut braces = 0; |
|
let mut start_idx = 0; |
|
|
|
let mut idx = 0; |
|
let needles = Pattern::new(&[b'{', b'}']); |
|
|
|
loop { |
|
idx += unsafe { needles.find(&buf.get_unchecked(idx..)) }; |
|
if idx >= buf.len() { |
|
break; |
|
} |
|
let b = buf[idx]; |
|
|
|
// for (idx, b) in buf.iter().enumerate() { |
|
|
|
match b { |
|
b'{' => { |
|
if braces == 0 { |
|
start_idx = idx; |
|
} |
|
braces += 1; |
|
} |
|
b'}' => { |
|
braces -= 1; |
|
if braces == 0 { |
|
let utf = std::str::from_utf8(&buf[start_idx..=idx])?; |
|
// let dom = serde_json::from_str::<Value>(utf)?; |
|
// let rec = serde_json::from_str::<DebtRec>(utf)?; |
|
let rec = serde_json::from_str::<ManualRec>(utf)?; |
|
// println!("{:?}", rec); |
|
// let rec = extract_data(dom)?; |
|
process_object(rec, &mut deb); |
|
count += 1; |
|
} |
|
} |
|
_ => {} |
|
}; |
|
|
|
idx += 1; |
|
} |
|
println!("Processed {}", count); |
|
|
|
Ok(deb) |
|
} |
|
|
|
fn process_object(rec: ManualRec, result: &mut Debtors) -> std::option::Option<()> { |
|
let phones = rec.phones; |
|
|
|
let di = phones |
|
.iter() |
|
.find_map(|p| result.by_phone.get(p).cloned()) |
|
.unwrap_or_else(|| { |
|
result.all.push(Debtor::default()); |
|
result.all.len() - 1 |
|
}); |
|
let d = &mut result.all[di]; |
|
|
|
use std::collections::hash_map::Entry; |
|
for phone in phones { |
|
match result.by_phone.entry(phone) { |
|
Entry::Vacant(e) => { |
|
d.phones.insert(e.key().to_owned()); |
|
e.insert(di); |
|
} |
|
Entry::Occupied(e) => { |
|
// Commenting this out add some ms. Doesn't affect wrong algorithm. |
|
//d.phones.insert(e.key().clone()); |
|
} |
|
} |
|
} |
|
d.companies.insert(rec.company); |
|
d.debt += rec.debt; |
|
|
|
Some(()) |
|
} |
|
|
|
fn merge_result(part: Debtors, result: &mut Debtors) { |
|
for dr in part.all.into_iter() { |
|
let di = match dr.phones.iter().find_map(|p| result.by_phone.get(p)) { |
|
Some(i) => *i, |
|
None => { |
|
result.all.push(Debtor::default()); |
|
result.all.len() - 1 |
|
} |
|
}; |
|
let d = &mut result.all[di]; |
|
for p in &dr.phones { |
|
result.by_phone.insert(p.to_owned(), di); |
|
} |
|
d.phones.extend(dr.phones); |
|
d.companies.extend(dr.companies); |
|
d.debt += dr.debt; |
|
} |
|
} |
|
|
|
fn extract_data(dom: Value) -> serde_json::Result<DebtRec> { |
|
let mut rec = DebtRec::default(); |
|
|
|
fn val2str(v: Value) -> S { |
|
match v { |
|
Value::String(s) => S::from(s), |
|
Value::Number(n) => S::from(n.to_string()), //n.to_string(), |
|
_ => S::default(), |
|
} |
|
} |
|
|
|
if let Value::Object(o) = dom { |
|
for (k, v) in o.into_iter() { |
|
if k == "company" { |
|
rec.company = match v { |
|
Value::Object(mut c) => val2str(c["name"].take()), |
|
Value::String(s) => S::from(s), |
|
_ => S::default(), |
|
}; |
|
} else if k == "phones" { |
|
match v { |
|
Value::Array(phones) => { |
|
rec.phones.extend(phones.into_iter().map(|p| val2str(p))) |
|
} |
|
phones => rec.phones.push(val2str(phones)), |
|
} |
|
} else if k == "phone" { |
|
match v { |
|
phone => rec.phones.push(val2str(phone)), |
|
} |
|
} else if k == "debt" { |
|
rec.debt = match v { |
|
Value::Number(d) => d.as_f64().unwrap_or(0.0), |
|
Value::String(d) => d.parse::<f64>().unwrap_or(0.0), |
|
_ => 0.0, |
|
}; |
|
} |
|
} |
|
} |
|
|
|
// println!("Record {:?}", dr); |
|
|
|
Ok(rec) |
|
} |