mseri · March 19, 2021 10:24
diff --git a/count_words.ml b/count_words.ml
 module Trie = struct
  type t = { mutable count : int; children : t option Array.t }

  let[@inline] empty () = { count = 0; children = Array.make 128 None }

  let add buf pos len t =
    let rec loop idx t =
      if idx > pos + len - 1 then t.count <- t.count + 1
      else
        let c = Char.lowercase_ascii (Bytes.get buf idx) in
        let t' =
          match t.children.(Char.code c) with
          | None ->
              let t'' = empty () in
              t.children.(Char.code c) <- Some t'';
              t''
          | Some c -> c
        in
        loop (idx + 1) t'
    in
    loop pos t

  let to_assoc t =
    let rec loop acc curr t =
      let acc =
        match (curr, t.count) with
        | [], _ -> acc
        | _, 0 -> acc
        | curr, c -> (List.rev curr, c) :: acc
      in
      let r = ref acc in
      Array.iteri
        (fun idx t' ->
          match t' with
          | None -> ()
          | Some t' -> r := loop !r (Char.chr idx :: curr) t')
        t.children;
      !r
    in
    loop [] [] t
 end

 let[@inline] is_whitespace c = c = ' ' || c = '\n'

 let rec skip_whitespace countwords pad ~lim i =
  if i >= lim then (read_more_and_white [@tailcall]) countwords pad i
  else if is_whitespace @@ Bytes.unsafe_get pad i then
    (skip_whitespace [@tailcall]) countwords pad ~lim (i + 1)
  else (read_next_word [@tailcall]) countwords pad ~lim i i

 and read_next_word countwords pad ~lim i j =
  if j >= lim then (read_more_and_next [@tailcall]) countwords pad i j
  else if is_whitespace @@ Bytes.unsafe_get pad j then (
    Trie.add pad i (j - i) countwords;
    (skip_whitespace [@tailcall]) countwords pad ~lim (j + 1))
  else (read_next_word [@tailcall]) countwords pad ~lim i (j + 1)

 and read_more_and_next countwords pad i j =
  let l = j - i in
  Bytes.blit pad i pad 0 l;
  let read = input stdin pad l (Bytes.length pad - l) in
  let lim = l + read in
  if lim = 0 then () else (read_next_word [@tailcall]) countwords pad ~lim 0 l

 and read_more_and_white countwords pad i =
  let lim = input stdin pad 0 (Bytes.length pad) in
  if lim = 0 then () else (skip_whitespace [@tailcall]) countwords pad ~lim 0

 let[@inline] read_start countwords pad =
  (read_more_and_next [@specialize]) countwords pad 0 0

 let () =
  let t = Trie.empty () in
  let pad = Bytes.make (64 * 1024) ' ' in
  read_start t pad;
  let assoc = Array.of_list (Trie.to_assoc t) in
  Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
  Array.iter
    (fun (word, count) ->
      List.iter print_char word;
      print_char ' ';
      print_int count;
      print_newline ())
    assoc
diff --git a/count_words_2.ml b/count_words_2.ml
 (* Simpler IO doesn't make the timing worse *)
 module Trie = struct
  type t = { mutable count : int; children : t option Array.t }

  let[@inline] empty () = { count = 0; children = Array.make 128 None }

  let add s pos len t =
    let rec loop idx t =
      if idx > pos + len - 1 then t.count <- t.count + 1
      else
        let c = Char.lowercase_ascii (String.get s idx) in
        let t' =
          match t.children.(Char.code c) with
          | None ->
              let t'' = empty () in
              t.children.(Char.code c) <- Some t'';
              t''
          | Some c -> c
        in
        loop (idx + 1) t'
    in
    loop pos t

  let to_assoc t =
    let rec loop acc curr t =
      let acc =
        match (curr, t.count) with
        | [], _ -> acc
        | _, 0 -> acc
        | curr, c -> (List.rev curr, c) :: acc
      in
      let r = ref acc in
      Array.iteri
        (fun idx t' ->
          match t' with
          | None -> ()
          | Some t' -> r := loop !r (Char.chr idx :: curr) t')
        t.children;
      !r
    in
    loop [] [] t
 end

 let read_input t =
  try
    let rec loop () =
      let line = input_line stdin in
      let start = ref 0 in
      let len = ref 0 in
      for i = 0 to String.length line - 1 do
        if line.[i] <> ' ' then incr len
        else (
          Trie.add line !start !len t;
          start := i + 1;
          len := 0)
      done;
      Trie.add line !start !len t;
      loop ()
    in
    loop ()
  with End_of_file -> ()

 let () =
  let t = Trie.empty () in
  read_input t;
  let assoc = Array.of_list (Trie.to_assoc t) in
  Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
    Array.iter
    (fun (word, count) ->
      List.iter print_char word;
      print_char ' ';
      print_int count;
      print_newline ())
    assoc
diff --git a/count_words_drup.ml b/count_words_drup.ml
 module BytesHash = struct
  include Bytes
  let hash (k : t) = Hashtbl.hash k
 end
 module BytesHashtbl = Hashtbl.Make (BytesHash)

 let lowercase_inplace pad i j =
  for k = i to j - 1 do
    Bytes.unsafe_set pad k @@ Char.lowercase_ascii @@ Bytes.unsafe_get pad k
  done

 let sub_and_add_word countwords source i len =
  let word = Bytes.sub source i len in
  match BytesHashtbl.find countwords word with
  | v ->
    BytesHashtbl.replace countwords word (v + 1)
  | exception Not_found ->
    BytesHashtbl.add countwords word 1

 let[@inline] is_whitespace c = c = ' ' || c = '\n'

 let rec skip_whitespace countwords pad ~lim i =
  if i >= lim then
    (read_more_and_white[@taillcall]) countwords pad i
  else if is_whitespace @@ Bytes.unsafe_get pad i then
    (skip_whitespace[@tailcall]) countwords pad ~lim (i + 1)
  else
    (read_next_word[@tailcall]) countwords pad ~lim i i
 and read_next_word countwords pad ~lim i j =
  if j >= lim then
    (read_more_and_next[@taillcall]) countwords pad i j
  else if is_whitespace @@ Bytes.unsafe_get pad j then begin
    sub_and_add_word countwords pad i (j-i);
    (skip_whitespace[@taillcall]) countwords pad ~lim (j+1)
  end else
    (read_next_word[@taillcall]) countwords pad ~lim i (j + 1)
 and read_more_and_next countwords pad i j =
  let l = j - i in
  Bytes.blit pad i pad 0 l;
  let read = input stdin pad l (Bytes.length pad - l) in
  let lim = l+read in
  lowercase_inplace pad l lim;
  if lim = 0 then ()
  else (read_next_word[@taillcall]) countwords pad ~lim 0 l
 and read_more_and_white countwords pad i =
  let lim = input stdin pad 0 (Bytes.length pad) in
  lowercase_inplace pad 0 lim;
  if lim = 0 then ()
  else (skip_whitespace[@taillcall]) countwords pad ~lim 0

 let[@inline] read_start countwords pad =
  (read_more_and_next[@specialize]) countwords pad 0 0

 let[@inline] print_slot (w, c) =
  print_bytes w;
  print_char ' ';
  print_int c;
  print_newline ()

 let () =
  let countwords = BytesHashtbl.create 33_000 in
  let size = 64*1024 in
  let pad = Bytes.make size ' ' in
  read_start countwords pad ;

  let a = Array.make (BytesHashtbl.length countwords) (Bytes.empty,0) in
  let f k v i = a.(i) <- (k,v); i+1 in
  let _ = BytesHashtbl.fold f countwords 0 in
  Array.sort (fun (_,i) (_,j) -> Int.compare i j) a;
  Array.iter print_slot a
diff --git a/mutable_countwords.ml b/mutable_countwords.ml
 module BytesHash = struct
  include Bytes

  let hash (k : bytes) = Hashtbl.hash k
 end

 module BytesHashtbl = Hashtbl.Make (BytesHash)

 module Buf = struct
  type t = { buf : Bytes.t; mutable pos : int; mutable len : int; size : int }

  let create size = { buf = Bytes.create size; pos = 0; len = 0; size }

  let[@inline] compress t =
    if t.len <= 0 || t.pos = t.len then (
      t.pos <- 0;
      t.len <- 0)
    else if t.pos > 0 then (
      BytesLabels.unsafe_blit ~src:t.buf ~src_pos:t.pos ~dst:t.buf ~dst_pos:0
        ~len:t.len;
      t.pos <- 0)

  let read_into t fd =
    compress t;
    let count =
      UnixLabels.read fd ~buf:t.buf ~pos:(t.pos + t.len) ~len:(t.size - t.len)
    in
    t.len <- t.len + count;
    for i = t.pos to t.len - 1 do
      Bytes.unsafe_set t.buf i (Char.lowercase_ascii (Bytes.unsafe_get t.buf i))
    done;
    count

  let iter_words t tbl =
    let prev = ref t.pos in
    for i = t.pos to t.len - 1 do
      (* We perform bounds check so its okay to call unsafe_get here *)
      let c = Bytes.unsafe_get t.buf i in
      if c = '\n' || c = ' ' then (
        let pos = !prev in
        let len = i - !prev in
        (if len > 0 then
         let word = Bytes.sub t.buf pos len in
         try incr (BytesHashtbl.find tbl word)
         with Not_found -> BytesHashtbl.add tbl word (ref 1));

        prev := i + 1)
    done;
    t.len <- t.len - !prev;
    t.pos <- !prev
 end

 let () =
  let countwords = BytesHashtbl.create 33_000 in
  let buf = Buf.create (64 * 1024) in
  while Buf.read_into buf Unix.stdin > 0 do
    Buf.iter_words buf countwords
  done;
  let arr = Array.make (BytesHashtbl.length countwords) (Bytes.empty, 0) in
  let idx = ref 0 in
  BytesHashtbl.iter
    (fun word count ->
      arr.(!idx) <- (word, !count);
      incr idx)
    countwords;
  Array.sort (fun (_, x) (_, y) -> Int.compare y x) arr;
  Array.iter
    (fun (w, c) ->
      print_bytes w;
      print_char ' ';
      print_int c;
      print_newline ())
    arr
	module Trie = struct
	type t = { mutable count : int; children : t option Array.t }

	let[@inline] empty () = { count = 0; children = Array.make 128 None }

	let add buf pos len t =
	let rec loop idx t =
	if idx > pos + len - 1 then t.count <- t.count + 1
	else
	let c = Char.lowercase_ascii (Bytes.get buf idx) in
	let t' =
	match t.children.(Char.code c) with
	\| None ->
	let t'' = empty () in
	t.children.(Char.code c) <- Some t'';
	t''
	\| Some c -> c
	in
	loop (idx + 1) t'
	in
	loop pos t

	let to_assoc t =
	let rec loop acc curr t =
	let acc =
	match (curr, t.count) with
	\| [], _ -> acc
	\| _, 0 -> acc
	\| curr, c -> (List.rev curr, c) :: acc
	in
	let r = ref acc in
	Array.iteri
	(fun idx t' ->
	match t' with
	\| None -> ()
	\| Some t' -> r := loop !r (Char.chr idx :: curr) t')
	t.children;
	!r
	in
	loop [] [] t
	end

	let[@inline] is_whitespace c = c = ' ' \|\| c = '\n'

	let rec skip_whitespace countwords pad ~lim i =
	if i >= lim then (read_more_and_white [@tailcall]) countwords pad i
	else if is_whitespace @@ Bytes.unsafe_get pad i then
	(skip_whitespace [@tailcall]) countwords pad ~lim (i + 1)
	else (read_next_word [@tailcall]) countwords pad ~lim i i

	and read_next_word countwords pad ~lim i j =
	if j >= lim then (read_more_and_next [@tailcall]) countwords pad i j
	else if is_whitespace @@ Bytes.unsafe_get pad j then (
	Trie.add pad i (j - i) countwords;
	(skip_whitespace [@tailcall]) countwords pad ~lim (j + 1))
	else (read_next_word [@tailcall]) countwords pad ~lim i (j + 1)

	and read_more_and_next countwords pad i j =
	let l = j - i in
	Bytes.blit pad i pad 0 l;
	let read = input stdin pad l (Bytes.length pad - l) in
	let lim = l + read in
	if lim = 0 then () else (read_next_word [@tailcall]) countwords pad ~lim 0 l

	and read_more_and_white countwords pad i =
	let lim = input stdin pad 0 (Bytes.length pad) in
	if lim = 0 then () else (skip_whitespace [@tailcall]) countwords pad ~lim 0

	let[@inline] read_start countwords pad =
	(read_more_and_next [@specialize]) countwords pad 0 0

	let () =
	let t = Trie.empty () in
	let pad = Bytes.make (64 * 1024) ' ' in
	read_start t pad;
	let assoc = Array.of_list (Trie.to_assoc t) in
	Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
	Array.iter
	(fun (word, count) ->
	List.iter print_char word;
	print_char ' ';
	print_int count;
	print_newline ())
	assoc
	(* Simpler IO doesn't make the timing worse *)
	module Trie = struct
	type t = { mutable count : int; children : t option Array.t }

	let[@inline] empty () = { count = 0; children = Array.make 128 None }

	let add s pos len t =
	let rec loop idx t =
	if idx > pos + len - 1 then t.count <- t.count + 1
	else
	let c = Char.lowercase_ascii (String.get s idx) in
	let t' =
	match t.children.(Char.code c) with
	\| None ->
	let t'' = empty () in
	t.children.(Char.code c) <- Some t'';
	t''
	\| Some c -> c
	in
	loop (idx + 1) t'
	in
	loop pos t

	let to_assoc t =
	let rec loop acc curr t =
	let acc =
	match (curr, t.count) with
	\| [], _ -> acc
	\| _, 0 -> acc
	\| curr, c -> (List.rev curr, c) :: acc
	in
	let r = ref acc in
	Array.iteri
	(fun idx t' ->
	match t' with
	\| None -> ()
	\| Some t' -> r := loop !r (Char.chr idx :: curr) t')
	t.children;
	!r
	in
	loop [] [] t
	end

	let read_input t =
	try
	let rec loop () =
	let line = input_line stdin in
	let start = ref 0 in
	let len = ref 0 in
	for i = 0 to String.length line - 1 do
	if line.[i] <> ' ' then incr len
	else (
	Trie.add line !start !len t;
	start := i + 1;
	len := 0)
	done;
	Trie.add line !start !len t;
	loop ()
	in
	loop ()
	with End_of_file -> ()

	let () =
	let t = Trie.empty () in
	read_input t;
	let assoc = Array.of_list (Trie.to_assoc t) in
	Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
	Array.iter
	(fun (word, count) ->
	List.iter print_char word;
	print_char ' ';
	print_int count;
	print_newline ())
	assoc
	module BytesHash = struct
	include Bytes
	let hash (k : t) = Hashtbl.hash k
	end
	module BytesHashtbl = Hashtbl.Make (BytesHash)

	let lowercase_inplace pad i j =
	for k = i to j - 1 do
	Bytes.unsafe_set pad k @@ Char.lowercase_ascii @@ Bytes.unsafe_get pad k
	done

	let sub_and_add_word countwords source i len =
	let word = Bytes.sub source i len in
	match BytesHashtbl.find countwords word with
	\| v ->
	BytesHashtbl.replace countwords word (v + 1)
	\| exception Not_found ->
	BytesHashtbl.add countwords word 1

	let[@inline] is_whitespace c = c = ' ' \|\| c = '\n'

	let rec skip_whitespace countwords pad ~lim i =
	if i >= lim then
	(read_more_and_white[@taillcall]) countwords pad i
	else if is_whitespace @@ Bytes.unsafe_get pad i then
	(skip_whitespace[@tailcall]) countwords pad ~lim (i + 1)
	else
	(read_next_word[@tailcall]) countwords pad ~lim i i
	and read_next_word countwords pad ~lim i j =
	if j >= lim then
	(read_more_and_next[@taillcall]) countwords pad i j
	else if is_whitespace @@ Bytes.unsafe_get pad j then begin
	sub_and_add_word countwords pad i (j-i);
	(skip_whitespace[@taillcall]) countwords pad ~lim (j+1)
	end else
	(read_next_word[@taillcall]) countwords pad ~lim i (j + 1)
	and read_more_and_next countwords pad i j =
	let l = j - i in
	Bytes.blit pad i pad 0 l;
	let read = input stdin pad l (Bytes.length pad - l) in
	let lim = l+read in
	lowercase_inplace pad l lim;
	if lim = 0 then ()
	else (read_next_word[@taillcall]) countwords pad ~lim 0 l
	and read_more_and_white countwords pad i =
	let lim = input stdin pad 0 (Bytes.length pad) in
	lowercase_inplace pad 0 lim;
	if lim = 0 then ()
	else (skip_whitespace[@taillcall]) countwords pad ~lim 0

	let[@inline] read_start countwords pad =
	(read_more_and_next[@specialize]) countwords pad 0 0

	let[@inline] print_slot (w, c) =
	print_bytes w;
	print_char ' ';
	print_int c;
	print_newline ()

	let () =
	let countwords = BytesHashtbl.create 33_000 in
	let size = 64*1024 in
	let pad = Bytes.make size ' ' in
	read_start countwords pad ;

	let a = Array.make (BytesHashtbl.length countwords) (Bytes.empty,0) in
	let f k v i = a.(i) <- (k,v); i+1 in
	let _ = BytesHashtbl.fold f countwords 0 in
	Array.sort (fun (_,i) (_,j) -> Int.compare i j) a;
	Array.iter print_slot a
	module BytesHash = struct
	include Bytes

	let hash (k : bytes) = Hashtbl.hash k
	end

	module BytesHashtbl = Hashtbl.Make (BytesHash)

	module Buf = struct
	type t = { buf : Bytes.t; mutable pos : int; mutable len : int; size : int }

	let create size = { buf = Bytes.create size; pos = 0; len = 0; size }

	let[@inline] compress t =
	if t.len <= 0 \|\| t.pos = t.len then (
	t.pos <- 0;
	t.len <- 0)
	else if t.pos > 0 then (
	BytesLabels.unsafe_blit ~src:t.buf ~src_pos:t.pos ~dst:t.buf ~dst_pos:0
	~len:t.len;
	t.pos <- 0)

	let read_into t fd =
	compress t;
	let count =
	UnixLabels.read fd ~buf:t.buf ~pos:(t.pos + t.len) ~len:(t.size - t.len)
	in
	t.len <- t.len + count;
	for i = t.pos to t.len - 1 do
	Bytes.unsafe_set t.buf i (Char.lowercase_ascii (Bytes.unsafe_get t.buf i))
	done;
	count

	let iter_words t tbl =
	let prev = ref t.pos in
	for i = t.pos to t.len - 1 do
	(* We perform bounds check so its okay to call unsafe_get here *)
	let c = Bytes.unsafe_get t.buf i in
	if c = '\n' \|\| c = ' ' then (
	let pos = !prev in
	let len = i - !prev in
	(if len > 0 then
	let word = Bytes.sub t.buf pos len in
	try incr (BytesHashtbl.find tbl word)
	with Not_found -> BytesHashtbl.add tbl word (ref 1));

	prev := i + 1)
	done;
	t.len <- t.len - !prev;
	t.pos <- !prev
	end

	let () =
	let countwords = BytesHashtbl.create 33_000 in
	let buf = Buf.create (64 * 1024) in
	while Buf.read_into buf Unix.stdin > 0 do
	Buf.iter_words buf countwords
	done;
	let arr = Array.make (BytesHashtbl.length countwords) (Bytes.empty, 0) in
	let idx = ref 0 in
	BytesHashtbl.iter
	(fun word count ->
	arr.(!idx) <- (word, !count);
	incr idx)
	countwords;
	Array.sort (fun (_, x) (_, y) -> Int.compare y x) arr;
	Array.iter
	(fun (w, c) ->
	print_bytes w;
	print_char ' ';
	print_int c;
	print_newline ())
	arr