Last active
May 20, 2022 03:03
-
-
Save svoisen/c6e9e495392074d20a4187226de10521 to your computer and use it in GitHub Desktop.
Print histogram of file line lengths using Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies required in Cargo.toml | |
// clap = { version = "3.1.7", features = ["derive"] } | |
// deque = "0.3.2" | |
// ignore = "0.1.6" | |
// num_cpus = "1.13.1" | |
use clap::Parser; | |
use deque::{Stealer, Stolen}; | |
use ignore::WalkBuilder; | |
use num_cpus; | |
use std::cmp::max; | |
use std::collections::HashMap; | |
use std::path::PathBuf; | |
use std::time::Instant; | |
#[derive(Parser)]
struct Args {
    // Root path to walk; every matching file under it is measured.
    #[clap(parse(from_os_str))]
    path: PathBuf
}
// Map of line length (columns) to number of lines with that length.
// Keys are the even-rounded lengths produced by `measure_line_lengths`
// (odd lengths are rounded up); values are line counts.
type LineLenMap = HashMap<usize, usize>;
// Represents a single job in the work queue.
enum Job {
    // Measure the line lengths of the file at this path.
    File(String),
    // Sentinel telling a worker thread to stop; main pushes one per worker.
    Quit
}
// Runs a job on a single thread. This threading model is basically
// stolen from how ripgrep performs threading.
struct Worker {
    // Stealing end of the shared work deque; each worker pulls `Job`s
    // from here until it receives `Job::Quit`.
    channel: Stealer<Job>
}
impl Worker { | |
fn run(self) -> Vec<LineLenMap> { | |
let mut worker_results: Vec<LineLenMap> = vec![]; | |
loop { | |
match self.channel.steal() { | |
Stolen::Empty | Stolen::Abort => continue, | |
Stolen::Data(Job::Quit) => break, | |
Stolen::Data(Job::File(file_path)) => { | |
worker_results.push(measure_line_lengths(&file_path)); | |
} | |
} | |
} | |
worker_results | |
} | |
} | |
fn main() { | |
let start = Instant::now(); | |
let args = Args::parse(); | |
// Add other file extensions here as needed. | |
let include_extensions = vec!["js", "ts", "c", "cc", "cpp", "h", "rs"]; | |
let walker = WalkBuilder::new(&args.path) | |
.ignore(true) | |
.git_ignore(true) | |
.hidden(true) | |
.build(); | |
let files = walker | |
.filter_map(Result::ok) | |
.filter(|entry| entry.file_type().expect("No file type").is_file()) | |
.filter_map(|entry| { | |
let path = entry.path(); | |
let ext = path.extension().and_then(std::ffi::OsStr::to_str); | |
match ext { | |
None => None, | |
Some(ext) => match include_extensions.contains(&ext) { | |
false => None, | |
true => Some(String::from(path.to_str().unwrap())) | |
} | |
} | |
}); | |
let num_threads = num_cpus::get(); | |
let mut workers = vec![]; | |
let (work_queue, stealer) = deque::new(); | |
for _ in 0..num_threads { | |
let worker = Worker { channel: stealer.clone() }; | |
workers.push(std::thread::spawn(|| { worker.run() })); | |
} | |
// Add all of the files to the work queue. | |
let mut num_files = 0; | |
for file_path in files { | |
num_files += 1; | |
work_queue.push(Job::File(file_path)); | |
} | |
// Ensure the threads terminate when they are done. | |
for _ in 0..num_threads { | |
work_queue.push(Job::Quit); | |
} | |
let mut all_results = LineLenMap::new(); | |
for worker in workers { | |
let worker_results = worker.join().unwrap(); | |
for results in worker_results { | |
merge_line_lengths(&mut all_results, &results); | |
} | |
} | |
print_histogram(&all_results); | |
println!("Average line length: {:.2}", compute_average(&all_results)); | |
println!("Total files measured: {}", num_files); | |
let end = Instant::now(); | |
println!("Total time: {}ms", end.duration_since(start).as_millis()); | |
} | |
/// Reads the file at `path` and returns a map from even-rounded line
/// length to the number of lines with that length. Unreadable files
/// (missing, permission denied, non-UTF-8 content) yield an empty map.
///
/// Takes `&str` rather than `&String`; `&String` call sites still work
/// via deref coercion.
fn measure_line_lengths(path: &str) -> LineLenMap {
    let content = match std::fs::read_to_string(path) {
        Ok(content) => content,
        // Best-effort: skip files we can't read instead of aborting the run.
        Err(_) => return LineLenMap::new(),
    };
    let mut line_lengths = LineLenMap::new();
    for line in content.lines() {
        // NOTE(review): len() counts bytes, not display columns — multi-byte
        // UTF-8 characters inflate the measured width. Confirm acceptable.
        // Round up to nearest even number.
        let len = line.len() + line.len() % 2;
        // Limit to lengths of <= 100. Remove this restriction
        // to cover all lines.
        if len > 0 && len <= 100 {
            // entry() does one hash lookup instead of get_mut + insert.
            *line_lengths.entry(len).or_insert(0) += 1;
        }
    }
    line_lengths
}
// Rust doesn't (yet) have extend_with, so this is a use-specific version of
// extending the first HashMap with the values of the second by summing
// them together.
fn merge_line_lengths(first: &mut LineLenMap, second: &LineLenMap) {
    for (&line_length, &num_lines) in second {
        // entry() does a single hash lookup instead of get_mut + insert.
        *first.entry(line_length).or_insert(0) += num_lines;
    }
}
/// Returns the count-weighted average line length, or 0.0 when the map
/// is empty (the unguarded 0/0 division would yield NaN).
fn compute_average(line_lengths: &LineLenMap) -> f64 {
    let mut total_length: usize = 0;
    let mut total_lines: usize = 0;
    for (length, num_lines) in line_lengths {
        total_length += length * num_lines;
        total_lines += num_lines;
    }
    if total_lines == 0 {
        return 0.0;
    }
    total_length as f64 / total_lines as f64
}
/// Prints a histogram row per line length, sorted by length ascending,
/// with bars scaled so the longest fits in roughly 80 columns. Prints
/// nothing for an empty map.
fn print_histogram(line_lengths: &LineLenMap) {
    // Index 0 is the full block; indices 1..=7 are eighth-block glyphs of
    // increasing width used for a bar's fractional remainder.
    let bars = vec!["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"];
    let mut line_lengths_vec: Vec<_> = line_lengths.iter().collect();
    // Guard: indexing [0] below would panic on an empty map.
    if line_lengths_vec.is_empty() {
        return;
    }
    // Sort by line length in ascending order. (For largest-count-first
    // output use: sort_by(|a, b| a.1.cmp(b.1).reverse()).)
    line_lengths_vec.sort_by(|a, b| a.0.cmp(b.0));
    // Normalize the maximum bar length of the histogram to no more than
    // 80 columns. The maximum count must be searched for explicitly:
    // after the ascending-by-length sort, element [0] holds the count of
    // the shortest length, not the largest count.
    let max_count = *line_lengths_vec.iter().map(|t| t.1).max().unwrap();
    let divisor = max(8, max_count / 80);
    println!(" COLS COUNT");
    for (cols, count) in line_lengths_vec {
        let full_bar_count = count / divisor;
        let partial_bar_idx = (((count % divisor) as f64 / divisor as f64) * 8.0) as usize;
        print!("{cols:>width$} {count:>width$} ", cols=cols, count=count, width=5);
        for _ in 0..full_bar_count {
            print!("{}", bars[0]);
        }
        if partial_bar_idx > 0 {
            print!("{}", bars[partial_bar_idx]);
        }
        println!("");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment