Created
January 9, 2020 15:21
-
-
Save thegedge/f92abf2ab435402e5263cc34ca8f6f6b to your computer and use it in GitHub Desktop.
Standalone script that reads git history and renders a graph showing files that change together
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby --disable-gems | |
# | |
# # Prerequisites | |
# | |
# - ghostscript | |
# - graphviz | |
# | |
# # Example usage | |
# | |
# graph_mutual_changes --clustering=package --since="6 months ago" && open graph.pdf | |
# | |
require "json" | |
require "open3" | |
require "optparse" | |
require "pathname" | |
require "set" | |
require "time" | |
require "tmpdir" | |
module Extractors
  # A named group of files that changed together (e.g. all files in one PR),
  # with a weight reflecting recency (1.0 = newest master commit, 0.0 = oldest).
  FileGrouping = Struct.new("FileGrouping", :key, :files, :weight)

  # Extracts file groupings from git history by walking the first-parent
  # ("master") line of origin/master and collecting, for each merge, the files
  # touched anywhere on the merged branch.
  class Git
    # git_dir: path to the repository to read.
    # since:   a git-understood date expression (e.g. "3 months ago") limiting history.
    def initialize(git_dir:, since:)
      @git_dir = git_dir
      @since = since
    end

    # Returns Array[FileGrouping], one per commit on the master line.
    def extract
      pull_requests_files
    end

    private

    # Parsed representation of a single `git log` entry.
    CommitData = Struct.new(
      "CommitData",
      :author_date, :sha, :subject, :first_parent_sha, :second_parent_sha, :changed_files
    )
    private_constant :CommitData

    # Sentinel injected via --format so log output can be split per commit.
    COMMIT_DATA_SEPARATOR = "--->"
    private_constant :COMMIT_DATA_SEPARATOR

    # Separates the header fields from the commit subject on the first line.
    SUBJECT_SEPARATOR = ">>>"
    private_constant :SUBJECT_SEPARATOR

    # Matches GitHub-style merge commit subjects to recover the PR number.
    MERGE_COMMIT_PR_REGEX = /Merge pull request #(\d+) from/
    private_constant :MERGE_COMMIT_PR_REGEX

    # Builds one FileGrouping per master commit. The weight is a linear
    # recency score: the newest master commit gets ~1, the oldest 0.
    def pull_requests_files
      now = Time.now
      oldest = now - master_commits.entries.last.last.author_date
      youngest = now - master_commits.entries.first.last.author_date
      range = (oldest - youngest).to_f
      # Guard against a single master commit (oldest == youngest), which would
      # otherwise divide by zero and produce NaN weights.
      range = 1.0 if range.zero?

      master_commits.map do |_sha, master_commit|
        weight = 1 - ((now - master_commit.author_date) / range)
        FileGrouping.new(pr_number(master_commit), branch_files(master_commit), weight)
      end
    end

    private

    attr_reader :since

    # PR number (as a String) from a merge-commit subject, or nil for commits
    # that aren't GitHub PR merges.
    def pr_number(commit)
      match = commit.subject.match(MERGE_COMMIT_PR_REGEX)
      return nil unless match
      match[1]
    end

    # All files changed anywhere on the branch merged by merge_commit,
    # deduplicated. Traverses backwards through the branch (via first-parent
    # links starting at the merge's second parent) until we return to master.
    def branch_files(merge_commit)
      files = merge_commit.changed_files.dup
      commit = commits[merge_commit.second_parent_sha]
      while commit && !master_commits.key?(commit.sha)
        files += commit.changed_files
        commit = commits[commit.first_parent_sha]
      end
      files.uniq
    end

    # Hash of sha => CommitData for commits reachable via first-parent links
    # from the newest commit (similar to --first-parent in git commands).
    def master_commits
      @master_commits ||= begin
        result = {}
        _, commit = commits.first
        while commit
          result[commit.sha] = commit
          # First parent of a merge commit is on the same branch
          commit = commits[commit.first_parent_sha]
        end
        result
      end
    end

    # Parses `git log --name-status` output into a sha => CommitData hash in
    # git's order (chronologically descending). Files seen as deleted are
    # dropped from all earlier commits too; since we scan newest-to-oldest,
    # a single pass captures every deletion before its earlier changes.
    def commits
      @commits ||= begin
        format_arg = "--format=format:#{COMMIT_DATA_SEPARATOR}%aI %H %P #{SUBJECT_SEPARATOR} %s"
        since_arg = "--since=\"#{since}\""
        branch_arg = "origin/master"
        commit_data = git("log", "--name-status", format_arg, since_arg, branch_arg).split(COMMIT_DATA_SEPARATOR)

        deleted_files = Set.new
        # First entry will always be empty, because there's a separator at the
        # beginning of the output.
        commit_data.drop(1).each_with_object({}) do |data, commits|
          lines = data.lines.map(&:chomp)
          header, subject = lines.first.split(SUBJECT_SEPARATOR)
          changed_files = Array(lines[1..-1]).map do |line|
            next nil if line.empty?

            # --name-status lines look like "<status>\t<file>" or, for
            # renames, "R<score>\t<old>\t<new>".
            status, file1, file2 = line.split
            file = case status
            when 'D'
              deleted_files << file1
              nil
            when /R\d+/
              deleted_files << file1 # a rename deletes the old path
              file2
            else
              file1
            end

            deleted_files.include?(file) ? nil : file
          end

          author_date, sha, first_parent_sha, second_parent_sha = header.split
          commits[sha] = CommitData.new(
            Time.iso8601(author_date),
            sha,
            subject,
            first_parent_sha,
            second_parent_sha,
            changed_files.compact
          )
        end
      end
    end

    # Runs a git command in @git_dir and returns its stdout. On failure,
    # prints the error and exits with git's exit code.
    def git(*args)
      stdout, stderr, status = Open3.capture3("git", "-C", @git_dir, *args)
      return stdout if status.success?

      STDERR.puts("Failed to run git command: #{args.join(" ")}")
      STDERR.puts(stderr)
      # Process::Status#to_i is the raw wait status (e.g. 256 for exit 1),
      # not the exit code; #exitstatus is the real code (nil if signaled).
      exit(status.exitstatus || 1)
    end
  end
end
class Graph
  # A weighted, undirected graph. `edges` is a frozen Hash whose keys are
  # two-element [node, node] arrays and whose values are numeric weights.
  attr_reader :edges

  def initialize(edges)
    @edges = edges.freeze
  end

  # Unique nodes appearing in any edge, memoized after the first call.
  def nodes
    @nodes ||= edges.keys.flatten(1).uniq
  end
end
module Transformers | |
# NOTE: class name spelling ("Existant") kept for compatibility with callers.
class ExcludeNonExistantFiles
  # Drops files that no longer exist in the working tree from each grouping,
  # so the rendered graph only contains files that can still be opened.
  #
  # include_globs: globs used to enumerate the current working tree
  # (defaults to the script-level INCLUDE_GLOBS, preserving old behavior).
  def initialize(include_globs: INCLUDE_GLOBS)
    @include_globs = include_globs
  end

  # Mutates each FileGrouping's files in place; returns data for chaining.
  def filter!(data)
    current_files = Set.new(@include_globs.flat_map { |glob| Dir[glob] })
    data.each do |file_grouping|
      file_grouping.files.select! { |file| current_files.include?(file) }
    end
  end
end
class FileGlobFilter
  # Keeps only files that match at least one include glob and no exclude glob.
  def initialize(include_globs: INCLUDE_GLOBS, exclude_globs: EXCLUDE_GLOBS)
    @include_globs = include_globs
    @exclude_globs = exclude_globs
  end

  # Mutates each grouping's file list in place; returns data for chaining.
  def filter!(data)
    match_opts = File::FNM_PATHNAME | File::FNM_EXTGLOB
    matches = ->(globs, file) { globs.any? { |glob| File.fnmatch?(glob, file, match_opts) } }

    data.each do |file_grouping|
      file_grouping.files.select! do |file|
        matches.call(@include_globs, file) && !matches.call(@exclude_globs, file)
      end
    end
  end
end
class FileChangesToGraph
  # Converts file groupings into a weighted co-change graph: each pair of
  # files changed in the same grouping gets an edge whose weight accumulates
  # the grouping weights. Groupings with more than max_changed files are
  # skipped (huge changes would connect everything to everything).
  def initialize(max_changed)
    @max_changed = max_changed
  end

  def transform(pull_request_files)
    edges = {}
    pull_request_files.each do |file_grouping|
      next if file_grouping.files.length > @max_changed

      file_grouping.files.combination(2).each do |pair|
        key = pair.sort
        edges[key] = (edges[key] || 0) + file_grouping.weight
      end
    end
    Graph.new(edges)
  end
end
class FindSubgraphs
  # Splits a graph's edge set into connected components. Returns an array of
  # Graphs sorted largest-first (by edge count).
  def transform(graph)
    node_to_subgraph = {} # node => the edge-hash of the component it belongs to
    subgraphs = []

    graph.edges.each do |edge, weight|
      a, b = edge
      a_subgraph = node_to_subgraph[a]
      b_subgraph = node_to_subgraph[b]

      if a_subgraph && b_subgraph
        # BUG FIX: when both endpoints already belong to two *different*
        # components, the components must be merged. Previously the edge was
        # added to a's component while b's component was left behind as a
        # separate graph, so one connected component came out as several.
        unless a_subgraph.equal?(b_subgraph)
          # Merge the smaller component into the larger one.
          a_subgraph, b_subgraph = b_subgraph, a_subgraph if a_subgraph.length < b_subgraph.length
          a_subgraph.merge!(b_subgraph)
          node_to_subgraph.each_key do |node|
            node_to_subgraph[node] = a_subgraph if node_to_subgraph[node].equal?(b_subgraph)
          end
          subgraphs.delete_if { |subgraph| subgraph.equal?(b_subgraph) }
        end
        a_subgraph[edge] = weight
      elsif a_subgraph
        a_subgraph[edge] = weight
        node_to_subgraph[b] = a_subgraph
      elsif b_subgraph
        b_subgraph[edge] = weight
        node_to_subgraph[a] = b_subgraph
      else
        subgraph = { edge => weight }
        node_to_subgraph[a] = subgraph
        node_to_subgraph[b] = subgraph
        subgraphs << subgraph
      end
    end

    subgraphs.sort_by! { |subgraph| -subgraph.length }
    subgraphs.map! { |edges| Graph.new(edges) }
  end
end
class RemoveSmallSubgraphs
  # Drops graphs with fewer than `threshold` nodes from a list of graphs.
  def initialize(threshold)
    @threshold = threshold
  end

  # Returns a new array containing only the sufficiently large graphs.
  def transform(graphs)
    graphs.reject { |graph| graph.nodes.length < @threshold }
  end
end
class NormalizeGraphWeights
  # Rescales each graph's edge weights into [0, 1] via min-max normalization.
  # A graph whose edges all share one weight gets 0.5 everywhere (avoids
  # dividing by a zero range).
  def transform(graphs)
    graphs.map do |graph|
      weights = graph.edges.values
      lo = weights.min
      hi = weights.max
      scale = lo == hi ? nil : 1 / (hi - lo)

      normalized = graph.edges.each_with_object({}) do |(edge, weight), acc|
        acc[edge] = scale ? scale * (weight - lo) : 0.5
      end
      Graph.new(normalized)
    end
  end
end
end | |
module Clustering | |
class None
  # No clustering: every node goes into a single anonymous (nil-keyed) cluster.
  def cluster(graph)
    clusters = {}
    clusters[nil] = graph.nodes
    clusters
  end
end
class ByDirectory
  # Clusters nodes by the first max_depth components of their directory path.
  def initialize(max_depth)
    @max_depth = max_depth
  end

  def cluster(graph)
    graph.nodes.group_by { |node| truncated_dirname(node) }
  end

  private

  # e.g. "a/b/c/d.rb" with max_depth 2 => "a/b"; a file with no directory
  # ("f.rb") yields ".".
  def truncated_dirname(node)
    components = File.dirname(node).split("/", @max_depth + 1)
    components.take(@max_depth).join("/")
  end
end
class ByPackage
  UNKNOWN_PACKAGE_NAME = "other"

  # Clusters nodes by the package they live in, where a package is any
  # directory containing a package.yml. Nodes outside every package fall
  # into UNKNOWN_PACKAGE_NAME, which can optionally be dropped.
  def initialize(exclude_unknown: false)
    # BUG FIX: sort longest-first so that a nested package wins over its
    # parent when both are prefixes of a node's path (previously the
    # shortest — least specific — match always won).
    @package_paths = Dir["**/package.yml"]
      .map { |path| File.dirname(path) }
      .sort_by { |path| -path.length }
    @exclude_unknown = exclude_unknown
  end

  def cluster(graph)
    graph.nodes.group_by { |node| cluster_for(node) }.tap do |clusters|
      clusters.delete(UNKNOWN_PACKAGE_NAME) if @exclude_unknown
    end
  end

  private

  # The most specific package containing node, or UNKNOWN_PACKAGE_NAME.
  # BUG FIX: match whole path components ("app/foo/") so a package is never
  # matched by a sibling directory with a longer name (previously
  # "app/foobar/x.rb" start_with?-matched the "app/foo" package).
  def cluster_for(node)
    package = @package_paths.find { |path| node.start_with?("#{path}/") || node == path }
    package || UNKNOWN_PACKAGE_NAME
  end
end
end | |
module Loaders | |
class Graphviz
  # Renders a Graph as graphviz source on an IO, coloring nodes by cluster
  # and drawing inter-cluster edges with a two-color gradient.
  #
  # graph:      a Graph whose edge keys are [node, node] pairs.
  # clustering: an object responding to #cluster(graph) => { name => [nodes] }.
  def initialize(graph, clustering: Clustering::None.new)
    @graph = graph
    @node_aliases = build_node_aliases(graph)
    @clustering = clustering

    # HSV pastel color palette: low saturation, high value. Hues step by 11
    # (coprime with 19) so consecutive clusters get visually distant colors.
    # NOTE(review): rand makes colors differ between runs — presumably
    # intentional variety; confirm if stable output is ever needed.
    hsvs = (1..19).map do |x|
      [
        (x*11 % 19) / 19.0, # make consecutive hues be reasonably distant from each other
        0.05 + (rand * 0.1),
        0.80 + (rand * 0.15),
      ]
    end
    # .cycle yields an endless enumerator, so more than 19 clusters reuse colors.
    @cluster_color_list = hsvs
      .map { |h, s, v| hsv2rgb(h, s ,v) }
      .cycle
  end

  # Writes the complete graphviz document to io. Nodes are emitted grouped by
  # cluster; edges within one cluster are drawn in the cluster's color, and
  # edges spanning two clusters are drawn afterwards with a gradient.
  def render_to(io = STDOUT)
    io.puts("graph {")
    io.puts("  overlap = false;")
    io.puts("  outputorder = edgesfirst;")
    io.puts("  model = mds;")
    io.puts("  node [shape=rectangle style=\"filled, rounded\"];")
    io.puts("")

    # Working copy of all edges; intra-cluster edges are removed as emitted,
    # leaving only cross-cluster edges for the final pass below.
    edges = graph.edges.dup
    cluster_to_color = {}
    node_to_cluster = {}
    clusters = clustering.cluster(graph)
    clusters.each do |cluster_name, nodes|
      cluster_color = @cluster_color_list.next
      cluster_to_color[cluster_name] = cluster_color

      # TODO hack to not draw subgraph if just a single cluster
      if clusters.length > 1
        # Cluster names may contain characters invalid in graphviz identifiers.
        io.puts("  subgraph cluster_#{cluster_name.gsub(/[^A-Za-z0-9_]/, '_')} {")
        io.puts("    margin = 50;")
        io.puts("    style = invis;")
        # These currently aren't rendered because style = invis above.
        # TODO perhaps make the subgraph style an option?
        io.puts("    color = \"#{cluster_color}55\";")
        io.puts("    pencolor = \"#00000055\";")
        io.puts("    penwidth = 5;")
        io.puts("    label = \"#{cluster_name}\";")
        io.puts("    labeljust = l;")
        io.puts("    fontsize = 64;")
        io.puts("    fontname = \"times bold\";")
        io.puts("")
      end

      # All nodes declared after this default get the cluster's fill color.
      io.puts("  node [fillcolor=\"#{cluster_color}\"];")
      nodes.each do |node|
        node_to_cluster[node] = cluster_name
        node_alias = node_aliases[node]
        io.puts("  #{node_alias} [#{node_attributes(node)}];")
      end
      io.puts("")

      # Emit (and drop from the working copy) every edge whose endpoints are
      # both inside this cluster.
      edges = edges.delete_if do |(a, b), weight|
        next false if !nodes.include?(a) || !nodes.include?(b)
        a_name = node_aliases[a]
        b_name = node_aliases[b]
        io.puts("  #{a_name} -- #{b_name} [#{edge_attributes(a, b, weight, cluster_color)}];")
        true
      end

      io.puts("  }") if clusters.length > 1
      io.puts("")
    end

    # Remaining edges span two clusters; draw them with a half-and-half
    # gradient of the two cluster colors. Endpoints whose cluster was dropped
    # (e.g. via --exclude-unknown) are skipped.
    edges = edges.each do |(a, b), weight|
      a_cluster = node_to_cluster[a]
      b_cluster = node_to_cluster[b]
      next unless a_cluster && b_cluster
      a_name = node_aliases[a]
      b_name = node_aliases[b]
      a_color = cluster_to_color[a_cluster]
      b_color = cluster_to_color[b_cluster]
      color = "#{b_color};0.5:#{a_color}"
      io.puts("  #{a_name} -- #{b_name} [#{edge_attributes(a, b, weight, color)}];")
    end

    io.puts("}")
  end

  private

  attr_reader :graph, :node_aliases, :clustering

  # Maps each node to a short alias ("A", "B", ..., "Z", "AA", ...) used as
  # its graphviz identifier, since file paths aren't valid identifiers.
  def build_node_aliases(graph)
    node_name = "A"
    graph.nodes.each_with_object({}) do |node, map|
      map[node] = node_name
      node_name = node_name.next
    end
  end

  # Node attribute string; the label shows the real file path.
  def node_attributes(node)
    %(label="#{node}")
  end

  # weight is expected normalized to [0, 1] (see NormalizeGraphWeights):
  # higher weight => shorter edge (len) and a stronger layout pull (weight),
  # both mapped through exponentials to exaggerate strong relationships.
  def edge_attributes(a, b, weight, color)
    length = Math.exp(1 - weight) - 0.9
    weight = Math.exp(3*weight) - 1
    %(weight=#{weight.round(2)} len=#{length.round(2)} color="#{color}")
  end

  # Standard HSV -> RGB conversion; h, s, v in [0, 1]; returns "#rrggbb".
  def hsv2rgb(h, s, v)
    h = (h * 360) % 360
    c = v*s
    x = c*(1 - ((h / 60) % 2 - 1).abs)
    m = v - c
    rgb = case h
    when 0...60
      [c, x, 0]
    when 60...120
      [x, c, 0]
    when 120...180
      [0, c, x]
    when 180...240
      [0, x, c]
    when 240...300
      [x, 0, c]
    when 300...360
      [c, 0, x]
    end
    r, g, b = rgb.map { |v| to_hex(255 * (v + m)) }
    "##{r}#{g}#{b}"
  end

  # Two-digit lowercase hex for a 0-255 channel value.
  def to_hex(value)
    value.to_i.to_s(16).rjust(2, '0')
  end
end
end | |
# TODO parameterize these constants
# Only Ruby sources are considered; config, migrations, and test code are
# excluded. Frozen so the shared defaults can't be mutated by a transformer.
INCLUDE_GLOBS = [
  "**/*.rb",
].freeze
EXCLUDE_GLOBS = [
  "config/**/*",
  "db/**/*",
  "**/test/**/*",
  "**/spec/**/*",
  #"**/graph{ql,_api}/**/*",
].freeze
# Defaults for every CLI switch; help strings below must agree with these.
options = {
  history_since: "3 months ago",
  max_changed: 25,
  minimum_graph_size: 10,
  max_graphs: 25,
  exclude_unknown: false,
  clustering: "package",
  output_graphs: true,
  output_directory: Pathname.pwd,
  output_format: "pdf",
  graphviz_command: "neato",
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on("--clustering=CLUSTERING", "Cluster by package, directory, or none (default: package)") do |v|
    options[:clustering] = v
  end

  opts.on(
    "--max-changed=N",
    Integer,
    # Help text previously claimed a default of 50; the actual default is 25.
    "Do not include PRs that have more than this many files (default: 25)"
  ) do |v|
    options[:max_changed] = v
  end

  opts.on("--since=SINCE", "Only grab file data this far back in git history (default: 3 months ago)") do |v|
    options[:history_since] = v
  end

  opts.on("--min-graph-size=N", Integer, "Filter out graphs with fewer than this many nodes (default: 10)") do |v|
    # BUG FIX: this previously assigned options[:history_since], so the
    # switch silently changed the git history window instead of the minimum
    # graph size.
    options[:minimum_graph_size] = v
  end

  opts.on("--max-graphs=N", Integer, "Only output this many subgraphs (default: 25)") do |v|
    options[:max_graphs] = v
  end

  opts.on("--output-directory=PATH", "Directory to emit files (default: current directory)") do |v|
    options[:output_directory] = v
  end

  opts.on("--output-format=FORMAT", "File type to emit (default: pdf)") do |v|
    options[:output_format] = v
  end

  opts.on("--no-graph-output", "If specified, don't render any graphs") do |v|
    options[:output_graphs] = false
  end

  opts.on("--graphviz-command=CMD", "Graphviz command to use for emitting results (default: neato)") do |v|
    options[:graphviz_command] = v
  end

  opts.on("--exclude-unknown", "Exclude anything not in a package, if clustering by package") do |v|
    options[:exclude_unknown] = v
  end
end
# Parse CLI switches; whatever remains in args is treated as file paths whose
# PRs should be listed at the end of the run.
args = ARGV.dup
option_parser.parse!(args)
files_to_show_prs_for = args

# Pick the clustering strategy for the renderer from the CLI option.
clusterer =
  case options[:clustering]
  when "none" then Clustering::None.new
  when "directory" then Clustering::ByDirectory.new(5)
  when "package" then Clustering::ByPackage.new(exclude_unknown: options[:exclude_unknown])
  else abort("Unknown clustering option: #{options[:clustering]}")
  end
#-------------------------------------------------------------------------------- | |
# ETL pipeline: | |
# | |
# -- Extract -- | |
# | |
# Get data that shows files that change together. | |
# | |
# Output :: Array[FileGrouping] | |
# | |
# | |
# -- Transform -- | |
# | |
# Filtering followed by transformations. End result should be a list of graphs. | |
# | |
# Output :: Array[Graph] | |
# | |
# | |
# -- Loaders -- | |
# | |
# Can output any graph format. Currently just graphviz. | |
#-------------------------------------------------------------------------------- | |
# -- Extract: one FileGrouping per master commit in the history window --
file_groupings = Extractors::Git.new(git_dir: Dir.pwd, since: options.fetch(:history_since)).extract

# -- Transform: filter file lists, build the co-change graph, then split it
#    into connected subgraphs and normalize their edge weights --
file_groupings = Transformers::FileGlobFilter.new.filter!(file_groupings)
file_groupings = Transformers::ExcludeNonExistantFiles.new.filter!(file_groupings)
graph = Transformers::FileChangesToGraph.new(options.fetch(:max_changed)).transform(file_groupings)
subgraphs = Transformers::FindSubgraphs.new.transform(graph)
subgraphs = Transformers::RemoveSmallSubgraphs.new(options.fetch(:minimum_graph_size)).transform(subgraphs)
subgraphs = Transformers::NormalizeGraphWeights.new.transform(subgraphs)
# Subgraphs are sorted largest-first, so take() keeps the biggest ones.
subgraphs = subgraphs.take(options.fetch(:max_graphs))
abort "No data to render!" if subgraphs.empty?
# -- Load: render each subgraph through graphviz, then merge the results --
if options.fetch(:output_graphs)
  Dir.mktmpdir do |dir|
    output_format = options.fetch(:output_format)
    # Pipe graphviz source for each subgraph into the configured layout
    # command, writing one file per subgraph into the temp dir.
    subgraphs.each_with_index do |graph, index|
      output_file_path = File.join(dir, "graph_#{index.to_s.rjust(3, '0')}.#{output_format}")
      args = [options.fetch(:graphviz_command), "-T", output_format, "-o", output_file_path]
      Open3.popen3(*args) do |stdin, stdout, stderr, waiter_thread|
        Loaders::Graphviz.new(graph, clustering: clusterer).render_to(stdin)
        stdin.close
        IO.copy_stream(stdout, STDOUT)
        IO.copy_stream(stderr, STDERR)
        abort unless waiter_thread.value.success?
      end
    end
    # Merge all per-subgraph PDFs into a single graph.pdf with ghostscript.
    # NOTE(review): the merge globs *.pdf only, so with a non-pdf
    # --output-format the rendered files never reach graph.pdf — confirm
    # whether that is intended.
    output_directory = Pathname.new(options.fetch(:output_directory))
    output_directory.mkpath
    output_file = output_directory.join("graph.pdf")
    abort unless system("gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=\"#{output_file}\" #{dir}/*.pdf")
  end
end
# If file paths were given on the command line, list each grouping (PR) that
# touched any of them, highlighting the requested files in green.
unless files_to_show_prs_for.empty?
  file_groupings.each do |file_grouping|
    files = file_grouping.files
    next unless files.any? { |f| files_to_show_prs_for.include?(f) }

    # TODO perhaps parse origin remote to construct a URL to PR
    puts(file_grouping.key)
    files.each do |file|
      if files_to_show_prs_for.include?(file)
        # Green (ANSI SGR 32) for a file that was explicitly requested.
        # BUG FIX: this branch printed "\t -<green>file" — dash spacing
        # inconsistent with the plain branch's "\t- "; now normalized.
        puts("\t- \e[32m#{file}\e[0m")
      else
        puts("\t- #{file}")
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment