Skip to content

Instantly share code, notes, and snippets.

@ethomson
Last active August 3, 2024 02:37
Show Gist options
  • Save ethomson/1829e4107ed030f91f2c85f5fe7c8eb4 to your computer and use it in GitHub Desktop.
Save ethomson/1829e4107ed030f91f2c85f5fe7c8eb4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl -w
use strict;
use IPC::Open2;
use IPC::Open3;
# Source repository (SHA-1 object ids) that is read by every convert_* step.
# NOTE(review): hard-coded to one developer's checkout; edit before running.
my $SHA1_PATH="/Users/ethomson/Projects/libgit2/libgit2/tests/resources/testrepo.git";
# Destination repository; created below and configured for sha256 objects.
my $SHA256_PATH="/tmp/testrepo_256.git";
# File written by write_object_idx(): one "<sha256> <sha1>" line per object.
my $MAPPING_PATH="${SHA256_PATH}/object-idx";
# Fixed SHA-1 => SHA-256 translations. Every id lookup in this script consults
# this table first and only then falls back to %BY_SHA1.
my %REWRITE = (
# builtin
# (presumably the all-zero "null" id and git's well-known empty-tree id --
# TODO confirm the second pair)
'0000000000000000000000000000000000000000' => '0000000000000000000000000000000000000000000000000000000000000000',
'4b825dc642cb6eb9a060e54bf8d69288fbee4904' => '6ef19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321',
# For the remaining entries the SHA-256 value is simply the SHA-1 id padded to
# 64 hex digits by repeating its own leading 24 digits.
# NOTE(review): presumably ids referenced by the test repository whose objects
# do not exist -- confirm.
'18f1181c98a277950b8bc4524b645c3240d0d605' => '18f1181c98a277950b8bc4524b645c3240d0d60518f1181c98a277950b8bc452',
'24b42adea646b2cbea0e0e43b08e46f3c8b18ec4' => '24b42adea646b2cbea0e0e43b08e46f3c8b18ec424b42adea646b2cbea0e0e43',
'3161df8cbf3a006b4ef85be6497a0ea6bde98541' => '3161df8cbf3a006b4ef85be6497a0ea6bde985413161df8cbf3a006b4ef85be6',
'82eb88663cedceef90965e3439ff6829914f2473' => '82eb88663cedceef90965e3439ff6829914f247382eb88663cedceef90965e34',
'f4b83a6f9c2b8081ebd5b0e9e780e70c26392c91' => 'f4b83a6f9c2b8081ebd5b0e9e780e70c26392c91f4b83a6f9c2b8081ebd5b0e9'
);
# globals
# %BY_SHA1 maps a SHA-1 id to its object hash ('sha1_id', 'type' and, once the
# object has been converted, 'sha256_id').
my %BY_SHA1 = ();
# @ALL_OBJECTS holds every object hash discovered, packed or loose; it is what
# gets written to the mapping file at the end.
my @ALL_OBJECTS = ();
# locate source (sha1) repository
chomp(my $SHA1_GIT_DIR=`cd '${SHA1_PATH}' && git rev-parse --absolute-git-dir`);
# create destination (sha256) repository
# system() returns non-zero when the command fails, hence "&& die".
system("git init --bare '${SHA256_PATH}'") && die;
chomp(my $SHA256_GIT_DIR=`cd '${SHA256_PATH}' && git rev-parse --absolute-git-dir`);
system("git --git-dir '${SHA256_GIT_DIR}' config core.repositoryFormatVersion 1") && die;
system("git --git-dir '${SHA256_GIT_DIR}' config extensions.objectFormat sha256") && die;
# Loading a previously written mapping file is disabled; the mapping is always
# rebuilt from scratch.
# read_object_idx();
# Objects are converted first: the ref, reflog, metadata and index steps below
# translate ids through %BY_SHA1, which the object steps populate.
convert_packed_objects($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_loose_objects($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_packed_refs($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_loose_refs($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_metadata($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_reflogs($SHA1_GIT_DIR, $SHA256_GIT_DIR);
convert_index($SHA1_GIT_DIR, $SHA256_GIT_DIR);
# Finally record the sha256 <-> sha1 mapping of every object seen.
write_object_idx(\@ALL_OBJECTS);
# Convert every object stored in the source repository's packfiles.
#
# For each pack-<sha1>.idx found in the source objects/pack directory, the
# pack's objects are listed with `git verify-pack -v`, recorded in the global
# @ALL_OBJECTS and %BY_SHA1, converted into $dest with convert_objects(), and
# finally repacked in $dest with pack_objects().
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#
# Dies if the pack directory cannot be opened or git-verify-pack cannot be
# started; warns if git-verify-pack reports a failure.
sub convert_packed_objects {
    my ($source, $dest) = @_;

    print "Reading packed objects in ${source}...\n";

    chomp(my $source_objects = `cd '${source}' && git rev-parse --path-format=absolute --git-path objects`);
    my $pack_dir = "${source_objects}/pack";

    opendir(my $packs, $pack_dir) || die "$0: cannot open directory ${pack_dir}: $!\n";

    while (defined(my $filename = readdir($packs))) {
        next unless ($filename =~ /^pack-([0-9a-f]{40})\.idx$/);

        # Copy the capture right away; later matches would clobber $1.
        my $pack_id = $1;
        print "Converting packed objects in ${pack_id}...\n";

        my @objects;

        # List form of open: the command never passes through a shell, so
        # paths containing quotes or spaces are handled correctly.
        open(my $verify, '-|', 'git', '--git-dir', $source_objects,
            'verify-pack', '-v', "${pack_dir}/${filename}")
            || die "$0: could not execute git-verify-pack: $!\n";

        while (my $line = <$verify>) {
            # Only "<sha1> <type> ..." lines describe objects; anything else
            # in the verify-pack output is skipped.
            next unless ($line =~ /^([0-9a-f]{40}) (blob|tree|commit|tag) /);

            my $object = { 'sha1_id' => $1, 'type' => $2 };

            push(@objects, $object);
            push(@ALL_OBJECTS, $object);
            $BY_SHA1{ $object->{'sha1_id'} } = $object;
        }

        # close() on a pipe reports the child's exit status; a failure here
        # means the object list above may be incomplete.
        close($verify)
            || warn "$0: git verify-pack reported a problem for ${filename} (status $?)\n";

        convert_objects($source, $dest, \@objects);
        pack_objects($dest, \@objects);
    }

    closedir($packs);
    return;
}
# Convert every loose object of the source repository.
#
# Walks the two-hex-digit fan-out directories below the source objects
# directory, asks `git cat-file -t` for the type of each object found,
# registers the object in @ALL_OBJECTS and %BY_SHA1, and hands the whole
# batch to convert_objects().
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
sub convert_loose_objects {
    my ($source, $dest) = @_;

    my $objects_dir = `cd '${source}' && git rev-parse --path-format=absolute --git-path objects`;
    chomp($objects_dir);

    print "Reading loose objects in ${source}...\n";

    my @found;

    opendir(my $fanout_dh, "${objects_dir}") || die;
    while (my $fanout = readdir($fanout_dh)) {
        # Only directories named with exactly two hex digits hold objects.
        if ($fanout =~ /^[0-9a-f]{2}$/) {
            opendir(my $object_dh, "${objects_dir}/${fanout}") || die;
            while (my $rest = readdir($object_dh)) {
                if ($rest =~ /^[0-9a-f]{38}$/) {
                    # Directory name + file name form the full 40-digit id.
                    my $id   = $fanout . $rest;
                    my $kind = `git --git-dir='${source}' cat-file -t '${id}'`;
                    chomp($kind);

                    my $record = { 'sha1_id' => $id, 'type' => $kind };
                    push(@found, $record);
                    push(@ALL_OBJECTS, $record);
                    $BY_SHA1{$id} = $record;
                }
            }
            closedir($object_dh);
        }
    }
    closedir($fanout_dh);

    convert_objects($source, $dest, \@found);
}
# Convert a batch of objects, grouped by type.
#
# The objects are bucketed by their 'type' field and converted in the order
# blobs, trees, commits, tags, so that an object is converted after the kinds
# of objects it can point at.
#
# Parameters:
#   $source  - git directory of the SHA-1 repository
#   $dest    - git directory of the SHA-256 repository
#   $objects - array ref of object hashes ('sha1_id', 'type')
#
# Dies if an object has no 'sha1_id'.
sub convert_objects {
    my ($source, $dest, $objects) = @_;

    my %objects_of = map { $_ => [] } qw(blob tree commit tag);

    for my $entry (@{$objects}) {
        $entry->{'sha1_id'} || die;

        my $bucket = ($objects_of{ $entry->{'type'} } ||= []);
        push(@{$bucket}, $entry);
    }

    convert_blobs($source, $dest, $objects_of{'blob'});
    convert_trees($source, $dest, $objects_of{'tree'});
    convert_commits($source, $dest, $objects_of{'commit'});
    convert_tags($source, $dest, $objects_of{'tag'});
}
# Copy blobs from the source repository into the destination.
#
# Each blob's content is piped from `git cat-file` in $source straight into
# `git hash-object -w --stdin` in $dest; the id printed by hash-object is
# stored in the blob's 'sha256_id' field.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#   $blobs  - array ref of object hashes ('sha1_id', 'type')
sub convert_blobs {
    my ($source, $dest, $blobs) = @_;

    print " Step one: Converting blobs...\n";

    for my $entry (@{$blobs}) {
        my $pipeline = "git --git-dir='${source}' cat-file $entry->{'type'} $entry->{'sha1_id'} | git --git-dir='${dest}' hash-object -w --stdin";

        my $new_id = `$pipeline`;
        chomp($new_id);

        $entry->{'sha256_id'} = $new_id;
        print " blob: $entry->{'sha1_id'} -> $entry->{'sha256_id'}\n";
    }
}
# Convert tree objects from the source repository into the destination.
#
# Each tree is listed with `git ls-tree`; every entry's object id is replaced
# by its SHA-256 counterpart (looked up in %REWRITE first, then %BY_SHA1) and
# the rewritten listing is fed to `git mktree` in $dest. The id printed by
# mktree is stored in the tree's 'sha256_id' field.
#
# Trees can reference other trees that have not been converted yet. Rather
# than building a dependency graph, the list is swept repeatedly: a tree with
# an unresolved entry is deferred to the next sweep.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#   $trees  - array ref of object hashes ('sha1_id', 'type'); it is emptied
#             as the trees are converted
#
# Dies if git-ls-tree cannot be started, if git-mktree fails, or if a whole
# sweep converts nothing (some entry can never be resolved).
sub convert_trees {
    my ($source, $dest, $trees) = @_;

    print " Step two: Converting trees...\n";

    while (@{$trees}) {
        my @deferred;

        for my $tree (@{$trees}) {
            # Start from '' so that a tree without entries still sends
            # defined (empty) input to mktree.
            my $newdata    = '';
            my $incomplete = 0;

            open(my $listing, '-|', 'git', "--git-dir=${source}", 'ls-tree', $tree->{'sha1_id'})
                || die "$0: could not execute git-ls-tree: $!\n";

            while (my $entry = <$listing>) {
                chomp $entry;

                # ls-tree prints "<mode> <type> <id>\t<name>".
                my ($metadata, $name) = split(/\t/, $entry, 2);
                my ($mode, $type, $sha1_id) = split(' ', $metadata);

                my $sha256_id = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
                if (!$sha256_id) {
                    # Entry not converted yet; retry this tree next sweep.
                    $incomplete = 1;
                    last;
                }

                $newdata .= "${mode} ${type} ${sha256_id}\t${name}\n";
            }
            close($listing);

            if ($incomplete) {
                push(@deferred, $tree);
                next;
            }

            # open2 raises an exception itself when the command cannot be
            # started, so its return value is the child pid, not a status.
            my $pid = open2(my $mktree_out, my $mktree_in, 'git', "--git-dir=${dest}", 'mktree');
            print {$mktree_in} $newdata;
            close($mktree_in);

            my $sha256_id = <$mktree_out>;
            close($mktree_out);

            # open2 does not reap its child; without this every converted
            # tree would leave a zombie process behind.
            waitpid($pid, 0);

            chomp($sha256_id) if defined $sha256_id;
            die "$0: git-mktree failed for tree $tree->{'sha1_id'}\n"
                unless ($? == 0 && $sha256_id);

            $tree->{'sha256_id'} = $sha256_id;
            print " tree: $tree->{'sha1_id'} -> $tree->{'sha256_id'}\n";
        }

        # Nothing converted in a full sweep means no further progress is possible.
        die "could not process some trees: $trees->[0]->{'sha1_id'}"
            if (scalar(@deferred) == scalar(@{$trees}));

        @{$trees} = @deferred;
    }

    return;
}
# Convert commit objects from the source repository into the destination.
#
# The raw commit is read with `git cat-file`; the ids on its "tree" and
# "parent" header lines are replaced by their SHA-256 counterparts (looked up
# in %REWRITE first, then %BY_SHA1) and the result is written to $dest with
# `git hash-object -t commit -w --stdin`. The id printed by hash-object is
# stored in the commit's 'sha256_id' field.
#
# Only the header section (everything before the first blank line) is
# rewritten; the commit message is copied through untouched.
#
# Commits can reference parents that have not been converted yet. Rather than
# building a dependency graph, the list is swept repeatedly: a commit with an
# unresolved id is deferred to the next sweep.
#
# Parameters:
#   $source  - git directory of the SHA-1 repository
#   $dest    - git directory of the SHA-256 repository
#   $commits - array ref of object hashes ('sha1_id', 'type'); it is emptied
#              as the commits are converted
#
# Dies if git-cat-file cannot be started, if git-hash-object fails, or if a
# whole sweep converts nothing (some id can never be resolved).
sub convert_commits {
    my ($source, $dest, $commits) = @_;

    print " Step three Converting commits...\n";

    while (@{$commits}) {
        my @deferred;

        for my $commit (@{$commits}) {
            my $newdata    = '';
            my $incomplete = 0;
            my $in_headers = 1;

            open(my $raw, '-|', 'git', "--git-dir=${source}", 'cat-file',
                $commit->{'type'}, $commit->{'sha1_id'})
                || die "$0: could not execute git-cat-file: $!\n";

            while (my $line = <$raw>) {
                if ($in_headers) {
                    if ($line =~ /^(tree|parent) (.*)\n$/) {
                        my ($field, $sha1_id) = ($1, $2);

                        my $sha256_id = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
                        if (!$sha256_id) {
                            # Remember what was missing for the final error
                            # message and retry this commit next sweep.
                            $commit->{'missing_sha1'} = $sha1_id;
                            $incomplete = 1;
                            last;
                        }

                        $line = "${field} ${sha256_id}\n";
                    }
                    elsif ($line eq "\n") {
                        # The first blank line ends the headers. This must be
                        # a double-quoted "\n": a single-quoted '\n' is a
                        # backslash followed by 'n' and never matches.
                        $in_headers = 0;
                    }
                }

                $newdata .= $line;
            }
            close($raw);

            if ($incomplete) {
                push(@deferred, $commit);
                next;
            }
            delete $commit->{'missing_sha1'};

            my $pid = open2(my $hash_out, my $hash_in, 'git', "--git-dir=${dest}",
                'hash-object', '-t', 'commit', '-w', '--stdin');
            print {$hash_in} $newdata;
            close($hash_in);

            my $sha256_id = <$hash_out>;
            close($hash_out);

            # open2 does not reap its child; without this every converted
            # commit would leave a zombie process behind.
            waitpid($pid, 0);

            chomp($sha256_id) if defined $sha256_id;
            die "$0: git-hash-object failed for commit $commit->{'sha1_id'}\n"
                unless ($? == 0 && $sha256_id);

            $commit->{'sha256_id'} = $sha256_id;
            print " commit: $commit->{'sha1_id'} -> $commit->{'sha256_id'}\n";
        }

        # Nothing converted in a full sweep means no further progress is possible.
        die "could not process commit $commits->[0]->{'sha1_id'} (missing $commits->[0]->{'missing_sha1'})"
            if (scalar(@deferred) == scalar(@{$commits}));

        @{$commits} = @deferred;
    }

    return;
}
# Convert annotated tag objects from the source repository into the destination.
#
# The raw tag is read with `git cat-file`; the id on its "object" header line
# is replaced by its SHA-256 counterpart (looked up in %REWRITE first, then
# %BY_SHA1) and the result is written to $dest with
# `git hash-object -t tag -w --stdin`. The id printed by hash-object is stored
# in the tag's 'sha256_id' field.
#
# Only the header section (everything before the first blank line) is
# rewritten; the tag message is copied through untouched.
#
# A tag whose target has not been converted yet (for example a tag pointing
# at another tag) is deferred and retried on the next sweep over the list.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#   $tags   - array ref of object hashes ('sha1_id', 'type'); it is emptied
#             as the tags are converted
#
# Dies if git-cat-file cannot be started, if git-hash-object fails, or if a
# whole sweep converts nothing (some target can never be resolved).
sub convert_tags {
    my ($source, $dest, $tags) = @_;

    print " Step four: converting tags...\n";

    while (@{$tags}) {
        my @deferred;

        for my $tag (@{$tags}) {
            my $newdata    = '';
            my $incomplete = 0;
            my $in_headers = 1;

            open(my $raw, '-|', 'git', "--git-dir=${source}", 'cat-file',
                $tag->{'type'}, $tag->{'sha1_id'})
                || die "$0: could not execute git-cat-file: $!\n";

            while (my $line = <$raw>) {
                if ($in_headers) {
                    if ($line =~ /^object (.*)\n$/) {
                        my $sha1_id = $1;

                        my $sha256_id = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
                        if (!$sha256_id) {
                            # Remember what was missing for the final error
                            # message and retry this tag next sweep.
                            $tag->{'missing_sha1'} = $sha1_id;
                            $incomplete = 1;
                            last;
                        }

                        $line = "object ${sha256_id}\n";
                    }
                    elsif ($line eq "\n") {
                        # The first blank line ends the headers. This must be
                        # a double-quoted "\n": a single-quoted '\n' is a
                        # backslash followed by 'n' and never matches.
                        $in_headers = 0;
                    }
                }

                $newdata .= $line;
            }
            close($raw);

            if ($incomplete) {
                push(@deferred, $tag);
                next;
            }
            delete $tag->{'missing_sha1'};

            my $pid = open2(my $hash_out, my $hash_in, 'git', "--git-dir=${dest}",
                'hash-object', '-t', 'tag', '-w', '--stdin');
            print {$hash_in} $newdata;
            close($hash_in);

            my $sha256_id = <$hash_out>;
            close($hash_out);

            # open2 does not reap its child; without this every converted
            # tag would leave a zombie process behind.
            waitpid($pid, 0);

            chomp($sha256_id) if defined $sha256_id;
            die "$0: git-hash-object failed for tag $tag->{'sha1_id'}\n"
                unless ($? == 0 && $sha256_id);

            $tag->{'sha256_id'} = $sha256_id;
            print " tag: $tag->{'sha1_id'} -> $tag->{'sha256_id'}\n";
        }

        # Nothing converted in a full sweep means no further progress is possible.
        die "could not process tag $tags->[0]->{'sha1_id'} (missing $tags->[0]->{'missing_sha1'})"
            if (scalar(@deferred) == scalar(@{$tags}));

        @{$tags} = @deferred;
    }

    return;
}
# Pack a set of converted objects in the destination repository and remove
# their loose copies.
#
# Parameters:
#   $git_dir - git directory of the destination repository
#   $objects - array ref of object hashes; each must already have 'sha256_id'
#
# Returns the first line git-pack-objects prints on its stdout (presumably the
# name of the pack it wrote -- confirm against git-pack-objects).
# Dies if an object has no 'sha256_id', or if a loose object file or prefix
# directory cannot be removed.
sub pack_objects {
my($git_dir, $objects) = @_;
print " Writing packfile...\n";
# NOTE(review): the pid returned by open3 is discarded, so the child is never
# waited for, and nothing is ever read from PACK_OBJECTS_ERR.
open3(\*PACK_OBJECTS_IN, \*PACK_OBJECTS_OUT, \*PACK_OBJECTS_ERR, "git --git-dir='${git_dir}' pack-objects --include-tag '${git_dir}/objects/pack/pack'");
# Feed the ids to pack, one per line, on the child's stdin.
foreach my $object (@$objects) {
die if (! $object->{'sha256_id'});
print PACK_OBJECTS_IN "$object->{'sha256_id'}\n";
}
close(PACK_OBJECTS_IN);
chomp(my $packfile = <PACK_OBJECTS_OUT>);
close(PACK_OBJECTS_OUT);
close(PACK_OBJECTS_ERR);
# Delete the loose file of every object that was just packed. The loose path
# is objects/<first two hex digits>/<remaining digits>.
my @prefixes;
foreach my $object (@$objects) {
my($prefix, $suffix) = ($object->{'sha256_id'} =~ /^([0-9a-f]{2})(.*)/);
my $filename = "${git_dir}/objects/${prefix}/${suffix}";
push(@prefixes, $prefix);
next unless (-f "${filename}");
# Make the file writable before unlinking it.
chmod(0644, $filename) || die "$0: could not chmod ${filename}: $!";
unlink($filename) || die "$0: could not delete ${filename}: $!";
}
# NOTE(review): hard-coded extra prefix directory; the reason for '59' is not
# visible in this file -- confirm before changing.
push(@prefixes, '59');
# Remove each distinct prefix directory that exists. The list is sorted so
# duplicates are adjacent and can be skipped via $last_prefix. rmdir failing
# (for instance on a directory that is not empty) is fatal.
my $last_prefix = '';
foreach my $prefix (sort(@prefixes)) {
if ($last_prefix ne $prefix) {
my $dirname = "${git_dir}/objects/${prefix}";
next unless(-d "${dirname}");
rmdir("${git_dir}/objects/${prefix}") || die "$0: could not rmdir ${dirname}: $!";
}
$last_prefix = $prefix;
}
return $packfile;
}
# Rewrite the source repository's packed-refs file for the destination.
#
# Three kinds of lines are handled:
#   "<sha1> <refname>" - the id is translated (via %REWRITE, then %BY_SHA1)
#   "^<sha1>"          - a peeled-tag line belonging to the ref line before
#                        it; its id is translated the same way
#   anything else      - (such as the "# pack-refs with:" header) is copied
#                        through unchanged
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#
# Dies if either packed-refs file cannot be opened or written, or if an id
# has no known SHA-256 counterpart.
sub convert_packed_refs {
    my ($source, $dest) = @_;

    print "Converting packed refs in ${source}...\n";

    chomp(my $source_file = `cd '${source}' && git rev-parse --path-format=absolute --git-path packed-refs`);
    chomp(my $dest_file = `cd '${dest}' && git rev-parse --path-format=absolute --git-path packed-refs`);

    open(my $in, '<', $source_file) || die "$0: could not open ${source_file}: $!\n";
    open(my $out, '>', $dest_file) || die "$0: could not open ${dest_file}: $!\n";

    # Name of the most recent ref line, used only in error messages for the
    # peeled line that follows it.
    my $last_name = '(unknown ref)';

    while (my $line = <$in>) {
        if ($line =~ /^[0-9a-f]{40} /) {
            chomp $line;
            my ($sha1_id, $name) = split(/ /, $line);

            my $target = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
            die "$0: could not find target of ${name}: no object for ${sha1_id}\n" unless ($target);

            print {$out} "${target} ${name}\n";
            print " ${name}: ${sha1_id} -> ${target}\n";
            $last_name = $name;
        }
        elsif ($line =~ /^\^([0-9a-f]{40})$/) {
            # Peeled value of the annotated tag named on the previous line.
            # Copying it verbatim would leave a SHA-1 id in a SHA-256 repo.
            my $sha1_id = $1;

            my $peeled = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
            die "$0: could not find peeled target of ${last_name}: no object for ${sha1_id}\n" unless ($peeled);

            print {$out} "^${peeled}\n";
            print " ${last_name} (peeled): ${sha1_id} -> ${peeled}\n";
        }
        else {
            print {$out} $line;
        }
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) || die "$0: could not write ${dest_file}: $!\n";
    return;
}
# Convert all loose references of the source repository.
#
# Resolves the absolute "refs" directory of both repositories through
# `git rev-parse --git-path` and walks the source one recursively with
# convert_loose_refs_dir().
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
sub convert_loose_refs {
    my ($source, $dest) = @_;

    print "Converting loose refs in ${source}...\n";

    my $refs_from = `cd '${source}' && git rev-parse --path-format=absolute --git-path refs`;
    my $refs_to   = `cd '${dest}' && git rev-parse --path-format=absolute --git-path refs`;
    chomp($refs_from, $refs_to);

    # undef base: start at the top of the refs directory.
    return convert_loose_refs_dir($refs_from, $refs_to, undef);
}
# Recursively convert the loose references below one directory.
#
# Sub-directories are mirrored in the destination (created when missing) and
# descended into; every other entry is converted with convert_loose_ref().
#
# Parameters:
#   $source - root of the source refs directory
#   $dest   - root of the destination refs directory
#   $base   - path of the directory to scan, relative to the roots; a false
#             value (undef) means the roots themselves
#
# Dies if a directory cannot be opened or created.
sub convert_loose_refs_dir {
    my ($source, $dest, $base) = @_;

    my $scan_dir = $source;
    $scan_dir = "${source}/${base}" if ($base);

    opendir(my $dh, $scan_dir) || die "$0: could not open directory $scan_dir: $!\n";

    while (my $entry = readdir($dh)) {
        if ($entry eq '.' || $entry eq '..') {
            next;
        }

        # Path of this entry relative to the refs roots.
        my $relative  = $base ? "${base}/${entry}" : $entry;
        my $from_path = "${source}/${relative}";
        my $to_path   = "${dest}/${relative}";

        unless (-d $from_path) {
            convert_loose_ref($source, $dest, $relative);
            next;
        }

        unless (-d $to_path) {
            mkdir($to_path) || die "$0: could not create directory ${to_path}: $!\n";
        }
        convert_loose_refs_dir($source, $dest, $relative);
    }

    closedir($dh);
}
# Convert a single loose reference file.
#
# Reads the first line of ${source}/${ref}. When it is a 40-digit hex id it
# is translated to its SHA-256 counterpart (via %REWRITE, then %BY_SHA1);
# any other content (for example a symbolic "ref: ..." line) is kept as is.
# The result is written to ${dest}/${ref}.
#
# Parameters:
#   $source - directory containing the source reference
#   $dest   - directory receiving the converted reference
#   $ref    - path of the reference relative to both directories
#
# Dies if either file cannot be opened or written, if the source file is
# empty, or if the id has no known SHA-256 counterpart.
sub convert_loose_ref {
    my ($source, $dest, $ref) = @_;

    my $source_refpath = "${source}/${ref}";
    my $dest_refpath   = "${dest}/${ref}";

    # Three-argument open: reference names may legally contain characters
    # such as '|' or '>' that two-argument open would treat as a mode.
    open(my $in, '<', $source_refpath) || die "$0: could not open reference ${source_refpath}: $!\n";
    my $source_target = <$in>;
    close($in);

    die "$0: reference ${source_refpath} is empty\n" unless (defined $source_target);
    chomp $source_target;

    my $dest_target;
    if ($source_target =~ /^[0-9a-f]{40}$/) {
        $dest_target = $REWRITE{$source_target} || $BY_SHA1{$source_target}->{'sha256_id'};
    }
    else {
        $dest_target = $source_target;
    }
    die "$0: could not find target of ${ref}: no object for ${source_target}\n" unless ($dest_target);

    # The destination is only created once the target is known, so a failed
    # lookup does not leave an empty reference file behind.
    open(my $out, '>', $dest_refpath) || die "$0: could not open reference ${dest_refpath}: $!\n";
    print {$out} "${dest_target}\n";
    close($out) || die "$0: could not write reference ${dest_refpath}: $!\n";

    print " ${ref}: ${source_target} -> ${dest_target}\n";
    return;
}
# Convert all reflogs of the source repository.
#
# Resolves the absolute "logs" directory of both repositories through
# `git rev-parse --git-path`, creates the destination one when it does not
# exist, and walks the source one recursively with convert_reflogs_dir().
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#
# Dies if the destination logs directory cannot be created.
sub convert_reflogs {
    my ($source, $dest) = @_;

    print "Converting reflogs in ${source}...\n";

    my $logs_from = `cd '${source}' && git rev-parse --path-format=absolute --git-path logs`;
    my $logs_to   = `cd '${dest}' && git rev-parse --path-format=absolute --git-path logs`;
    chomp($logs_from, $logs_to);

    unless (-d $logs_to) {
        mkdir($logs_to) || die "$0: could not create directory ${logs_to}: $!\n";
    }

    # undef base: start at the top of the logs directory.
    convert_reflogs_dir($logs_from, $logs_to, undef);
}
# Recursively convert the reflog files below one directory.
#
# Sub-directories are mirrored in the destination (created when missing) and
# descended into; every other entry is converted with convert_reflog().
#
# Parameters:
#   $source - root of the source logs directory
#   $dest   - root of the destination logs directory
#   $base   - path of the directory to scan, relative to the roots; a false
#             value (undef) means the roots themselves
#
# Dies if a directory cannot be opened or created.
sub convert_reflogs_dir {
    my ($source, $dest, $base) = @_;

    my $scan_dir = $base ? "${source}/${base}" : $source;

    opendir(my $dh, $scan_dir) || die "$0: could not open directory $scan_dir: $!\n";

    while (my $entry = readdir($dh)) {
        next if ($entry eq '.');
        next if ($entry eq '..');

        # Path of this entry relative to the logs roots.
        my $relative = $base ? "${base}/${entry}" : $entry;
        my $to_path  = "${dest}/${relative}";

        if (-d "${source}/${relative}") {
            if (!-d $to_path) {
                mkdir($to_path) || die "$0: could not create directory ${to_path}: $!\n";
            }
            convert_reflogs_dir($source, $dest, $relative);
            next;
        }

        convert_reflog($source, $dest, $relative);
    }

    closedir($dh);
}
# Convert a single reflog file.
#
# Each line starts with two space-separated ids (the value before and after
# the update). Both are translated to their SHA-256 counterparts (via
# %REWRITE, then %BY_SHA1); the remainder of the line is copied verbatim.
#
# Parameters:
#   $source - root of the source logs directory
#   $dest   - root of the destination logs directory
#   $ref    - path of the reflog relative to both roots
#
# Dies if either file cannot be opened or written, or if an id has no known
# SHA-256 counterpart.
sub convert_reflog {
    my ($source, $dest, $ref) = @_;

    my $source_logpath = "${source}/${ref}";
    my $dest_logpath   = "${dest}/${ref}";

    # Three-argument open: reference names may legally contain characters
    # such as '|' or '>' that two-argument open would treat as a mode.
    open(my $in, '<', $source_logpath) || die "$0: could not open reflog ${source_logpath}: $!\n";
    open(my $out, '>', $dest_logpath) || die "$0: could not open reflog ${dest_logpath}: $!\n";

    while (my $line = <$in>) {
        chomp $line;

        # Limit of 3 keeps everything after the second id as one untouched
        # field; an unlimited split would drop trailing empty fields and so
        # lose spaces at the end of the log message.
        my ($start_sha1, $finish_sha1, @rest) = split(/ /, $line, 3);

        my $start_sha256  = $REWRITE{$start_sha1}  || $BY_SHA1{$start_sha1}->{'sha256_id'};
        my $finish_sha256 = $REWRITE{$finish_sha1} || $BY_SHA1{$finish_sha1}->{'sha256_id'};

        die "$0: could not find source of log for ${ref}: no object for ${start_sha1}\n" unless ($start_sha256);
        die "$0: could not find target of log for ${ref}: no object for ${finish_sha1}\n" unless ($finish_sha256);

        print {$out} join(" ", $start_sha256, $finish_sha256, @rest) . "\n";
        print " log ${ref}: ${start_sha1} -> ${start_sha256} / ${finish_sha1} -> ${finish_sha256}\n";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) || die "$0: could not write reflog ${dest_logpath}: $!\n";
    return;
}
# Convert the FETCH_HEAD file.
#
# Each line starts with an object id followed by tab-separated fields. The
# id is translated to its SHA-256 counterpart (via %REWRITE, then %BY_SHA1);
# the remainder of the line is copied verbatim.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#
# Dies if either file cannot be opened or written (including when the source
# has no FETCH_HEAD), or if an id has no known SHA-256 counterpart.
sub convert_fetch_head {
    my ($source, $dest) = @_;

    my $source_path = "${source}/FETCH_HEAD";
    my $dest_path   = "${dest}/FETCH_HEAD";

    open(my $in, '<', $source_path) || die "$0: could not open file ${source_path}: $!\n";
    open(my $out, '>', $dest_path) || die "$0: could not open file ${dest_path}: $!\n";

    while (my $line = <$in>) {
        chomp $line;

        # Limit of 2 keeps everything after the id as one untouched field, so
        # empty trailing fields are not silently dropped.
        my ($sha1_id, @rest) = split(/\t/, $line, 2);

        my $sha256_id = $REWRITE{$sha1_id} || $BY_SHA1{$sha1_id}->{'sha256_id'};
        die "$0: could not find FETCH_HEAD identifier: no object for ${sha1_id}\n" unless ($sha256_id);

        print {$out} join("\t", $sha256_id, @rest) . "\n";
        print " FETCH_HEAD: ${sha1_id} -> ${sha256_id}\n";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) || die "$0: could not write file ${dest_path}: $!\n";
    return;
}
# Convert the top-level metadata files of the repository: the HEAD and
# HEAD_TRACKER references (handled like loose refs) and FETCH_HEAD.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
sub convert_metadata {
    my ($source, $dest) = @_;

    print "Converting metadata in ${source}...\n";

    for my $top_level_ref (qw(HEAD HEAD_TRACKER)) {
        convert_loose_ref($source, $dest, $top_level_ref);
    }

    convert_fetch_head($source, $dest);
}
# Convert the index of the source repository.
#
# Reads the staged entries from `git ls-files --stage` in $source, translates
# each entry's object id (via %REWRITE, then %BY_SHA1) and pipes the result
# to `git update-index --index-info` in $dest.
#
# Parameters:
#   $source - git directory of the SHA-1 repository
#   $dest   - git directory of the SHA-256 repository
#
# Dies if either git command cannot be started, or if an id has no known
# SHA-256 counterpart.
sub convert_index {
    my ($source, $dest) = @_;

    print "Converting index in ${source}...\n";

    open(my $staged, "git --git-dir='${source}' ls-files --stage |") || die "$0: could not open index: $!\n";
    open(my $updater, "| git --git-dir='${dest}' update-index --index-info") || die "$0: could not open index: $!\n";

    while (my $record = <$staged>) {
        chomp($record);

        # ls-files --stage prints "<mode> <id> <stage>\t<path>".
        my ($metadata, $path) = split(/\t/, $record);
        my ($mode, $sha1_id, $stage) = split(' ', $metadata);

        my $sha256_id = $REWRITE{$sha1_id};
        $sha256_id ||= $BY_SHA1{$sha1_id}->{'sha256_id'};
        unless ($sha256_id) {
            die "$0: could not find index entry no object for ${sha1_id}\n";
        }

        print {$updater} "${mode} ${sha256_id} ${stage}\t${path}\n";
        print " ${path}: ${sha1_id} -> ${sha256_id}\n";
    }

    close($staged);
    close($updater);
}
# Load a previously written mapping file ($MAPPING_PATH).
#
# Every line is expected to look like "<64 hex digits> <40 hex digits>"
# (SHA-256 id, then SHA-1 id). For each line an object hash is appended to
# @ALL_OBJECTS and registered in %BY_SHA1 under its SHA-1 id.
#
# Dies if the mapping file cannot be opened.
sub read_object_idx {
    open(my $idx, "${MAPPING_PATH}") || die "$0: could not open ${MAPPING_PATH}: $!\n";

    while (my $line = <$idx>) {
        chomp($line);

        my ($sha256_id, $sha1_id) = ($line =~ /^([0-9a-f]{64}) ([0-9a-f]{40})$/);

        # A fresh hash per line, shared between the list and the lookup table.
        my %entry = ('sha1_id' => $sha1_id, 'sha256_id' => $sha256_id);
        push(@ALL_OBJECTS, \%entry);
        $BY_SHA1{$sha1_id} = \%entry;
    }

    close($idx);
}
# Write the mapping file ($MAPPING_PATH): one "<sha256> <sha1>" line per
# object, ordered by SHA-256 id.
#
# Parameters:
#   $objects - array ref of object hashes with 'sha256_id' and 'sha1_id'
#
# Dies if the mapping file cannot be opened for writing.
sub write_object_idx {
    my ($objects) = @_;

    open(my $idx, ">${MAPPING_PATH}") || die "$0: could not open ${MAPPING_PATH}: $!\n";

    my @ordered = sort { $a->{'sha256_id'} cmp $b->{'sha256_id'} } @{$objects};
    for my $entry (@ordered) {
        print {$idx} "$entry->{'sha256_id'} $entry->{'sha1_id'}\n";
    }

    close($idx);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment