Skip to content

Instantly share code, notes, and snippets.

@FeepingCreature
Created July 4, 2024 10:30
Show Gist options
  • Save FeepingCreature/74e04a7167e5f63dea306e656dc5df18 to your computer and use it in GitHub Desktop.
Save FeepingCreature/74e04a7167e5f63dea306e656dc5df18 to your computer and use it in GitHub Desktop.
git centralize tool, written by opus 3.5 sonnet
#!/usr/bin/env python3
# Warning: Undertested! May corrupt your git repos!
import os
import sys
import subprocess
import re
import argparse
import shutil
from pathlib import Path
from urllib.parse import urlparse
# Root directory of the shared caches (one bare repository per submodule URL).
# The XDG Base Directory spec treats an unset *or empty* XDG_DATA_HOME as
# "$HOME/.local/share". Using `or` covers both cases; a .get() default would
# turn an empty value into Path('') and place the caches in the current directory.
DATA_DIR = Path(os.environ.get('XDG_DATA_HOME') or Path.home() / '.local' / 'share') / 'git-submodules'
# Global variables for dry run and verbose modes (reassigned from the
# command-line flags in the script entry point)
DRY_RUN = False
VERBOSE = False
# Global cache for fetched URLs, so each URL is fetched at most once per run
FETCHED_URLS = set()
def run_command(cmd, dry_run=None, verbose=None):
    """Run an external command and return its completed-process result.

    Args:
        cmd: Sequence of arguments; each one is converted with ``str()`` so
            ``Path`` objects can be passed directly.
        dry_run: If true, only print what would be run and return ``None``.
            ``None`` (the default) means "use the global ``DRY_RUN``".
        verbose: If true, echo the command and its stdout. ``None`` (the
            default) means "use the global ``VERBOSE``".

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as
        text, or ``None`` in dry-run mode.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    dry_run = DRY_RUN if dry_run is None else dry_run
    verbose = VERBOSE if verbose is None else verbose
    # Convert all arguments to strings
    cmd = [str(arg) for arg in cmd]
    if verbose:
        print(f"Running command: {' '.join(cmd)}")
    if dry_run:
        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
        return None
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        # stderr is captured and is not part of CalledProcessError's message,
        # so surface it here; otherwise the command's own error text is lost.
        if err.stderr:
            print(err.stderr.strip(), file=sys.stderr)
        raise
    stdout = result.stdout.strip()
    if verbose and stdout:
        print(f"Command output:\n{stdout}")
    return result
def get_submodules(repo_path):
    """Return the submodules declared in ``repo_path / '.gitmodules'``.

    Args:
        repo_path: ``Path`` of the working tree to inspect.

    Returns:
        A dict mapping submodule name to ``{'path': ..., 'url': ...}``.
        Entries lacking either key are left out. An empty dict is returned
        when there is no ``.gitmodules`` file.
    """
    if not (repo_path / '.gitmodules').exists():
        return {}
    listing = run_command(
        ['git', '-C', repo_path, 'config', '--file', '.gitmodules', '--list'],
        # The output is parsed below, so this query has to run for real even
        # in dry-run mode (where run_command would return None).
        dry_run=False,
    )
    entry_re = re.compile(r'^submodule\.(.+?)\.(path|url)=(.+)$')
    found = {}
    for raw_line in listing.stdout.splitlines():
        hit = entry_re.match(raw_line)
        if hit is None:
            continue
        name, key, value = hit.groups()
        found.setdefault(name, {})[key] = value
    # Only keep submodules for which both settings were seen.
    return {
        name: info
        for name, info in found.items()
        if all(required in info for required in ('path', 'url'))
    }
def encode_url(url):
    """Turn a repository URL into a name usable as a cache directory.

    The host name and the path (without leading slashes and without a
    trailing ``.git``) are joined with ``_``; every run of characters other
    than ASCII letters and digits is then collapsed into a single ``_``.
    URLs that ``urlparse`` finds no host in (e.g. scp-style
    ``user@host:path``) are encoded from their path alone, which yields a
    leading underscore.
    """
    parts = urlparse(url)
    host = parts.hostname if parts.hostname else ''
    repo = re.sub(r'\.git$', '', parts.path.lstrip('/'))
    return re.sub(r'[^a-zA-Z0-9]+', '_', f"{host}_{repo}")
def update_cache(submodule_url):
    """Create (if needed) and fetch the shared bare cache for a submodule URL.

    The cache repository lives at ``DATA_DIR / encode_url(submodule_url)``.
    Each URL is fetched at most once per process; fetched URLs are recorded
    in ``FETCHED_URLS``.

    NOTE(review): every git call here passes ``dry_run=False`` and the mkdir
    is unguarded, so the cache is created and fetched even under --dry-run.
    Confirm that this is intended.

    Args:
        submodule_url: URL exactly as written in ``.gitmodules``.

    Raises:
        subprocess.CalledProcessError: If a git command fails.
    """
    if submodule_url in FETCHED_URLS:
        if VERBOSE:
            print(f"Skipping already fetched URL: {submodule_url}")
        return
    cache_path = DATA_DIR / encode_url(submodule_url)
    if VERBOSE:
        print(f"Updating cache for {submodule_url}")
    # A bare repository keeps HEAD at its top level. Testing for HEAD rather
    # than for the directory lets a cache left half-created by an interrupted
    # run be completed instead of failing on every later fetch.
    if not (cache_path / 'HEAD').exists():
        if VERBOSE:
            print(f"Creating directory: {cache_path}")
        cache_path.mkdir(parents=True, exist_ok=True)
        run_command(['git', '-C', cache_path, 'init', '--bare'], dry_run=False)
    # Likewise only add the remote when it is missing, so the step can be
    # repeated safely.
    remotes = run_command(['git', '-C', cache_path, 'remote'], dry_run=False).stdout.split()
    if 'origin' not in remotes:
        run_command(['git', '-C', cache_path, 'remote', 'add', 'origin', submodule_url], dry_run=False)
    run_command(['git', '-C', cache_path, 'fetch', '--tags', 'origin'], dry_run=False)
    FETCHED_URLS.add(submodule_url)
def update_alternates(git_path, submodule_name, submodule_url, submodule_path):
    """Point a submodule's object store at the shared cache, then repack.

    Args:
        git_path: Git directory of the containing repository; the submodule's
            own git directory is ``git_path / 'modules' / submodule_name``.
        submodule_name: Name of the submodule.
        submodule_url: URL used to locate the cache (via ``encode_url``).
        submodule_path: Working tree of the submodule, where the repack runs.

    Raises:
        subprocess.CalledProcessError: If the repack fails.
    """
    cache_objects = str(DATA_DIR / encode_url(submodule_url) / 'objects')
    alternates_path = git_path / 'modules' / submodule_name / 'objects' / 'info' / 'alternates'
    if VERBOSE:
        print(f"Updating alternates for {submodule_name} in {git_path}")
    if DRY_RUN:
        print(f"[DRY RUN] Would update {alternates_path} with content: {cache_objects}")
        return
    alternates_path.parent.mkdir(parents=True, exist_ok=True)
    # Keep alternates that are already configured (e.g. by
    # `git clone --reference`): some objects may only be reachable through
    # them, so overwriting the file could break the repository.
    entries = []
    if alternates_path.exists():
        entries = [line for line in alternates_path.read_text().splitlines() if line.strip()]
    if cache_objects not in entries:
        entries.append(cache_objects)
    # The alternates file holds one object directory per line.
    alternates_path.write_text('\n'.join(entries) + '\n')
    # --local makes the repack leave out objects borrowed from an alternate,
    # which is what frees the duplicated space.
    run_command(['git', '-C', submodule_path, 'repack', '-a', '-d', '--local'])
def remove_alternates(git_path, submodule_name, submodule_path):
    """Undo ``update_alternates`` for one submodule.

    Args:
        git_path: Git directory of the containing repository; the submodule's
            own git directory is ``git_path / 'modules' / submodule_name``.
        submodule_name: Name of the submodule.
        submodule_path: Working tree of the submodule, where the repack runs.

    Raises:
        subprocess.CalledProcessError: If the repack fails; the alternates
            file is left in place in that case.
    """
    # The repack must come first: once it has run (without --local), the
    # alternates are no longer referenced and the file can be dropped.
    run_command(['git', '-C', submodule_path, 'repack', '-a', '-d'])
    info_dir = git_path / 'modules' / submodule_name / 'objects' / 'info'
    alternates_path = info_dir / 'alternates'
    if VERBOSE:
        print(f"Removing alternates for {submodule_name} in {git_path}")
    if DRY_RUN:
        print(f"[DRY RUN] Would remove {alternates_path}")
        return
    if alternates_path.exists():
        alternates_path.unlink()
    # Remove the info directory as well when nothing else is left in it.
    if info_dir.exists() and next(info_dir.iterdir(), None) is None:
        info_dir.rmdir()
def process_repository(repo_path, git_path=None, revert=False):
    """Centralize (or revert) every initialized submodule of a repository.

    Submodules are handled recursively: after a submodule has been processed,
    its own ``.gitmodules`` is read and its nested submodules are processed
    with the submodule's git directory as ``git_path``.

    Args:
        repo_path: ``Path`` of the working tree.
        git_path: Git directory belonging to ``repo_path``; defaults to
            ``repo_path / '.git'``.
        revert: If true, remove the alternates instead of installing them.

    Raises:
        subprocess.CalledProcessError: If a git command fails.
    """
    if git_path is None:
        git_path = repo_path / '.git'
    print(f"[process] {repo_path} in {git_path}")
    for submodule_name, submodule_info in get_submodules(repo_path).items():
        submodule_url = submodule_info['url']
        # Check if the submodule is initialized
        submodule_git_path = git_path / 'modules' / submodule_name
        if not submodule_git_path.exists():
            continue
        submodule_full_path = repo_path / submodule_info['path']
        # A deinitialized submodule keeps its directory under modules/ but has
        # no checkout. `git -C <work tree>` would then act on the enclosing
        # repository, so the submodule itself would never be repacked while
        # its alternates file was still written or (on revert) deleted.
        if not (submodule_full_path / '.git').exists():
            if VERBOSE:
                print(f"Skipping {submodule_name}: no checkout at {submodule_full_path}")
            continue
        if revert:
            remove_alternates(git_path, submodule_name, submodule_full_path)
        else:
            update_cache(submodule_url)
            update_alternates(git_path, submodule_name, submodule_url, submodule_full_path)
        # Recursively process submodules
        process_repository(submodule_full_path, submodule_git_path, revert)
def main(folder, revert=False):
    """Process every git repository found below ``folder``.

    A directory counts as a repository when it directly contains a ``.git``
    directory; each one found is handed to ``process_repository``.

    Args:
        folder: Directory to search recursively.
        revert: Passed through to ``process_repository``.
    """
    for root, dirs, _files in os.walk(folder):
        if '.git' in dirs:
            # Prune the git directory in place so os.walk does not scan its
            # object and ref trees, which can be very large.
            dirs.remove('.git')
            process_repository(Path(root), revert=revert)
if __name__ == '__main__':
    # Script entry point: parse the command line, publish the two mode
    # switches as module globals (read by the functions above), then walk
    # the requested folder.
    cli = argparse.ArgumentParser(
        description='Centralize or decentralize git submodules to reduce disk usage.',
    )
    cli.add_argument('folder', help='The folder containing git repositories to process')
    for short_flag, long_flag, help_text in (
        ('-v', '--verbose', 'Enable verbose output'),
        ('-n', '--dry-run', 'Perform a dry run without making changes'),
        ('-r', '--revert', 'Revert the centralization process'),
    ):
        cli.add_argument(short_flag, long_flag, action='store_true', help=help_text)
    options = cli.parse_args()
    DRY_RUN = options.dry_run
    VERBOSE = options.verbose
    main(options.folder, revert=options.revert)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment