Skip to content

Instantly share code, notes, and snippets.

@amuraru
Last active July 14, 2019 16:34
Show Gist options
  • Save amuraru/b750f54e55bb7208c42d to your computer and use it in GitHub Desktop.
Save amuraru/b750f54e55bb7208c42d to your computer and use it in GitHub Desktop.
HBase Merge Empty regions
#!/usr/bin/python
"""Plan merges of adjacent small HBase regions for a single table.

Usage:
    hbase shell <<<'status "detailed"' > /tmp/hbase_regions
    python merge_adjacent_regions.py <table_name> <target_region_size_mb>

The script only reads the dump file and prints a merge plan; it does not
issue any command itself.
"""
import collections
import sys

# Id of a region that must never be placed in a merge group; meeting it also
# closes the group that was being built.
LOCKED_REGION = '07d9c26d0dec29cac5fc9c193ebe1889'

# Dump of `status "detailed"` captured from the hbase shell (see usage above).
REGIONS_FILE = '/tmp/hbase_regions'

Region = collections.namedtuple('region', 'fqn start_key id size info')


def parse_regions(lines, table):
    """Return the regions of *table* found in *lines*, sorted by start key.

    A region is read from two consecutive lines: a quoted name of the form
    `<table>,<start_key>,<...>.<id>.` followed by a comma-separated list of
    `key=value` metrics that must contain `storefileSizeMB`.

    Raises:
        ValueError: a line mentioning the table cannot be parsed, or its
            metrics line is missing or has no integer `storefileSizeMB`.
    """
    marker = "%s," % table
    regions = []
    for index, line in enumerate(lines):
        if marker not in line:
            continue
        # Drop the newline first, otherwise the closing quote is never stripped.
        fqn = line.strip().strip(' "')
        try:
            start_key = fqn.split(',')[1]
            region_id = fqn.split('.')[1]
        except IndexError:
            raise ValueError("cannot parse region name: %s" % fqn)
        if index + 1 >= len(lines):
            raise ValueError("missing metrics line for region: %s" % fqn)
        info = {}
        for item in lines[index + 1].split(","):
            # Split on the first '=' only so values containing '=' survive;
            # fragments without '=' are not metrics and are ignored.
            key, sep, value = item.strip().partition("=")
            if sep:
                info[key] = value
        try:
            size = int(info['storefileSizeMB'])
        except (KeyError, ValueError):
            raise ValueError("no usable storefileSizeMB for region: %s" % fqn)
        regions.append(Region(fqn=fqn,
                              start_key=start_key,
                              id=region_id,
                              size=size,
                              info=info))
    return sorted(regions, key=lambda region: region.start_key)


def group_adjacent_regions(regions, target_size_mb, locked_region=LOCKED_REGION):
    """Group consecutive regions whose combined size stays within the target.

    Args:
        regions: regions already ordered by start key.
        target_size_mb: maximum total `size` allowed for one merged group.
        locked_region: id of a region that is never merged.

    Returns:
        A list of groups (lists of Region); only groups holding at least two
        regions are returned, since a single region needs no merge.
    """
    groups = []
    current = []
    current_size = 0
    for region in regions:
        mergeable = (region.id != locked_region
                     and region.size <= target_size_mb)
        if mergeable and current_size + region.size <= target_size_mb:
            current.append(region)
            current_size += region.size
            continue
        # Group size over target (or region not mergeable): roll over.
        if len(current) > 1:
            groups.append(current)
        if mergeable:
            # The region that overflowed the previous group seeds the next
            # one instead of being discarded.
            current, current_size = [region], region.size
        else:
            current, current_size = [], 0
    # Flush the trailing group; without this the last run of mergeable
    # regions in the table would be lost.
    if len(current) > 1:
        groups.append(current)
    return groups


def format_report(total_regions, merged_regions):
    """Return the report lines for the merge plan and its summary counters.

    Args:
        total_regions: number of regions parsed for the table.
        merged_regions: groups as returned by `group_adjacent_regions`.
    """
    report = ["Mergeable regions"]
    for group in merged_regions:
        report.append("New merged region: %s MB" % sum(r.size for r in group))
        report.append("#merge_region %s" % ','.join('"%s"' % r.id for r in group))
        for r in group:
            report.append("%s %20s %7d" % (r.start_key.ljust(40), r.id, r.size))
        report.append("")
    no_mergeable_regions = sum(len(group) for group in merged_regions)
    no_merged_regions = len(merged_regions)
    report.append("Results:")
    report.append("Total regions: %s" % total_regions)
    report.append("Mergeable regions: %s" % no_mergeable_regions)
    report.append("Merged regions: %s" % no_merged_regions)
    report.append("Resulted regions: %s"
                  % (total_regions - no_mergeable_regions + no_merged_regions))
    return report


def main(argv=None):
    """Read the region dump, compute the merge plan and print it.

    Exits with status 2 on missing arguments and 1 when the dump cannot be
    parsed.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.stderr.write(
            "usage: %s <table_name> <target_region_size_mb>\n" % argv[0])
        sys.exit(2)
    table = argv[1]
    target_size_mb = int(argv[2])
    with open(REGIONS_FILE) as dump:
        lines = dump.readlines()
    try:
        sorted_regions = parse_regions(lines, table)
    except ValueError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(1)
    merged_regions = group_adjacent_regions(sorted_regions, target_size_mb)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for line in format_report(len(sorted_regions), merged_regions):
        print(line)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment