Skip to content

Instantly share code, notes, and snippets.

@fire-eggs
Last active June 5, 2017 23:18
Show Gist options
  • Save fire-eggs/5682db6aa78c3dccd1d45878844c5630 to your computer and use it in GitHub Desktop.
Save fire-eggs/5682db6aa78c3dccd1d45878844c5630 to your computer and use it in GitHub Desktop.
A Python 3.x script that combines two or more GEDCOM files. All IDs are renumbered to be distinct, and references to them are corrected to match. Useful for creating test GEDCOM files with a specific number of records. May throw an exception on non-ASCII characters, in which case the offending characters need to be removed.
import os, sys
# NOTE: assuming 'clean' files: level/tag/etc separated by spaces
# NOTE: assuming 'normal' ids: e.g. Ixxx, where xxx is a number
# Highest id number recorded or issued so far, keyed by a single leading
# character. Shared by file1_ids (which records values) and newid (which
# increments them).
# NOTE: this name shadows the builtin `dict` for the whole module.
dict = {}
# Maps each id found in the file currently being merged to its replacement id.
# Reset at the start of every read_filen call and filled in by fixids.
file2_ids = {}
def file1_ids(line):
    """Record the id number defined by a level-0 line of the first file.

    Only lines that start with "0" and have at least three space-separated
    fields (level, @id@, tag) are considered; HEAD and TRLR have fewer fields
    and are skipped. The digits of the id are collected (so 'NI03' yields 3)
    and the module-level ``dict`` table keeps the LARGEST number seen for the
    id's first character. newid() looks the table up by that same first
    character, so ids it issues afterwards cannot collide with ids already
    present in the first file.

    Lines whose second field is not of the form ``@id@``, or whose id has no
    digits, are ignored. Returns None.
    """
    # only level-0 lines define records
    if not line.startswith("0"):
        return
    vals = line.split(' ')
    if len(vals) < 3:  # HEAD, TRLR
        return
    parts = vals[1].split('@')
    # a well-formed "@id@" token splits into exactly ('', id, '')
    if len(parts) != 3:
        return
    realid = parts[1]
    # deal with ids like 'NI03'
    digits = ''.join(c for c in realid if c in '1234567890')
    if not digits:
        return
    # key on the id's own first character: that is what newid() looks up
    key = realid[0]
    number = int(digits)
    # `dict` here is the module-level table, not the builtin type;
    # keep the maximum so out-of-order ids cannot lower the counter
    if number > dict.get(key, 0):
        dict[key] = number
def read_file1(path):
    """Echo the first file to stdout without its trailer, noting its ids.

    Every line is stripped of surrounding whitespace and handed to
    file1_ids() so the largest id values can be tracked; every stripped line
    except "0 TRLR" is then printed.
    """
    with open(path, "r") as source:
        for raw in source:
            text = raw.strip()
            file1_ids(text)
            # the trailer is the only line that is not copied through
            if text == "0 TRLR":
                continue
            print(text)
def newid(oldid):
    """Issue the next unused id that shares the first character of *oldid*.

    The counter for that character in the module-level ``dict`` table is
    advanced by one (starting from 0 when the character has no entry yet) and
    the result is returned as the character followed by the new number.
    """
    prefix = oldid[0]
    # `dict` is the module-level counter table, not the builtin type
    counter = dict.get(prefix, 0) + 1
    dict[prefix] = counter
    return prefix + str(counter)
def fixids(line):
    """Return *line* with its ``@id@`` reference replaced by the merged id.

    The first time an id is seen a replacement is obtained from newid() and
    remembered in the module-level ``file2_ids`` map; later occurrences reuse
    the remembered value, so definitions and references stay consistent.

    The line is returned unchanged when it:
      * contains no '@', or does not contain exactly one ``@...@`` pair;
      * has an empty id (a doubled, i.e. escaped, at-sign in text);
      * has an id starting with '#', which GEDCOM reserves for escape
        sequences such as the ``@#DJULIAN@`` calendar marker in dates;
      * has the ``@...@`` pair embedded in other text rather than standing
        as its own whitespace-delimited field (e.g. two e-mail addresses on
        one line).
    """
    # no id in line: nothing to do
    if "@" not in line:
        return line
    parts = line.split('@')
    if len(parts) != 3:
        return line
    p1, oldid, p2 = parts
    # empty: escaped at-sign; leading '#': escape sequence, not a record id
    if not oldid or oldid.startswith('#'):
        return line
    # a real id is a whole field: whitespace before it, whitespace or
    # end-of-line after it
    if not p1[-1:].isspace() or (p2 and not p2[0].isspace()):
        return line
    # have we already translated this id?
    newval = file2_ids.get(oldid)
    if newval is None:
        newval = newid(oldid)
        file2_ids[oldid] = newval
    return "{}@{}@{}".format(p1, newval, p2)
def read_filen(path):
    """Print the records of a second-or-later file with their ids renumbered.

    The module-level ``file2_ids`` map is reset so that ids are translated
    independently for each file. Everything before the first level-0 line
    containing "INDI" is dropped (this is how the header and any leading
    SUBM/SUBN records are skipped); from that line on, each stripped line is
    passed through fixids() and printed. Reading stops at "0 TRLR", which is
    not printed. A file with no INDI record produces no output.
    """
    global file2_ids
    file2_ids = {}
    seen_indi = False
    with open(path, "r") as f:
        for line in f:
            line2 = line.strip()
            if not seen_indi:
                # startswith() rather than line2[0]: a blank line in the
                # header must not raise IndexError
                if not (line2.startswith('0') and "INDI" in line2):
                    continue
                seen_indi = True
            if line2 == "0 TRLR":
                return
            print(fixids(line2))
# Script entry: combine the GEDCOM files named on the command line and write
# the merged result to stdout.
if len(sys.argv) == 1:
    # diagnostics go to stderr so they never end up in redirected output
    print("Usage: python combine.py <first-file> [<more-files> ...]",
          file=sys.stderr)
    sys.exit(1)
# Check every input before writing anything: a bad later argument would
# otherwise abort the run after partial output, leaving a file with no trailer.
for path in sys.argv[1:]:
    if not os.path.isfile(path):
        print("Not a file: {}".format(path), file=sys.stderr)
        sys.exit(1)
path = sys.argv[1]
# the first file is copied as-is (minus trailer) and seeds the id counters
read_file1(path)
# the remaining files have their ids renumbered as they are appended
for extra_path in sys.argv[2:]:
    read_filen(extra_path)
# single trailer for the combined output
print("0 TRLR")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment