Last active
June 5, 2017 23:18
-
-
Save fire-eggs/5682db6aa78c3dccd1d45878844c5630 to your computer and use it in GitHub Desktop.
A Python 3.x script that combines two or more GEDCOM files. All record IDs are renumbered so they are distinct, and every reference to them is corrected to match. Useful for creating test GEDCOM files with a specific number of records. May throw an exception on non-ASCII characters, in which case the offending characters need to be removed from the input.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys | |
# NOTE: assuming 'clean' files: level/tag/etc. separated by single spaces
# NOTE: assuming 'normal' ids: e.g. Ixxx, where xxx is a number
# Per-prefix id counter, keyed by a single character.  file1_ids() stores id
# numbers found in the first file here; newid() increments the entry for a
# prefix each time it mints a new id.
# NOTE: this name shadows the builtin `dict`; it is kept because the functions
# in this file refer to the table by this name.
dict = {}

# Maps an id found in the file currently being merged to the id that replaces
# it in the output.  read_filen() resets it at the start of every file.
file2_ids = {}
def file1_ids(line):
    """Record the largest id number seen in a level-0 line of the first file.

    Only level-0 lines of the form ``0 @<id>@ <TAG>`` are considered.  The
    digits of ``<id>`` are joined into a number (so ``NI03`` counts as 3) and
    the module-level ``dict`` table keeps the maximum seen so far.

    The table is keyed by the first character of the id, because newid()
    looks the counter up by the first character of the id it is replacing.

    Args:
        line: One stripped line of GEDCOM text.

    Returns:
        None.  The module-level ``dict`` table is updated in place.
    """
    if not line.startswith("0"):
        return
    vals = line.split(' ')
    if len(vals) < 3:  # HEAD, TRLR
        return
    # A record id looks like '@I12@'; anything else on a level-0 line
    # is not an id and is ignored instead of raising.
    parts = vals[1].split('@')
    if len(parts) != 3 or not parts[1]:
        return
    realid = parts[1]
    # deal with ids like 'NI03': keep only the digits
    digits = ''.join(c for c in realid if c in '1234567890')
    if not digits:
        return
    prefix = realid[0]
    # Keep the maximum, not the most recent value: ids need not be ascending.
    dict[prefix] = max(dict.get(prefix, 0), int(digits))
def read_file1(path):
    """Echo the first GEDCOM file to stdout, minus its trailer.

    Every stripped line is passed to file1_ids() so the id numbers used by
    this file are recorded, then printed unless it is the ``0 TRLR`` record.

    Args:
        path: Path of the first GEDCOM file.
    """
    with open(path, "r") as handle:
        for raw in handle:
            record = raw.strip()
            file1_ids(record)
            # The trailer is dropped here; the caller emits one at the end.
            if record == "0 TRLR":
                continue
            print(record)
def newid(oldid):
    """Mint the next unused id that shares ``oldid``'s first character.

    The counter for that character in the module-level ``dict`` table is
    advanced by one (starting from 0 when the character is not yet present).

    Args:
        oldid: Non-empty id string taken from the file being merged.

    Returns:
        The first character of ``oldid`` followed by the new counter value.
    """
    prefix = oldid[0]
    counter = dict.get(prefix, 0) + 1
    dict[prefix] = counter
    return prefix + str(counter)
def fixids(line):
    """Rewrite the record id in ``line`` so it is unique in the merged output.

    A line is rewritten only when it contains exactly one ``@...@`` span
    that looks like a record id.  The first time an id is seen, a
    replacement is obtained from newid() and stored in the module-level
    ``file2_ids`` table; later occurrences reuse the stored replacement.

    The line is returned unchanged when:
      * it holds no ``@`` at all, or the number of ``@`` is not exactly two
        (e.g. a single e-mail address);
      * the text between the two ``@`` is empty (``@@``);
      * that text starts with ``#``, as in a ``@#DJULIAN@`` date escape;
      * that text contains whitespace, as in free text with two e-mail
        addresses.

    Args:
        line: One stripped line of GEDCOM text.

    Returns:
        The line with its id replaced, or the original line.
    """
    if "@" not in line:
        return line
    parts = line.split('@')
    if len(parts) != 3:
        return line
    before, oldid, after = parts
    if not oldid:
        return line
    # Escape sequences and running text are not ids: leave them alone.
    if oldid.startswith("#") or any(ch.isspace() for ch in oldid):
        return line
    # have we already translated this id?
    newval = file2_ids.get(oldid)
    if newval is None:
        newval = newid(oldid)
        file2_ids[oldid] = newval
    return "{}@{}@{}".format(before, newval, after)
def read_filen(path):
    """Print the records of a second or later GEDCOM file with ids rewritten.

    Everything before the first level-0 line containing ``INDI`` is skipped
    (this drops HEAD and any SUBM/SUBN records that precede it).  From that
    line on, each line is passed through fixids() and printed, until the
    ``0 TRLR`` record, which is not printed.

    The module-level ``file2_ids`` table is reset first, so ids are
    translated independently for every file.

    Args:
        path: Path of the GEDCOM file to append.
    """
    global file2_ids
    file2_ids = {}
    seen_indi = False
    with open(path, "r") as f:
        for line in f:
            line2 = line.strip()
            if not seen_indi:
                # startswith() rather than line2[0]: a blank line in the
                # header must be skipped, not raise IndexError.
                if not (line2.startswith("0") and "INDI" in line2):
                    continue
                seen_indi = True
            if line2 == "0 TRLR":
                return
            print(fixids(line2))
def main(argv=None):
    """Combine the GEDCOM files named on the command line onto stdout.

    The first file is copied via read_file1(); each further file is
    appended via read_filen(); a single ``0 TRLR`` record ends the output.

    Every path is checked before anything is printed, so a bad later
    argument cannot leave a truncated file behind.  Diagnostics go to
    stderr because stdout carries the combined GEDCOM data.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 when no file was given, 1 when a path is not a file.
    """
    args = sys.argv if argv is None else argv
    paths = args[1:]
    if not paths:
        print("Usage: python combine.py <file1> [<file2> ...]", file=sys.stderr)
        return 2
    missing = [p for p in paths if not os.path.isfile(p)]
    if missing:
        for p in missing:
            print("Not a file! {}".format(p), file=sys.stderr)
        return 1
    read_file1(paths[0])
    for extra in paths[1:]:
        read_filen(extra)
    print("0 TRLR")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment