xvzf · April 17, 2021 21:45
diff --git a/extract-dgd.py b/extract-dgd.py
 import xml.etree.ElementTree as Xet
 import pandas as pd

 # Column order of the CSV written at the end of the script. "Kasus", "Genus"
 # and "Sonstiges" are always emitted as empty strings below
 # (presumably they are annotated by hand afterwards -- TODO confirm).
 cols = [
     "Praeposition",
     "Kasus",
     "Genus",
     "Sonstiges",
     "Zitat",
     "Quelle",
     "Geschlecht",
     "Alter",
     "Region",
     "Vertrautheit",
     "Art",
     "DGD",
 ]
 rows = []

 # Parse each DGD KWIC download; every direct child of a file's root element
 # produces one output row.
 for xml_file in [
     "DGD-Download_KWIC_ID057C3083-2053-D618-675E-195C949D95F7.xml",
     "DGD-Download_KWIC_ID180EE36C-8CEE-F4DC-4291-E405D77B6FED.xml",
     "DGD-Download_KWIC_ID38633448-31B8-B1F8-C4DA-ED41D52F4F6B.xml",
     "DGD-Download_KWIC_ID6ED5059F-287A-13F4-D937-DD68323ECC80.xml",
     "DGD-Download_KWIC_ID7BA9DF48-8F81-72F2-DCE8-F3FE11D7028A.xml",
     "DGD-Download_KWIC_ID923583CB-8D12-71B7-5C7D-19A0926D0429.xml",
     "DGD-Download_KWIC_IDA9C3D57A-6D61-7AD9-77C3-C0AAADD6DD8E.xml",
     "DGD-Download_KWIC_IDCE4ABF25-D0B3-A260-1ECE-D9BA5A0AC864.xml",
 ]:
     root = Xet.parse(xml_file).getroot()
     for hit in root:
         left = hit.find("left-context")
         match = hit.find("match")
         right = hit.find("right-context")

         # Build the quotation as <left><match> <right>. A context element
         # that is missing or has no text contributes nothing; this is
         # checked explicitly instead of hiding every error behind a bare
         # ``except``.
         content = ""
         if left is not None and left.text is not None:
             content += left.text
         # The match itself is required: a hit without a <match> element or
         # without match text raises here.
         content += match.text
         if right is not None and right.text is not None:
             content += " " + right.text

         # The metadata lookups are not guarded: a hit lacking any of these
         # child elements raises AttributeError rather than yielding a
         # partially filled row.
         rows.append({
             "Praeposition": match.text,
             "Kasus": "",
             "Genus": "",
             "Sonstiges": "",
             "Zitat": content,
             "Quelle": f"{hit.find('transcript-id').text} {hit.find('v_e_datum').text} {hit.find('speaker-id').text}",
             "Geschlecht": hit.find("v_s_geschlecht").text,
             "Alter": hit.find("v_ses_alter_s").text,
             "Region": hit.find("v_e_region_wiesinger").text,
             "Vertrautheit": hit.find("v_e_se_vertrautheit").text,
             "Art": hit.find("v_e_se_art").text,
             "DGD": hit.find("dgd-link").text,
         })

 # Collect all rows into one table, in the column order above, and write it
 # to output.csv in the current working directory.
 df = pd.DataFrame(rows, columns=cols)
 df.to_csv('output.csv')
	import xml.etree.ElementTree as Xet
	import pandas as pd

	# Column order of the CSV written at the end of the script. "Kasus", "Genus"
	# and "Sonstiges" are always emitted as empty strings below
	# (presumably they are annotated by hand afterwards -- TODO confirm).
	cols = [
	    "Praeposition",
	    "Kasus",
	    "Genus",
	    "Sonstiges",
	    "Zitat",
	    "Quelle",
	    "Geschlecht",
	    "Alter",
	    "Region",
	    "Vertrautheit",
	    "Art",
	    "DGD",
	]
	rows = []

	# Parse each DGD KWIC download; every direct child of a file's root element
	# produces one output row.
	for xml_file in [
	    "DGD-Download_KWIC_ID057C3083-2053-D618-675E-195C949D95F7.xml",
	    "DGD-Download_KWIC_ID180EE36C-8CEE-F4DC-4291-E405D77B6FED.xml",
	    "DGD-Download_KWIC_ID38633448-31B8-B1F8-C4DA-ED41D52F4F6B.xml",
	    "DGD-Download_KWIC_ID6ED5059F-287A-13F4-D937-DD68323ECC80.xml",
	    "DGD-Download_KWIC_ID7BA9DF48-8F81-72F2-DCE8-F3FE11D7028A.xml",
	    "DGD-Download_KWIC_ID923583CB-8D12-71B7-5C7D-19A0926D0429.xml",
	    "DGD-Download_KWIC_IDA9C3D57A-6D61-7AD9-77C3-C0AAADD6DD8E.xml",
	    "DGD-Download_KWIC_IDCE4ABF25-D0B3-A260-1ECE-D9BA5A0AC864.xml",
	]:
	    root = Xet.parse(xml_file).getroot()
	    for hit in root:
	        left = hit.find("left-context")
	        match = hit.find("match")
	        right = hit.find("right-context")

	        # Build the quotation as <left><match> <right>. A context element
	        # that is missing or has no text contributes nothing; this is
	        # checked explicitly instead of hiding every error behind a bare
	        # ``except``.
	        content = ""
	        if left is not None and left.text is not None:
	            content += left.text
	        # The match itself is required: a hit without a <match> element or
	        # without match text raises here.
	        content += match.text
	        if right is not None and right.text is not None:
	            content += " " + right.text

	        # The metadata lookups are not guarded: a hit lacking any of these
	        # child elements raises AttributeError rather than yielding a
	        # partially filled row.
	        rows.append({
	            "Praeposition": match.text,
	            "Kasus": "",
	            "Genus": "",
	            "Sonstiges": "",
	            "Zitat": content,
	            "Quelle": f"{hit.find('transcript-id').text} {hit.find('v_e_datum').text} {hit.find('speaker-id').text}",
	            "Geschlecht": hit.find("v_s_geschlecht").text,
	            "Alter": hit.find("v_ses_alter_s").text,
	            "Region": hit.find("v_e_region_wiesinger").text,
	            "Vertrautheit": hit.find("v_e_se_vertrautheit").text,
	            "Art": hit.find("v_e_se_art").text,
	            "DGD": hit.find("dgd-link").text,
	        })

	# Collect all rows into one table, in the column order above, and write it
	# to output.csv in the current working directory.
	df = pd.DataFrame(rows, columns=cols)
	df.to_csv('output.csv')