JodiTheTigger · January 14, 2014 01:52
diff --git a/gdbBacktraceToJson.py b/gdbBacktraceToJson.py
 #!/usr/bin/python2
 #
 # gdbBacktraceToJson.py.  Parses gdb backtraces into json.
 # Copyright (C) 2014 Richard Maxwell <jodi.the.tigger@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>
 #
 # Description:
 # gdbBacktraceToJson.py parses the output of the command "thread apply all bt full" and turns it into a json array.
 # Useful for automating the analysis of coredump files generated when an application crashes. Use the tokenised json
 # to search a database of crashes for similar crashes or make a nice web interface for viewing back traces. You could
 # make a backtrace diff tool. It's much easier to use and write tools using a standard data format.

 # Usage:
 # python2 gdbBacktraceToJson.py <backtrace file>
 # It will parse the file and output the backtrace as a json array to std out.
 # you can get the backtrace file from a core dump file by running gdb in the following way:
 # gdb [app with debug symbols] [core file] --eval-command "thread apply all bt full" --eval-command "quit" > mybacktrace.txt

 import sys
 import os
 import re
 import json
 import string
 import datetime

 def parseLocals(lines):
     """Parse the "locals" part of one gdb "bt full" frame into a dict.

     Each entry of `lines` is one line of text of the form "name = value".
     Plain values are stored as strings with surrounding whitespace and any
     trailing comma removed.  A value whose opening brace is not closed on
     the same line is parsed recursively and stored as a nested dict.
     Lines without " = " (for example "No locals.") are ignored.

     Parsing stops early, returning what was collected so far, when a brace
     block is never closed (an error is printed first) or when the line that
     opens a brace block has no "name = " part.
     """
     result = {}

     index = 0
     while index < len(lines):
         line = lines[index]

         # The name is matched lazily so that it stops at the FIRST " = ";
         # a greedy match would take "s = {a" as the name in "s = {a = 1}".
         simpleVars = re.match(r'\s*(.*?)\s=\s(.*)', line)

         # Count braces rather than only testing for their presence, so a
         # "{...}" opened and closed on one line is kept as a plain value.
         depth = line.count('{') - line.count('}')

         if depth <= 0:
             if simpleVars:
                 result[simpleVars.group(1)] = simpleVars.group(2).strip().strip(',')
         else:
             # find the line on which the brace block closes.
             closingIndex = -1
             for candidate in range(index + 1, len(lines)):
                 depth += lines[candidate].count('{') - lines[candidate].count('}')
                 if depth < 1:
                     closingIndex = candidate
                     break

             if closingIndex == -1:
                 # Single parenthesised string: same output on Python 2 and 3.
                 print("*ERROR* Coreline: parseLocals: Can't find closing brace.")
                 return result

             if not simpleVars:
                 return result

             # deal with nested braces using recursion.
             result[simpleVars.group(1)] = parseLocals(lines[index + 1:closingIndex])
             index = closingIndex

         index = index + 1

     return result

 def coreLinesToObject(coreLine):
     """Convert the text of one backtrace frame into a dict.

     `coreLine` is the frame's "#N ..." line followed by any continuation
     lines (a wrapped "at file:line" line and, for "bt full", the frame's
     local variables), joined with newlines.

     On success the dict has the keys 'frame', 'address' (None when the
     frame line carries no address), 'function', 'source' (None when not
     found) and 'arguments' (name -> value strings).  'locals' is added when
     the text after the frame header splits into more than one
     newline-separated piece (the empty piece after a final newline counts).
     If the frame line cannot be parsed an error is printed and an empty dict
     is returned.
     """
     coreObject = {}

     # line format is:
     # #frame [0x12345678 in] function (arguments) [from|at] [library|file]
     # (?:....) means don't capture that group.
     # The argument list is matched lazily so it ends at the first ')' that is
     # followed by " at ", " from " or an end of line; a greedy match could
     # run on into locals whose value ends in ')'.
     matchResult = re.match(
         r'\#(\d+)\s+(?:(0x(?:[0-9A-F])*) in |)(\S+) (\((?:.|\n|\r)*?\))(?: (?:at|from) (.*)|$)',
         coreLine, re.I|re.M)

     if matchResult is None:
         # really should complain.
         # Single string argument: same output on Python 2 and 3.
         print("*ERROR* Coreline mismatch:  " + coreLine)
         return coreObject

     # matches are:
     # 1: frame
     # 2: address or no match
     # 3: function name
     # 4: argument list (including braces)
     # 5: source / library
     coreObject['frame'] = matchResult.group(1)
     coreObject['address'] = matchResult.group(2)
     coreObject['function'] = matchResult.group(3)
     coreObject['source'] = matchResult.group(5)
     coreObject['arguments'] = {}

     # right, parse in the argument list
     # arguments can have the @ symbol in them 'this@entry=0x12345678'
     # values may be negative or contain dots ('-1', '1.5', '...').
     argSearch = re.findall(r'([\w@]+)=(-?[\w.]+|<optimized out>)', matchResult.group(4), re.I|re.M)
     for (argKey, argValue) in argSearch:
         coreObject['arguments'][argKey] = argValue

     # bt full stuff will come here. Stack variables and source files too.
     # Only the text after the matched frame header is considered, so a
     # multi-line argument list is not mistaken for locals.
     arguments = coreLine[matchResult.end():].split('\n')[1:]
     if len(arguments) > 1:
         localLines = arguments

         if coreObject['source'] is None:
             # The "at file:line" / "from library" part may sit on its own
             # line.  Capture all of it (minus trailing whitespace).
             sourceMatch = re.match(r'\s+(?:at|from) (.*?)\s*$', arguments[0], re.I)

             if sourceMatch:
                 coreObject['source'] = sourceMatch.group(1)
                 # Only in this case is the first line not a local variable.
                 localLines = arguments[1:]

         # parse the local variables.
         coreObject['locals'] = parseLocals(localLines)

     return coreObject

 def textToList(filePath, fileText):
     """Build the core-dump dict from the lines of a gdb backtrace dump.

     `filePath` is stored as 'filePath' and, reduced to its base name without
     the last extension, as 'fileName'.  `fileText` is an iterable of text
     lines (for example the result of readlines()).

     The returned dict also holds 'jsonCreationTimeUtc', a 'threads' list with
     one dict per "Thread ..." header line ('threadNumber', 'threadId' and a
     'stackTrace' list of frame dicts built by coreLinesToObject) and, when
     the matching lines are present, 'commandLine' and 'coreReason'.
     """
     core = {}
     core['filePath'] = filePath
     core['fileName'] = os.path.splitext(os.path.basename(filePath))[0]
     core['threads'] = []
     core['jsonCreationTimeUtc'] = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

     currentTrace = None   # 'stackTrace' list of the thread being read
     pendingFrame = ""     # text of the frame collected so far

     for line in fileText:
         # Strip the line ending explicitly instead of slicing off a fixed
         # number of characters, so "\r\n" and a missing final newline work.
         stripped = line.rstrip('\r\n')

         # Search for core dump global meta
         # (command line and termination reason)
         # Core was generated by `.....'.
         # Program terminated with ...
         # ---------------------------------------
         if 'commandLine' not in core and line.startswith("Core was generated by"):
             # [23:-2] keeps what is between the backtick and the closing "'."
             core['commandLine'] = stripped[23:-2]

         if 'coreReason' not in core and line.startswith("Program terminated with"):
             core['coreReason'] = stripped

         # Parse core dumps per thread.
         # ---------------------------------------
         if line.startswith("Thread"):
             # right, make sure we purge the last frame of the last stack trace.
             if pendingFrame:
                 currentTrace.append(coreLinesToObject(pendingFrame))
                 pendingFrame = ""

             thread = {'stackTrace': []}
             core['threads'].append(thread)
             currentTrace = thread['stackTrace']

             threadId = stripped.rstrip().rstrip(':')

             # Accepts "Thread 1 (LWP 123)" as well as
             # "Thread 1 (Thread 0x7f... (LWP 123))".
             threadResult = re.match(r'Thread\s+(\d+)\s+\(.*?LWP\s+(\d+)\)', threadId, re.I)
             if threadResult:
                 thread['threadId'] = threadResult.group(2)
                 thread['threadNumber'] = threadResult.group(1)
             else:
                 # Single string argument: same output on Python 2 and 3.
                 print("*ERROR* ThreadId mismatch:  " + threadId)
                 thread['threadId'] = threadId

         elif currentTrace is not None:
             if not stripped.strip():
                 # A blank line ends the frame being collected.  (Lines from
                 # readlines() are never empty strings, so test the content.)
                 if pendingFrame:
                     currentTrace.append(coreLinesToObject(pendingFrame))
                     pendingFrame = ""
             elif line[0] == '#':
                 # A new frame starts; store the previous one first.
                 if pendingFrame:
                     currentTrace.append(coreLinesToObject(pendingFrame))
                 pendingFrame = line
             elif pendingFrame:
                 # Continuation line (wrapped source or locals) of the frame.
                 pendingFrame += line

     # Do not lose the last frame when the input ends without a blank line.
     if pendingFrame:
         currentTrace.append(coreLinesToObject(pendingFrame))

     return core
                

 # the filename is the name of the textual output of gdb's "thread apply all bt"
 def process(argList):
     """Parse the backtrace file named by argList[1] and print it as JSON.

     `argList` is laid out like sys.argv: element 0 is the program name and
     element 1 the path of a text file holding gdb's backtrace output.  The
     parsed result is printed to stdout as indented JSON with sorted keys.
     When no file is given, a usage message is written to stderr and the
     process exits with status 1.
     """
     if len(argList) < 2:
         sys.stderr.write("Usage: python2 gdbBacktraceToJson.py <backtrace file>\n")
         sys.exit(1)

     fileName = argList[1]

     # 'with' closes the file even if reading raises.
     with open(fileName, 'r') as coreDump:
         lines = coreDump.readlines()

     coreDumpObject = textToList(fileName, lines)

     # right, dump the json
     # Single argument in parentheses: same output on Python 2 and 3.
     print(json.dumps(coreDumpObject, sort_keys=True, indent=4))
        
 # Script entry point: this call is not guarded, so it runs whenever the file
 # is executed or imported.  sys.argv[1] is read by process() as the path of
 # the gdb backtrace text file to convert.
 process(sys.argv)
	#!/usr/bin/python2
	#
	# gdbBacktraceToJson.py. Parses gdb backtraces into json.
	# Copyright (C) 2014 Richard Maxwell <jodi.the.tigger@gmail.com>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>
	#
	# Description:
	# gdbBacktraceToJson.py parses the output of the command "thread apply all bt full" and turns it into a json array.
	# Useful for automating the analysis of coredump files generated when an application crashes. Use the tokenised json
	# to search a database of crashes for similar crashes or make a nice web interface for viewing back traces. You could
	# make a backtrace diff tool. It's much easier to use and write tools using a standard data format.

	# Usage:
	# python2 gdbBacktraceToJson.py <backtrace file>
	# It will parse the file and output the backtrace as a json array to std out.
	# you can get the backtrace file from a core dump file by running gdb in the following way:
	# gdb [app with debug symbols] [core file] --eval-command "thread apply all bt full" --eval-command "quit" > mybacktrace.txt

	import sys
	import os
	import re
	import json
	import string
	import datetime

	def parseLocals(lines):
	    """Parse the "locals" part of one gdb "bt full" frame into a dict.

	    Each entry of `lines` is one line of text of the form "name = value".
	    Plain values are stored as strings with surrounding whitespace and any
	    trailing comma removed.  A value whose opening brace is not closed on
	    the same line is parsed recursively and stored as a nested dict.
	    Lines without " = " (for example "No locals.") are ignored.

	    Parsing stops early, returning what was collected so far, when a brace
	    block is never closed (an error is printed first) or when the line that
	    opens a brace block has no "name = " part.
	    """
	    result = {}

	    index = 0
	    while index < len(lines):
	        line = lines[index]

	        # The name is matched lazily so that it stops at the FIRST " = ";
	        # a greedy match would take "s = {a" as the name in "s = {a = 1}".
	        simpleVars = re.match(r'\s*(.*?)\s=\s(.*)', line)

	        # Count braces rather than only testing for their presence, so a
	        # "{...}" opened and closed on one line is kept as a plain value.
	        depth = line.count('{') - line.count('}')

	        if depth <= 0:
	            if simpleVars:
	                result[simpleVars.group(1)] = simpleVars.group(2).strip().strip(',')
	        else:
	            # find the line on which the brace block closes.
	            closingIndex = -1
	            for candidate in range(index + 1, len(lines)):
	                depth += lines[candidate].count('{') - lines[candidate].count('}')
	                if depth < 1:
	                    closingIndex = candidate
	                    break

	            if closingIndex == -1:
	                # Single parenthesised string: same output on Python 2 and 3.
	                print("*ERROR* Coreline: parseLocals: Can't find closing brace.")
	                return result

	            if not simpleVars:
	                return result

	            # deal with nested braces using recursion.
	            result[simpleVars.group(1)] = parseLocals(lines[index + 1:closingIndex])
	            index = closingIndex

	        index = index + 1

	    return result

	def coreLinesToObject(coreLine):
	    """Convert the text of one backtrace frame into a dict.

	    `coreLine` is the frame's "#N ..." line followed by any continuation
	    lines (a wrapped "at file:line" line and, for "bt full", the frame's
	    local variables), joined with newlines.

	    On success the dict has the keys 'frame', 'address' (None when the
	    frame line carries no address), 'function', 'source' (None when not
	    found) and 'arguments' (name -> value strings).  'locals' is added when
	    the text after the frame header splits into more than one
	    newline-separated piece (the empty piece after a final newline counts).
	    If the frame line cannot be parsed an error is printed and an empty dict
	    is returned.
	    """
	    coreObject = {}

	    # line format is:
	    # #frame [0x12345678 in] function (arguments) [from|at] [library|file]
	    # (?:....) means don't capture that group.
	    # The argument list is matched lazily so it ends at the first ')' that is
	    # followed by " at ", " from " or an end of line; a greedy match could
	    # run on into locals whose value ends in ')'.
	    matchResult = re.match(
	        r'\#(\d+)\s+(?:(0x(?:[0-9A-F])*) in |)(\S+) (\((?:.|\n|\r)*?\))(?: (?:at|from) (.*)|$)',
	        coreLine, re.I|re.M)

	    if matchResult is None:
	        # really should complain.
	        # Single string argument: same output on Python 2 and 3.
	        print("*ERROR* Coreline mismatch:  " + coreLine)
	        return coreObject

	    # matches are:
	    # 1: frame
	    # 2: address or no match
	    # 3: function name
	    # 4: argument list (including braces)
	    # 5: source / library
	    coreObject['frame'] = matchResult.group(1)
	    coreObject['address'] = matchResult.group(2)
	    coreObject['function'] = matchResult.group(3)
	    coreObject['source'] = matchResult.group(5)
	    coreObject['arguments'] = {}

	    # right, parse in the argument list
	    # arguments can have the @ symbol in them 'this@entry=0x12345678'
	    # values may be negative or contain dots ('-1', '1.5', '...').
	    argSearch = re.findall(r'([\w@]+)=(-?[\w.]+|<optimized out>)', matchResult.group(4), re.I|re.M)
	    for (argKey, argValue) in argSearch:
	        coreObject['arguments'][argKey] = argValue

	    # bt full stuff will come here. Stack variables and source files too.
	    # Only the text after the matched frame header is considered, so a
	    # multi-line argument list is not mistaken for locals.
	    arguments = coreLine[matchResult.end():].split('\n')[1:]
	    if len(arguments) > 1:
	        localLines = arguments

	        if coreObject['source'] is None:
	            # The "at file:line" / "from library" part may sit on its own
	            # line.  Capture all of it (minus trailing whitespace).
	            sourceMatch = re.match(r'\s+(?:at|from) (.*?)\s*$', arguments[0], re.I)

	            if sourceMatch:
	                coreObject['source'] = sourceMatch.group(1)
	                # Only in this case is the first line not a local variable.
	                localLines = arguments[1:]

	        # parse the local variables.
	        coreObject['locals'] = parseLocals(localLines)

	    return coreObject

	def textToList(filePath, fileText):
	    """Build the core-dump dict from the lines of a gdb backtrace dump.

	    `filePath` is stored as 'filePath' and, reduced to its base name without
	    the last extension, as 'fileName'.  `fileText` is an iterable of text
	    lines (for example the result of readlines()).

	    The returned dict also holds 'jsonCreationTimeUtc', a 'threads' list with
	    one dict per "Thread ..." header line ('threadNumber', 'threadId' and a
	    'stackTrace' list of frame dicts built by coreLinesToObject) and, when
	    the matching lines are present, 'commandLine' and 'coreReason'.
	    """
	    core = {}
	    core['filePath'] = filePath
	    core['fileName'] = os.path.splitext(os.path.basename(filePath))[0]
	    core['threads'] = []
	    core['jsonCreationTimeUtc'] = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

	    currentTrace = None   # 'stackTrace' list of the thread being read
	    pendingFrame = ""     # text of the frame collected so far

	    for line in fileText:
	        # Strip the line ending explicitly instead of slicing off a fixed
	        # number of characters, so "\r\n" and a missing final newline work.
	        stripped = line.rstrip('\r\n')

	        # Search for core dump global meta
	        # (command line and termination reason)
	        # Core was generated by `.....'.
	        # Program terminated with ...
	        # ---------------------------------------
	        if 'commandLine' not in core and line.startswith("Core was generated by"):
	            # [23:-2] keeps what is between the backtick and the closing "'."
	            core['commandLine'] = stripped[23:-2]

	        if 'coreReason' not in core and line.startswith("Program terminated with"):
	            core['coreReason'] = stripped

	        # Parse core dumps per thread.
	        # ---------------------------------------
	        if line.startswith("Thread"):
	            # right, make sure we purge the last frame of the last stack trace.
	            if pendingFrame:
	                currentTrace.append(coreLinesToObject(pendingFrame))
	                pendingFrame = ""

	            thread = {'stackTrace': []}
	            core['threads'].append(thread)
	            currentTrace = thread['stackTrace']

	            threadId = stripped.rstrip().rstrip(':')

	            # Accepts "Thread 1 (LWP 123)" as well as
	            # "Thread 1 (Thread 0x7f... (LWP 123))".
	            threadResult = re.match(r'Thread\s+(\d+)\s+\(.*?LWP\s+(\d+)\)', threadId, re.I)
	            if threadResult:
	                thread['threadId'] = threadResult.group(2)
	                thread['threadNumber'] = threadResult.group(1)
	            else:
	                # Single string argument: same output on Python 2 and 3.
	                print("*ERROR* ThreadId mismatch:  " + threadId)
	                thread['threadId'] = threadId

	        elif currentTrace is not None:
	            if not stripped.strip():
	                # A blank line ends the frame being collected.  (Lines from
	                # readlines() are never empty strings, so test the content.)
	                if pendingFrame:
	                    currentTrace.append(coreLinesToObject(pendingFrame))
	                    pendingFrame = ""
	            elif line[0] == '#':
	                # A new frame starts; store the previous one first.
	                if pendingFrame:
	                    currentTrace.append(coreLinesToObject(pendingFrame))
	                pendingFrame = line
	            elif pendingFrame:
	                # Continuation line (wrapped source or locals) of the frame.
	                pendingFrame += line

	    # Do not lose the last frame when the input ends without a blank line.
	    if pendingFrame:
	        currentTrace.append(coreLinesToObject(pendingFrame))

	    return core


	# the filename is the name of the textual output of gdb's "thread apply all bt"
	def process(argList):
	    """Parse the backtrace file named by argList[1] and print it as JSON.

	    `argList` is laid out like sys.argv: element 0 is the program name and
	    element 1 the path of a text file holding gdb's backtrace output.  The
	    parsed result is printed to stdout as indented JSON with sorted keys.
	    When no file is given, a usage message is written to stderr and the
	    process exits with status 1.
	    """
	    if len(argList) < 2:
	        sys.stderr.write("Usage: python2 gdbBacktraceToJson.py <backtrace file>\n")
	        sys.exit(1)

	    fileName = argList[1]

	    # 'with' closes the file even if reading raises.
	    with open(fileName, 'r') as coreDump:
	        lines = coreDump.readlines()

	    coreDumpObject = textToList(fileName, lines)

	    # right, dump the json
	    # Single argument in parentheses: same output on Python 2 and 3.
	    print(json.dumps(coreDumpObject, sort_keys=True, indent=4))

	# Script entry point: this call is not guarded, so it runs whenever the file
	# is executed or imported.  sys.argv[1] is read by process() as the path of
	# the gdb backtrace text file to convert.
	process(sys.argv)