itherunder · April 21, 2022 02:20
diff --git a/opcode_value.py b/opcode_value.py
 # Lower-case EVM mnemonics, grouped by category. The numbered families
 # (push/dup/swap/log) are generated instead of being spelled out by hand.
 OPCODES = (
 	[
 		# Stop and Arithmetic
 		'stop', 'add', 'mul', 'sub', 'div', 'sdiv', 'mod', 'smod', 'addmod', 'mulmod', 'exp', 'signextend',
 		# Comparison and Bitwise Logic
 		'lt', 'gt', 'slt', 'sgt', 'eq', 'iszero', 'and', 'or', 'xor', 'not', 'byte', 'shl', 'shr', 'sar',
 		# Sha3
 		'sha3',
 		# Environment Information
 		'address', 'balance', 'origin', 'caller', 'callvalue', 'calldataload', 'calldatasize', 'calldatacopy', 'codesize', 'codecopy', 'gasprice', 'extcodesize', 'extcodecopy', 'returndatasize', 'returndatacopy', 'extcodehash',
 		# These opcodes seem to belong in the environment 'block', but we are out of opcode space in 0x3*
 		'chainid', 'selfbalance',
 		# Block Information
 		'blockhash', 'coinbase', 'timestamp', 'number', 'difficulty', 'gaslimit',
 		# 'Stack', 'Memory', Storage and Flow Operations
 		'pop', 'mload', 'mstore', 'mstore8', 'sload', 'sstore', 'jump', 'jumpi', 'pc', 'msize', 'gas', 'jumpdest',
 	]
 	# Push Operations: push1 .. push32
 	+ ['push%d' % size for size in range(1, 33)]
 	# Duplicate Operations: dup1 .. dup16
 	+ ['dup%d' % depth for depth in range(1, 17)]
 	# Exchange Operations: swap1 .. swap16
 	+ ['swap%d' % depth for depth in range(1, 17)]
 	# Logging: log0 .. log4
 	+ ['log%d' % topics for topics in range(5)]
 	# System
 	+ ['create', 'call', 'callcode', 'return', 'delegatecall', 'create2', 'staticcall', 'revert', 'selfdestruct']
 )
diff --git a/scan.py b/scan.py
 '''
 *: `inline assembly`
 1. How many open-source contracts contain *
 2. How many segments (functions) of each contract that has * contain *
 3. How many lines of code / how much bytecode / which bytecodes each * segment has
 4. Whether optimisation is enabled
 5. Deployment time of contracts containing *
 6. Number of calls to contracts containing *, and how many of them execute *
 7. How many * executions fail, and why
 8. Why * is used, and whether it could be replaced by source code
 9. Share of * code within its own contract; instruction-frequency comparison
 10. Similarity of * across different contracts
 11. Which accounts created contracts containing * (1. EOA or 2. SC)
 12. Inferring the identity of the creator
 '''

 import os, shutil, re
 import subprocess, ujson
 import execjs, sys, time
 from opcode_value import *

 from utils import *

 # Raise the recursion limit (otherwise parsing the JSON crashes)
 sys.setrecursionlimit(951116)

 # JavaScript source used for compiling: compile(version, contract, optimal, times)
 # loads the soljson build named by `version` from ./solc-bin-gh-pages/bin/ and
 # returns the parsed compiler output for the single source 'Task'.
 cxt = execjs.compile(
 	'''
 	function compile(version, contract, optimal, times) {
 		var solc = require('solc')
 		var input = {
 			language: 'Solidity',
 			sources: {
 				'Task': {
 					content: contract
 				}
 			},
 			settings: {
 				optimizer: {
 					enabled: optimal,
 					runs: times
 				},
 				outputSelection: {
 					'*': {
 						'*': ['*']
 					}
 				}
 			}
 		};
 		solc = solc.setupMethods(require('./solc-bin-gh-pages/bin/soljson-' + version + '.js'))
 		var result = JSON.parse(solc.compile(JSON.stringify(input)))
 		return result
 	}
 	'''
 )

 # 1. How many open-source contracts contain inline assembly
 def getContractsContainInline(contractspath):
 	"""Copy every contract that contains an `assembly {` block to ../inlinecontracts/.

 	Walks `contractspath` recursively. Each file is opened through the
 	directory reported by os.walk, so files in sub-directories are read from
 	their real location. The destination directory is created when missing.

 	Returns:
 		(totalCounter, inlineCounter): files scanned and files that contained
 		at least one inline-assembly block.
 	"""
 	pattern = re.compile(r'assembly\s*{')
 	target = os.path.join(contractspath, '..', 'inlinecontracts')
 	os.makedirs(target, exist_ok=True)
 	totalCounter, inlineCounter = 0, 0
 	for root, _, contracts in os.walk(contractspath):
 		for contract in contracts:
 			source = os.path.join(root, contract)
 			with open(source, 'r', encoding='utf8') as rf:
 				assemblies = pattern.findall(rf.read())
 			if assemblies:
 				for assembly in assemblies:
 					# Report matches where the brace sits on a later line than the keyword
 					if '\n' in assembly: print(len(assemblies), assembly)
 				inlineCounter += 1
 				shutil.copy(source, target)
 				print('contract:', contract, 'totalCounter:', totalCounter, 'inlineCounter', inlineCounter)
 			totalCounter += 1
 	return totalCounter, inlineCounter

 # 2. How many segments (functions | modifiers) of each contract contain inline assembly
 def getInlineCounterForEachContract(contractspath):
 	"""Count assembly blocks, functions and modifiers across a flat directory.

 	`contractspath` must end with a path separator because file names are
 	appended to it directly. A progress line is printed per contract and a
 	summary at the end; the summary shows the running totals together with
 	the values of the last contract processed.
 	"""
 	totalModifierCounter, totalFunctionCounter = 0, 0
 	totalInlineCounter, totalFunctionInlineCounter = 0, 0
 	# The summary echoes the per-contract values too, so they must exist even
 	# when the directory is empty (otherwise the final print raises).
 	inlineCounter = functionCounter = modifierCounter = functionInlineCounter = 0
 	contracts = os.listdir(contractspath)
 	total = len(contracts)
 	for counter, contract in enumerate(contracts, 1):
 		with open(contractspath+contract, 'r', encoding='utf8') as rf:
 			code = rf.read()
 		inlineCounter = len(re.findall(r'assembly\s*?{', code))
 		totalInlineCounter += inlineCounter

 		functionCounter = len(re.findall(r'function.*\(.*\)', code)) # functions in the contract
 		functionCounter -= len(re.findall(r'function.*\(.*\);', code)) # minus bare declarations
 		modifierCounter = len(re.findall(r'modifier.*\{', code)) # modifiers in the contract
 		totalFunctionCounter += functionCounter
 		totalModifierCounter += modifierCounter

 		# functions | modifiers followed by an assembly block
 		functionInlineCounter = len(re.findall(r'(function|modifier)[\d\D]*?assembly\s*{', code))
 		totalFunctionInlineCounter += functionInlineCounter
 		print('\rcontrct: %s %d/%i' % (contract, counter, total), end='')
 	print('\ntotaInlineCounter:%d inlineCounter:%d totalModifierCounter:%d modifierCounter:%d totalFunctionCounter:%d functionCounter:%d totalFunctionInlineCounter:%d functionInlineCounter:%d' % (totalInlineCounter, inlineCounter, totalModifierCounter, modifierCounter, totalFunctionCounter, functionCounter, totalFunctionInlineCounter, functionInlineCounter))

 # 3. Extract every inline-assembly block and record all of them in one file
 def _findBlockEnd(code, start):
 	"""Return the index of the '}' closing the block opened just before `start`, or -1."""
 	depth = 1
 	for index in range(start, len(code)):
 		if code[index] == '{':
 			depth += 1
 		elif code[index] == '}':
 			depth -= 1
 			if depth == 0:
 				return index
 	return -1

 def getInlineCode(inlinecontractspath, outputfile):
 	"""Append each contract's `assembly { ... }` blocks to `outputfile`.

 	For every file in `inlinecontractspath` (which must end with a path
 	separator) a header line `<name>:` is written, followed by one entry per
 	brace-balanced assembly block. A block whose braces never balance is
 	skipped instead of running past the end of the text.
 	"""
 	contracts = os.listdir(inlinecontractspath)
 	total = len(contracts)
 	for counter, contract in enumerate(contracts, 1):
 		with open(outputfile, 'a', encoding='utf-8') as af:
 			af.write(contract + ':\n')
 			with open(inlinecontractspath+contract, 'r', encoding='utf8') as rf:
 				code = rf.read()
 			for assembly in re.finditer(r'assembly\s*?{', code):
 				end = _findBlockEnd(code, assembly.end())
 				if end < 0:
 					# Unbalanced braces (e.g. the keyword inside a comment): nothing to extract
 					continue
 				af.write('%s\n' % code[assembly.start():end+1])
 		print('\rcontract:%s %d/%d' % (contract, counter, total), end='')

 def Log(logFile, message):
 	"""Echo `message` to stdout, then append it plus a newline to `logFile` (UTF-8)."""
 	print(message)
 	with open(logFile, mode='a', encoding='utf8') as sink:
 		logLine = message+'\n'
 		sink.write(logLine)

 def getInlineBytecode_(inlinecontractpath, addr2info, Rmod4):
 	"""Worker: instrument, compile and harvest inline-assembly bytecode.

 	Handles the contracts whose 1-based position in the directory listing
 	satisfies `counter % 4 == Rmod4`. Every `assembly { ... }` block is
 	wrapped with two marker `sstore` statements, the instrumented source is
 	compiled through `cxt`, the raw result is dumped to
 	../data/compileresults_inline/<contract>, and the text found between the
 	markers in that dump is appended to ../data/inlinebytecode/<contract>.

 	Args:
 		inlinecontractpath: directory (with trailing separator) of contracts.
 		addr2info: maps a contract file name to (optimised, runs, version).
 		Rmod4: remainder selecting this worker's share of the listing.
 	"""
 	counter = 0
 	for contract in os.listdir(inlinecontractpath):
 		counter += 1
 		if counter % 4 == Rmod4:
 			print('%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
 			mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
 			# NOTE(review): this handle is never closed anywhere in the function.
 			rf = open(inlinecontractpath + contract, 'r', encoding='utf-8')
 			code = rf.read()
 			proc_code, inline_codes, count = code, [], 1
 			for assembly in re.finditer(r'assembly\s*?{', code):
 				index, stack = assembly.span()[1], ['{']
 				# count identifies which inline-assembly segment this is;
 				# it is repeated four times to form the start marker's key
 				proc_inline_code = 'assembly {\nsstore(0x%d%d%d%d, 0x19951116)\n' % (count, count, count, count)
 				count += 1
 				# Copy the block character by character, injecting the end
 				# marker just before the brace that closes the block.
 				# NOTE(review): no bound check -- unbalanced braces run past
 				# the end of `code`.
 				while True:
 					proc_inline_code += '\nsstore(0x7777, 0x19980101)\n}' if (code[index] == '}' and len(stack) == 1) else code[index]
 					if code[index] == '{':
 						stack.append('{')
 					elif code[index] == '}':
 						stack.pop()
 						if not stack:
 							inline_code = code[assembly.span()[0]:index+1]
 							inline_codes.append(proc_inline_code)
 							break
 					index += 1
 				# NOTE(review): str.replace rewrites every occurrence of this
 				# text, so textually identical blocks all get the same marker.
 				proc_code = proc_code.replace(inline_code, proc_inline_code)
 			# version, contract, optimal, times
 			opti, times, vers = addr2info[contract]
 			try:
 				res = cxt.call('compile', vers, proc_code, opti == 'Yes', int(times))
 			except Exception as err:
 				# Record the failure and move on to the next contract
 				mutexWriteFile('../data/usefullists/error_list', 'a', '[ERROR] @%s\n%s\n' % (contract, str(err)))
 				continue
 			with open('../data/compileresults_inline/%s' % contract, 'w', encoding='utf8') as wf:
 				wf.write(ujson.dumps(res))

 			# Capture (segment digits, text between the start and end markers).
 			# Presumably 63 is PUSH4 and 55 is SSTORE in the hex dump -- TODO confirm.
 			bytecodes = re.findall(r'6319951116.{2}(\d+?)\1{3}55(.*?)6319980101', ujson.dumps(res))
 			bytecodes = set(bytecodes)
 			if bytecodes:
 				bf = open('../data/inlinebytecode/%s' % contract, 'a', encoding='utf-8')
 				for inline_code in inline_codes:
 					bf.write('%s\n' % inline_code)
 				for bytecode in bytecodes:
 					bf.write('%s#%s\n' % (bytecode[0], bytecode[1]))
 				bf.close()
 			else:
 				# No markers found: keep the compiler's error list, if any
 				if 'errors' in res:
 					mutexWriteFile('../data/usefullists/nocompile_list', 'a', '%s\n' % str(res['errors']))

 # 4. Collect the bytecode of every inline-assembly block
 def getInlineBytecode(inlinecontractpath, versionlist):
 	"""Run getInlineBytecode_ over `inlinecontractpath` with four worker processes.

 	`versionlist` holds one `addr#optimised#runs#version` record per line.
 	The file is closed as soon as it has been read and blank lines are
 	ignored.
 	"""
 	# Compiler version, optimisation flag and run count for each contract
 	addr2info = {}
 	with open(versionlist, 'r', encoding='utf-8') as vf:
 		for line in vf:
 			line = line.strip()
 			if not line:
 				continue
 			addr, opti, times, vers = line.split('#')
 			addr2info[addr] = (opti, times, vers)
 	# Compiling is far too slow in one process, so fan out to several
 	pool = Pool(4)
 	for i in range(4):
 		pool.apply_async(getInlineBytecode_, args=(inlinecontractpath, addr2info, i))
 	pool.close()
 	pool.join()

 # 4. Tally opcodes and function calls found in the collected inline_code.txt
 def getInlineOpcode(inlinecodepath):
 	"""Print a `|opcode|count|` row per known opcode, most frequent first.

 	Every `name(` occurrence in the file is counted. Names listed in OPCODES
 	are tallied as opcodes; the rest are tallied separately (possibly calls
 	to the contract's own functions) and are not printed.
 	"""
 	with open(inlinecodepath, 'r', encoding='utf-8') as rf:
 		calls = re.findall(r'\w+\s?\(', rf.read())
 	opcode_counter, func_counter = {}, {}
 	for call in calls:
 		# Drop the trailing '(' and any whitespace that preceded it
 		name = call[:-1].strip()
 		bucket = opcode_counter if name in OPCODES else func_counter
 		bucket[name] = bucket.get(name, 0) + 1
 	# Sort on [count, name] so ties fall back to the name, both descending
 	ranking = sorted(([hits, name] for name, hits in opcode_counter.items()), reverse=True)
 	for hits, name in ranking:
 		print('|%s|%d|' % (name, hits))

 # 5. Optimisation status of contracts containing inline assembly
 def isOptimal(contractspath, versionlist):
 	"""Count how many contracts in `contractspath` were compiled with optimisation.

 	`versionlist` holds one `addr#optimised#runs#version` record per line;
 	blank lines are ignored. Running totals are printed after each contract.

 	Returns:
 		(optimalCounter, notOptiCounter): contracts flagged 'Yes' and 'No'.

 	Raises:
 		KeyError: a file in `contractspath` has no record in `versionlist`.
 	"""
 	# Compiler version, optimisation flag and run count for each contract
 	addr2info = {}
 	with open(versionlist, 'r', encoding='utf-8') as vf:
 		for line in vf:
 			line = line.strip()
 			if not line:
 				continue
 			addr, opti, times, vers = line.split('#')
 			addr2info[addr] = (opti, times, vers)
 	optimalCounter, notOptiCounter = 0, 0
 	for contract in os.listdir(contractspath):
 		optimalCounter += addr2info[contract][0] == 'Yes'
 		notOptiCounter += addr2info[contract][0] == 'No'
 		print('%s: %d %d' %(contract, optimalCounter, notOptiCounter))
 	return optimalCounter, notOptiCounter

 # Compile contracts (worker)
 def getCompileResult_(contractspath, addr2info, Rmod4, resultspath):
 	"""Worker: compile this worker's share of contracts and store the results.

 	Handles the contracts whose 1-based position in the directory listing
 	satisfies `counter % 4 == Rmod4`. Contracts that already have a result
 	file are skipped. Compilation failures are appended to the error list
 	and do not stop the loop. The source file is closed before compiling, so
 	a failed compilation no longer leaves the handle open.

 	Args:
 		contractspath: directory (with trailing separator) of contract sources.
 		addr2info: maps a contract file name to (optimised, runs, version).
 		Rmod4: remainder selecting this worker's share of the listing.
 		resultspath: directory (with trailing separator) for the JSON results.
 	"""
 	counter = 0
 	for contract in os.listdir(contractspath):
 		counter += 1
 		if counter % 4 != Rmod4:
 			continue
 		stamp = time.strftime("%y-%m-%d %H:%M:%S", time.localtime())
 		if os.path.exists('%s%s' % (resultspath, contract)):
 			print('%s [INFO] contract: %s has existed!' % (stamp, contract))
 			mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s has existed!' % (stamp, contract))
 			continue
 		print('%s [INFO] contract: %s' % (stamp, contract))
 		mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s' % (stamp, contract))
 		with open(contractspath + contract, 'r', encoding='utf8') as rf:
 			code = rf.read()
 		# version, contract, optimal, times
 		opti, times, vers = addr2info[contract]
 		try:
 			res = cxt.call('compile', vers, code, opti == 'Yes', int(times))
 		except Exception as err:
 			# Best effort: record the failure and carry on with the next contract
 			mutexWriteFile('../data/usefullists/error_list', 'a', '[ERROR] @%s\n%s\n' % (contract, str(err)))
 			continue
 		with open(resultspath+contract, 'w', encoding='utf8') as wf:
 			wf.write(ujson.dumps(res))

 # Compile every contract and collect the results
 def getCompileResult(contractspath, versionlist, resultspath):
 	"""Run getCompileResult_ over `contractspath` with four worker processes.

 	`resultspath` is created (including missing parents) when absent.
 	`versionlist` holds one `addr#optimised#runs#version` record per line;
 	the file is closed after reading and blank lines are ignored.
 	"""
 	os.makedirs(resultspath, exist_ok=True)
 	# Compiler version, optimisation flag and run count for each contract
 	addr2info = {}
 	with open(versionlist, 'r', encoding='utf-8') as vf:
 		for line in vf:
 			line = line.strip()
 			if not line:
 				continue
 			addr, opti, times, vers = line.split('#')
 			addr2info[addr] = (opti, times, vers)
 	# Compiling is far too slow in one process, so fan out to several
 	pool = Pool(4)
 	for i in range(4):
 		pool.apply_async(getCompileResult_, args=(contractspath, addr2info, i, resultspath))
 	pool.close()
 	pool.join()

 # Meant to count functions from the ABI; currently only lists the page files
 def getFunctionNumber(contractpagespath):
 	"""Print the name of every entry in `contractpagespath`, one per line."""
 	pageNames = os.listdir(contractpagespath)
 	for pageName in pageNames:
 		print(pageName)

 # Collect contracts that failed at compile time (problem in the contract itself)
 def getCompileError(compileresultspath, outputpath):
 	"""Copy sources whose stored compile result has an empty 'sources' entry.

 	Both directory arguments must end with a path separator. The source is
 	taken from ../data/contracts_total/ and its name is printed.
 	"""
 	outputMissing = not os.path.exists(outputpath)
 	if outputMissing:
 		os.mkdir(outputpath)
 	for result in os.listdir(compileresultspath):
 		with open(compileresultspath+result, 'r', encoding='utf8') as rf:
 			compiled = ujson.loads(rf.read())
 			if compiled['sources']:
 				continue
 			shutil.copy('../data/contracts_total/%s' % result, '%s%s' % (outputpath, result))
 			print(result)

 # Collect contracts for which invoking the compiler itself failed
 def getCompilerError(compileresultspath, contractsfile, outputpath):
 	"""Copy every contract named in `contractsfile` that has no result file.

 	`outputpath` must end with a path separator; sources are taken from
 	../data/contracts_total/.
 	"""
 	if not os.path.exists(outputpath):
 		os.mkdir(outputpath)
 	compiled = set(os.listdir(compileresultspath))

 	with open(contractsfile, 'r', encoding='utf8') as rf:
 		for line in rf:
 			name = line.strip()
 			if name in compiled:
 				continue
 			shutil.copy('../data/contracts_total/%s' % name, '%s%s' % (outputpath, name))

 # Inspect functions through the ABI stored in the crawled pages
 def getInlineFunctionByABI(inlinecontractspath, contractpagespath):
 	"""Print the ABI entries of the first contract listed in `inlinecontractspath`.

 	The page with the same name is read from `contractpagespath` (which must
 	end with a path separator) and closed before its JSON is decoded; the ABI
 	is itself a JSON string stored under result[0]['ABI'].
 	"""
 	for contract in os.listdir(inlinecontractspath):
 		print(contract)
 		with open(contractpagespath+contract, 'r', encoding='utf8') as rf:
 			page = ujson.loads(rf.read())
 		abi = ujson.loads(page['result'][0]['ABI'])
 		for entry in abi:
 			print(entry)
 		# Only the first contract is inspected
 		break

 # Disassemble bytecode into opcodes
 def getOpcode(bytecodespath, outputpath):
 	"""Run `evm disasm` on each file of `bytecodespath` and store its output.

 	`bytecodespath` must end with a path separator. `outputpath` is created
 	when missing. The command is started with an argument list instead of a
 	shell string, so file names are never interpreted by a shell. The exit
 	status is ignored, and a missing `evm` binary is reported but does not
 	stop the loop.
 	"""
 	os.makedirs(outputpath, exist_ok=True)
 	for contract in os.listdir(bytecodespath):
 		print('contract: %s' % contract)
 		with open(os.path.join(outputpath, contract), 'wb') as out:
 			try:
 				subprocess.run(['evm', 'disasm', bytecodespath + contract], stdout=out)
 			except OSError as err:
 				print('[ERROR] could not run evm for %s: %s' % (contract, err))

 def main():
 	"""Run one analysis step; the commented-out calls are the other steps.

 	Only the disassembly of ../data/bytecodes_total/ is currently enabled.
 	"""
 	# init()
 	# getContractsContainInline('contracts/')
 	# getInlineBytecode('../data/contracts_inline/', '../data/usefullists/full_info_list')
 	# getInlineCode('../data/contracts_inline/', '../data/inline_code.txt')
 	# isOptimal('inlinecontracts/', 'lists/full_info_list')
 	# getOpcode('inline_code.txt')
 	# getCompileResult('../data/contracts_total/', '../data/usefullists/full_info_list', '../data/compileresults_total/')
 	# getCompileError('../data/compileresults_total/', '../data/contracts_total_compileerror/')
 	# getCompilerError('../data/compileresults_total/', '../data/contracts_total.txt', '../data/contracts_total_compilererror/')
 	# getCompileError('../data/compileresults_inline/', '../data/contracts_inline_compileerror/')
 	# getCompilerError('../data/compileresults_inline/', '../data/contracts_inline.txt', '../data/contracts_inline_compilererror/')
 	# getCompileResult('../data/contracts_compileerror/', '../data/usefullists/full_info_list', '../data/compileresults_compileerror/')
 	# getInlineFunctionByABI('../data/contracts_inline/', '../data/contract_pages_total/')
 	# getInlineCounterForEachContract('../data/contracts_inline/')
 	# getInlineCounterForEachContract('../data/contracts_total/')
 	# deleteSpaceLine('../data/contracts_total/')
 	# deleteSpaceLine('../data/contracts_inline/')
 	# getTotalCodeLine('../data/contracts_total/')
 	# getTotalCodeLine('../data/contracts_inline/')
 	getOpcode('../data/bytecodes_total/', '../data/opcodes_total/')

 # Run the selected step only when executed as a script
 if __name__ == "__main__":
 	main()

 # 6319951116.{2}(.*?){4}(.*?)(?:617777f3|f3)
 git add *
 git commit -m "提交一些东西0323"
 git push
diff --git a/utils.py b/utils.py
 import os, execjs, shutil, re, json, sys
 from bs4 import BeautifulSoup
 from multiprocessing import Pool, Lock

 __author__ = 'Zhou.Liao'

 # Compiler builds that cannot be used
 ERRORSOLC = [
    'v0.3.4-nightly.2016.6.6+commit.e97ac4f.js',
    'v0.3.4-nightly.2016.6.8+commit.93790d.js',
    'v0.3.4-nightly.2016.6.8+commit.ccddd6f.js',
    'v0.3.4-nightly.2016.6.8+commit.d593166.js',
    'v0.3.6-nightly.2016.8.27+commit.91d4fa4.js',
    'v0.3.6-nightly.2016.8.29+commit.b8060c5.js',
    'v0.3.6-nightly.2016.8.30+commit.cf974fd.js',
    'v0.3.6-nightly.2016.8.31+commit.3ccd198.js',
    'v0.3.6-nightly.2016.9.1+commit.b5d941d.js',
    'v0.3.6-nightly.2016.9.2+commit.341c943.js',
    'v0.3.6-nightly.2016.9.5+commit.873d8bb.js',
    'v0.3.6-nightly.2016.9.6+commit.114502f.js',
    'v0.3.6-nightly.2016.9.7+commit.24524d6.js',
    'v0.3.6-nightly.2016.9.8+commit.f5a513a.js',
    'v0.4.1-nightly.2016.9.9+commit.79867f4.js'
 ]

 # JavaScript helper used by testRequire: compile(compiler) only tries to load
 # the given file from ./solc-bin-gh-pages/bin/ through solc.setupMethods.
 testCxt = execjs.compile(
 	'''
 	function compile(compiler) {
 		var solc = require('solc')
 		solc = solc.setupMethods(require('./solc-bin-gh-pages/bin/' + compiler))
 	}
 	'''
 )
 # Lock shared by every function wrapped with mutexExec
 mutex = Lock()

 # Decorator: the wrapped function runs while holding `mutex`
 def mutexExec(func):
 	"""Return a wrapper that calls `func` with the module-wide lock held.

 	The lock is released even when `func` raises, so one failing call cannot
 	block every later caller, and the result of `func` is passed back to the
 	caller.
 	"""
 	from functools import wraps

 	@wraps(func)
 	def mutexFunc(*args, **kwargs):
 		with mutex:
 			return func(*args, **kwargs)
 	return mutexFunc

 # Write to a file under mutual exclusion
 @mutexExec
 def mutexWriteFile(file, type, message):
 	"""Write `message` to `file` opened with mode `type` (UTF-8).

 	When the file is literally named 'sleep', it is deleted again once it
 	holds more than 951116 lines.
 	"""
 	with open(file, type, encoding='utf8') as fd:
 		fd.write(message)
 	if file != 'sleep':
 		return
 	with open(file, 'r', encoding='utf8') as rf:
 		lineCount = len(rf.readlines())
 	if lineCount > 951116:
 		os.remove(file)

 # -5. Normalise compiler versions
 # NOTE(review): a second `formatVersion()` defined later in this file rebinds
 # the name, so this one-argument variant is unreachable after import.
 def formatVersion(versionfile):
 	"""Rewrite `versionfile` so each record names an available soljson build.

 	Records are `contract#optimised#runs#version` lines. A record is kept
 	as-is when `soljson-<version>.js` exists in D:/solc-bin-gh-pages/bin/.
 	Otherwise the first file name containing both the derived edition and
 	commit is substituted; records with no such file are dropped.
 	"""
 	versions, counter = [], 0
 	allversions = os.listdir('D:/solc-bin-gh-pages/bin/')
 	with open(versionfile, 'r', encoding='utf-8') as rf:
 		for line in rf.readlines():
 			contract, optimal, times, version = line.strip().split('#')
 			if 'soljson-' + version + '.js' in allversions:
 				versions.append(line)
 			else:
 				counter += 1
 				print(version, counter)
 				# Edition is the first 6 characters when a '+' or '-' follows them, else the first 7
 				edition = version[:6] if version[6] in ['+', '-'] else version[:7]
 				# Commit is the last 7 characters when preceded by '.' or '-', else the last 8
 				commit = version[-7:] if version[-8] in ['.', '-'] else version[-8:]
 				for allversion in allversions:
 					if edition in allversion and commit in allversion:
 						# [8:-3] strips the leading 8 and trailing 3 characters
 						# (the lengths of 'soljson-' and '.js')
 						versions.append(contract + '#' + optimal + '#' + times + '#' + allversion[8:-3] + '\n')
 						break
 	with open(versionfile, 'w', encoding='utf-8') as wf:
 		for version in versions:
 			wf.write(version)


 # -4. Contracts that failed to compile
 def getFailed(contractspath, astspath):
 	"""Append to diff.txt every contract stem that has no file under `astspath`.

 	Both trees are walked recursively and names are compared on the part
 	before the first '.'. AST stems are kept in a set so each lookup is
 	constant time, and diff.txt is opened once and only when there is
 	something to write.
 	"""
 	def stems(path):
 		return [name.split('.')[0] for _, _, names in os.walk(path) for name in names]

 	asts = set(stems(astspath))
 	missing = [contract for contract in stems(contractspath) if contract not in asts]
 	if missing:
 		with open('diff.txt', 'a', encoding='utf-8') as af:
 			af.writelines(contract + '\n' for contract in missing)

 # -3. Try removing the pragma header
 def deletePragma(contractspath):
 	"""Rewrite every file under `contractspath` without its 'pragma solidity' lines."""
 	for root, _, files in os.walk(contractspath):
 		for name in files:
 			print(name)
 			path = '%s/%s' % (root, name)
 			with open(path, 'r', encoding='utf8') as rf:
 				kept = [line for line in rf if 'pragma solidity' not in line]
 			with open(path, 'w', encoding='utf8') as wf:
 				wf.write(''.join(kept))

 # -2. Replace escape sequences with the characters they stand for
 def replaceTran(contractspath):
 	"""Apply every TRANSFERWORD replacement to each file under `contractspath`, in place."""
 	for root, _, files in os.walk(contractspath):
 		for name in files:
 			print(name)
 			path = '%s/%s' % (root, name)
 			with open(path, 'r', encoding='utf8') as rf:
 				code = rf.read()
 			for escaped, plain in TRANSFERWORD.items():
 				code = code.replace(escaped, plain)
 			with open(path, 'w', encoding='utf8') as wf:
 				wf.write(code)

 # -1. Remove blank lines
 def deleteSpaceLine(contractspath):
 	"""Rewrite every file under `contractspath` without its whitespace-only lines."""
 	for root, _, files in os.walk(contractspath):
 		for name in files:
 			print('\r%s' % name, end='')
 			path = '%s/%s' % (root, name)
 			with open(path, 'r', encoding='utf8') as rf:
 				kept = ''.join(line for line in rf if line.strip())
 			with open(path, 'w', encoding='utf8') as wf:
 				wf.write(kept)

 # 0. Remove comments
 def deleteComments(contractspath):
 	"""Write a comment-free copy of each contract to ../without_comments_contracts/.

 	Both `// ...` and `/* ... */` comments are removed. The patterns accept
 	empty comment bodies: with `.+?` an empty `/**/` consumed everything up
 	to the end of the next block comment, and a bare `//` was left behind.
 	Files are read through the directory reported by os.walk, and the output
 	directory is created when missing.

 	NOTE(review): comment markers inside string literals (e.g. URLs) are
 	still treated as comments.
 	"""
 	pattern = re.compile(r'(//[^\n]*)|(/\*.*?\*/)', re.DOTALL)
 	outputdir = contractspath + '/../without_comments_contracts/'
 	os.makedirs(outputdir, exist_ok=True)
 	for root, _, contracts in os.walk(contractspath):
 		for contract in contracts:
 			print('contract:', contract)
 			with open(os.path.join(root, contract), 'r', encoding='utf8') as rf:
 				code = rf.read()
 			with open(outputdir + contract, 'w', encoding='utf8') as wf:
 				wf.write(pattern.sub('', code))

 # Extract contract code, compiler version, optimisation flag and run count
 def getInfo(page_path):
    """Parse one crawled contract page and store its source and build info.

    The source text is written to contracts/<address> and a record
    `address#optimised#runs#version` is appended to lists/full_info_list.
    Every file is opened in a `with` block so it is flushed and closed
    before the function returns.

    NOTE(review): the table and cell positions are fixed indices into the
    page's elements of class 'table' -- confirm against the crawled layout.
    """
    print(page_path)
    with open(page_path, 'r', encoding='utf-8') as rf:
        soup = BeautifulSoup(rf.read(), 'html.parser')
    addr = soup.find(id='mainaddress').text.strip().lower()
    code = soup.find(id='editor').text
    tabs = soup.find_all(class_='table')
    vers = tabs[4].find_all('td')[3].text.strip()
    optm = tabs[5].find_all('td')[1].text.strip()
    runs = tabs[5].find_all('td')[3].text.strip()

    with open('contracts/%s' % addr, 'w', encoding='utf-8') as cf:
        cf.write(code)
    with open('lists/full_info_list', 'a', encoding='utf-8') as df:
        df.write('%s#%s#%s#%s\n' % (addr, optm, runs, vers))

 def getAllInfo(start):
    """Call getInfo on every page under pages/<dir>/ whose numeric stem is >= `start`."""
    for folder in os.listdir('pages/'):
        for page in os.listdir('pages/%s' % folder):
            pageNumber = int(page.split('.')[0])
            if pageNumber >= start:
                getInfo('pages/%s/%s' % (folder, page))

 # Extract how often each contract was called
 def getCallTimes(page_path):
    """Append `<page>#<calls>` for every page in `page_path` to calltimes.txt.

    `page_path` must end with a path separator. The count comes from the
    'Click to view full list' link when the page mentions 'from a total of',
    otherwise from a 'Latest N txn' phrase, and is 0 when neither is found.
    A running total is printed after each page.
    """
    totalTimes = 0
    for page in os.listdir(page_path):
        print(page, end=': ')
        with open(page_path + page, 'r', encoding='utf8') as rf:
            html = rf.read()
        if 'from a total of' in html:
            shown = re.findall(r"title='Click to view full list'>(.*?)</a>", html)[0]
            callTimes = int(shown.replace(',', ''))
        else:
            latest = re.findall(r'Latest (.*?) txn', html)
            callTimes = int(latest[0]) if latest else 0
        totalTimes += callTimes
        with open('calltimes.txt', 'a', encoding='utf8') as af:
            af.write('%s#%d\n' % (page, callTimes))
        print('call %d times! total times: %d' %(callTimes, totalTimes))

 def testRequire():
    """Try to load every compiler build; copy the ones that fail one level up.

    Each file in ./solc-bin-gh-pages/bin/ is passed to testCxt's `compile`;
    when that raises, the file is copied into ./solc-bin-gh-pages/.
    """
    binDir = './solc-bin-gh-pages/bin/'
    for compiler in os.listdir(binDir):
        print(compiler)
        try:
            testCxt.call('compile', compiler)
        except Exception:
            shutil.copy('./solc-bin-gh-pages/bin/%s' % compiler, './solc-bin-gh-pages/%s' % compiler)

 def formatVersion():
    """Rebuild ../data/usefullists/full_info_list from its .bak copy.

    A record is kept unchanged when `soljson-<version>.js` exists in
    ./solc-bin-gh-pages/bin/. Otherwise one rewritten record is emitted for
    every available build whose name contains both the edition and four
    characters of the commit id; a record with no such build is dropped.
    The backup file is closed as soon as it has been read.
    """
    true_versions = os.listdir('./solc-bin-gh-pages/bin/')
    infos = []
    with open('../data/usefullists/full_info_list.bak', 'r', encoding='utf-8') as vf:
        for count, line in enumerate(vf):
            print(count)
            vers = line.strip().split('#')[-1]
            if 'soljson-%s.js' % vers in true_versions:
                infos.append(line)
                continue
            # No build with exactly this name (it is spelled differently):
            # look one up by edition and commit instead
            edition = re.findall(r'v(.*?)-', vers)[0]
            commit = vers.split('-')[-1]
            for true_version in true_versions:
                if edition in true_version and commit[-5:-1] in true_version:
                    infos.append(line.replace(vers, true_version[8:-3]))
    with open('../data/usefullists/full_info_list', 'w', encoding='utf8') as wf:
        wf.writelines(infos)

 def putContractsList(contractspath):
    """Append the name of every entry in `contractspath` to contracts.txt, one per line."""
    names = os.listdir(contractspath)
    for name in names:
        with open('contracts.txt', mode='a', encoding='utf8') as af:
            af.write('%s\n' % name)

 # Extract the creation transaction from the crawled pages
 def getCreateTx(contractpagespath):
    """Append `<page>#<creator>#<creation tx>` records to createTxs.txt.

    `contractpagespath` must end with a path separator. A page that lacks
    the creator markup is reported and skipped instead of aborting the run,
    and the output file is closed even when an error occurs.
    """
    pattern = re.compile(r"title='Creator Address'>(.*?)</a>.*?<a href='/tx/(.*?)' title='Creator Txn Hash'")
    with open('createTxs.txt', 'a', encoding='utf8') as af:
        for contractpage in os.listdir(contractpagespath):
            print(contractpage)
            with open(contractpagespath + contractpage, 'r', encoding='utf8') as rf:
                match = pattern.search(rf.read())
            if match is None:
                print('[WARN] no creator info in %s' % contractpage)
                continue
            creator, creatTx = match.groups()
            af.write('%s#%s#%s\n' % (contractpage, creator, creatTx))

 # Record, from the crawled creator pages, what kind of account created each contract
 def putContractsCreateTime(creatorspath, createTxspath):
    """Append `<contract>#<creator>#<address type>` records to creatorinfo.txt.

    File names in `creatorspath` (which must end with a path separator) have
    the form `<contract>#<creator>`. The address type is 'The Address' or
    'Contract Address', whichever marker the page contains. A page with both
    markers stops the run; a page with neither is reported and skipped, so
    no record is written with a missing or left-over type.

    `createTxspath` is accepted for interface compatibility and is not used.
    """
    for creator in os.listdir(creatorspath):
        print(creator)
        contract, creatorAddr = creator.split('#')
        with open(creatorspath + creator, 'r', encoding='utf8') as rf:
            html = rf.read()
        isPlain = 'The Address' in html
        isContract = 'Contract Address' in html
        if isPlain and isContract:
            # Ambiguous page: flag it and stop so it can be inspected
            print('/////////////////////////////%s' % creator)
            break
        if not (isPlain or isContract):
            print('[WARN] no address type found in %s' % creator)
            continue
        addrtype = 'Contract Address' if isContract else 'The Address'
        with open('creatorinfo.txt', 'a', encoding='utf8') as af:
            af.write('%s#%s#%s\n' % (contract, creatorAddr, addrtype))
 # Prepare the working directories
 def init():
    """Create lists/, logs/ and inlinebytecode/ when missing and empty logs/.

    Existing content of lists/ and inlinebytecode/ is left untouched.
    """
    for folder in ('lists', 'logs', 'inlinebytecode'):
        if not os.path.exists(folder):
            os.mkdir(folder)

    for log_file in os.listdir('logs/'):
        os.remove('logs/%s' % log_file)

 # Report the total number of code lines (used to compute ratios)
 def getTotalCodeLine(contractspath):
    """Print and return the total line count of all files in `contractspath`.

    `contractspath` must end with a path separator. The progress indicator
    now advances with each file; previously its counter was never
    incremented and always showed 0.
    """
    contracts = os.listdir(contractspath)
    total = len(contracts)
    totallines = 0
    for counter, contract in enumerate(contracts, 1):
        with open(contractspath + contract, 'r', encoding='utf8') as rf:
            totallines += sum(1 for _ in rf)
        print('\r%s: %d/%d' % (contract, counter, total), end='')
    print('total line: %d' % totallines)
    return totallines

 def diff(dir1, dir2):
    """Copy and report the entries of `dir2` that are missing from `dir1`.

    `dir1` is expected to be a subset of `dir2`. Each missing entry is
    copied from `dir2` to ../diffs/ and from ../data/contracts_inline/ to
    ../diff_contracts/; the number and names of the entries are printed.
    """
    known = set(os.listdir(dir1))
    diffs = []
    for file2 in os.listdir(dir2):
        if file2 in known:
            continue
        diffs.append(file2)
        shutil.copy('%s/%s' % (dir2, file2), '../diffs/%s' % file2)
        shutil.copy('../data/contracts_inline/%s' % file2, '../diff_contracts/%s' % file2)
    print(len(diffs), diffs)

 def getSourceCode(contractspath):
    """Write result[0]['SourceCode'] of every JSON page to contracts/<name>.

    `contractspath` must end with a path separator. The page is read inside
    a `with` block, so its handle is closed even when the JSON is malformed.
    """
    for contract in os.listdir(contractspath):
        print(contract)
        with open(contractspath + contract, 'r', encoding='utf8') as rf:
            res = json.loads(rf.read())
        with open('contracts/%s' % contract, 'w', encoding='utf8') as wf:
            wf.write(res['result'][0]['SourceCode'])

 # Script entry point: one helper is enabled at a time; the commented-out
 # calls are the other maintenance steps.
 if __name__ == '__main__':
    # getAllInfo(16842)
    # testRequire()
    # formatVersion()
    # putContractsList('inlinecontracts/')
    # getCreateTx('contract_pages/')
    # putContractsCreateTime('creator_pages/', 'createTx_pages/')
    # getCallTimes('contract_pages/')
    # getTotalCodeLine('inlinecontracts/')
    diff('../data/inlinebytecode', '../data/compileresults_inline')
    # getSourceCode('getcode/contract_pages/')
	# Lower-case EVM mnemonics, grouped by category. The numbered families
	# (push/dup/swap/log) are generated instead of being spelled out by hand.
	OPCODES = (
		[
			# Stop and Arithmetic
			'stop', 'add', 'mul', 'sub', 'div', 'sdiv', 'mod', 'smod', 'addmod', 'mulmod', 'exp', 'signextend',
			# Comparison and Bitwise Logic
			'lt', 'gt', 'slt', 'sgt', 'eq', 'iszero', 'and', 'or', 'xor', 'not', 'byte', 'shl', 'shr', 'sar',
			# Sha3
			'sha3',
			# Environment Information
			'address', 'balance', 'origin', 'caller', 'callvalue', 'calldataload', 'calldatasize', 'calldatacopy', 'codesize', 'codecopy', 'gasprice', 'extcodesize', 'extcodecopy', 'returndatasize', 'returndatacopy', 'extcodehash',
			# These opcodes seem to belong in the environment 'block', but we are out of opcode space in 0x3*
			'chainid', 'selfbalance',
			# Block Information
			'blockhash', 'coinbase', 'timestamp', 'number', 'difficulty', 'gaslimit',
			# 'Stack', 'Memory', Storage and Flow Operations
			'pop', 'mload', 'mstore', 'mstore8', 'sload', 'sstore', 'jump', 'jumpi', 'pc', 'msize', 'gas', 'jumpdest',
		]
		# Push Operations: push1 .. push32
		+ ['push%d' % size for size in range(1, 33)]
		# Duplicate Operations: dup1 .. dup16
		+ ['dup%d' % depth for depth in range(1, 17)]
		# Exchange Operations: swap1 .. swap16
		+ ['swap%d' % depth for depth in range(1, 17)]
		# Logging: log0 .. log4
		+ ['log%d' % topics for topics in range(5)]
		# System
		+ ['create', 'call', 'callcode', 'return', 'delegatecall', 'create2', 'staticcall', 'revert', 'selfdestruct']
	)
	'''
	*: `内联汇编`
	1. 多少个开源合约中有*
	2. 每个有的合约中有几段（多少个函数）中有
	3. 每一段*有多少行代码/多少字节码/哪些字节码
	4. 是否优化
	5. 有*的合约部署时间
	6. 有的合约的调用次数，其中执行次数
	7. 有多少次*执行失败，失败的原因
	8. 为什么用*，是否可以用源码代替
	9. *代码占自身合约的比例，指令频率比较
	10. 不同合约*的相似性
	11. 哪些Account创建了有*的合约（1.EOA or 2.SC）
	12. 创建者身份推测
	'''

	import os, shutil, re
	import subprocess, ujson
	import execjs, sys, time
	from opcode_value import *

	from utils import *

	# Raise the recursion limit (otherwise parsing the JSON crashes)
	sys.setrecursionlimit(951116)

	# JavaScript source used for compiling. The output selection uses the
	# '*' wildcards again: this copy had them reduced to empty strings,
	# which no longer requests every output of every contract.
	cxt = execjs.compile(
		'''
		function compile(version, contract, optimal, times) {
			var solc = require('solc')
			var input = {
				language: 'Solidity',
				sources: {
					'Task': {
						content: contract
					}
				},
				settings: {
					optimizer: {
						enabled: optimal,
						runs: times
					},
					outputSelection: {
						'*': {
							'*': ['*']
						}
					}
				}
			};
			solc = solc.setupMethods(require('./solc-bin-gh-pages/bin/soljson-' + version + '.js'))
			var result = JSON.parse(solc.compile(JSON.stringify(input)))
			return result
		}
		'''
	)

	# 1. How many open-source contracts contain inline assembly
	def getContractsContainInline(contractspath):
		"""Copy every contract that contains an `assembly {` block to ../inlinecontracts/.

		This copy had lost its indentation and is restored here. Files are
		opened through the directory reported by os.walk and the destination
		directory is created when missing.

		Returns:
			(totalCounter, inlineCounter): files scanned and files that
			contained at least one inline-assembly block.
		"""
		pattern = re.compile(r'assembly\s*{')
		target = os.path.join(contractspath, '..', 'inlinecontracts')
		os.makedirs(target, exist_ok=True)
		totalCounter, inlineCounter = 0, 0
		for root, _, contracts in os.walk(contractspath):
			for contract in contracts:
				source = os.path.join(root, contract)
				with open(source, 'r', encoding='utf8') as rf:
					assemblies = pattern.findall(rf.read())
				if assemblies:
					for assembly in assemblies:
						# Report matches where the brace sits on a later line than the keyword
						if '\n' in assembly: print(len(assemblies), assembly)
					inlineCounter += 1
					shutil.copy(source, target)
					print('contract:', contract, 'totalCounter:', totalCounter, 'inlineCounter', inlineCounter)
				totalCounter += 1
		return totalCounter, inlineCounter

	# 2. How many segments (functions | modifiers) of each contract contain inline assembly
	def getInlineCounterForEachContract(contractspath):
		"""Count assembly blocks, functions and modifiers across a flat directory.

		This copy had lost its indentation and the `*` quantifiers of its
		regular expressions; both are restored. `contractspath` must end with
		a path separator. The summary shows the running totals together with
		the values of the last contract processed.
		"""
		totalModifierCounter, totalFunctionCounter = 0, 0
		totalInlineCounter, totalFunctionInlineCounter = 0, 0
		# The summary echoes the per-contract values too, so they must exist
		# even when the directory is empty.
		inlineCounter = functionCounter = modifierCounter = functionInlineCounter = 0
		contracts = os.listdir(contractspath)
		total = len(contracts)
		for counter, contract in enumerate(contracts, 1):
			with open(contractspath+contract, 'r', encoding='utf8') as rf:
				code = rf.read()
			inlineCounter = len(re.findall(r'assembly\s*?{', code))
			totalInlineCounter += inlineCounter

			functionCounter = len(re.findall(r'function.*\(.*\)', code)) # functions in the contract
			functionCounter -= len(re.findall(r'function.*\(.*\);', code)) # minus bare declarations
			modifierCounter = len(re.findall(r'modifier.*\{', code)) # modifiers in the contract
			totalFunctionCounter += functionCounter
			totalModifierCounter += modifierCounter

			# functions | modifiers followed by an assembly block
			functionInlineCounter = len(re.findall(r'(function|modifier)[\d\D]*?assembly\s*{', code))
			totalFunctionInlineCounter += functionInlineCounter
			print('\rcontrct: %s %d/%i' % (contract, counter, total), end='')
		print('\ntotaInlineCounter:%d inlineCounter:%d totalModifierCounter:%d modifierCounter:%d totalFunctionCounter:%d functionCounter:%d totalFunctionInlineCounter:%d functionInlineCounter:%d' % (totalInlineCounter, inlineCounter, totalModifierCounter, modifierCounter, totalFunctionCounter, functionCounter, totalFunctionInlineCounter, functionInlineCounter))

	# 3. 每一段有多少行代码，并将所有的放到一个文件中记录
	def _extractAssemblyBlocks(code):
		"""Return every brace-balanced ``assembly { ... }`` block found in ``code``."""
		blocks = []
		for assembly in re.finditer(r'assembly\s*?{', code):
			start, index = assembly.span()
			depth = 1  # the opening brace consumed by the pattern
			# Bounded scan: a block whose braces never balance is dropped
			# instead of running past the end of the source.
			while index < len(code):
				char = code[index]
				if char == '{':
					depth += 1
				elif char == '}':
					depth -= 1
					if depth == 0:
						blocks.append(code[start:index + 1])
						break
				index += 1
		return blocks

	def getInlineCode(inlinecontractspath, outputfile):
		"""Append all inline-assembly blocks of every contract to ``outputfile``.

		For each file in ``inlinecontractspath`` a ``<file name>:`` header line is
		appended, followed by each extracted assembly block on its own line(s).

		Args:
			inlinecontractspath: Directory containing the contract sources.
			outputfile: Text file that is appended to (created when missing).
		"""
		contracts = os.listdir(inlinecontractspath)
		total = len(contracts)
		for counter, contract in enumerate(contracts, 1):
			with open(os.path.join(inlinecontractspath, contract), 'r', encoding='utf8') as rf:
				code = rf.read()
			# One open per contract instead of one per extracted block.
			with open(outputfile, 'a', encoding='utf-8') as af:
				af.write(contract + ':\n')
				for inline_code in _extractAssemblyBlocks(code):
					af.write('%s\n' % (inline_code))
			print('\rcontract:%s %d/%d' % (contract, counter, total), end='')

	def Log(logFile, message):
		"""Echo ``message`` on stdout, then append it as one line to ``logFile``."""
		print(message)
		with open(logFile, mode='a', encoding='utf8') as sink:
			entry = message + '\n'
			sink.write(entry)

	def getInlineBytecode_(inlinecontractpath, addr2info, Rmod4):
		"""Instrument, compile and harvest inline-assembly bytecode for one worker's share.

		Walks ``os.listdir(inlinecontractpath)`` and handles the entries whose
		1-based position modulo 4 equals ``Rmod4``. In each handled contract every
		``assembly { ... }`` block is wrapped between two marker ``sstore``
		statements, the modified source is compiled through
		``cxt.call('compile', ...)``, the raw result is written to
		``../data/compileresults_inline/<contract>`` and whatever the marker regex
		finds in the result is appended to ``../data/inlinebytecode/<contract>``.

		Args:
			inlinecontractpath: Directory prefix of the contract sources; it is
				concatenated with the file name, so it needs its trailing separator.
			addr2info: Maps a contract file name to ``(optimized, runs, version)``.
			Rmod4: Remainder that selects this worker's contracts.
		"""
		counter = 0
		for contract in os.listdir(inlinecontractpath):
			counter += 1
			if counter % 4 != Rmod4:
				continue
			print('%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
			mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
			# Context manager so the source handle is released for every contract.
			with open(inlinecontractpath + contract, 'r', encoding='utf-8') as rf:
				code = rf.read()
			proc_code, inline_codes, count = code, [], 1
			for assembly in re.finditer(r'assembly\s*?{', code):
				index, depth = assembly.span()[1], 1
				# count identifies which inline-assembly block this is
				proc_inline_code = 'assembly {\nsstore(0x%d%d%d%d, 0x19951116)\n' % (count, count, count, count)
				count += 1
				# Bounded scan: an unterminated block is left untouched instead
				# of indexing past the end of the source.
				while index < len(code):
					char = code[index]
					if char == '}' and depth == 1:
						# Closing brace of the block: emit the end marker in front of it.
						proc_inline_code += '\nsstore(0x7777, 0x19980101)\n}'
						inline_code = code[assembly.span()[0]:index + 1]
						inline_codes.append(proc_inline_code)
						# str.replace rewrites every textually identical block at once.
						proc_code = proc_code.replace(inline_code, proc_inline_code)
						break
					proc_inline_code += char
					if char == '{':
						depth += 1
					elif char == '}':
						depth -= 1
					index += 1
			# optimized flag, optimizer runs, compiler version
			opti, times, vers = addr2info[contract]
			try:
				res = cxt.call('compile', vers, proc_code, opti == 'Yes', int(times))
			except Exception as err:
				mutexWriteFile('../data/usefullists/error_list', 'a', '[ERROR] @%s\n%s\n' % (contract, str(err)))
				continue
			with open('../data/compileresults_inline/%s' % contract, 'w', encoding='utf8') as wf:
				wf.write(ujson.dumps(res))

			# Capture (block id, bytes between the start and end markers);
			# presumably these hex prefixes are the compiled marker constants - TODO confirm.
			bytecodes = re.findall(r'6319951116.{2}(\d+?)\1{3}55(.*?)6319980101', ujson.dumps(res))
			bytecodes = set(bytecodes)
			if bytecodes:
				with open('../data/inlinebytecode/%s' % contract, 'a', encoding='utf-8') as bf:
					for inline_code in inline_codes:
						bf.write('%s\n' % inline_code)
					for bytecode in bytecodes:
						bf.write('%s#%s\n' % (bytecode[0], bytecode[1]))
			elif 'errors' in res:
				mutexWriteFile('../data/usefullists/nocompile_list', 'a', '%s\n' % str(res['errors']))

	# 4. 获得所有*的bytecode
	def getInlineBytecode(inlinecontractpath, versionlist):
		"""Run ``getInlineBytecode_`` over all contracts with four worker processes.

		Args:
			inlinecontractpath: Directory prefix of the contract sources.
			versionlist: Text file with one ``addr#optimized#runs#version`` record
				per line; blank lines are ignored.
		"""
		def reportWorkerError(err):
			# Without a callback an exception raised inside a worker is lost.
			print('[ERROR] worker failed: %r' % (err,))

		# Compiler version, optimisation flag and optimizer runs per contract.
		addr2info = {}
		with open(versionlist, 'r', encoding='utf-8') as vf:
			for line in vf:
				line = line.strip()
				if not line:
					continue
				addr, opti, times, vers = line.split('#')
				addr2info[addr] = (opti, times, vers)
		# Compilation is slow, so it is spread over several processes.
		pool = Pool(4)
		for i in range(4):
			pool.apply_async(getInlineBytecode_, args=(inlinecontractpath, addr2info, i), error_callback=reportWorkerError)
		pool.close()
		pool.join()

	# 4. 从获取得到的inline_code.txt 中把opcode 和function 拿出做统计
	def getInlineOpcode(inlinecodepath):
		"""Tally the call-like tokens of an extracted inline-assembly dump.

		Every ``name(`` token in the file is counted, either as an opcode (when
		``name`` is listed in ``OPCODES``) or as some other called function. The
		opcode tally is printed as ``|name|count|`` rows, highest count first.

		Args:
			inlinecodepath: Text file holding the extracted assembly code.

		Returns:
			tuple: ``(opcode_counter, func_counter)`` dicts mapping name to count.
		"""
		opcode_counter, func_counter = {}, {}
		with open(inlinecodepath, 'r', encoding='utf-8') as rf:
			code = rf.read()
		known = set(OPCODES)
		for call in re.findall(r'\w+\s?\(', code):
			name = call[:-1].strip()  # drop the "(" and the optional space before it
			bucket = opcode_counter if name in known else func_counter
			bucket[name] = bucket.get(name, 0) + 1
		# Sort by (count, name) descending, as before.
		ranking = sorted(((count, name) for name, count in opcode_counter.items()), reverse=True)
		for count, name in ranking:
			# Plain "|" separators; "\|" is an invalid escape that printed backslashes.
			print('|%s|%d|' % (name, count))
		return opcode_counter, func_counter

	# 5. *的优化情况
	def isOptimal(contractspath, versionlist):
		"""Count how many contracts were compiled with and without optimisation.

		Args:
			contractspath: Directory whose file names are contract identifiers.
			versionlist: Text file with one ``addr#optimized#runs#version`` record
				per line; blank lines are ignored.

		Returns:
			tuple: ``(optimalCounter, notOptiCounter)``.

		Raises:
			KeyError: If a file in ``contractspath`` has no record in ``versionlist``.
		"""
		# Compiler version, optimisation flag and optimizer runs per contract.
		addr2info = {}
		with open(versionlist, 'r', encoding='utf-8') as vf:
			for line in vf:
				line = line.strip()
				if not line:
					continue
				addr, opti, times, vers = line.split('#')
				addr2info[addr] = (opti, times, vers)
		optimalCounter, notOptiCounter = 0, 0
		for contract in os.listdir(contractspath):
			flag = addr2info[contract][0]
			optimalCounter += flag == 'Yes'
			notOptiCounter += flag == 'No'
			print('%s: %d %d' %(contract, optimalCounter, notOptiCounter))
		return optimalCounter, notOptiCounter

	# 编译合约
	def getCompileResult_(contractspath, addr2info, Rmod4, resultspath):
		"""Compile one worker's share of the contracts and store the raw results.

		Handles the entries of ``os.listdir(contractspath)`` whose 1-based
		position modulo 4 equals ``Rmod4``. Contracts that already have a file in
		``resultspath`` are skipped; compiler exceptions are appended to
		``../data/usefullists/error_list`` and the contract is skipped.

		Args:
			contractspath: Directory prefix of the sources (needs its trailing separator).
			addr2info: Maps a contract file name to ``(optimized, runs, version)``.
			Rmod4: Remainder that selects this worker's contracts.
			resultspath: Directory prefix for the JSON results (needs its trailing separator).
		"""
		counter = 0
		for contract in os.listdir(contractspath):
			counter += 1
			if counter % 4 != Rmod4:
				continue
			if os.path.exists('%s%s' % (resultspath, contract)):
				print('%s [INFO] contract: %s has existed!' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
				mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s has existed!' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
				continue
			print('%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
			mutexWriteFile('../logs/compile.log', 'a', '%s [INFO] contract: %s' % (time.strftime("%y-%m-%d %H:%M:%S", time.localtime()), contract))
			# Read inside a context manager: the handle used to stay open
			# whenever compilation failed and the loop continued.
			with open(contractspath + contract, 'r', encoding='utf8') as rf:
				code = rf.read()
			# optimized flag, optimizer runs, compiler version
			opti, times, vers = addr2info[contract]
			try:
				res = cxt.call('compile', vers, code, opti == 'Yes', int(times))
			except Exception as err:
				mutexWriteFile('../data/usefullists/error_list', 'a', '[ERROR] @%s\n%s\n' % (contract, str(err)))
				continue
			with open(resultspath+contract, 'w', encoding='utf8') as wf:
				wf.write(ujson.dumps(res))

	# 获取编译所有的结果
	def getCompileResult(contractspath, versionlist, resultspath):
		"""Compile every contract with four worker processes running ``getCompileResult_``.

		Args:
			contractspath: Directory prefix of the contract sources.
			versionlist: Text file with one ``addr#optimized#runs#version`` record
				per line; blank lines are ignored.
			resultspath: Directory for the results; created (with parents) when missing.
		"""
		def reportWorkerError(err):
			# Without a callback an exception raised inside a worker is lost.
			print('[ERROR] worker failed: %r' % (err,))

		os.makedirs(resultspath, exist_ok=True)
		# Compiler version, optimisation flag and optimizer runs per contract.
		addr2info = {}
		with open(versionlist, 'r', encoding='utf-8') as vf:
			for line in vf:
				line = line.strip()
				if not line:
					continue
				addr, opti, times, vers = line.split('#')
				addr2info[addr] = (opti, times, vers)
		# Compilation is slow, so it is spread over several processes.
		pool = Pool(4)
		for i in range(4):
			pool.apply_async(getCompileResult_, args=(contractspath, addr2info, i, resultspath), error_callback=reportWorkerError)
		pool.close()
		pool.join()

	# 根据ABI获取所有函数个数
	def getFunctionNumber(contractpagespath):
		"""Print the name of every entry in ``contractpagespath``, one per line."""
		entries = os.listdir(contractpagespath)
		for entry in entries:
			print(entry)

	# 拿编译时错误的合约（合约编写问题
	def getCompileError(compileresultspath, outputpath, contractspath='../data/contracts_total/'):
		"""Collect the contracts whose stored compile result contains no sources.

		Each file in ``compileresultspath`` is parsed as JSON; when its
		``sources`` entry is missing or empty, the contract of the same name is
		copied from ``contractspath`` into ``outputpath`` and its name is printed.

		Args:
			compileresultspath: Directory with the JSON compile results.
			outputpath: Destination directory; created (with parents) when missing.
			contractspath: Directory the contract sources are copied from.
		"""
		os.makedirs(outputpath, exist_ok=True)
		for result in os.listdir(compileresultspath):
			with open(os.path.join(compileresultspath, result), 'r', encoding='utf8') as rf:
				res = ujson.loads(rf.read())
			# .get(): a result without a "sources" key is treated like an
			# empty one instead of aborting the scan with KeyError.
			if not res.get('sources'):
				shutil.copy(os.path.join(contractspath, result), os.path.join(outputpath, result))
				print(result)

	# 拿编译器错误的合约（调用编译器出错
	def getCompilerError(compileresultspath, contractsfile, outputpath, contractspath='../data/contracts_total/'):
		"""Collect the contracts for which no compile result file exists.

		Args:
			compileresultspath: Directory with one result file per compiled contract.
			contractsfile: Text file listing one contract name per line; blank
				lines are ignored.
			outputpath: Destination directory; created (with parents) when missing.
			contractspath: Directory the contract sources are copied from.
		"""
		os.makedirs(outputpath, exist_ok=True)
		compiled = set(os.listdir(compileresultspath))

		with open(contractsfile, 'r', encoding='utf8') as rf:
			for line in rf:
				name = line.strip()
				# A blank line would otherwise name the source directory itself.
				if not name or name in compiled:
					continue
				shutil.copy(os.path.join(contractspath, name), os.path.join(outputpath, name))

	# 通过pages里面的abi获取函数个数
	def getInlineFunctionByABI(inlinecontractspath, contractpagespath):
	# Print ABI entries of contracts listed in inlinecontractspath, reading each
	# contract's saved page (same file name) from contractpagespath.
	for contract in os.listdir(inlinecontractspath):
	print(contract)
	# NOTE(review): this handle is never closed.
	rf = open(contractpagespath+contract, 'r', encoding='utf8')
	# The page is JSON whose result[0]['ABI'] field is itself a JSON string,
	# hence the two decoding passes.
	abi = ujson.loads(ujson.loads(rf.read())['result'][0]['ABI'])
	for _ in abi:
	# print(_['type'], _['name'])
	print(_)
	# NOTE(review): the indentation of this break is not recoverable here; it
	# either stops after the first ABI entry or after the first contract - confirm.
	break

	# 通过bytecode获取opcode
	def getOpcode(bytecodespath, outputpath):
		"""Disassemble every bytecode file with ``evm disasm``.

		The standard output of ``evm disasm <file>`` is written to a file of the
		same name inside ``outputpath``. The exit status of ``evm`` is not checked.

		Args:
			bytecodespath: Directory containing the bytecode files.
			outputpath: Destination directory; created (with parents) when missing.

		Raises:
			FileNotFoundError: If the ``evm`` executable cannot be found.
		"""
		import subprocess

		os.makedirs(outputpath, exist_ok=True)
		for contract in os.listdir(bytecodespath):
			print('contract: %s' % contract)
			source = os.path.join(bytecodespath, contract)
			target = os.path.join(outputpath, contract)
			# Argument list plus explicit redirection: file names are never
			# interpreted by a shell.
			with open(target, 'wb') as out:
				subprocess.run(['evm', 'disasm', source], stdout=out)

	def main():
	# Script entry point. Only the getOpcode call at the end is active; every
	# other call below is commented out.
	# init()
	# getContractsContainInline('contracts/')
	# getInlineBytecode('../data/contracts_inline/', '../data/usefullists/full_info_list')
	# getInlineCode('../data/contracts_inline/', '../data/inline_code.txt')
	# isOptimal('inlinecontracts/', 'lists/full_info_list')
	# getOpcode('inline_code.txt')
	# getCompileResult('../data/contracts_total/', '../data/usefullists/full_info_list', '../data/compileresults_total/')
	# getCompileError('../data/compileresults_total/', '../data/contracts_total_compileerror/')
	# getCompilerError('../data/compileresults_total/', '../data/contracts_total.txt', '../data/contracts_total_compilererror/')
	# getCompileError('../data/compileresults_inline/', '../data/contracts_inline_compileerror/')
	# getCompilerError('../data/compileresults_inline/', '../data/contracts_inline.txt', '../data/contracts_inline_compilererror/')
	# getCompileResult('../data/contracts_compileerror/', '../data/usefullists/full_info_list', '../data/compileresults_compileerror/')
	# getInlineFunctionByABI('../data/contracts_inline/', '../data/contract_pages_total/')
	# getInlineCounterForEachContract('../data/contracts_inline/')
	# getInlineCounterForEachContract('../data/contracts_total/')
	# deleteSpaceLine('../data/contracts_total/')
	# deleteSpaceLine('../data/contracts_inline/')
	# getTotalCodeLine('../data/contracts_total/')
	# getTotalCodeLine('../data/contracts_inline/')
	getOpcode('../data/bytecodes_total/', '../data/opcodes_total/')

	# Run main() only when this file is executed as a script.
	if __name__ == "__main__":
	main()

	# 6319951116.{2}(.?){4}(.?)(?:617777f3\|f3)
	git add *
	git commit -m "提交一些东西0323"
	git push
	import os, execjs, shutil, re, json, sys
	from bs4 import BeautifulSoup
	from multiprocessing import Pool, Lock

	__author__ = 'Zhou.Liao'

	# Compiler versions that cannot be used
	ERRORSOLC = [
	'v0.3.4-nightly.2016.6.6+commit.e97ac4f.js',
	'v0.3.4-nightly.2016.6.8+commit.93790d.js',
	'v0.3.4-nightly.2016.6.8+commit.ccddd6f.js',
	'v0.3.4-nightly.2016.6.8+commit.d593166.js',
	'v0.3.6-nightly.2016.8.27+commit.91d4fa4.js',
	'v0.3.6-nightly.2016.8.29+commit.b8060c5.js',
	'v0.3.6-nightly.2016.8.30+commit.cf974fd.js',
	'v0.3.6-nightly.2016.8.31+commit.3ccd198.js',
	'v0.3.6-nightly.2016.9.1+commit.b5d941d.js',
	'v0.3.6-nightly.2016.9.2+commit.341c943.js',
	'v0.3.6-nightly.2016.9.5+commit.873d8bb.js',
	'v0.3.6-nightly.2016.9.6+commit.114502f.js',
	'v0.3.6-nightly.2016.9.7+commit.24524d6.js',
	'v0.3.6-nightly.2016.9.8+commit.f5a513a.js',
	'v0.4.1-nightly.2016.9.9+commit.79867f4.js'
	]

	# JavaScript context exposing compile(compiler), which requires 'solc' and
	# passes the named file from ./solc-bin-gh-pages/bin/ to solc.setupMethods.
	# testRequire calls it once for every file in that directory.
	testCxt = execjs.compile(
	'''
	function compile(compiler) {
	var solc = require('solc')
	solc = solc.setupMethods(require('./solc-bin-gh-pages/bin/' + compiler))
	}
	'''
	)
	# Module-level lock shared by everything decorated with mutexExec.
	# NOTE(review): if worker processes re-import this module instead of
	# inheriting it, each one gets its own Lock - confirm the start method.
	mutex = Lock()

	# Decorator: the wrapped function must run under mutual exclusion.
	def mutexExec(func):
		"""Return ``func`` wrapped so that every call holds the module ``mutex``.

		The lock is released even when ``func`` raises, positional and keyword
		arguments are forwarded unchanged, and the wrapped function's return
		value is passed back to the caller.

		Args:
			func: The callable to serialise.

		Returns:
			The wrapping function, carrying ``func``'s name and docstring.
		"""
		import functools

		@functools.wraps(func)
		def mutexFunc(*args, **kwargs):
			with mutex:
				return func(*args, **kwargs)
		return mutexFunc

	# 互斥写文件
	@mutexExec
	def mutexWriteFile(file, type, message):
		"""Write ``message`` to ``file`` (opened with mode ``type``) under the mutex.

		When the target is the file literally named ``sleep``, it is deleted once
		it holds more than 951116 lines.
		"""
		with open(file, type, encoding='utf8') as fd:
			fd.write(message)
		if file != 'sleep':
			return
		with open(file, 'r', encoding='utf8') as rf:
			line_total = sum(1 for _ in rf)
		if line_total > 951116:
			os.remove(file)

	# -5. 格式化版本
	def formatVersion(versionfile, solcbinpath='D:/solc-bin-gh-pages/bin/'):
		"""Rewrite ``versionfile`` so each record names an available soljson build.

		Records are ``contract#optimized#runs#version`` lines. A record whose
		``soljson-<version>.js`` exists in ``solcbinpath`` is kept unchanged.
		Otherwise the first build whose file name contains both the release
		prefix and the commit suffix of the version is substituted. Records
		without any matching build are dropped, as are blank lines.

		NOTE: a later zero-argument ``def formatVersion()`` in this file rebinds
		the name, so this definition is not the one reachable after import.

		Args:
			versionfile: File to normalise; it is overwritten in place.
			solcbinpath: Directory holding the ``soljson-*.js`` compiler builds.
		"""
		allversions = os.listdir(solcbinpath)
		available = set(allversions)
		versions, counter = [], 0
		with open(versionfile, 'r', encoding='utf-8') as rf:
			for line in rf:
				if not line.strip():
					continue
				contract, optimal, times, version = line.strip().split('#')
				if 'soljson-' + version + '.js' in available:
					versions.append(line)
					continue
				counter += 1
				print(version, counter)
				# The slicing below indexes version[6] and version[-8].
				if len(version) < 8:
					continue
				edition = version[:6] if version[6] in ['+', '-'] else version[:7]
				commit = version[-7:] if version[-8] in ['.', '-'] else version[-8:]
				for allversion in allversions:
					if edition in allversion and commit in allversion:
						# Strip the "soljson-" prefix and ".js" suffix.
						versions.append(contract + '#' + optimal + '#' + times + '#' + allversion[8:-3] + '\n')
						break
		with open(versionfile, 'w', encoding='utf-8') as wf:
			wf.writelines(versions)


	# -4. 编译失败的合约
	def getFailed(contractspath, astspath, outputfile='diff.txt'):
		"""Record the contracts that have no file of the same stem under ``astspath``.

		Both directory trees are walked recursively and files are compared by the
		part of their name before the first dot. Missing stems are appended to
		``outputfile``, one per line; the file is not touched when nothing is missing.

		Args:
			contractspath: Root directory of the contract sources.
			astspath: Root directory of the generated ASTs.
			outputfile: File the missing stems are appended to.

		Returns:
			list: The missing stems in walk order.
		"""
		def stems(path):
			return [name.split('.')[0] for _, _, names in os.walk(path) for name in names]

		compiled = set(stems(astspath))
		failed = [stem for stem in stems(contractspath) if stem not in compiled]
		if failed:
			with open(outputfile, 'a', encoding='utf-8') as af:
				af.writelines(stem + '\n' for stem in failed)
		return failed

	# -3. 删除编译头试试看
	def deletePragma(contractspath):
		"""Strip every line containing ``pragma solidity`` from all files under ``contractspath``.

		The tree is walked recursively and each file is rewritten in place.
		"""
		for root, _, files in os.walk(contractspath):
			for file in files:
				print(file)
				target = '%s/%s' % (root, file)
				with open(target, 'r', encoding='utf8') as rf:
					kept = [line for line in rf if 'pragma solidity' not in line]
				with open(target, 'w', encoding='utf8') as wf:
					wf.write(''.join(kept))

	# -2. 将转义符替换成有用字符
	def replaceTran(contractspath):
		"""Apply every ``TRANSFERWORD`` substitution to all files under ``contractspath``.

		Each key of ``TRANSFERWORD`` found in a file is replaced by its mapped
		value, in the mapping's iteration order; files are rewritten in place.
		"""
		for root, _, files in os.walk(contractspath):
			for file in files:
				print(file)
				target = '%s/%s' % (root, file)
				with open(target, 'r', encoding='utf8') as rf:
					code = rf.read()
				for escaped, plain in TRANSFERWORD.items():
					code = code.replace(escaped, plain)
				with open(target, 'w', encoding='utf8') as wf:
					wf.write(code)

	# -1. 删除空白行
	def deleteSpaceLine(contractspath):
		"""Drop whitespace-only lines from all files under ``contractspath``.

		The tree is walked recursively and each file is rewritten in place.
		"""
		for root, _, files in os.walk(contractspath):
			for file in files:
				print('\r%s' % file, end='')
				target = '%s/%s' % (root, file)
				with open(target, 'r', encoding='utf8') as rf:
					kept = [line for line in rf if line.strip()]
				with open(target, 'w', encoding='utf8') as wf:
					wf.write(''.join(kept))

	# 0. 删除注释
	def deleteComments(contractspath):
		"""Write a comment-free copy of every contract to a sibling directory.

		``// ...`` line comments and ``/* ... */`` block comments are removed
		with a regular expression, so comment markers inside string literals
		(for example ``"http://..."``) are removed too. The copies go to
		``<contractspath>/../without_comments_contracts/`` under the same file
		name; that directory is created when missing.

		Args:
			contractspath: Root directory of the contract sources (walked recursively).
		"""
		# Unescaped "|" separates the two comment forms; DOTALL lets a block
		# comment span several lines.
		pattern = re.compile(r'(//[^\n]+)|(/\*.+?\*/)', re.DOTALL)
		outputpath = os.path.normpath(os.path.join(contractspath, '..', 'without_comments_contracts'))
		os.makedirs(outputpath, exist_ok=True)
		for root, _, contracts in os.walk(contractspath):
			for contract in contracts:
				print('contract:', contract)
				# Read relative to the directory os.walk is visiting, so files in
				# sub-directories are found as well.
				with open(os.path.join(root, contract), 'r', encoding='utf8') as rf:
					code_without_comments = pattern.sub('', rf.read())

				with open(os.path.join(outputpath, contract), 'w', encoding='utf8') as wf:
					wf.write(code_without_comments)

	# 获取合约代码，编译版本，是否优化及优化次数
	def getInfo(page_path):
		"""Extract source code and compiler settings from one saved contract page.

		The page is parsed with BeautifulSoup. The contract source (element
		``#editor``) is written to ``contracts/<address>`` and a record
		``address#optimized#runs#version`` is appended to ``lists/full_info_list``.
		The table/cell indices used for version, optimisation flag and runs are
		fixed positions in the page layout - TODO confirm against current pages.

		Args:
			page_path: Path of the saved HTML page.
		"""
		print(page_path)
		with open(page_path, 'r', encoding='utf-8') as rf:
			soup = BeautifulSoup(rf.read(), 'html.parser')
		addr = soup.find(id='mainaddress').text.strip().lower()
		code = soup.find(id='editor').text
		tabs = soup.find_all(class_='table')
		vers = tabs[4].find_all('td')[3].text.strip()
		optm = tabs[5].find_all('td')[1].text.strip()
		runs = tabs[5].find_all('td')[3].text.strip()

		# Context managers guarantee both outputs are flushed and closed.
		with open('contracts/%s' % addr, 'w', encoding='utf-8') as cf:
			cf.write(code)
		with open('lists/full_info_list', 'a', encoding='utf-8') as df:
			df.write('%s#%s#%s#%s\n' % (addr, optm, runs, vers))

	def getAllInfo(start):
		"""Run ``getInfo`` on every page under ``pages/`` numbered ``start`` or higher.

		The page number is the integer before the first dot of the file name.
		"""
		for folder in os.listdir('pages/'):
			for page in os.listdir('pages/%s' % folder):
				page_number = int(page.split('.')[0])
				if page_number >= start:
					getInfo('pages/%s/%s' % (folder, page))

	# 获取合约调用次数
	def _parseCallTimes(html):
		"""Return the transaction count found in a contract page, or 0 when absent."""
		if 'from a total of' in html:
			found = re.findall(r"title='Click to view full list'>(.*?)</a>", html)
			# The marker text can occur without the link; count that as 0.
			return int(found[0].replace(',', '')) if found else 0
		found = re.findall(r'Latest (.*?) txn', html)
		return int(found[0]) if found else 0

	def getCallTimes(page_path, outputfile='calltimes.txt'):
		"""Extract the number of transactions of every saved contract page.

		One ``<page>#<count>`` line per page is appended to ``outputfile`` and a
		running total is printed.

		Args:
			page_path: Directory containing the saved pages.
			outputfile: File the per-page counts are appended to.

		Returns:
			int: The sum of all counts.
		"""
		totalTimes = 0
		for page in os.listdir(page_path):
			print(page, end=': ')
			with open(os.path.join(page_path, page), 'r', encoding='utf8') as rf:
				callTimes = _parseCallTimes(rf.read())
			totalTimes += callTimes
			with open(outputfile, 'a', encoding='utf8') as af:
				af.write('%s#%d\n' % (page, callTimes))
			print('call %d times! total times: %d' %(callTimes, totalTimes))
		return totalTimes

	def testRequire():
		"""Try to load every compiler build and copy the ones that fail.

		Each file in ``./solc-bin-gh-pages/bin/`` is passed to the JavaScript
		``compile`` function of ``testCxt``; when that call raises, the file is
		copied one level up into ``./solc-bin-gh-pages/``.
		"""
		bin_dir = './solc-bin-gh-pages/bin/'
		for compiler in os.listdir(bin_dir):
			print(compiler)
			try:
				testCxt.call('compile', compiler)
			except Exception:
				shutil.copy('./solc-bin-gh-pages/bin/%s' % compiler, './solc-bin-gh-pages/%s' % compiler)

	def formatVersion(infofile='../data/usefullists/full_info_list.bak', outputfile='../data/usefullists/full_info_list', solcbinpath='./solc-bin-gh-pages/bin/'):
		"""Map the compiler version of every record to an available soljson build.

		Records are lines whose last ``#``-separated field is the version. A
		record whose ``soljson-<version>.js`` exists in ``solcbinpath`` is copied
		unchanged. Otherwise one rewritten record is emitted for every build
		whose file name contains the release number and four characters of the
		commit id. Records with no match (or with a version lacking ``-``) are dropped.

		NOTE: this definition rebinds the name of the earlier
		``formatVersion(versionfile)`` in this file.
		NOTE(review): several builds can match one record and produce duplicate
		lines - confirm whether only the first match is wanted.

		Args:
			infofile: Input list.
			outputfile: Output list (overwritten).
			solcbinpath: Directory holding the ``soljson-*.js`` compiler builds.
		"""
		true_versions = os.listdir(solcbinpath)
		available = set(true_versions)
		infos = []
		with open(infofile, 'r', encoding='utf-8') as vf:
			for count, line in enumerate(vf):
				print(count)
				vers = line.strip().split('#')[-1]
				if 'soljson-%s.js' % vers in available:
					infos.append(line)
					continue
				# No build under that exact name: match on release and commit instead.
				editions = re.findall(r'v(.*?)-', vers)
				if not editions:
					continue
				edition, commit = editions[0], vers.split('-')[-1]
				for true_version in true_versions:
					if edition in true_version and commit[-5:-1] in true_version:
						# Strip the "soljson-" prefix and ".js" suffix.
						infos.append(line.replace(vers, true_version[8:-3]))
		with open(outputfile, 'w', encoding='utf8') as wf:
			wf.writelines(infos)

	def putContractsList(contractspath):
		"""Append the name of every entry in ``contractspath`` to ``contracts.txt``."""
		names = os.listdir(contractspath)
		for name in names:
			with open('contracts.txt', mode='a', encoding='utf8') as listing:
				listing.write('%s\n' % name)

	# 根据爬下来的网页获取创始者交易
	def getCreateTx(contractpagespath, outputfile='createTxs.txt'):
		"""Extract creator address and creation transaction from saved contract pages.

		For every page a ``<page>#<creator>#<tx hash>`` line is appended to
		``outputfile``. Pages without the creator markup are reported and skipped.

		Args:
			contractpagespath: Directory containing the saved pages.
			outputfile: File the records are appended to.
		"""
		# Lazy ".*?" groups: creator link text, anything up to the tx link, tx hash.
		pattern = re.compile(r"title='Creator Address'>(.*?)</a>.*?<a href='/tx/(.*?)' title='Creator Txn Hash'")
		with open(outputfile, 'a', encoding='utf8') as af:
			for contractpage in os.listdir(contractpagespath):
				print(contractpage)
				with open(os.path.join(contractpagespath, contractpage), 'r', encoding='utf8') as rf:
					found = pattern.search(rf.read())
				if found is None:
					print('[WARN] no creator information in %s' % contractpage)
					continue
				creator, creatTx = found.groups()
				af.write('%s#%s#%s\n' % (contractpage, creator, creatTx))

	# 根据爬下来的creator 和createTxs 界面输出这些个合约的创建时间以及创建区块
	def putContractsCreateTime(creatorspath, createTxspath, outputfile='creatorinfo.txt'):
		"""Classify each creator page as a plain address or a contract address.

		File names in ``creatorspath`` have the form ``<contract>#<creator>``.
		For every page a ``<contract>#<creator>#<address type>`` line is
		appended to ``outputfile``. A page carrying both markers stops the whole
		run; a page carrying neither is reported and skipped.

		Args:
			creatorspath: Directory containing the saved creator pages.
			createTxspath: Unused by the active code (only the disabled block below refers to it).
			outputfile: File the records are appended to.
		"""
		# for createTx in os.listdir(createTxspath):
		# print(createTx)
		# contract, txhash = createTx.split('#')
		# with open(createTxspath + createTx, 'r', encoding='utf8') as rf:
		# createNum, createTime = re.findall(r"<a href='/block/(.?)'>[\s\S]?mr-1'></i>(.?)\s?</div>", rf.read())[0]
		# with open('createinfo.txt', 'a', encoding='utf8') as af:
		# af.write('%s#%s#%s#%s\n' % (contract, txhash, createNum, createTime))
		for creator in os.listdir(creatorspath):
			print(creator)
			contract, creatorAddr = creator.split('#')
			with open(os.path.join(creatorspath, creator), 'r', encoding='utf8') as rf:
				html = rf.read()
			isPlain = 'The Address' in html
			isContract = 'Contract Address' in html
			if isPlain and isContract:
				print('/////////////////////////////%s' % creator)
				break
			if not (isPlain or isContract):
				# Without this guard addrtype was unbound, or silently reused
				# the value from the previous page.
				print('[WARN] no address type found in %s' % creator)
				continue
			addrtype = 'Contract Address' if isContract else 'The Address'
			with open(outputfile, 'a', encoding='utf8') as af:
				af.write('%s#%s#%s\n' % (contract, creatorAddr, addrtype))

	# 初始化一些信息
	def init():
		"""Create the ``lists``, ``logs`` and ``inlinebytecode`` folders and empty ``logs``."""
		for folder in ('lists', 'logs', 'inlinebytecode'):
			if not os.path.exists(folder):
				os.mkdir(folder)

		# Clean-up of lists/ is currently disabled:
		# for list_file in os.listdir('lists/'):
		# if 'info' not in list_file:
		# os.remove('lists/%s' % list_file)
		for stale_log in os.listdir('logs/'):
			os.remove('logs/%s' % stale_log)
		# Clean-up of inlinebytecode/ is currently disabled:
		# for log_file in os.listdir('inlinebytecode/'):
		# os.remove('inlinebytecode/%s' % log_file)

	# 输出所有代码的行数以计算比例
	def getTotalCodeLine(contractspath):
		"""Count the lines of all files in ``contractspath`` and print the total.

		Args:
			contractspath: Directory containing the contract sources.

		Returns:
			int: The total number of lines.
		"""
		totallines, contracts = 0, os.listdir(contractspath)
		total = len(contracts)
		# enumerate drives the progress display; the counter used to stay at 0.
		for counter, contract in enumerate(contracts, 1):
			with open(os.path.join(contractspath, contract), 'r', encoding='utf8') as rf:
				totallines += sum(1 for _ in rf)
			print('\r%s: %d/%d' % (contract, counter, total), end='')
		print('total line: %d' % totallines)
		return totallines

	def diff(dir1, dir2, diffspath='../diffs/', contractspath='../data/contracts_inline/', diffcontractspath='../diff_contracts/'):
		"""Collect the files present in ``dir2`` but missing from ``dir1``.

		``dir1`` is expected to be a subset of ``dir2``. Every file name found
		only in ``dir2`` is copied from ``dir2`` into ``diffspath``, and the
		contract of the same name is copied from ``contractspath`` into
		``diffcontractspath``. Both target directories are created when needed.

		Args:
			dir1: Directory holding the smaller set of files.
			dir2: Directory holding the larger set of files.
			diffspath: Destination for the extra files of ``dir2``.
			contractspath: Directory the matching contract sources are read from.
			diffcontractspath: Destination for those contract sources.

		Returns:
			list: Names found only in ``dir2``, in listing order.
		"""
		known = set(os.listdir(dir1))
		diffs = [name for name in os.listdir(dir2) if name not in known]
		if diffs:
			os.makedirs(diffspath, exist_ok=True)
			os.makedirs(diffcontractspath, exist_ok=True)
		for name in diffs:
			shutil.copy(os.path.join(dir2, name), os.path.join(diffspath, name))
			shutil.copy(os.path.join(contractspath, name), os.path.join(diffcontractspath, name))
		print(len(diffs), diffs)
		return diffs

	def getSourceCode(contractspath):
		"""Extract the Solidity source from saved JSON responses.

		Every file in ``contractspath`` is parsed as JSON and its
		``result[0]['SourceCode']`` string is written to ``contracts/<file name>``
		in the current working directory (the folder is created when missing).

		Args:
			contractspath: Directory containing the JSON responses.
		"""
		os.makedirs('contracts', exist_ok=True)
		for contract in os.listdir(contractspath):
			print(contract)
			# Context manager: the handle is closed even when parsing fails.
			with open(os.path.join(contractspath, contract), 'r', encoding='utf8') as rf:
				res = json.load(rf)
			with open('contracts/%s' % contract, 'w', encoding='utf8') as wf:
				wf.write(res['result'][0]['SourceCode'])

	# Script entry point: only the diff call is active; the other calls below
	# are commented out.
	if __name__ == '__main__':
	# getAllInfo(16842)
	# testRequire()
	# formatVersion()
	# putContractsList('inlinecontracts/')
	# getCreateTx('contract_pages/')
	# putContractsCreateTime('creator_pages/', 'createTx_pages/')
	# getCallTimes('contract_pages/')
	# getTotalCodeLine('inlinecontracts/')
	diff('../data/inlinebytecode', '../data/compileresults_inline')
	# getSourceCode('getcode/contract_pages/')