@Anderssorby
Last active November 16, 2017 01:09
Neural network using numpy on binary data
import numpy as np
from itertools import product
from timeit import default_timer as timer
import matplotlib.pyplot as plt
# Deterministic
np.random.seed(1234)
# Data
dim = 4
data_size = 2**dim
X = np.empty((data_size,dim))
Y = np.empty(data_size)
for n, x in enumerate(product(*([[0, 1]] * dim))):
    target = int(sum(x) == 2)  # the function we want to predict
    X[n] = x
    Y[n] = target
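# With dim = 4 this enumerates all 2**4 = 16 binary vectors; exactly
# C(4, 2) = 6 of them have two ones, so Y contains six positive targets.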
# Training data
training_size = data_size  # <= data_size
target1 = np.array(Y[:training_size])
input1 = np.array(X[:training_size])
# Model parameters
num_epochs = 100000
echos = 10
echo_freq = min(num_epochs // echos, 5000)
etha = 0.1 # learning rate
precision = 1e-4 # Stopping criterion
# Some activation functions and their derivatives
def sigmoid(a):
    return 1 / (1 + np.exp(-a))

def sigmoid_prime(a):
    return sigmoid(a) * (1 - sigmoid(a))

def id(z):
    return z

def id_prime(z):
    return 1

def ReLU(a):
    # Element-wise max(a, 0)
    return np.maximum(a, 0)

def ReLU_prime(a):
    # 1 where the input is positive, 0 otherwise
    return (a > 0).astype(float)
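# Quick sanity checks for the activations above (illustrative values only):
# sigmoid(0) = 0.5 and sigmoid_prime(0) = 0.25, while
# ReLU(np.array([-1.0, 2.0])) gives [0.0, 2.0] and ReLU_prime the mask [0.0, 1.0].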
# Activation function
h = sigmoid
h_prime = sigmoid_prime
activation_functions = [sigmoid, sigmoid, id]
act_prime = [sigmoid_prime, sigmoid_prime, id_prime]
# the output dimension of each layer
# including the input and output layer
layer_shape = [dim, 5, 10, 1]
num_layers = len(layer_shape) - 1
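# With layer_shape = [4, 5, 10, 1] the three weight matrices below get the
# shapes (5, 4), (10, 5) and (1, 10), and the bias vectors the shapes (5,),
# (10,) and (1,).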
# Initializing layers
layers = []
biases = []
for l in range(num_layers):
    layers.append(np.random.uniform(size=(layer_shape[l+1], layer_shape[l])))
    biases.append(np.random.uniform(size=layer_shape[l+1]))
def compute(inp):
    output = [inp]
    a = inp
    for l in range(num_layers):
        # For each layer apply the weights and bias, then the activation function h
        h = activation_functions[l]
        w = layers[l]
        z = np.dot(w, a) + biases[l]
        output.append(z)
        a = h(z)
    return output, a
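# Illustrative use of compute(): `out, result = compute(X[0])` returns the list
# [input, z_1, z_2, z_3] holding the input followed by each layer's
# pre-activation, plus the final network output `result`, here of shape (1,).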
# Training, i.e. minimize the error function with SGD
start = timer()
errors = []
wprev = layers.copy()
bprev = biases.copy()
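# The updates in the loop below are standard backpropagation for the squared
# error E = 1/2 * ||result - target||**2 with per-layer activations f_l:
#   delta_L   = (result - target) * f'_{L-1}(z_L)
#   dE/dW_l   = outer(delta_{l+1}, a_l),  dE/db_l = delta_{l+1}
#   delta_l   = W_l.T @ delta_{l+1} * f'_{l-1}(z_l)
# where a_l is the input activation of layer l and z_l its incoming
# pre-activation; every step is scaled by the learning rate etha.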
for epoch in range(num_epochs):
    shuffled = np.arange(training_size)
    np.random.shuffle(shuffled)
    sumerr = 0
    for t in shuffled:
        # Calculate the output for each layer
        output, result = compute(input1[t])
        # The sign is important
        error = result - target1[t]
        # Using the l2-norm
        sumerr += np.sqrt(sum(np.square(error)))
        # Backpropagation
        # Update the weights: w <- w - etha * dE(w)/dw
        # Start from the output error scaled by the learning rate and the
        # derivative of the output activation (identity here)
        delta = etha * error * act_prime[-1](output[-1])
        for l in reversed(range(num_layers)):
            w = layers[l]
            z = output[l]
            b = biases[l]
            # Input to layer l: the raw input for l == 0, otherwise the
            # previous layer's activation applied to its pre-activation
            a_in = z if l == 0 else activation_functions[l - 1](z)
            layers[l] = w - np.outer(delta, a_in)
            biases[l] = b - delta
            if l > 0:
                # Propagate the error backwards through the pre-update weights
                delta = w.T.dot(delta) * act_prime[l - 1](z)
    sumerr = sumerr / training_size
    errors.append(sumerr)
    # if len(errors) > 2 and errors[-1] - errors[-2] < 1e-5:
    #     print(f'\nStopping at epoch {epoch} with error {sumerr}')
    #     break
    if sumerr < precision:
        print(f'\nStopping at epoch {epoch} with error {sumerr}')
        break
    if epoch % echo_freq == 0:
        print(f'Epoch {epoch} - Current error: {sumerr}')
        diff_w = [np.sum(layers[l] - wprev[l]) for l in range(num_layers)]
        diff_b = [np.sum(biases[l] - bprev[l]) for l in range(num_layers)]
        diff = np.linalg.norm(diff_w) + np.linalg.norm(diff_b)
        print(f'Difference {diff}')
        wprev = layers.copy()
        bprev = biases.copy()
end_training = timer()
print(f'\nTraining took {end_training-start}s\n')
# Plotting convergence
plt.plot(errors)
plt.show()
# Testing
for i in range(data_size):
    output, result = compute(X[i])
    test_error = np.sum(np.abs(result - Y[i]))
    if i >= training_size:
        print("--- Test data ---")
    print(f'Test f({X[i]}) = {result}, target = {Y[i]}, error = {test_error}')
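# Optional illustrative sketch: a compact accuracy check, thresholding the
# scalar network output at 0.5 and comparing against the targets Y.
# predictions = np.array([compute(x)[1][0] > 0.5 for x in X])
# print(f'Accuracy: {np.mean(predictions == Y)}')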