cneill · August 23, 2020 08:54
diff --git a/ben.go b/ben.go
 package main

 import (
 	"fmt"
 	"io/ioutil"
 	"log"
 	"math"
 	"os"
 	"strconv"
 	"strings"
 )

 /*
 As an example dataset, you can collect unconfirmed bitcoin transactions (quoted in USD) with the following shell command:

 cat btc_transactions.txt <(curl https://www.blockchain.com/btc/unconfirmed-transactions | grep -oE '>\$[0-9\.,]+<' | tr -d ',$<>' | grep -vE '^0') | sort | uniq > btc_transactions2.txt && mv btc_transactions2.txt btc_transactions.txt && cat btc_transactions.txt | wc -l

 And if you want some random numbers, courtesy of Python:

 echo -e "import random\n\nfor i in range(0, 500000):\n\tprint(random.random() * 100000000)" | python > random.txt

 NOTE: this may include other prices? I don't know, I'm lazy
 */

 // errh is the program's catch-all error handler: a nil error is a no-op, any
 // other error is logged through log.Fatalf, which terminates the process.
 func errh(err error) {
 	if err == nil {
 		return
 	}
 	log.Fatalf("%v", err)
 }

 // getNumbers parses contents as newline-separated numbers and returns them in
 // input order. Every line is whitespace-trimmed first; blank lines are skipped,
 // lines rejected by strconv.ParseFloat are reported on stdout and skipped, and
 // values equal to zero are dropped.
 func getNumbers(contents []byte) []float64 {
 	parsedValues := make([]float64, 0)

 	for _, rawLine := range strings.Split(string(contents), "\n") {
 		text := strings.TrimSpace(rawLine)
 		if len(text) == 0 {
 			continue
 		}

 		value, parseErr := strconv.ParseFloat(text, 64)
 		switch {
 		case parseErr != nil:
 			fmt.Printf("Failed to parse %s: %v\n", text, parseErr)
 		case value != 0.0:
 			parsedValues = append(parsedValues, value)
 		}
 	}

 	return parsedValues
 }

 // getDistribution counts how often each digit 1-9 occurs as the first
 // significant digit of numbers. Index 0 of the result holds the count for '1'
 // and index 8 the count for '9'.
 //
 // The digit is read from the magnitude in scientific notation, so the sign is
 // ignored and values below 1 are classified by their first non-zero digit
 // (0.05 counts as 5). Zero, NaN and ±Inf have no leading digit; they are
 // reported on stdout and skipped instead of aborting the program.
 func getDistribution(numbers []float64) []int64 {
 	buckets := make([]int64, 9)

 	for _, number := range numbers {
 		// Shortest round-trip 'e' formatting always starts with the first
 		// significant digit ("1.234e+02", "5e-02"). Unlike "%f" it never
 		// rounds 9.9999999 up to "10.000000".
 		lead := strconv.FormatFloat(math.Abs(number), 'e', -1, 64)[0]
 		if lead < '1' || lead > '9' {
 			// Zero formats as "0e+00", NaN as "NaN" and infinity as "+Inf".
 			fmt.Printf("Invalid number: %f\n", number)
 			continue
 		}
 		buckets[lead-'1']++
 	}

 	return buckets
 }

 // getPercentageDistribution converts per-digit counts into fractions of the
 // total count (fractions, not values multiplied by 100). The result always has
 // nine entries, so distribution must not hold more than nine counts.
 //
 // When the counts sum to zero (for example an empty data set) every fraction
 // is reported as 0 rather than the NaN that 0/0 would produce.
 func getPercentageDistribution(distribution []int64) []float64 {
 	buckets := make([]float64, 9)

 	var totalNum int64
 	for _, num := range distribution {
 		totalNum += num
 	}
 	if totalNum == 0 {
 		return buckets
 	}

 	total := float64(totalNum)
 	for i, num := range distribution {
 		buckets[i] = float64(num) / total
 	}

 	return buckets
 }

 /*
 Simon Newcomb was the original discoverer of "Benford's Law", in his paper "Note on the Frequency of Use of the
 Different Digits in Natural Numbers": https://www.jstor.org/stable/2369148?seq=1#metadata_info_tab_contents

 From the paper, here were Newcomb's probabilities of each leading digit:
 1: 0.3010
 2: 0.1761
 3: 0.1249
 4: 0.0969
 5: 0.0792
 6: 0.0669
 7: 0.0580
 8: 0.0512
 9: 0.0458
 */

 // compareNewcomb returns, for each leading digit d = 1..9, the observed share
 // minus the share log10((d+1)/d) given by Newcomb's formula. A positive entry
 // means the digit shows up more often than predicted. The result always has
 // nine entries; percentages[i] is taken as the share of digit i+1.
 func compareNewcomb(percentages []float64) []float64 {
 	deltas := make([]float64, 9)

 	for idx, observed := range percentages {
 		digit := float64(idx + 1)
 		deltas[idx] = observed - math.Log10((digit+1)/digit)
 	}

 	return deltas
 }

 // compareRandom returns each observed share minus 0.1111111111111, a truncated
 // 1/9: the share every digit 1-9 would hold if leading digits were spread
 // evenly. The result always has nine entries.
 func compareRandom(percentages []float64) []float64 {
 	const uniformShare = 0.1111111111111

 	deltas := make([]float64, 9)
 	for idx, observed := range percentages {
 		deltas[idx] = observed - uniformShare
 	}

 	return deltas
 }

 // isRandom reports whether percentages look like an even spread of leading
 // digits: every deviation returned by compareRandom must lie within 0.001 of
 // zero.
 //
 // The deviation is checked in both directions, so an under-represented digit
 // fails the check just like an over-represented one. A NaN deviation (as
 // produced by shares computed from an empty data set) is treated as "not
 // random", because NaN compares false against any threshold and would
 // otherwise slip through.
 func isRandom(percentages []float64) bool {
 	const tolerance = 0.001

 	for _, deviation := range compareRandom(percentages) {
 		if math.IsNaN(deviation) || math.Abs(deviation) > tolerance {
 			return false
 		}
 	}

 	return true
 }

 // main reads the file named by the first command-line argument and prints, in
 // order: the leading-digit counts, their shares, the deviation of those shares
 // from Newcomb's formula, the deviation from an even spread, and the isRandom
 // verdict. It exits through log.Fatalf when no argument is supplied and
 // through errh when the file cannot be read.
 func main() {
 	args := os.Args
 	if len(args) < 2 {
 		log.Fatalf("Must supply a file name to parse")
 	}

 	raw, err := ioutil.ReadFile(args[1])
 	errh(err)

 	counts := getDistribution(getNumbers(raw))
 	fmt.Printf("Distribution: %#v\n", counts)

 	shares := getPercentageDistribution(counts)
 	fmt.Printf("Percentages: %#v\n", shares)

 	fmt.Printf("Comparison to Newcomb: %#v\n", compareNewcomb(shares))
 	fmt.Printf("Comparison to random: %#v\n", compareRandom(shares))
 	fmt.Printf("Is distribution random? %t\n", isRandom(shares))
 }
	package main

	import (
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"strconv"
	"strings"
	)

	/*
	As an example dataset, you can collect unconfirmed bitcoin transactions (quoted in USD) with the following shell command:

	cat btc_transactions.txt <(curl https://www.blockchain.com/btc/unconfirmed-transactions | grep -oE '>\$[0-9\.,]+<' | tr -d ',$<>' | grep -vE '^0') | sort | uniq > btc_transactions2.txt && mv btc_transactions2.txt btc_transactions.txt && cat btc_transactions.txt | wc -l

	And if you want some random numbers, courtesy of Python:

	echo -e "import random\n\nfor i in range(0, 500000):\n\tprint(random.random() * 100000000)" | python > random.txt

	NOTE: this may include other prices? I don't know, I'm lazy
	*/

	// errh is the program's catch-all error handler: a nil error is a no-op, any
	// other error is logged through log.Fatalf, which terminates the process.
	func errh(err error) {
		if err == nil {
			return
		}
		log.Fatalf("%v", err)
	}

	// getNumbers parses contents as newline-separated numbers and returns them in
	// input order. Every line is whitespace-trimmed first; blank lines are skipped,
	// lines rejected by strconv.ParseFloat are reported on stdout and skipped, and
	// values equal to zero are dropped.
	func getNumbers(contents []byte) []float64 {
		parsedValues := make([]float64, 0)

		for _, rawLine := range strings.Split(string(contents), "\n") {
			text := strings.TrimSpace(rawLine)
			if len(text) == 0 {
				continue
			}

			value, parseErr := strconv.ParseFloat(text, 64)
			switch {
			case parseErr != nil:
				fmt.Printf("Failed to parse %s: %v\n", text, parseErr)
			case value != 0.0:
				parsedValues = append(parsedValues, value)
			}
		}

		return parsedValues
	}

	// getDistribution counts how often each digit 1-9 occurs as the first
	// significant digit of numbers. Index 0 of the result holds the count for '1'
	// and index 8 the count for '9'.
	//
	// The digit is read from the magnitude in scientific notation, so the sign is
	// ignored and values below 1 are classified by their first non-zero digit
	// (0.05 counts as 5). Zero, NaN and ±Inf have no leading digit; they are
	// reported on stdout and skipped instead of aborting the program.
	func getDistribution(numbers []float64) []int64 {
		buckets := make([]int64, 9)

		for _, number := range numbers {
			// Shortest round-trip 'e' formatting always starts with the first
			// significant digit ("1.234e+02", "5e-02"). Unlike "%f" it never
			// rounds 9.9999999 up to "10.000000".
			lead := strconv.FormatFloat(math.Abs(number), 'e', -1, 64)[0]
			if lead < '1' || lead > '9' {
				// Zero formats as "0e+00", NaN as "NaN" and infinity as "+Inf".
				fmt.Printf("Invalid number: %f\n", number)
				continue
			}
			buckets[lead-'1']++
		}

		return buckets
	}

	// getPercentageDistribution converts per-digit counts into fractions of the
	// total count (fractions, not values multiplied by 100). The result always has
	// nine entries, so distribution must not hold more than nine counts.
	//
	// When the counts sum to zero (for example an empty data set) every fraction
	// is reported as 0 rather than the NaN that 0/0 would produce.
	func getPercentageDistribution(distribution []int64) []float64 {
		buckets := make([]float64, 9)

		var totalNum int64
		for _, num := range distribution {
			totalNum += num
		}
		if totalNum == 0 {
			return buckets
		}

		total := float64(totalNum)
		for i, num := range distribution {
			buckets[i] = float64(num) / total
		}

		return buckets
	}

	/*
	Simon Newcomb was the original discoverer of "Benford's Law", in his paper "Note on the Frequency of Use of the
	Different Digits in Natural Numbers": https://www.jstor.org/stable/2369148?seq=1#metadata_info_tab_contents

	From the paper, here were Newcomb's probabilities of each leading digit:
	1: 0.3010
	2: 0.1761
	3: 0.1249
	4: 0.0969
	5: 0.0792
	6: 0.0669
	7: 0.0580
	8: 0.0512
	9: 0.0458
	*/

	// compareNewcomb returns, for each leading digit d = 1..9, the observed share
	// minus the share log10((d+1)/d) given by Newcomb's formula. A positive entry
	// means the digit shows up more often than predicted. The result always has
	// nine entries; percentages[i] is taken as the share of digit i+1.
	func compareNewcomb(percentages []float64) []float64 {
		deltas := make([]float64, 9)

		for idx, observed := range percentages {
			digit := float64(idx + 1)
			deltas[idx] = observed - math.Log10((digit+1)/digit)
		}

		return deltas
	}

	// compareRandom returns each observed share minus 0.1111111111111, a truncated
	// 1/9: the share every digit 1-9 would hold if leading digits were spread
	// evenly. The result always has nine entries.
	func compareRandom(percentages []float64) []float64 {
		const uniformShare = 0.1111111111111

		deltas := make([]float64, 9)
		for idx, observed := range percentages {
			deltas[idx] = observed - uniformShare
		}

		return deltas
	}

	// isRandom reports whether percentages look like an even spread of leading
	// digits: every deviation returned by compareRandom must lie within 0.001 of
	// zero.
	//
	// The deviation is checked in both directions, so an under-represented digit
	// fails the check just like an over-represented one. A NaN deviation (as
	// produced by shares computed from an empty data set) is treated as "not
	// random", because NaN compares false against any threshold and would
	// otherwise slip through.
	func isRandom(percentages []float64) bool {
		const tolerance = 0.001

		for _, deviation := range compareRandom(percentages) {
			if math.IsNaN(deviation) || math.Abs(deviation) > tolerance {
				return false
			}
		}

		return true
	}

	// main reads the file named by the first command-line argument and prints, in
	// order: the leading-digit counts, their shares, the deviation of those shares
	// from Newcomb's formula, the deviation from an even spread, and the isRandom
	// verdict. It exits through log.Fatalf when no argument is supplied and
	// through errh when the file cannot be read.
	func main() {
		args := os.Args
		if len(args) < 2 {
			log.Fatalf("Must supply a file name to parse")
		}

		raw, err := ioutil.ReadFile(args[1])
		errh(err)

		counts := getDistribution(getNumbers(raw))
		fmt.Printf("Distribution: %#v\n", counts)

		shares := getPercentageDistribution(counts)
		fmt.Printf("Percentages: %#v\n", shares)

		fmt.Printf("Comparison to Newcomb: %#v\n", compareNewcomb(shares))
		fmt.Printf("Comparison to random: %#v\n", compareRandom(shares))
		fmt.Printf("Is distribution random? %t\n", isRandom(shares))
	}