Last active
August 23, 2020 08:54
-
-
Save cneill/8869c9d4a56440ea4a0bdd817abbe73b to your computer and use it in GitHub Desktop.
Analyze a set of numbers for "Benfordness"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io/ioutil" | |
"log" | |
"math" | |
"os" | |
"strconv" | |
"strings" | |
) | |
/*
As an example dataset, you can collect unconfirmed bitcoin transactions (quoted in USD) with the following shell command:
cat btc_transactions.txt <(curl https://www.blockchain.com/btc/unconfirmed-transactions | grep -oE '>\$[0-9\.,]+<' | tr -d ',$<>' | grep -vE '^0') | sort | uniq > btc_transactions2.txt && mv btc_transactions2.txt btc_transactions.txt && cat btc_transactions.txt | wc -l
And if you want some random numbers for comparison, courtesy of Python:
echo -e "import random\n\nfor i in range(0, 500000):\n\tprint(random.random() * 100000000)" | python > random.txt
NOTE: the bitcoin scrape may also pick up other dollar amounts shown on the page, not only transaction values; this has not been verified.
*/
// errh is the program's fatal-error helper: a nil err is a no-op, anything
// else is logged through log.Fatalf, which terminates the process.
func errh(err error) {
	if err == nil {
		return
	}
	log.Fatalf("%v", err)
}
// getNumbers parses contents as newline-separated decimal numbers and returns
// the values that can take part in a leading-digit analysis.
//
// Each line is trimmed of surrounding whitespace. The following lines are
// skipped:
//   - blank lines (silently);
//   - lines strconv.ParseFloat rejects (reported on stdout);
//   - NaN and +/-Inf, which ParseFloat accepts as literals but which have no
//     leading digit (reported on stdout);
//   - values equal to zero (silently).
//
// Non-zero values whose text starts with '0' (for example 0.5) are NOT removed
// here. The returned slice is never nil.
func getNumbers(contents []byte) []float64 {
	results := []float64{}
	for _, line := range strings.Split(string(contents), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parsed, err := strconv.ParseFloat(line, 64)
		switch {
		case err != nil:
			fmt.Printf("Failed to parse %s: %v\n", line, err)
		case math.IsNaN(parsed) || math.IsInf(parsed, 0):
			// "NaN", "Inf", "infinity" parse without error; keep them out of
			// the dataset instead of letting them break later stages.
			fmt.Printf("Skipping non-finite value %s\n", line)
		case parsed == 0.0:
			// Zero has no leading non-zero digit; drop it quietly.
		default:
			results = append(results, parsed)
		}
	}
	return results
}
// getDistribution counts how often each leading digit 1-9 occurs in numbers.
//
// The result always has nine entries; index 0 holds the count for digit '1'
// and index 8 the count for digit '9'.
//
// The leading digit is the first character of the "%f" rendering of the
// number's magnitude, so the sign is ignored (-5 counts as a 5). Any value
// whose rendering does not start with '1'-'9' (magnitudes that print with a
// leading '0', NaN, +/-Inf) is reported on stdout as invalid and skipped; it
// never aborts the program.
func getDistribution(numbers []float64) []int64 {
	buckets := make([]int64, 9)
	for _, number := range numbers {
		// Sprintf("%f") never yields an empty string, so indexing byte 0 is safe.
		lead := fmt.Sprintf("%f", math.Abs(number))[0]
		if lead < '1' || lead > '9' {
			fmt.Printf("Invalid number: %f\n", number)
			continue
		}
		buckets[lead-'1']++
	}
	return buckets
}
// getPercentageDistribution converts per-digit counts into each digit's share
// of the total.
//
// Despite the name, the values are fractions in [0, 1], not percentages in
// [0, 100]. The result always has nine entries, index-aligned with
// distribution (which is expected to hold at most nine counts).
//
// When the counts sum to zero (for example no valid numbers were read) every
// share is 0 rather than the NaN that 0/0 would produce.
func getPercentageDistribution(distribution []int64) []float64 {
	buckets := make([]float64, 9)

	var totalNum int64
	for _, num := range distribution {
		totalNum += num
	}
	if totalNum == 0 {
		// Nothing was counted; avoid dividing by zero.
		return buckets
	}

	total := float64(totalNum)
	for i, num := range distribution {
		buckets[i] = float64(num) / total
	}
	return buckets
}
/*
compareNewcomb reports, per leading digit, the observed share minus the share
predicted by Benford's Law.

Simon Newcomb was the original discoverer of "Benford's Law", in his paper "Note on the Frequency of Use of the
Different Digits in Natural Numbers": https://www.jstor.org/stable/2369148?seq=1#metadata_info_tab_contents
From the paper, here were Newcomb's probabilities of each leading digit:
1: 0.3010
2: 0.1761
3: 0.1249
4: 0.0969
5: 0.0792
6: 0.0669
7: 0.0580
8: 0.0512
9: 0.0458

The expected share for digit d is computed as log10((d+1)/d) rather than read
from the table above. Index 0 corresponds to digit 1. The result always has
nine entries; percentages is expected to hold at most nine values.
*/
func compareNewcomb(percentages []float64) []float64 {
	deviations := make([]float64, 9)
	for idx, observed := range percentages {
		digit := float64(idx + 1)
		deviations[idx] = observed - math.Log10((digit+1)/digit)
	}
	return deviations
}
// compareRandom reports, per leading digit, the observed share minus the
// share expected if all nine leading digits were equally likely
// (0.1111111111111, i.e. roughly 1/9). Index 0 corresponds to digit 1. The
// result always has nine entries; percentages is expected to hold at most
// nine values.
func compareRandom(percentages []float64) []float64 {
	var buckets = make([]float64, 9)
	for i := range percentages {
		buckets[i] = percentages[i] - 0.1111111111111
	}
	return buckets
}

// isRandom reports whether percentages looks like a uniform leading-digit
// distribution: every bucket must lie within 0.001 of the uniform share, in
// either direction.
//
// A NaN deviation can never be "within tolerance", so NaN input yields false
// instead of being mistaken for a uniform distribution.
func isRandom(percentages []float64) bool {
	for _, deviation := range compareRandom(percentages) {
		// The NaN check is explicit because every ordered comparison with NaN
		// is false and would otherwise let it slip through.
		if math.IsNaN(deviation) || math.Abs(deviation) > 0.001 {
			return false
		}
	}
	return true
}
func main() { | |
if len(os.Args) < 2 { | |
log.Fatalf("Must supply a file name to parse") | |
} | |
filename := os.Args[1] | |
fileContents, err := ioutil.ReadFile(filename) | |
errh(err) | |
numbers := getNumbers(fileContents) | |
dist := getDistribution(numbers) | |
fmt.Printf("Distribution: %#v\n", dist) | |
percentages := getPercentageDistribution(dist) | |
fmt.Printf("Percentages: %#v\n", percentages) | |
newcombComparison := compareNewcomb(percentages) | |
fmt.Printf("Comparison to Newcomb: %#v\n", newcombComparison) | |
randomComparison := compareRandom(percentages) | |
fmt.Printf("Comparison to random: %#v\n", randomComparison) | |
fmt.Printf("Is distribution random? %t\n", isRandom(percentages)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment