Last active
July 16, 2016 14:35
-
-
Save brezniczky/2bc925b05d9321bf5bef to your computer and use it in GitHub Desktop.
Blogger code #4 (A more efficient filter, full code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
file.size.bytes = 1024 * 256 | |
default.window.width = 256 | |
# symbols in the range [0:(symbol.count - 1)] are expected | |
symbol.count = 256 | |
set.seed(0) | |
test.data.bytes = | |
sample(x = 0:(symbol.count - 1), replace = TRUE, size = file.size.bytes) | |
# The below is a slightly modified version of the example code at | |
# http://shape-of-code.coding-guidelines.com/2013/07/21/unique-bytes-in-a-sliding-window-as-a-file-content-signature/ | |
# | |
# The code published by Derek Jones takes a 5x less frequent approach. | |
origi.count = function(t, window_width = default.window.width) { | |
if (length(t) < window_width) { | |
return(rep(0, 0)) | |
} | |
# 5x denser than the original one | |
cnt_points = 1:(length(t) - window_width + 1) | |
# here's the original bit | |
u=sapply(cnt_points, function(X) length(unique(t[X:(X+window_width)]))) | |
} | |
count.positives = function(x) { | |
return(sum(x > 0)) | |
} | |
get.initial.freqs = function(x, window.width) { | |
frequency = rep(0, symbol.count) | |
for(i in 1:window.width) { | |
frequency[x[i] + 1] = frequency[x[i] + 1] + 1 | |
} | |
return(frequency) | |
} | |
quicker.count = function(x, window.width = default.window.width) { | |
if (length(x) < window.width) { | |
return(rep(0, 0)) | |
} | |
n.positions = length(x) - window.width + 1 | |
# initialize counts over the first window position | |
frequency = get.initial.freqs(x, window.width) | |
ans = rep(0, n.positions) | |
ans[1] = count.positives(frequency) | |
# start and do the counting | |
for(i in 2:n.positions) { | |
# the value leaving the window is the first item in the | |
# previous window, i.e. data[i - 1] | |
old.data.idx = x[i - 1] + 1 | |
frequency[old.data.idx] = frequency[old.data.idx] - 1 | |
# the value entering the window is the last item in the | |
# current window | |
new.data.idx = x[i + window.width - 1] + 1 | |
frequency[new.data.idx] = frequency[new.data.idx] + 1 | |
ans[i] = count.positives(frequency) | |
} | |
return(ans) | |
} | |
quicker.count.2 = function(x, window.width = default.window.width) { | |
if (length(x) < window.width) { | |
return(rep(0, 0)) | |
} | |
n.positions = length(x) - window.width + 1 | |
# find out the frequencies & count of unique symbols at the first position | |
frequency = get.initial.freqs(x, window.width) | |
ans = rep(0, n.positions) | |
ans[1] = count.positives(frequency) | |
count = ans[1] | |
for(i in 2:n.positions) { | |
old.data.idx = x[i - 1] + 1 | |
# the value leaving the window is the first item in the | |
# previous window | |
frequency[old.data.idx] = frequency[old.data.idx] - 1 | |
if (frequency[old.data.idx] == 0) { | |
# maintain the count value: symbol no longer present in the window | |
count = count - 1 | |
} | |
# the value entering the window is the last item in the | |
# current window | |
new.data.idx = x[i + window.width - 1] + 1 | |
frequency[new.data.idx] = frequency[new.data.idx] + 1 | |
if (frequency[new.data.idx] == 1) { | |
# maintain the count value: symbol just detected in the window | |
count = count + 1 | |
} | |
ans[i] = count | |
} | |
return(ans) | |
} | |
wait2 = function() { | |
date1 = date() | |
while (date1 == date()) { | |
} | |
date2 = date() | |
while (date2 == date()) { | |
} | |
} | |
# allow the CPU to get up to speed using a busy wait of 1..2 seconds | |
wait2() | |
t1 = system.time({ | |
counts1 = origi.count(test.data.bytes, default.window.width - 1) | |
}) | |
t2 = system.time({ | |
counts2 = quicker.count(test.data.bytes) | |
}) | |
t3 = system.time({ | |
counts3 = quicker.count.2(test.data.bytes) | |
}) | |
# could use for verification: | |
# | |
# print(head(cbind(head(counts1, length(counts2)), counts2, counts3))) | |
times = rbind(t1, t2, t3)[, "elapsed"] | |
names(times) <- c("Original", "First imp.", "Second imp.") | |
print(times) | |
barplot(height = times, col = "darkblue", | |
ylab = "Elapsed (sec.)") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment