Skip to content

Instantly share code, notes, and snippets.

@brezniczky
Last active July 16, 2016 14:35
Show Gist options
  • Save brezniczky/2bc925b05d9321bf5bef to your computer and use it in GitHub Desktop.
Save brezniczky/2bc925b05d9321bf5bef to your computer and use it in GitHub Desktop.
Blogger code #4 (A more efficient filter, full code)
file.size.bytes = 1024 * 256
default.window.width = 256
# symbols in the range [0:(symbol.count - 1)] are expected
symbol.count = 256
set.seed(0)
test.data.bytes =
sample(x = 0:(symbol.count - 1), replace = TRUE, size = file.size.bytes)
# The below is a slightly modified version of the example code at
# http://shape-of-code.coding-guidelines.com/2013/07/21/unique-bytes-in-a-sliding-window-as-a-file-content-signature/
#
# The code published by Derek Jones takes a 5x less frequent approach.
origi.count = function(t, window_width = default.window.width) {
if (length(t) < window_width) {
return(rep(0, 0))
}
# 5x denser than the original one
cnt_points = 1:(length(t) - window_width + 1)
# here's the original bit
u=sapply(cnt_points, function(X) length(unique(t[X:(X+window_width)])))
}
count.positives = function(x) {
return(sum(x > 0))
}
get.initial.freqs = function(x, window.width) {
frequency = rep(0, symbol.count)
for(i in 1:window.width) {
frequency[x[i] + 1] = frequency[x[i] + 1] + 1
}
return(frequency)
}
quicker.count = function(x, window.width = default.window.width) {
if (length(x) < window.width) {
return(rep(0, 0))
}
n.positions = length(x) - window.width + 1
# initialize counts over the first window position
frequency = get.initial.freqs(x, window.width)
ans = rep(0, n.positions)
ans[1] = count.positives(frequency)
# start and do the counting
for(i in 2:n.positions) {
# the value leaving the window is the first item in the
# previous window, i.e. data[i - 1]
old.data.idx = x[i - 1] + 1
frequency[old.data.idx] = frequency[old.data.idx] - 1
# the value entering the window is the last item in the
# current window
new.data.idx = x[i + window.width - 1] + 1
frequency[new.data.idx] = frequency[new.data.idx] + 1
ans[i] = count.positives(frequency)
}
return(ans)
}
quicker.count.2 = function(x, window.width = default.window.width) {
if (length(x) < window.width) {
return(rep(0, 0))
}
n.positions = length(x) - window.width + 1
# find out the frequencies & count of unique symbols at the first position
frequency = get.initial.freqs(x, window.width)
ans = rep(0, n.positions)
ans[1] = count.positives(frequency)
count = ans[1]
for(i in 2:n.positions) {
old.data.idx = x[i - 1] + 1
# the value leaving the window is the first item in the
# previous window
frequency[old.data.idx] = frequency[old.data.idx] - 1
if (frequency[old.data.idx] == 0) {
# maintain the count value: symbol no longer present in the window
count = count - 1
}
# the value entering the window is the last item in the
# current window
new.data.idx = x[i + window.width - 1] + 1
frequency[new.data.idx] = frequency[new.data.idx] + 1
if (frequency[new.data.idx] == 1) {
# maintain the count value: symbol just detected in the window
count = count + 1
}
ans[i] = count
}
return(ans)
}
wait2 = function() {
date1 = date()
while (date1 == date()) {
}
date2 = date()
while (date2 == date()) {
}
}
# allow the CPU to get up to speed using a busy wait of 1..2 seconds
wait2()
t1 = system.time({
counts1 = origi.count(test.data.bytes, default.window.width - 1)
})
t2 = system.time({
counts2 = quicker.count(test.data.bytes)
})
t3 = system.time({
counts3 = quicker.count.2(test.data.bytes)
})
# could use for verification:
#
# print(head(cbind(head(counts1, length(counts2)), counts2, counts3)))
times = rbind(t1, t2, t3)[, "elapsed"]
names(times) <- c("Original", "First imp.", "Second imp.")
print(times)
barplot(height = times, col = "darkblue",
ylab = "Elapsed (sec.)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment