Last active
December 6, 2017 00:57
-
-
Save avullo/b92c7f71ff01867e88b8ca7b8e90fb2b to your computer and use it in GitHub Desktop.
R random sampling from (large) text files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Randomly sample a given percentage of lines from every file in a directory
## and write the sampled lines out to a single output file
##
sampleFiles <- function(dir = "./", ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'dir' is a character vector of length 1 representing the name of the directory
  ## 'ofname' is a character vector of length 1 indicating the name of the output file
  ## 'perc' is the percentage of lines to sample from each file; it is passed
  ##        unchanged to sampleFile()
  ## 'append' is logical; TRUE adds the sampled lines to an existing output file,
  ##          FALSE empties the output file once before any sampling starts
  ## 'seed' is a number used to set the seed of the random number generator;
  ##        it is passed unchanged to sampleFile() for every input file
  ## Return: NULL (invisibly)

  # Sub-directories are listed by list.files() too, but they cannot be sampled.
  files <- list.files(dir, full.names = TRUE)
  files <- files[!dir.exists(files)]

  # If the output file already lives in 'dir', never feed it back into itself.
  out_path <- normalizePath(ofname, mustWork = FALSE)
  files <- files[normalizePath(files, mustWork = FALSE) != out_path]

  # Honour append = FALSE exactly once for the whole run. Forwarding FALSE to
  # each sampleFile() call would make every file overwrite the previous one.
  if (!append) {
    file.create(ofname)
  }

  for (fname in files) {
    sampleFile(fname, ofname, perc = perc, append = TRUE, seed = seed)
  }
  invisible(NULL)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Randomly sample a certain fraction of the lines of a file, and write them
## to an output file
##
## Adapted from https://stat.ethz.ch/pipermail/r-help/2007-February/124812.html
##
## NOTE
##   The input is read sequentially with readLines(), so empty lines are kept
##   and count as lines. This assumes numberOfLines() (defined elsewhere)
##   counts empty lines as well -- TODO confirm.
##
sampleFile <- function(ifname, ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'ifname' is a character vector of length 1 indicating the name of the file
  ## 'ofname' is a character vector of length 1 indicating the name of the output file
  ## 'perc' is a number between 0 and 100 indicating the fraction (in %) of
  ##        the input file to write to the output file
  ## 'append' is logical; TRUE adds the sampled lines to an existing output file,
  ##          FALSE empties the output file before writing
  ## 'seed' is a number used to set the seed of the random number generator
  ##        (this resets the global RNG state, as the function always did)
  ## Return: NULL (invisibly)
  stopifnot(is.numeric(perc), length(perc) == 1, perc >= 0, perc <= 100)

  nlines <- numberOfLines(ifname)

  # Empty the output once up front. Every later write appends, so the rows of
  # one chunk can never overwrite the rows of an earlier chunk.
  if (!append) {
    file.create(ofname)
  }

  n_sel <- floor(nlines * perc / 100)
  if (nlines < 1 || n_sel < 1) {
    return(invisible(NULL))
  }

  # generate the random row numbers; sample.int() avoids the 1:0 trap and
  # draws the same values as sample(1:nlines, ...) for a given seed
  set.seed(seed)
  sel <- sample.int(nlines, n_sel)

  # Read the file once, front to back, through an open connection. Re-scanning
  # with an increasing 'skip' would re-read the head of the file per chunk.
  input <- file(ifname, open = "r")
  on.exit(close(input), add = TRUE)

  # chunk size is a 9th of the number of lines, but never 0 for short files
  chunk_size <- max(1, floor(nlines / 9))
  last_needed <- max(sel)
  offset <- 0

  # Keep reading until the last selected line has been passed. This also
  # covers the remainder left over after the last whole chunk.
  while (offset < last_needed) {
    chunk <- readLines(input, n = chunk_size, warn = FALSE)
    if (length(chunk) == 0) {
      # the file ended earlier than numberOfLines() reported
      break
    }
    # positions of the selected rows relative to the start of this chunk;
    # bounded by the lines actually read so no NA rows are written
    chunk_index <- sel - offset
    chunk_index <- chunk_index[chunk_index >= 1 & chunk_index <= length(chunk)]
    if (length(chunk_index) > 0) {
      write(chunk[chunk_index], ofname, append = TRUE, sep = "\n")
    }
    offset <- offset + length(chunk)
  }
  invisible(NULL)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment