-
-
Save binarybana/4312281 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require("profile") | |
using Profile | |
@profile begin | |
############################################################################## | |
# | |
# Low-level text parsing | |
# | |
############################################################################## | |
# Split one line of delimited text into its fields.
#
# A two-state machine walks the characters of `line`:
#   * outside quotes, `separator` ends the current field;
#   * inside a region bounded by `quotation_character`, `separator` is
#     treated as ordinary field content.
# The quotation characters themselves only toggle the state and are never
# copied into a field.
#
# Arguments:
#   line                -- the text of a single record, without its newline
#   separator           -- the character that delimits fields
#   quotation_character -- the character that opens and closes quoted regions
#
# Returns a vector with one string per field. A line containing k unquoted
# separators always yields k + 1 fields, so an empty line yields one empty
# field.
#
# For now, we're going to restrict things to only handle incoming
# strings that use single Char encodings: every character is stored into a
# Uint8 buffer.
function split_separated_line{T <: String}(line::T,
                                           separator::Char,
                                           quotation_character::Char)
    inside_quotes = false
    nchars = strlen(line)
    # k separators produce k + 1 fields, so a line made only of separators
    # (or an empty line) needs one more slot than it has characters.
    items = Array(UTF8String, nchars + 1)
    # A single scratch buffer serves every field: `bytestring` is given a
    # copied slice, so resetting `i` is all that is needed to start over.
    current_item = Array(Uint8, nchars)
    total_items = 0
    i = 0
    for chr in line
        if chr == quotation_character
            # Entering or leaving a quoted region; the quote is not stored
            inside_quotes = !inside_quotes
        elseif !inside_quotes && chr == separator
            total_items += 1
            items[total_items] = bytestring(current_item[1:i])
            i = 0
        else
            i += 1
            current_item[i] = chr
        end
    end
    # Whatever is left in the buffer is the final field
    total_items += 1
    items[total_items] = bytestring(current_item[1:i])
    return items[1:total_items]
end
############################################################################## | |
# | |
# Inferential steps | |
# | |
############################################################################## | |
# Choose the field separator from the way `filename` ends:
# "csv" -> ',', "tsv" -> tab, "wsv" -> a single space.
# Raises an error for any other ending.
function determine_separator{T <: String}(filename::T)
    if ismatch(r"csv$", filename)
        return ','
    end
    if ismatch(r"tsv$", filename)
        return '\t'
    end
    if ismatch(r"wsv$", filename)
        return ' '
    end
    error("Unable to determine separator used in $filename")
end
# Number of data rows in `filename`: the file's line count, less one when
# the first line is a header.
function determine_nrows{T <: String}(filename::T, header::Bool)
    header_lines = header ? 1 : 0
    return countlines(filename) - header_lines
end
# Number of columns in `filename`, taken as the number of fields found on
# its first line. Only that first line is read.
function determine_ncols{T <: String}(filename::T,
                                      separator::Char,
                                      quotation_character::Char)
    io = open(filename, "r")
    first_line = readline(io)
    close(io)
    fields = split_separated_line(chomp(first_line), separator, quotation_character)
    return length(fields)
end
# Work out the column names for the data in `io`.
#
# The first line of the stream is split into fields. When `header` is true
# those fields are the names; otherwise names are produced by
# `generate_column_names` for as many columns as the first line has fields.
# The stream is rewound to position 0 before returning.
# Raises an error when the first line is empty.
function determine_column_names(io::IOStream,
                                separator::Char,
                                quotation_character::Char,
                                header::Bool)
    seek(io, 0)
    first_line = chomp(readline(io))
    if length(first_line) == 0
        error("Failed to determine column names from an empty data source")
    end
    fields = split_separated_line(first_line, separator, quotation_character)
    # Both outcomes leave the stream rewound for the next reader
    seek(io, 0)
    if !header
        return generate_column_names(length(fields))
    end
    return fields
end
# Read up to `nrows` lines from `io` and split each into `ncols` fields.
# Reading stops early at the first line that is empty after `chomp`.
# Returns the rows actually read as a matrix of strings, or a 0x0 matrix
# when nothing was read.
# Line-by-line reading may be IO-bound
function read_separated_text(io::IOStream,
                             nrows::Int,
                             ncols::Int,
                             separator::Char,
                             quotation_character::Char)
    text_data = Array(UTF8String, nrows, ncols)
    rows_read = 0
    for row in 1:nrows
        line = chomp(readline(io))
        if length(line) == 0
            break
        end
        rows_read = row
        text_data[row, 1:ncols] = split_separated_line(line, separator, quotation_character)
    end
    if rows_read == 0
        return Array(UTF8String, 0, 0)
    end
    # Drop the unused tail of the preallocated matrix
    return text_data[1:rows_read, :]
end
# Infer one type per column of `text_data`.
#
# Every column starts out as Int64 and is revised by `tightest_type` for
# each entry that is not listed in `missingness_indicators`. Scanning of a
# column stops as soon as its type has become a String type.
# May want to shift to using numeric codes for types
function infer_column_types{S <: String, T <: String}(text_data::Matrix{S},
                                                      missingness_indicators::Vector{T})
    nrows, ncols = size(text_data)
    column_types = Array(Any, ncols)
    for j in 1:ncols
        # Default to Int64 until an entry forces a demotion
        inferred = Int64
        for i in 1:nrows
            if inferred <: String
                break
            end
            entry = text_data[i, j]
            if !contains(missingness_indicators, entry)
                inferred = tightest_type(entry, inferred)
            end
        end
        column_types[j] = inferred
    end
    return column_types
end
# Guess the column names and column types of the table stored in `filename`.
#
# Returns the tuple (column_names, column_types, nrows), where `nrows` is
# the row count reported by `determine_nrows`.
#
# TODO: Split this into determine_column_names and infer_column_types
# Short-circuit option allows one to just guess metadata for massive files
# Currently maxes out after 1,000 lines
function determine_metadata{T <: String}(filename::String,
                                         separator::Char,
                                         quotation_character::Char,
                                         missingness_indicators::Vector{T},
                                         header::Bool,
                                         short_circuit::Bool)
    nrows = determine_nrows(filename, header)
    # With short-circuiting, type inference looks at no more than 1,000 rows
    maxlines = short_circuit ? min(nrows, 1_000) : nrows
    io = open(filename, "r")
    column_names = determine_column_names(io, separator, quotation_character, header)
    if header
        # Skip the header for type inference
        readline(io)
    end
    text_data = read_separated_text(io, maxlines, length(column_names),
                                    separator, quotation_character)
    close(io)
    column_types = infer_column_types(text_data, missingness_indicators)
    return (column_names, column_types, nrows)
end
# Convenience form of `determine_metadata` that fills in the parsing
# settings itself: the separator is inferred from `filename`, the quotation
# character is '"', and the strings "" and "NA" mark missing values (the
# same defaults that `read_table(filename)` uses).
#
# Returns the (column_names, column_types, nrows) tuple produced by the
# six-argument method.
function determine_metadata(filename::String,
                            header::Bool,
                            short_circuit::Bool)
    separator = determine_separator(filename)
    quotation_character = '"'
    # Previously never defined here, so the forwarded call could not succeed
    missingness_indicators = ["", "NA"]
    return determine_metadata(filename, separator, quotation_character,
                              missingness_indicators, header, short_circuit)
end
# Turn a matrix of text into a DataFrame with typed columns.
#
# Arguments:
#   text_data              -- one row per record, one column per field
#   missingness_indicators -- strings that mark a cell as missing
#   column_types           -- target type of each column; Int64, Float64,
#                             UTF8String and ASCIIString are handled
#   column_names           -- name of each column
#
# Returns a DataFrame whose columns are DataVec's. When `text_data` has no
# rows, an empty DataFrame is built from `column_types` and `column_names`.
# Raises an error when the numbers of types, names and columns disagree, or
# when a column type is not one of the handled types.
#
# `text_data` is left untouched: placeholders for missing cells are written
# into a per-column copy rather than into the caller's matrix.
function convert_to_dataframe{R <: String,
                              S <: String,
                              T <: String}(text_data::Matrix{R},
                                           missingness_indicators::Vector{S},
                                           column_types::Vector,
                                           column_names::Vector{T})
    # Keep a record of number of rows and columns
    nrows, ncols = size(text_data)
    # Short-circuit if the text data is empty
    if nrows == 0
        return DataFrame(column_types, column_names, 0)
    end
    # Make sure that the user has specified coherent types and names
    if ncols != length(column_types) || ncols != length(column_names)
        error("Column types and names do not match the input data's size")
    end
    # Store the columns as a set of DataVec's inside an Array of Any's
    columns = Array(Any, ncols)
    for j in 1:ncols
        # Indexing copies, so edits below do not reach the caller's matrix
        column_text = text_data[1:nrows, j]
        is_missing = BitVector(nrows)
        for i in 1:nrows
            if contains(missingness_indicators, column_text[i])
                # Substitute a string the numeric conversions below accept;
                # the cell is flagged as missing in the mask
                column_text[i] = string(baseval(column_types[j]))
                is_missing[i] = true
            else
                is_missing[i] = false
            end
        end
        if column_types[j] == Int64
            values = int(column_text)
        elseif column_types[j] == Float64
            values = float(column_text)
        elseif column_types[j] == UTF8String
            values = convert(Array{UTF8String, 1}, column_text)
        elseif column_types[j] == ASCIIString
            values = convert(Array{ASCIIString, 1}, column_text)
        else
            error("Column cannot be converted to type: $(column_types[j])")
        end
        columns[j] = DataVec(values, is_missing)
    end
    # Prepare the DataFrame we'll return
    df = DataFrame(columns, column_names)
    return df
end
############################################################################## | |
# | |
# Text input | |
# | |
############################################################################## | |
# Pull the next batch of at most `minibatch_size` lines from `io` and
# return it as a DataFrame. The number of columns is taken from
# `column_types`; no type inference is done here.
function read_minibatch{R <: String,
                        S <: String,
                        T}(io::IOStream,
                           separator::Char,
                           quotation_character::Char,
                           missingness_indicators::Vector{R},
                           column_names::Vector{S},
                           column_types::Vector{T},
                           minibatch_size::Int64)
    # The batch is held as strings until the conversion step
    batch_text = read_separated_text(io, minibatch_size, length(column_types),
                                     separator, quotation_character)
    return convert_to_dataframe(batch_text, missingness_indicators,
                                column_types, column_names)
end
# Load a whole data set from `io` into a DataFrame.
#
# The stream is rewound first, the header line is skipped when `header` is
# true, up to `nrows` rows are read as text, column types are inferred from
# that text, and the text is converted. When no rows are read, an empty
# DataFrame with `Any`-typed columns is returned.
# TODO: Make only one IO pass through the data
function read_table{R <: String,
                    S <: String}(io::IOStream,
                                 separator::Char,
                                 quotation_character::Char,
                                 missingness_indicators::Vector{R},
                                 header::Bool,
                                 column_names::Vector{S},
                                 nrows::Int64)
    # Start from the top of the stream and step over the header, if any
    seek(io, 0)
    if header
        readline(io)
    end
    ncols = length(column_names)
    # Everything is held as strings until the types are known
    text_data = read_separated_text(io, nrows, ncols, separator, quotation_character)
    if size(text_data, 1) != 0
        column_types = infer_column_types(text_data, missingness_indicators)
        return convert_to_dataframe(text_data, missingness_indicators,
                                    column_types, column_names)
    end
    # Nothing but (at most) a header line was present
    column_types = {Any for i in 1:ncols}
    return DataFrame(column_types, column_names, 0)
end
# Load the table stored in `filename` using default settings: separator
# inferred from the filename, '"' as quotation character, "" and "NA" as
# missing-value markers, and the first line treated as a header.
function read_table{T <: String}(filename::T)
    has_header = true
    sep = determine_separator(filename)
    quote_chr = '"'
    na_strings = ["", "NA"]
    nrows = determine_nrows(filename, has_header)
    io = open(filename, "r")
    header_fields = determine_column_names(io, sep, quote_chr, has_header)
    df = read_table(io, sep, quote_chr, na_strings, has_header, header_fields, nrows)
    close(io)
    return df
end
end #profile | |
# Run the reader defined above on a local CSV file.
# NOTE(review): the path is hard-coded to one user's home directory.
read_table("/home/bana/Downloads/movies.csv") | |
# NOTE(review): presumably prints the per-line counts and timings collected
# for the code wrapped in `@profile begin ... end` -- confirm against the
# profile package.
@profile report | |
############################################################################## | |
# | |
# Text output | |
# | |
############################################################################## | |
# Quotation rules | |
# Quote all string fields | |
# Don't quote real-valued fields | |
# Quote non-string, non-real-valued fields | |
# String values: wrap in `quotation_character` on both sides.
function in_quotes{T <: String}(val::T, quotation_character::Char)
    quoted = strcat(quotation_character, val, quotation_character)
    return quoted
end
# Real values: plain string form, no quotation characters.
in_quotes{T <: Real}(val::T, quotation_character::Char) = string(val)
# Any other value: convert to a string, then wrap in `quotation_character`.
function in_quotes{T <: Any}(val::T, quotation_character::Char)
    text = string(val)
    return strcat(quotation_character, text, quotation_character)
end
# Write `df` to `io` as delimited text: one line of column names followed
# by one line per row. Each cell goes through `in_quotes`; cells on a line
# are joined by `separator` and the line ends with a newline.
# TODO: write_table should do more to react to the type of each column
# Need to increase precision of string representation of Float64's
function print_table(df::DataFrame,
                     io::IOStream,
                     separator::Char,
                     quotation_character::Char)
    n, p = nrow(df), ncol(df)
    column_names = colnames(df)
    # Header line
    for j in 1:p
        print(io, in_quotes(column_names[j], quotation_character))
        # The last cell of a line is followed by a newline, not a separator
        print(io, j < p ? separator : '\n')
    end
    # Data lines
    for i in 1:n
        for j in 1:p
            print(io, in_quotes(df[i, j], quotation_character))
            print(io, j < p ? separator : '\n')
        end
    end
end
# Print `df` to OUTPUT_STREAM with the given separator and quotation character.
function print_table(df::DataFrame, separator::Char, quotation_character::Char)
    destination = OUTPUT_STREAM
    return print_table(df, destination, separator, quotation_character)
end
# Print `df` to OUTPUT_STREAM as comma-separated text quoted with '"'.
function print_table(df::DataFrame)
    return print_table(df, OUTPUT_STREAM, ',', '"')
end
# Write `df` to the file `filename` as delimited text, using the given
# separator and quotation character. The file is opened in "w" mode.
function write_table{T <: String}(df::DataFrame,
                                  filename::T,
                                  separator::Char,
                                  quotation_character::Char)
    out = open(filename, "w")
    print_table(df, out, separator, quotation_character)
    close(out)
end
# Write `df` to `filename`, inferring the separator from the filename and
# quoting with '"'.
function write_table{T <: String}(df::DataFrame, filename::T)
    return write_table(df, filename, determine_separator(filename), '"')
end
############################################################################## | |
# | |
# Binary serialization | |
# | |
############################################################################## | |
# Wrappers for serialization | |
# Serialize `d` into the file `filename`, which is opened in "w" mode.
function save(filename, d)
    out = open(filename, "w")
    serialize(out, d)
    close(out)
end
# Read back a value stored in `filename`: the deserialized result is
# called with no arguments and what that call returns is handed back.
function load_df(filename)
    f = open(filename)
    thunk = deserialize(f)
    dd = thunk()
    close(f)
    return dd
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
julia> require("DataFrames") | |
count time(%) time(s) | |
58789 0.00 0.000186 # /home/bana/.julia/DataFrames/src/io.jl, line 20 | |
58789 0.02 0.028055 # line 21 | |
58789 0.01 0.006174 # line 22 | |
58789 0.00 0.000105 # line 23 | |
58789 0.00 0.000124 # line 24 | |
5941920 0.03 0.038879 # line 26 | |
176389 0.00 0.002492 # line 29 | |
176389 0.00 0.000460 # line 30 | |
1267077 0.01 0.007016 # line 32 | |
176389 0.00 0.003109 # line 36 | |
176389 0.00 0.000457 # line 37 | |
1410936 0.01 0.016423 # line 40 | |
1410936 1.60 1.976091 # line 41 | |
1410936 0.00 0.002732 # line 42 | |
1410936 0.11 0.134190 # line 43 | |
2911129 0.02 0.019880 # line 45 | |
58789 0.00 0.000789 # line 50 | |
58789 0.01 0.012929 # line 51 | |
58789 0.12 0.146372 # line 52 | |
1 0.03 0.033243 # /home/bana/.julia/DataFrames/src/io.jl, line 74 | |
1 0.00 0.000005 # /home/bana/.julia/DataFrames/src/io.jl, line 95 | |
1 0.01 0.007993 # line 96 | |
1 0.12 0.150127 # line 102 | |
1 0.00 0.000003 # line 105 | |
1 0.01 0.009379 # /home/bana/.julia/DataFrames/src/io.jl, line 120 | |
1 0.00 0.000000 # line 122 | |
58788 0.56 0.694576 # line 124 | |
58788 0.36 0.442168 # line 128 | |
58788 17.30 21.355132 # line 129 | |
1 0.00 0.000000 # /home/bana/.julia/DataFrames/src/io.jl, line 141 | |
1 0.00 0.000002 # line 145 | |
25 0.00 0.000001 # line 147 | |
1298553 5.01 6.185185 # line 156 | |
1 0.00 0.000000 # line 161 | |
1 0.00 0.000000 # /home/bana/.julia/DataFrames/src/io.jl, line 210 | |
1 0.00 0.000004 # line 223 | |
25 0.00 0.000306 # line 228 | |
107437 0.44 0.539362 # line 231 | |
107437 0.01 0.009928 # line 232 | |
1362263 1.52 1.877132 # line 234 | |
12 0.76 0.934277 # line 238 | |
11 0.67 0.832819 # line 240 | |
2 0.00 0.000717 # line 242 | |
25 0.02 0.023742 # line 248 | |
1 0.10 0.123444 # line 252 | |
1 0.00 0.000000 # line 253 | |
1 0.00 0.000005 # /home/bana/.julia/DataFrames/src/io.jl, line 294 | |
1 0.00 0.000078 # line 298 | |
1 0.00 0.000000 # line 302 | |
1 18.33 22.622687 # line 305 *read_separated_text(io, nrows, ncols, separator, quotation_character)* | |
1 10.65 13.139062 # line 314 *infer_column_types(text_data, missingness_indicators)* | |
1 6.35 7.838155 # line 317 *convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)* | |
1 0.00 0.000015 # /home/bana/.julia/DataFrames/src/io.jl, line 322 | |
1 0.00 0.000000 # line 323 | |
1 0.00 0.000002 # line 324 | |
1 0.00 0.000000 # line 325 | |
1 0.03 0.033252 # line 326 | |
1 0.00 0.000026 # line 327 | |
1 0.15 0.190201 # line 328 | |
1 35.63 43.976593 # line 329 | |
1 0.00 0.000017 # line 336 | |
1 0.00 0.000000 # line 337 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment