Skip to content

Instantly share code, notes, and snippets.

@decabyte
Last active October 7, 2020 13:18
Show Gist options
  • Save decabyte/5101689 to your computer and use it in GitHub Desktop.
Save decabyte/5101689 to your computer and use it in GitHub Desktop.
% ARFF_READ - Read content of an ARFF file to a MATLAB's struct array.
%
% [DATA, relname, nomspec] = ARFF_READ(arff_file)
% arff_file => input file (.arff / .arff.gz extension)
% relname => relation name (string)
% DATA => struct array representing data and attributes (n x attrs);
%         a 1x1 struct with empty fields when the file has no data rows
% nomspec => struct with one field per nominal attribute, each holding a
%            column cell array of the declared labels (empty struct when
%            the file declares no nominal attribute)
%
% Value conversion per attribute type:
%   NUMERIC / REAL / INTEGER => double (missing value '?' becomes NaN)
%   DATE                     => datenum, parsed as 'yyyy-mm-dd HH:MM:SS'
%                               (surrounding quotes optional, '?' => NaN)
%   STRING / nominal         => char, surrounding whitespace removed
%
% ERRORS:
%   MATLAB:input => missing or empty file name
%   MATLAB:file  => file cannot be opened or is not a recognizable ARFF
%
% NOTES:
% See ARFF_WRITE to read notes about relname and nomspec.
% See ARFF format specification on WEKA site.
% Limitations: attribute names must be valid MATLAB field names (no
% quoted names with spaces) and quoted values must not contain commas.
function [data, relname, nomspec] = arff_read(arff_file)
    if nargin < 1
        error('MATLAB:input','Not enough inputs!');
    end
    if isempty(arff_file)
        error('MATLAB:input','Bad file name!');
    end

    % path of the decompressed copy (only set for .gz inputs)
    tmp_file = '';

    % check file extension
    [~, ~, ext] = fileparts(arff_file);
    if strcmpi(ext, '.arff')
        % read-only text mode: 'r+t' would fail on write-protected files
        fid = fopen(arff_file, 'rt');
    elseif strcmpi(ext, '.gz')
        % decompress into the temporary working dir
        dec_files = gunzip(arff_file, tempdir);
        if isempty(dec_files)
            error('%s is not a valid arff_file', arff_file);
        end
        tmp_file = dec_files{1};
        fid = fopen(tmp_file, 'rt');
    else
        error('%s is not a valid arff_file', arff_file);
    end

    if fid == -1
        arff_read_cleanup(-1, tmp_file);
        error('MATLAB:file','File not found!');
    end

    % always release the handle and the temporary file, even on parse errors
    try
        [data, relname, nomspec] = arff_read_stream(fid);
    catch err
        arff_read_cleanup(fid, tmp_file);
        rethrow(err);
    end
    arff_read_cleanup(fid, tmp_file);
end

% Close the file handle (if valid) and remove the temporary decompressed file.
function arff_read_cleanup(fid, tmp_file)
    if fid ~= -1
        fclose(fid);
    end
    if ~isempty(tmp_file)
        delete(tmp_file);
    end
end

% Parse an already opened ARFF stream in a single pass:
% relation name, attribute declarations, then data rows.
function [data, relname, nomspec] = arff_read_stream(fid)
    % must exist even when no nominal attribute is declared
    nomspec = struct();

    % read relname (everything before @RELATION is skipped)
    relname = [];
    while isempty(relname)
        tline = fgetl(fid);
        if ~ischar(tline)
            error('MATLAB:file','ARFF file not recognized!');
        end
        stripped = strtrim(tline);
        if length(stripped) > 9 && strcmpi(stripped(1:9), '@RELATION')
            relname = strtrim(stripped(10:end));
            break;
        end
    end

    % read attributes until the @DATA marker (or end of file)
    fields = {};
    ftypes = [];
    has_data = false;
    while true
        tline = fgetl(fid);
        if ~ischar(tline)
            break;
        end
        % trimming also removes a trailing CR left by CRLF line endings
        stripped = strtrim(tline);
        if strcmpi(stripped, '@DATA')
            has_data = true;
            break;
        end
        if length(stripped) > 10 && strcmpi(stripped(1:10), '@ATTRIBUTE')
            [name, kind, labels] = arff_parse_attribute(stripped);
            fields{end+1} = name; %#ok<AGROW>
            ftypes(end+1) = kind; %#ok<AGROW>
            if kind == 1
                nomspec.(name) = labels;
            end
        end
    end

    % create data struct (template with one empty field per attribute)
    data = struct();
    for fn = 1 : numel(fields)
        data.(fields{fn}) = [];
    end
    if ~has_data
        return;
    end
    data_tmpl = data;

    % read data rows
    dcnt = 0;
    while true
        tline = fgetl(fid);
        if ~ischar(tline)
            break;
        end
        stripped = strtrim(tline);
        % skip blank lines and ARFF comments
        if isempty(stripped) || stripped(1) == '%'
            continue;
        end

        values = strtrim(regexp(stripped, ',', 'split'));
        if numel(values) > numel(fields)
            error('MATLAB:file', ...
                'ARFF data row has %d values but only %d attributes are declared!', ...
                numel(values), numel(fields));
        end

        % init with empty struct; fields without a value stay empty
        dcnt = dcnt + 1;
        data(dcnt) = data_tmpl;
        for k = 1 : numel(values)
            content = values{k};
            switch ftypes(k)
                case 0
                    % '?' (missing) is converted to NaN by str2double
                    data(dcnt).(fields{k}) = str2double(content);
                case 3
                    % strip the surrounding quotes only when present
                    if numel(content) >= 2 && content(end) == content(1) && ...
                            (content(1) == '"' || content(1) == '''')
                        content = content(2:end-1);
                    end
                    if isempty(content) || strcmp(content, '?')
                        data(dcnt).(fields{k}) = NaN;
                    else
                        data(dcnt).(fields{k}) = datenum(content, 'yyyy-mm-dd HH:MM:SS');
                    end
                otherwise
                    data(dcnt).(fields{k}) = content;
            end
        end
    end
end

% Split one trimmed "@ATTRIBUTE <name> <type>" declaration.
% kind: 0 = numeric, 1 = nominal, 2 = string, 3 = date.
% labels: column cell array of nominal labels (empty for other kinds).
function [name, kind, labels] = arff_parse_attribute(decl)
    labels = {};
    tok = regexp(decl, '^@attribute\s+([^\s{]+)\s*(.*)$', ...
        'tokens', 'once', 'ignorecase');
    if isempty(tok) || isempty(strtrim(tok{2}))
        error('MATLAB:file','ARFF file not recognized!');
    end
    name = tok{1};
    typedef = strtrim(tok{2});

    if typedef(1) == '{' && typedef(end) == '}'
        % nominal specification assuming { x, x, x } format
        kind = 1;
        labels = strtrim(regexp(typedef(2:end-1), ',', 'split'));
        labels = labels(~cellfun('isempty', labels));
        labels = labels(:);
    elseif any(strcmpi(typedef, {'NUMERIC', 'REAL', 'INTEGER'}))
        kind = 0;
    elseif strcmpi(typedef, 'STRING')
        kind = 2;
    elseif ~isempty(regexpi(typedef, '^date(\s|$)', 'once'))
        % the optional date-format that follows DATE is accepted but ignored
        kind = 3;
    else
        error('MATLAB:file','ARFF file not recognized!');
    end
end
% References:
% [1]: http://www.cs.waikato.ac.nz/ml/weka/arff.html
% ARFF_WRITE - Saves a MATLAB's struct array to file using ARFF file format.
%
% ARFF_WRITE(arff_file, DATA, relname, nomspec)
% arff_file => output file (.arff / .arff.gz extension); when empty a
%              random 'output-N.arff' name is used
% DATA => non-empty struct array representing data and attributes (n x attrs)
% relname => relation name (string); when empty a random name is used
% nomspec => (optional) struct defining nominal-specification attributes
%
% NOTES:
% Attribute name is taken from DATA struct fieldname and attribute
% type is taken from the data-type of the field in DATA(1).
%
% Append "_class" to a DATA struct fieldname to save an attribute as
% nominal-specification attribute and specify the nominal-names
% inside NOMSPEC struct array using as fieldname the DATA struct's
% fieldname and as content a cell array of names (string). When NOMSPEC
% has no entry for the field, the names are inferred from the distinct
% non-empty values found in DATA.
%
% Append "_date" to a DATA struct fieldname and use numerical date
% representation (using datenum) to save an attribute as date type.
% Values are written as 'yyyy-mm-dd HH:MM:SS' (MATLAB notation) and the
% attribute is declared with the equivalent Java SimpleDateFormat pattern
% "yyyy-MM-dd HH:mm:ss", which is what ARFF consumers such as WEKA expect.
%
% Empty field values are written as '?' (ARFF missing value).
% Non-integer doubles are written with 15 significant digits.
%
% ERRORS:
%   MATLAB:input => bad arguments or a field that cannot be converted
%   MATLAB:file  => output file cannot be opened
%
% TODO -- According to SPEC any attribute that contain space must be
% quoted using single quote char.
%
% See ARFF format specification on WEKA site.
function [] = arff_write(arff_file, data, relname, nomspec)
    if nargin < 3
        error('MATLAB:input','Not enough inputs!');
    end
    if nargin < 4
        % nomspec is optional: nominal names are then inferred from data
        nomspec = struct();
    end
    if isempty(data) || ~isstruct(data)
        error('MATLAB:input','Please use struct data input!');
    end
    if isempty(arff_file)
        arff_file = sprintf('output-%d.arff', randi(1000,1));
    end
    if isempty(relname)
        relname = sprintf('relname-%d', randi(1000,1));
    end

    % check file extension
    [arff_path, arff_name, ext] = fileparts(arff_file);
    outfile = '';
    if strcmpi(ext, '.arff')
        target = arff_file;
    elseif strcmpi(ext, '.gz')
        % write to a temp file first, compress it afterwards
        outfile = fullfile(tempdir, arff_name);
        target = outfile;
    else
        error('%s is not a valid arff_file', arff_file);
    end

    % resolve attribute types BEFORE touching the file system, so invalid
    % input never leaves a truncated output file or an open handle behind
    fields = fieldnames(data);
    ftypes = zeros(size(fields));
    types = cell(size(fields));
    for fn = 1 : numel(fields)
        name = fields{fn};
        sample = data(1).(name);
        if isnumeric(sample)
            if isempty(strfind(name, '_date'))
                types{fn} = 'NUMERIC';
                ftypes(fn) = 0;
            else
                % Java SimpleDateFormat pattern (MM = month, mm = minute)
                types{fn} = 'DATE "yyyy-MM-dd HH:mm:ss"';
                ftypes(fn) = 3;
            end
        elseif ischar(sample)
            if isempty(strfind(name, '_class'))
                types{fn} = 'STRING';
                ftypes(fn) = 2;
            else
                if isstruct(nomspec) && isfield(nomspec, name) && ...
                        iscell(nomspec.(name))
                    labels = nomspec.(name);
                else
                    % infer the nominal names from the data itself
                    labels = {data.(name)};
                    keep = cellfun(@ischar, labels) & ~cellfun('isempty', labels);
                    labels = labels(keep);
                    if ~isempty(labels)
                        labels = unique(labels);
                    end
                end
                if isempty(labels)
                    error('MATLAB:input','No nominal names available for %s field!', name);
                end
                % "{ a, b, c }" -- also correct for a single label
                joined = sprintf('%s, ', labels{:});
                types{fn} = ['{ ' joined(1:end-2) ' }'];
                ftypes(fn) = 1;
            end
        else
            error('MATLAB:input','Cannot convert %s field to ARFF format!', name);
        end
    end

    % open file
    fid = fopen(target, 'w+t');
    if fid == -1
        error('MATLAB:file','Cannot open %s for writing!', target);
    end

    try
        % write relname
        fprintf(fid, '@RELATION %s\n\n', relname);

        % write attributes
        for fn = 1 : numel(fields)
            fprintf(fid, '@ATTRIBUTE %s %s\n', fields{fn}, types{fn});
        end

        % write data
        fprintf(fid, '\n@DATA\n');
        for n = 1 : numel(data)
            for fn = 1 : numel(fields)
                value = data(n).(fields{fn});
                if isempty(value)
                    content = '?';
                else
                    switch ftypes(fn)
                        case 0
                            if isa(value, 'double') && ~isequal(value, fix(value))
                                % default num2str keeps only ~5 significant digits
                                content = num2str(value, 15);
                            else
                                content = num2str(value);
                            end
                        case 3
                            content = ['"' datestr(value, 'yyyy-mm-dd HH:MM:SS') '"'];
                        otherwise
                            % nominal (1) and string (2) values are written as-is
                            content = value;
                    end
                end
                if fn < numel(fields)
                    fprintf(fid, '%s,', content);
                else
                    fprintf(fid, '%s', content);
                end
            end
            fprintf(fid, '\n');
        end
    catch err
        fclose(fid);
        if ~isempty(outfile)
            delete(outfile);
        end
        rethrow(err);
    end

    % close file
    fclose(fid);

    % compress .arff & remove temporary file
    if ~isempty(outfile)
        if isempty(arff_path)
            % relative file name: compress into the current folder
            arff_path = pwd;
        end
        gzip(outfile, arff_path);
        delete(outfile);
    end
end
% References:
% [1]: http://www.cs.waikato.ac.nz/ml/weka/arff.html
% example_read.m
% Loads 'example_dataset.arff' with ARFF_READ, plots the three numeric
% series (high / med / low) with their mean levels, then draws a histogram
% of the nominal 'type_class' attribute.
clear all; close all; clc;
% make arff_read (located in the parent folder) reachable
path(path, '..');
%% import dataset
infile = 'example_dataset.arff';
% load arff
[data, relname, nomspec] = arff_read(infile);
% extract nominal specification attribute
type_class = nomspec.type_class;
%% plot dataset
plot([data.idx], [data.high], 'r.-'); grid on; hold on;
plot([data.idx], [data.med], 'g.-'); grid on; hold on;
plot([data.idx], [data.low], 'b.-'); grid on; hold on;
% one mean per series; the explicit dimension keeps this correct even when
% the dataset holds a single record
m_values = mean([ data.high; data.med; data.low ], 2)';
for k = 1 : length(m_values)
    hr = refline(0, m_values(k));
    set(hr,'Color','k','LineStyle','--');
end
legend('high','med','low');
tl = title(relname);
xl = xlabel('idx');
yl = ylabel('value');
set(tl,'Interpreter','none');
set(tl,'FontSize', 14);
set(xl,'FontSize', 12);
set(yl,'FontSize', 12);
%% type histogram
T = {data.type_class};
% index every record into the DECLARED class list, so bar k always belongs
% to type_class{k} (unique() would sort alphabetically and drop classes
% that never occur, which can mislabel the bars)
[~, J] = ismember(T, type_class);
J = J(J > 0);
nclass = numel(type_class);
f = figure();
hist(J, 1:nclass); grid on;
xlim([0.5, nclass + 0.5]);
hp = findobj(f,'Type','patch');
set(hp,'FaceColor','r','EdgeColor','w');
% labels
[n,x] = hist(J, 1:nclass);
text(x, n, type_class, 'horizontalalignment', ...
    'center', 'verticalalignment', 'bottom');
tl = title(relname);
xl = xlabel('class');
yl = ylabel('count');
set(tl,'Interpreter','none');
set(gca,'XTick',[]);
set(tl,'FontSize', 14);
set(xl,'FontSize', 12);
set(yl,'FontSize', 12);
% example_write.m
% Builds a random 100-record dataset, saves it with ARFF_WRITE as
% 'dataset_<yyyymmdd>.arff', then plots the numeric series and a histogram
% of the nominal 'type_class' attribute.
clear all; close all; clc;
% make arff_write (located in the parent folder) reachable
path(path, '..');
%% create data structure
data = struct();
relname = sprintf('dataset_%s', datestr(now,'yyyymmdd'));
outfile = sprintf('%s.arff', relname);
% nominal classes
type_class = { 'front', 'middle', 'rear' };
nclass = numel(type_class);
%% populate dataset
for i = 1 : 100
    data(i).idx = i;
    data(i).low = randi([0 33], 1);
    data(i).med = randi([34 66], 1);
    data(i).high = randi([67 100], 1);
    % pick among however many classes are declared above
    data(i).type_class = type_class{ randi(nclass) };
end
%% declare nominal specification attributes
nomspec.type_class = type_class;
% save arff
arff_write(outfile, data, relname, nomspec);
%% plot dataset
plot([data.idx], [data.high], 'r.-'); grid on; hold on;
plot([data.idx], [data.med], 'g.-'); grid on; hold on;
plot([data.idx], [data.low], 'b.-'); grid on; hold on;
% one mean per series; the explicit dimension keeps this correct even when
% the dataset holds a single record
m_values = mean([ data.high; data.med; data.low ], 2)';
for k = 1 : length(m_values)
    hr = refline(0, m_values(k));
    set(hr,'Color','k','LineStyle','--');
end
legend('high','med','low');
tl = title(relname);
xl = xlabel('idx');
yl = ylabel('value');
set(tl,'Interpreter','none');
set(tl,'FontSize', 14);
set(xl,'FontSize', 12);
set(yl,'FontSize', 12);
%% type histogram
T = {data.type_class};
% index every record into the DECLARED class list, so bar k always belongs
% to type_class{k} (unique() would sort alphabetically and drop classes
% that never occur, which can mislabel the bars)
[~, J] = ismember(T, type_class);
J = J(J > 0);
f = figure();
hist(J, 1:nclass); grid on;
xlim([0.5, nclass + 0.5]);
hp = findobj(f,'Type','patch');
set(hp,'FaceColor','r','EdgeColor','w');
% labels
[n,x] = hist(J, 1:nclass);
text(x, n, type_class, 'horizontalalignment', ...
    'center', 'verticalalignment', 'bottom');
tl = title(relname);
xl = xlabel('class');
yl = ylabel('count');
set(tl,'Interpreter','none');
set(gca,'XTick',[]);
set(tl,'FontSize', 14);
set(xl,'FontSize', 12);
set(yl,'FontSize', 12);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment