Created
May 19, 2020 22:06
-
-
Save fjpse/3fcf0e04960234a25ceddeb9c8eb94b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([ | |
process_file/2 | |
]). | |
%%
%% process_file(InputFile, OutputFile) -> Index
%%
%% Builds a word index for the text stored in InputFile:
%%   a) reads the lines of InputFile,
%%   b) indexes them, recording for every word the lines on which it appears,
%%   c) writes the index to OutputFile,
%%   d) returns the index.
%%
process_file(InputFile, OutputFile) ->
    Index = process_text(read_from_file(InputFile)),
    write_to_file(OutputFile, Index),
    Index.
%%
%% process_text(Text) -> Index
%%
%% Text is a list of lines. The lines are indexed by process_text/3, starting
%% at line number 1 with an empty dict. The resulting dict is then converted
%% to a list of {Key, Value} tuples sorted on the key.
%%
%% The index is accumulated in a dict, so process_words/3 has to work on the
%% same kind of container.
%%
process_text(Text) ->
    Accumulated = process_text(Text, 1, dict:new()),
    Entries = dict:to_list(Accumulated),
    lists:keysort(1, Entries).
%%
%% process_text(Text, LineNumber, Index) -> NewIndex
%%
%% Folds process_line/3 over the lines of Text. The first line is processed
%% with LineNumber, and the number grows by one for every following line.
%% Each step receives the index produced by the previous one; the index left
%% after the last line is returned. An empty Text returns Index unchanged.
%%
process_text(Text, LineNumber, Index) ->
    Step = fun(Line, {Number, Acc}) ->
                   Updated = process_line(Line, Number, Acc),
                   {Number + 1, Updated}
           end,
    {_NextNumber, NewIndex} = lists:foldl(Step, {LineNumber, Index}, Text),
    NewIndex.
%%
%% process_line(Line, LineNumber, Index) -> NewIndex
%%
%% Splits Line into words at whitespace and punctuation characters and passes
%% the words to process_words/3, which adds them to Index under LineNumber.
%%
process_line(Line, LineNumber, Index) ->
    %% string:lexemes/2 matches separators as grapheme clusters, and a CR LF
    %% pair is a single cluster. It must be listed as a separator of its own,
    %% otherwise the last word of a line ending in "\r\n" keeps the line
    %% ending attached.
    Separators = [[$\r, $\n] | " \r\n\t.,;:-`´\"'!?()[]{}\\/"],
    Words = string:lexemes(Line, Separators),
    process_words(Words, LineNumber, Index).
%%
%% process_words(Words, LineNumber, Index) -> NewIndex
%%
%% Adds the words in Words, all of them found on line LineNumber, to Index
%% (a dict). Words with fewer than four characters are ignored. Every other
%% word is converted to lowercase and LineNumber is appended to the list
%% stored under that lowercase word.
%%
%% dict:append/3 creates the entry when the key is not in the dict yet, so no
%% lookup is needed beforehand. A word that occurs several times on the same
%% line gets LineNumber recorded once per occurrence.
%%
%% When there are no more words to process, the accumulated index is returned.
%%
process_words([], _LineNumber, Index) ->
    Index;
%% The pattern only matches lists with at least four elements; it stops
%% looking after the fourth one instead of measuring the whole word.
process_words([[_, _, _, _ | _] = Word | Words], LineNumber, Index) ->
    NewIndex = dict:append(string:lowercase(Word), LineNumber, Index),
    process_words(Words, LineNumber, NewIndex);
process_words([_ShortWord | Words], LineNumber, Index) ->
    process_words(Words, LineNumber, Index).
%%
%% read_from_file(FileName) -> [Line]
%%
%% Opens FileName for reading and returns its lines as collected by
%% read_lines/2.
%%
%% Crashes with a badmatch if the file cannot be opened. The file is closed
%% even when reading raises an exception, so a caller that catches the error
%% does not keep the handle open.
%%
read_from_file(FileName) ->
    {ok, File} = file:open(FileName, [read]),
    try
        read_lines(File, [])
    after
        file:close(File)
    end.
%%
%% read_lines(File, Text) -> Lines
%%
%% Reads File line by line until end of file. Text is the accumulator holding
%% the lines read so far, newest first; it is reversed before being returned,
%% so Lines is in file order.
%%
%% Raises error({read_error, Reason}) when io:get_line/2 reports
%% {error, Reason}. Without this clause the error tuple would be stored as if
%% it were a line and the function would keep reading from the failed device.
%%
read_lines(File, Text) ->
    case io:get_line(File, "") of
        eof ->
            lists:reverse(Text);
        {error, Reason} ->
            error({read_error, Reason});
        Line ->
            read_lines(File, [Line | Text])
    end.
%%
%% write_to_file(FileName, Index) -> ok | {error, Reason}
%%
%% Opens FileName for writing, writes every entry of Index to it with
%% write_index/2 and closes the file. The value returned is the result of
%% file:close/1.
%%
%% Crashes with a badmatch if the file cannot be opened. If writing raises an
%% exception the file is closed before the exception is re-raised, so the
%% handle is not left open.
%%
%% NOTE: the file is opened in write mode, which overwrites existing content.
%%
write_to_file(Name, Index) ->
    {ok, File} = file:open(Name, [write]),
    try write_index(File, Index) of
        _ ->
            file:close(File)
    catch
        Class:Reason:Stacktrace ->
            %% The original failure is the one worth reporting; the result of
            %% this cleanup close is deliberately ignored.
            _ = file:close(File),
            erlang:raise(Class, Reason, Stacktrace)
    end.
%%
%% write_index(File, Index) -> ok
%%
%% Writes the entries of Index to File, in list order, by calling
%% write_entry/2 once per entry. An empty Index writes nothing.
%%
write_index(File, Index) ->
    lists:foreach(fun(Entry) -> write_entry(File, Entry) end, Index).
%%
%% write_entry(File, IndexEntry) -> ok
%%
%% Writes one index entry to File as a single line with the format:
%%   <word> <line>,<line>,...
%% that is, the word, one space, the line numbers as produced by
%% write_numbers/2, and a newline.
%%
%% NOTE: the line numbers are not printed with the ~w control sequence, to
%% avoid lists of integers being printed as strings of characters.
%%
write_entry(Device, {Key, Lines}) ->
    io:format(Device, "~s ", [Key]),
    write_numbers(Device, Lines),
    io:format(Device, "~n", []).
%%
%% write_numbers(File, LineNumbers) -> ok
%%
%% Writes the line numbers to File as decimal integers separated by commas,
%% with no trailing comma. The text is rendered first and sent to File in a
%% single write request. An empty list writes nothing.
%%
write_numbers(File, LineNumbers) ->
    Rendered = lists:join(",", [integer_to_list(Number) || Number <- LineNumbers]),
    io:fwrite(File, "~s", [Rendered]).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment