Created
May 19, 2020 17:24
-
-
Save pppillai/7981c32bc12c5b95907b54fe180300c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-compile([export_all]). | |
-export([index_text/1]). | |
-define(MINLENGTH, 8). | |
-include_lib("eunit/include/eunit.hrl"). | |
%% to run c(index). | |
%% index:index_text("FullPathOfFile"). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
%% Opens the file Name for reading and returns its lines in file order.
%% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    %% get_all_lines/2 accumulates lines newest-first, so reverse once here.
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents. | |
% Not exported. | |
%% Reads every remaining line from File, prepending each one to Partial.
%% Returns the accumulated list with the most recently read line first.
%% The file is closed once eof is reached.
%% Any non-eof, non-string result from io:get_line/2 (e.g. {error, _})
%% is not handled and crashes the caller.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            get_all_lines(File, [strip_trailing_newline(Line) | Partial])
    end.

%% Removes one trailing newline from Line, if there is one.
%% The last line of a file may lack a newline; in that case the line is
%% returned untouched instead of losing its final character.
strip_trailing_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | RevRest] -> lists:reverse(RevRest);
        _ -> Line
    end.
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
%% Prints each string in Lines on its own line of standard output.
%% Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%% Builds a word index for the text file at FilePath.
%% Returns a list of {Word, Ranges} tuples, where Ranges is a list of
%% {FirstLine, LastLine} pairs, each covering a run of consecutive line
%% numbers on which Word occurs.
index_text(FilePath) ->
    %% Lower-case every line. Empty lines are deliberately kept: filtering
    %% them out here would shift the line numbers assigned below.
    Lines = [lower_case(Raw) || Raw <- get_file_contents(FilePath)],
    %% Pair each line with its 1-based line number.
    Numbered = lists:zip(Lines, lists:seq(1, length(Lines))),
    %% One {Word, [LineNo]} entry per occurrence; only words strictly
    %% longer than ?MINLENGTH characters are indexed.
    Occurrences = [{Token, [LineNo]} ||
                      {Text, LineNo} <- Numbered,
                      Token <- string:tokens(Text, "/[]{}!;:_-,. "),
                      length(Token) > ?MINLENGTH],
    %% Merge the occurrences of each word, then drop repeated line numbers.
    Merged = finalize_index(Occurrences, []),
    Unique = [{Word, deduplicate(LineNos, [])} || {Word, LineNos} <- Merged],
    %% make_tuple_range/3 returns the ranges newest-first, hence the reverse.
    [{Word, lists:reverse(make_tuple_range(LineNos, First, []))} ||
        {Word, [First | _] = LineNos} <- Unique].
%% Merges a list of {Word, LineNumbers} entries into Result so that each
%% word appears exactly once.
%% A word not yet in Result is prepended to it; a word already present has
%% the new line numbers appended to its existing list, in place.
%% Returns the merged list.
finalize_index([], Result) ->
    Result;
finalize_index([{Word, LineCountList} | Tail], Result) ->
    %% A single lookup decides both whether the word exists and what its
    %% current line numbers are.
    case lists:keyfind(Word, 1, Result) of
        false ->
            finalize_index(Tail, [{Word, LineCountList} | Result]);
        {Word, NumberList} ->
            Updated = lists:keyreplace(Word, 1, Result,
                                       {Word, NumberList ++ LineCountList}),
            finalize_index(Tail, Updated)
    end.
%% Returns String with the ASCII letters A-Z converted to a-z.
%% Every other element is passed through unchanged.
lower_case([Char | Rest]) when Char >= $A, Char =< $Z ->
    %% Upper- and lower-case ASCII letters are 32 code points apart.
    [Char + 32 | lower_case(Rest)];
lower_case([Char | Rest]) ->
    [Char | lower_case(Rest)];
lower_case([]) ->
    [].
%% Removes repeated elements from Items, keeping the first occurrence of
%% each and preserving their order.
%% Result is the reversed list of elements already accepted; callers
%% normally pass [].
deduplicate(Items, Result) ->
    Keep = fun(Item, Seen) ->
                   case lists:member(Item, Seen) of
                       true -> Seen;
                       false -> [Item | Seen]
                   end
           end,
    lists:reverse(lists:foldl(Keep, Result, Items)).
%% Collapses a list of numbers into {First, Last} ranges, where each range
%% covers a run in which every element is exactly one greater than the
%% element before it.
%% FirstValue is the start of the range currently being built and Result
%% holds the ranges found so far. Ranges are prepended, so the returned
%% list has the most recently completed range first.
make_tuple_range([], _FirstValue, Result) ->
    Result;
make_tuple_range([Head | Rest], FirstValue, Result) ->
    Step = fun(Current, {Start, Previous, Ranges}) ->
                   case Current - Previous == 1 of
                       true ->
                           %% Still inside the current run.
                           {Start, Current, Ranges};
                       false ->
                           %% Gap found: close the run and start a new one.
                           {Current, Current, [{Start, Previous} | Ranges]}
                   end
           end,
    {LastStart, Last, Done} = lists:foldl(Step, {FirstValue, Head, Result}, Rest),
    [{LastStart, Last} | Done].
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I love your code. I love list comprehensions. Comments are brief but very clear and useful.
Only two comments: I think you can do "deduplicate" at the same time as "make_tuple_range", and why not use string:lowercase?