- Note that we use the original case files as input, not the summarized files, because the results obtained from the summarized files are gibberish.
- We are using Python 2.7.
pip install gensim
python sum100.py <input folder> <output folder>
pip install gensim
python sum100.py <input folder> <output folder>
import os
import sys

from gensim.summarization import summarize
def load_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line, in file order. Each string keeps
        its line terminator. An empty file yields an empty list.
    """
    # ``handle`` rather than ``file`` so the Python 2 builtin is not shadowed.
    with open(filename, 'r') as handle:
        return handle.readlines()
# Module-level collection of human-readable messages about problem files.
# main() appends to it while processing and writes it out at the end of a run.
errorList = []
def find_num_of_words(text):
    """Estimate the number of words in *text* by counting its spaces.

    Args:
        text: The string to measure. Any iterable of single characters
            works, since the items are simply compared with ``' '``.

    Returns:
        The number of items in ``text`` equal to a single space. For
        single-spaced text on one line this is one less than the word
        count; other whitespace, such as newlines, is not counted.
    """
    # ``==`` rather than ``is``: identity only matches equal strings when the
    # interpreter happens to reuse the same object, which is not guaranteed.
    return sum(1 for char in text if char == ' ')
def main():
    """Summarize every file in the input folder into the output folder.

    The input and output folder paths are read from ``sys.argv[1]`` and
    ``sys.argv[2]``. Each entry of the input folder is loaded with
    ``load_file``, summarized with gensim's ``summarize`` (``word_count`` of
    200), and the summary is appended to ``<output>/<file name>.txt``.

    A message is added to the module-level ``errorList`` when the length
    reported by ``find_num_of_words`` is <= 50 or >= 200, or when gensim
    refuses to summarize the file. After all files are processed the
    messages are printed and written, one per line, to
    ``<output>/ERRORLIST.txt``.

    Exits with status 1 after printing a usage line if fewer than two
    folder arguments were given.
    """
    if len(sys.argv) < 3:
        sys.stderr.write(
            'usage: python {0} <input folder> <output folder>\n'.format(
                sys.argv[0]))
        sys.exit(1)

    # Named *_dir so the builtin ``input`` is not shadowed.
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    word_count = 200

    # Files are handled one at a time so only a single document is held in
    # memory, instead of loading the whole folder up front.
    for index, filename in enumerate(os.listdir(input_dir)):
        document = ''.join(load_file(os.path.join(input_dir, filename)))

        try:
            summarized_text = summarize(document, word_count=word_count)
        except ValueError as exc:
            # gensim raises ValueError for text it cannot split into more
            # than one sentence; record the file and keep the batch going.
            errorList.append(
                '{0} could not be summarized: {1}'.format(filename, exc))
            continue

        len_of_words_summarized = find_num_of_words(summarized_text)
        if len_of_words_summarized <= 50 or len_of_words_summarized >= 200:
            errorList.append(
                "{0} has the length {1}".format(
                    filename, len_of_words_summarized))

        # Append mode is kept from the original script, so re-running into
        # the same output folder adds to existing summary files.
        summary_path = os.path.join(output_dir, '{0}.txt'.format(filename))
        with open(summary_path, 'a') as summary_file:
            summary_file.write(summarized_text)

        # Single-line progress indicator: index of the file just finished.
        sys.stdout.write('{0}\r'.format(index))
        sys.stdout.flush()

    print('--------- SUMMARIZATION COMPLETED!!------------\n')
    print('--------- FOLLOWING ARE THE LIST OF ERROR FILEs-----\n')
    print(errorList)
    with open(os.path.join(output_dir, 'ERRORLIST.txt'), 'w') as error_file:
        for error in errorList:
            # One message per line; without the newline they run together.
            error_file.write(error + '\n')
    print('----- END ------\n')
# Run the batch summarization only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()