Skip to content

Instantly share code, notes, and snippets.

@dpricha89
Created April 11, 2021 15:14
Show Gist options
  • Save dpricha89/7eb2bde5568648c07007bd2702ddf186 to your computer and use it in GitHub Desktop.
Save dpricha89/7eb2bde5568648c07007bd2702ddf186 to your computer and use it in GitHub Desktop.
Get the top 10 most frequent lines in a large file
import unittest
import sys, array, tempfile, heapq
import io
from itertools import islice
# Number of lines pulled per batch when a sorted temp file is streamed back
# (see SortLargeFile.intsfromfile).
NUM_LINES_IN_MEM_PER_FILE = 100000
# Number of input lines buffered by SortLargeFile.sort before the buffer is
# sorted and spilled to a temp file (see SortLargeFile.saveToTemp).
NUM_LINES_IN_EACH_FILE = 1000000
class SortLargeFile:
    """Report the most frequent lines of a text file via an external merge sort.

    The input is read in chunks; each chunk is sorted in memory and spilled to
    a temporary file.  The sorted runs are then merged lazily, which brings
    identical lines together so they can be counted in one streaming pass.
    """

    def __init__(self, lines_per_chunk=None, lines_per_read=None):
        """Configure chunk sizes.

        Args:
            lines_per_chunk: Input lines sorted in memory per temp file.
                Defaults to the module constant ``NUM_LINES_IN_EACH_FILE``.
            lines_per_read: Lines pulled from a temp file per batch while
                merging.  Defaults to ``NUM_LINES_IN_MEM_PER_FILE``.

        Raises:
            ValueError: If either size is smaller than 1.
        """
        # The module constants are only looked up when no explicit value is
        # given, so existing ``SortLargeFile()`` callers keep the old sizes.
        if lines_per_chunk is None:
            lines_per_chunk = NUM_LINES_IN_EACH_FILE
        if lines_per_read is None:
            lines_per_read = NUM_LINES_IN_MEM_PER_FILE
        if lines_per_chunk < 1 or lines_per_read < 1:
            raise ValueError("lines_per_chunk and lines_per_read must be >= 1")
        self.lines_per_chunk = lines_per_chunk
        self.lines_per_read = lines_per_read

    def intsfromfile(self, f):
        """Yield the lines of the open text file ``f`` in order, then close it.

        Lines are fetched ``lines_per_read`` at a time.  The file is closed
        when the generator is exhausted *or* closed early, so an abandoned
        merge does not leave the handle open.
        """
        try:
            while True:
                batch = list(islice(f, self.lines_per_read))
                if not batch:
                    break
                yield from batch
        finally:
            f.close()

    def saveToTemp(self, arr):
        """Sort ``arr`` and spill it to a temp file; return a line iterator.

        Args:
            arr: List of lines (strings), each normally ending in a newline.

        Returns:
            A generator yielding the sorted lines, each terminated by a
            newline, read back from the temporary file.
        """
        print("size of the arr {}".format(len(arr)))
        # The last line of a file may lack its newline; without one it would
        # be glued to whatever line follows it after sorting.
        ordered = sorted(
            line if line.endswith("\n") else line + "\n" for line in arr
        )
        f = tempfile.TemporaryFile(mode="r+")
        f.writelines(ordered)
        f.seek(0)
        return self.intsfromfile(f)

    @staticmethod
    def _count_consecutive(sorted_lines):
        """Yield ``(line, count)`` for each run of equal lines in sorted input.

        Trailing newlines are stripped from the yielded lines.
        """
        current = None
        count = 0  # count == 0 means "no run started"; empty lines stay valid
        for raw in sorted_lines:
            line = raw.rstrip("\n")
            if count and line == current:
                count += 1
                continue
            if count:
                yield current, count
            current, count = line, 1
        if count:
            yield current, count

    def sort(self, source, top_n=10):
        """Print the ``top_n`` most frequent lines of the file at ``source``.

        Results are printed as ``<line> <count>``, least frequent first; ties
        on count are ordered by line text.  Progress information (source path,
        chunk sizes, number of temp files) is printed beforehand.

        Args:
            source: Path of the text file to analyse.
            top_n: How many distinct lines to report (default 10).
        """
        print('source {}'.format(source))
        runs = []
        try:
            with open(source, 'r') as sf:
                while True:
                    chunk = list(islice(sf, self.lines_per_chunk))
                    if not chunk:
                        break
                    # append, not +=: each run must stay a lazy iterator
                    # instead of being unpacked line by line into the list.
                    runs.append(self.saveToTemp(chunk))
            print('Number of tmp files', len(runs))
            merged = heapq.merge(*runs)
            best = heapq.nlargest(
                top_n,
                ((count, line)
                 for line, count in self._count_consecutive(merged)),
            )
        finally:
            # Closing a started generator runs its ``finally`` and releases
            # the temp file even when the merge was interrupted.
            for run in runs:
                run.close()
        # nlargest returns descending order; report ascending like a heap pop.
        for count, line in reversed(best):
            print(line, count)
# Script entry: builds a sorter and immediately processes a hard-coded absolute
# path. There is no `__main__` guard, so this also runs whenever the module is
# imported.
# NOTE(review): the path points into one developer's home directory — confirm
# whether it should come from the command line instead.
slf = SortLargeFile()
slf.sort('/Users/drichards/grabbag/test_mocking.py/hash_keys_small.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment