Last active
April 20, 2022 12:55
-
-
Save chekunkov/1ebcb461c4afd4d98cd4bf3893ce2059 to your computer and use it in GitHub Desktop.
Stream a gzip file from S3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import zlib | |
import boto | |
def decompress(key):
    """Incrementally decompress a gzip'd byte stream, yielding byte chunks.

    ``key`` is any iterable of compressed ``bytes`` chunks (e.g. a boto S3
    key object, which iterates over the object's body).

    Handles multi-member gzip files (several gzip streams concatenated
    back to back, as produced by many log shippers): a single
    ``zlib.decompressobj`` stops at the end of the first member, so when
    leftover compressed bytes appear in ``unused_data`` we restart a fresh
    decompressor on them.  Empty output chunks are skipped.
    """
    # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
    d = zlib.decompressobj(16 + zlib.MAX_WBITS)
    for chunk in key:
        data = d.decompress(chunk)
        if data:
            yield data
        # unused_data is non-empty only once the current gzip member has
        # ended; whatever follows is the next member -- decompress it too.
        while d.unused_data:
            leftover = d.unused_data
            d = zlib.decompressobj(16 + zlib.MAX_WBITS)
            data = d.decompress(leftover)
            if data:
                yield data
    yield d.flush()
def iterlines(decompressed_stream):
    """Yield complete lines (trailing ``b'\\n'`` included) from a stream
    of byte chunks.

    Chunk boundaries may fall anywhere, including mid-line; partial lines
    are buffered until their newline arrives.  A final line with no
    trailing newline is still yielded.
    """
    pending = b''
    for piece in decompressed_stream:
        pending += piece
        offset = 0
        # Emit every full line currently sitting in the buffer.
        # find() returns -1 when no newline remains, making newline_at 0
        # (falsy) and ending the inner loop.
        newline_at = pending.find(b'\n', offset) + 1
        while newline_at:
            yield pending[offset:newline_at]
            offset = newline_at
            newline_at = pending.find(b'\n', offset) + 1
        # Keep only the unterminated tail for the next chunk.
        pending = pending[offset:]
    # Flush a last line that never got its newline.
    if pending:
        yield pending
def main():
    """Stream a gzip'd object from S3 and print it line by line.

    Bucket and key names are hard-coded placeholders; the object is never
    downloaded whole -- boto's key iterates the body in chunks, which
    ``decompress`` and ``iterlines`` consume lazily.
    """
    bucket = boto.connect_s3().get_bucket("some.bucket")
    key = bucket.get_key("some/key.gz")
    dstream = decompress(key)
    for line in iterlines(dstream):
        # print(line): function-call form is valid on Python 3, where the
        # original `print line` statement is a SyntaxError; with a single
        # argument it behaves identically on Python 2.
        print(line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment