###
### This gist contains 2 files : settings.json and lambda_function.py
###
### settings.json
{
    "extensions" : ["*.hdr", "*.glb", "*.wasm"]
}
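Note that the patterns in "extensions" are matched with fnmatch against the full S3 object key (see lambda_function.py below), so a pattern such as *.hdr also matches keys that have a prefix. A quick sketch, with an assumed example key:

import fnmatch
fnmatch.fnmatch('models/scene.hdr', '*.hdr')  # True : '*' also matches the 'models/' prefix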
### lambda_function.py
'''
This script converts an uncompressed S3 file into a gzip-compressed file. The file is replaced in place: the original is overwritten by the gzipped version.
Create a role with S3 (read/write) and CloudWatch Logs access :
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogStream",
                "s3:*",
                "logs:PutLogEvents"
            ],
            "Resource": "*"
        },
        {
            "Sid": "VisualEditor1",
            "Effect": "Allow",
            "Action": "logs:CreateLogGroup",
            "Resource": "*"
        }
    ]
}
Install the Lambda in the region of the bucket, Python 2.7, 1 minute max execution time.
Change "settings.json" to add or remove the extensions you want to compress.
Trigger is S3, PUT event (select the bucket where the Lambda applies); output is S3 and CloudWatch Logs.
How it works :
- on each PUT event (a new file is uploaded to the bucket), an event is sent to the Lambda function (note : it does not work with a multipart upload)
- the Lambda wakes up and analyzes the incoming file
- reads the metadata of the incoming file
- if the file has the "gzip" ContentEncoding HTTP header, it means it is already compressed, so there is no need to recompress it
- if the file is too small (hard-coded : 1024 bytes) : no compression
- if the file does not have a recognized extension (see settings.json) : no compression
- if the file passes all the previous checks, it is downloaded locally (in /tmp)
- gzip the local version by using the local OS "gzip" tool (could be improved by using Python's built-in gzip module - TODO)
- overwrite the file in the bucket with the locally gzipped version
- update the metadata with the previous values plus ContentEncoding set to "gzip"
- delete the locally gzipped version
'''
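# For reference, the relevant shape of each S3 PUT event record, abridged to
# the fields the handler below actually reads (values are illustrative) :
# {
#   "Records": [{
#     "s3": {
#       "bucket": {"name": "my-bucket"},
#       "object": {"key": "scene.glb", "eTag": "d41d8cd98f00b204e9800998ecf8427e"}
#     }
#   }]
# }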
import json
import pprint
import boto3
import botocore
import tempfile
import os
import subprocess
import fnmatch
import urllib  # for unquote_plus : S3 event keys are URL-encoded
def lambda_handler(event, context):
    with open("settings.json") as json_data:
        settings = json.load(json_data)
    #print "EVENT :"
    client = boto3.client('s3')
    s3 = boto3.resource('s3')
    for r in event.get('Records'):
        # pprint.pprint(r)
        bucketName = r.get('s3').get('bucket').get('name')
        objectKey = r.get('s3').get('object').get('key')
        etag = r.get('s3').get('object').get('eTag')
        # S3 event keys are URL-encoded (e.g. spaces become '+'), decode them
        objectKey = urllib.unquote_plus(objectKey)
        print "Retrieving object :"
        print "  bucketName = " + bucketName
        print "  objectKey = " + objectKey
        uploadedMeta = client.head_object(Bucket=bucketName, Key=objectKey, IfMatch=etag)
        contentEncoding = uploadedMeta.get('ContentEncoding', None)
        size = uploadedMeta.get('ContentLength', 0)
        print "  Current encoding = " + str(contentEncoding)
        print "  Size = " + str(size)
        if (contentEncoding == 'gzip'):
            print(" ==> File is already compressed")
            continue  # skip this record but keep processing the others
        match = False
        for ext in settings['extensions']:
            if fnmatch.fnmatch(objectKey, ext):
                match = True
                break
        if (match == False):
            print(" ==> File extension is not activated for compression. See settings.json")
            continue
        if (size < 1024):
            print(" ==> File is too small to be compressed")
            continue
        tmp_in = tempfile.mkdtemp() + '.orig'
        tmp_out = tmp_in + '.gz'  # must be .gz because that is what gzip creates by default
        print("Download content to " + tmp_in + " and gzip it to " + tmp_out)
        s3.Bucket(bucketName).download_file(objectKey, tmp_in)
        print("GZipping file")
        print subprocess.check_output(['gzip', '-v', '-f', '-9', tmp_in])  # the gzip command creates the .gz file
        statinfo = os.stat(tmp_out)
        newsize = statinfo.st_size
        print "New gzipped file size = " + str(statinfo.st_size)
        if (size - newsize < 1024):
            print "Compression is not efficient, keep original file"
            os.remove(tmp_out)  # clean up the unused gzipped version
            continue
        print "Overwriting S3 file with gzipped version"
        # Recreate metadata from original file (including HTTP headers)
        # Todo : keep original upload date
        extraArgs = {
            'ContentEncoding': "gzip"
        }
        for m in ['Metadata', 'CacheControl', 'ContentDisposition', 'ContentLanguage', 'ContentType', 'Expires']:
            if (uploadedMeta.get(m, None) != None):
                extraArgs[m] = uploadedMeta.get(m)
        extraArgs.setdefault('Metadata', {})  # head_object normally returns it, but be safe
        extraArgs['Metadata']['lambda'] = os.environ.get('AWS_LAMBDA_FUNCTION_NAME', '')
        extraArgs['Metadata']['original-size'] = str(size)
        s3.Object(bucketName, objectKey).upload_file(
            Filename=tmp_out,
            ExtraArgs=extraArgs)
        # remove local file
        os.remove(tmp_out)
    return {
        'statusCode': 200,
        'body': 'It works'
    }
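# A hypothetical local smoke test (not part of the original gist) : simulate a
# single-record S3 PUT event against a test bucket. The eTag must match the
# real object for the head_object(IfMatch=...) call above to succeed.
if __name__ == '__main__':
    fake_event = {
        'Records': [{
            's3': {
                'bucket': {'name': 'my-test-bucket'},
                'object': {'key': 'model.glb', 'eTag': 'REPLACE-WITH-REAL-ETAG'}
            }
        }]
    }
    print lambda_handler(fake_event, None)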
not able to see settings.json
Hi - I am getting the error below at the actual gzip step. Can you please help with what could be wrong? Should I do any additional imports?
print(subprocess.check_output(['gzip', '-v', '-f', '-9', tmp_in]))
Error log ->
[ERROR] FileNotFoundError: [Errno 2] No such file or directory: 'gzip'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 42, in lambda_handler
print(subprocess.check_output(['gzip', '-v', '-f', '-9', tmp_in])) # gzip command create .gz file
File "/var/lang/lib/python3.8/subprocess.py", line 411, in check_output
return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
File "/var/lang/lib/python3.8/subprocess.py", line 489, in run
with Popen(*popenargs, **kwargs) as process:
File "/var/lang/lib/python3.8/subprocess.py", line 854, in init
self._execute_child(args, executable, preexec_fn, close_fds,
File "/var/lang/lib/python3.8/subprocess.py", line 1702, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
[ERROR] FileNotFoundError: [Errno 2] No such file or directory: 'gzip'
It means there is no gzip executable in the PATH. Maybe AWS changed something in the underlying Linux behind Lambda. Try to find where the gzip program is located (/bin/gzip or similar).
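Another option, sketched below, is to do the compression in-process with Python's standard gzip and shutil modules, which removes the dependency on an external binary entirely (tmp_in and tmp_out are the same variables as in the gist) :

import gzip
import os
import shutil

# Compress tmp_in into tmp_out without shelling out to an external 'gzip' binary
with open(tmp_in, 'rb') as f_in:
    with gzip.open(tmp_out, 'wb', compresslevel=9) as f_out:
        shutil.copyfileobj(f_in, f_out)
os.remove(tmp_in)  # mimic 'gzip -f', which deletes the original after compressing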
Suggest using tmp.close()
# deletes temp file
and using the delete flag when creating temp files : tempfile.NamedTemporaryFile(delete=True)
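A minimal sketch of that suggestion applied to the download step (variable names follow the gist; note that delete=True is actually the default) :

import tempfile

tmp = tempfile.NamedTemporaryFile(suffix='.orig', delete=True)  # file is removed automatically on close()
s3.Bucket(bucketName).download_file(objectKey, tmp.name)
# ... gzip tmp.name to tmp.name + '.gz' and upload it ...
tmp.close()  # the temp file is deleted here, no explicit os.remove() needed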
@bhimpel-mediware : what does the Lambda log say? Is the huge JS file detected when dropped? Maybe with a huge file, S3 puts the file block-by-block (multipart upload), and the PUT event is not triggered.