Last active
June 8, 2016 12:31
-
-
Save DiKorsch/35d8366790c7af77dfc16e8b5066b35c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from converter import extract | |
from os import path | |
import argparse, simplejson as json | |
def generateJson(lectureId, outdir, images):
    """Write ``slides.json`` into *outdir* from the ``sync`` file found there.

    Every non-blank line of ``<outdir>/sync`` must consist of two
    whitespace-separated fields: a slide number and a start time.  For each
    such line one entry is appended to the ``"images"`` list of the JSON
    document, carrying the slide's image path (made relative to *outdir*
    when it lies below it), the start time exactly as read, and the fixed
    values ``"unique": True`` and ``"textlines": []``.

    Args:
        lectureId: value stored under the ``"lectureId"`` key.
        outdir: directory holding ``sync``; ``slides.json`` is written here.
        images: mapping from slide number (``int``) to image path.

    Returns:
        The path of the JSON file that was written.

    Raises:
        ValueError: if a sync line does not have exactly two fields, its
            slide number is not an integer, or the slide number has no
            entry in *images*.
    """
    if not outdir.endswith("/"):
        outdir += "/"
    sync_file = path.join(outdir, "sync")
    json_obj = {"lectureId": lectureId, "images": []}

    with open(sync_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            if len(fields) != 2:
                raise ValueError(
                    "malformed sync line {!r}; expected '<slide> <time>'".format(line))
            slide_num, start = fields

            image_path = images.get(int(slide_num))
            if image_path is None:
                raise ValueError(
                    "sync file references slide {}, which has no image".format(slide_num))
            # Strip only the leading output directory; a plain str.replace
            # would also remove any later occurrence inside the path.
            if image_path.startswith(outdir):
                image_path = image_path[len(outdir):]

            json_obj["images"].append({
                "path": image_path,
                "start": start, "unique": True, "textlines": []
            })

    json_file = path.join(outdir, "slides.json")
    # Use a context manager so the file is flushed and closed deterministically.
    with open(json_file, "w") as out:
        json.dump(json_obj, out, indent=2)
    return json_file
if __name__ == '__main__':
    # Command-line entry point: render the PDF's pages to images with
    # extract(), then write the accompanying slides.json via generateJson().
    parser = argparse.ArgumentParser(description='this extracts from the given PDF the slides')
    parser.add_argument('-p', '--pdf', type=str, required=True, help='slides in PDF format')
    parser.add_argument('-o', '--output', type=str, required=True, help='output directory')
    parser.add_argument('--id', type=str, required=True, help='ID of the slide. needed for JSON generation')
    parser.add_argument('-s', '--size', type=str, default='1280x720', help='size of the extracted images. format: "WIDTHxHEIGHT"')
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    images = extract(args.pdf, args.size, args.overwrite, args.output)
    json_file = generateJson(args.id, args.output, images)
    # Call form of print: valid on Python 2 (parenthesised expression) and
    # on Python 3, where the bare print statement is a syntax error.
    print("slide info saved under \"{}\"".format(json_file))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, PyPDF2, re | |
from os import path | |
from wand.image import Image | |
def getPageNames(pdf_name):
    """Return one ``"<pdf_name>[<index>]"`` string per page of the PDF.

    Indices are zero-based and run up to the page count reported by
    PyPDF2.  The bracketed form is presumably the per-page selector
    understood by the image library when used as a filename -- confirm
    against the Wand/ImageMagick documentation.

    Args:
        pdf_name: path of the PDF file to inspect.

    Returns:
        A list of page specifier strings, in page order.
    """
    # Read the page count inside the context manager so the file handle is
    # closed again instead of being leaked.
    with open(pdf_name, "rb") as pdf_file:
        page_count = PyPDF2.PdfFileReader(pdf_file).numPages
    return ["{}[{}]".format(pdf_name, page) for page in range(page_count)]
def parseSize(size_as_string):
    """Parse a ``"WIDTHxHEIGHT"`` string into ``[width, height]`` integers.

    The pattern is matched at the start of the string only, so trailing
    characters after the height are ignored.

    Args:
        size_as_string: text such as ``"1280x720"``.

    Returns:
        A two-element list ``[width, height]`` of ints.

    Raises:
        ValueError: if the string does not begin with ``<digits>x<digits>``.
    """
    match = re.match(r"(\d+)x(\d+)", size_as_string)
    if match is None:
        # Fail with a meaningful message rather than an AttributeError on None.
        raise ValueError(
            'invalid size {!r}; expected "WIDTHxHEIGHT"'.format(size_as_string))
    return [int(value) for value in match.groups()]
def adjust_size(img, width, height):
    """Return a ``(width, height)`` pair that keeps *img*'s aspect ratio.

    When the requested width/height ratio already equals the image's own
    ratio, the requested size is returned unchanged.  Otherwise the width
    is kept and the height is recomputed from the image ratio, truncated
    to an int.

    Args:
        img: object exposing numeric ``width`` and ``height`` attributes.
        width: requested width.
        height: requested height.
    """
    source_ratio = float(img.width) / img.height
    requested_ratio = float(width) / height
    if source_ratio == requested_ratio:
        return width, height
    return width, int(width / source_ratio)
def extract(fpath, size_as_string, overwrite = False, outdir = None):
    """Render each page of a PDF to ``slides/<n>.jpg`` and map pages to files.

    Images are written to a ``slides`` directory below *outdir*, or below
    the PDF's own directory when *outdir* is not given.  Pages are numbered
    from 1.  A page whose image file already exists is not rendered again
    unless *overwrite* is true, but it is still included in the result.

    Args:
        fpath: path of the PDF file.
        size_as_string: target size as ``"WIDTHxHEIGHT"``; the height is
            adapted per page by ``adjust_size`` to keep the page's ratio.
        overwrite: re-render pages whose image file already exists.
        outdir: base output directory; defaults to the PDF's directory.

    Returns:
        A dict mapping page number (``int``, starting at 1) to image path.
    """
    width, height = parseSize(size_as_string)
    # The target directory is the same for every page, so build it once.
    slides_dir = path.join(outdir or path.dirname(fpath), "slides")

    images = {}
    for idx, page_name in enumerate(getPageNames(fpath), 1):
        page_img_name = path.join(slides_dir, "{}.jpg".format(idx))
        images[idx] = page_img_name
        if not overwrite and path.isfile(page_img_name):
            continue
        if not path.isdir(slides_dir):
            # makedirs also creates a missing output directory; plain mkdir
            # would fail when the parent does not exist yet.
            os.makedirs(slides_dir)
        with Image(filename=page_name, resolution=200) as img:
            img.compression_quality = 99
            img.alpha_channel = False
            img.resize(*adjust_size(img, width, height))
            img.save(filename=page_img_name)
    return images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Wand
PyPDF2
argparse
simplejson
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment