Created
January 7, 2017 01:21
-
-
Save mthomure/de2e5c24760c2c6a31b601c51802f1ff to your computer and use it in GitHub Desktop.
Read large excel files from clojure without killing the JVM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns read-excel-stream
  (:require [clojure.java.io :as io])
  (:import [com.monitorjbl.xlsx StreamingReader]))
;; Remember to add the following Maven dependency:
;;   [com.monitorjbl/xlsx-streamer "1.0.1"]
;; More info here:
;;   https://github.com/monitorjbl/excel-streaming-reader
(defn workbook
  "Open the Excel workbook at `path` with the streaming reader and return it.

  `path` is handed to `clojure.java.io/input-stream`, so it may be anything
  that function accepts. The optional second argument is a map of reader
  settings:

    :row-cache-size  value given to the builder's `.rowCacheSize` (default 100)
    :buffer-size     value given to the builder's `.bufferSize`   (default 4096)

  The workbook is meant to be opened lazily, without reading the content of
  the sheets up front. The caller is responsible for closing the returned
  workbook, for example by binding it in a `with-open` form."
  ([path]
   (workbook path {}))
  ([path {:keys [row-cache-size buffer-size]
          :or   {row-cache-size 100
                 buffer-size    4096}}]
   (-> (StreamingReader/builder)
       (.rowCacheSize row-cache-size)
       (.bufferSize buffer-size)
       (.open (io/input-stream path)))))
(defn sheets
  "Return a map from sheet name to sheet for every sheet found by iterating
  over `workbook`. Only `.getSheetName` is called on each sheet here; the
  rows of a sheet are not touched."
  [workbook]
  (into {}
        (for [sheet workbook]
          [(.getSheetName sheet) sheet])))
(defn -main
  "Print the names of all sheets in the workbook at `path`, then count the
  rows of one sheet and print the count together with the elapsed time.

  `sheet-name` selects the sheet to count and defaults to \"Sheet1\". When the
  workbook has no sheet with that name, a message saying so is printed
  instead of a row count. The workbook is closed before returning."
  ([path]
   (-main path "Sheet1"))
  ([path sheet-name]
   (with-open [wk (workbook path)]
     (let [all-sheets (sheets wk)]
       (println "All Sheet Names:" (keys all-sheets))
       (if-let [sheet (get all-sheets sheet-name)]
         ;; Counting forces a full pass over the sheet's rows, which is the
         ;; part worth timing.
         (println "Num Rows:" (time (count (seq sheet))))
         (println "Sheet not found:" sheet-name))))))
;; Example output:
;; $ lein run read-excel-stream
;; All Sheet Names: (Sheet1)
;; "Elapsed time: 63187.76884 msecs"
;; Num Rows: 320532
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment