-
-
Save algogrit/5939604 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns wikiparse | |
(:require [clojure.java.io :as io] | |
[clojure.data.xml :as xml] | |
[clojure.zip :as zip] | |
[clojure.data.zip :as dzip] | |
[clojure.data.zip.xml :as zxml]) | |
(:import [ org.apache.commons.compress.compressors.bzip2 BZip2CompressorInputStream]) | |
(:gen-class :main true)) | |
(defn bz2-reader
  "Opens `filename` as a BZip2-compressed file and returns a Reader over
  its decompressed contents. The returned Reader is not closed here; bind
  it in `with-open` (or close it yourself) when done."
  [filename]
  (let [raw-stream   (io/input-stream (io/file filename))
        decompressed (BZip2CompressorInputStream. raw-stream)]
    (io/reader decompressed)))
(defn print-band-title
  "Prints the text of `page`'s first :title child when that text contains
  the literal substring \"(band)\". `page` is an XML element as accepted by
  `clojure.zip/xml-zip`. Prints nothing when the page has no :title child
  or the title does not match. Always returns nil."
  [page]
  (let [z     (zip/xml-zip page)
        title (zxml/xml1-> z :title zxml/text)]
    ;; xml1-> yields nil when there is no :title child, and re-find throws
    ;; a NullPointerException on a nil input, so check for nil first.
    (when (and title (re-find #"\(band\)" title))
      (println title))))
(defn print-bands
  "Reads the BZip2-compressed XML file `filename`, keeps the root element's
  children whose tag is :page, and passes at most `max` of them, in order,
  to `print-band-title`. The reader is closed when the walk finishes or an
  exception is thrown. Returns nil."
  [filename max]
  (with-open [dump (bz2-reader filename)]
    ;; The page sequence is consumed directly by doseq rather than being
    ;; bound to a local, so nothing here keeps a reference to its head
    ;; while the file is walked.
    (doseq [page (take max
                       (filter (fn [node] (= :page (:tag node)))
                               (:content (xml/parse dump))))]
      (print-band-title page))))
(def wikifile
  "File name of the BZip2-compressed XML dump that `-main` hands to
  `print-bands`; a relative name, so it is resolved against the process's
  working directory."
  "enwiki-latest-pages-articles.xml.bz2")
(defn -main
  "Command-line entry point.

  Usage: wikiparse [dump-file [max-pages]]

  dump-file  path to a BZip2-compressed XML dump; defaults to `wikifile`.
  max-pages  maximum number of :page elements to examine, as a decimal
             integer; defaults to 100000000.

  With no arguments this behaves exactly as before: it scans `wikifile`
  with a limit of 100000000 pages. Throws NumberFormatException when
  max-pages is not a valid integer."
  [& args]
  (let [[dump-file max-pages] args]
    (print-bands (or dump-file wikifile)
                 (if max-pages
                   (Long/parseLong max-pages)
                   100000000))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment