Created
January 6, 2012 14:58
-
-
Save milanaleksic/1570948 to your computer and use it in GitHub Desktop.
Find bad URLs in exported Chrome bookmarks file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package misc

// Finds unreachable links in a bookmarks file exported from Chrome.
//
// The script works in three phases:
//   1. parse the exported HTML and collect every <A HREF="..."> link,
//   2. probe all collected links in parallel (10 worker threads),
//   3. print the links that failed the probe.

String bookmarksFile = 'c:/temp/bookmarks.html'

// Progress counter shared by the worker threads.
def counter = new java.util.concurrent.atomic.AtomicInteger(0)

// Link name -> link location, in the order the links appear in the file.
Map<String, String> urls = [:]

// Written concurrently by the pool workers below, so it has to be a
// synchronized map; a plain LinkedHashMap may lose or corrupt entries.
Map<String, String> badURLs = Collections.synchronizedMap([:])

// Locations already seen, so duplicate detection is a set lookup instead of
// a linear scan over all values of `urls` for every link.
Set<String> seenLocations = [] as Set

new File(bookmarksFile).eachLine { line ->
    (line =~ /(?i)<A HREF="([^"]+)"[^>]*>([^<]*)<.*/).each { entire, linkLocation, linkName ->
        // Set.add returns false when the location was already present.
        if (!seenLocations.add(linkLocation)) {
            println "WARNING link duplicated: [$linkName] $linkLocation"
        }
        // Link names are map keys; make a clashing name unique so that the
        // link is still checked instead of overwriting the earlier one.
        if (urls.containsKey(linkName)) {
            urls[linkName + UUID.randomUUID()] = linkLocation
        } else {
            urls[linkName] = linkLocation
        }
    }
}

println "Links found and to be processed: ${urls.size()}"

groovyx.gpars.GParsExecutorsPool.withPool(10) {
    urls.eachParallel { linkName, url ->
        try {
            println "Analyzing url ${counter.incrementAndGet()}/${urls.size()} $url"
            def connection = new URL(url).openConnection()
            connection.connectTimeout = 15000
            connection.readTimeout = 15000
            try {
                connection.connect()
                // Only HTTP(S) connections carry a status code. For other
                // protocols a successful connect() is the only signal
                // available, so such links are treated as reachable.
                if (connection instanceof HttpURLConnection) {
                    // Compare the numeric status instead of matching the
                    // status line text: the reason phrase after "200" is
                    // optional and not guaranteed to be "OK".
                    if (connection.responseCode != HttpURLConnection.HTTP_OK) {
                        badURLs[linkName] = url
                        println "\tUrl $url added to bad URL list because of response code: ${connection.getHeaderField(null)}"
                    }
                }
            } finally {
                // Release the underlying socket once the probe is done.
                if (connection instanceof HttpURLConnection) {
                    connection.disconnect()
                }
            }
        } catch (Throwable t) {
            // Deliberately broad: any failure (malformed URL, unknown host,
            // timeout, ...) marks the link as bad without stopping the run.
            println "\tIssue found while accessing [$url]: ${t.getMessage()}"
            badURLs[linkName] = url
        }
    }
}

println '\n\n--------------\nBad URLs:'
badURLs.each { linkName, url ->
    println "\t$linkName [$url]"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment