Last active
May 15, 2019 07:37
-
-
Save conanca/7348222 to your computer and use it in GitHub Desktop.
爬bootstrap主题的脚本,仅供学习交流golang之用
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"github.com/PuerkitoBio/goquery" | |
"io" | |
"io/ioutil" | |
"net/http" | |
"os" | |
"regexp" | |
"strings" | |
) | |
// ThemesUrl is the base URL that every relative page and asset path is
// appended to before it is requested.
const ThemesUrl = "http://responsiweb.com/themes/preview/ace/1.3/"

// Index is the entry page, given relative to ThemesUrl.
const Index = "index.html"
// pError panics with the message of err when err is non-nil; a nil err is a
// no-op. The panic value is the error's string, not the error itself.
func pError(err error) {
	if err == nil {
		return
	}
	panic(err.Error())
}
// Exist reports whether filename can be stat'ed, or whether the stat error
// itself indicates that the path already exists.
func Exist(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return os.IsExist(statErr)
	}
	return true
}
// 将指定内容保存为指定文件名的文件 | |
func content2File(fileName string, content string) { | |
if strings.Contains(fileName, "/") { | |
os.MkdirAll(fileName[:strings.LastIndex(fileName, "/")], 0775) | |
} | |
dstFile, err := os.Create(fileName) | |
pError(err) | |
defer dstFile.Close() | |
content = strings.Replace(content, "\"//", "\"http://", -1) | |
dstFile.WriteString(content) | |
} | |
// 保存指定url的HTML文件并返回Document和content | |
func url2Html(url string) (doc *goquery.Document, content string) { | |
var err error | |
if doc, err = goquery.NewDocument(ThemesUrl + url); err != nil { | |
panic(err.Error()) | |
} | |
var _ bool | |
content, _ = doc.Html() | |
content2File(url, content) | |
return doc, content | |
} | |
// 保存指定url的资源(图片/js/css等文件) | |
func url2File(url string)(download bool) { | |
if url == "" || strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "//") || Exist(url) { | |
return false | |
} | |
fmt.Println(url + " downloading......") | |
resp, err := http.Get(ThemesUrl + url) | |
pError(err) | |
if strings.Contains(url, "/") { | |
os.MkdirAll(url[:strings.LastIndex(url, "/")], 0775) | |
} | |
file, err := os.Create(url) | |
pError(err) | |
defer func() { | |
resp.Body.Close() | |
file.Close() | |
}() | |
io.Copy(file, resp.Body) | |
return true | |
} | |
// 保存网页中引用的js和css等文件 | |
func saveHtmlDoc(doc *goquery.Document, content string) { | |
// 解析引用的css | |
doc.Find("link").Each(func(i int, s *goquery.Selection) { | |
url, _ := s.Attr("href") | |
// 保存css文件 | |
var download = url2File(url) | |
if !download { | |
return | |
} | |
cssFile, err := os.Open(url) | |
pError(err) | |
defer cssFile.Close() | |
cssContent, err := ioutil.ReadAll(cssFile) | |
// 保存css文件中所引用的图片 | |
re, _ := regexp.Compile("url\\((.*?)\\)") | |
all := re.FindAllString(string(cssContent), -1) | |
for _, img := range all { | |
if strings.Contains(img, ".") { | |
// 提取url | |
img = strings.Replace(strings.Replace(img, "'", "", -1), "\"", "", -1) | |
img = img[4:strings.Index(img, ")")] | |
if strings.Contains(img, "../") { | |
img = strings.Replace(img, "../", "assets/", -1) | |
} else { | |
img = "assets/css/" + img | |
} | |
if strings.Contains(img, "?") { | |
img = img[:strings.Index(img, "?")] | |
} | |
if strings.Contains(img, "#") { | |
img = img[:strings.Index(img, "#")] | |
} | |
// 保存图片 | |
url2File(img) | |
} | |
} | |
}) | |
// 解析引用的js | |
doc.Find("script[src]").Each(func(i int, s *goquery.Selection) { | |
url, _ := s.Attr("src") | |
// 保存js文件 | |
url2File(url) | |
}) | |
// 解析引用的img | |
doc.Find("img").Each(func(i int, s *goquery.Selection) { | |
url, _ := s.Attr("src") | |
// 保存文件 | |
url2File(url) | |
}) | |
} | |
func main() { | |
fmt.Println("start!") | |
// 处理首页 | |
fmt.Println("==== Page " + Index + "====") | |
indexHtmlDoc, content := url2Html(Index) | |
saveHtmlDoc(indexHtmlDoc, content) | |
// 获取其他页 | |
indexHtmlDoc.Find("a[href]").Each(func(i int, s *goquery.Selection) { | |
url, _ := s.Attr("href") | |
if url != "" && url != "#" && url != "index.html" && strings.Contains(url, ".html") { | |
// 处理其他页 | |
fmt.Println("==== Page " + url + "====") | |
saveHtmlDoc(url2Html(url)) | |
} | |
}) | |
// 完成 | |
fmt.Println("finish!") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment