Created
January 12, 2016 17:14
-
-
Save ericchiang/4cbefb674c9a4c0b33c2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io" | |
"log" | |
"net/http" | |
"os" | |
"golang.org/x/net/html" | |
"golang.org/x/net/html/atom" | |
"golang.org/x/net/html/charset" | |
"golang.org/x/text/transform" | |
) | |
func main() { | |
root, err := getAndParse("https://www.wikipedia.org/", "") | |
if err != nil { | |
log.Fatal(err) | |
} | |
title, ok := getTitle(root) | |
if !ok { | |
log.Fatal("could not find title") | |
} | |
html.Render(os.Stdout, title) | |
fmt.Println() | |
} | |
// getAndParse makes a GET request to the provided URL and attempts | |
// to parse it as HTML encoded as the given charset. This method | |
// returns the root node of the page. | |
// | |
// If charset is an empty string the charset is guessed. | |
func getAndParse(url, pageCharset string) (*html.Node, error) { | |
// Make a GET request to the provided URL. | |
resp, err := http.Get(url) | |
if err != nil { | |
return nil, fmt.Errorf("GET: %v", err) | |
} | |
defer resp.Body.Close() | |
// html.Parse assumes content to be UTF-8 encoded. | |
var reader io.Reader | |
if pageCharset == "" { | |
// Attempt to guess the charset of the HTML document. | |
reader, err = charset.NewReader(resp.Body, "") | |
if err != nil { | |
return nil, fmt.Errorf("creating new charset reader: %v", err) | |
} | |
} else { | |
// Lookup the charset and attempts to transform it to UTF-8. | |
e, name := charset.Lookup(pageCharset) | |
if name == "" { | |
return nil, fmt.Errorf("provided charset not found") | |
} | |
reader = transform.NewReader(resp.Body, e.NewDecoder()) | |
} | |
// Use the html package to parse the page. | |
root, err := html.Parse(reader) | |
if err != nil { | |
return nil, fmt.Errorf("parsing HTML: %v", err) | |
} | |
return root, nil | |
} | |
// getTitle recursively looks up the <title> element. | |
func getTitle(node *html.Node) (*html.Node, bool) { | |
if node.DataAtom == atom.Title { | |
return node, true | |
} | |
for c := node.FirstChild; c != nil; c = c.NextSibling { | |
if title, ok := getTitle(c); ok { | |
return title, true | |
} | |
} | |
return nil, false | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment