-
-
Save zeddee/8f82ff236a670d739726c32215679edf to your computer and use it in GitHub Desktop.
A simple HTML doc parser in golang that sends the tokens we are looking for back to the caller over a channel.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"strings" | |
"golang.org/x/net/html" | |
) | |
func main() { | |
HTMLString := `<!DOCTYPE html> | |
<html itemscope itemtype="http://schema.org/QAPage"> | |
<head> | |
<title>go - Golang parse HTML, extract all content with <body> </body> tags - Stack Overflow</title> | |
<link rel="shortcut icon" href="//cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d"> | |
<link rel="apple-touch-icon image_src" href="//cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon.png?v=c78bd457575a"> | |
<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml"> | |
<meta name="twitter:card" content="summary"> | |
<meta name="twitter:domain" content="stackoverflow.com"/> | |
<meta property="og:type" content="website" /> | |
</head> | |
<body class="template-blog"> | |
<nav class="navigation"> | |
<div class="navigation__container container"> | |
<a class="navigation__logo" href="/"> | |
<h1>Foobar</h1> | |
</a> | |
<ul class="navigation__menu"> | |
<li><a href="/tags/">Topics</a></li> | |
<li><a href="/about">About</a></li> | |
</ul> | |
</div>` | |
var c chan Node | |
var title string | |
var a []string | |
wantedTokens := []string{ | |
"a", "title", | |
} | |
c = GetTokensFromHTMLString(HTMLString, wantedTokens) | |
for node := range c { | |
// fmt.Println(node.Type, node) | |
if node.Type == "title" { | |
tt := node.Doc.Next() | |
if tt == html.TextToken { | |
next := node.Doc.Token() | |
title = strings.TrimSpace(next.Data) | |
} | |
} | |
if node.Type == "a" { | |
a = append(a, node.Type) | |
} | |
} | |
fmt.Println("title", title) | |
fmt.Println("a", a) | |
} | |
// Node foobar | |
type Node struct { | |
Type string | |
Token html.Token | |
Doc *html.Tokenizer | |
} | |
// GetTokensFromHTMLString foobar | |
func GetTokensFromHTMLString(HTMLString string, wantedTokens []string) (c chan Node) { | |
c = make(chan Node) | |
go func() { | |
defer close(c) | |
// https://play.golang.org/p/0MRSefJ_-E | |
r := strings.NewReader(HTMLString) | |
z := html.NewTokenizer(r) | |
// defer func() { | |
// close(c) | |
// } | |
for { | |
tt := z.Next() | |
switch { | |
case tt == html.ErrorToken: | |
// End of the document, we're done | |
return | |
case tt == html.StartTagToken: | |
token := z.Token() | |
for _, name := range wantedTokens { | |
if token.Data == name { | |
c <- Node{token.Data, token, z} | |
} | |
continue | |
} | |
} | |
} | |
}() | |
return c | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment