Parse HTML: extract tags and attributes



examples/parse-html/parse_html.go
package main

import (
    "fmt"
    "io"
    "strings"

    "golang.org/x/net/html"
)

// main tokenizes a small embedded HTML document and prints one "Tag:" line
// per token, followed by three "Attr:" lines (key, value, and the
// more-attributes flag) for each attribute the tokenizer reports.
//
// TagName is called for every token, not only for start and end tags, so
// tokens without a tag name (e.g. the text between elements) show up as an
// empty "Tag: " line in the output.
func main() {
    const doc = `<html>
    <body>
        <h1>Main title</h1>
        <a href="https://code-maven.com/">Code Maven</a>
        <h2 id="subtitle" class="important">Some subtle title</h2>
    </body>
    </html>`

    z := html.NewTokenizer(strings.NewReader(doc))

    // Next returns ErrorToken both at end of input and on a real failure;
    // the two cases are told apart after the loop.
    for z.Next() != html.ErrorToken {
        name, hasAttr := z.TagName()
        fmt.Printf("Tag: %v\n", string(name))

        // Keep pulling attributes for as long as the tokenizer says more remain.
        for more := hasAttr; more; {
            var key, val []byte
            key, val, more = z.TagAttr()
            fmt.Printf("Attr: %v\n", string(key))
            fmt.Printf("Attr: %v\n", string(val))
            fmt.Printf("Attr: %v\n", more)
        }
    }

    // io.EOF is the normal way for tokenizing to finish; anything else is reported.
    if err := z.Err(); err != io.EOF {
        fmt.Printf("Error: %v", err)
    }
}

examples/parse-html/parse_html.out
Tag: html
Tag: 
Tag: body
Tag: 
Tag: h1
Tag: 
Tag: h1
Tag: 
Tag: a
Attr: href
Attr: https://code-maven.com/
Attr: false
Tag: 
Tag: a
Tag: 
Tag: h2
Attr: id
Attr: subtitle
Attr: true
Attr: class
Attr: important
Attr: false
Tag: 
Tag: h2
Tag: 
Tag: body
Tag: 
Tag: html