Browse Source
* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * painpull/3310/head
15 changed files with 2682 additions and 61 deletions
@ -0,0 +1,10 @@
|
||||
language: go |
||||
go: |
||||
- master |
||||
before_install: |
||||
- go get github.com/axw/gocov/gocov |
||||
- go get github.com/mattn/goveralls |
||||
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi |
||||
script: |
||||
- $HOME/gopath/bin/goveralls -service=travis-ci |
||||
|
||||
@ -0,0 +1,21 @@
|
||||
MIT License |
||||
|
||||
Copyright (c) 2017 Mario K3A Hros (www.k3a.me) |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
||||
@ -0,0 +1,60 @@
|
||||
[](https://godoc.org/github.com/k3a/html2text) |
||||
[](https://travis-ci.org/k3a/html2text) |
||||
[](https://coveralls.io/github/k3a/html2text?branch=master) |
||||
[](https://goreportcard.com/report/github.com/k3a/html2text) |
||||
|
||||
# html2text |
||||
|
||||
A simple Golang package to convert HTML to plain text (without non-standard dependencies). |
||||
|
||||
It converts HTML tags to text and also parses HTML entities into characters they represent. |
||||
A `<head>` section of the HTML document, as well as most other tags are stripped out but |
||||
links are properly converted into their href attribute. |
||||
|
||||
It can be used for converting HTML emails into text. |
||||
|
||||
Some tests are installed as well. |
||||
Uses semantic versioning and no breaking changes are planned. |
||||
|
||||
Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom. |
||||
|
||||
## Install |
||||
```bash |
||||
go get github.com/k3a/html2text |
||||
``` |
||||
|
||||
## Usage |
||||
|
||||
```go |
||||
package main |
||||
|
||||
import ( |
||||
"fmt" |
||||
"github.com/k3a/html2text" |
||||
) |
||||
|
||||
func main() { |
||||
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>` |
||||
|
||||
plain := html2text.HTML2Text(html) |
||||
|
||||
fmt.Println(plain) |
||||
} |
||||
|
||||
/* Outputs: |
||||
|
||||
clean text |
||||
*/ |
||||
|
||||
``` |
||||
|
||||
To see all features, please look into `html2text_test.go`. |
||||
|
||||
## Alternatives |
||||
- https://github.com/jaytaylor/html2text (heavier, with more features) |
||||
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust) |
||||
|
||||
## License |
||||
|
||||
MIT |
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,333 @@
|
||||
package html2text |
||||
|
||||
import ( |
||||
"bytes" |
||||
"regexp" |
||||
"strconv" |
||||
"strings" |
||||
) |
||||
|
||||
// Line break constants.
//
// Deprecated: select the line break style per call with
// HTML2TextWithOptions(text, WithUnixLineBreaks()) instead.
const (
	// WIN_LBR is the Windows-style line break.
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix-style line break.
	UNIX_LBR = "\n"
)
||||
|
||||
var legacyLBR = WIN_LBR |
||||
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`) |
||||
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`) |
||||
var badLinkHrefRE = regexp.MustCompile(`javascript:`) |
||||
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`) |
||||
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`) |
||||
|
||||
// options holds the conversion settings assembled from the Option values
// passed to HTML2TextWithOptions.
type options struct {
	// lbr is the line break sequence written to the output.
	lbr string
	// linksInnerText keeps the inner text of <a> tags and appends the href
	// in angle brackets after it.
	linksInnerText bool
	// listPrefix is written after the line break that starts each <li>;
	// when empty only the line break is written.
	listPrefix string
}
||||
|
||||
func newOptions() *options { |
||||
// apply defaults
|
||||
return &options{ |
||||
lbr: WIN_LBR, |
||||
} |
||||
} |
||||
|
||||
// Option is a functional option
|
||||
type Option func(*options) |
||||
|
||||
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
|
||||
func WithUnixLineBreaks() Option { |
||||
return func(o *options) { |
||||
o.lbr = UNIX_LBR |
||||
} |
||||
} |
||||
|
||||
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
|
||||
// Example: click news <http://bit.ly/2n4wXRs>
|
||||
func WithLinksInnerText() Option { |
||||
return func(o *options) { |
||||
o.linksInnerText = true |
||||
} |
||||
} |
||||
|
||||
// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
|
||||
func WithListSupportPrefix(prefix string) Option { |
||||
return func(o *options) { |
||||
o.listPrefix = prefix |
||||
} |
||||
} |
||||
|
||||
// WithListSupport formats <ul> and <li> lists with " - " prefix
|
||||
func WithListSupport() Option { |
||||
return WithListSupportPrefix(" - ") |
||||
} |
||||
|
||||
func parseHTMLEntity(entName string) (string, bool) { |
||||
if r, ok := entity[entName]; ok { |
||||
return string(r), true |
||||
} |
||||
|
||||
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 { |
||||
var ( |
||||
err error |
||||
n int64 |
||||
digits = match[1] |
||||
) |
||||
|
||||
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') { |
||||
n, err = strconv.ParseInt(digits[1:], 16, 64) |
||||
} else { |
||||
n, err = strconv.ParseInt(digits, 10, 64) |
||||
} |
||||
|
||||
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) { |
||||
return string(rune(n)), true |
||||
} |
||||
} |
||||
|
||||
return "", false |
||||
} |
||||
|
||||
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
|
||||
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
|
||||
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
|
||||
func SetUnixLbr(b bool) { |
||||
if b { |
||||
legacyLBR = UNIX_LBR |
||||
} else { |
||||
legacyLBR = WIN_LBR |
||||
} |
||||
} |
||||
|
||||
// HTMLEntitiesToText decodes HTML entities inside a provided
|
||||
// string and returns decoded text
|
||||
func HTMLEntitiesToText(htmlEntsText string) string { |
||||
outBuf := bytes.NewBufferString("") |
||||
inEnt := false |
||||
|
||||
for i, r := range htmlEntsText { |
||||
switch { |
||||
case r == ';' && inEnt: |
||||
inEnt = false |
||||
continue |
||||
|
||||
case r == '&': //possible html entity
|
||||
entName := "" |
||||
isEnt := false |
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0 |
||||
for _, er := range htmlEntsText[i+1:] { |
||||
if er == ';' { |
||||
isEnt = true |
||||
break |
||||
} else { |
||||
entName += string(er) |
||||
} |
||||
|
||||
chars++ |
||||
if chars == 10 { |
||||
break |
||||
} |
||||
} |
||||
|
||||
if isEnt { |
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt { |
||||
outBuf.WriteString(ent) |
||||
inEnt = true |
||||
continue |
||||
} |
||||
} |
||||
} |
||||
|
||||
if !inEnt { |
||||
outBuf.WriteRune(r) |
||||
} |
||||
} |
||||
|
||||
return outBuf.String() |
||||
} |
||||
|
||||
// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	n := outBuf.Len()
	if n == 0 {
		return
	}
	if outBuf.Bytes()[n-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
||||
|
||||
// HTML2Text converts html into a text form
|
||||
func HTML2Text(html string) string { |
||||
var opts []Option |
||||
if legacyLBR == UNIX_LBR { |
||||
opts = append(opts, WithUnixLineBreaks()) |
||||
} |
||||
return HTML2TextWithOptions(html, opts...) |
||||
} |
||||
|
||||
// HTML2TextWithOptions converts html into a text form with additional options
|
||||
func HTML2TextWithOptions(html string, reqOpts ...Option) string { |
||||
opts := newOptions() |
||||
for _, opt := range reqOpts { |
||||
opt(opts) |
||||
} |
||||
|
||||
inLen := len(html) |
||||
tagStart := 0 |
||||
inEnt := false |
||||
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
|
||||
shouldOutput := true |
||||
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
|
||||
hrefs := []string{} |
||||
// new line cannot be printed at the beginning or
|
||||
// for <p> after a new line created by previous <p></p>
|
||||
canPrintNewline := false |
||||
|
||||
outBuf := bytes.NewBufferString("") |
||||
|
||||
for i, r := range html { |
||||
if inLen > 0 && i == inLen-1 { |
||||
// prevent new line at the end of the document
|
||||
canPrintNewline = false |
||||
} |
||||
|
||||
switch { |
||||
// skip new lines and spaces adding a single space if not there yet
|
||||
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
|
||||
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt { |
||||
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
|
||||
writeSpace(outBuf) |
||||
} |
||||
continue |
||||
|
||||
case r == ';' && inEnt: // end of html entity
|
||||
inEnt = false |
||||
continue |
||||
|
||||
case r == '&' && shouldOutput: // possible html entity
|
||||
entName := "" |
||||
isEnt := false |
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0 |
||||
for _, er := range html[i+1:] { |
||||
if er == ';' { |
||||
isEnt = true |
||||
break |
||||
} else { |
||||
entName += string(er) |
||||
} |
||||
|
||||
chars++ |
||||
if chars == 10 { |
||||
break |
||||
} |
||||
} |
||||
|
||||
if isEnt { |
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt { |
||||
outBuf.WriteString(ent) |
||||
inEnt = true |
||||
continue |
||||
} |
||||
} |
||||
|
||||
case r == '<': // start of a tag
|
||||
tagStart = i + 1 |
||||
shouldOutput = false |
||||
continue |
||||
|
||||
case r == '>': // end of a tag
|
||||
shouldOutput = true |
||||
tag := html[tagStart:i] |
||||
tagNameLowercase := strings.ToLower(tag) |
||||
|
||||
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" { |
||||
outBuf.WriteString(opts.lbr) |
||||
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" { |
||||
if opts.listPrefix != "" { |
||||
outBuf.WriteString(opts.lbr + opts.listPrefix) |
||||
} else { |
||||
outBuf.WriteString(opts.lbr) |
||||
} |
||||
} else if headersRE.MatchString(tagNameLowercase) { |
||||
if canPrintNewline { |
||||
outBuf.WriteString(opts.lbr + opts.lbr) |
||||
} |
||||
canPrintNewline = false |
||||
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" { |
||||
// new line
|
||||
outBuf.WriteString(opts.lbr) |
||||
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" { |
||||
if canPrintNewline { |
||||
outBuf.WriteString(opts.lbr + opts.lbr) |
||||
} |
||||
canPrintNewline = false |
||||
} else if opts.linksInnerText && tagNameLowercase == "/a" { |
||||
// end of link
|
||||
// links can be empty can happen if the link matches the badLinkHrefRE
|
||||
if len(hrefs) > 0 { |
||||
outBuf.WriteString(" <") |
||||
outBuf.WriteString(HTMLEntitiesToText(hrefs[0])) |
||||
outBuf.WriteString(">") |
||||
hrefs = hrefs[1:] |
||||
} |
||||
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) { |
||||
// parse link href
|
||||
// add special handling for a tags
|
||||
m := linkTagRE.FindStringSubmatch(tag) |
||||
if len(m) == 5 { |
||||
link := m[2] |
||||
if len(link) == 0 { |
||||
link = m[3] |
||||
if len(link) == 0 { |
||||
link = m[4] |
||||
} |
||||
} |
||||
|
||||
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) { |
||||
hrefs = append(hrefs, link) |
||||
} |
||||
} |
||||
} else if badTagnamesRE.MatchString(tagNameLowercase) { |
||||
// unwanted block
|
||||
badTagStackDepth++ |
||||
|
||||
// if link inner text preservation is not enabled
|
||||
// and the current tag is a link tag, parse its href and output that
|
||||
if !opts.linksInnerText { |
||||
// parse link href
|
||||
m := linkTagRE.FindStringSubmatch(tag) |
||||
if len(m) == 5 { |
||||
link := m[2] |
||||
if len(link) == 0 { |
||||
link = m[3] |
||||
if len(link) == 0 { |
||||
link = m[4] |
||||
} |
||||
} |
||||
|
||||
if !badLinkHrefRE.MatchString(link) { |
||||
outBuf.WriteString(HTMLEntitiesToText(link)) |
||||
} |
||||
} |
||||
} |
||||
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' && |
||||
badTagnamesRE.MatchString(tagNameLowercase[1:]) { |
||||
// end of unwanted block
|
||||
badTagStackDepth-- |
||||
} |
||||
continue |
||||
|
||||
} // switch end
|
||||
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt { |
||||
canPrintNewline = true |
||||
outBuf.WriteRune(r) |
||||
} |
||||
} |
||||
|
||||
return outBuf.String() |
||||
} |
||||
Loading…
Reference in new issue