[bugfix] Use better plaintext representation of status for filtering (#3301)

* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
2 years ago · efd1a4f717
15 changed files with 2682 additions and 61 deletions
--- a/README.md
+++ b/README.md
@ -273,6 +273,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
  - [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
  - [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
 - [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
+- [k3a/html2text](https://github.com/k3a/html2text); HTML-to-text conversion. [MIT License](https://spdx.org/licenses/MIT.html).
 - [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
 - [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
 - [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).
--- a/go.mod
+++ b/go.mod
@ -40,6 +40,7 @@ require (
 	github.com/gorilla/feeds v1.2.0
 	github.com/gorilla/websocket v1.5.2
 	github.com/jackc/pgx/v5 v5.7.1
+	github.com/k3a/html2text v1.2.1
 	github.com/microcosm-cc/bluemonday v1.0.27
 	github.com/miekg/dns v1.1.62
 	github.com/minio/minio-go/v7 v7.0.76
--- a/go.sum
+++ b/go.sum
@ -384,6 +384,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
 github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
 github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
 github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
+github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
+github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
--- a/internal/cache/cache.go
+++ b/internal/cache/cache.go
@ -47,6 +47,11 @@ type Caches struct {
 	// Webfinger provides access to the webfinger URL cache.
 	Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min

+	// TTL cache of statuses -> filterable text fields (TTL=1hr, sweep=5min).
+	// To ensure up-to-date fields, cache is keyed as:
+	// `[status.ID][status.UpdatedAt.Unix()]`
+	StatusesFilterableFields *ttl.Cache[string, []string]
+
 	// prevent pass-by-value.
 	_ nocopy
 }
@ -109,6 +114,7 @@ func (c *Caches) Init() {
 	c.initUserMuteIDs()
 	c.initWebfinger()
 	c.initVisibility()
+	c.initStatusesFilterableFields()
 }

 // Start will start any caches that require a background
@ -119,6 +125,10 @@ func (c *Caches) Start() {
 	tryUntil("starting webfinger cache", 5, func() bool {
 		return c.Webfinger.Start(5 * time.Minute)
 	})
+
+	tryUntil("starting statusesFilterableFields cache", 5, func() bool {
+		return c.StatusesFilterableFields.Start(5 * time.Minute)
+	})
 }

 // Stop will stop any caches that require a background
@ -127,6 +137,7 @@ func (c *Caches) Stop() {
 	log.Infof(nil, "stop: %p", c)

 	tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
+	tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)
 }

 // Sweep will sweep all the available caches to ensure none
@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {
 		24*time.Hour,
 	)
 }
+
+func (c *Caches) initStatusesFilterableFields() {
+	c.StatusesFilterableFields = new(ttl.Cache[string, []string])
+	c.StatusesFilterableFields.Init(
+		0,
+		512,
+		1*time.Hour,
+	)
+}
--- a/internal/gtsmodel/filter.go
+++ b/internal/gtsmodel/filter.go
@ -20,6 +20,8 @@ package gtsmodel
 import (
 	"regexp"
 	"time"
+
+	"github.com/superseriousbusiness/gotosocial/internal/util"
 )

 // Filter stores a filter created by a local account.
@ -61,14 +63,23 @@ type FilterKeyword struct {

 // Compile will compile this FilterKeyword as a prepared regular expression.
 func (k *FilterKeyword) Compile() (err error) {
-	var wordBreak string
-	if k.WholeWord != nil && *k.WholeWord {
-		wordBreak = `\b`
+	var (
+		wordBreakStart string
+		wordBreakEnd   string
+	)
+
+	if util.PtrOrZero(k.WholeWord) {
+		// Either word boundary or
+		// whitespace or start of line.
+		wordBreakStart = `(?:\b|\s|^)`
+		// Either word boundary or
+		// whitespace or end of line.
+		wordBreakEnd = `(?:\b|\s|$)`
 	}

 	// Compile keyword filter regexp.
 	quoted := regexp.QuoteMeta(k.Keyword)
-	k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak)
+	k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)
 	return // caller is expected to wrap this error
 }

--- a/internal/typeutils/internaltofrontend.go
+++ b/internal/typeutils/internaltofrontend.go
@ -21,6 +21,8 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"slices"
+	"strconv"
 	"strings"
 	"time"

@ -35,7 +37,6 @@ import (
 	"github.com/superseriousbusiness/gotosocial/internal/language"
 	"github.com/superseriousbusiness/gotosocial/internal/log"
 	"github.com/superseriousbusiness/gotosocial/internal/media"
-	"github.com/superseriousbusiness/gotosocial/internal/text"
 	"github.com/superseriousbusiness/gotosocial/internal/uris"
 	"github.com/superseriousbusiness/gotosocial/internal/util"
 )
@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
 		return nil, nil
 	}

-	// Extract text fields from the status that we will match filters against.
-	fields := filterableTextFields(s)
+	// Key this status based on ID + last updated time,
+	// to ensure we always filter on latest version.
+	statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
+
+	// Check if we have filterable fields cached for this status.
+	cache := c.state.Caches.StatusesFilterableFields
+	fields, stored := cache.Get(statusKey)
+	if !stored {
+		// We don't have filterable fields
+		// cached, calculate + cache now.
+		fields = filterableFields(s)
+		cache.Set(statusKey, fields)
+	}

 	// Record all matching warn filters and the reasons they matched.
 	filterResults := make([]apimodel.FilterResult, 0, len(filters))
 	for _, filter := range filters {
 		if !filterAppliesInContext(filter, filterContext) {
-			// Filter doesn't apply to this context.
+			// Filter doesn't apply
+			// to this context.
 			continue
 		}
+
 		if filter.Expired(now) {
+			// Filter doesn't
+			// apply anymore.
 			continue
 		}

-		// List all matching keywords.
+		// Assemble matching keywords (if any) from this filter.
 		keywordMatches := make([]string, 0, len(filter.Keywords))
-		for _, filterKeyword := range filter.Keywords {
-			var isMatch bool
-			for _, field := range fields {
-				if filterKeyword.Regexp.MatchString(field) {
-					isMatch = true
-					break
-				}
-			}
-			if isMatch {
-				keywordMatches = append(keywordMatches, filterKeyword.Keyword)
+		for _, keyword := range filter.Keywords {
+			// Check if at least one filterable field
+			// in the status matches on this filter.
+			if slices.ContainsFunc(
+				fields,
+				func(field string) bool {
+					return keyword.Regexp.MatchString(field)
+				},
+			) {
+				// At least one field matched on this filter.
+				keywordMatches = append(keywordMatches, keyword.Keyword)
 			}
 		}

@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
 	return filterResults, nil
 }

-// filterableTextFields returns all text from a status that we might want to filter on:
-// - content
-// - content warning
-// - media descriptions
-// - poll options
-func filterableTextFields(s *gtsmodel.Status) []string {
-	fieldCount := 2 + len(s.Attachments)
-	if s.Poll != nil {
-		fieldCount += len(s.Poll.Options)
-	}
-	fields := make([]string, 0, fieldCount)
-
-	if s.Content != "" {
-		fields = append(fields, text.SanitizeToPlaintext(s.Content))
-	}
-	if s.ContentWarning != "" {
-		fields = append(fields, s.ContentWarning)
-	}
-	for _, attachment := range s.Attachments {
-		if attachment.Description != "" {
-			fields = append(fields, attachment.Description)
-		}
-	}
-	if s.Poll != nil {
-		for _, option := range s.Poll.Options {
-			if option != "" {
-				fields = append(fields, option)
-			}
-		}
-	}
-
-	return fields
-}
-
 // filterAppliesInContext returns whether a given filter applies in a given context.
 func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
 	switch filterContext {
--- a/internal/typeutils/internaltofrontend_test.go
+++ b/internal/typeutils/internaltofrontend_test.go
@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {

 // Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
 func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
-	testStatus := suite.testStatuses["admin_account_status_1"]
+	testStatus := new(gtsmodel.Status)
+	*testStatus = *suite.testStatuses["admin_account_status_1"]
 	testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`

 	if boost {
-		// Modify a fixture boost into a boost of the above status.
-		boostStatus := suite.testStatuses["admin_account_status_4"]
-		boostStatus.BoostOf = testStatus
-		boostStatus.BoostOfID = testStatus.ID
-		testStatus = boostStatus
+		boost, err := suite.typeconverter.StatusToBoost(
+			context.Background(),
+			testStatus,
+			suite.testAccounts["admin_account"],
+			"",
+		)
+		if err != nil {
+			suite.FailNow(err.Error())
+		}
+		testStatus = boost
 	}

 	requestingAccount := suite.testAccounts["local_account_1"]
@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
 		[]*gtsmodel.Filter{filter},
 		nil,
 	)
-	if suite.NoError(err) {
-		suite.NotEmpty(apiStatus.Filtered)
+	if err != nil {
+		suite.FailNow(err.Error())
 	}
+
+	suite.NotEmpty(apiStatus.Filtered)
 }

 func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
--- a/internal/typeutils/util.go
+++ b/internal/typeutils/util.go
@ -27,6 +27,7 @@ import (
 	"strconv"
 	"strings"

+	"github.com/k3a/html2text"
 	apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
 	"github.com/superseriousbusiness/gotosocial/internal/config"
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -284,3 +285,64 @@ func ContentToContentLanguage(

 	return contentStr, langTagStr
 }
+
+// filterableFields returns text fields from
+// a status that we might want to filter on:
+//
+//   - content warning
+//   - content (converted to plaintext from HTML; links become "inner text <href>")
+//   - media descriptions
+//   - poll options
+//
+// Each field should be filtered separately.
+// This avoids scenarios where false-positive
+// multiple-word matches can be made by matching
+// the last word of one field + the first word
+// of the next field together.
+func filterableFields(s *gtsmodel.Status) []string {
+	// Upper bound on field count: content warning + content + one per attachment (+ one per poll option).
+	fieldCount := 2 + len(s.Attachments)
+	if s.Poll != nil {
+		fieldCount += len(s.Poll.Options)
+	}
+	fields := make([]string, 0, fieldCount)
+
+	// Content warning / title.
+	if s.ContentWarning != "" {
+		fields = append(fields, s.ContentWarning)
+	}
+
+	// Status content. Though we have raw text
+	// available for statuses created on our
+	// instance, use the html2text version to
+	// remove markdown-formatting characters
+	// and ensure more consistent filtering.
+	if s.Content != "" {
+		text := html2text.HTML2TextWithOptions(
+			s.Content,
+			html2text.WithLinksInnerText(),
+			html2text.WithUnixLineBreaks(),
+		)
+		if text != "" {
+			fields = append(fields, text)
+		}
+	}
+
+	// Media descriptions.
+	for _, attachment := range s.Attachments {
+		if attachment.Description != "" {
+			fields = append(fields, attachment.Description)
+		}
+	}
+
+	// Poll options.
+	if s.Poll != nil {
+		for _, opt := range s.Poll.Options {
+			if opt != "" {
+				fields = append(fields, opt)
+			}
+		}
+	}
+
+	return fields
+}
--- a/internal/typeutils/util_test.go
+++ b/internal/typeutils/util_test.go
@ -21,6 +21,7 @@ import (
 	"context"
 	"testing"

+	"github.com/stretchr/testify/assert"
 	"github.com/superseriousbusiness/gotosocial/internal/config"
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
 	"github.com/superseriousbusiness/gotosocial/internal/language"
@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) {
 		}
 	}
 }
+
+func TestFilterableText(t *testing.T) {
+	type testcase struct {
+		status         *gtsmodel.Status
+		expectedFields []string
+	}
+
+	for _, testcase := range []testcase{
+		{
+			status: &gtsmodel.Status{
+				ContentWarning: "This is a test status",
+				Content:        `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
+			},
+			expectedFields: []string{
+				"This is a test status",
+				"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.",
+			},
+		},
+		{
+			status: &gtsmodel.Status{
+				Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
+			},
+			expectedFields: []string{
+				"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>",
+			},
+		},
+		{
+			status: &gtsmodel.Status{
+				ContentWarning: "Nerd stuff",
+				Content:        `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
+				Attachments: []*gtsmodel.MediaAttachment{
+					{
+						Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
+					},
+					{
+						Description: `Another media attachment`,
+					},
+				},
+				Poll: &gtsmodel.Poll{
+					Options: []string{
+						"Poll option 1",
+						"Poll option 2",
+					},
+				},
+			},
+			expectedFields: []string{
+				"Nerd stuff",
+				"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.",
+				"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.",
+				"Another media attachment",
+				"Poll option 1",
+				"Poll option 2",
+			},
+		},
+	} {
+		fields := filterableFields(testcase.status)
+		assert.Equal(t, testcase.expectedFields, fields)
+	}
+}
--- a/vendor/github.com/k3a/html2text/.travis.yml
+++ b/vendor/github.com/k3a/html2text/.travis.yml
@ -0,0 +1,10 @@
+language: go
+go:
+  - master
+before_install:
+  - go get github.com/axw/gocov/gocov
+  - go get github.com/mattn/goveralls
+  - if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
+script:
+  - $HOME/gopath/bin/goveralls -service=travis-ci
+
--- a/vendor/github.com/k3a/html2text/LICENSE
+++ b/vendor/github.com/k3a/html2text/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/vendor/github.com/k3a/html2text/README.md
+++ b/vendor/github.com/k3a/html2text/README.md
@ -0,0 +1,60 @@
+[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
+[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
+[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
+[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
+
+# html2text
+
+A simple Golang package to convert HTML to plain text (without non-standard dependencies).
+
+It converts HTML tags to text and also parses HTML entities into characters they represent.
+The `<head>` section of the HTML document, as well as most other tags, is stripped out, but
+links are properly converted into their href attribute.
+
+It can be used for converting HTML emails into text.
+
+Some tests are installed as well.
+Uses semantic versioning and no breaking changes are planned.
+
+Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API-stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
+
+## Install
+```bash
+go get github.com/k3a/html2text
+```
+
+## Usage
+
+```go
+package main
+
+import (
+	"fmt"
+	"github.com/k3a/html2text"
+)
+
+func main() {
+	html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
+	
+	plain := html2text.HTML2Text(html)
+			  
+	fmt.Println(plain)
+}
+
+/*	Outputs:
+
+	clean text
+*/
+
+```
+
+To see all features, please look into `html2text_test.go`.
+
+## Alternatives
+- https://github.com/jaytaylor/html2text (heavier, with more features)
+- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
+
+## License
+
+MIT
+
--- a/vendor/github.com/k3a/html2text/entity.go
+++ b/vendor/github.com/k3a/html2text/entity.go
--- a/vendor/github.com/k3a/html2text/html2text.go
+++ b/vendor/github.com/k3a/html2text/html2text.go
@ -0,0 +1,333 @@
+package html2text
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// Line break constants: WIN_LBR is the Windows-style "\r\n", UNIX_LBR is the Unix-style "\n".
+// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks()) instead.
+const (
+	WIN_LBR  = "\r\n"
+	UNIX_LBR = "\n"
+)
+
+var legacyLBR = WIN_LBR
+var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
+var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
+var badLinkHrefRE = regexp.MustCompile(`javascript:`)
+var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
+var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
+
+type options struct {
+	lbr            string
+	linksInnerText bool
+	listPrefix     string
+}
+
+func newOptions() *options {
+	// apply defaults
+	return &options{
+		lbr: WIN_LBR,
+	}
+}
+
+// Option is a functional option
+type Option func(*options)
+
+// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
+func WithUnixLineBreaks() Option {
+	return func(o *options) {
+		o.lbr = UNIX_LBR
+	}
+}
+
+// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
+// Example: click news <http://bit.ly/2n4wXRs>
+func WithLinksInnerText() Option {
+	return func(o *options) {
+		o.linksInnerText = true
+	}
+}
+
+// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
+func WithListSupportPrefix(prefix string) Option {
+	return func(o *options) {
+		o.listPrefix = prefix
+	}
+}
+
+// WithListSupport formats <ul> and <li> lists with " - " prefix
+func WithListSupport() Option {
+	return WithListSupportPrefix(" - ")
+}
+
+func parseHTMLEntity(entName string) (string, bool) {
+	if r, ok := entity[entName]; ok {
+		return string(r), true
+	}
+
+	if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
+		var (
+			err    error
+			n      int64
+			digits = match[1]
+		)
+
+		if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
+			n, err = strconv.ParseInt(digits[1:], 16, 64)
+		} else {
+			n, err = strconv.ParseInt(digits, 10, 64)
+		}
+
+		if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
+			return string(rune(n)), true
+		}
+	}
+
+	return "", false
+}
+
+// SetUnixLbr sets the package-level line break used by the legacy HTML2Text function:
+// true selects Unix-style line breaks ("\n"), false selects Windows-style ("\r\n", the default).
+// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks()) instead.
+func SetUnixLbr(b bool) {
+	if b {
+		legacyLBR = UNIX_LBR
+	} else {
+		legacyLBR = WIN_LBR
+	}
+}
+
+// HTMLEntitiesToText decodes HTML entities inside a provided
+// string and returns decoded text
+func HTMLEntitiesToText(htmlEntsText string) string {
+	outBuf := bytes.NewBufferString("")
+	inEnt := false
+
+	for i, r := range htmlEntsText {
+		switch {
+		case r == ';' && inEnt:
+			inEnt = false
+			continue
+
+		case r == '&': //possible html entity
+			entName := ""
+			isEnt := false
+
+			// parse the entity name - max 10 chars
+			chars := 0
+			for _, er := range htmlEntsText[i+1:] {
+				if er == ';' {
+					isEnt = true
+					break
+				} else {
+					entName += string(er)
+				}
+
+				chars++
+				if chars == 10 {
+					break
+				}
+			}
+
+			if isEnt {
+				if ent, isEnt := parseHTMLEntity(entName); isEnt {
+					outBuf.WriteString(ent)
+					inEnt = true
+					continue
+				}
+			}
+		}
+
+		if !inEnt {
+			outBuf.WriteRune(r)
+		}
+	}
+
+	return outBuf.String()
+}
+
+func writeSpace(outBuf *bytes.Buffer) {
+	bts := outBuf.Bytes()
+	if len(bts) > 0 && bts[len(bts)-1] != ' ' {
+		outBuf.WriteString(" ")
+	}
+}
+
+// HTML2Text converts html into a text form
+func HTML2Text(html string) string {
+	var opts []Option
+	if legacyLBR == UNIX_LBR {
+		opts = append(opts, WithUnixLineBreaks())
+	}
+	return HTML2TextWithOptions(html, opts...)
+}
+
+// HTML2TextWithOptions converts html into a text form, applying reqOpts over the defaults ("\r\n" line breaks, links replaced by their href).
+func HTML2TextWithOptions(html string, reqOpts ...Option) string {
+	opts := newOptions()
+	for _, opt := range reqOpts {
+		opt(opts)
+	}
+
+	inLen := len(html)
+	tagStart := 0
+	inEnt := false
+	badTagStackDepth := 0 // > 0 while inside an unwanted block matched by badTagnamesRE (e.g. <head>...</head>); output is suppressed
+	shouldOutput := true
+	// hrefs of opened <a> tags: appended when a link tag is parsed, consumed from the front at </a> and written after the tag's inner text (for opts.linksInnerText only)
+	hrefs := []string{}
+	// new line cannot be printed at the beginning or
+	// for <p> after a new line created by previous <p></p>
+	canPrintNewline := false
+
+	outBuf := bytes.NewBufferString("")
+
+	for i, r := range html {
+		if inLen > 0 && i == inLen-1 {
+			// prevent new line at the end of the document
+			canPrintNewline = false
+		}
+
+		switch {
+		// skip new lines and spaces adding a single space if not there yet
+		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
+			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
+			if shouldOutput && badTagStackDepth == 0 && !inEnt {
+				//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
+				writeSpace(outBuf)
+			}
+			continue
+
+		case r == ';' && inEnt: // end of html entity
+			inEnt = false
+			continue
+
+		case r == '&' && shouldOutput: // possible html entity
+			entName := ""
+			isEnt := false
+
+			// parse the entity name - max 10 chars
+			chars := 0
+			for _, er := range html[i+1:] {
+				if er == ';' {
+					isEnt = true
+					break
+				} else {
+					entName += string(er)
+				}
+
+				chars++
+				if chars == 10 {
+					break
+				}
+			}
+
+			if isEnt {
+				if ent, isEnt := parseHTMLEntity(entName); isEnt {
+					outBuf.WriteString(ent)
+					inEnt = true
+					continue
+				}
+			}
+
+		case r == '<': // start of a tag
+			tagStart = i + 1
+			shouldOutput = false
+			continue
+
+		case r == '>': // end of a tag
+			shouldOutput = true
+			tag := html[tagStart:i]
+			tagNameLowercase := strings.ToLower(tag)
+
+			if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
+				outBuf.WriteString(opts.lbr)
+			} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
+				if opts.listPrefix != "" {
+					outBuf.WriteString(opts.lbr + opts.listPrefix)
+				} else {
+					outBuf.WriteString(opts.lbr)
+				}
+			} else if headersRE.MatchString(tagNameLowercase) {
+				if canPrintNewline {
+					outBuf.WriteString(opts.lbr + opts.lbr)
+				}
+				canPrintNewline = false
+			} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
+				// new line
+				outBuf.WriteString(opts.lbr)
+			} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
+				if canPrintNewline {
+					outBuf.WriteString(opts.lbr + opts.lbr)
+				}
+				canPrintNewline = false
+			} else if opts.linksInnerText && tagNameLowercase == "/a" {
+				// end of link; hrefs can be empty here, e.g. when the link's href matched badLinkHrefRE
+				// NOTE(review): an <a> without href is counted by the badTagnamesRE branch below, but its </a> lands here and never decrements badTagStackDepth - confirm
+				if len(hrefs) > 0 {
+					outBuf.WriteString(" <")
+					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
+					outBuf.WriteString(">")
+					hrefs = hrefs[1:]
+				}
+			} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
+				// parse link href
+				// add special handling for a tags
+				m := linkTagRE.FindStringSubmatch(tag)
+				if len(m) == 5 {
+					link := m[2]
+					if len(link) == 0 {
+						link = m[3]
+						if len(link) == 0 {
+							link = m[4]
+						}
+					}
+
+					if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
+						hrefs = append(hrefs, link)
+					}
+				}
+			} else if badTagnamesRE.MatchString(tagNameLowercase) {
+				// unwanted block
+				badTagStackDepth++
+
+				// if link inner text preservation is not enabled
+				// and the current tag is a link tag, parse its href and output that
+				if !opts.linksInnerText {
+					// parse link href
+					m := linkTagRE.FindStringSubmatch(tag)
+					if len(m) == 5 {
+						link := m[2]
+						if len(link) == 0 {
+							link = m[3]
+							if len(link) == 0 {
+								link = m[4]
+							}
+						}
+
+						if !badLinkHrefRE.MatchString(link) {
+							outBuf.WriteString(HTMLEntitiesToText(link))
+						}
+					}
+				}
+			} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
+				badTagnamesRE.MatchString(tagNameLowercase[1:]) {
+				// end of unwanted block
+				badTagStackDepth--
+			}
+			continue
+
+		} // switch end
+
+		if shouldOutput && badTagStackDepth == 0 && !inEnt {
+			canPrintNewline = true
+			outBuf.WriteRune(r)
+		}
+	}
+
+	return outBuf.String()
+}
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -446,6 +446,9 @@ github.com/josharian/intern
 # github.com/json-iterator/go v1.1.12
 ## explicit; go 1.12
 github.com/json-iterator/go
+# github.com/k3a/html2text v1.2.1
+## explicit; go 1.16
+github.com/k3a/html2text
 # github.com/klauspost/compress v1.17.9
 ## explicit; go 1.20
 github.com/klauspost/compress