Browse Source

feat: Relax URL matching (#3925)

* feat: Relax URL matching

Instead of only linkifying text that has an explicit http or https scheme,
use xurls.Relaxed, which also matches links with known TLDs. This means that
text like 'banana.com' will also be matched, despite the missing
http/https scheme. This also works to linkify email addresses, which is
handy.

This should also ensure we catch links without a scheme for the purpose
of spam checking.
pull/3917/head
Daenney 12 months ago committed by GitHub
parent
commit
1bf40e755c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      internal/filter/spam/statusable.go
  2. 14
      internal/regexes/regexes.go
  3. 2
      internal/text/markdown.go
  4. 2
      internal/text/plain.go

2
internal/filter/spam/statusable.go

@ -375,7 +375,7 @@ func (f *Filter) errantLinks(
}
// Find + parse every URL-like string in the status (links with or without a scheme, and emails).
rawLinks := regexes.LinkScheme.FindAllString(concat, -1)
rawLinks := regexes.URLLike.FindAllString(concat, -1)
links := make([]preppedLink, 0, len(rawLinks))
for _, rawLink := range rawLinks {
linkURI, err := url.Parse(rawLink)

14
internal/regexes/regexes.go

@ -22,7 +22,7 @@ import (
"regexp"
"sync"
"mvdan.cc/xurls/v2"
xurls "mvdan.cc/xurls/v2"
)
const (
@ -40,7 +40,6 @@ const (
reports = "reports"
accepts = "accepts"
schemes = `(http|https)://` // Allowed URI protocols for parsing links in text.
alphaNumeric = `\p{L}\p{M}*|\p{N}` // A single number or script character in any language, including chars with accents.
usernameGrp = `(?:` + alphaNumeric + `|\.|\-|\_)` // Non-capturing group that matches against a single valid username character.
domainGrp = `(?:` + alphaNumeric + `|\.|\-|\:)` // Non-capturing group that matches against a single valid domain character.
@ -79,14 +78,9 @@ const (
)
var (
// LinkScheme captures http/https schemes in URLs.
LinkScheme = func() *regexp.Regexp {
rgx, err := xurls.StrictMatchingScheme(schemes)
if err != nil {
panic(err)
}
return rgx
}()
// URLLike captures anything that resembles a URL. This includes URLs
// with or without a scheme, and emails.
URLLike = xurls.Relaxed()
// MentionName captures the username and domain part from
// a mention string such as @whatever_user@example.org,

2
internal/text/markdown.go

@ -139,7 +139,7 @@ func (f *Formatter) fromMarkdown(
},
// Turns URLs into links.
extension.NewLinkify(
extension.WithLinkifyURLRegexp(regexes.LinkScheme),
extension.WithLinkifyURLRegexp(regexes.URLLike),
),
extension.Strikethrough,
),

2
internal/text/plain.go

@ -168,7 +168,7 @@ func (f *Formatter) fromPlain(
},
// Turns URLs into links.
extension.NewLinkify(
extension.WithLinkifyURLRegexp(regexes.LinkScheme),
extension.WithLinkifyURLRegexp(regexes.URLLike),
),
),
)

Loading…
Cancel
Save