Browse Source

linkifier: Detect also URLs that do not have a scheme and matrix: URIs

merge-requests/1719/head
Kévin Commaille 2 years ago
parent
commit
dfc4a052de
No known key found for this signature in database
GPG Key ID: C971D9DBC9D678D
  1. 11
      Cargo.lock
  2. 1
      Cargo.toml
  3. 162
      src/utils/string/mod.rs
  4. 82
      src/utils/string/tests.rs

11
Cargo.lock generated

@ -1441,6 +1441,7 @@ dependencies = [
"sourceview5",
"strum",
"thiserror",
"tld",
"tokio",
"tokio-stream",
"tracing",
@ -5176,6 +5177,16 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tld"
version = "2.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73ca5fc340fcb4f52570c502cf443fc22d5521e9ef2bb03528e3634254016cf7"
dependencies = [
"phf",
"phf_codegen",
]
[[package]]
name = "tokio"
version = "1.38.0"

1
Cargo.toml

@ -43,6 +43,7 @@ serde = "1"
serde_json = "1"
strum = { version = "0.26", features = ["derive"] }
thiserror = "1"
tld = "2"
tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync"] }
tokio-stream = { version = "0.1", features = ["sync"] }
tracing = "0.1"

162
src/utils/string.rs → src/utils/string/mod.rs

@ -1,9 +1,14 @@
//! Helper traits and methods for strings.
use std::fmt::{self, Write};
use std::{borrow::Cow, fmt::Write};
use gtk::glib::markup_escape_text;
use linkify::{LinkFinder, LinkKind};
use ruma::MatrixUri;
use url::Url;
#[cfg(test)]
mod tests;
use super::matrix::{find_at_room, MatrixIdUri, AT_ROOM};
use crate::{
@ -12,6 +17,13 @@ use crate::{
session::model::Room,
};
/// The prefix for an email URI.
///
/// Prepended to a detected email address to build the target of its link.
const EMAIL_URI_PREFIX: &str = "mailto:";
/// The prefix for an HTTPS URL.
///
/// Prepended to a detected URL that has no scheme to build the target of its
/// link.
const HTTPS_URI_PREFIX: &str = "https://";
/// The prefix for a `matrix:` URI.
///
/// Searched for in the span that precedes a detected URL, to recompose
/// `matrix:` URIs that the link finder only detects partially.
const MATRIX_URI_PREFIX: &str = "matrix:";
/// Common extensions to strings.
pub trait StrExt {
/// Escape markup for compatibility with Pango.
@ -207,30 +219,34 @@ impl<'a> Linkifier<'a> {
///
/// Returns the list of mentions, if any were found.
pub fn linkify(mut self, text: &str) {
let finder = LinkFinder::new();
let mut finder = LinkFinder::new();
// Allow URLs without a scheme.
finder.url_must_have_scheme(false);
let mut prev_span = None;
for span in finder.spans(text) {
let span_text = span.as_str();
let uri = match span.kind() {
match span.kind() {
Some(LinkKind::Url) => {
if let MentionsMode::WithMentions { pills, room, .. } = &mut self.mentions {
if let Some(pill) = self.inner.maybe_append_mention(span_text, room) {
pills.push(pill);
let is_valid_url = self.append_detected_url(span_text, prev_span);
continue;
}
if is_valid_url {
prev_span = None;
} else {
prev_span = Some(span_text);
}
Some(UriParts {
prefix: None,
uri: span_text,
})
}
Some(LinkKind::Email) => Some(UriParts {
prefix: Some("mailto:"),
uri: span_text,
}),
Some(LinkKind::Email) => {
self.inner
.append_link_opening_tag(format!("{EMAIL_URI_PREFIX}{span_text}"));
self.inner.push_str(&span_text.escape_markup());
self.inner.push_str("</a>");
// The span was a valid email so we will not need to check it for the next span.
prev_span = None;
}
_ => {
if let MentionsMode::WithMentions {
pills,
@ -242,23 +258,104 @@ impl<'a> Linkifier<'a> {
pills.push(pill);
}
prev_span = Some(span_text);
continue;
}
None
self.append_string(span_text);
prev_span = Some(span_text);
}
};
}
}
}
if let Some(uri) = uri {
self.inner.append_link_opening_tag(uri.to_string());
/// Append the given string as plain text.
///
/// The markup of the string is escaped before it is pushed to the output.
fn append_string(&mut self, s: &str) {
    let escaped = s.escape_markup();
    self.inner.push_str(&escaped);
}
/// Append the given URI.
fn append_uri(&mut self, uri: &str, prefix: Option<&str>) {
let full_uri = if let Some(prefix) = prefix {
Cow::Owned(format!("{prefix}{uri}"))
} else {
Cow::Borrowed(uri)
};
if let MentionsMode::WithMentions { pills, room, .. } = &mut self.mentions {
if let Some(pill) = self.inner.maybe_append_mention(full_uri.as_ref(), room) {
pills.push(pill);
return;
}
}
self.inner.push_str(&span_text.escape_markup());
self.inner.append_link_opening_tag(full_uri);
self.append_string(uri);
self.inner.push_str("</a>");
}
/// Append the given string detected as a URL.
///
/// Appends false positives as normal strings, otherwise appends it as a
/// URI.
///
/// Returns `true` if it was detected as a valid URL.
fn append_detected_url(&mut self, detected_url: &str, prev_span: Option<&str>) -> bool {
if Url::parse(detected_url).is_ok() {
// This is a full URL with a scheme, we can trust that it is valid.
self.append_uri(detected_url, None);
return true;
}
// It does not have a scheme, try to split it to get only the domain.
let domain = if let Some((domain, _)) = detected_url.split_once('/') {
// This is a URL with a path component.
domain
} else if let Some((domain, _)) = detected_url.split_once('?') {
// This is a URL with a query component.
domain
} else if let Some((domain, _)) = detected_url.split_once('#') {
// This is a URL with a fragment.
domain
} else {
// It should only contain the full domain.
detected_url
};
// Check that the top-level domain is known.
if !domain.rsplit_once('.').is_some_and(|(_, d)| tld::exist(d)) {
// This is a false positive, treat it like a regular string.
self.append_string(detected_url);
return false;
}
if uri.is_some() {
self.inner.push_str("</a>");
// The LinkFinder does not detect URIs without an authority component, which is
// problematic for `matrix:` URIs. However it detects a link starting from the
// homeserver part, e.g. it detects `example.org` in
// `matrix:r/somewhere:example.org`. We can use that to recompose the full URI
// with the previous span.
// First, detect if we can find the `matrix:` scheme in the previous span.
if let Some(maybe_uri_start) =
prev_span.and_then(|s| s.rfind(MATRIX_URI_PREFIX).map(|pos| &s[pos..]))
{
// See if the whole string is a valid URI.
let maybe_full_uri = format!("{maybe_uri_start}{detected_url}");
if MatrixUri::parse(&maybe_full_uri).is_ok() {
// Remove the start of the URI from the string.
self.inner
.truncate(self.inner.len() - maybe_uri_start.len());
self.append_uri(&maybe_full_uri, None);
return true;
}
}
self.append_uri(detected_url, Some(HTTPS_URI_PREFIX));
true
}
}
@ -278,20 +375,3 @@ enum MentionsMode<'a> {
detect_at_room: bool,
},
}
/// A URI that is possibly split into parts.
///
/// Its `Display` output is the prefix, if any, immediately followed by the
/// URI.
#[derive(Debug, Clone, Copy)]
struct UriParts<'a> {
    /// The part to write before `uri`, if any.
    prefix: Option<&'a str>,
    /// The main part of the URI.
    uri: &'a str,
}

impl<'a> fmt::Display for UriParts<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // A missing prefix contributes nothing to the output.
        write!(f, "{}{}", self.prefix.unwrap_or_default(), self.uri)
    }
}

82
src/utils/string/tests.rs

@ -0,0 +1,82 @@
use super::linkify;
#[test]
fn linkify_text() {
    /// Markup expected when `text` is rendered as a link pointing to `href`.
    fn anchor(href: &str, text: &str) -> String {
        format!(r#"<a href="{href}" title="{href}">{text}</a>"#)
    }

    // Inputs that must become a single link, as `(text, expected target)`.
    let linked = [
        // URLs with scheme.
        (
            "https://gitlab.gnome.org/World/fractal",
            "https://gitlab.gnome.org/World/fractal",
        ),
        (
            "https://matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca",
            "https://matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca",
        ),
        // Email.
        ("admin@matrix.org", "mailto:admin@matrix.org"),
        // URLs without scheme.
        ("gnome.org", "https://gnome.org"),
        (
            "gitlab.gnome.org/World/fractal",
            "https://gitlab.gnome.org/World/fractal",
        ),
        (
            "matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca",
            "https://matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca",
        ),
        // `matrix:` URIs.
        (
            "matrix:r/somewhere:example.org",
            "matrix:r/somewhere:example.org",
        ),
        (
            "matrix:roomid/somewhere:example.org?via=elsewhere.ca",
            "matrix:roomid/somewhere:example.org?via=elsewhere.ca",
        ),
        (
            "matrix:roomid/somewhere:example.org/e/event?via=elsewhere.ca",
            "matrix:roomid/somewhere:example.org/e/event?via=elsewhere.ca",
        ),
        (
            "matrix:u/alice:example.org?action=chat",
            "matrix:u/alice:example.org?action=chat",
        ),
    ];
    for (text, href) in linked {
        assert_eq!(linkify(text), anchor(href, text).as_str());
    }

    // Invalid TLDs: the text must be left untouched.
    for text in ["gnome.invalid", "org.gnome.fractal"] {
        assert_eq!(linkify(text), text);
    }

    // `matrix:` that is not a URI scheme: only the domain becomes a link.
    assert_eq!(
        linkify("My homeserver for matrix: gnome.org"),
        r#"My homeserver for matrix: <a href="https://gnome.org" title="https://gnome.org">gnome.org</a>"#
    );
}
Loading…
Cancel
Save