From dfc4a052de1d47a859d03dfc17a0c4e026558158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A9vin=20Commaille?= Date: Wed, 10 Jul 2024 16:10:40 +0200 Subject: [PATCH] linkifier: Detect also URLs that do not have a scheme and matrix: URIs --- Cargo.lock | 11 ++ Cargo.toml | 1 + src/utils/{string.rs => string/mod.rs} | 162 ++++++++++++++++++------- src/utils/string/tests.rs | 82 +++++++++++++ 4 files changed, 215 insertions(+), 41 deletions(-) rename src/utils/{string.rs => string/mod.rs} (61%) create mode 100644 src/utils/string/tests.rs diff --git a/Cargo.lock b/Cargo.lock index daad96f7..fff6a5ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1441,6 +1441,7 @@ dependencies = [ "sourceview5", "strum", "thiserror", + "tld", "tokio", "tokio-stream", "tracing", @@ -5176,6 +5177,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tld" +version = "2.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ca5fc340fcb4f52570c502cf443fc22d5521e9ef2bb03528e3634254016cf7" +dependencies = [ + "phf", + "phf_codegen", +] + [[package]] name = "tokio" version = "1.38.0" diff --git a/Cargo.toml b/Cargo.toml index b690f65a..73fdbe73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ serde = "1" serde_json = "1" strum = { version = "0.26", features = ["derive"] } thiserror = "1" +tld = "2" tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync"] } tokio-stream = { version = "0.1", features = ["sync"] } tracing = "0.1" diff --git a/src/utils/string.rs b/src/utils/string/mod.rs similarity index 61% rename from src/utils/string.rs rename to src/utils/string/mod.rs index 3f6279ac..c4888465 100644 --- a/src/utils/string.rs +++ b/src/utils/string/mod.rs @@ -1,9 +1,14 @@ //! Helper traits and methods for strings. -use std::fmt::{self, Write}; +use std::{borrow::Cow, fmt::Write}; use gtk::glib::markup_escape_text; use linkify::{LinkFinder, LinkKind}; +use ruma::MatrixUri; +use url::Url; + +#[cfg(test)] +mod tests; use super::matrix::{find_at_room, MatrixIdUri, AT_ROOM}; use crate::{ @@ -12,6 +17,13 @@ use crate::{ session::model::Room, }; +/// The prefix for an email URI. +const EMAIL_URI_PREFIX: &str = "mailto:"; +/// The prefix for a HTTPS URL. +const HTTPS_URI_PREFIX: &str = "https://"; +/// The prefix for a `matrix:` URI. +const MATRIX_URI_PREFIX: &str = "matrix:"; + /// Common extensions to strings. pub trait StrExt { /// Escape markup for compatibility with Pango. @@ -207,30 +219,34 @@ impl<'a> Linkifier<'a> { /// /// Returns the list of mentions, if any where found. pub fn linkify(mut self, text: &str) { - let finder = LinkFinder::new(); + let mut finder = LinkFinder::new(); + // Allow URLS without a scheme. + finder.url_must_have_scheme(false); + + let mut prev_span = None; for span in finder.spans(text) { let span_text = span.as_str(); - let uri = match span.kind() { + match span.kind() { Some(LinkKind::Url) => { - if let MentionsMode::WithMentions { pills, room, .. } = &mut self.mentions { - if let Some(pill) = self.inner.maybe_append_mention(span_text, room) { - pills.push(pill); + let is_valid_url = self.append_detected_url(span_text, prev_span); - continue; - } + if is_valid_url { + prev_span = None; + } else { + prev_span = Some(span_text); } - - Some(UriParts { - prefix: None, - uri: span_text, - }) } - Some(LinkKind::Email) => Some(UriParts { - prefix: Some("mailto:"), - uri: span_text, - }), + Some(LinkKind::Email) => { + self.inner + .append_link_opening_tag(format!("{EMAIL_URI_PREFIX}{span_text}")); + self.inner.push_str(&span_text.escape_markup()); + self.inner.push_str(""); + + // The span was a valid email so we will not need to check it for the next span. + prev_span = None; + } _ => { if let MentionsMode::WithMentions { pills, @@ -242,23 +258,104 @@ impl<'a> Linkifier<'a> { pills.push(pill); } + prev_span = Some(span_text); continue; } - None + self.append_string(span_text); + prev_span = Some(span_text); } - }; + } + } + } - if let Some(uri) = uri { - self.inner.append_link_opening_tag(uri.to_string()); + /// Append the given string. + /// + /// Escapes the markup of the string. + fn append_string(&mut self, s: &str) { + self.inner.push_str(&s.escape_markup()); + } + + /// Append the given URI. + fn append_uri(&mut self, uri: &str, prefix: Option<&str>) { + let full_uri = if let Some(prefix) = prefix { + Cow::Owned(format!("{prefix}{uri}")) + } else { + Cow::Borrowed(uri) + }; + + if let MentionsMode::WithMentions { pills, room, .. } = &mut self.mentions { + if let Some(pill) = self.inner.maybe_append_mention(full_uri.as_ref(), room) { + pills.push(pill); + + return; } + } - self.inner.push_str(&span_text.escape_markup()); + self.inner.append_link_opening_tag(full_uri); + self.append_string(uri); + self.inner.push_str(""); + } + + /// Append the given string detected as a URL. + /// + /// Appends false positives as normal strings, otherwise appends it as a + /// URI. + /// + /// Returns `true` if it was detected as a valid URL. + fn append_detected_url(&mut self, detected_url: &str, prev_span: Option<&str>) -> bool { + if Url::parse(detected_url).is_ok() { + // This is a full URL with a scheme, we can trust that it is valid. + self.append_uri(detected_url, None); + return true; + } + + // It does not have a scheme, try to split it to get only the domain. + let domain = if let Some((domain, _)) = detected_url.split_once('/') { + // This is a URL with a path component. + domain + } else if let Some((domain, _)) = detected_url.split_once('?') { + // This is a URL with a query component. + domain + } else if let Some((domain, _)) = detected_url.split_once('#') { + // This is a URL with a fragment. + domain + } else { + // It should only contain the full domain. + detected_url + }; + + // Check that the top-level domain is known. + if !domain.rsplit_once('.').is_some_and(|(_, d)| tld::exist(d)) { + // This is a false positive, treat it like a regular string. + self.append_string(detected_url); + return false; + } - if uri.is_some() { - self.inner.push_str(""); + // The LinkFinder does not detect URIs without an authority component, which is + // problematic for `matrix:` URIs. However it detects a link starting from the + // homeserver part, e.g. it detects `example.org` in + // `matrix:r/somewhere:example.org`. We can use that to recompose the full URI + // with the previous span. + + // First, detect if we can find the `matrix:` scheme in the previous span. + if let Some(maybe_uri_start) = + prev_span.and_then(|s| s.rfind(MATRIX_URI_PREFIX).map(|pos| &s[pos..])) + { + // See if the whole string is a valid URI. + let maybe_full_uri = format!("{maybe_uri_start}{detected_url}"); + if MatrixUri::parse(&maybe_full_uri).is_ok() { + // Remove the start of the URI from the string. + self.inner + .truncate(self.inner.len() - maybe_uri_start.len()); + self.append_uri(&maybe_full_uri, None); + + return true; } } + + self.append_uri(detected_url, Some(HTTPS_URI_PREFIX)); + true } } @@ -278,20 +375,3 @@ enum MentionsMode<'a> { detect_at_room: bool, }, } - -/// A URI that is possibly into parts. -#[derive(Debug, Clone, Copy)] -struct UriParts<'a> { - prefix: Option<&'a str>, - uri: &'a str, -} - -impl<'a> fmt::Display for UriParts<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(prefix) = self.prefix { - f.write_str(prefix)?; - } - - f.write_str(self.uri) - } -} diff --git a/src/utils/string/tests.rs b/src/utils/string/tests.rs new file mode 100644 index 00000000..b50f6d22 --- /dev/null +++ b/src/utils/string/tests.rs @@ -0,0 +1,82 @@ +use super::linkify; + +#[test] +fn linkify_text() { + // URLs with scheme. + let text = "https://gitlab.gnome.org/World/fractal"; + assert_eq!( + linkify(text), + r#"https://gitlab.gnome.org/World/fractal"# + ); + + let text = "https://matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca"; + assert_eq!( + linkify(text), + r#"https://matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca"# + ); + + // Email. + let text = "admin@matrix.org"; + assert_eq!( + linkify(text), + r#"admin@matrix.org"# + ); + + // URLs without scheme. + let text = "gnome.org"; + assert_eq!( + linkify(text), + r#"gnome.org"# + ); + + let text = "gitlab.gnome.org/World/fractal"; + assert_eq!( + linkify(text), + r#"gitlab.gnome.org/World/fractal"# + ); + + let text = "matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca"; + assert_eq!( + linkify(text), + r#"matrix.to/#/!somewhere%3Aexample.org?via=elsewhere.ca"# + ); + + // `matrix:` URIs. + let text = "matrix:r/somewhere:example.org"; + assert_eq!( + linkify(text), + r#"matrix:r/somewhere:example.org"# + ); + + let text = "matrix:roomid/somewhere:example.org?via=elsewhere.ca"; + assert_eq!( + linkify(text), + r#"matrix:roomid/somewhere:example.org?via=elsewhere.ca"# + ); + + let text = "matrix:roomid/somewhere:example.org/e/event?via=elsewhere.ca"; + assert_eq!( + linkify(text), + r#"matrix:roomid/somewhere:example.org/e/event?via=elsewhere.ca"# + ); + + let text = "matrix:u/alice:example.org?action=chat"; + assert_eq!( + linkify(text), + r#"matrix:u/alice:example.org?action=chat"# + ); + + // Invalid TLDs. + let text = "gnome.invalid"; + assert_eq!(linkify(text), "gnome.invalid"); + + let text = "org.gnome.fractal"; + assert_eq!(linkify(text), "org.gnome.fractal"); + + // `matrix:` that is not a URI scheme. + let text = "My homeserver for matrix: gnome.org"; + assert_eq!( + linkify(text), + r#"My homeserver for matrix: gnome.org"# + ); +}