|
|
|
@ -2,24 +2,46 @@ use ruma::RoomId; |
|
|
|
|
|
|
|
|
|
|
|
use crate::{database::KeyValueDatabase, service, services, utils, Result}; |
|
|
|
use crate::{database::KeyValueDatabase, service, services, utils, Result}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Splits a string into tokens used as keys in the search inverted index.
///
/// This may be used to tokenize both message bodies (for indexing) and search
/// queries (for querying), so that both sides produce identical tokens.
///
/// The input is split on every non-alphanumeric character. Empty pieces and
/// words longer than 50 bytes are dropped, and each remaining word is
/// lowercased. The returned iterator is lazy and borrows `body`.
fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ {
    // `str::len` counts UTF-8 bytes, so this limit is in bytes, not characters.
    const MAX_WORD_LEN: usize = 50;

    body.split_terminator(|c: char| !c.is_alphanumeric())
        .filter(|s| !s.is_empty())
        .filter(|word| word.len() <= MAX_WORD_LEN)
        .map(str::to_lowercase)
}
|
|
|
|
|
|
|
|
|
|
|
impl service::rooms::search::Data for KeyValueDatabase { |
|
|
|
impl service::rooms::search::Data for KeyValueDatabase { |
|
|
|
fn index_pdu<'a>(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> { |
|
|
|
fn index_pdu<'a>(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> { |
|
|
|
let mut batch = message_body |
|
|
|
let mut batch = tokenize(message_body).map(|word| { |
|
|
|
.split_terminator(|c: char| !c.is_alphanumeric()) |
|
|
|
let mut key = shortroomid.to_be_bytes().to_vec(); |
|
|
|
.filter(|s| !s.is_empty()) |
|
|
|
key.extend_from_slice(word.as_bytes()); |
|
|
|
.filter(|word| word.len() <= 50) |
|
|
|
key.push(0xff); |
|
|
|
.map(str::to_lowercase) |
|
|
|
key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
|
|
|
|
.map(|word| { |
|
|
|
(key, Vec::new()) |
|
|
|
let mut key = shortroomid.to_be_bytes().to_vec(); |
|
|
|
}); |
|
|
|
key.extend_from_slice(word.as_bytes()); |
|
|
|
|
|
|
|
key.push(0xff); |
|
|
|
|
|
|
|
key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
|
|
|
|
|
|
|
|
(key, Vec::new()) |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.tokenids.insert_batch(&mut batch) |
|
|
|
self.tokenids.insert_batch(&mut batch) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fn deindex_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> { |
|
|
|
|
|
|
|
let batch = tokenize(message_body).map(|word| { |
|
|
|
|
|
|
|
let mut key = shortroomid.to_be_bytes().to_vec(); |
|
|
|
|
|
|
|
key.extend_from_slice(word.as_bytes()); |
|
|
|
|
|
|
|
key.push(0xFF); |
|
|
|
|
|
|
|
key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
|
|
|
|
|
|
|
|
key |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for token in batch { |
|
|
|
|
|
|
|
self.tokenids.remove(&token)?; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Ok(()) |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn search_pdus<'a>( |
|
|
|
fn search_pdus<'a>( |
|
|
|
&'a self, |
|
|
|
&'a self, |
|
|
|
room_id: &RoomId, |
|
|
|
room_id: &RoomId, |
|
|
|
@ -33,11 +55,7 @@ impl service::rooms::search::Data for KeyValueDatabase { |
|
|
|
.to_be_bytes() |
|
|
|
.to_be_bytes() |
|
|
|
.to_vec(); |
|
|
|
.to_vec(); |
|
|
|
|
|
|
|
|
|
|
|
let words: Vec<_> = search_string |
|
|
|
let words: Vec<_> = tokenize(search_string).collect(); |
|
|
|
.split_terminator(|c: char| !c.is_alphanumeric()) |
|
|
|
|
|
|
|
.filter(|s| !s.is_empty()) |
|
|
|
|
|
|
|
.map(str::to_lowercase) |
|
|
|
|
|
|
|
.collect(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let iterators = words.clone().into_iter().map(move |word| { |
|
|
|
let iterators = words.clone().into_iter().map(move |word| { |
|
|
|
let mut prefix2 = prefix.clone(); |
|
|
|
let mut prefix2 = prefix.clone(); |
|
|
|
|