//! This example shows a naive approach to implementing text search. It works by
//! creating a View with the Key being each word contained within a document.
//! This example uses the Value type to store which field the match came from.
//!
//! While this is a naive approach, this can be used to build a fairly robust
//! search. After retrieving the query results from the view, analysis can be
//! done on the results to rank the matched documents based on how many times a
//! keyword hit, which fields it matched upon, etc.
//!
//! While this approach can be powerful, it pales in comparison to full text
//! search capabilities. The tracking issue for adding full text indexes to
//! BonsaiDb is here: <https://github.com/khonsulabs/bonsaidb/issues/149>.
13
use std::{str::Chars, time::SystemTime};
14

            
15
use bonsaidb::{
16
    core::{
17
        connection::Connection,
18
        document::{CollectionDocument, Emit},
19
        schema::{Collection, CollectionViewSchema, SerializedCollection, View, ViewMapResult},
20
    },
21
    local::{
22
        config::{Builder, StorageConfiguration},
23
        Database,
24
    },
25
};
26
use serde::{Deserialize, Serialize};
27

            
28
42
#[derive(Debug, Serialize, Deserialize, Collection)]
29
#[collection(name = "messages", views = [MessagesByWords])]
30
struct Message {
31
    pub timestamp: SystemTime,
32
    pub subject: String,
33
    pub body: String,
34
}
35

            
36
impl Message {
37
    /// Returns a new message with the current timestamp.
38
3
    pub fn new(subject: impl Into<String>, body: impl Into<String>) -> Self {
39
3
        Self {
40
3
            timestamp: SystemTime::now(),
41
3
            subject: subject.into(),
42
3
            body: body.into(),
43
3
        }
44
3
    }
45
}
46

            
47
20
#[derive(View, Debug, Clone)]
48
#[view(name = "by-keyword", collection = Message, key = String, value = String)]
49
struct MessagesByWords;
50

            
51
impl CollectionViewSchema for MessagesByWords {
52
    type View = Self;
53

            
54
3
    fn map(
55
3
        &self,
56
3
        document: CollectionDocument<<Self::View as View>::Collection>,
57
3
    ) -> ViewMapResult<Self::View> {
58
3
        // Emit a key/value mapping for each word found in the subject and body.
59
3
        let subject_words =
60
3
            keywords(&document.contents.subject).map(|word| (word, String::from("subject")));
61
10
        let body_words = keywords(&document.contents.body).map(|word| (word, String::from("body")));
62
3
        subject_words
63
3
            .chain(body_words)
64
13
            .map(|(key, value)| document.header.emit_key_and_value(key, value))
65
3
            .collect()
66
3
    }
67
}
68

            
69
1
fn main() -> Result<(), bonsaidb::core::Error> {
70
1
    let db = Database::open::<Message>(StorageConfiguration::new("keyword-search.bonsaidb"))?;
71

            
72
1
    Message::new("Groceries", "Can you pick up some milk on the way home?").push_into(&db)?;
73
1
    Message::new("Re: Groceries", "2% milk? How are our eggs?").push_into(&db)?;
74
1
    Message::new("Re: Groceries", "Yes. We could use another dozen eggs.").push_into(&db)?;
75

            
76
2
    for result in &db
77
1
        .view::<MessagesByWords>()
78
1
        .with_key("eggs")
79
1
        .query_with_collection_docs()?
80
2
    {
81
2
        println!(
82
2
            "Contained `eggs` in field {} : {:?}",
83
2
            result.value, result.document
84
2
        );
85
2
    }
86

            
87
1
    for message in db
88
1
        .view::<MessagesByWords>()
89
1
        .with_key_prefix(String::from("doz"))
90
1
        .query_with_collection_docs()?
91
        .documents
92
1
    {
93
1
        println!("Contained a word starting with `doz`: {message:?}");
94
1
    }
95

            
96
1
    Ok(())
97
1
}
98

            
99
/// Splits `source` into "words", where words are:
100
///
101
/// - Contiguous sequences of alphanumeric characters
102
/// - At least 4 characters long (Avoids "the", "and", etc, but it is *overly*
103
///   restrictive).
104
/// - Non-alphanumeric is always considered a word break and excluded from
105
///   results. This means that "m.d." will never yield any 'words' in this
106
///   algorithm, because 'm' and 'd' will be considered separate words and
107
///   excluded for being too short.
108
6
fn keywords(source: &str) -> impl Iterator<Item = String> + '_ {
109
6
    struct KeywordEmitter<'a> {
110
6
        chars: Chars<'a>,
111
6
    }
112
6

            
113
6
    impl<'a> Iterator for KeywordEmitter<'a> {
114
6
        type Item = String;
115
6

            
116
19
        fn next(&mut self) -> Option<Self::Item> {
117
19
            let mut word = String::new();
118
149
            for ch in &mut self.chars {
119
140
                if ch.is_alphanumeric() {
120
110
                    for ch in ch.to_lowercase() {
121
110
                        word.push(ch);
122
110
                    }
123
30
                } else if !word.is_empty() {
124
25
                    if word.len() > 3 {
125
10
                        return Some(word);
126
15
                    }
127
15
                    word.clear();
128
6
                }
129
6
            }
130
6

            
131
9
            (word.len() > 3).then(|| word)
132
19
        }
133
6
    }
134
6

            
135
6
    KeywordEmitter {
136
6
        chars: source.chars(),
137
6
    }
138
6
}
139

            
140
1
/// Smoke test: the example's `main` must complete without returning an error.
#[test]
fn runs() {
    let outcome = main();
    outcome.unwrap()
}