//! This example shows a naive approach to implementing text search. It works by
//! creating a View with the Key being each word contained within a document.
//! This example uses the Value type to store which field the match came from.
//!
//! While this is a naive approach, this can be used to build a fairly robust
//! search. After retrieving the query results from the view, analysis can be
//! done on the results to rank the matched documents based on how many times a
//! keyword hit, which fields it matched upon, etc.
//!
//! While this approach can be powerful, it pales in comparison to full text
//! search capabilities. The tracking issue for adding full text indexes to
//! BonsaiDb is here: <https://github.com/khonsulabs/bonsaidb/issues/149>.
13
use std::str::Chars;
14
use std::time::SystemTime;
15

            
16
use bonsaidb::core::document::{CollectionDocument, Emit};
17
use bonsaidb::core::schema::{
18
    Collection, CollectionMapReduce, SerializedCollection, SerializedView, View, ViewMapResult,
19
    ViewSchema,
20
};
21
use bonsaidb::local::config::{Builder, StorageConfiguration};
22
use bonsaidb::local::Database;
23
use serde::{Deserialize, Serialize};
24

            
25
42
#[derive(Debug, Serialize, Deserialize, Collection)]
26
#[collection(name = "messages", views = [MessagesByWords])]
27
struct Message {
28
    pub timestamp: SystemTime,
29
    pub subject: String,
30
    pub body: String,
31
}
32

            
33
impl Message {
34
    /// Returns a new message with the current timestamp.
35
3
    pub fn new(subject: impl Into<String>, body: impl Into<String>) -> Self {
36
3
        Self {
37
3
            timestamp: SystemTime::now(),
38
3
            subject: subject.into(),
39
3
            body: body.into(),
40
3
        }
41
3
    }
42
}
43

            
44
20
#[derive(View, ViewSchema, Debug, Clone)]
45
#[view(name = "by-keyword", collection = Message, key = String, value = String)]
46
struct MessagesByWords;
47

            
48
impl CollectionMapReduce for MessagesByWords {
49
3
    fn map<'doc>(
50
3
        &self,
51
3
        document: CollectionDocument<<Self::View as View>::Collection>,
52
3
    ) -> ViewMapResult<'doc, Self::View> {
53
3
        // Emit a key/value mapping for each word found in the subject and body.
54
3
        let subject_words =
55
3
            keywords(&document.contents.subject).map(|word| (word, String::from("subject")));
56
10
        let body_words = keywords(&document.contents.body).map(|word| (word, String::from("body")));
57
3
        subject_words
58
3
            .chain(body_words)
59
13
            .map(|(key, value)| document.header.emit_key_and_value(key, value))
60
3
            .collect()
61
3
    }
62
}
63

            
64
1
fn main() -> Result<(), bonsaidb::core::Error> {
65
1
    let db = Database::open::<Message>(StorageConfiguration::new("keyword-search.bonsaidb"))?;
66

            
67
1
    Message::new("Groceries", "Can you pick up some milk on the way home?").push_into(&db)?;
68
1
    Message::new("Re: Groceries", "2% milk? How are our eggs?").push_into(&db)?;
69
1
    Message::new("Re: Groceries", "Yes. We could use another dozen eggs.").push_into(&db)?;
70

            
71
2
    for result in &MessagesByWords::entries(&db)
72
1
        .with_key("eggs")
73
1
        .query_with_collection_docs()?
74
2
    {
75
2
        println!(
76
2
            "Contained `eggs` in field {} : {:?}",
77
2
            result.value, result.document
78
2
        );
79
2
    }
80

            
81
1
    for message in MessagesByWords::entries(&db)
82
1
        .with_key_prefix("doz")
83
1
        .query_with_collection_docs()?
84
        .documents
85
1
    {
86
1
        println!("Contained a word starting with `doz`: {message:?}");
87
1
    }
88

            
89
1
    Ok(())
90
1
}
91

            
/// Splits `source` into "words", where words are:
///
/// - Contiguous sequences of alphanumeric characters, lowercased.
/// - At least 4 characters long (Avoids "the", "and", etc, but it is *overly*
///   restrictive). Length is measured in characters of `source`, not in UTF-8
///   bytes, so short non-ASCII words such as "été" are excluded just like
///   short ASCII words.
/// - Non-alphanumeric is always considered a word break and excluded from
///   results. This means that "m.d." will never yield any 'words' in this
///   algorithm, because 'm' and 'd' will be considered separate words and
///   excluded for being too short.
///
/// The returned iterator borrows `source` and yields each word as an owned
/// `String`, in the order the words appear.
fn keywords(source: &str) -> impl Iterator<Item = String> + '_ {
    /// Minimum number of characters a word needs in order to be emitted.
    const MIN_WORD_CHARS: usize = 4;

    struct KeywordEmitter<'a> {
        chars: Chars<'a>,
    }

    impl<'a> Iterator for KeywordEmitter<'a> {
        type Item = String;

        fn next(&mut self) -> Option<Self::Item> {
            let mut word = String::new();
            // Tracks the number of source characters in `word`. `word.len()`
            // would count UTF-8 bytes, which over-counts non-ASCII text.
            let mut char_count = 0_usize;
            for ch in &mut self.chars {
                if ch.is_alphanumeric() {
                    char_count += 1;
                    // Lowercasing a single char may produce several chars.
                    word.extend(ch.to_lowercase());
                } else if !word.is_empty() {
                    if char_count >= MIN_WORD_CHARS {
                        return Some(word);
                    }
                    // Too short: discard it and keep scanning for the next word.
                    word.clear();
                    char_count = 0;
                }
            }

            // The input ended without a trailing word break; emit the final
            // word if it is long enough.
            (char_count >= MIN_WORD_CHARS).then_some(word)
        }
    }

    KeywordEmitter {
        chars: source.chars(),
    }
}
132

            
/// Smoke test: runs the whole example and fails if `main` returns an error.
#[test]
fn runs() {
    main().unwrap()
}