presage 0.9.1
contextChangeDetector.cpp
Go to the documentation of this file.
1
2/******************************************************
3 * Presage, an extensible predictive text entry system
4 * ---------------------------------------------------
5 *
6 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation, Inc.,
20 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 **********(*)*/
23
24
26#include "../tokenizer/reverseTokenizer.h"
27
28#include <iostream>
29#include <sstream>
30#include <stdlib.h> // for atoi()
31#include <assert.h>
32
// Out-of-class definition of the static class constant
// DEFAULT_SLIDING_WINDOW_SIZE, initialised to 80.
// NOTE(review): presumably the number of characters kept in the sliding
// window when no explicit size is configured -- the statement that would
// assign it is not visible in this listing; confirm against the header.
33const std::string::size_type ContextChangeDetector::DEFAULT_SLIDING_WINDOW_SIZE = 80;
34
36 const std::string tChars,
37 const std::string bChars,
38 const std::string cChars,
39 bool lowercase)
40 : wordChars (wChars),
41 separatorChars (tChars),
42 blankspaceChars(bChars),
43 controlChars (cChars),
44 lowercase_mode (lowercase)
45{
46 // intentionally empty
47}
48
50{
51 // intentionally empty
52}
53
55{
56 if(!str.empty()) {
57 SLIDING_WINDOW_SIZE = atoi(str.c_str());
58 } else {
60 }
61}
62
64{
65 if (str.size() <= SLIDING_WINDOW_SIZE) {
66 // past stream fits in sliding window
67 sliding_window = str;
68 } else {
69 // trim past stream down to sliding window
70 sliding_window = str.substr(str.size() - SLIDING_WINDOW_SIZE);
71 assert(sliding_window.size() == SLIDING_WINDOW_SIZE);
72 }
73}
74
75bool ContextChangeDetector::context_change(const std::string& past_stream) const
76{
77 // Here's how this is going to be implemented... We'll keep a
78 // sliding window on the last few chars seen by presage; the
79 // buffer holding them is the sliding window. We'll search for the
80 // last occurence of sliding_window in past_stream, if any.
81
82 // If sliding_window is not found in past_stream, then it is not
83 // possible to relate the current context to the previously seen
84 // context, hence we assume a context change has occured.
85
86 // If sliding_window is found, then we need to examine the chars
87 // following the sliding window in the past stream. We call this
88 // the remainder. If there are any non-word chars in the
89 // remainder, then a context change has occurred. Else, no context
90 // change occured.
91
92 // The sliding window is never implicitly updated as part of
93 // invoking this method.
94
95 return context_change_helper(sliding_window, past_stream);
96}
97
98
99bool ContextChangeDetector::context_change_helper(const std::string& prev_context, const std::string& curr_context) const
100{
101 bool result = false;
102
103 if (prev_context.empty()) {
104 if (curr_context.empty()) {
105 // both contexts are empty, nothing has happened, no
106 // change happened
107 result = false;
108 } else {
109 // current context changed, previous context is empty,
110 // first change happened
111 result = true;
112 }
113 } else {
114 // find position of previous context in current context
115 // i.e. find index pointing to last char of last occurence of
116 // prev_context in curr_context
117 std::string::size_type ctx_idx = curr_context.rfind(prev_context);
118
119 if (ctx_idx == std::string::npos) {
120 // prev_context could not be found in curr_context, a lot
121 // changed
122 result = true;
123 } else {
124 // found prev_context, examine remainder string.
125 // remainder string is substr(ctx_idx +
126 // prev_context.size()); i.e. substring given by index
127 // returned by rfind (which points at beginning of
128 // prev_context string found in curr_context) plus size of
129 // prev_context: this index points at end of prev_context
130 // substring found in curr_context
131
132 std::string remainder = curr_context.substr(ctx_idx + prev_context.size());
133
134 std::string::size_type idx = remainder.find_last_of(wordChars);
135 if (idx == std::string::npos) {
136 if (remainder.empty()) {
137 result = false;
138 } else {
139 char last_char = curr_context[ctx_idx + prev_context.size() - 1];
140 idx = wordChars.find(last_char);
141 if (idx == std::string::npos) {
142 result = false;
143 } else {
144 result = true;
145 }
146 }
147 } else {
148 if (idx == remainder.size() - 1) {
149 result = false;
150 } else {
151 result = true;
152 }
153 }
154
155
156/*
157 * alternate implementation of the logic in the enclosing else
158 * block. This uses tokenizers, which is not desirable as it makes
159 * tokenizer a dependency of context change detector.
160
161 std::string remainder = curr_context.substr(loc + prev_context.size());
162
163 std::stringstream curr_strstream(curr_context);
164 std::stringstream prev_strstream(prev_context);
165
166 ReverseTokenizer curr_tokenizer(curr_strstream, blankspaceChars, separatorChars);
167 ReverseTokenizer prev_tokenizer(prev_strstream, blankspaceChars, separatorChars);
168
169 std::string prev_token = prev_tokenizer.nextToken();
170 std::string curr_token = curr_tokenizer.nextToken();
171
172 if (curr_token.empty()) {
173 if (prev_token.empty()) {
174 result = false;
175
176 loc = curr_context.find_first_of(wordChars, loc);
177 if (loc == std::string::npos) {
178 result = false;
179 } else {
180 result = true;
181 }
182
183 } else {
184 result = true;
185 }
186
187 } else {
188 loc = curr_token.find(prev_token);
189 if (loc == std::string::npos) {
190 result = true;
191 } else {
192 result = false;
193 }
194 }
195*/
196
197 }
198 }
199
200 return result;
201}
202
/**
 * Returns the part of past_stream that is new with respect to the
 * sliding window.
 *
 *  - If the sliding window is empty, or cannot be found in
 *    past_stream, the whole of past_stream is returned.
 *  - Otherwise the result is the remainder of past_stream after the
 *    last occurrence of the sliding window; when
 *    context_change(past_stream) reports a change, the token obtained
 *    from a ReverseTokenizer over the sliding window is prepended if
 *    it is not empty.
 *
 * @param past_stream  the current context
 * @return the change to be learnt, as described above
 */
203std::string ContextChangeDetector::change(const std::string& past_stream) const
204{
205 const std::string& prev_context = sliding_window; // aliases introduced
206 const std::string& curr_context = past_stream; // purely for readability
207
208 std::string result;
209
210 if (sliding_window.empty()) {
211 result = past_stream;
212 } else {
213 // find position of previous context in current context
214 // i.e. find index of the first char of the last occurrence of
215 // prev_context in curr_context
216 std::string::size_type ctx_idx = curr_context.rfind(prev_context);
217
218 if (ctx_idx == std::string::npos) {
219 // prev_context could not be found in curr_context, a lot
220 // changed
221 result = past_stream;
222 } else {
223 // found prev_context, examine remainder string.
224 // remainder string is substr(ctx_idx +
225 // prev_context.size()); i.e. substring given by index
226 // returned by rfind (which points at beginning of
227 // prev_context string found in curr_context) plus size of
228 // prev_context: this index points just past the end of the
229 // prev_context substring found in curr_context
230
231 result = curr_context.substr(ctx_idx + prev_context.size());
232
233 // handle case where a context change has occurred and
234 // remainder string only contains part of the last token,
235 // i.e.:
236 //
237 // sliding_window = "The quick bro";
238 // past_stream = "The quick brown ";
239 //
240 // In this case, the remainder will only contain "wn ", and
241 // the last token in the sliding window must be prepended
242 // to the change to be learnt
243 //
244 if (context_change(past_stream)) {
245 // prepend partially entered token to change if it
246 // exists, need to look into sliding_window to get
247 // previously partially entered token if it exists
248 std::stringstream sliding_window_stream;
249 sliding_window_stream << get_sliding_window();
// NOTE(review): listing lines 251-253 (the remaining ReverseTokenizer
// constructor arguments and whatever follows them) are missing from
// this extract; restore them from the original source file.
250 ReverseTokenizer rTok(sliding_window_stream,
// presumably nextToken() yields the last token of the sliding window,
// i.e. the partially entered word -- confirm against ReverseTokenizer.
254 std::string first_token = rTok.nextToken();
255 if (!first_token.empty()) {
256 result = first_token + result;
257 }
258 }
259 }
260 }
261
262 return result;
263}
264
266{
267 return sliding_window;
268}
void update_sliding_window(const std::string &str)
void set_sliding_window_size(const std::string &str)
std::string get_sliding_window() const
static const std::string::size_type DEFAULT_SLIDING_WINDOW_SIZE
const std::string separatorChars
bool context_change(const std::string &past_stream) const
const std::string wordChars
ContextChangeDetector(const std::string, const std::string, const std::string, const std::string, bool)
const std::string blankspaceChars
std::string change(const std::string &past_stream) const
std::string::size_type SLIDING_WINDOW_SIZE
bool context_change_helper(const std::string &str1, const std::string &str2) const
virtual std::string nextToken()
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81