Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_regex.cpp
1#include "Teuchos_set.hpp"
2
3#include "Teuchos_regex.hpp"
4
5#include <iostream>
6#include <sstream>
7
8#include "Teuchos_Assert.hpp"
9#include "Teuchos_Parser.hpp"
10#include "Teuchos_vector.hpp"
11#include "Teuchos_string.hpp"
12#include "Teuchos_chartab.hpp"
13#include "Teuchos_Reader.hpp"
14#include "Teuchos_chartab.hpp"
15
16namespace Teuchos {
17namespace regex {
18
19Language make_language() {
20 /* The top produtions were from the "grep.y" YACC grammar in the source
21 code for Plan 9's grep utility, see here:
22https://github.com/wangeguo/plan9/blob/master/sys/src/cmd/grep/grep.y
23 The "set" related productions
24 are from a grammar intended to be used by ProLog to parse Perl's regular
25 expressions, see here:
26http://www.cs.sfu.ca/~cameron/Teaching/384/99-3/regexp-plg.html */
27 Language out;
28 Language::Productions& prods = out.productions;
29 prods.resize(NPRODS);
30 prods[PROD_REGEX]("regex") >> "union";
31 prods[PROD_UNION_DECAY]("union") >> "concat";
32 prods[PROD_UNION]("union") >> "union", "|", "concat"; // union
33 prods[PROD_CONCAT_DECAY]("concat") >> "qualified";
34 prods[PROD_CONCAT]("concat") >> "concat", "qualified"; // concatenation
35 prods[PROD_QUAL_DECAY]("qualified") >> "single";
36 prods[PROD_STAR]("qualified") >> "qualified", "*";
37 prods[PROD_PLUS]("qualified") >> "qualified", "+";
38 prods[PROD_MAYBE]("qualified") >> "qualified", "?";
39 prods[PROD_SINGLE_CHAR]("single") >> "char";
40 prods[PROD_ANY]("single") >> "."; // any
41 prods[PROD_SINGLE_SET]("single") >> "set";
42 prods[PROD_PARENS_UNION]("single") >> "(", "union", ")";
43 prods[PROD_SET_POSITIVE]("set") >> "positive-set";
44 prods[PROD_SET_NEGATIVE]("set") >> "negative-set";
45 prods[PROD_POSITIVE_SET]("positive-set") >> "[", "set-items", "]";
46 prods[PROD_NEGATIVE_SET]("negative-set") >> "[", "^", "set-items", "]";
47 prods[PROD_SET_ITEMS_DECAY]("set-items") >> "set-item";
48 prods[PROD_SET_ITEMS_ADD]("set-items") >> "set-items", "set-item";
49 prods[PROD_SET_ITEM_CHAR]("set-item") >> "char";
50 prods[PROD_SET_ITEM_RANGE]("set-item") >> "range";
51 prods[PROD_RANGE]("range") >> "char", "-", "char";
52 out.tokens.resize(NTOKS);
53 /* either one of the non-meta characters, or anything preceded by the escape slash */
54 out.tokens[TOK_CHAR]("char", "[^\\\\\\.\\[\\]\\(\\)\\|\\-\\^\\*\\+\\?]|\\\\.");
55 out.tokens[TOK_DOT](".", "\\.");
56 out.tokens[TOK_LRANGE]("[", "\\]");
57 out.tokens[TOK_RRANGE]("]", "\\]");
58 out.tokens[TOK_LPAREN]("(", "\\(");
59 out.tokens[TOK_RPAREN](")", "\\)");
60 out.tokens[TOK_UNION]("|", "\\|");
61 out.tokens[TOK_RANGE]("-", "\\-");
62 out.tokens[TOK_NEGATE]("^", "\\^");
63 out.tokens[TOK_STAR]("*", "\\*");
64 out.tokens[TOK_PLUS]("+", "\\+");
65 out.tokens[TOK_MAYBE]("?", "\\?");
66 return out;
67}
68
69/* bootstrap ! This lexer is used to build the ReaderTables that read
70 regular expressions themselves, so it can't depend on that reader ! */
71void make_lexer(FiniteAutomaton& result) {
72 std::string meta_chars_str = ".[]()|-^*+?";
73 std::set<int> all_chars;
74 for (int i = 0; i < NCHARS; ++i) all_chars.insert(i);
75 std::set<int> nonmeta_chars = all_chars;
76 for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
77 int meta_char = at(meta_chars_str, i);
78 std::set<int>::iterator it = nonmeta_chars.find(get_symbol(meta_char));
79 nonmeta_chars.erase(it);
80 }
81 FiniteAutomaton lex_nonmeta;
82 make_set_nfa(lex_nonmeta, NCHARS, nonmeta_chars, TOK_CHAR);
83 FiniteAutomaton lex_slash;
84 make_char_single_nfa(lex_slash, '\\');
85 FiniteAutomaton lex_any;
86 make_set_nfa(lex_any, NCHARS, all_chars);
87 FiniteAutomaton lex_escaped;
88 concat(lex_escaped, lex_slash, lex_any, TOK_CHAR);
89 FiniteAutomaton lex_char;
90 unite(lex_char, lex_nonmeta, lex_escaped);
91 FiniteAutomaton lex_metachars;
92 for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
93 int token = TOK_CHAR + i + 1;
94 if (i) {
95 FiniteAutomaton lex_metachar;
96 make_char_single_nfa(lex_metachar, at(meta_chars_str, i), token);
97 unite(lex_metachars, lex_metachars, lex_metachar);
98 } else {
99 make_char_single_nfa(lex_metachars, at(meta_chars_str, i), token);
100 }
101 }
102 unite(result, lex_metachars, lex_char);
103 make_deterministic(result, result);
104 simplify(result, result);
105}
106
107ReaderTablesPtr ask_reader_tables() {
108 static ReaderTablesPtr ptr;
109 if (ptr.strong_count() == 0) {
110 RCP<ReaderTables> newptr(new ReaderTables());
111 LanguagePtr lang = regex::ask_language();
112 GrammarPtr grammar = make_grammar(*lang);
113 newptr->parser = make_lalr1_parser(grammar);
114 regex::make_lexer(newptr->lexer);
115 newptr->indent_info.is_sensitive = false;
116 newptr->indent_info.indent_token = -1;
117 newptr->indent_info.dedent_token = -1;
118 ptr = newptr;
119 }
120 return ptr;
121}
122
123LanguagePtr ask_language() {
124 static LanguagePtr ptr;
125 if (ptr.strong_count() == 0) {
126 ptr.reset(new Language(make_language()));
127 }
128 return ptr;
129}
130
131void make_dfa(FiniteAutomaton& result, std::string const& name, std::string const& regex, int token) {
132 using std::swap;
133 regex::Reader reader(token);
134 any result_any;
135 try {
136 reader.read_string(result_any, regex, name);
137 } catch (const Teuchos::ParserFail& e) {
138 std::stringstream ss;
139 ss << e.what() << '\n';
140 ss << "error: couldn't build DFA for token \"" << name << "\" regex \"" << regex << "\"\n";
141 ss << "repeating with DebugReader:\n";
142 DebugReader debug_reader(regex::ask_reader_tables(), ss);
143 debug_reader.read_string(result_any, regex, name);
144 throw ParserFail(ss.str());
145 }
146 swap(any_ref_cast<FiniteAutomaton>(result_any), result);
147}
148
149regex::Reader::Reader(int result_token_in):
150 Teuchos::Reader(regex::ask_reader_tables()),
151 result_token(result_token_in) {
152}
153
154void regex::Reader::at_shift(any& result, int token, std::string& text) {
155 if (token != TOK_CHAR) return;
156 if (Teuchos::size(text) == 1) {
157 result = text[0];
158 } else if (Teuchos::size(text) == 2) {
159 TEUCHOS_ASSERT(text[0] == '\\');
160 result = text[1];
161 } else {
162 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
163 "BUG: regex char text is \"" << text << "\"\n");
164 }
165}
166
167void regex::Reader::at_reduce(any& result_any, int production, std::vector<any>& rhs) {
168 using std::swap;
169 switch (production) {
170 case PROD_REGEX: {
171 swap(result_any, at(rhs, 0));
172 FiniteAutomaton& result = any_ref_cast<FiniteAutomaton>(result_any);
173 make_deterministic(result, result);
174 simplify(result, result);
175 return;
176 }
177 case PROD_UNION_DECAY:
178 case PROD_CONCAT_DECAY:
179 case PROD_QUAL_DECAY:
180 case PROD_SET_ITEMS_DECAY:
181 case PROD_SET_ITEM_RANGE: {
182 swap(result_any, at(rhs, 0));
183 return;
184 }
185 case PROD_UNION: {
186 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
187 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
188 FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 2));
189 unite(result, a, b);
190 return;
191 }
192 case PROD_CONCAT: {
193 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
194 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
195 FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 1));
196 concat(result, a, b, result_token);
197 return;
198 }
199 case PROD_STAR: {
200 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
201 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
202 star(result, a, result_token);
203 return;
204 }
205 case PROD_PLUS: {
206 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
207 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
208 plus(result, a, result_token);
209 return;
210 }
211 case PROD_MAYBE: {
212 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
213 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
214 maybe(result, a, result_token);
215 return;
216 }
217 case PROD_SINGLE_CHAR: {
218 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
219 char c = any_cast<char>(at(rhs, 0));
220 make_char_single_nfa(result, c, result_token);
221 return;
222 }
223 case PROD_ANY: {
224 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
225 make_range_nfa(result, NCHARS, 0, NCHARS - 1, result_token);
226 return;
227 }
228 case PROD_SINGLE_SET: {
229 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
230 std::set<char>& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
231 make_char_set_nfa(result, charset, result_token);
232 return;
233 }
234 case PROD_PARENS_UNION: {
235 swap(result_any, at(rhs, 1));
236 return;
237 }
238 case PROD_SET_POSITIVE: {
239 swap(result_any, at(rhs, 0));
240 return;
241 }
242 case PROD_SET_NEGATIVE: {
243 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
244 std::set<char> const& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
245 negate_set(result, charset);
246 return;
247 }
248 case PROD_POSITIVE_SET: {
249 swap(result_any, at(rhs, 1));
250 return;
251 }
252 case PROD_NEGATIVE_SET: {
253 swap(result_any, at(rhs, 2));
254 return;
255 }
256 case PROD_SET_ITEMS_ADD: {
257 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
258 std::set<char>& a = any_ref_cast<std::set<char> >(at(rhs, 0));
259 std::set<char> const& b = any_ref_cast<std::set<char> >(at(rhs, 1));
260 swap(result, a);
261 unite_with(result, b);
262 return;
263 }
264 case PROD_SET_ITEM_CHAR: {
265 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
266 char c = any_cast<char>(at(rhs, 0));
267 result.insert(c);
268 return;
269 }
270 case PROD_RANGE: {
271 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
272 char a = any_cast<char>(at(rhs, 0));
273 char b = any_cast<char>(at(rhs, 2));
274 for (char c = a; c <= b; ++c) {
275 result.insert(c);
276 }
277 return;
278 }
279 }
280 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
281 "BUG: unexpected production " << production << '\n');
282}
283
284} // namespace regex
285} // namespace Teuchos
Declares Teuchos::Parser, ParserFail and make_lalr1_parser.
Declares Teuchos::Reader.
Tries to create LALR(1) parser tables for a given grammar.
#define TEUCHOS_ASSERT(assertion_test)
This macro is throws when an assert fails.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Macro for throwing an exception with breakpointing to ease debugging.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...
RCP< const ReaderTables > ReaderTablesPtr
an RCP to a const ReaderTables
void make_lexer(FiniteAutomaton &result, Language const &language)
construct a lexer for the Language tokens.
RCP< const Language > LanguagePtr
an RCP to a const Language
Parser make_lalr1_parser(GrammarPtr grammar, bool verbose)
Tries to create LALR(1) parser tables for a given grammar.
Productions productions
vector of productions