I am trying to escape a string in quotation marks using boost::spirit::karma. This works fine if it's just a string. However, for a string in a boost::variant in a std::vector, it does not. Just printing the string does work however, I do not quite understand why.
Line (1) works fine, but doesn't do what I want. Line (2) should do it, but doesn't.
#include <iostream>
#include <string>
#include <boost/variant.hpp>
#include <boost/spirit/include/karma.hpp>
namespace karma = boost::spirit::karma;
typedef std::vector<boost::variant<int, std::string>> ParameterList;
typedef boost::variant<int, std::string, ParameterList> Parameter;
// Demonstrates the problem from the question: rule (1) prints list items
// without quotes, rule (2) is the quoted variant that does not work.
// The return type is spelled out because implicit int is not valid C++.
int main()
{
    using karma::int_;
    using boost::spirit::ascii::string;
    using karma::eol;
    using karma::lit;
    std::string generated;
    std::back_insert_iterator<std::string> sink(generated);
    // (1)
    karma::rule<std::back_insert_iterator<std::string>, ParameterList()> parameterListRule = (int_ | string) % lit(", "); // This works!
    // (2)
    //karma::rule<std::back_insert_iterator<std::string>, ParameterList()> parameterListRule = (int_ | (lit('"') << string << lit('"'))) % lit(", "); // This does not work
    karma::rule<std::back_insert_iterator<std::string>, Parameter()> parameterRule = (int_ | (lit('"') << string << lit('"')) | parameterListRule) << eol; // This does work, even though it also escapes the string in a pair of quotation marks
    karma::generate(sink, parameterRule, 1); // Works
    karma::generate(sink, parameterRule, "foo"); // Works
    karma::generate(sink, parameterRule, Parameter(ParameterList{1, "foo"})); // Only works using rule (1), not with (2)
    std::cout << generated;
}
Edited In case recursion was not the goal, here's an edited version that solves the issue and the quote escaping: Live on Coliru (or just source here)
Hmm. It looks like you might have been after a recursive attribute/rule:
typedef boost::make_recursive_variant<int, std::string, std::vector<boost::recursive_variant_> >::type Parameter;
Just in that case, here's a simple approach to generating that:
gen = int_ | string | gen % ", ";
Now, your title suggests that strings containing double-quotes should escape these. I suggest
str = '"' << *('\\' << char_('"') | char_) << '"';
gen = int_ | str | gen % ", ";
Now the following test cases
for (Parameter p : Parameters {
1,
"foo",
Parameters { 1, "foo" },
Parameters { 1, "escape: \"foo\"", Parameters { "2", "bar" } }
})
{
std::cout << karma::format(gen, p) << '\n';
}
result in:
1
"foo"
1, "foo"
1, "escape: \"foo\"", "2", "bar"
If recursion is really a feature, you'd want to see the grouping of nested Parameter lists:
gen = int_ | str | '{' << gen % ", " << '}';
Now prints
1
"foo"
{1, "foo"}
{1, "escape: \"foo\"", {"2", "bar"}}
Full sample program:
#include <boost/variant.hpp>
#include <boost/spirit/include/karma.hpp>
namespace karma = boost::spirit::karma;
typedef boost::make_recursive_variant<int, std::string, std::vector<boost::recursive_variant_> >::type Parameter;
typedef std::vector<Parameter> Parameters;
int main()
{
typedef boost::spirit::ostream_iterator It;
karma::rule<It, Parameter()> gen;
karma::rule<It, std::string()> str;
str = '"' << *('\\' << karma::char_('"') | karma::char_) << '"';
gen = (karma::int_ | str | '{' << gen % ", " << '}');
for (Parameter p : Parameters {
1,
"foo",
Parameters { 1, "foo" },
Parameters { 1, "escape: \"foo\"", Parameters { "2", "bar" } }
})
{
std::cout << karma::format(gen, p) << '\n';
}
}
If you iterate your data types, you should iterate your rules.
#include <iostream>
#include <string>
#include <boost/variant.hpp>
#include <boost/spirit/include/karma.hpp>
namespace karma = boost::spirit::karma;
typedef boost::variant<int, std::string> Item;
typedef std::vector<Item> ParameterList;
typedef boost::variant<int, std::string, ParameterList> Parameter;
// Generates three parameters (an int, a string, a list) into one string and
// prints it. The list items get their own rule, mirroring the nesting of the
// data types.
int main()
{
    using karma::int_;
    using boost::spirit::ascii::string;
    using karma::eol;
    using karma::lit;

    using Sink = std::back_insert_iterator<std::string>;

    std::string generated;
    Sink sink(generated);

    // One list element: a bare int or a string in double quotes.
    karma::rule<Sink, Item()> itemRule =
        int_ | (lit('"') << string << lit('"'));

    // Elements separated by ", ".
    karma::rule<Sink, ParameterList()> parameterListRule =
        itemRule % lit(", ");

    // A top-level parameter followed by an end of line.
    karma::rule<Sink, Parameter()> parameterRule =
        (int_ | (lit('"') << string << lit('"')) | parameterListRule) << eol;

    karma::generate(sink, parameterRule, 1);
    karma::generate(sink, parameterRule, "foo");
    karma::generate(sink, parameterRule, Parameter(ParameterList {1, "foo"}));

    std::cout << generated;
    return 0;
}
#include <iostream>
#include <boost/spirit/include/karma.hpp>
#include <boost/spirit/include/karma_right_alignment.hpp>
using namespace boost;
void foo(char* buffer, uint32_t lhOid) {
boost::spirit::karma::generate(buffer, boost::spirit::right_align(20)[boost::spirit::karma::int_], lhOid);
*buffer = '\0';
}
// Formats 1234 into a local buffer and writes the whole buffer (all 21 bytes,
// terminator included) to standard output.
int main() {
    char arr[21];

    foo(arr, 1234);

    std::cout.write(arr, sizeof arr);
    std::cout << std::endl;
    return 0;
}
Related
I am toying with Boost.Spirit. As part of a larger work I am trying to construct a grammar for parsing C/C++ style string literals. I encountered a problem:
How do I create a sub-grammar that appends a std::string() result to the calling grammar's std::string() attribute (instead of just a char)?
Here is my code, which is working so far. (Actually I already got much more than that, including stuff like '\n' etc., but I cut it down to the essentials.)
#define BOOST_SPIRIT_UNICODE
#include <string>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
using namespace boost;
using namespace boost::spirit;
using namespace boost::spirit::qi;
// Question's sub-grammar for an escaped code point: a backslash followed by
// 'u' plus exactly four hex digits, or 'U' plus exactly eight hex digits.
// It exposes a single char, which is the limitation the question is about.
template < typename Iterator >
struct EscapedUnicode : grammar< Iterator, char() > // <-- should be std::string
{
EscapedUnicode() : EscapedUnicode::base_type( escaped_unicode )
{
// '>' is the expectation operator: once the backslash has matched, one of
// the two alternatives has to follow.
escaped_unicode %= "\\" > ( ( "u" >> uint_parser< char, 16, 4, 4 >() )
| ( "U" >> uint_parser< char, 16, 8, 8 >() ) );
}
rule< Iterator, char() > escaped_unicode; // <-- should be std::string
};
// Question's grammar for a double-quoted string literal; the characters
// between the quotes are collected into a std::string.
template < typename Iterator >
struct QuotedString : grammar< Iterator, std::string() >
{
QuotedString() : QuotedString::base_type( quoted_string )
{
// Each element is either an escaped code point or any character other than
// a double quote or an end of line.
quoted_string %= '"' >> *( escaped_unicode | ( char_ - ( '"' | eol ) ) ) >> '"';
}
// Sub-grammar handling the \u / \U escapes.
EscapedUnicode< Iterator > escaped_unicode;
rule< Iterator, std::string() > quoted_string;
};
// Driver from the question: parses one quoted literal and prints the result.
int main()
{
// NOTE(review): "\u0041" is a universal-character-name, so the compiler
// already turns it into 'A'; the parser never sees a backslash escape here.
std::string input = "\"foo\u0041\"";
typedef std::string::const_iterator iterator_type;
QuotedString< iterator_type > qs;
std::string result;
// The success flag is stored in r but not inspected afterwards.
bool r = parse( input.cbegin(), input.cend(), qs, result );
std::cout << result << std::endl;
}
This prints fooA -- the QuotedString grammar calls the EscapedUnicode grammar, which results in a char being added to the std::string attribute of QuotedString (the A, 0x41).
But of course I would need to generate a sequence of chars (bytes) for anything beyond 0x7f. EscapedUnicode would need to produce a std::string, which would have to be appended to the string generated by QuotedString.
And that is where I've met a roadblock. I do not understand the things Boost.Spirit does in concert with Boost.Phoenix, and any attempts I have made resulted in lengthy and pretty much undecipherable template-related compiler errors.
So, how can I do this? The answer need not actually do the proper Unicode conversion; it's the std::string issue I need a solution for.
A few points applied:
please do not blanket using namespace in relation to highly generic code. ADL will ruin your day unless you control it
Operator %= is auto-rule assignment, meaning that automatic attribute propagation will be forced even in the presence of semantic actions. You don't want that because the attribute exposed by uint_parser will not be (correctly) automatically propagated if you want to encode into multi-byte string representation.
The input string
std::string input = "\"foo\u0041\"";
needed to be
std::string input = "\"foo\\u0041\"";
otherwise the compiler did the escape handling before the parser even runs :)
Here come the specific tricks for the meat of the task:
You will want to change the rule's declared attribute to something that Spirit will automatically "flatten" in simple sequences. E.g.
quoted_string = '"' >> *(escaped_unicode | (qi::char_ - ('"' | qi::eol))) >> '"';
Will not append because the first branch of the alternate results in a sequence of char, and the second in a single char. The following spelling of the equivalent:
quoted_string = '"' >> *(escaped_unicode | +(qi::char_ - ('"' | qi::eol | "\\u" | "\\U"))) >> '"';
subtly triggers the appending heuristic in Spirit, so we can achieve what we want without involving Semantic Actions.
The rest is straight-forward:
implement the actual encoding with a Phoenix function object:
// Phoenix-callable functor that appends a textual rendering of a code point
// to the container `a`, e.g. [0x0041] for a 16-bit value.
struct encode_f {
    // Result-type protocol for Phoenix: the call returns nothing.
    template <typename...> struct result { using type = void; };

    template <typename V, typename CP> void operator()(V& a, CP codepoint) const {
        // TODO implement desired encoding (e.g. UTF8)
        bio::stream<bio::back_insert_device<V> > os(a);
        // The "0x" prefix is written literally: with std::showbase the prefix
        // counts towards the field width, so the zero padding would be lost
        // (16 bit) or end up in front of the prefix (32 bit).
        os << "[0x" << std::hex << std::setw(std::numeric_limits<CP>::digits/4) << std::setfill('0') << codepoint << "]";
    }
};
boost::phoenix::function<encode_f> encode;
This you can then use like:
escaped_unicode = '\\' > ( ("u" >> uint_parser<uint16_t, 16, 4, 4>() [ encode(_val, _1) ])
| ("U" >> uint_parser<uint32_t, 16, 8, 8>() [ encode(_val, _1) ]) );
Because you mentioned you don't care about the specific encoding, I elected to encode the raw codepoint in 16bit or 32bit hex representation like [0x0041]. I pragmatically used Boost Iostreams which is capable of directly writing into the attribute's container type
Use BOOST_SPIRIT_DEBUG* macros
Live On Coliru
//#define BOOST_SPIRIT_UNICODE
//#define BOOST_SPIRIT_DEBUG
#include <string>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
// for demo re-encoding
#include <boost/iostreams/device/back_inserter.hpp>
#include <boost/iostreams/stream.hpp>
#include <iomanip>
namespace qi = boost::spirit::qi;
namespace bio = boost::iostreams;
namespace phx = boost::phoenix;
// Parses "\uXXXX" (four hex digits) or "\UXXXXXXXX" (eight hex digits) and
// appends a textual rendering of the code point to the rule's attribute.
template <typename Iterator, typename Attr = std::vector<char> > // or std::string for that matter
struct EscapedUnicode : qi::grammar<Iterator, Attr()>
{
EscapedUnicode() : EscapedUnicode::base_type(escaped_unicode)
{
using namespace qi;
// Plain '=' (not '%=') is used, so the attribute is filled by the semantic
// actions rather than by automatic propagation.
escaped_unicode = '\\' > ( ("u" >> uint_parser<uint16_t, 16, 4, 4>() [ encode(_val, _1) ])
| ("U" >> uint_parser<uint32_t, 16, 8, 8>() [ encode(_val, _1) ]) );
BOOST_SPIRIT_DEBUG_NODES((escaped_unicode))
}
// Phoenix-callable functor that writes the code point into container `a`.
struct encode_f {
// Result-type protocol for Phoenix: the call returns nothing.
template <typename...> struct result { using type = void; };
template <typename V, typename CP> void operator()(V& a, CP codepoint) const {
// TODO implement desired encoding (e.g. UTF8)
// Stream whose output is appended to `a`.
bio::stream<bio::back_insert_device<V> > os(a);
// Zero-padded hex, one digit per four bits of CP, e.g. [0x0041].
os << "[0x" << std::hex << std::setw(std::numeric_limits<CP>::digits/4) << std::setfill('0') << codepoint << "]";
}
};
// Lazy wrapper that makes encode_f usable inside semantic actions.
boost::phoenix::function<encode_f> encode;
qi::rule<Iterator, Attr()> escaped_unicode;
};
// Grammar for a double-quoted string literal whose \u / \U escapes are
// re-encoded by EscapedUnicode; the result is exposed as a std::string.
template <typename Iterator>
struct QuotedString : qi::grammar<Iterator, std::string()>
{
QuotedString() : QuotedString::base_type(start)
{
// start only converts quoted_string's std::vector<char> into a std::string.
start = quoted_string;
// Both alternatives yield a sequence of characters (the second one through
// the '+'), so their results are appended to the same container.
quoted_string = '"' >> *(escaped_unicode | +(qi::char_ - ('"' | qi::eol | "\\u" | "\\U"))) >> '"';
BOOST_SPIRIT_DEBUG_NODES((start)(quoted_string))
}
EscapedUnicode<Iterator> escaped_unicode;
qi::rule<Iterator, std::string()> start;
qi::rule<Iterator, std::vector<char>()> quoted_string;
};
// Parses a literal containing one \u and one \U escape and prints the parse
// status together with the resulting string.
int main() {
    using iterator_type = std::string::const_iterator;

    // The backslashes are doubled so that the escapes reach the parser.
    const std::string input = "\"foo\\u0041\\U00000041\"";

    QuotedString<iterator_type> qs;
    std::string result;
    const bool r = parse( input.cbegin(), input.cend(), qs, result );

    std::cout << std::boolalpha << r << ": '" << result << "'\n";
}
Prints:
true: 'foo[0x0041][0x00000041]'
Is it possible to unzip previously zipped vectors using the C++ Range-v3 library? I would expect it to behave similarly to Haskell's unzip function or Python's zip(*list).
It would be convenient, for instance, when sorting a vector by values of another vector:
using namespace ranges;
std::vector<std::string> names {"john", "bob", "alice"};
std::vector<int> ages {32, 19, 35};
// zip names and ages
auto zipped = view::zip(names, ages);
// sort the zip by age
sort(zipped, [](auto &&a, auto &&b) {
return std::get<1>(a) < std::get<1>(b);
});
// put the sorted names back into the original vector
std::tie(names, std::ignore) = unzip(zipped);
When passed container arguments, view::zip in range-v3 creates a view consisting of tuples of references to the original elements. Passing the zipped view to sort sorts the elements in place. I.e., this program:
#include <vector>
#include <string>
#include <iostream>
#include <range/v3/algorithm.hpp>
#include <range/v3/view.hpp>
using namespace ranges;
// Function object returning the N-th element of a tuple-like value via
// std::get, preserving the value category of its argument.
template <std::size_t N>
struct get_n {
    template <typename Tuple>
    auto operator()(Tuple&& tuple) const ->
        decltype(std::get<N>(std::forward<Tuple>(tuple)))
    {
        return std::get<N>(std::forward<Tuple>(tuple));
    }
};
namespace ranges {
template <class T, class U>
std::ostream& operator << (std::ostream& os, common_pair<T, U> const& p) {
return os << '(' << p.first << ", " << p.second << ')';
}
}
// Sorts two parallel vectors by age through a zip view and prints the
// vectors and the view before and after the sort.
int main() {
    std::vector<std::string> names {"john", "bob", "alice"};
    std::vector<int> ages {32, 19, 35};

    auto zipped = view::zip(names, ages);

    // Prints both vectors and the zipped view; the heading opens the first line.
    const auto report = [&](const char* heading) {
        std::cout << heading << view::all(names) << '\n';
        std::cout << " Ages: " << view::all(ages) << '\n';
        std::cout << " Zipped: " << zipped << '\n';
    };

    report("Before: Names: ");
    // Orders the zipped elements by their second component (the age).
    sort(zipped, less{}, get_n<1>{});
    report(" After: Names: ");
}
Outputs:
Before: Names: [john,bob,alice]
Ages: [32,19,35]
Zipped: [(john, 32),(bob, 19),(alice, 35)]
After: Names: [bob,john,alice]
Ages: [19,32,35]
Zipped: [(bob, 19),(john, 32),(alice, 35)]
Live Example on Coliru.
I am trying to use the letter character class from unicode i.e. \p{L} with Boost Spirit but I have no luck so far. Below is an example where I am trying to use (on line 30) the \p{L} character class. When I replace line 30 with line 29 it works but that is not the intended use as I need any letter from Unicode in my example.
My use case is for UTF8 only. At the end of the day, what I am trying to do here is subtract a Unicode range from all Unicode letters when using the boost-spirit lexer.
PS
Of course, my example is trimmed down and may not make a lot of sense as a use case but I hope you get the idea.
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
using namespace boost;
using namespace boost::spirit;
using namespace std;
using namespace std::chrono;
// Returns the lexer's named pattern macros as (name, regex) pairs, in the
// order in which they have to be registered: later entries refer to earlier
// ones through {NAME}.
std::vector<std::pair<std::string, std::string> > getTokenMacros() {
    std::vector<std::pair<std::string, std::string> > macros = {
        { "JAPANESE_HIRAGANA", "[\u3041-\u3096]" },
        { "JAPANESE_HIRAGANA1", "[\u3099-\u309E]" },
        { "ASIAN_NWS", "{JAPANESE_HIRAGANA}|"
                       "{JAPANESE_HIRAGANA1}" },
        { "ASIAN_NWS_WORD", "{ASIAN_NWS}*" },
        //{ "NON_ASIAN_LETTER", "[A-Za-z0-9]" },
        { "NON_ASIAN_LETTER", "[\\p{L}-[{ASIAN_NWS}]]" },
        { "WORD", "{NON_ASIAN_LETTER}+" },
        { "ANY", "." },
    };
    return macros;
}
// Phoenix-compatible functor returning the distance between two iterators.
struct distance_func {
// Result type: the difference type of the first iterator.
template<typename Iterator1, typename Iterator2>
struct result: boost::iterator_difference<Iterator1> {
};
template<typename Iterator1, typename Iterator2>
typename result<Iterator1, Iterator2>::type operator()(Iterator1& begin,
Iterator2& end) const {
// NOTE(review): unqualified call; which distance() is selected depends on
// the using-directives in effect and on ADL - confirm.
return distance(begin, end);
}
};
// Lazy wrapper so the functor can be used inside Phoenix expressions.
// NOTE(review): not referenced anywhere else in this trimmed-down example.
boost::phoenix::function<distance_func> const distance_fctor = distance_func();
// Lexer with three token definitions (Asian word, word, any single
// character), each built from a named pattern macro.
template<typename Lexer>
struct word_count_tokens: lex::lexer<Lexer> {
    word_count_tokens()
        : asianNwsWord("{ASIAN_NWS_WORD}", lex::min_token_id + 110)
        , word("{WORD}", lex::min_token_id + 170)
        , any("{ANY}", lex::min_token_id + 3000)
    {
        // Register every named macro before the token definitions that refer
        // to them are attached to the lexer.
        for (const auto& macro : getTokenMacros()) {
            this->self.add_pattern(macro.first, macro.second);
        }

        this->self = asianNwsWord | word | any;
    }

    lex::token_def<> asianNwsWord;
    lex::token_def<> word;
    lex::token_def<> any;
};
// Tokenizes a fixed sample string, prints id and value of every token and
// reports where lexing stopped if it did not reach the end of the input.
int main(int argc, char* argv[]) {
typedef lex::lexertl::token<string::iterator> token_type;
typedef lex::lexertl::actor_lexer<token_type> lexer_type;
word_count_tokens<lexer_type> word_count_lexer;
// read in the file into memory
// NOTE(review): sampleFile is opened but never read; the input below is a
// hard-coded string.
ifstream sampleFile("/home/dan/Documents/wikiSample.txt");
string str = "abc efg ぁあ";
string::iterator first = str.begin();
string::iterator last = str.end();
lexer_type::iterator_type iter = word_count_lexer.begin(first, last);
lexer_type::iterator_type end = word_count_lexer.end();
typedef boost::iterator_range<string::iterator> iterator_range;
vector<iterator_range> parsed_tokens;
// Consume tokens until the input is exhausted or an invalid token shows up.
while (iter != end && token_is_valid(*iter)) {
// Print the token id relative to lex::min_token_id, then the matched text.
cout << (iter->id() - lex::min_token_id) << " " << iter->value()
<< endl;
const iterator_range range = get<iterator_range>(iter->value());
parsed_tokens.push_back(range);
++iter;
}
// Stopping before the end iterator means an invalid token was encountered.
if (iter != end) {
string rest(first, last);
cout << endl << "!!!!!!!!!" << endl << "Lexical analysis failed\n"
<< "stopped at: \"" << rest << "\"" << endl;
// Numeric value of the first character of the remaining input.
cout << "#" << (int) rest.at(0) << "#" << endl;
}
return 0;
}
I have a boost spirit parser which works very well with boost 1.46.1, and which doesn't work with boost 1.54.
This parser extracts information from the following sentence, which is a variable initialization in a DSEL: "Position #start=0;0;0".
Informations extracted from this sentence are stored in a structure :
Type of the variable will be stored (here position) ;
Name of variable (here start) ;
The "#" means that the variable is "static" ;
The value of the variable (0;0;0).
The code to extract this information is as follows:
#include <iostream>
#include <boost/spirit/include/classic.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/qi_char_class.hpp>
using namespace std;
using namespace boost::spirit;
// Result of parsing one variable initialization such as "Position #toto=1;2;2".
struct VariableInitialization
{
std::string m_type; // type keyword of the variable
bool m_is_static; // value produced by the IsStatic symbol table
std::string m_name; // identifier of the variable
std::string m_value; // text following the '='
};
// Adapt the struct as a Fusion sequence so the grammar can fill its members
// in declaration order.
BOOST_FUSION_ADAPT_STRUCT(
VariableInitialization,
(std::string, m_type)
(bool, m_is_static)
(std::string, m_name)
(std::string, m_value)
)
// Grammar for "<type> <static marker><identifier>=<value>", skipping ASCII
// whitespace between the elements.
template <typename Iterator>
struct VariableInitializationParser : qi::grammar<Iterator, VariableInitialization(), ascii::space_type> {
/*!
* IsStatic is a mapping between a char and a boolean
* NOTE(review): both entries below use the same key "#"; confirm the key that
* was intended for the non-static case.
*/
struct IsStatic_ : qi::symbols<char, bool> {
IsStatic_()
{
add("#", false)("#", true);
}
}IsStatic;
VariableInitializationParser() :
VariableInitializationParser::base_type(start) {
using qi::lit;
using ascii::char_;
using qi::_val;
using qi::_1;
/*!
* For now, type is one of the following literals :
* NOTE: lit() exposes no attribute, so nothing is stored into the
* std::string attribute of this rule.
*/
var_type %= lit("Position")|lit("String")|lit("Numeric")|lit("Integer")|lit("Trajectory");
/*!
* identifier is how a variable can be named. The name of a variable is an alpha (a-zA-Z) or an _, followed
* by any alphanumeric (a-zA-Z0-9) or a _. The following are correct :
* _toto _T5ot_To t1oTo ...
* The following are incorrect
* 12toto -tiotp ...
*/
identifier %= ((ascii::alpha|char_('_')) >> *(ascii::alnum|char_('_')));
/*!
* var value can be anything because it's parsed by someone else.
*/
var_value %= qi::lexeme[*(char_)];
start = var_type >> IsStatic >> identifier >> '=' >> var_value;
}
qi::rule<Iterator, std::string(), ascii::space_type> var_type;
qi::rule<Iterator, std::string(), ascii::space_type> identifier;
qi::rule<Iterator, std::string(), ascii::space_type> var_value;
qi::rule<Iterator, VariableInitialization(), ascii::space_type> start;
};
// Parses one sample initialization and prints the extracted fields.
// Returns 0 on success and 1 when the input does not match the grammar.
int main()
{
    VariableInitialization variable;
    std::string input = "Position #toto=1;2;2";
    std::string::const_iterator iter = input.begin();
    std::string::const_iterator end = input.end();
    // The phrase_parse call will fill the structure "variable" with the right values if the syntax is correct.
    // If the syntax is not correct, the call returns false.
    // So if input = "Integer #toto= 6", variable.m_type == "Integer", variable.m_is_static == true,
    // variable.m_name == "toto" and variable.m_value == "6".
    VariableInitializationParser<std::string::const_iterator> m_parser;
    bool ok = phrase_parse(iter, end, m_parser, boost::spirit::ascii::space, variable);
    // `return false` would convert to 0 and report success to the caller.
    if (!ok) return 1;
    std::cout << "Boost version : " << BOOST_VERSION << std::endl;
    std::cout << "Type : " << variable.m_type << std::endl
              << "Is Static : " << variable.m_is_static << std::endl
              << "Name :" << variable.m_name << std::endl
              << "Value :" << variable.m_value << std::endl;
    return 0;
}
The output of the code above differs between Boost 1.46.1 and Boost 1.54.
With boost 1.46.1 I have the following Output :
Boost version : 104601
Type : Position
Is Static : 1
Name :toto
Value :1;2;2
With boost 1.54 I have :
Boost version : 105400
Type :
Is Static : 1
Name :toto
Value :1;2;2
As you can see in boost 1.54 the Position output is not filled by the parser.
I read (maybe not carefully) the changelog of Fusion and Spirit, and I can't find why this happens.
Does someone have an explanation?
No this is not a regression. Quite clearly you depended on undocumented (possibly undefined) behaviour.
Just change lit into string, or just slightly more compact:
var_type = qi::raw[lit("Position")|"String"|"Numeric"|"Integer"|"Trajectory"];
See it Live On Coliru
#include <iostream>
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
using namespace std;
using namespace boost::spirit;
// Parsed form of one variable initialization; the members are listed in the
// order in which the grammar's start rule yields them.
struct VariableInitialization
{
std::string m_type; // type keyword
bool m_is_static; // looked up through the IsStatic symbol table
std::string m_name; // variable identifier
std::string m_value; // text after the '='
};
// Fusion adaptation lets Spirit assign the members as a sequence.
BOOST_FUSION_ADAPT_STRUCT(
VariableInitialization,
(std::string, m_type)
(bool, m_is_static)
(std::string, m_name)
(std::string, m_value)
)
// Corrected grammar: the type keyword is captured with raw[] so that it ends
// up in m_type.
template <typename Iterator>
struct VariableInitializationParser : qi::grammar<Iterator, VariableInitialization(), ascii::space_type> {
// Symbol table mapping the static marker to a bool.
// NOTE(review): both entries use the same key "#"; confirm the key intended
// for the non-static case.
struct IsStatic_ : qi::symbols<char, bool> {
IsStatic_() {
add("#", false)("#", true);
}
} IsStatic;
VariableInitializationParser() :
VariableInitializationParser::base_type(start) {
using qi::lit;
using ascii::char_;
// raw[] exposes the matched input range, which fills the std::string
// attribute; lit() alone exposes nothing.
var_type = qi::raw[lit("Position")|"String"|"Numeric"|"Integer"|"Trajectory"];
// A letter or underscore followed by any number of letters, digits or
// underscores.
identifier = (ascii::alpha|'_') >> *(ascii::alnum|'_');
// All remaining characters, matched with skipping disabled.
var_value = qi::lexeme[*(char_)];
start = var_type >> IsStatic >> identifier >> '=' >> var_value;
}
qi::rule<Iterator, std::string(), ascii::space_type> var_type;
qi::rule<Iterator, std::string(), ascii::space_type> identifier;
qi::rule<Iterator, std::string(), ascii::space_type> var_value;
qi::rule<Iterator, VariableInitialization(), ascii::space_type> start;
};
// Parses one sample initialization with the corrected grammar and prints
// the extracted fields. Returns 0 on success and 1 on a parse failure.
int main()
{
    VariableInitialization variable;
    std::string input = "Position #toto=1;2;2";
    std::string::const_iterator iter = input.begin();
    std::string::const_iterator end = input.end();
    VariableInitializationParser<std::string::const_iterator> m_parser;
    bool ok = phrase_parse(iter, end, m_parser, boost::spirit::ascii::space, variable);
    // `return false` would convert to 0 and signal success to the caller.
    if (!ok) return 1;
    std::cout << "Boost version : " << BOOST_VERSION << "\n";
    std::cout << "Type : " << variable.m_type << "\n"
              << "Is Static : " << variable.m_is_static << "\n"
              << "Name :" << variable.m_name << "\n"
              << "Value :" << variable.m_value << "\n";
    return 0;
}
I'm trying to implement a parser for an old forth-based grammar where most of the functions take the form of: "num" "num" "command" where command is a string of some kind.
For example:
0 1 HSFF
41 SENSOR ON
1 12.0 BH 4 LNON
As you can see, the grammar is [mostly] reverse polish notation, with some string of arguments preceding the command. The grammar is pseudo white-space dependent, in that:
0 1 HSFF 41 SENSOR ON
Is as valid as:
0 1 HSFF
41 SENSOR ON
(In other words '\n' is treated just as a space)
Extra whitespace is also skipped, so:
0 1 HSFF 41 SENSOR ON
Is 2 valid commands with a lot of unnecessary whitespace.
All of this seemed simple enough, so I started chugging away at implementing the grammar. Of course, things are never as simple as they seem, and I found that my parser fails on the very first character (in this case an int). So, boiling things down, I tried implementing a single rule:
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
qi::rule<Iterator> Cmd_TARGETSENSPAIRCMD =
qi::int_ >> (lit("TARGET") | lit("SENSOR") | lit("PAIR") )
>> (lit("ON") | lit("OFF") | lit("ERASE") );
std::string in("0 TARGET ERASE\n");
Iterator = in.begin();
bool success = qi::parse(in.begin(), in.end(), Cmd_TARGETSENSPAIRCMD, ascii::space);
This code block always returns false, indicating that parsing has failed.
As you can see, the rule is that an int must be followed by two literals, in this case indicating whether the command is for a target, sensor, or pair, identified by the int, to be turned on, off, or erased.
If I look at the iterator to see where the parsing has stopped, it shows that it has failed immediately on the int. So I changed the rule to simply be +qi::int_, which succeeds in parsing the int, but fails on the literals. Shortening the rule to simply qi::int_ >> lit("TARGET") also fails.
I think the problem may be in the whitespace skipper I'm using, but I have been unable to determine what I'm doing wrong.
Is there a way to tell spirit that all tokens are separated by whitespace, with the exception of quoted strings (which turn into labels in my grammar)?
I have phantasized a little for you.
The first step I usually take is to come up with an AST model:
// AST model for the RPN language: a program is a list of instructions, each
// being an operand, a command with modifier, or a free-form command token.
namespace Ast
{
    enum Command { NO_CMD, TARGET, SENSOR, PAIR };
    enum Modifier { NO_MODIFIER, ON, OFF, ERASE };

    // Command word together with its modifier word.
    struct ModifiedCommand
    {
        Command cmd = NO_CMD;
        Modifier mod = NO_MODIFIER;
    };

    // Any command that is stored as its plain token text.
    struct OtherCommand
    {
        std::string token;

        OtherCommand(std::string token = "") : token(std::move(token)) { }
    };

    using Operand = boost::variant<int, double>;
    using RpnMachineInstruction = boost::variant<Operand, ModifiedCommand, OtherCommand>;
    using RpnMachineProgram = std::vector<RpnMachineInstruction>;
}
As you can see, I intend to distinguish integers and doubles for operand values, and I treat any "other" commands (like "HSFF") that weren't actively described in your grammar as free-form tokens (uppercase alphabetical).
Now, we map the rule definitions onto this:
// Constructor: assigns each rule its definition; parsing starts at _start.
RpnGrammar() : RpnGrammar::base_type(_start)
{
// A program is any number of instructions.
_start = *_instruction;
// An instruction is an operand, a command with modifier, or a free-form command.
_instruction = _operand | _mod_command | _other_command;
// The strict real parser is tried first, then int_.
_operand = _strict_double | qi::int_;
_mod_command = _command >> _modifier;
// One or more uppercase letters, collected as a string.
_other_command = qi::as_string [ +qi::char_("A-Z") ];
// helpers
_command.add("TARGET", Ast::TARGET)("SENSOR", Ast::SENSOR)("PAIR", Ast::PAIR);
_modifier.add("ON", Ast::ON)("OFF", Ast::OFF)("ERASE", Ast::ERASE);
}
The grammar parses the result into a list of instructions (Ast::RpnMachineProgram), where each instruction is either an operand or an operation (a command with modifier, or any other free-form command like "HSFF"). Here are the rule declarations:
qi::rule<It, Ast::RpnMachineProgram(), Skipper> _start;
qi::rule<It, Ast::RpnMachineInstruction(), Skipper> _instruction;
qi::rule<It, Ast::ModifiedCommand(), Skipper> _mod_command;
qi::rule<It, Ast::Operand(), Skipper> _operand;
// note: omitting the Skipper has the same effect as wrapping with `qi::lexeme`
qi::rule<It, Ast::OtherCommand()> _other_command;
qi::real_parser<double, boost::spirit::qi::strict_real_policies<double> > _strict_double;
qi::symbols<char, Ast::Command> _command;
qi::symbols<char, Ast::Modifier> _modifier;
You can see it parse the sample from the question:
Parse succeeded, 10 stack instructions
int:0 int:1 'HSFF'
int:41 SENSOR [ON]
int:1 double:12 'BH'
int:4 'LNON'
The output is created with a sample visitor that you could use as inspiration for an interpreter/executor.
See it Live On Coliru
Full Listing
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fstream>
namespace qi = boost::spirit::qi;
// AST model for the RPN language plus a visitor that prints a program.
namespace Ast
{
    enum Command { NO_CMD, TARGET, SENSOR, PAIR };
    enum Modifier { NO_MODIFIER, ON, OFF, ERASE };

    // Command word together with its modifier word.
    struct ModifiedCommand
    {
        Command cmd = NO_CMD;
        Modifier mod = NO_MODIFIER;
    };

    // Any command that is stored as its plain token text.
    struct OtherCommand
    {
        std::string token;

        OtherCommand(std::string token = "") : token(std::move(token)) { }
    };

    using Operand = boost::variant<int, double>;
    using RpnMachineInstruction = boost::variant<Operand, ModifiedCommand, OtherCommand>;
    using RpnMachineProgram = std::vector<RpnMachineInstruction>;

    // for printing, you can adapt this to execute the stack instead
    //
    // Operands are written as "int:N " / "double:X "; a command ends the
    // current output line.
    struct Print : boost::static_visitor<std::ostream&>
    {
        std::ostream& os;

        Print(std::ostream& os) : os(os) {}

        // Command keyword followed by a space.
        std::ostream& operator()(Ast::Command cmd) const {
            const char* label = "#INVALID_COMMAND#";
            switch (cmd) {
                case TARGET: label = "TARGET"; break;
                case SENSOR: label = "SENSOR"; break;
                case PAIR:   label = "PAIR";   break;
                case NO_CMD: label = "NO_CMD"; break;
                default: break;
            }
            return os << label << " ";
        }

        // Modifier keyword in square brackets followed by a space.
        std::ostream& operator()(Ast::Modifier mod) const {
            const char* label = "#INVALID_MODIFIER#";
            switch (mod) {
                case ON:          label = "[ON]";          break;
                case OFF:         label = "[OFF]";         break;
                case ERASE:       label = "[ERASE]";       break;
                case NO_MODIFIER: label = "[NO_MODIFIER]"; break;
                default: break;
            }
            return os << label << " ";
        }

        std::ostream& operator()(double d) const {
            return os << "double:" << d << " ";
        }

        std::ostream& operator()(int i) const {
            return os << "int:" << i << " ";
        }

        // Free-form command: quoted token, then a line break.
        std::ostream& operator()(Ast::OtherCommand const& cmd) const {
            return os << "'" << cmd.token << "'\n";
        }

        // Command and modifier, then a line break.
        std::ostream& operator()(Ast::ModifiedCommand const& cmd) const {
            operator()(cmd.cmd);
            operator()(cmd.mod);
            return os << "\n";
        }

        // Any variant: dispatch to the overload for the alternative it holds.
        template <typename... TVariant>
        std::ostream& operator()(boost::variant<TVariant...> const& v) const {
            return boost::apply_visitor(*this, v);
        }
    };
}
BOOST_FUSION_ADAPT_STRUCT(Ast::ModifiedCommand, (Ast::Command, cmd)(Ast::Modifier, mod))
// Grammar turning RPN input into an Ast::RpnMachineProgram; tokens are
// separated by whatever the Skipper matches (qi::space by default).
template <typename It, typename Skipper = qi::space_type>
struct RpnGrammar : qi::grammar<It, Ast::RpnMachineProgram(), Skipper>
{
RpnGrammar() : RpnGrammar::base_type(_start)
{
// Keyword tables mapping command and modifier words to their enum values.
_command.add("TARGET", Ast::TARGET)("SENSOR", Ast::SENSOR)("PAIR", Ast::PAIR);
_modifier.add("ON", Ast::ON)("OFF", Ast::OFF)("ERASE", Ast::ERASE);
// A program is any number of instructions.
_start = *_instruction;
// An instruction is an operand, a command with modifier, or a free-form command.
_instruction = _operand | _mod_command | _other_command;
// The strict real parser is tried first, then int_.
_operand = _strict_double | qi::int_;
_mod_command = _command >> _modifier;
// One or more uppercase letters, collected as a string.
_other_command = qi::as_string [ +qi::char_("A-Z") ];
}
private:
qi::rule<It, Ast::RpnMachineProgram(), Skipper> _start;
qi::rule<It, Ast::RpnMachineInstruction(), Skipper> _instruction;
qi::rule<It, Ast::ModifiedCommand(), Skipper> _mod_command;
qi::rule<It, Ast::Operand(), Skipper> _operand;
// note: omitting the Skipper has the same effect as wrapping with `qi::lexeme`
qi::rule<It, Ast::OtherCommand()> _other_command;
// Real-number parser using strict_real_policies.
qi::real_parser<double, boost::spirit::qi::strict_real_policies<double> > _strict_double;
qi::symbols<char, Ast::Command> _command;
qi::symbols<char, Ast::Modifier> _modifier;
};
// Parses "input.txt" as an RPN program and prints the resulting instructions.
//
// Returns 0 when the whole file was parsed, 1 when the file cannot be
// opened, the parse fails, or unparsed input remains.
int main()
{
    std::ifstream ifs("input.txt");
    if (!ifs)
    {
        // Without this check a missing file is parsed as empty input and
        // reported as a successful parse of 0 instructions.
        std::cerr << "Cannot open input.txt\n";
        return 1;
    }
    // Whitespace has to reach the parser; the grammar's skipper handles it.
    ifs.unsetf(std::ios::skipws);

    typedef boost::spirit::istream_iterator It;
    RpnGrammar<It> grammar;
    It f(ifs), l;
    Ast::RpnMachineProgram program;

    const bool ok = qi::phrase_parse(f, l, grammar, qi::space, program);
    if (ok)
    {
        std::cout << "Parse succeeded, " << program.size() << " stack instructions\n";
        std::for_each(
            program.begin(),
            program.end(),
            Ast::Print(std::cout));
    }
    else
    {
        std::cout << "Parse failed\n";
    }

    // The start rule is a Kleene star, so leftover input is how a syntax
    // error shows up.
    const bool leftover = (f != l);
    if (leftover)
    {
        std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
    }
    return (ok && !leftover) ? 0 : 1;
}