Date: Mon, 2 May 2022 15:43:22 +0100
I'm currently writing the "pre-compiler" for my proposed idea,
'Continuity Methods' (see link to original post:
https://lists.isocpp.org/std-proposals/2022/04/3765.php ).
In writing the pre-compiler, I have to parse class definitions and
find all their base classes. If I take a C++ source file and pass it
through the preprocessor to produce a translation unit, and if I
then feed this translation unit into my pre-compiler, it gives me
complex class definitions such as the following:
class MyClass : public
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type, public ::std::string {
/* members go in here */
};
When I remove the class name and the colon from the front (i.e. "class
MyClass : "), I'm left with the comma-separated string containing the
names of the base classes:
"::std::vector<int>,
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type"
When dealing with a string of comma-separated base classes like this,
there can be commas inside the angle brackets, and so I need to ignore
these commas within nested brackets. I've tried a few different regex
formulae for ignoring matches found within brackets, but none of them
work all the time -- and they all fail when dealing with a long string
containing many nested pairs of brackets.
So I figured it would make sense to complement
"std::regex_token_iterator" with a new kind of class:
"regex_top_level_token_iterator". This new class will match a regex
only if the match doesn't take place within brackets () [] {} <>.
There's no limit on the level of nesting.
Here's what I've got so far:
#include <cassert> // cassert
#include <iterator> // iterator_traits
#include <regex> // regex_token_iterator, regex, regex_traits, regex_constants
/**
 * Token iterator that wraps std::regex_token_iterator and only treats a token
 * boundary as real when every bracket pair -- (), [], {} and <> -- opened
 * since the start of the input has been closed again.  Consecutive tokens of
 * the underlying iterator that end inside an open bracket are merged into a
 * single token.
 *
 * Notes:
 *  - Every '<' and '>' is counted as a bracket, so text containing operators
 *    such as "->" or "<" skews the nesting count.
 *  - Is_Top_Level() rescans the input from its beginning for every token of
 *    the underlying iterator, so a full traversal costs O(tokens * length).
 *  - The regex passed to the constructor is handed to the base iterator by
 *    reference and must outlive this iterator.
 *  - If the input ends while a bracket is still open, the merged remainder is
 *    delivered as a final token instead of being discarded.
 *
 * @tparam BidirIt  iterator type of the character sequence being tokenised
 * @tparam CharT    character type (defaults to BidirIt's value_type)
 * @tparam Traits   regex traits (defaults to std::regex_traits<CharT>)
 */
template<
    class BidirIt,
    class CharT = typename std::iterator_traits<BidirIt>::value_type,
    class Traits = std::regex_traits<CharT>
>
class regex_top_level_token_iterator :
    std::regex_token_iterator<BidirIt,CharT,Traits> {
private:
    using Base = std::regex_token_iterator<BidirIt,CharT,Traits>;

    // Access to the base sub-object.  These are functions rather than a stored
    // reference member: a reference member bound to *this would be re-bound to
    // the source object by the implicit copy constructor, and would delete
    // copy assignment.
    Base       &as_base(void)       noexcept { return *this; }
    Base const &as_base(void) const noexcept { return *this; }

    // True once the underlying token iterator has no tokens left.
    bool base_exhausted(void) const { return as_base() == Base(); }

    // True when there is nothing left to dereference at all.
    bool at_end(void) const
    {
        return base_exhausted() && !_strided_match.matched;
    }

protected:
    BidirIt _a; // beginning of the input; set in the constructor's initialiser list

    // When matched == true this holds the token merged from several base
    // tokens; when matched == false the current base token is used as-is.
    typename Base::value_type _strided_match; /* starts off with matched = false */

    /**
     * @return true when the counts of (), [], {} and <> taken from the start
     *         of the input up to the end of the current base token all
     *         balance out to zero.
     * Must not be called once the underlying iterator is exhausted.
     */
    bool Is_Top_Level(void) const
    {
        assert( !base_exhausted() );

        long depth[4u] = {}; /* (), [], {}, <> */
        BidirIt const stop = (*as_base()).second;

        for ( BidirIt iter = _a; iter != stop; ++iter )
        {
            switch ( *iter )
            {
            case '(': ++depth[0u]; break;
            case ')': --depth[0u]; break;
            case '[': ++depth[1u]; break;
            case ']': --depth[1u]; break;
            case '{': ++depth[2u]; break;
            case '}': --depth[2u]; break;
            case '<': ++depth[3u]; break;
            case '>': --depth[3u]; break;
            }
        }

        for ( long const d : depth )
        {
            if ( 0 != d ) return false;
        }

        return true;
    }

public:
    // Standard iterator member types, taken from the wrapped iterator.
    using value_type        = typename Base::value_type;
    using difference_type   = typename Base::difference_type;
    using pointer           = typename Base::pointer;
    using reference         = typename Base::reference;
    using iterator_category = typename Base::iterator_category;

    /** Constructs the end-of-sequence ("no more matches") iterator. */
    regex_top_level_token_iterator(void) : Base(), _a() {}

    /**
     * Starting at the current base token, advances the underlying iterator
     * until a token ends at bracket depth zero, merging every token passed on
     * the way into _strided_match.
     */
    void Keep_Searching_If_Necessary(void)
    {
        _strided_match.matched = false;

        for ( ; !base_exhausted(); ++as_base() )
        {
            typename Base::value_type const &piece = *as_base();

            if ( this->Is_Top_Level() )
            {
                // Only meaningful when a merge is in progress.
                _strided_match.second = piece.second;
                return;
            }

            if ( !_strided_match.matched )
            {
                _strided_match.matched = true;
                _strided_match.first = piece.first;
            }

            // Track the end as we go: if the input runs out before the
            // brackets close, the base iterator can no longer be dereferenced.
            _strided_match.second = piece.second;
        }

        // The underlying iterator ran out.  If a merge was in progress,
        // _strided_match stays "matched" so the unbalanced remainder is still
        // delivered as one last token.
    }

    /**
     * @param a         beginning of the input
     * @param b         end of the input
     * @param re        separator/token regex; must outlive the iterator
     * @param submatch  forwarded to std::regex_token_iterator (-1 selects the
     *                  text between matches)
     * @param m         match flags forwarded to std::regex_token_iterator
     */
    regex_top_level_token_iterator(BidirIt const a, BidirIt const b,
                                   typename Base::regex_type const &re,
                                   int const submatch = 0,
                                   std::regex_constants::match_flag_type const m =
                                       std::regex_constants::match_default )
      : Base(a,b,re,submatch,m), _a(a)
    {
        Keep_Searching_If_Necessary();
    }

    /** Advances to the next top-level token.  Must not be called at the end. */
    regex_top_level_token_iterator &operator++(void)
    {
        assert( !at_end() );

        if ( base_exhausted() )
        {
            // The current token was the unbalanced remainder; nothing follows.
            _strided_match.matched = false;
            return *this;
        }

        ++as_base();
        Keep_Searching_If_Necessary();
        return *this;
    }

    /** Post-increment: returns a copy of the iterator as it was before. */
    regex_top_level_token_iterator operator++(int)
    {
        regex_top_level_token_iterator const previous(*this);
        ++*this;
        return previous;
    }

    /**
     * Two iterators are equal when their underlying iterators are equal; two
     * exhausted iterators additionally have to agree on whether a final merged
     * token is still pending.
     */
    bool operator==(regex_top_level_token_iterator const &rhs) const
    {
        if ( !(as_base() == rhs.as_base()) ) return false;

        if ( !base_exhausted() ) return true;

        return _strided_match.matched == rhs._strided_match.matched;
    }

    /** Provided explicitly so that the class is usable before C++20 as well. */
    bool operator!=(regex_top_level_token_iterator const &rhs) const
    {
        return !(*this == rhs);
    }

    /**
     * @return the merged token when one was assembled, otherwise the current
     *         token of the underlying iterator.  Must not be called at the end.
     */
    typename Base::value_type const &operator*(void) const
    {
        assert( !at_end() );
        return _strided_match.matched ? _strided_match : *as_base();
    }

    /** Member access to the current token.  Must not be called at the end. */
    typename Base::value_type const *operator->(void) const
    {
        return &**this;
    }
};
#include <string>
#include <string_view>
// Convenience alias: top-level token iterator over the characters of a std::string.
using sregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string::const_iterator>;
// Convenience alias: top-level token iterator over the characters of a std::string_view.
using svregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string_view::const_iterator>;
#include <iostream>
#include <string>
using std::cout;
using std::endl;
/**
 * Demo: prints a comma-separated list, then prints each token obtained by
 * splitting it on "," with sregex_top_level_token_iterator (submatch -1, i.e.
 * the text between separator matches), one token per line.
 */
auto main(void) -> int
{
    // Adjacent literals are concatenated, which keeps the text on one logical
    // line; a string literal may not contain a raw line break.
    std::string const str("dog, cat, fish, (frogs,toads), monkeys, "
                          "elephants, (lizards, amphibians<true,1>), sharks");

    cout << str << '\n';

    std::regex const my_separator(",");

    for ( sregex_top_level_token_iterator iter( str.cbegin(), str.cend(),
                                                my_separator, -1 );
          sregex_top_level_token_iterator() != iter;
          ++iter )
    {
        cout << *iter << '\n';
    }

    return 0;
}
I've also written a similar class "regex_top_level_iterator" which has
similar functionality to "regex_iterator".
'Continuity Methods' (see link to original post:
https://lists.isocpp.org/std-proposals/2022/04/3765.php ).
In writing the pre-compiler, I have to parse class definitions and
find all their base classes. If I take a C++ source file and pass it
through the preprocessor to produce a translation unit, and if I
then feed this translation unit into my pre-compiler, it gives me
complex class definitions such as the following:
class MyClass : public
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type, public ::std::string {
/* members go in here */
};
When I remove the class name and the colon from the front (i.e. "class
MyClass : "), I'm left with the comma-separated string containing the
names of the base classes:
"::std::vector<int>,
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type"
When dealing with a string of comma-separated base classes like this,
there can be commas inside the angle brackets, and so I need to ignore
these commas within nested brackets. I've tried a few different regex
formulae for ignoring matches found within brackets, but none of them
work all the time -- and they all fail when dealing with a long string
containing many nested pairs of brackets.
So I figured it would make sense to complement
"std::regex_token_iterator" with a new kind of class:
"regex_top_level_token_iterator". This new class will match a regex
only if the match doesn't take place within brackets () [] {} <>.
There's no limit on the level of nesting.
Here's what I've got so far:
#include <cassert> // cassert
#include <iterator> // iterator_traits
#include <regex> // regex_token_iterator, regex, regex_traits, regex_constants
/**
 * Token iterator that wraps std::regex_token_iterator and only treats a token
 * boundary as real when every bracket pair -- (), [], {} and <> -- opened
 * since the start of the input has been closed again.  Consecutive tokens of
 * the underlying iterator that end inside an open bracket are merged into a
 * single token.
 *
 * Notes:
 *  - Every '<' and '>' is counted as a bracket, so text containing operators
 *    such as "->" or "<" skews the nesting count.
 *  - Is_Top_Level() rescans the input from its beginning for every token of
 *    the underlying iterator, so a full traversal costs O(tokens * length).
 *  - The regex passed to the constructor is handed to the base iterator by
 *    reference and must outlive this iterator.
 *  - If the input ends while a bracket is still open, the merged remainder is
 *    delivered as a final token instead of being discarded.
 *
 * @tparam BidirIt  iterator type of the character sequence being tokenised
 * @tparam CharT    character type (defaults to BidirIt's value_type)
 * @tparam Traits   regex traits (defaults to std::regex_traits<CharT>)
 */
template<
    class BidirIt,
    class CharT = typename std::iterator_traits<BidirIt>::value_type,
    class Traits = std::regex_traits<CharT>
>
class regex_top_level_token_iterator :
    std::regex_token_iterator<BidirIt,CharT,Traits> {
private:
    using Base = std::regex_token_iterator<BidirIt,CharT,Traits>;

    // Access to the base sub-object.  These are functions rather than a stored
    // reference member: a reference member bound to *this would be re-bound to
    // the source object by the implicit copy constructor, and would delete
    // copy assignment.
    Base       &as_base(void)       noexcept { return *this; }
    Base const &as_base(void) const noexcept { return *this; }

    // True once the underlying token iterator has no tokens left.
    bool base_exhausted(void) const { return as_base() == Base(); }

    // True when there is nothing left to dereference at all.
    bool at_end(void) const
    {
        return base_exhausted() && !_strided_match.matched;
    }

protected:
    BidirIt _a; // beginning of the input; set in the constructor's initialiser list

    // When matched == true this holds the token merged from several base
    // tokens; when matched == false the current base token is used as-is.
    typename Base::value_type _strided_match; /* starts off with matched = false */

    /**
     * @return true when the counts of (), [], {} and <> taken from the start
     *         of the input up to the end of the current base token all
     *         balance out to zero.
     * Must not be called once the underlying iterator is exhausted.
     */
    bool Is_Top_Level(void) const
    {
        assert( !base_exhausted() );

        long depth[4u] = {}; /* (), [], {}, <> */
        BidirIt const stop = (*as_base()).second;

        for ( BidirIt iter = _a; iter != stop; ++iter )
        {
            switch ( *iter )
            {
            case '(': ++depth[0u]; break;
            case ')': --depth[0u]; break;
            case '[': ++depth[1u]; break;
            case ']': --depth[1u]; break;
            case '{': ++depth[2u]; break;
            case '}': --depth[2u]; break;
            case '<': ++depth[3u]; break;
            case '>': --depth[3u]; break;
            }
        }

        for ( long const d : depth )
        {
            if ( 0 != d ) return false;
        }

        return true;
    }

public:
    // Standard iterator member types, taken from the wrapped iterator.
    using value_type        = typename Base::value_type;
    using difference_type   = typename Base::difference_type;
    using pointer           = typename Base::pointer;
    using reference         = typename Base::reference;
    using iterator_category = typename Base::iterator_category;

    /** Constructs the end-of-sequence ("no more matches") iterator. */
    regex_top_level_token_iterator(void) : Base(), _a() {}

    /**
     * Starting at the current base token, advances the underlying iterator
     * until a token ends at bracket depth zero, merging every token passed on
     * the way into _strided_match.
     */
    void Keep_Searching_If_Necessary(void)
    {
        _strided_match.matched = false;

        for ( ; !base_exhausted(); ++as_base() )
        {
            typename Base::value_type const &piece = *as_base();

            if ( this->Is_Top_Level() )
            {
                // Only meaningful when a merge is in progress.
                _strided_match.second = piece.second;
                return;
            }

            if ( !_strided_match.matched )
            {
                _strided_match.matched = true;
                _strided_match.first = piece.first;
            }

            // Track the end as we go: if the input runs out before the
            // brackets close, the base iterator can no longer be dereferenced.
            _strided_match.second = piece.second;
        }

        // The underlying iterator ran out.  If a merge was in progress,
        // _strided_match stays "matched" so the unbalanced remainder is still
        // delivered as one last token.
    }

    /**
     * @param a         beginning of the input
     * @param b         end of the input
     * @param re        separator/token regex; must outlive the iterator
     * @param submatch  forwarded to std::regex_token_iterator (-1 selects the
     *                  text between matches)
     * @param m         match flags forwarded to std::regex_token_iterator
     */
    regex_top_level_token_iterator(BidirIt const a, BidirIt const b,
                                   typename Base::regex_type const &re,
                                   int const submatch = 0,
                                   std::regex_constants::match_flag_type const m =
                                       std::regex_constants::match_default )
      : Base(a,b,re,submatch,m), _a(a)
    {
        Keep_Searching_If_Necessary();
    }

    /** Advances to the next top-level token.  Must not be called at the end. */
    regex_top_level_token_iterator &operator++(void)
    {
        assert( !at_end() );

        if ( base_exhausted() )
        {
            // The current token was the unbalanced remainder; nothing follows.
            _strided_match.matched = false;
            return *this;
        }

        ++as_base();
        Keep_Searching_If_Necessary();
        return *this;
    }

    /** Post-increment: returns a copy of the iterator as it was before. */
    regex_top_level_token_iterator operator++(int)
    {
        regex_top_level_token_iterator const previous(*this);
        ++*this;
        return previous;
    }

    /**
     * Two iterators are equal when their underlying iterators are equal; two
     * exhausted iterators additionally have to agree on whether a final merged
     * token is still pending.
     */
    bool operator==(regex_top_level_token_iterator const &rhs) const
    {
        if ( !(as_base() == rhs.as_base()) ) return false;

        if ( !base_exhausted() ) return true;

        return _strided_match.matched == rhs._strided_match.matched;
    }

    /** Provided explicitly so that the class is usable before C++20 as well. */
    bool operator!=(regex_top_level_token_iterator const &rhs) const
    {
        return !(*this == rhs);
    }

    /**
     * @return the merged token when one was assembled, otherwise the current
     *         token of the underlying iterator.  Must not be called at the end.
     */
    typename Base::value_type const &operator*(void) const
    {
        assert( !at_end() );
        return _strided_match.matched ? _strided_match : *as_base();
    }

    /** Member access to the current token.  Must not be called at the end. */
    typename Base::value_type const *operator->(void) const
    {
        return &**this;
    }
};
#include <string>
#include <string_view>
// Convenience alias: top-level token iterator over the characters of a std::string.
using sregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string::const_iterator>;
// Convenience alias: top-level token iterator over the characters of a std::string_view.
using svregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string_view::const_iterator>;
#include <iostream>
#include <string>
using std::cout;
using std::endl;
/**
 * Demo: prints a comma-separated list, then prints each token obtained by
 * splitting it on "," with sregex_top_level_token_iterator (submatch -1, i.e.
 * the text between separator matches), one token per line.
 */
auto main(void) -> int
{
    // Adjacent literals are concatenated, which keeps the text on one logical
    // line; a string literal may not contain a raw line break.
    std::string const str("dog, cat, fish, (frogs,toads), monkeys, "
                          "elephants, (lizards, amphibians<true,1>), sharks");

    cout << str << '\n';

    std::regex const my_separator(",");

    for ( sregex_top_level_token_iterator iter( str.cbegin(), str.cend(),
                                                my_separator, -1 );
          sregex_top_level_token_iterator() != iter;
          ++iter )
    {
        cout << *iter << '\n';
    }

    return 0;
}
I've also written a similar class "regex_top_level_iterator" which has
similar functionality to "regex_iterator".
Received on 2022-05-02 14:43:32