C++ Logo

std-proposals

Advanced search

[std-proposals] regex_top_level_token_iterator

From: Frederick Virchanza Gotham <cauldwell.thomas_at_[hidden]>
Date: Mon, 2 May 2022 15:43:22 +0100
I'm currently writing the "pre-compiler" for my proposed idea,
'Continuity Methods' (see link to original post:
https://lists.isocpp.org/std-proposals/2022/04/3765.php ).

In writing the pre-compiler, I have to parse class definitions and
find all their base classes. If I take a C++ source file and pass it
through the preprocessor to produce a translation unit, and if I
then feed this translation unit into my pre-compiler, it gives me
complex class definitions such as the following:

    class MyClass : public
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type, public ::std::string {
        /* members go in here */
    };

When I remove the class name and the colon from the front (i.e. "class
MyClass : "), I'm left with the comma-separated string containing the
names of the base classes:

    "::std::vector<int>,
::std::__allocator_traits_base::__rebind<_Tp,_Up,__void_t<typename
_Tp::template rebind<_Up>::other>>::value_type"

When dealing with a string of comma-separated base classes like this,
there can be commas inside the angle brackets, and so I need to ignore
these commas within nested brackets. I've tried a few different regex
formulae for ignoring matches found within brackets, but none of them
work all the time -- and they all fail when dealing with a long string
containing many nested pairs of brackets.

So I figured it would make sense to complement
"std::regex_token_iterator" with a new kind of class:
"regex_top_level_token_iterator". This new class will match a regex
only if the match doesn't take place within brackets () [] {} <>.
There's no limit on the level of nesting.

Here's what I've got so far:

#include <cassert> // cassert
#include <cstddef> // size_t
#include <iterator> // iterator_traits
#include <regex> // regex_token_iterator, regex, regex_traits, regex_constants

/**
 * Token iterator that wraps std::regex_token_iterator but only accepts a
 * token whose end lies outside every bracket pair () [] {} <>.
 *
 * Tokens that end inside brackets are skipped; the sub_match returned for
 * the next accepted token then spans from the start of the first skipped
 * token to the end of the accepted one.  With sub-match index -1 this has
 * the effect of ignoring separators that are nested inside brackets.
 *
 * Limitations:
 *  - Brackets are counted purely textually, so every '<' and '>' counts as a
 *    bracket (including comparison, shift and "->" operators).
 *  - Every top-level check rescans from the start of the sequence, so walking
 *    the whole input costs quadratic time in its length.
 *  - If the sequence ends while brackets are still open, the skipped tokens
 *    are dropped and the iterator becomes the end iterator.
 *  - As with std::regex_token_iterator, the regex must outlive the iterator.
 *
 * @tparam BidirIt  bidirectional iterator over the character sequence
 * @tparam CharT    character type, by default the iterator's value_type
 * @tparam Traits   regex traits, by default std::regex_traits<CharT>
 */
template<
         class BidirIt,
         class CharT = typename std::iterator_traits<BidirIt>::value_type,
         class Traits = std::regex_traits<CharT>
>
class regex_top_level_token_iterator :
    std::regex_token_iterator<BidirIt,CharT,Traits> {
private:

    using Base = std::regex_token_iterator<BidirIt,CharT,Traits>;

    // Access to the wrapped standard iterator.  These are functions rather
    // than a stored reference: a stored reference would be copied verbatim by
    // the implicit copy constructor (so a copy would keep driving the object
    // it was copied from) and would also delete copy assignment.
    Base       &base(void)       { return *this; }
    Base const &base(void) const { return *this; }

public:

    // Iterator member types; private inheritance hides the ones in Base, so
    // they are re-exported here for std::iterator_traits and generic code.
    using value_type        = typename Base::value_type;
    using difference_type   = typename Base::difference_type;
    using pointer           = typename Base::pointer;
    using reference         = typename Base::reference;
    using iterator_category = typename Base::iterator_category;
    using regex_type        = typename Base::regex_type;

protected:

    BidirIt _a; // start of the searched sequence; set by the constructors

    typename Base::value_type _strided_match; // starts off with matched = false

    /**
     * Reports whether the end of the current base token is at bracket
     * nesting level zero, counting brackets from the start of the sequence.
     * Must not be called on a "no more matches" token iterator.
     */
    bool Is_Top_Level(void) const
    {
        assert( base() != Base() );

        std::size_t counts[4u] = {}; /* (), [], {}, <> */

        BidirIt const stop = (*base()).second;

        for ( BidirIt iter = _a; iter != stop; ++iter )
        {
            // Unsigned counters wrap on a stray closing bracket, which still
            // leaves them non-zero, i.e. "not top level".
            switch ( *iter )
            {
            case '(': ++(counts[0u]); break;
            case ')': --(counts[0u]); break;

            case '[': ++(counts[1u]); break;
            case ']': --(counts[1u]); break;

            case '{': ++(counts[2u]); break;
            case '}': --(counts[2u]); break;

            case '<': ++(counts[3u]); break;
            case '>': --(counts[3u]); break;

            default: break;
            }
        }

        for ( auto const &count : counts )
        {
            if ( 0u != count ) return false;
        }

        return true;
    }

public:

    /** Constructs the end-of-sequence ("no more matches") iterator. */
    regex_top_level_token_iterator(void) : Base(), _a(), _strided_match() {}

    /**
     * Advances the base iterator until it rests on a token that ends at top
     * level, or until it is exhausted.  While skipping, the start of the
     * first skipped token is remembered in _strided_match so that operator*
     * can return the merged range.
     */
    void Keep_Searching_If_Necessary(void)
    {
        for ( _strided_match.matched = false; base() != Base(); ++base() )
        {
            if ( Is_Top_Level() )
            {
                // Only meaningful when something was skipped; harmless otherwise.
                _strided_match.second = (*base()).second;
                return;
            }

            if ( false == _strided_match.matched )
            {
                _strided_match.matched = true;
                _strided_match.first = (*base()).first;
            }
        }

        // Base iterator exhausted: nothing left to return.
        _strided_match.matched = false;
    }

    /**
     * Constructs an iterator over [a, b) positioned on the first top-level token.
     *
     * @param a         start of the sequence to search
     * @param b         end of the sequence to search
     * @param re        regular expression; must outlive the iterator
     * @param submatch  sub-match index forwarded to std::regex_token_iterator
     *                  (-1 selects the text between matches)
     * @param m         match flags forwarded to std::regex_token_iterator
     */
    regex_top_level_token_iterator(BidirIt const a, BidirIt const b,
                                   typename Base::regex_type const &re,
                                   int const submatch = 0,
                                   std::regex_constants::match_flag_type const m =
                                       std::regex_constants::match_default )
      : Base(a,b,re,submatch,m), _a(a), _strided_match()
    {
        Keep_Searching_If_Necessary();
    }

    /** Pre-increment: moves to the next top-level token.  Must not be called on the end iterator. */
    regex_top_level_token_iterator &operator++(void)
    {
        assert( base() != Base() );

        ++base();

        Keep_Searching_If_Necessary();

        return *this;
    }

    /** Post-increment: advances and returns a copy of the previous position. */
    regex_top_level_token_iterator operator++(int)
    {
        regex_top_level_token_iterator previous(*this);

        ++(*this);

        return previous;
    }

    /** Two iterators are equal when their underlying base iterators are equal. */
    bool operator==(regex_top_level_token_iterator const &rhs) const
    {
        return base() == rhs.base();
    }

    /** Spelled out so that the class also works before C++20. */
    bool operator!=(regex_top_level_token_iterator const &rhs) const
    {
        return !( *this == rhs );
    }

    /**
     * Returns the current token: the merged range when tokens were skipped,
     * otherwise the base iterator's own token.  Must not be called on the
     * end iterator.
     */
    typename Base::value_type const &operator*(void) const
    {
        assert( base() != Base() );

        if ( false == _strided_match.matched )
        {
            return *base();
        }

        return _strided_match;
    }

    /** Member access to the current token; same precondition as operator*. */
    pointer operator->(void) const
    {
        return &( **this );
    }
};

#include <string>
#include <string_view>

// Top-level token iterator over the characters of a std::string.
using sregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string::const_iterator>;
// Top-level token iterator over the characters of a std::string_view.
using svregex_top_level_token_iterator =
regex_top_level_token_iterator<std::string_view::const_iterator>;

#include <iostream>
#include <string>
using std::cout;
using std::endl;

// Demo: prints a sample list, then each comma-separated token on its own
// line, using the top-level token iterator so that commas nested inside
// brackets do not split a token.
auto main(void) -> int
{
    // Written as two adjacent literals (concatenated by the compiler) so the
    // text stays one line; a literal may not contain a raw line break.
    std::string const str("dog, cat, fish, (frogs,toads), monkeys, "
                          "elephants, (lizards, amphibians<true,1>), sharks");

    std::cout << str << '\n';

    std::regex const my_separator(",");

    // Sub-match index -1 selects the text between separator matches.
    for ( sregex_top_level_token_iterator iter( str.cbegin(), str.cend(), my_separator, -1 );
          sregex_top_level_token_iterator() != iter;
          ++iter )
    {
        std::cout << *iter << '\n';
    }

    return 0;
}


I've also written a similar class "regex_top_level_iterator" which has
similar functionality to "regex_iterator".

Received on 2022-05-02 14:43:32