boost::adaptors::tokenized

References

Headers

boost::adaptors::tokenized is available by including any of the following headers:

  • boost/range/adaptor/tokenized.hpp or
  • boost/range/adaptors.hpp

Examples

tokenized.cpp

#include <iostream>
#include <string>
#include <vector>

#include <boost/range/adaptors.hpp>

// Regular expressions shared by the demos below.
// NOTE(review): boost/regex.hpp is not included explicitly; it is
// presumably pulled in transitively by boost/range/adaptors.hpp
// (the tokenized adaptor is regex-based) -- confirm.
const auto WORD_RE = boost::regex("\\w+");                // a run of word characters
const auto WHITESPACE_RE = boost::regex("\\s+");          // a run of whitespace
const auto SUBMATCH_RE = boost::regex("(\\w\\w)(\\w+)");  // group 1: first two letters; group 2: the rest

// Input string used by every demo.
const std::string str = "Lorem ipsum dolor sit amet";


// For tokenization, pipe variant is preferable because it offers sane
// default parameters. In contrast, the tokenize() function does not
// have default parameter values, which makes it more painful to use.
//
// The iterator/range value type is boost::sub_match, which is equivalent
// to std::sub_match. The best documentation for those classes I've found
// is this: http://www.cplusplus.com/reference/regex/sub_match/
//
// More in-depth information can be found on:
// http://www.cplusplus.com/reference/regex/regex_token_iterator/regex_token_iterator/


void whole_match_demo() {
    // Demonstrates the default sub-match index 0: each match of the RE is
    // delivered as one whole sub_match object, whose iterator pair
    // (first/second) points back into the underlying string.
    std::cout << "Whole word matches, showing off sub_match objects:" << std::endl;
    for (const auto & match : str | boost::adaptors::tokenized(WORD_RE)) {
        // Translate the sub_match's iterators into character offsets.
        const size_t begin_idx = match.first - str.begin();
        const size_t end_idx = match.second - str.begin();
        std::cout << "  "
                  << "token [" << match.str() << "] "
                  << "begin: " << begin_idx << "; "
                  << "end: " << end_idx << "; "
                  << "length: " << match.length() << ";"
                  << std::endl;
    }
}

void not_matching_demo() {
    // With sub-match index -1 the adaptor inverts its role: it yields the
    // stretches of input that do NOT match the regular expression, i.e.
    // it splits the string on the RE.
    std::cout << "Tokens separated by whitespace:" << std::endl << "  ";
    const auto pieces = str | boost::adaptors::tokenized(WHITESPACE_RE, -1);
    for (const auto & piece : pieces) {
        std::cout << "[" << piece << "] ";
    }
    std::cout << std::endl;
}

void one_sub_match_demo() {
    // A positive sub-match index picks the corresponding capture group of
    // the regex; here group 1 of SUBMATCH_RE.
    std::cout << "First two letters of each word:" << std::endl << "  ";
    const auto groups = str | boost::adaptors::tokenized(SUBMATCH_RE, 1);
    for (const auto & group : groups) {
        std::cout << "[" << group << "] ";
    }
    std::cout << std::endl;
}

void multi_sub_match_demo() {
    // Multiple sub-matches are selected by passing a vector of sub-match
    // indices. The resulting range then contains one entry per requested
    // sub-match for every single match, in order.
    // I'm not aware of a direct way to get the sub-match index within
    // the for loop. A workaround is to use the range adaptor indexed().
    //
    // Fix: renamed the local from the typo "submaps" and made it const;
    // the file now also includes <vector> instead of relying on a
    // transitive include for std::vector.
    const std::vector<int> sub_match_indices = {1, 2};
    std::cout << "Full split into sub-matches:" << std::endl << "  ";
    for (const auto & token : str | boost::adaptors::tokenized(SUBMATCH_RE, sub_match_indices)) {
        std::cout << "[" << token << "] ";
    }
    std::cout << std::endl;
}

int main() {
    // Run every tokenized() demo in turn; each writes to stdout.
    whole_match_demo();
    not_matching_demo();
    one_sub_match_demo();
    multi_sub_match_demo();
    // Returning 0 explicitly is optional in main, but kept for clarity.
    return 0;
}

Output:

Whole word matches, showing off sub_match objects:
  token [] begin: 5; end: 5; length: 0;
  token [] begin: 11; end: 11; length: 0;
  token [] begin: 17; end: 17; length: 0;
  token [] begin: 21; end: 21; length: 0;
  token [] begin: 26; end: 26; length: 0;
Tokens separated by whitespace:
  [] [] [] [] 
First two letters of each word:
  [] [] [] [] [] 
Full split into sub-matches:
  [Lo] [rem] [ip] [sum] [do] [lor] [si] [t] [am] [et]

 

Boost Range for Humans

This reference is part of Boost Range for Humans. Follow the link above for an overview.