// preprocess: applies optional regex-based erasure and optional lower-casing
// to the elements of a character vector.
//   input      - character vector whose elements are iterated over below.
//   erase      - regular expression; every match is replaced by the empty
//                string. An empty pattern disables this step.
//   lower_case - when true, every char of the working string is passed
//                through tolower.
// NOTE(review): only part of this definition is visible here; the declaration
// of temp, the write-back of the processed string and the return statement
// are not shown, so they are not documented.
9 Rcpp::CharacterVector preprocess(
10 Rcpp::CharacterVector input,
11 std::string erase =
"[^.?!:;'[:alnum:][:space:]]",
12 bool lower_case =
true
// Compile the pattern once, outside the loop over the input elements.
15 std::regex erase_(erase);
// The end iterator is cached so that it is not re-evaluated on each pass.
17 auto itend = input.end();
18 for(
auto it = input.begin(); it != itend; ++it){
// Skip the regex pass entirely when the caller supplied an empty pattern.
20 if (erase !=
"") temp = std::regex_replace(temp, erase_,
"");
// NOTE(review): tolower receives a plain char here. If char is signed and the
// byte is negative (non-ASCII input), the call may be undefined behaviour;
// consider converting through unsigned char - TODO confirm against callers.
21 if (lower_case)
for (
char& c : temp) c = tolower(c);
// Parameter types of a function declaration whose name and return type are not
// visible in this excerpt. NOTE(review): the types match the four-argument
// tknz_sent helper defined further down (string, vector of strings, regex,
// bool), so this is presumably its forward declaration - TODO confirm.
28 std::string &, std::vector<std::string> &,
const std::regex &,
bool
// tknz_sent (character vector overload): tokenizes every element of input by
// delegating to the four-argument tknz_sent helper, then gathers the pieces
// into a single character vector.
//   input      - character vector; one helper call is made per element.
//   EOS        - regular expression text; presumably compiled into the _EOS
//                regex passed to the helper (the compilation is not visible).
//   keep_first - forwarded unchanged to the helper.
// NOTE(review): the declarations of line, _EOS and tokenized, the body of the
// copy loop and the return statement are not visible in this excerpt.
31 Rcpp::CharacterVector tknz_sent(Rcpp::CharacterVector input,
32 std::string EOS =
"[.?!:;]+",
33 bool keep_first =
false)
37 size_t len = input.size();
// One vector of pieces per input element, filled by the helper below.
38 std::vector<std::vector<std::string> > tmp(len);
43 for (
size_t i = 0; i < len; ++i) {
// Accumulate the count returned by the helper for element i.
45 tokenized += tknz_sent(line, tmp[i], _EOS, keep_first);
// The result is constructed from the accumulated count; presumably this sizes
// it to hold every piece produced above - TODO confirm the type of tokenized.
48 Rcpp::CharacterVector res(tokenized);
// Walk the per-element pieces in input order (loop body not visible here).
50 for (
size_t i = 0; i < len; ++i) {
51 for (
const std::string & sentence : tmp[i]) {
// tknz_sent (helper): splits line at every match of _EOS and appends the
// pieces to line_res.
//   line     - string to split; scanned from begin to end.
//   line_res - output vector; pieces are appended with push_back.
//   _EOS     - compiled regular expression marking the split points.
// Returns line_res.size(), i.e. the total number of elements in line_res after
// the call, including any that were present before it.
// NOTE(review): the remaining parameter(s), the extraction of the match m and
// of end, and the condition selecting between the two expressions below are
// not visible in this excerpt.
60 size_t tknz_sent(std::string & line,
61 std::vector<std::string> & line_res,
62 const std::regex& _EOS,
// Iterator over all matches of _EOS in line; a default-constructed
// sregex_iterator serves as the end-of-sequence marker.
65 auto itstart = std::sregex_iterator(line.begin(), line.end(), _EOS);
66 auto itend = std::sregex_iterator();
// start is the offset where the current piece begins; end is left
// uninitialized here and presumably set from the match inside the loop.
68 size_t start = 0, end;
70 for (std::sregex_iterator it = itstart; it != itend; ++it) {
// Two alternatives of a conditional expression: the text between start and
// end followed by a space and the single character at line[end], or that text
// alone. Presumably chosen by the keep_first flag - TODO confirm.
75 line.substr(start, end - start) +
" " + line[end] :
76 line.substr(start, end - start)
// Advance past the matched text so the next piece starts after it.
78 start = end + m.length();
// NOTE(review): in the visible lines start is only ever 0 or end + m.length(),
// so this guard looks always true and the trailing remainder (possibly an
// empty string) is always appended - TODO confirm this is intended.
81 if (start != std::string::npos)
82 line_res.push_back(line.substr(start));
84 return line_res.size();