kgrams  0.1.0
Utilities.h
1 #ifndef UTILITIES_H
2 #define UTILITIES_H
3 
4 #include <Rcpp.h>
5 #include <vector>
6 #include <string>
7 #include <regex>
8 
9 Rcpp::CharacterVector preprocess(
10  Rcpp::CharacterVector input,
11  std::string erase = "[^.?!:;'[:alnum:][:space:]]",
12  bool lower_case = true
13  )
14 {
15  std::regex erase_(erase);
16  std::string temp;
17  auto itend = input.end();
18  for(auto it = input.begin(); it != itend; ++it){
19  temp = *it;
20  if (erase != "") temp = std::regex_replace(temp, erase_, "");
21  if (lower_case) for (char& c : temp) c = tolower(c);
22  *it = temp;
23  }
24  return input;
25 }
26 
27 size_t tknz_sent(
28  std::string &, std::vector<std::string> &, const std::regex &, bool
29  );
30 
31 Rcpp::CharacterVector tknz_sent(Rcpp::CharacterVector input,
32  std::string EOS = "[.?!:;]+",
33  bool keep_first = false)
34 {
35  if (EOS == "")
36  return input;
37  size_t len = input.size();
38  std::vector<std::vector<std::string> > tmp(len);
39  std::regex _EOS(EOS);
40 
41  size_t tokenized = 0;
42  std::string line;
43  for (size_t i = 0; i < len; ++i) {
44  line = input[i];
45  tokenized += tknz_sent(line, tmp[i], _EOS, keep_first);
46  }
47 
48  Rcpp::CharacterVector res(tokenized);
49  size_t j = 0;
50  for (size_t i = 0; i < len; ++i) {
51  for (const std::string & sentence : tmp[i]) {
52  res[j] = sentence;
53  j++;
54  }
55  }
56 
57  return res;
58 }
59 
/// @brief Splits one line into sentences at every match of `_EOS`, appending
///        the pieces to `line_res`.
///
/// The text between consecutive delimiter matches becomes one sentence. The
/// text after the last match (possibly empty) is always appended as a final
/// sentence, so a line without any delimiter yields exactly one element.
///
/// @param line       Text to split; it is only read.
/// @param line_res   Output vector; sentences are appended, existing elements
///                   are kept.
/// @param _EOS       Compiled regular expression matching sentence delimiters.
/// @param keep_first If true, each delimiter-terminated sentence is followed
///                   by a space and the first character of its delimiter.
/// @return `line_res.size()` after appending, i.e. the number of sentences
///         found when `line_res` was empty on entry.
size_t tknz_sent(std::string & line,
                 std::vector<std::string> & line_res,
                 const std::regex& _EOS,
                 bool keep_first)
{
    const std::sregex_iterator no_more_matches;

    size_t start = 0;  // offset of the first character of the current sentence
    for (std::sregex_iterator it(line.begin(), line.end(), _EOS);
         it != no_more_matches; ++it) {
        // Bind by reference: copying match_results on every iteration is
        // unnecessary.
        const std::smatch & m = *it;
        const size_t end = static_cast<size_t>(m.position());

        line_res.push_back(line.substr(start, end - start));
        if (keep_first) {
            // line[end] is the first character of the matched delimiter.
            line_res.back() += " ";
            line_res.back() += line[end];
        }
        start = end + static_cast<size_t>(m.length());
    }

    // `start` never exceeds line.size(), so the remainder is always valid.
    line_res.push_back(line.substr(start));

    return line_res.size();
}
86 
87 #endif // UTILITIES_H