kgrams  0.1.0
Dictionary.h
Go to the documentation of this file.
1 
5 #ifndef DICTIONARY_H
6 #define DICTIONARY_H
7 
8 #include "special_tokens.h"
9 #include "WordStream.h"
10 #include <string>
11 #include <vector>
12 #include <unordered_map>
13 
14 
22 class Dictionary {
23  //--------Private elements--------//
25  std::unordered_map<std::string, std::string> word_to_ind_;
27  std::unordered_map<std::string, std::string> ind_to_word_;
29  size_t V_;
30 
31  //--------Private elements--------//
32  void insert_special_tokens() {
33  word_to_ind_[BOS_TOK] = BOS_IND;
34  ind_to_word_[BOS_IND] = BOS_TOK;
35  word_to_ind_[EOS_TOK] = EOS_IND;
36  ind_to_word_[EOS_IND] = EOS_TOK;
37  // UNK_TOK is not added as a key in Word-to-Index map, see
38  // contains() method below
39  ind_to_word_[UNK_IND] = UNK_TOK;
40  }
41 
42 public:
43  //--------Constructors--------//
44 
48  Dictionary () : V_(0) { insert_special_tokens(); }
49 
56  Dictionary (const std::vector<std::string> & dict)
57  : Dictionary() { for (std::string word : dict) insert(word); }
58 
63  bool contains (std::string word) const {
64  return word_to_ind_.find(word) != word_to_ind_.end();
65  }
66 
69  void insert (std::string word) {
70  if (contains(word)) return;
71  std::string index = std::to_string(++V_);
72  word_to_ind_[word] = index;
73  ind_to_word_[index] = word;
74  }
75 
79  std::string word (std::string index) const {
80  auto it = ind_to_word_.find(index);
81  if (it != ind_to_word_.end()) return it->second;
82  return UNK_TOK;
83  }
84 
88  std::string index (std::string word) const {
89  auto it = word_to_ind_.find(word);
90  if (it != word_to_ind_.end()) return it->second;
91  return UNK_IND;
92  }
93 
97  size_t length () const { return ind_to_word_.size() - 3; }
98 
102  size_t size () const { return length(); }
103 
112  std::pair<size_t, std::string> kgram_code (std::string kgram) const
113  {
114  std::pair<size_t, std::string> res{0, ""};
115  WordStream stream(kgram);
116  std::string word, ind;
117  for (; ; res.first++) {
118  word = stream.pop_word();
119  if (stream.eos())
120  break;
121  ind = index(word);
122  res.second += ind + " ";
123  }
124  if (res.first > 0)
125  res.second.pop_back();
126  return res;
127  }
128 }; // class Dictionary
129 
130 #endif // DICTIONARY_H
Dictionary::length
size_t length() const
Return size of the dictionary, excluding the special tokens (BOS, EOS, UNK).
Definition: Dictionary.h:97
Dictionary::index
std::string index(std::string word) const
Return the index corresponding to a given word.
Definition: Dictionary.h:88
Dictionary::insert
void insert(std::string word)
Insert a word in the Dictionary.
Definition: Dictionary.h:69
Dictionary::Dictionary
Dictionary(const std::vector< std::string > &dict)
Initialize Dictionary from list of words.
Definition: Dictionary.h:56
Dictionary::kgram_code
std::pair< size_t, std::string > kgram_code(std::string kgram) const
Extract k-gram code from a string.
Definition: Dictionary.h:112
Dictionary::contains
bool contains(std::string word) const
Check if a word is contained in the Dictionary.
Definition: Dictionary.h:63
Dictionary::word
std::string word(std::string index) const
Return the word corresponding to a given word index.
Definition: Dictionary.h:79
Dictionary
Word dictionary for language models.
Definition: Dictionary.h:22
Dictionary::size
size_t size() const
Return size of the dictionary, excluding the special tokens (BOS, EOS, UNK).
Definition: Dictionary.h:102
WordStream
Definition: WordStream.h:6
Dictionary::Dictionary
Dictionary()
Default constructor.
Definition: Dictionary.h:48