I wrote the naive algorithm by following the textbook approach, and it still took me almost an hour. I'm really rusty.
The input data must be sorted by freq in ascending order. The code could be optimized: use a heap as the workset, and in path collection (the sub function) avoid the repeated vector copies by backtracking with pop_back instead; rough sketches of both ideas follow the listing below.
I haven't written C++ in ages and have basically forgotten how.
```cpp
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;

using Symbol = string;

class Node {
public:
    optional<Symbol> symbol{};
    int freq{};
    Node *left{};
    Node *right{};

    Node(int _freq, optional<Symbol> _s = {}) : symbol(_s), freq(_freq) {}

    string format() const {
        string s;
        s += "freq: ";
        s += std::to_string(freq);
        s += " symbol: ";
        if (symbol.has_value()) {
            s += symbol.value();
        } else {
            s += "None";
        }
        s += " ";
        if (left != nullptr) {
            s += "left: (";
            s += left->format();
            s += ") right: (";
            s += right->format();
            s += ")";
        }
        return s;
    }
};

enum Branch {
    Left, Right
};

using EncodingType = vector<pair<Symbol, vector<Branch>>>;

string build_branches_code(const vector<Branch> &b) {
    string s{};
    for (const auto &item: b) {
        if (item == Left) {
            s += "0";
        } else {
            s += "1";
        }
    }
    return s;
}

void sub(Node *root, vector<Branch> prefix, EncodingType &result_vec) {
    if (root == nullptr) return;
    if (root->symbol.has_value()) {
        result_vec.push_back(pair(root->symbol.value(), prefix));
    }
    auto p1 = prefix; p1.push_back(Left);
    auto p2 = prefix; p2.push_back(Right);
    sub(root->left, p1, result_vec);
    sub(root->right, p2, result_vec);
}

EncodingType collect_encodings(Node *root) {
    EncodingType result;
    sub(root, {}, result);
    return result;
}

Node *build_huffman(Node **data, size_t size) {
    vector<Node *> workset;
    for (size_t i = 0; i < size; ++i) workset.push_back(data[i]);
    while (workset.size() > 1) {
        // the workset is kept sorted by freq, so the two smallest nodes are at the front
        auto n1 = workset[0];
        auto n2 = workset[1];
        auto new_node = new Node(n1->freq + n2->freq, {});
        if (n1->freq <= n2->freq) {
            new_node->left = n1;
            new_node->right = n2;
        } else {
            new_node->left = n2;
            new_node->right = n1;
        }
        // delete the first two nodes
        workset.erase(workset.begin());
        workset.erase(workset.begin());
        // insert the merged node back, keeping the workset sorted;
        // if no insertion position is found, new_node is greater than all nodes: append it at the end
        auto insert_pos = workset.size();
        for (size_t i = 0; i < workset.size(); ++i) {
            if (new_node->freq < workset[i]->freq) {
                insert_pos = i;
                break;
            }
        }
        workset.insert(workset.begin() + (int) insert_pos, new_node);
    }
    return workset.empty() ? nullptr : workset[0];
}

// compute the weighted path length (WPL)
size_t compute_wpl(const vector<Node *> &data, const EncodingType &encoding) {
    size_t sum = 0;
    for (const auto &item: encoding) {
        // find the original node carrying this symbol
        optional<size_t> pos{};
        for (size_t i = 0; i < data.size(); ++i) {
            if (data[i]->symbol == optional(item.first)) {
                pos = i;
                break;
            }
        }
        if (!pos.has_value()) {
            throw logic_error("unreachable");
        }
        auto freq = data[pos.value()]->freq;
        sum += freq * item.second.size();
    }
    return sum;
}

int main() {
    vector<Node *> data = {
        new Node(1, "a"),
        new Node(3, "b"),
        new Node(12, "c"),
        new Node(13, "d"),
        new Node(16, "e"),
        new Node(45, "f"),
    };
    auto tree = build_huffman(data.data(), data.size());
    auto encoding = collect_encodings(tree);
    for (const auto &item: encoding) {
        auto symbol = item.first;
        auto binary_string = build_branches_code(item.second);
        cout << "Symbol: " << symbol << "\tBinary: " << binary_string << endl;
    }
    size_t wpl = compute_wpl(data, encoding);
    cout << "WPL: " << wpl << endl;
    return 0;
}
```
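
For arbitrary input, one way to satisfy the ascending-freq requirement is to sort the pointer vector first. A minimal sketch, assuming the Node class from the listing above; sort_by_freq is just an illustrative name:

```cpp
#include <algorithm>

// Pre-sort arbitrary input by ascending freq so the naive build_huffman above
// can rely on its "two smallest nodes at the front" invariant.
void sort_by_freq(vector<Node *> &data) {
    sort(data.begin(), data.end(),
         [](const Node *a, const Node *b) { return a->freq < b->freq; });
}
```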
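
The heap idea: keep the workset in a std::priority_queue so the two lowest-frequency nodes can be popped in O(log n), instead of erasing from and re-inserting into a sorted vector. A rough sketch assuming the same Node class as above; build_huffman_heap and NodeCmp are names I made up for illustration:

```cpp
#include <queue>

// Comparator for a min-heap on freq: the queue then pops the smallest freq first.
struct NodeCmp {
    bool operator()(const Node *a, const Node *b) const {
        return a->freq > b->freq;
    }
};

Node *build_huffman_heap(Node **data, size_t size) {
    priority_queue<Node *, vector<Node *>, NodeCmp> workset;
    for (size_t i = 0; i < size; ++i) workset.push(data[i]);
    if (workset.empty()) return nullptr;
    while (workset.size() > 1) {
        auto n1 = workset.top(); workset.pop(); // smallest freq
        auto n2 = workset.top(); workset.pop(); // second smallest freq
        auto parent = new Node(n1->freq + n2->freq);
        parent->left = n1;  // n1->freq <= n2->freq, matching the ordering in the naive version
        parent->right = n2;
        workset.push(parent);
    }
    return workset.top();
}
```

A nice side effect is that the input no longer needs to be pre-sorted: the heap finds the two minimums itself.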
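
The pop_back idea: share a single prefix vector across the whole recursion and undo each push on the way back, so a copy is made only when a leaf's code is recorded. A sketch assuming Node, Branch, and EncodingType from the listing; sub_backtrack and collect_encodings_backtrack are illustrative names:

```cpp
void sub_backtrack(Node *root, vector<Branch> &prefix, EncodingType &result_vec) {
    if (root == nullptr) return;
    if (root->symbol.has_value()) {
        // only leaves carry a symbol, so this is the only place prefix gets copied
        result_vec.push_back(pair(root->symbol.value(), prefix));
    }
    prefix.push_back(Left);
    sub_backtrack(root->left, prefix, result_vec);
    prefix.back() = Right;   // reuse the same slot for the right branch
    sub_backtrack(root->right, prefix, result_vec);
    prefix.pop_back();       // backtrack before returning to the caller
}

EncodingType collect_encodings_backtrack(Node *root) {
    EncodingType result;
    vector<Branch> prefix;
    sub_backtrack(root, prefix, result);
    return result;
}
```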