mqtt_cpp
utf8encoded_strings.hpp
Go to the documentation of this file.
1 // Copyright Takatoshi Kondo 2015
2 //
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE_1_0.txt or copy at
5 // http://www.boost.org/LICENSE_1_0.txt)
6 
7 #if !defined(MQTT_UTF8ENCODED_STRINGS_HPP)
8 #define MQTT_UTF8ENCODED_STRINGS_HPP
9 
10 #include <mqtt/namespace.hpp>
11 #include <mqtt/string_view.hpp>
12 
13 namespace MQTT_NS {
14 
namespace utf8string {

/**
 * @brief Result of validating the contents of a UTF-8 encoded string.
 */
enum struct validation
{
    /**
     * @brief UTF-8 string is well_formed.
     * See the "UTF-8 encoded strings" section of the MQTT v3.1.1 specification.
     */
    well_formed = 0,

    /**
     * @brief UTF-8 string is ill_formed or contains null character.
     * See the "UTF-8 encoded strings" section of the MQTT v3.1.1 specification.
     */
    ill_formed,

    /**
     * @brief UTF-8 string is well_formed and contains control character and non-character.
     * See the "UTF-8 encoded strings" section of the MQTT v3.1.1 specification.
     */
    well_formed_with_non_charactor,
};

/**
 * @brief Check that a string fits into a 16-bit length prefix.
 * @param str string to check
 * @return true if the size of str in bytes is at most 0xffff, otherwise false
 */
constexpr bool
is_valid_length(string_view str) {
    return str.size() <= 0xffff;
}

/**
 * @brief Validate the bytes of str as a UTF-8 encoded string.
 *
 * The whole string is scanned. Scanning stops at the first ill_formed
 * sequence (truncated sequence, bad continuation byte, overlong encoding,
 * UTF-16 surrogate, code point above U+10FFFF, or a null character).
 * Control characters (U+0001..U+001F, U+007F..U+009F) and the non-characters
 * U+nFFFE / U+nFFFF do not stop the scan; they only downgrade the result to
 * well_formed_with_non_charactor.
 *
 * If MQTT_USE_STR_CHECK is not defined, no check is performed and
 * validation::well_formed is always returned.
 *
 * @param str string to validate
 * @return validation::ill_formed, validation::well_formed_with_non_charactor,
 *         or validation::well_formed
 */
constexpr validation
validate_contents(string_view str) {
    // This code is based on https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
    auto result = validation::well_formed;
#if defined(MQTT_USE_STR_CHECK)
    auto it = str.begin();
    auto end = str.end();

    while (it != end) {
        // Number of bytes left including *it. Comparing this count instead of
        // computing `it + N` avoids forming an iterator beyond end, which is
        // undefined behaviour and not permitted in a constant expression.
        auto const remaining = end - it;
        auto const b0 = static_cast<unsigned char>(*(it + 0));

        if (b0 < 0b1000'0000) {
            // 0xxxxxxx
            if (b0 == 0x00) {
                // null character is never allowed
                result = validation::ill_formed;
                break;
            }
            if ((b0 >= 0x01 && b0 <= 0x1f) || b0 == 0x7f) {
                // U+0001..U+001F or U+007F (control characters)
                result = validation::well_formed_with_non_charactor;
            }
            ++it;
        }
        else if ((b0 & 0b1110'0000) == 0b1100'0000) {
            // 110XXXXx 10xxxxxx
            if (remaining < 2) {
                result = validation::ill_formed;
                break;
            }
            auto const b1 = static_cast<unsigned char>(*(it + 1));
            if ((b1 & 0b1100'0000) != 0b1000'0000 ||
                (b0 & 0b1111'1110) == 0b1100'0000) { // overlong
                result = validation::ill_formed;
                break;
            }
            if (b0 == 0b1100'0010 &&
                b1 >= 0b1000'0000 &&
                b1 <= 0b1001'1111) {
                // U+0080..U+009F (control characters)
                result = validation::well_formed_with_non_charactor;
            }
            it += 2;
        }
        else if ((b0 & 0b1111'0000) == 0b1110'0000) {
            // 1110XXXX 10Xxxxxx 10xxxxxx
            if (remaining < 3) {
                result = validation::ill_formed;
                break;
            }
            auto const b1 = static_cast<unsigned char>(*(it + 1));
            auto const b2 = static_cast<unsigned char>(*(it + 2));
            if ((b1 & 0b1100'0000) != 0b1000'0000 ||
                (b2 & 0b1100'0000) != 0b1000'0000 ||
                (b0 == 0b1110'0000 &&
                 (b1 & 0b1110'0000) == 0b1000'0000) || // overlong?
                (b0 == 0b1110'1101 &&
                 (b1 & 0b1110'0000) == 0b1010'0000)) { // surrogate?
                result = validation::ill_formed;
                break;
            }
            if (b0 == 0b1110'1111 &&
                b1 == 0b1011'1111 &&
                (b2 & 0b1111'1110) == 0b1011'1110) {
                // U+FFFE or U+FFFF?
                result = validation::well_formed_with_non_charactor;
            }
            it += 3;
        }
        else if ((b0 & 0b1111'1000) == 0b1111'0000) {
            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
            if (remaining < 4) {
                result = validation::ill_formed;
                break;
            }
            auto const b1 = static_cast<unsigned char>(*(it + 1));
            auto const b2 = static_cast<unsigned char>(*(it + 2));
            auto const b3 = static_cast<unsigned char>(*(it + 3));
            if ((b1 & 0b1100'0000) != 0b1000'0000 ||
                (b2 & 0b1100'0000) != 0b1000'0000 ||
                (b3 & 0b1100'0000) != 0b1000'0000 ||
                (b0 == 0b1111'0000 &&
                 (b1 & 0b1111'0000) == 0b1000'0000) || // overlong?
                (b0 == 0b1111'0100 &&
                 b1 > 0b1000'1111) ||
                b0 > 0b1111'0100) { // > U+10FFFF?
                result = validation::ill_formed;
                break;
            }
            if ((b1 & 0b1100'1111) == 0b1000'1111 &&
                b2 == 0b1011'1111 &&
                (b3 & 0b1111'1110) == 0b1011'1110) {
                // U+nFFFE or U+nFFFF?
                result = validation::well_formed_with_non_charactor;
            }
            it += 4;
        }
        else {
            // stray continuation byte or invalid lead byte
            result = validation::ill_formed;
            break;
        }
    }
#else // MQTT_USE_STR_CHECK
    static_cast<void>(str);
#endif // MQTT_USE_STR_CHECK
    return result;
}

} // namespace utf8string
146 
147 } // namespace MQTT_NS
148 
149 #endif // MQTT_UTF8ENCODED_STRINGS_HPP
validation
Definition: utf8encoded_strings.hpp:18
@ ill_formed
UTF-8 string is ill_formed or contains null character. See http://docs.oasis-open....
@ well_formed_with_non_charactor
UTF-8 string is well_formed and contains control character and non-character. See http://docs....
@ well_formed
UTF-8 string is well_formed. See http://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3....
constexpr bool is_valid_length(string_view str)
Definition: utf8encoded_strings.hpp:42
constexpr validation validate_contents(string_view str)
Definition: utf8encoded_strings.hpp:47
Definition: any.hpp:27
boost::string_ref string_view
Definition: string_view.hpp:64