openMSX
utf8_core.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 
31 #ifndef UTF8_CORE_HH
32 #define UTF8_CORE_HH
33 
34 #include "one_of.hh"
35 #include <iterator>
36 #include <cstdint>
37 
38 namespace utf8 {
39 
40 // Helper code - not intended to be directly called by the library users.
41 // May be changed at any time
42 namespace internal {
43 
// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
constexpr uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
constexpr uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
constexpr uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
constexpr uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// LEAD_SURROGATE_MIN minus (0x10000 >> 10), i.e. 0xd7c0.
constexpr uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);

// 0x10000 - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN, evaluated in
// unsigned 32-bit arithmetic (the result wraps around to 0xfca02400).
constexpr uint32_t SURROGATE_OFFSET =
	0x10000u - (static_cast<uint32_t>(LEAD_SURROGATE_MIN) << 10)
	         - TRAIL_SURROGATE_MIN;

// Maximum valid value for a Unicode code point
constexpr uint32_t CODE_POINT_MAX = 0x0010ffffu;
56 
// Is 'oc' a UTF-8 continuation (trail) octet, i.e. of the form 10xxxxxx?
[[nodiscard]] inline bool is_trail(uint8_t oc)
{
	// Keep only the two marker bits and compare them against '10'.
	return (oc & 0xc0) == 0x80;
}
61 
62 [[nodiscard]] inline bool is_surrogate(uint16_t cp)
63 {
64  return (cp >= LEAD_SURROGATE_MIN) && (cp <= TRAIL_SURROGATE_MAX);
65 }
66 
67 [[nodiscard]] inline bool is_code_point_valid(uint32_t cp)
68 {
69  return (cp <= CODE_POINT_MAX) && !is_surrogate(cp) &&
70  (cp != one_of(0xfffeu, 0xffffu));
71 }
72 
// Number of octets in the UTF-8 sequence introduced by the lead octet
// 'lead', derived from its high-order bit pattern. Returns 0 when 'lead'
// matches none of the four lead patterns (e.g. it is a trail octet).
[[nodiscard]] inline unsigned sequence_length(uint8_t lead)
{
	if ((lead & 0x80) == 0x00) return 1; // 0xxxxxxx
	if ((lead & 0xe0) == 0xc0) return 2; // 110xxxxx
	if ((lead & 0xf0) == 0xe0) return 3; // 1110xxxx
	if ((lead & 0xf8) == 0xf0) return 4; // 11110xxx
	return 0;
}
87 
// Result codes returned by validate_next().
enum utf_error {
	OK,                  // sequence is well-formed
	NOT_ENOUGH_ROOM,     // fewer octets available than the lead octet announces
	INVALID_LEAD,        // first octet is not a valid lead octet
	INCOMPLETE_SEQUENCE, // an expected trail octet is not of the form 10xxxxxx
	OVERLONG_SEQUENCE,   // code point was encoded with more octets than needed
	INVALID_CODE_POINT   // decoded value fails is_code_point_valid()
};
96 
97 template <typename octet_iterator>
98 [[nodiscard]] utf_error validate_next(octet_iterator& it, octet_iterator end,
99  uint32_t* code_point)
100 {
101  uint32_t cp = *it;
102  // Check the lead octet
103  int length = sequence_length(*it);
104 
105  // "Shortcut" for ASCII characters
106  if (length == 1) {
107  if (end - it <= 0) {
108  return NOT_ENOUGH_ROOM;
109  }
110  if (code_point) {
111  *code_point = cp;
112  }
113  ++it;
114  return OK;
115  }
116 
117  // Do we have enough memory?
118  if (std::distance(it, end) < length) {
119  return NOT_ENOUGH_ROOM;
120  }
121 
122  // Check trail octets and calculate the code point
123  switch (length) {
124  case 0:
125  return INVALID_LEAD;
126  case 2:
127  if (is_trail(*(++it))) {
128  cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
129  } else {
130  --it;
131  return INCOMPLETE_SEQUENCE;
132  }
133  break;
134  case 3:
135  if (is_trail(*(++it))) {
136  cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
137  if (is_trail(*(++it))) {
138  cp += (*it) & 0x3f;
139  } else {
140  std::advance(it, -2);
141  return INCOMPLETE_SEQUENCE;
142  }
143  } else {
144  --it;
145  return INCOMPLETE_SEQUENCE;
146  }
147  break;
148  case 4:
149  if (is_trail(*(++it))) {
150  cp = ((cp << 18) & 0x1fffff) + ((*it << 12) & 0x3ffff);
151  if (is_trail(*(++it))) {
152  cp += (*it << 6) & 0xfff;
153  if (is_trail(*(++it))) {
154  cp += (*it) & 0x3f;
155  } else {
156  std::advance(it, -3);
157  return INCOMPLETE_SEQUENCE;
158  }
159  } else {
160  std::advance(it, -2);
161  return INCOMPLETE_SEQUENCE;
162  }
163  } else {
164  --it;
165  return INCOMPLETE_SEQUENCE;
166  }
167  break;
168  }
169  // Is the code point valid?
170  if (!is_code_point_valid(cp)) {
171  for (int i = 0; i < length - 1; ++i) {
172  --it;
173  }
174  return INVALID_CODE_POINT;
175  }
176 
177  if (code_point) {
178  *code_point = cp;
179  }
180  if (cp < 0x80) {
181  if (length != 1) {
182  std::advance(it, -(length-1));
183  return OVERLONG_SEQUENCE;
184  }
185  } else if (cp < 0x800) {
186  if (length != 2) {
187  std::advance(it, -(length-1));
188  return OVERLONG_SEQUENCE;
189  }
190  } else if (cp < 0x10000) {
191  if (length != 3) {
192  std::advance(it, -(length-1));
193  return OVERLONG_SEQUENCE;
194  }
195  }
196 
197  ++it;
198  return OK;
199 }
200 
201 template <typename octet_iterator>
202 [[nodiscard]] inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
203  return validate_next(it, end, nullptr);
204 }
205 
206 } // namespace internal
207 
209 
// The library API - functions intended to be called by the users.

// Byte order mark: the three octets compared against by is_bom().
constexpr uint8_t bom[3] = { 0xef, 0xbb, 0xbf };
212 
213 template <typename octet_iterator>
214 [[nodiscard]] octet_iterator find_invalid(octet_iterator start, octet_iterator end)
215 {
216  auto result = start;
217  while (result != end) {
218  internal::utf_error err_code = internal::validate_next(result, end);
219  if (err_code != internal::OK) {
220  return result;
221  }
222  }
223  return result;
224 }
225 
// True when find_invalid() finds nothing to complain about in
// [start, end), i.e. it runs all the way to 'end'.
template <typename octet_iterator>
[[nodiscard]] inline bool is_valid(octet_iterator start, octet_iterator end)
{
	const auto first_bad = find_invalid(start, end);
	return first_bad == end;
}
231 
232 template <typename octet_iterator>
233 [[nodiscard]] inline bool is_bom(octet_iterator it)
234 {
235  return ((*it++ == bom[0]) &&
236  (*it++ == bom[1]) &&
237  (*it == bom[2]));
238 }
239 
240 template<typename octet_iterator>
241 [[nodiscard]] inline octet_iterator sync_forward(octet_iterator it)
242 {
243  while (internal::is_trail(*it)) ++it;
244  return it;
245 }
246 
247 template<typename octet_iterator>
248 [[nodiscard]] inline octet_iterator sync_backward(octet_iterator it)
249 {
250  while (internal::is_trail(*it)) --it;
251  return it;
252 }
253 
// Is this a code point in the 'Private Use Area' (PUA).
// https://en.wikipedia.org/wiki/Private_Use_Areas
// Three inclusive ranges are tested; see the bounds below.
[[nodiscard]] inline bool is_pua(uint32_t cp)
{
	auto within = [cp](uint32_t lo, uint32_t hi) {
		return (lo <= cp) && (cp <= hi);
	};
	return within(0x00E000, 0x00F8FF)
	    || within(0x0F0000, 0x0FFFFD)
	    || within(0x100000, 0x10FFFD);
}
262 
263 } // namespace utf8
264 
265 #endif
one_of.hh
utf8::is_pua
bool is_pua(uint32_t cp)
Definition: utf8_core.hh:256
utf8
Definition: utf8_checked.hh:36
utf8::internal::LEAD_OFFSET
const uint16_t LEAD_OFFSET
Definition: utf8_core.hh:51
utf8::internal::CODE_POINT_MAX
const uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:55
utf8::is_valid
bool is_valid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:227
utf8::internal::sequence_length
unsigned sequence_length(uint8_t lead)
Definition: utf8_core.hh:73
utf8::internal::OVERLONG_SEQUENCE
@ OVERLONG_SEQUENCE
Definition: utf8_core.hh:93
utf8::internal::INCOMPLETE_SEQUENCE
@ INCOMPLETE_SEQUENCE
Definition: utf8_core.hh:92
gl::length
T length(const vecN< N, T > &x)
Definition: gl_vec.hh:348
utf8::internal::is_trail
bool is_trail(uint8_t oc)
Definition: utf8_core.hh:57
utf8::internal::TRAIL_SURROGATE_MAX
const uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:50
utf8::internal::utf_error
utf_error
Definition: utf8_core.hh:88
utf8::sync_forward
octet_iterator sync_forward(octet_iterator it)
Definition: utf8_core.hh:241
utf8::internal::INVALID_CODE_POINT
@ INVALID_CODE_POINT
Definition: utf8_core.hh:94
utf8::internal::SURROGATE_OFFSET
const uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:52
utf8::is_bom
bool is_bom(octet_iterator it)
Definition: utf8_core.hh:233
utf8::bom
const uint8_t bom[]
The library API - functions intended to be called by the users.
Definition: utf8_core.hh:211
one_of
Definition: one_of.hh:7
utf8::internal::validate_next
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:98
utf8::internal::TRAIL_SURROGATE_MIN
const uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:49
utf8::internal::is_surrogate
bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:62
utf8::distance
auto distance(octet_iterator first, octet_iterator last)
Definition: utf8_checked.hh:194
utf8::internal::LEAD_SURROGATE_MIN
const uint16_t LEAD_SURROGATE_MIN
Definition: utf8_core.hh:47
utf8::internal::OK
@ OK
Definition: utf8_core.hh:89
utf8::advance
void advance(octet_iterator &it, distance_type n, octet_iterator end)
Definition: utf8_checked.hh:186
utf8::internal::is_code_point_valid
bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:67
utf8::sync_backward
octet_iterator sync_backward(octet_iterator it)
Definition: utf8_core.hh:248
utf8::find_invalid
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:214
utf8::internal::LEAD_SURROGATE_MAX
const uint16_t LEAD_SURROGATE_MAX
Definition: utf8_core.hh:48
utf8::internal::INVALID_LEAD
@ INVALID_LEAD
Definition: utf8_core.hh:91
utf8::internal::NOT_ENOUGH_ROOM
@ NOT_ENOUGH_ROOM
Definition: utf8_core.hh:90