openMSX
utf8_checked.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 #ifndef UTF8_CHECKED_HH
31 #define UTF8_CHECKED_HH
32 
#include "utf8_core.hh"
#include <cstddef>
#include <iterator>
#include <stdexcept>
35 
36 namespace utf8 {
37 
38 // Exceptions that may be thrown from the library functions.
// Exception carrying a code point value that the library refused to
// process. The offending value is available through code_point().
class invalid_code_point : public std::exception
{
public:
	explicit invalid_code_point(uint32_t cp_) : rejected(cp_) {}

	// Fixed, human readable description of the error category.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid code point";
	}

	// The code point value passed to the constructor.
	[[nodiscard]] uint32_t code_point() const
	{
		return rejected;
	}

private:
	uint32_t rejected;
};
47 
// Exception carrying a single octet of an UTF-8 stream that was found
// to be invalid. The octet is available through utf8_octet().
class invalid_utf8 : public std::exception
{
public:
	explicit invalid_utf8(uint8_t u) : octet(u) {}

	// Fixed, human readable description of the error category.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-8";
	}

	// The octet value passed to the constructor.
	[[nodiscard]] uint8_t utf8_octet() const
	{
		return octet;
	}

private:
	uint8_t octet;
};
56 
// Exception carrying a single 16-bit unit of an UTF-16 stream that was
// found to be invalid. The unit is available through utf16_word().
class invalid_utf16 : public std::exception
{
public:
	explicit invalid_utf16(uint16_t u) : word(u) {}

	// Fixed, human readable description of the error category.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-16";
	}

	// The 16-bit value passed to the constructor.
	[[nodiscard]] uint16_t utf16_word() const
	{
		return word;
	}

private:
	uint16_t word;
};
65 
// Exception without payload; its message is "Not enough space".
class not_enough_room : public std::exception
{
public:
	// Fixed, human readable description of the error category.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Not enough space";
	}
};
71 
72 // The library API - functions intended to be called by the users
73 
74 template <typename octet_iterator, typename output_iterator>
75 output_iterator replace_invalid(octet_iterator start, octet_iterator end,
76  output_iterator out, uint32_t replacement)
77 {
78  while (start != end) {
79  auto sequence_start = start;
80  internal::utf_error err_code = internal::validate_next(start, end);
81  switch (err_code) {
82  case internal::OK:
83  for (auto it = sequence_start; it != start; ++it) {
84  *out++ = *it;
85  }
86  break;
88  throw not_enough_room();
90  append(replacement, out);
91  ++start;
92  break;
96  append(replacement, out);
97  ++start;
98  // just one replacement mark for the sequence
99  while (internal::is_trail(*start) && start != end) {
100  ++start;
101  }
102  break;
103  }
104  }
105  return out;
106 }
107 
// Convenience overload of replace_invalid() that substitutes the code
// point 0xfffd for every invalid sequence.
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
                                       output_iterator out)
{
	const uint32_t default_replacement = 0xfffd;
	return replace_invalid(start, end, out, default_replacement);
}
114 
115 template <typename octet_iterator>
116 octet_iterator append(uint32_t cp, octet_iterator result)
117 {
119  throw invalid_code_point(cp);
120  }
121  if (cp < 0x80) {
122  // one octet
123  *result++ = cp;
124  } else if (cp < 0x800) {
125  // two octets
126  *result++ = ((cp >> 6) ) | 0xc0;
127  *result++ = ((cp >> 0) & 0x3f) | 0x80;
128  } else if (cp < 0x10000) {
129  // three octets
130  *result++ = ((cp >> 12) ) | 0xe0;
131  *result++ = ((cp >> 6) & 0x3f) | 0x80;
132  *result++ = ((cp >> 0) & 0x3f) | 0x80;
133  } else if (cp <= internal::CODE_POINT_MAX) {
134  // four octets
135  *result++ = ((cp >> 18) ) | 0xf0;
136  *result++ = ((cp >> 12) & 0x3f) | 0x80;
137  *result++ = ((cp >> 6) & 0x3f) | 0x80;
138  *result++ = ((cp >> 0) & 0x3f) | 0x80;
139  } else {
140  throw invalid_code_point(cp);
141  }
142  return result;
143 }
144 
145 template <typename octet_iterator>
146 uint32_t next(octet_iterator& it, octet_iterator end)
147 {
148  uint32_t cp = 0;
149  internal::utf_error err_code = internal::validate_next(it, end, &cp);
150  switch (err_code) {
151  case internal::OK :
152  break;
154  throw not_enough_room();
158  throw invalid_utf8(*it);
160  throw invalid_code_point(cp);
161  }
162  return cp;
163 }
164 
// Like next(), but works on a private copy of 'it' so the caller's
// iterator is left untouched. Throws whatever next() throws.
template <typename octet_iterator>
[[nodiscard]] uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	const uint32_t code_point = next(it, end);
	return code_point;
}
170 
171 template <typename octet_iterator>
172 uint32_t prior(octet_iterator& it, octet_iterator start)
173 {
174  auto end = it;
175  while (internal::is_trail(*(--it))) {
176  if (it < start) {
177  // error - no lead byte in the sequence
178  throw invalid_utf8(*it);
179  }
180  }
181  auto temp = it;
182  return next(temp, end);
183 }
184 
// Move 'it' forward by 'n' code points by calling next() 'n' times.
// Does nothing when 'n' is not positive. Throws whatever next() throws.
template <typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	distance_type consumed = 0;
	while (consumed < n) {
		next(it, end);
		++consumed;
	}
}
192 
// Count how many times next() must be applied to 'first' until it is no
// longer less than 'last', i.e. the number of code points in the range.
// The result has the iterator's difference_type. Throws whatever next()
// throws.
template <typename octet_iterator>
[[nodiscard]] auto distance(octet_iterator first, octet_iterator last)
{
	using diff_t = typename std::iterator_traits<octet_iterator>::difference_type;
	diff_t count = 0;
	while (first < last) {
		next(first, last);
		++count;
	}
	return count;
}
202 
203 template <typename u16bit_iterator, typename octet_iterator>
204 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
205  octet_iterator result)
206 {
207  while (start != end) {
208  uint32_t cp = *start++;
209  // Take care of surrogate pairs first
210  if (internal::is_surrogate(cp)) {
211  if (start == end) {
212  throw invalid_utf16(*start);
213  }
214  uint32_t trail_surrogate = *start++;
215  if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
216  trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
217  throw invalid_utf16(trail_surrogate);
218  }
219  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
220  }
221  result = append(cp, result);
222  }
223  return result;
224 }
225 
226 template <typename u16bit_iterator, typename octet_iterator>
227 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
228  u16bit_iterator result)
229 {
230  while (start != end) {
231  uint32_t cp = next(start, end);
232  if (cp > 0xffff) { // make a surrogate pair
233  *result++ = (cp >> 10) + internal::LEAD_OFFSET;
234  *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
235  } else {
236  *result++ = cp;
237  }
238  }
239  return result;
240 }
241 
// Convert the code points in [start, end) to UTF-8 by passing each one
// to append(), writing the octets through 'result'.
//
// Returns the output iterator positioned after the last octet written.
// Throws whatever append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
                        octet_iterator result)
{
	for (; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
251 
// Decode the UTF-8 octets in [start, end) with next() and write one code
// point per decoded sequence through 'result'.
//
// Returns the output iterator positioned after the last code point
// written. Throws whatever next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
                         u32bit_iterator result)
{
	for (; start < end; ++result) {
		*result = next(start, end);
	}
	return result;
}
261 
262 // The iterator class
// Bidirectional iterator adaptor that walks an octet range one code point
// at a time. It stores the current position plus the bounds of the range
// and relies on next() / prior() for decoding, so any exception thrown by
// those functions propagates out of operator*, operator++ and operator--.
template <typename octet_iterator>
class iterator
{
	// Value-initialized so that default constructed iterators have a
	// well defined state (e.g. null pointers for pointer types).
	octet_iterator it{};
	octet_iterator range_start{};
	octet_iterator range_end{};
public:
	// Member types formerly supplied by the std::iterator base class,
	// which is deprecated since C++17. The types themselves are unchanged.
	using iterator_category = std::bidirectional_iterator_tag;
	using value_type        = uint32_t;
	using difference_type   = std::ptrdiff_t;
	using pointer           = uint32_t*;
	using reference         = uint32_t&;

	iterator() = default;

	// Create an iterator at 'octet_it' inside [range_start_, range_end_].
	// Throws std::out_of_range when 'octet_it' lies outside that range.
	iterator(const octet_iterator& octet_it,
	         const octet_iterator& range_start_,
	         const octet_iterator& range_end_)
		: it(octet_it)
		, range_start(range_start_)
		, range_end(range_end_)
	{
		if (it < range_start || it > range_end) {
			throw std::out_of_range("Invalid utf-8 iterator position");
		}
	}
	// the default "big three" are OK

	// The underlying octet iterator at the current position.
	[[nodiscard]] octet_iterator base() const { return it; }

	// Decode the code point at the current position without moving.
	[[nodiscard]] uint32_t operator*() const
	{
		auto temp = it;
		return next(temp, range_end);
	}

	// Iterators are only comparable when they were created over the same
	// range; otherwise std::logic_error is thrown.
	[[nodiscard]] bool operator==(const iterator& rhs) const
	{
		if ((range_start != rhs.range_start) ||
		    (range_end != rhs.range_end)) {
			throw std::logic_error(
				"Comparing utf-8 iterators defined with different ranges");
		}
		return it == rhs.it;
	}
	[[nodiscard]] bool operator!=(const iterator& rhs) const
	{
		return !(operator==(rhs));
	}

	// Step forward over one code point.
	iterator& operator++()
	{
		next(it, range_end);
		return *this;
	}
	iterator operator++(int)
	{
		auto temp = *this;
		next(it, range_end);
		return temp;
	}

	// Step backward over one code point.
	iterator& operator--()
	{
		prior(it, range_start);
		return *this;
	}
	iterator operator--(int)
	{
		auto temp = *this;
		prior(it, range_start);
		return temp;
	}
};
325 
326 #ifdef _WIN32
327 [[nodiscard]] std::string unknowntoutf8(const std::string& unknown);
328 [[nodiscard]] std::string utf8toansi(const std::string& utf8);
329 [[nodiscard]] std::wstring utf8to16(const std::string& utf8);
330 [[nodiscard]] std::string utf16to8(const std::wstring& utf16);
331 #endif
332 
333 } // namespace utf8
334 
335 #endif
utf8::utf32to8
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
Definition: utf8_checked.hh:243
utf8
Definition: utf8_checked.hh:36
utf8::invalid_utf8
Definition: utf8_checked.hh:49
utf8::internal::LEAD_OFFSET
const uint16_t LEAD_OFFSET
Definition: utf8_core.hh:51
utf8::invalid_utf16::utf16_word
uint16_t utf16_word() const
Definition: utf8_checked.hh:63
utf8::internal::CODE_POINT_MAX
const uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:55
utf8::replace_invalid
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Definition: utf8_checked.hh:75
utf8::utf8to16
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
Definition: utf8_checked.hh:227
utf8::internal::OVERLONG_SEQUENCE
@ OVERLONG_SEQUENCE
Definition: utf8_core.hh:93
utf8::internal::INCOMPLETE_SEQUENCE
@ INCOMPLETE_SEQUENCE
Definition: utf8_core.hh:92
utf8::internal::is_trail
bool is_trail(uint8_t oc)
Definition: utf8_core.hh:57
utf8::internal::TRAIL_SURROGATE_MAX
const uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:50
utf8::iterator::base
octet_iterator base() const
Definition: utf8_checked.hh:283
utf8::iterator
Definition: utf8_checked.hh:265
utf8::next
uint32_t next(octet_iterator &it, octet_iterator end)
Definition: utf8_checked.hh:146
utf8::internal::utf_error
utf_error
Definition: utf8_core.hh:88
utf8::invalid_utf16::invalid_utf16
invalid_utf16(uint16_t u)
Definition: utf8_checked.hh:61
utf8::iterator::operator--
iterator & operator--()
Definition: utf8_checked.hh:313
utf8::invalid_utf8::utf8_octet
uint8_t utf8_octet() const
Definition: utf8_checked.hh:54
utf8::iterator::operator!=
bool operator!=(const iterator &rhs) const
Definition: utf8_checked.hh:298
utf8::internal::INVALID_CODE_POINT
@ INVALID_CODE_POINT
Definition: utf8_core.hh:94
utf8::internal::SURROGATE_OFFSET
const uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:52
utf8::invalid_code_point::what
const char * what() const noexcept override
Definition: utf8_checked.hh:44
utf8::internal::validate_next
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:98
utf8::internal::TRAIL_SURROGATE_MIN
const uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:49
utf8_core.hh
utf8::internal::is_surrogate
bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:62
utf8::distance
auto distance(octet_iterator first, octet_iterator last)
Definition: utf8_checked.hh:194
utf8::prior
uint32_t prior(octet_iterator &it, octet_iterator start)
Definition: utf8_checked.hh:172
utf8::iterator::operator++
iterator operator++(int)
Definition: utf8_checked.hh:307
utf8::iterator::operator*
uint32_t operator*() const
Definition: utf8_checked.hh:284
utf8::utf8to32
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
Definition: utf8_checked.hh:253
utf8::iterator::operator--
iterator operator--(int)
Definition: utf8_checked.hh:318
utf8::invalid_code_point
Definition: utf8_checked.hh:40
utf8::internal::OK
@ OK
Definition: utf8_core.hh:89
utf8::iterator::operator==
bool operator==(const iterator &rhs) const
Definition: utf8_checked.hh:289
utf8::invalid_code_point::invalid_code_point
invalid_code_point(uint32_t cp_)
Definition: utf8_checked.hh:43
utf8::append
octet_iterator append(uint32_t cp, octet_iterator result)
Definition: utf8_checked.hh:116
utf8::iterator::operator++
iterator & operator++()
Definition: utf8_checked.hh:302
utf8::advance
void advance(octet_iterator &it, distance_type n, octet_iterator end)
Definition: utf8_checked.hh:186
utf8::internal::is_code_point_valid
bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:67
utf8::iterator::iterator
iterator(const octet_iterator &octet_it, const octet_iterator &range_start_, const octet_iterator &range_end_)
Definition: utf8_checked.hh:271
utf8::utf16to8
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
Definition: utf8_checked.hh:204
utf8::invalid_utf8::invalid_utf8
invalid_utf8(uint8_t u)
Definition: utf8_checked.hh:52
utf8::iterator::iterator
iterator()=default
utf8::invalid_utf16::what
const char * what() const noexcept override
Definition: utf8_checked.hh:62
utf8::not_enough_room
Definition: utf8_checked.hh:67
utf8::invalid_code_point::code_point
uint32_t code_point() const
Definition: utf8_checked.hh:45
utf8::invalid_utf16
Definition: utf8_checked.hh:58
utf8::invalid_utf8::what
const char * what() const noexcept override
Definition: utf8_checked.hh:53
utf8::peek_next
uint32_t peek_next(octet_iterator it, octet_iterator end)
Definition: utf8_checked.hh:166
utf8::internal::INVALID_LEAD
@ INVALID_LEAD
Definition: utf8_core.hh:91
utf8::internal::NOT_ENOUGH_ROOM
@ NOT_ENOUGH_ROOM
Definition: utf8_core.hh:90
utf8::not_enough_room::what
const char * what() const noexcept override
Definition: utf8_checked.hh:69