openMSX
utf8_checked.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 #ifndef UTF8_CHECKED_HH
31 #define UTF8_CHECKED_HH
32 
33 #include "utf8_core.hh"
34 #include <stdexcept>
35 
36 namespace utf8 {
37 
38 // Exceptions that may be thrown from the library functions.
// Exception carrying a 32-bit code point value that was rejected.
class invalid_code_point : public std::exception
{
public:
	// Remembers the rejected value so a handler can inspect it later.
	explicit invalid_code_point(uint32_t cp_)
		: rejected(cp_)
	{
	}

	// Fixed, human-readable description of the error.
	const char* what() const noexcept override
	{
		return "Invalid code point";
	}

	// The value that was passed to the constructor.
	uint32_t code_point() const
	{
		return rejected;
	}

private:
	uint32_t rejected;
};
47 
// Exception carrying a single octet taken from a rejected UTF-8 sequence.
class invalid_utf8 : public std::exception
{
public:
	// Remembers the octet so a handler can inspect it later.
	explicit invalid_utf8(uint8_t u)
		: octet(u)
	{
	}

	// Fixed, human-readable description of the error.
	const char* what() const noexcept override
	{
		return "Invalid UTF-8";
	}

	// The octet that was passed to the constructor.
	uint8_t utf8_octet() const
	{
		return octet;
	}

private:
	uint8_t octet;
};
56 
// Exception carrying a single 16-bit unit taken from rejected UTF-16 input.
class invalid_utf16 : public std::exception
{
public:
	// Remembers the 16-bit unit so a handler can inspect it later.
	explicit invalid_utf16(uint16_t u)
		: word(u)
	{
	}

	// Fixed, human-readable description of the error.
	const char* what() const noexcept override
	{
		return "Invalid UTF-16";
	}

	// The 16-bit unit that was passed to the constructor.
	uint16_t utf16_word() const
	{
		return word;
	}

private:
	uint16_t word;
};
65 
// Exception without payload; it only provides a fixed message.
class not_enough_room : public std::exception
{
public:
	// Fixed, human-readable description of the error.
	const char* what() const noexcept override
	{
		static const char* const message = "Not enough space";
		return message;
	}
};
71 
72 // The library API - functions intended to be called by the users
73 
74 template <typename octet_iterator, typename output_iterator>
75 output_iterator replace_invalid(octet_iterator start, octet_iterator end,
76  output_iterator out, uint32_t replacement)
77 {
78  while (start != end) {
79  auto sequence_start = start;
80  internal::utf_error err_code = internal::validate_next(start, end);
81  switch (err_code) {
82  case internal::OK:
83  for (auto it = sequence_start; it != start; ++it) {
84  *out++ = *it;
85  }
86  break;
88  throw not_enough_room();
90  append(replacement, out);
91  ++start;
92  break;
96  append(replacement, out);
97  ++start;
98  // just one replacement mark for the sequence
99  while (internal::is_trail(*start) && start != end) {
100  ++start;
101  }
102  break;
103  }
104  }
105  return out;
106 }
107 
// Convenience overload of replace_invalid() that substitutes the code point
// 0xfffd for every rejected sequence.
// Returns whatever the four-argument overload returns.
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
		output_iterator out)
{
	const uint32_t default_replacement = 0xfffd;
	return replace_invalid(start, end, out, default_replacement);
}
114 
115 template <typename octet_iterator>
116 octet_iterator append(uint32_t cp, octet_iterator result)
117 {
119  throw invalid_code_point(cp);
120  }
121  if (cp < 0x80) {
122  // one octet
123  *result++ = cp;
124  } else if (cp < 0x800) {
125  // two octets
126  *result++ = ((cp >> 6) ) | 0xc0;
127  *result++ = ((cp >> 0) & 0x3f) | 0x80;
128  } else if (cp < 0x10000) {
129  // three octets
130  *result++ = ((cp >> 12) ) | 0xe0;
131  *result++ = ((cp >> 6) & 0x3f) | 0x80;
132  *result++ = ((cp >> 0) & 0x3f) | 0x80;
133  } else if (cp <= internal::CODE_POINT_MAX) {
134  // four octets
135  *result++ = ((cp >> 18) ) | 0xf0;
136  *result++ = ((cp >> 12) & 0x3f) | 0x80;
137  *result++ = ((cp >> 6) & 0x3f) | 0x80;
138  *result++ = ((cp >> 0) & 0x3f) | 0x80;
139  } else {
140  throw invalid_code_point(cp);
141  }
142  return result;
143 }
144 
145 template <typename octet_iterator>
146 uint32_t next(octet_iterator& it, octet_iterator end)
147 {
148  uint32_t cp = 0;
149  internal::utf_error err_code = internal::validate_next(it, end, &cp);
150  switch (err_code) {
151  case internal::OK :
152  break;
154  throw not_enough_room();
158  throw invalid_utf8(*it);
160  throw invalid_code_point(cp);
161  }
162  return cp;
163 }
164 
// Returns the code point that next() decodes at 'it'. 'it' is received by
// value, so the caller's iterator is left where it was.
// Throws whatever next() throws.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	uint32_t decoded = next(it, end);
	return decoded;
}
170 
171 template <typename octet_iterator>
172 uint32_t prior(octet_iterator& it, octet_iterator start)
173 {
174  auto end = it;
175  while (internal::is_trail(*(--it))) {
176  if (it < start) {
177  // error - no lead byte in the sequence
178  throw invalid_utf8(*it);
179  }
180  }
181  auto temp = it;
182  return next(temp, end);
183 }
184 
// Calls next(it, end) once for every counter value from 0 up to, but not
// including, 'n'. 'it' is taken by reference and handed to next() each time.
// Throws whatever next() throws.
template <typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	distance_type done = 0;
	while (done < n) {
		next(it, end);
		++done;
	}
}
192 
// Counts how many calls to next() are needed until 'first' is no longer
// before 'last'. The count has the iterator's difference_type.
// Throws whatever next() throws.
template <typename octet_iterator>
auto distance(octet_iterator first, octet_iterator last)
{
	using diff_t = typename std::iterator_traits<octet_iterator>::difference_type;
	diff_t count = 0;
	while (first < last) {
		next(first, last);
		++count;
	}
	return count;
}
202 
203 template <typename u16bit_iterator, typename octet_iterator>
204 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
205  octet_iterator result)
206 {
207  while (start != end) {
208  uint32_t cp = *start++;
209  // Take care of surrogate pairs first
210  if (internal::is_surrogate(cp)) {
211  if (start == end) {
212  throw invalid_utf16(*start);
213  }
214  uint32_t trail_surrogate = *start++;
215  if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
216  trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
217  throw invalid_utf16(trail_surrogate);
218  }
219  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
220  }
221  result = append(cp, result);
222  }
223  return result;
224 }
225 
226 template <typename u16bit_iterator, typename octet_iterator>
227 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
228  u16bit_iterator result)
229 {
230  while (start != end) {
231  uint32_t cp = next(start, end);
232  if (cp > 0xffff) { // make a surrogate pair
233  *result++ = (cp >> 10) + internal::LEAD_OFFSET;
234  *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
235  } else {
236  *result++ = cp;
237  }
238  }
239  return result;
240 }
241 
// Encodes every value in [start, end) with append(), writing the octets
// through 'result'.
// Returns the output iterator positioned after the last octet written.
// Throws whatever append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
		octet_iterator result)
{
	for (; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
251 
// Decodes the octets in [start, end) with next() and writes each resulting
// code point through 'result'. Iteration continues while 'start' compares
// less than 'end'.
// Returns the output iterator positioned after the last value written.
// Throws whatever next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
		u32bit_iterator result)
{
	while (start < end) {
		uint32_t decoded = next(start, end);
		*result++ = decoded;
	}
	return result;
}
261 
262 // The iterator class
// Bidirectional iterator adaptor that walks an octet range one code point at
// a time. Dereferencing decodes with next(); stepping uses next() / prior(),
// so every operation can throw the exceptions of those functions.
template <typename octet_iterator>
class iterator : public std::iterator<std::bidirectional_iterator_tag, uint32_t>
{
	octet_iterator it;          // current position
	octet_iterator range_start; // lower bound handed to prior()
	octet_iterator range_end;   // upper bound handed to next()
public:
	iterator() = default;

	// octet_it                  - initial position
	// range_start_, range_end_  - bounds of the underlying octet range
	// Throws std::out_of_range when octet_it lies outside those bounds.
	iterator(const octet_iterator& octet_it,
	         const octet_iterator& range_start_,
	         const octet_iterator& range_end_)
		: it(octet_it)
		, range_start(range_start_)
		, range_end(range_end_)
	{
		if (it < range_start || it > range_end) {
			throw std::out_of_range("Invalid utf-8 iterator position");
		}
	}
	// the default "big three" are OK

	// The wrapped octet iterator at its current position.
	octet_iterator base() const { return it; }

	// Decodes the code point at the current position; works on a copy, so
	// the position itself does not change.
	uint32_t operator*() const
	{
		auto temp = it;
		return next(temp, range_end);
	}

	// Compares positions. Throws std::logic_error when the two iterators
	// were created with different range bounds.
	bool operator==(const iterator& rhs) const
	{
		if ((range_start != rhs.range_start) ||
		    (range_end != rhs.range_end)) {
			throw std::logic_error(
				"Comparing utf-8 iterators defined with different ranges");
		}
		return it == rhs.it;
	}
	bool operator!=(const iterator& rhs) const
	{
		return !(operator==(rhs));
	}

	// Pre-increment: step forward with next().
	iterator& operator++()
	{
		next(it, range_end);
		return *this;
	}
	// Post-increment: step forward, return the state from before the step.
	iterator operator++(int)
	{
		auto temp = *this;
		next(it, range_end);
		return temp;
	}
	// Pre-decrement: step backward with prior().
	iterator& operator--()
	{
		prior(it, range_start);
		return *this;
	}
	// Post-decrement: step backward, return the state from before the step.
	iterator operator--(int)
	{
		auto temp = *this;
		prior(it, range_start);
		return temp;
	}
};
325 
// Declarations that are only compiled when _WIN32 is defined. Only the
// prototypes appear in this header; no bodies are visible here.
326 #ifdef _WIN32
// NOTE(review): presumably converts text in an unknown encoding to UTF-8 - confirm against the definition.
327 std::string unknowntoutf8(const std::string& unknown);
// NOTE(review): presumably converts UTF-8 text to the ANSI code page - confirm against the definition.
328 std::string utf8toansi(const std::string& utf8);
// String-based overload of utf8to16: takes a std::string, returns a std::wstring.
329 std::wstring utf8to16(const std::string& utf8);
// String-based overload of utf16to8: takes a std::wstring, returns a std::string.
330 std::string utf16to8(const std::wstring& utf16);
331 #endif
332 
333 } // namespace utf8
334 
335 #endif
bool is_trail(uint8_t oc)
Definition: utf8_core.hh:56
const uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:49
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
const char * what() const noexcept override
Definition: utf8_checked.hh:44
auto distance(octet_iterator first, octet_iterator last)
uint32_t next(octet_iterator &it, octet_iterator end)
const uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:48
constexpr bool operator==(const optional< T > &x, const optional< T > &y)
Definition: optional.hh:503
invalid_utf16(uint16_t u)
Definition: utf8_checked.hh:61
const char * what() const noexcept override
Definition: utf8_checked.hh:53
const uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:54
const char * what() const noexcept override
Definition: utf8_checked.hh:62
uint32_t peek_next(octet_iterator it, octet_iterator end)
bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:61
uint16_t utf16_word() const
Definition: utf8_checked.hh:63
uint32_t code_point() const
Definition: utf8_checked.hh:45
invalid_code_point(uint32_t cp_)
Definition: utf8_checked.hh:43
invalid_utf8(uint8_t u)
Definition: utf8_checked.hh:52
iterator operator++(int)
iterator operator--(int)
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
void advance(octet_iterator &it, distance_type n, octet_iterator end)
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
uint32_t operator*() const
uint8_t utf8_octet() const
Definition: utf8_checked.hh:54
const uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:51
iterator(const octet_iterator &octet_it, const octet_iterator &range_start_, const octet_iterator &range_end_)
bool operator==(const iterator &rhs) const
uint32_t prior(octet_iterator &it, octet_iterator start)
iterator & operator--()
iterator & operator++()
bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:66
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:97
const char * what() const noexcept override
Definition: utf8_checked.hh:69
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Definition: utf8_checked.hh:75
const uint16_t LEAD_OFFSET
Definition: utf8_core.hh:50
octet_iterator base() const
auto end(const string_view &x)
Definition: string_view.hh:152
bool operator!=(const iterator &rhs) const
octet_iterator append(uint32_t cp, octet_iterator result)