openMSX
utf8_checked.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 #ifndef UTF8_CHECKED_HH
31 #define UTF8_CHECKED_HH
32 
33 #include "utf8_core.hh"
34 #include "zstring_view.hh"
35 #include <stdexcept>
36 
37 namespace utf8 {
38 
39 // Exceptions that may be thrown from the library functions.
// Thrown when a value is not an acceptable Unicode code point.
// The offending value is stored and can be retrieved via code_point().
class invalid_code_point : public std::exception
{
	uint32_t cp;
public:
	explicit invalid_code_point(uint32_t cp_) : cp(cp_) {}

	// Fixed, human-readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid code point";
	}

	// The code point value that was passed to the constructor.
	[[nodiscard]] uint32_t code_point() const noexcept { return cp; }
};
48 
// Thrown when an octet sequence is not valid UTF-8.
// The octet handed to the constructor is kept for later inspection.
class invalid_utf8 : public std::exception
{
	uint8_t u8;
public:
	explicit invalid_utf8(uint8_t u) : u8(u) {}

	// Fixed, human-readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-8";
	}

	// The octet that was passed to the constructor.
	[[nodiscard]] uint8_t utf8_octet() const noexcept { return u8; }
};
57 
// Thrown when a sequence of 16-bit units is not valid UTF-16.
// The 16-bit unit handed to the constructor is kept for later inspection.
class invalid_utf16 : public std::exception
{
	uint16_t u16;
public:
	explicit invalid_utf16(uint16_t u) : u16(u) {}

	// Fixed, human-readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-16";
	}

	// The 16-bit unit that was passed to the constructor.
	[[nodiscard]] uint16_t utf16_word() const noexcept { return u16; }
};
66 
// Thrown when the input range ends before a complete sequence could be read.
class not_enough_room : public std::exception
{
public:
	// Fixed, human-readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Not enough space";
	}
};
72 
73 // The library API - functions intended to be called by the users
74 
75 template<typename octet_iterator, typename output_iterator>
76 output_iterator replace_invalid(octet_iterator start, octet_iterator end,
77  output_iterator out, uint32_t replacement)
78 {
79  while (start != end) {
80  auto sequence_start = start;
82  switch (err_code) {
83  case internal::OK:
84  for (auto it = sequence_start; it != start; ++it) {
85  *out++ = *it;
86  }
87  break;
89  throw not_enough_room();
91  append(replacement, out);
92  ++start;
93  break;
97  append(replacement, out);
98  ++start;
99  // just one replacement mark for the sequence
100  while (internal::is_trail(*start) && start != end) {
101  ++start;
102  }
103  break;
104  }
105  }
106  return out;
107 }
108 
// Convenience overload of replace_invalid() that uses U+FFFD (the Unicode
// REPLACEMENT CHARACTER) as the substitute for rejected sequences.
template<typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
	output_iterator out)
{
	return replace_invalid(start, end, out, 0xfffd);
}
115 
116 template<typename octet_iterator>
117 octet_iterator append(uint32_t cp, octet_iterator result)
118 {
120  throw invalid_code_point(cp);
121  }
122  if (cp < 0x80) {
123  // one octet
124  *result++ = cp;
125  } else if (cp < 0x800) {
126  // two octets
127  *result++ = ((cp >> 6) ) | 0xc0;
128  *result++ = ((cp >> 0) & 0x3f) | 0x80;
129  } else if (cp < 0x10000) {
130  // three octets
131  *result++ = ((cp >> 12) ) | 0xe0;
132  *result++ = ((cp >> 6) & 0x3f) | 0x80;
133  *result++ = ((cp >> 0) & 0x3f) | 0x80;
134  } else if (cp <= internal::CODE_POINT_MAX) {
135  // four octets
136  *result++ = ((cp >> 18) ) | 0xf0;
137  *result++ = ((cp >> 12) & 0x3f) | 0x80;
138  *result++ = ((cp >> 6) & 0x3f) | 0x80;
139  *result++ = ((cp >> 0) & 0x3f) | 0x80;
140  } else {
141  throw invalid_code_point(cp);
142  }
143  return result;
144 }
145 
146 template<typename octet_iterator>
147 uint32_t next(octet_iterator& it, octet_iterator end)
148 {
149  uint32_t cp = 0;
150  internal::utf_error err_code = internal::validate_next(it, end, &cp);
151  switch (err_code) {
152  case internal::OK :
153  break;
155  throw not_enough_room();
159  throw invalid_utf8(*it);
161  throw invalid_code_point(cp);
162  }
163  return cp;
164 }
165 
// Like next(), but leaves the caller's iterator untouched: 'it' is taken by
// value, so only the local copy is advanced. Throws whatever next() throws.
template<typename octet_iterator>
[[nodiscard]] uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	return next(it, end);
}
171 
172 template<typename octet_iterator>
173 uint32_t prior(octet_iterator& it, octet_iterator start)
174 {
175  auto end = it;
176  while (internal::is_trail(*(--it))) {
177  if (it < start) {
178  // error - no lead byte in the sequence
179  throw invalid_utf8(*it);
180  }
181  }
182  auto temp = it;
183  return next(temp, end);
184 }
185 
// Advance 'it' by 'n' code points by calling next() 'n' times, never reading
// past 'end'. Throws whatever next() throws; in that case 'it' stays where
// the failing call left it.
template<typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	repeat(n, [&] { next(it, end); });
}
191 
// Count the code points in [first, last) by decoding them one after another
// with next(). The result has the iterator's difference_type.
// Throws whatever next() throws when the range is not valid UTF-8.
// The loop condition uses operator<, so the iterator type has to provide it.
template<typename octet_iterator>
[[nodiscard]] auto distance(octet_iterator first, octet_iterator last)
{
	typename std::iterator_traits<octet_iterator>::difference_type dist = 0;
	while (first < last) {
		++dist;
		next(first, last);
	}
	return dist;
}
202 
203 template<typename u16bit_iterator, typename octet_iterator>
204 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
205  octet_iterator result)
206 {
207  while (start != end) {
208  uint32_t cp = *start++;
209  // Take care of surrogate pairs first
210  if (internal::is_surrogate(cp)) {
211  if (start == end) {
212  throw invalid_utf16(*start);
213  }
214  uint32_t trail_surrogate = *start++;
215  if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
216  trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
217  throw invalid_utf16(trail_surrogate);
218  }
219  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
220  }
221  result = append(cp, result);
222  }
223  return result;
224 }
225 
226 template<typename u16bit_iterator, typename octet_iterator>
227 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
228  u16bit_iterator result)
229 {
230  while (start != end) {
231  uint32_t cp = next(start, end);
232  if (cp > 0xffff) { // make a surrogate pair
233  *result++ = (cp >> 10) + internal::LEAD_OFFSET;
234  *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
235  } else {
236  *result++ = cp;
237  }
238  }
239  return result;
240 }
241 
// Convert the code points in [start, end) to UTF-8 by passing each of them
// to append(). Returns the output iterator after the last octet written.
// Throws whatever append() throws.
template<typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
	octet_iterator result)
{
	while (start != end) {
		result = append(*start++, result);
	}
	return result;
}
251 
// Decode the UTF-8 octets in [start, end) and write one code point per
// sequence through 'result'. Returns the output iterator after the last
// code point written. Throws whatever next() throws.
// The loop condition uses operator<, so the iterator type has to provide it.
template<typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
	u32bit_iterator result)
{
	while (start < end) {
		*result++ = next(start, end);
	}
	return result;
}
261 
262 // The iterator class
// Bidirectional iterator adaptor that walks a range of UTF-8 octets one code
// point at a time. Dereferencing decodes the sequence at the current
// position; incrementing and decrementing go through next() and prior(), so
// all of these can throw the exceptions those functions throw.
//
// NOTE(review): the four increment/decrement signatures were missing from
// the source listing this was recovered from; they are reconstructed from
// the listing's member cross-reference.
template<typename octet_iterator>
class iterator
{
	// Value-initialised so that a default-constructed iterator over raw
	// pointers holds null pointers rather than indeterminate values.
	octet_iterator it{};
	octet_iterator range_start{};
	octet_iterator range_end{};

public:
	using iterator_category = std::bidirectional_iterator_tag;
	using difference_type   = ptrdiff_t;
	using value_type        = uint32_t;
	using pointer           = uint32_t*;
	using reference         = uint32_t&;

	iterator() = default;

	// 'octet_it' is the current position and must lie within
	// [range_start_, range_end_]; otherwise std::out_of_range is thrown.
	iterator(const octet_iterator& octet_it,
	         const octet_iterator& range_start_,
	         const octet_iterator& range_end_)
		: it(octet_it)
		, range_start(range_start_)
		, range_end(range_end_)
	{
		if (it < range_start || it > range_end) {
			throw std::out_of_range("Invalid utf-8 iterator position");
		}
	}

	// the default "big three" are OK

	// The underlying octet iterator at the current position.
	[[nodiscard]] octet_iterator base() const { return it; }

	// Decode the code point at the current position without moving.
	[[nodiscard]] uint32_t operator*() const
	{
		auto temp = it;
		return next(temp, range_end);
	}

	// Iterators are only comparable when they were created over the same
	// range; anything else throws std::logic_error.
	[[nodiscard]] bool operator==(const iterator& rhs) const
	{
		if ((range_start != rhs.range_start) ||
		    (range_end   != rhs.range_end)) {
			throw std::logic_error(
				"Comparing utf-8 iterators defined with different ranges");
		}
		return it == rhs.it;
	}
	[[nodiscard]] bool operator!=(const iterator& rhs) const
	{
		return !(*this == rhs);
	}

	// Pre-increment: step to the next code point.
	iterator& operator++()
	{
		next(it, range_end);
		return *this;
	}
	// Post-increment: step forward, return the previous position.
	iterator operator++(int)
	{
		auto temp = *this;
		next(it, range_end);
		return temp;
	}
	// Pre-decrement: step to the previous code point.
	iterator& operator--()
	{
		prior(it, range_start);
		return *this;
	}
	// Post-decrement: step backward, return the previous position.
	iterator operator--(int)
	{
		auto temp = *this;
		prior(it, range_start);
		return temp;
	}
};
332 
#ifdef _WIN32
// Windows-only string conversion helpers; only declared here, the
// definitions live outside this header.
// NOTE(review): going by the names, these presumably convert a UTF-8 string
// to the ANSI code page, UTF-8 to UTF-16 and UTF-16 to UTF-8 respectively;
// confirm against the implementation.
[[nodiscard]] std::string  utf8ToAnsi(zstring_view utf8);
[[nodiscard]] std::wstring utf8to16  (zstring_view utf8);
[[nodiscard]] std::string  utf16to8  (const std::wstring& utf16);
#endif
338 
339 } // namespace utf8
340 
341 #endif
uint32_t code_point() const
Definition: utf8_checked.hh:46
const char * what() const noexcept override
Definition: utf8_checked.hh:45
invalid_code_point(uint32_t cp_)
Definition: utf8_checked.hh:44
uint16_t utf16_word() const
Definition: utf8_checked.hh:64
invalid_utf16(uint16_t u)
Definition: utf8_checked.hh:62
const char * what() const noexcept override
Definition: utf8_checked.hh:63
invalid_utf8(uint8_t u)
Definition: utf8_checked.hh:53
const char * what() const noexcept override
Definition: utf8_checked.hh:54
uint8_t utf8_octet() const
Definition: utf8_checked.hh:55
uint32_t operator*() const
bool operator==(const iterator &rhs) const
octet_iterator base() const
uint32_t * pointer
iterator & operator--()
uint32_t value_type
iterator & operator++()
iterator operator++(int)
std::bidirectional_iterator_tag iterator_category
bool operator!=(const iterator &rhs) const
iterator(const octet_iterator &octet_it, const octet_iterator &range_start_, const octet_iterator &range_end_)
iterator()=default
uint32_t & reference
ptrdiff_t difference_type
iterator operator--(int)
const char * what() const noexcept override
Definition: utf8_checked.hh:70
Like std::string_view, but with the extra guarantee that it refers to a zero-terminated string.
Definition: zstring_view.hh:22
constexpr bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:67
constexpr uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:50
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:49
constexpr bool is_trail(uint8_t oc)
Definition: utf8_core.hh:57
constexpr uint16_t LEAD_OFFSET
Definition: utf8_core.hh:51
constexpr uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:55
constexpr bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:62
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:98
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Definition: utf8_checked.hh:76
octet_iterator append(uint32_t cp, octet_iterator result)
void advance(octet_iterator &it, distance_type n, octet_iterator end)
uint32_t peek_next(octet_iterator it, octet_iterator end)
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
auto distance(octet_iterator first, octet_iterator last)
uint32_t prior(octet_iterator &it, octet_iterator start)
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
uint32_t next(octet_iterator &it, octet_iterator end)
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition: xrange.hh:170
constexpr auto end(const zstring_view &x)
Definition: zstring_view.hh:84