openMSX
utf8_checked.hh
Go to the documentation of this file.
1// UTF8-CPP http://utfcpp.sourceforge.net/
2// Slightly simplified (and reformatted) to fit openMSX coding style.
3
4// Copyright 2006 Nemanja Trifunovic
5
6/*
7Permission is hereby granted, free of charge, to any person or organization
8obtaining a copy of the software and accompanying documentation covered by
9this license (the "Software") to use, reproduce, display, distribute,
10execute, and transmit the Software, and to prepare derivative works of the
11Software, and to permit third-parties to whom the Software is furnished to
12do so, all subject to the following:
13
14The copyright notices in the Software and this entire statement, including
15the above license grant, this restriction and the following disclaimer,
16must be included in all copies of the Software, in whole or in part, and
17all derivative works of the Software, unless such copies or derivative
18works are solely in the form of machine-executable object code generated by
19a source language processor.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27DEALINGS IN THE SOFTWARE.
28*/
29
30#ifndef UTF8_CHECKED_HH
31#define UTF8_CHECKED_HH
32
33#include "utf8_core.hh"
34#include "zstring_view.hh"
35#include <stdexcept>
36
37namespace utf8 {
38
39// Exceptions that may be thrown from the library functions.
/// Exception carrying a 32-bit value that was rejected as a code point.
class invalid_code_point : public std::exception
{
public:
	explicit invalid_code_point(uint32_t cp_)
		: rejected{cp_}
	{
	}

	/// Fixed, human readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid code point";
	}

	/// The value that was passed to the constructor.
	[[nodiscard]] uint32_t code_point() const
	{
		return rejected;
	}

private:
	uint32_t rejected;
};
48
/// Exception carrying the octet at which UTF-8 decoding failed.
class invalid_utf8 : public std::exception
{
public:
	explicit invalid_utf8(uint8_t u)
		: octet{u}
	{
	}

	/// Fixed, human readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-8";
	}

	/// The octet that was passed to the constructor.
	[[nodiscard]] uint8_t utf8_octet() const
	{
		return octet;
	}

private:
	uint8_t octet;
};
57
/// Exception carrying the 16-bit unit at which UTF-16 decoding failed.
class invalid_utf16 : public std::exception
{
public:
	explicit invalid_utf16(uint16_t u)
		: word{u}
	{
	}

	/// Fixed, human readable description of the error.
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Invalid UTF-16";
	}

	/// The 16-bit unit that was passed to the constructor.
	[[nodiscard]] uint16_t utf16_word() const
	{
		return word;
	}

private:
	uint16_t word;
};
66
/// Exception without payload; what() reports "Not enough space".
class not_enough_room : public std::exception
{
public:
	[[nodiscard]] const char* what() const noexcept override
	{
		static constexpr const char* message = "Not enough space";
		return message;
	}
};
72
73// The library API - functions intended to be called by the users
74
75template<typename octet_iterator, typename output_iterator>
76output_iterator replace_invalid(octet_iterator start, octet_iterator end,
77 output_iterator out, uint32_t replacement)
78{
79 while (start != end) {
80 auto sequence_start = start;
82 switch (err_code) {
83 using enum internal::utf_error;
84 case OK:
85 for (auto it = sequence_start; it != start; ++it) {
86 *out++ = *it;
87 }
88 break;
89 case NOT_ENOUGH_ROOM:
90 throw not_enough_room();
91 case INVALID_LEAD:
92 append(replacement, out);
93 ++start;
94 break;
95 case INCOMPLETE_SEQUENCE:
96 case OVERLONG_SEQUENCE:
97 case INVALID_CODE_POINT:
98 append(replacement, out);
99 ++start;
100 // just one replacement mark for the sequence
101 while (internal::is_trail(*start) && start != end) {
102 ++start;
103 }
104 break;
105 }
106 }
107 return out;
108}
109
/**
 * Convenience overload of replace_invalid() that substitutes U+FFFD for
 * every invalid sequence.
 */
template<typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
                                       output_iterator out)
{
	constexpr uint32_t replacement_mark = 0xfffd;
	return replace_invalid(start, end, out, replacement_mark);
}
116
117template<typename octet_iterator>
118octet_iterator append(uint32_t cp, octet_iterator result)
119{
121 throw invalid_code_point(cp);
122 }
123 if (cp < 0x80) {
124 // one octet
125 *result++ = cp;
126 } else if (cp < 0x800) {
127 // two octets
128 *result++ = ((cp >> 6) & 0x1f) | 0xc0; // 0b110.'.... (5)
129 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
130 } else if (cp < 0x10000) {
131 // three octets
132 *result++ = ((cp >> 12) & 0x0f) | 0xe0; // 0b1110'.... (4)
133 *result++ = ((cp >> 6) & 0x3f) | 0x80; // 0b10..'.... (6)
134 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
135 } else if (cp <= internal::CODE_POINT_MAX) {
136 // four octets
137 *result++ = ((cp >> 18) & 0x07) | 0xf0; // 0b1111'0... (3)
138 *result++ = ((cp >> 12) & 0x3f) | 0x80; // 0b10..'.... (6)
139 *result++ = ((cp >> 6) & 0x3f) | 0x80; // 0b10..'.... (6)
140 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
141 } else {
142 throw invalid_code_point(cp);
143 }
144 return result;
145}
146
147template<typename octet_iterator>
148uint32_t next(octet_iterator& it, octet_iterator end)
149{
150 uint32_t cp = 0;
152 switch (err_code) {
153 using enum internal::utf_error;
154 case OK:
155 break;
156 case NOT_ENOUGH_ROOM:
157 throw not_enough_room();
158 case INVALID_LEAD:
159 case INCOMPLETE_SEQUENCE:
160 case OVERLONG_SEQUENCE:
161 throw invalid_utf8(*it);
162 case INVALID_CODE_POINT:
163 throw invalid_code_point(cp);
164 }
165 return cp;
166}
167
/**
 * Decode the code point at 'it' without moving the caller's iterator.
 * Throws whatever next() throws.
 */
template<typename octet_iterator>
[[nodiscard]] uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	// 'it' is a by-value copy, so next() only advances this local.
	const uint32_t decoded = next(it, end);
	return decoded;
}
173
174template<typename octet_iterator>
175uint32_t prior(octet_iterator& it, octet_iterator start)
176{
177 auto end = it;
178 while (internal::is_trail(*(--it))) {
179 if (it < start) {
180 // error - no lead byte in the sequence
181 throw invalid_utf8(*it);
182 }
183 }
184 auto temp = it;
185 return next(temp, end);
186}
187
/**
 * Call next(it, end) 'n' times (via repeat()), discarding the decoded
 * code points. Throws whatever next() throws.
 */
template<typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	auto skip_one = [&] { next(it, end); };
	repeat(n, skip_one);
}
193
/**
 * Count the code points in [first, last) by decoding them one after another
 * with next(). Throws whatever next() throws.
 *
 * @return The count, as the iterator's difference_type.
 */
template<typename octet_iterator>
[[nodiscard]] auto distance(octet_iterator first, octet_iterator last)
{
	using diff_t = typename std::iterator_traits<octet_iterator>::difference_type;
	diff_t count = 0;
	for (/**/; first < last; ++count) {
		next(first, last);
	}
	return count;
}
204
205template<typename u16bit_iterator, typename octet_iterator>
206octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
207 octet_iterator result)
208{
209 while (start != end) {
210 uint32_t cp = *start++;
211 // Take care of surrogate pairs first
212 if (internal::is_surrogate(cp)) {
213 if (start == end) {
214 throw invalid_utf16(*start);
215 }
216 auto trail_surrogate = *start++;
217 if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
218 trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
219 throw invalid_utf16(trail_surrogate);
220 }
221 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
222 }
223 result = append(cp, result);
224 }
225 return result;
226}
227
228template<typename u16bit_iterator, typename octet_iterator>
229u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
230 u16bit_iterator result)
231{
232 while (start != end) {
233 uint32_t cp = next(start, end);
234 if (cp > 0xffff) { // make a surrogate pair
235 *result++ = (cp >> 10) + internal::LEAD_OFFSET;
236 *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
237 } else {
238 *result++ = cp;
239 }
240 }
241 return result;
242}
243
/**
 * Encode every code point in [start, end) as UTF-8 via append(), writing
 * through 'result'. Throws whatever append() throws.
 *
 * @return 'result', advanced past the octets that were written.
 */
template<typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
                        octet_iterator result)
{
	for (/**/; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
253
/**
 * Decode the UTF-8 octets in [start, end) with next() and write each code
 * point through 'result'. Throws whatever next() throws.
 *
 * @return 'result', advanced past the code points that were written.
 */
template<typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
                         u32bit_iterator result)
{
	for (/**/; start < end; ++result) {
		*result = next(start, end);
	}
	return result;
}
263
264// The iterator class
/**
 * Bidirectional iterator adaptor that walks a range of UTF-8 octets one
 * code point at a time.
 *
 * It wraps an octet iterator together with the bounds of the range it
 * belongs to. Dereferencing decodes with next(); incrementing and
 * decrementing move with next() and prior(), so all of them can throw the
 * exceptions those functions throw.
 */
template<typename octet_iterator>
class iterator
{
	octet_iterator it;
	octet_iterator range_start;
	octet_iterator range_end;

public:
	using iterator_category = std::bidirectional_iterator_tag;
	using difference_type   = ptrdiff_t;
	using value_type        = uint32_t;
	using pointer           = uint32_t*;
	using reference         = uint32_t&;

	iterator() = default;

	/**
	 * @param octet_it      Position inside the range.
	 * @param range_start_  First octet of the range.
	 * @param range_end_    One past the last octet of the range.
	 * @throws std::out_of_range when 'octet_it' lies outside
	 *         [range_start_, range_end_].
	 */
	iterator(const octet_iterator& octet_it,
	         const octet_iterator& range_start_,
	         const octet_iterator& range_end_)
		: it(octet_it)
		, range_start(range_start_)
		, range_end(range_end_)
	{
		if (it < range_start || it > range_end) {
			throw std::out_of_range("Invalid utf-8 iterator position");
		}
	}
	// the default "big three" are OK

	/// The wrapped octet iterator at its current position.
	[[nodiscard]] octet_iterator base() const { return it; }

	/// Decode the code point at the current position without moving.
	[[nodiscard]] uint32_t operator*() const
	{
		auto temp = it;
		return next(temp, range_end);
	}

	/**
	 * Compare positions.
	 * @throws std::logic_error when the two iterators were created for
	 *         different ranges.
	 */
	[[nodiscard]] bool operator==(const iterator& rhs) const
	{
		if ((range_start != rhs.range_start) ||
		    (range_end   != rhs.range_end)) {
			throw std::logic_error(
				"Comparing utf-8 iterators defined with different ranges");
		}
		return it == rhs.it;
	}

	/// Pre-increment: step with next().
	iterator& operator++()
	{
		next(it, range_end);
		return *this;
	}

	/// Post-increment: step with next(), return the previous position.
	iterator operator++(int)
	{
		auto temp = *this;
		next(it, range_end);
		return temp;
	}

	/// Pre-decrement: step back with prior().
	iterator& operator--()
	{
		prior(it, range_start);
		return *this;
	}

	/// Post-decrement: step back with prior(), return the previous position.
	iterator operator--(int)
	{
		auto temp = *this;
		prior(it, range_start);
		return temp;
	}
};
330
331#ifdef _WIN32
332[[nodiscard]] std::string utf8ToAnsi(zstring_view utf8);
333[[nodiscard]] std::wstring utf8to16 (zstring_view utf8);
334[[nodiscard]] std::string utf16to8 (const std::wstring& utf16);
335#endif
336
337} // namespace utf8
338
339#endif
uint32_t code_point() const
const char * what() const noexcept override
invalid_code_point(uint32_t cp_)
const char * what() const noexcept override
uint16_t utf16_word() const
invalid_utf16(uint16_t u)
invalid_utf8(uint8_t u)
const char * what() const noexcept override
uint8_t utf8_octet() const
uint32_t operator*() const
bool operator==(const iterator &rhs) const
octet_iterator base() const
iterator & operator--()
uint32_t * pointer
uint32_t value_type
iterator operator++(int)
iterator & operator++()
std::bidirectional_iterator_tag iterator_category
iterator(const octet_iterator &octet_it, const octet_iterator &range_start_, const octet_iterator &range_end_)
iterator()=default
uint32_t & reference
ptrdiff_t difference_type
iterator operator--(int)
const char * what() const noexcept override
Like std::string_view, but with the extra guarantee that it refers to a zero-terminated string.
constexpr bool is_code_point_valid(uint32_t cp)
Definition utf8_core.hh:69
constexpr uint32_t SURROGATE_OFFSET
Definition utf8_core.hh:54
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition utf8_core.hh:51
constexpr bool is_trail(uint8_t oc)
Definition utf8_core.hh:59
constexpr uint16_t LEAD_OFFSET
Definition utf8_core.hh:53
constexpr uint32_t CODE_POINT_MAX
Definition utf8_core.hh:57
constexpr bool is_surrogate(uint32_t cp)
Definition utf8_core.hh:64
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition utf8_core.hh:100
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
octet_iterator append(uint32_t cp, octet_iterator result)
void advance(octet_iterator &it, distance_type n, octet_iterator end)
uint32_t peek_next(octet_iterator it, octet_iterator end)
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
auto distance(octet_iterator first, octet_iterator last)
uint32_t prior(octet_iterator &it, octet_iterator start)
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
uint32_t next(octet_iterator &it, octet_iterator end)
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition xrange.hh:147
constexpr auto end(const zstring_view &x)