openMSX
utf8_checked.hh
Go to the documentation of this file.
1// UTF8-CPP http://utfcpp.sourceforge.net/
2// Slightly simplified (and reformatted) to fit openMSX coding style.
3
4// Copyright 2006 Nemanja Trifunovic
5
6/*
7Permission is hereby granted, free of charge, to any person or organization
8obtaining a copy of the software and accompanying documentation covered by
9this license (the "Software") to use, reproduce, display, distribute,
10execute, and transmit the Software, and to prepare derivative works of the
11Software, and to permit third-parties to whom the Software is furnished to
12do so, all subject to the following:
13
14The copyright notices in the Software and this entire statement, including
15the above license grant, this restriction and the following disclaimer,
16must be included in all copies of the Software, in whole or in part, and
17all derivative works of the Software, unless such copies or derivative
18works are solely in the form of machine-executable object code generated by
19a source language processor.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27DEALINGS IN THE SOFTWARE.
28*/
29
30#ifndef UTF8_CHECKED_HH
31#define UTF8_CHECKED_HH
32
33#include "utf8_core.hh"
34#include "zstring_view.hh"
35#include <stdexcept>
36
37namespace utf8 {
38
39// Exceptions that may be thrown from the library functions.
// Exception carrying a code point value that a library function rejected.
// The value handed to the constructor is kept and can be read back with
// code_point().
class invalid_code_point : public std::exception
{
	uint32_t cp; // the rejected value, exactly as passed to the constructor
public:
	// Only stores an integer, so construction cannot fail.
	explicit invalid_code_point(uint32_t cp_) noexcept : cp(cp_) {}

	// Fixed description; the returned pointer refers to a string literal.
	[[nodiscard]] const char* what() const noexcept override { return "Invalid code point"; }

	// The value this exception was constructed with.
	[[nodiscard]] uint32_t code_point() const noexcept { return cp; }
};
48
// Exception carrying a single octet of a rejected UTF-8 sequence.
// The octet handed to the constructor can be read back with utf8_octet().
class invalid_utf8 : public std::exception
{
	uint8_t u8; // the octet passed to the constructor
public:
	// Only stores an integer, so construction cannot fail.
	explicit invalid_utf8(uint8_t u) noexcept : u8(u) {}

	// Fixed description; the returned pointer refers to a string literal.
	[[nodiscard]] const char* what() const noexcept override { return "Invalid UTF-8"; }

	// The octet this exception was constructed with.
	[[nodiscard]] uint8_t utf8_octet() const noexcept { return u8; }
};
57
// Exception carrying a single 16-bit unit of a rejected UTF-16 sequence.
// The unit handed to the constructor can be read back with utf16_word().
class invalid_utf16 : public std::exception
{
	uint16_t u16; // the 16-bit unit passed to the constructor
public:
	// Only stores an integer, so construction cannot fail.
	explicit invalid_utf16(uint16_t u) noexcept : u16(u) {}

	// Fixed description; the returned pointer refers to a string literal.
	[[nodiscard]] const char* what() const noexcept override { return "Invalid UTF-16"; }

	// The unit this exception was constructed with.
	[[nodiscard]] uint16_t utf16_word() const noexcept { return u16; }
};
66
// Payload-free exception; what() always yields the text "Not enough space".
class not_enough_room : public std::exception
{
public:
	[[nodiscard]] const char* what() const noexcept override
	{
		return "Not enough space";
	}
};
72
73// The library API - functions intended to be called by the users
74
75template<typename octet_iterator, typename output_iterator>
76output_iterator replace_invalid(octet_iterator start, octet_iterator end,
77 output_iterator out, uint32_t replacement)
78{
79 while (start != end) {
80 auto sequence_start = start;
82 switch (err_code) {
83 case internal::OK:
84 for (auto it = sequence_start; it != start; ++it) {
85 *out++ = *it;
86 }
87 break;
89 throw not_enough_room();
91 append(replacement, out);
92 ++start;
93 break;
97 append(replacement, out);
98 ++start;
99 // just one replacement mark for the sequence
100 while (internal::is_trail(*start) && start != end) {
101 ++start;
102 }
103 break;
104 }
105 }
106 return out;
107}
108
// Convenience overload: forwards to the four-argument replace_invalid()
// with 0xfffd (U+FFFD REPLACEMENT CHARACTER) as the replacement code point.
template<typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
                                       output_iterator out)
{
	constexpr uint32_t default_replacement = 0xfffd;
	return replace_invalid(start, end, out, default_replacement);
}
115
116template<typename octet_iterator>
117octet_iterator append(uint32_t cp, octet_iterator result)
118{
120 throw invalid_code_point(cp);
121 }
122 if (cp < 0x80) {
123 // one octet
124 *result++ = cp;
125 } else if (cp < 0x800) {
126 // two octets
127 *result++ = ((cp >> 6) & 0x1f) | 0xc0; // 0b110.'.... (5)
128 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
129 } else if (cp < 0x10000) {
130 // three octets
131 *result++ = ((cp >> 12) & 0x0f) | 0xe0; // 0b1110'.... (4)
132 *result++ = ((cp >> 6) & 0x3f) | 0x80; // 0b10..'.... (6)
133 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
134 } else if (cp <= internal::CODE_POINT_MAX) {
135 // four octets
136 *result++ = ((cp >> 18) & 0x07) | 0xf0; // 0b1111'0... (3)
137 *result++ = ((cp >> 12) & 0x3f) | 0x80; // 0b10..'.... (6)
138 *result++ = ((cp >> 6) & 0x3f) | 0x80; // 0b10..'.... (6)
139 *result++ = ((cp >> 0) & 0x3f) | 0x80; // 0b10..'.... (6)
140 } else {
141 throw invalid_code_point(cp);
142 }
143 return result;
144}
145
146template<typename octet_iterator>
147uint32_t next(octet_iterator& it, octet_iterator end)
148{
149 uint32_t cp = 0;
151 switch (err_code) {
152 case internal::OK :
153 break;
155 throw not_enough_room();
159 throw invalid_utf8(*it);
161 throw invalid_code_point(cp);
162 }
163 return cp;
164}
165
// Decode the code point at 'it' via next() without moving the caller's
// iterator. Anything next() throws propagates unchanged.
template<typename octet_iterator>
[[nodiscard]] uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	// 'it' was received by value, so next() only advances this local copy.
	const uint32_t cp = next(it, end);
	return cp;
}
171
172template<typename octet_iterator>
173uint32_t prior(octet_iterator& it, octet_iterator start)
174{
175 auto end = it;
176 while (internal::is_trail(*(--it))) {
177 if (it < start) {
178 // error - no lead byte in the sequence
179 throw invalid_utf8(*it);
180 }
181 }
182 auto temp = it;
183 return next(temp, end);
184}
185
// Move 'it' forward by 'n' code points by calling next() 'n' times.
// 'end' bounds the decoding; anything next() throws propagates.
template<typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	// One step: decode a single code point and drop its value.
	auto step = [&it, &end] { next(it, end); };
	repeat(n, step);
}
191
// Count the code points in [first, last) by decoding them one after another
// with next(). Anything next() throws propagates.
template<typename octet_iterator>
[[nodiscard]] auto distance(octet_iterator first, octet_iterator last)
{
	using diff_t = typename std::iterator_traits<octet_iterator>::difference_type;
	diff_t count = 0;
	for (/**/; first < last; ++count) {
		next(first, last);
	}
	return count;
}
202
203template<typename u16bit_iterator, typename octet_iterator>
204octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
205 octet_iterator result)
206{
207 while (start != end) {
208 uint32_t cp = *start++;
209 // Take care of surrogate pairs first
210 if (internal::is_surrogate(cp)) {
211 if (start == end) {
212 throw invalid_utf16(*start);
213 }
214 auto trail_surrogate = *start++;
215 if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
216 trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
217 throw invalid_utf16(trail_surrogate);
218 }
219 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
220 }
221 result = append(cp, result);
222 }
223 return result;
224}
225
226template<typename u16bit_iterator, typename octet_iterator>
227u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
228 u16bit_iterator result)
229{
230 while (start != end) {
231 uint32_t cp = next(start, end);
232 if (cp > 0xffff) { // make a surrogate pair
233 *result++ = (cp >> 10) + internal::LEAD_OFFSET;
234 *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
235 } else {
236 *result++ = cp;
237 }
238 }
239 return result;
240}
241
// Encode every code point in [start, end) as UTF-8 by handing it to append(),
// and return the output iterator that append() produced last.
// Anything append() throws propagates.
template<typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
                        octet_iterator result)
{
	for (/**/; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
251
// Decode the UTF-8 octets in [start, end) with next() and write each
// resulting code point through 'result'. Returns the advanced output
// iterator. Anything next() throws propagates.
template<typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
                         u32bit_iterator result)
{
	while (start < end) {
		const uint32_t cp = next(start, end);
		*result++ = cp;
	}
	return result;
}
261
262// The iterator class
263template<typename octet_iterator>
265{
266 octet_iterator it;
267 octet_iterator range_start;
268 octet_iterator range_end;
269
270public:
271 using iterator_category = std::bidirectional_iterator_tag;
272 using difference_type = ptrdiff_t;
273 using value_type = uint32_t;
274 using pointer = uint32_t*;
275 using reference = uint32_t&;
276
277 iterator() = default;
278 iterator(const octet_iterator& octet_it,
279 const octet_iterator& range_start_,
280 const octet_iterator& range_end_)
281 : it(octet_it)
282 , range_start(range_start_)
283 , range_end(range_end_)
284 {
285 if (it < range_start || it > range_end) {
286 throw std::out_of_range("Invalid utf-8 iterator position");
287 }
288 }
289 // the default "big three" are OK
290 [[nodiscard]] octet_iterator base() const { return it; }
291 [[nodiscard]] uint32_t operator*() const
292 {
293 auto temp = it;
294 return next(temp, range_end);
295 }
296 [[nodiscard]] bool operator==(const iterator& rhs) const
297 {
298 if ((range_start != rhs.range_start) ||
299 (range_end != rhs.range_end)) {
300 throw std::logic_error(
301 "Comparing utf-8 iterators defined with different ranges");
302 }
303 return it == rhs.it;
304 }
306 {
307 next(it, range_end);
308 return *this;
309 }
311 {
312 auto temp = *this;
313 next(it, range_end);
314 return temp;
315 }
317 {
318 prior(it, range_start);
319 return *this;
320 }
322 {
323 auto temp = *this;
324 prior(it, range_start);
325 return temp;
326 }
327};
328
329#ifdef _WIN32
330[[nodiscard]] std::string utf8ToAnsi(zstring_view utf8);
331[[nodiscard]] std::wstring utf8to16 (zstring_view utf8);
332[[nodiscard]] std::string utf16to8 (const std::wstring& utf16);
333#endif
334
335} // namespace utf8
336
337#endif
uint32_t code_point() const
const char * what() const noexcept override
invalid_code_point(uint32_t cp_)
const char * what() const noexcept override
uint16_t utf16_word() const
invalid_utf16(uint16_t u)
invalid_utf8(uint8_t u)
const char * what() const noexcept override
uint8_t utf8_octet() const
uint32_t operator*() const
bool operator==(const iterator &rhs) const
octet_iterator base() const
iterator & operator--()
uint32_t * pointer
uint32_t value_type
iterator operator++(int)
iterator & operator++()
std::bidirectional_iterator_tag iterator_category
iterator(const octet_iterator &octet_it, const octet_iterator &range_start_, const octet_iterator &range_end_)
iterator()=default
uint32_t & reference
ptrdiff_t difference_type
iterator operator--(int)
const char * what() const noexcept override
Like std::string_view, but with the extra guarantee that it refers to a zero-terminated string.
constexpr bool is_code_point_valid(uint32_t cp)
Definition utf8_core.hh:69
constexpr uint32_t SURROGATE_OFFSET
Definition utf8_core.hh:54
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition utf8_core.hh:51
constexpr bool is_trail(uint8_t oc)
Definition utf8_core.hh:59
constexpr uint16_t LEAD_OFFSET
Definition utf8_core.hh:53
constexpr uint32_t CODE_POINT_MAX
Definition utf8_core.hh:57
constexpr bool is_surrogate(uint32_t cp)
Definition utf8_core.hh:64
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition utf8_core.hh:100
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
octet_iterator append(uint32_t cp, octet_iterator result)
void advance(octet_iterator &it, distance_type n, octet_iterator end)
uint32_t peek_next(octet_iterator it, octet_iterator end)
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
auto distance(octet_iterator first, octet_iterator last)
uint32_t prior(octet_iterator &it, octet_iterator start)
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
uint32_t next(octet_iterator &it, octet_iterator end)
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition xrange.hh:147
constexpr auto end(const zstring_view &x)