openMSX
utf8_core.hh
Go to the documentation of this file.
1// UTF8-CPP http://utfcpp.sourceforge.net/
2// Slightly simplified (and reformatted) to fit openMSX coding style.
3
4// Copyright 2006 Nemanja Trifunovic
5
6/*
7Permission is hereby granted, free of charge, to any person or organization
8obtaining a copy of the software and accompanying documentation covered by
9this license (the "Software") to use, reproduce, display, distribute,
10execute, and transmit the Software, and to prepare derivative works of the
11Software, and to permit third-parties to whom the Software is furnished to
12do so, all subject to the following:
13
14The copyright notices in the Software and this entire statement, including
15the above license grant, this restriction and the following disclaimer,
16must be included in all copies of the Software, in whole or in part, and
17all derivative works of the Software, unless such copies or derivative
18works are solely in the form of machine-executable object code generated by
19a source language processor.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27DEALINGS IN THE SOFTWARE.
28*/
29
30
31#ifndef UTF8_CORE_HH
32#define UTF8_CORE_HH
33
34#include "narrow.hh"
35#include "one_of.hh"
36#include <array>
37#include <iterator>
38#include <cstdint>
39
40namespace utf8 {
41
42// Helper code - not intended to be directly called by the library users.
43// May be changed at any time
44namespace internal {
45
// Unicode constants used by the UTF-8 / UTF-16 helpers.
// UTF-16 surrogate ranges:
//   leading  (high) surrogates: 0xd800 - 0xdbff
//   trailing (low)  surrogates: 0xdc00 - 0xdfff
inline constexpr uint16_t LEAD_SURROGATE_MIN{0xd800u};
inline constexpr uint16_t LEAD_SURROGATE_MAX{0xdbffu};
inline constexpr uint16_t TRAIL_SURROGATE_MIN{0xdc00u};
inline constexpr uint16_t TRAIL_SURROGATE_MAX{0xdfffu};
// LEAD_SURROGATE_MIN minus (0x10000 >> 10).
inline constexpr uint16_t LEAD_OFFSET{LEAD_SURROGATE_MIN - (0x10000 >> 10)};
// Computed in unsigned 32-bit arithmetic; the subtraction wraps around on purpose.
inline constexpr uint32_t SURROGATE_OFFSET{
	uint32_t{0x10000u} - (uint32_t{LEAD_SURROGATE_MIN} << 10) - uint32_t{TRAIL_SURROGATE_MIN}};

// Largest value that is still a Unicode code point.
inline constexpr uint32_t CODE_POINT_MAX{0x0010ffffu};
58
// Is 'oc' a UTF-8 continuation (trail) octet, i.e. of the form 10xxxxxx?
[[nodiscard]] constexpr bool is_trail(uint8_t oc)
{
	constexpr unsigned TOP_TWO_BITS = 0xc0;
	constexpr unsigned TRAIL_MARKER = 0x80;
	return (oc & TOP_TWO_BITS) == TRAIL_MARKER;
}
63
64[[nodiscard]] constexpr bool is_surrogate(uint32_t cp)
65{
66 return (cp >= LEAD_SURROGATE_MIN) && (cp <= TRAIL_SURROGATE_MAX);
67}
68
69[[nodiscard]] constexpr bool is_code_point_valid(uint32_t cp)
70{
71 return (cp <= CODE_POINT_MAX) && !is_surrogate(cp) &&
72 (cp != one_of(0xfffeu, 0xffffu));
73}
74
// Number of octets in the UTF-8 sequence introduced by 'lead' (1..4),
// or 0 when 'lead' cannot start a sequence.
[[nodiscard]] constexpr unsigned sequence_length(uint8_t lead)
{
	if ((lead & 0x80) == 0x00) return 1; // 0xxxxxxx
	if ((lead & 0xe0) == 0xc0) return 2; // 110xxxxx
	if ((lead & 0xf0) == 0xe0) return 3; // 1110xxxx
	if ((lead & 0xf8) == 0xf0) return 4; // 11110xxx
	return 0;                            // 10xxxxxx or 11111xxx
}
89
98
99template<typename octet_iterator>
100[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end,
101 uint32_t* code_point)
102{
103 using enum utf_error;
104 uint32_t cp = narrow_cast<unsigned char>(*it);
105 // Check the lead octet
106 int length = sequence_length(*it);
107
108 // "Shortcut" for ASCII characters
109 if (length == 1) {
110 if (end - it <= 0) {
111 return NOT_ENOUGH_ROOM;
112 }
113 if (code_point) {
114 *code_point = cp;
115 }
116 ++it;
117 return OK;
118 }
119
120 // Do we have enough memory?
121 if (std::distance(it, end) < length) {
122 return NOT_ENOUGH_ROOM;
123 }
124
125 // Check trail octets and calculate the code point
126 switch (length) {
127 case 0:
128 return INVALID_LEAD;
129 case 2:
130 if (is_trail(*(++it))) {
131 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
132 } else {
133 --it;
134 return INCOMPLETE_SEQUENCE;
135 }
136 break;
137 case 3:
138 if (is_trail(*(++it))) {
139 cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
140 if (is_trail(*(++it))) {
141 cp += (*it) & 0x3f;
142 } else {
143 std::advance(it, -2);
144 return INCOMPLETE_SEQUENCE;
145 }
146 } else {
147 --it;
148 return INCOMPLETE_SEQUENCE;
149 }
150 break;
151 case 4:
152 if (is_trail(*(++it))) {
153 cp = ((cp << 18) & 0x1fffff) + ((*it << 12) & 0x3ffff);
154 if (is_trail(*(++it))) {
155 cp += (*it << 6) & 0xfff;
156 if (is_trail(*(++it))) {
157 cp += (*it) & 0x3f;
158 } else {
159 std::advance(it, -3);
160 return INCOMPLETE_SEQUENCE;
161 }
162 } else {
163 std::advance(it, -2);
164 return INCOMPLETE_SEQUENCE;
165 }
166 } else {
167 --it;
168 return INCOMPLETE_SEQUENCE;
169 }
170 break;
171 }
172 // Is the code point valid?
173 if (!is_code_point_valid(cp)) {
174 repeat(length - 1, [&] { --it; });
175 return INVALID_CODE_POINT;
176 }
177
178 if (code_point) {
179 *code_point = cp;
180 }
181 if (cp < 0x80) {
182 if (length != 1) {
183 std::advance(it, -(length-1));
184 return OVERLONG_SEQUENCE;
185 }
186 } else if (cp < 0x800) {
187 if (length != 2) {
188 std::advance(it, -(length-1));
189 return OVERLONG_SEQUENCE;
190 }
191 } else if (cp < 0x10000) {
192 if (length != 3) {
193 std::advance(it, -(length-1));
194 return OVERLONG_SEQUENCE;
195 }
196 }
197
198 ++it;
199 return OK;
200}
201
202template<typename octet_iterator>
203[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end) {
204 return validate_next(it, end, nullptr);
205}
206
207} // namespace internal
208
210
211template<typename octet_iterator>
212[[nodiscard]] constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
213{
214 auto result = start;
215 while (result != end) {
217 if (err_code != internal::utf_error::OK) {
218 return result;
219 }
220 }
221 return result;
222}
223
// True when find_invalid() finds no offending position in [start, end).
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_valid(octet_iterator start, octet_iterator end)
{
	const auto first_bad = find_invalid(start, end);
	return first_bad == end;
}
229
/// Do the three octets starting at 'it' form the UTF-8 byte order mark
/// (0xef 0xbb 0xbf)?
///
/// The caller must guarantee that the octets being inspected are readable;
/// inspection stops at the first mismatch, so at most three are read.
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_bom(octet_iterator it)
{
	// Byte order mark
	constexpr std::array<uint8_t, 3> bom = {0xef, 0xbb, 0xbf};

	// Convert every octet to uint8_t before comparing. With a plain 'char'
	// iterator on a platform where 'char' is signed, 0xef would otherwise
	// promote to a negative int and never compare equal to 239.
	if (static_cast<uint8_t>(*it) != bom[0]) {
		return false;
	}
	++it;
	if (static_cast<uint8_t>(*it) != bom[1]) {
		return false;
	}
	++it;
	return static_cast<uint8_t>(*it) == bom[2];
}
240
241template<typename octet_iterator>
242[[nodiscard]] constexpr octet_iterator sync_forward(octet_iterator it)
243{
244 while (internal::is_trail(*it)) ++it;
245 return it;
246}
247
248template<typename octet_iterator>
249[[nodiscard]] constexpr octet_iterator sync_backward(octet_iterator it)
250{
251 while (internal::is_trail(*it)) --it;
252 return it;
253}
254
// Does 'cp' lie in one of the Unicode 'Private Use Areas' (PUA)?
//   U+E000   .. U+F8FF
//   U+F0000  .. U+FFFFD
//   U+100000 .. U+10FFFD
// https://en.wikipedia.org/wiki/Private_Use_Areas
[[nodiscard]] constexpr bool is_pua(uint32_t cp)
{
	// The three ranges are disjoint and ascending, so walk them in order.
	if (cp < 0x00E000) return false;
	if (cp <= 0x00F8FF) return true;
	if (cp < 0x0F0000) return false;
	if (cp <= 0x0FFFFD) return true;
	if (cp < 0x100000) return false;
	return cp <= 0x10FFFD;
}
263
264} // namespace utf8
265
266#endif
constexpr bool is_code_point_valid(uint32_t cp)
Definition utf8_core.hh:69
constexpr uint16_t LEAD_SURROGATE_MAX
Definition utf8_core.hh:50
constexpr uint32_t SURROGATE_OFFSET
Definition utf8_core.hh:54
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition utf8_core.hh:51
constexpr bool is_trail(uint8_t oc)
Definition utf8_core.hh:59
constexpr uint16_t LEAD_OFFSET
Definition utf8_core.hh:53
constexpr uint32_t CODE_POINT_MAX
Definition utf8_core.hh:57
constexpr uint16_t LEAD_SURROGATE_MIN
Definition utf8_core.hh:49
constexpr bool is_surrogate(uint32_t cp)
Definition utf8_core.hh:64
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition utf8_core.hh:100
constexpr unsigned sequence_length(uint8_t lead)
Definition utf8_core.hh:75
constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
The library API - functions intended to be called by the users.
Definition utf8_core.hh:212
constexpr bool is_valid(octet_iterator start, octet_iterator end)
Definition utf8_core.hh:225
constexpr octet_iterator sync_backward(octet_iterator it)
Definition utf8_core.hh:249
constexpr bool is_bom(octet_iterator it)
Definition utf8_core.hh:231
constexpr octet_iterator sync_forward(octet_iterator it)
Definition utf8_core.hh:242
constexpr bool is_pua(uint32_t cp)
Definition utf8_core.hh:257
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition xrange.hh:147
constexpr auto end(const zstring_view &x)