openMSX
utf8_core.hh
Go to the documentation of this file.
1// UTF8-CPP http://utfcpp.sourceforge.net/
2// Slightly simplified (and reformatted) to fit openMSX coding style.
3
4// Copyright 2006 Nemanja Trifunovic
5
6/*
7Permission is hereby granted, free of charge, to any person or organization
8obtaining a copy of the software and accompanying documentation covered by
9this license (the "Software") to use, reproduce, display, distribute,
10execute, and transmit the Software, and to prepare derivative works of the
11Software, and to permit third-parties to whom the Software is furnished to
12do so, all subject to the following:
13
14The copyright notices in the Software and this entire statement, including
15the above license grant, this restriction and the following disclaimer,
16must be included in all copies of the Software, in whole or in part, and
17all derivative works of the Software, unless such copies or derivative
18works are solely in the form of machine-executable object code generated by
19a source language processor.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27DEALINGS IN THE SOFTWARE.
28*/
29
30
31#ifndef UTF8_CORE_HH
32#define UTF8_CORE_HH
33
34#include "narrow.hh"
35#include "one_of.hh"
36#include <array>
37#include <iterator>
38#include <cstdint>
39
40namespace utf8 {
41
42// Helper code - not intended to be directly called by the library users.
43// May be changed at any time
44namespace internal {
45
// Unicode constants.
// UTF-16 surrogate ranges:
//   leading (high) surrogates:  0xd800 - 0xdbff
//   trailing (low) surrogates:  0xdc00 - 0xdfff
inline constexpr uint16_t LEAD_SURROGATE_MIN{0xd800u};
inline constexpr uint16_t LEAD_SURROGATE_MAX{0xdbffu};
inline constexpr uint16_t TRAIL_SURROGATE_MIN{0xdc00u};
inline constexpr uint16_t TRAIL_SURROGATE_MAX{0xdfffu};
// Derived offsets (presumably consumed by the UTF-16 conversion code that
// lives outside this header - verify against the users).
// LEAD_OFFSET evaluates to 0xd7c0.
inline constexpr uint16_t LEAD_OFFSET{LEAD_SURROGATE_MIN - (0x10000 >> 10)};
// SURROGATE_OFFSET deliberately relies on unsigned wrap-around and
// evaluates to 0xfca02400.
inline constexpr uint32_t SURROGATE_OFFSET{
    0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN};

// Largest value that is still a Unicode code point.
inline constexpr uint32_t CODE_POINT_MAX{0x0010ffffu};
58
// Report whether 'oc' is a UTF-8 trail (continuation) octet, i.e. whether
// its two most significant bits are '10'.
[[nodiscard]] constexpr bool is_trail(uint8_t oc)
{
    constexpr uint8_t TOP_TWO_BITS = 0xc0;
    constexpr uint8_t TRAIL_MARKER = 0x80;
    return (oc & TOP_TWO_BITS) == TRAIL_MARKER;
}
63
64[[nodiscard]] constexpr bool is_surrogate(uint16_t cp)
65{
66 return (cp >= LEAD_SURROGATE_MIN) && (cp <= TRAIL_SURROGATE_MAX);
67}
68
69[[nodiscard]] constexpr bool is_code_point_valid(uint32_t cp)
70{
71 return (cp <= CODE_POINT_MAX) && !is_surrogate(cp) &&
72 (cp != one_of(0xfffeu, 0xffffu));
73}
74
// Derive the total length (in octets) of a UTF-8 sequence from its lead
// octet. Returns 0 when 'lead' does not match any lead-octet bit pattern.
[[nodiscard]] constexpr unsigned sequence_length(uint8_t lead)
{
    if ((lead & 0x80) == 0x00) return 1; // 0xxxxxxx
    if ((lead & 0xe0) == 0xc0) return 2; // 110xxxxx
    if ((lead & 0xf0) == 0xe0) return 3; // 1110xxxx
    if ((lead & 0xf8) == 0xf0) return 4; // 11110xxx
    return 0;
}
89
97};
98
99template<typename octet_iterator>
100[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end,
101 uint32_t* code_point)
102{
103 uint32_t cp = narrow_cast<unsigned char>(*it);
104 // Check the lead octet
105 int length = sequence_length(*it);
106
107 // "Shortcut" for ASCII characters
108 if (length == 1) {
109 if (end - it <= 0) {
110 return NOT_ENOUGH_ROOM;
111 }
112 if (code_point) {
113 *code_point = cp;
114 }
115 ++it;
116 return OK;
117 }
118
119 // Do we have enough memory?
120 if (std::distance(it, end) < length) {
121 return NOT_ENOUGH_ROOM;
122 }
123
124 // Check trail octets and calculate the code point
125 switch (length) {
126 case 0:
127 return INVALID_LEAD;
128 case 2:
129 if (is_trail(*(++it))) {
130 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
131 } else {
132 --it;
133 return INCOMPLETE_SEQUENCE;
134 }
135 break;
136 case 3:
137 if (is_trail(*(++it))) {
138 cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
139 if (is_trail(*(++it))) {
140 cp += (*it) & 0x3f;
141 } else {
142 std::advance(it, -2);
143 return INCOMPLETE_SEQUENCE;
144 }
145 } else {
146 --it;
147 return INCOMPLETE_SEQUENCE;
148 }
149 break;
150 case 4:
151 if (is_trail(*(++it))) {
152 cp = ((cp << 18) & 0x1fffff) + ((*it << 12) & 0x3ffff);
153 if (is_trail(*(++it))) {
154 cp += (*it << 6) & 0xfff;
155 if (is_trail(*(++it))) {
156 cp += (*it) & 0x3f;
157 } else {
158 std::advance(it, -3);
159 return INCOMPLETE_SEQUENCE;
160 }
161 } else {
162 std::advance(it, -2);
163 return INCOMPLETE_SEQUENCE;
164 }
165 } else {
166 --it;
167 return INCOMPLETE_SEQUENCE;
168 }
169 break;
170 }
171 // Is the code point valid?
172 if (!is_code_point_valid(cp)) {
173 repeat(length - 1, [&] { --it; });
174 return INVALID_CODE_POINT;
175 }
176
177 if (code_point) {
178 *code_point = cp;
179 }
180 if (cp < 0x80) {
181 if (length != 1) {
182 std::advance(it, -(length-1));
183 return OVERLONG_SEQUENCE;
184 }
185 } else if (cp < 0x800) {
186 if (length != 2) {
187 std::advance(it, -(length-1));
188 return OVERLONG_SEQUENCE;
189 }
190 } else if (cp < 0x10000) {
191 if (length != 3) {
192 std::advance(it, -(length-1));
193 return OVERLONG_SEQUENCE;
194 }
195 }
196
197 ++it;
198 return OK;
199}
200
201template<typename octet_iterator>
202[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end) {
203 return validate_next(it, end, nullptr);
204}
205
206} // namespace internal
207
209
210template<typename octet_iterator>
211[[nodiscard]] constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
212{
213 auto result = start;
214 while (result != end) {
216 if (err_code != internal::OK) {
217 return result;
218 }
219 }
220 return result;
221}
222
// Report whether find_invalid() finds no offending octet in [start, end).
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_valid(octet_iterator start, octet_iterator end)
{
    const auto first_bad = find_invalid(start, end);
    return first_bad == end;
}
228
// Report whether the three octets starting at 'it' are the UTF-8 byte
// order mark (0xef 0xbb 0xbf).
//
// There is no end iterator: the caller must guarantee the octets are
// readable. Reading stops at the first octet that does not match.
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_bom(octet_iterator it)
{
    // Byte order mark
    constexpr std::array<uint8_t, 3> bom = {0xef, 0xbb, 0xbf};

    // Convert every octet to uint8_t before comparing. Where 'char' is
    // signed, 0xef is stored as a negative value that would never compare
    // equal to 239 after integral promotion.
    if (static_cast<uint8_t>(*it) != bom[0]) {
        return false;
    }
    ++it;
    if (static_cast<uint8_t>(*it) != bom[1]) {
        return false;
    }
    ++it;
    return static_cast<uint8_t>(*it) == bom[2];
}
239
240template<typename octet_iterator>
241[[nodiscard]] constexpr octet_iterator sync_forward(octet_iterator it)
242{
243 while (internal::is_trail(*it)) ++it;
244 return it;
245}
246
247template<typename octet_iterator>
248[[nodiscard]] constexpr octet_iterator sync_backward(octet_iterator it)
249{
250 while (internal::is_trail(*it)) --it;
251 return it;
252}
253
// Does 'cp' fall inside one of the three 'Private Use Area' (PUA) ranges?
// https://en.wikipedia.org/wiki/Private_Use_Areas
[[nodiscard]] constexpr bool is_pua(uint32_t cp)
{
    if ((cp >= 0x00E000) && (cp <= 0x00F8FF)) {
        return true;
    }
    if ((cp >= 0x0F0000) && (cp <= 0x0FFFFD)) {
        return true;
    }
    return (cp >= 0x100000) && (cp <= 0x10FFFD);
}
262
263} // namespace utf8
264
265#endif
Definition: one_of.hh:7
T length(const vecN< N, T > &x)
Definition: gl_vec.hh:340
constexpr bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:69
constexpr uint16_t LEAD_SURROGATE_MAX
Definition: utf8_core.hh:50
constexpr uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:54
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:51
constexpr bool is_trail(uint8_t oc)
Definition: utf8_core.hh:59
constexpr uint16_t LEAD_OFFSET
Definition: utf8_core.hh:53
constexpr uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:57
constexpr bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:64
constexpr uint16_t LEAD_SURROGATE_MIN
Definition: utf8_core.hh:49
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:100
constexpr unsigned sequence_length(uint8_t lead)
Definition: utf8_core.hh:75
void advance(octet_iterator &it, distance_type n, octet_iterator end)
constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
The library API - functions intended to be called by the users.
Definition: utf8_core.hh:211
constexpr bool is_valid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:224
constexpr octet_iterator sync_backward(octet_iterator it)
Definition: utf8_core.hh:248
auto distance(octet_iterator first, octet_iterator last)
constexpr bool is_bom(octet_iterator it)
Definition: utf8_core.hh:230
constexpr octet_iterator sync_forward(octet_iterator it)
Definition: utf8_core.hh:241
constexpr bool is_pua(uint32_t cp)
Definition: utf8_core.hh:256
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition: xrange.hh:148
constexpr auto end(const zstring_view &x)