openMSX
utf8_core.hh
Go to the documentation of this file.
1// UTF8-CPP http://utfcpp.sourceforge.net/
2// Slightly simplified (and reformatted) to fit openMSX coding style.
3
4// Copyright 2006 Nemanja Trifunovic
5
6/*
7Permission is hereby granted, free of charge, to any person or organization
8obtaining a copy of the software and accompanying documentation covered by
9this license (the "Software") to use, reproduce, display, distribute,
10execute, and transmit the Software, and to prepare derivative works of the
11Software, and to permit third-parties to whom the Software is furnished to
12do so, all subject to the following:
13
14The copyright notices in the Software and this entire statement, including
15the above license grant, this restriction and the following disclaimer,
16must be included in all copies of the Software, in whole or in part, and
17all derivative works of the Software, unless such copies or derivative
18works are solely in the form of machine-executable object code generated by
19a source language processor.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27DEALINGS IN THE SOFTWARE.
28*/
29
30
31#ifndef UTF8_CORE_HH
32#define UTF8_CORE_HH
33
34#include "one_of.hh"
35#include <iterator>
36#include <cstdint>
37
38namespace utf8 {
39
40// Helper code - not intended to be directly called by the library users.
41// May be changed at any time
42namespace internal {
43
44// Unicode constants
45// Leading (high) surrogates: 0xd800 - 0xdbff
46// Trailing (low) surrogates: 0xdc00 - 0xdfff
constexpr uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
constexpr uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
constexpr uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
constexpr uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
constexpr uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
constexpr uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Maximum valid value for a Unicode code point
constexpr uint32_t CODE_POINT_MAX = 0x0010ffffu;

/// Is 'oc' a UTF-8 trail (continuation) octet, i.e. are its two top
/// bits '10'?
[[nodiscard]] constexpr bool is_trail(uint8_t oc)
{
	return (oc >> 6) == 0x2;
}

/// Does 'cp' lie in the surrogate range [LEAD_SURROGATE_MIN,
/// TRAIL_SURROGATE_MAX]?
///
/// The parameter is 32 bits wide on purpose: with a 16-bit parameter a
/// full code point such as U+1D800 is silently truncated to 0xD800 and
/// wrongly reported as a surrogate. 16-bit callers are unaffected.
[[nodiscard]] constexpr bool is_surrogate(uint32_t cp)
{
	return (cp >= LEAD_SURROGATE_MIN) && (cp <= TRAIL_SURROGATE_MAX);
}

/// Is 'cp' accepted as a code point by this library?
///
/// Rejected are: values above CODE_POINT_MAX, surrogates, and the two
/// values 0xfffe and 0xffff.
[[nodiscard]] constexpr bool is_code_point_valid(uint32_t cp)
{
	if (cp > CODE_POINT_MAX) return false;
	if (is_surrogate(cp)) return false;
	return (cp != 0xfffeu) && (cp != 0xffffu);
}
72
/// Number of octets in the UTF-8 sequence announced by the lead octet
/// 'lead', derived from its high bits:
///   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
/// Any other bit pattern is not a lead octet and yields 0.
[[nodiscard]] constexpr unsigned sequence_length(uint8_t lead)
{
	if ((lead & 0x80) == 0x00) return 1;
	if ((lead & 0xe0) == 0xc0) return 2;
	if ((lead & 0xf0) == 0xe0) return 3;
	if ((lead & 0xf8) == 0xf0) return 4;
	return 0;
}
87
95};
96
97template<typename octet_iterator>
98[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end,
99 uint32_t* code_point)
100{
101 uint32_t cp = *it;
102 // Check the lead octet
103 int length = sequence_length(*it);
104
105 // "Shortcut" for ASCII characters
106 if (length == 1) {
107 if (end - it <= 0) {
108 return NOT_ENOUGH_ROOM;
109 }
110 if (code_point) {
111 *code_point = cp;
112 }
113 ++it;
114 return OK;
115 }
116
117 // Do we have enough memory?
118 if (std::distance(it, end) < length) {
119 return NOT_ENOUGH_ROOM;
120 }
121
122 // Check trail octets and calculate the code point
123 switch (length) {
124 case 0:
125 return INVALID_LEAD;
126 case 2:
127 if (is_trail(*(++it))) {
128 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
129 } else {
130 --it;
131 return INCOMPLETE_SEQUENCE;
132 }
133 break;
134 case 3:
135 if (is_trail(*(++it))) {
136 cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
137 if (is_trail(*(++it))) {
138 cp += (*it) & 0x3f;
139 } else {
140 std::advance(it, -2);
141 return INCOMPLETE_SEQUENCE;
142 }
143 } else {
144 --it;
145 return INCOMPLETE_SEQUENCE;
146 }
147 break;
148 case 4:
149 if (is_trail(*(++it))) {
150 cp = ((cp << 18) & 0x1fffff) + ((*it << 12) & 0x3ffff);
151 if (is_trail(*(++it))) {
152 cp += (*it << 6) & 0xfff;
153 if (is_trail(*(++it))) {
154 cp += (*it) & 0x3f;
155 } else {
156 std::advance(it, -3);
157 return INCOMPLETE_SEQUENCE;
158 }
159 } else {
160 std::advance(it, -2);
161 return INCOMPLETE_SEQUENCE;
162 }
163 } else {
164 --it;
165 return INCOMPLETE_SEQUENCE;
166 }
167 break;
168 }
169 // Is the code point valid?
170 if (!is_code_point_valid(cp)) {
171 repeat(length - 1, [&] { --it; });
172 return INVALID_CODE_POINT;
173 }
174
175 if (code_point) {
176 *code_point = cp;
177 }
178 if (cp < 0x80) {
179 if (length != 1) {
180 std::advance(it, -(length-1));
181 return OVERLONG_SEQUENCE;
182 }
183 } else if (cp < 0x800) {
184 if (length != 2) {
185 std::advance(it, -(length-1));
186 return OVERLONG_SEQUENCE;
187 }
188 } else if (cp < 0x10000) {
189 if (length != 3) {
190 std::advance(it, -(length-1));
191 return OVERLONG_SEQUENCE;
192 }
193 }
194
195 ++it;
196 return OK;
197}
198
199template<typename octet_iterator>
200[[nodiscard]] constexpr utf_error validate_next(octet_iterator& it, octet_iterator end) {
201 return validate_next(it, end, nullptr);
202}
203
204} // namespace internal
205
207
208template<typename octet_iterator>
209[[nodiscard]] constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
210{
211 auto result = start;
212 while (result != end) {
214 if (err_code != internal::OK) {
215 return result;
216 }
217 }
218 return result;
219}
220
/// True when find_invalid() reports no invalid sequence in
/// [start, end), i.e. when it returns 'end'.
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_valid(octet_iterator start, octet_iterator end)
{
	const auto firstBad = find_invalid(start, end);
	return firstBad == end;
}
226
/// Do the octets starting at 'it' form the UTF-8 byte order mark
/// (0xef 0xbb 0xbf)?
///
/// No end iterator is taken: the second octet is only read when the
/// first one matched, and the third only when the second matched, but
/// the caller must make sure those reads are valid.
template<typename octet_iterator>
[[nodiscard]] constexpr bool is_bom(octet_iterator it)
{
	// Byte order mark
	constexpr uint8_t bom[] = {0xef, 0xbb, 0xbf};

	// Compare as uint8_t: with an iterator over signed 'char' the raw
	// value of e.g. 0xef is negative and would never equal bom[0].
	return (static_cast<uint8_t>(*it)   == bom[0]) &&
	       (static_cast<uint8_t>(*++it) == bom[1]) &&
	       (static_cast<uint8_t>(*++it) == bom[2]);
}
237
238template<typename octet_iterator>
239[[nodiscard]] constexpr octet_iterator sync_forward(octet_iterator it)
240{
241 while (internal::is_trail(*it)) ++it;
242 return it;
243}
244
245template<typename octet_iterator>
246[[nodiscard]] constexpr octet_iterator sync_backward(octet_iterator it)
247{
248 while (internal::is_trail(*it)) --it;
249 return it;
250}
251
// Does 'cp' fall in one of the three 'Private Use Area' (PUA) ranges?
//   U+E000   .. U+F8FF
//   U+F0000  .. U+FFFFD
//   U+100000 .. U+10FFFD
// https://en.wikipedia.org/wiki/Private_Use_Areas
[[nodiscard]] constexpr bool is_pua(uint32_t cp)
{
	// Walk the range boundaries in ascending order.
	if (cp <  0x00E000) return false;
	if (cp <= 0x00F8FF) return true;
	if (cp <  0x0F0000) return false;
	if (cp <= 0x0FFFFD) return true;
	if (cp <  0x100000) return false;
	return cp <= 0x10FFFD;
}
260
261} // namespace utf8
262
263#endif
Definition: one_of.hh:7
T length(const vecN< N, T > &x)
Definition: gl_vec.hh:339
constexpr bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:67
constexpr uint16_t LEAD_SURROGATE_MAX
Definition: utf8_core.hh:48
constexpr uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:52
constexpr uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:50
constexpr uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:49
constexpr bool is_trail(uint8_t oc)
Definition: utf8_core.hh:57
constexpr uint16_t LEAD_OFFSET
Definition: utf8_core.hh:51
constexpr uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:55
constexpr bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:62
constexpr uint16_t LEAD_SURROGATE_MIN
Definition: utf8_core.hh:47
constexpr utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:98
constexpr unsigned sequence_length(uint8_t lead)
Definition: utf8_core.hh:73
void advance(octet_iterator &it, distance_type n, octet_iterator end)
constexpr octet_iterator find_invalid(octet_iterator start, octet_iterator end)
The library API - functions intended to be called by the users.
Definition: utf8_core.hh:209
constexpr bool is_valid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:222
constexpr octet_iterator sync_backward(octet_iterator it)
Definition: utf8_core.hh:246
auto distance(octet_iterator first, octet_iterator last)
constexpr bool is_bom(octet_iterator it)
Definition: utf8_core.hh:228
constexpr octet_iterator sync_forward(octet_iterator it)
Definition: utf8_core.hh:239
constexpr bool is_pua(uint32_t cp)
Definition: utf8_core.hh:254
constexpr void repeat(T n, Op op)
Repeat the given operation 'op' 'n' times.
Definition: xrange.hh:148
constexpr auto end(const zstring_view &x)