openMSX
MsxChar2Unicode.cc
Go to the documentation of this file.
1#include "MsxChar2Unicode.hh"
2#include "File.hh"
3#include "FileContext.hh"
4#include "FileException.hh"
5#include "MSXException.hh"
6#include "StringOp.hh"
7#include "one_of.hh"
8#include "ranges.hh"
9#include "strCat.hh"
10#include "utf8_unchecked.hh"
11#include "xrange.hh"
12
13namespace openmsx {
14
15MsxChar2Unicode::MsxChar2Unicode(std::string_view mappingName)
16{
17 ranges::fill(msx2unicode, uint32_t(-1));
18
19 std::string filename;
20 try {
21 filename = systemFileContext().resolve(
22 tmpStrCat("unicodemaps/character_set_mappings/", mappingName));
23 } catch (FileException& e) {
24 throw MSXException("Couldn't find MSX character mapping file that was specified in unicodemap: ", mappingName, " (", e.getMessage(), ")");
25 }
26 try {
27 File file(filename);
28 auto buf = file.mmap();
29 parseVid(std::string_view(reinterpret_cast<const char*>(buf.data()), buf.size()));
30 } catch (FileException&) {
31 throw MSXException("Couldn't load MSX character mapping file that was specified in unicodemap: ", filename);
32 } catch (MSXException& e) {
33 throw MSXException(e.getMessage(), " in ", filename);
34 }
35}
36
/* Remove the next line from 'text' and return it.
 * A line is everything up to the first newline character. That newline (and
 * a directly preceding carriage return, if any) is removed from 'text' but
 * not included in the return value.
 * When 'text' contains no newline, the whole remainder is returned and 'text'
 * becomes empty.
 */
[[nodiscard]] static constexpr std::string_view getLine(std::string_view& text)
{
	const auto pos = text.find('\n');
	if (pos == std::string_view::npos) {
		// No newline left: the remainder is the last line.
		const auto result = text;
		text = std::string_view{};
		return result;
	}
	// handle both 'LF' and 'CR LF'
	const auto len = ((pos != 0) && (text[pos - 1] == '\r')) ? pos - 1 : pos;
	const auto result = text.substr(0, len);
	text.remove_prefix(pos + 1); // also drops the newline itself
	return result;
}
52
/* Return the given line with comments at the end of the line removed.
 * Comments start at the first '#' character and continue till the end of the
 * line. A line without '#' is returned unchanged.
 */
[[nodiscard]] static constexpr std::string_view stripComments(std::string_view line)
{
	// When there is no '#', find() returns npos and substr() then yields
	// the complete line.
	return line.substr(0, line.find('#'));
}
64
65/* Returns true iff the given character is a separator (whitespace).
66 * Newline and hash-mark are handled (already removed) by other functions.
67 */
68[[nodiscard]] static constexpr bool isSep(char c)
69{
70 return c == one_of(' ', '\t', '\r'); // whitespace
71}
72
73/* Remove one token from 'line' and return it.
74 * Tokens are separated by one or more separator character as defined by 'isSep()'.
75 * Those separators are removed from 'line' but not included in the return value.
76 * The assumption is that there are no leading separator characters before calling this function.
77 */
78static constexpr std::string_view getToken(std::string_view& line)
79{
80 size_t s = line.size();
81 size_t i = 0;
82 while ((i < s) && !isSep(line[i])) {
83 ++i;
84 }
85 auto result = line.substr(0, i);
86 while ((i < s) && isSep(line[i])) {
87 ++i;
88 }
89 line.remove_prefix(i);
90 return result;
91}
92
93void MsxChar2Unicode::parseVid(std::string_view file)
94{
95 // The general syntax of this file is
96 // <msx-char> <unicode> # comments
97 // Fields are separated via whitespace (tabs or spaces).
98 // For example:
99 // 0x2A 0x002A # ASTERISK
100
101 // Usually each msx-char only has a single corresponding unicode, and
102 // then 'unicode2msx.size()' will be (close to) 256. But for example
103 // this is not the case in 'MSXVIDAR.TXT'.
104 unicode2msx.reserve(256);
105
106 while (!file.empty()) {
107 auto origLine = getLine(file);
108 auto line = stripComments(origLine);
109
110 auto msxTok = getToken(line);
111 if (msxTok.empty()) continue; // empty line (or only whitespace / comments)
112 auto msx = StringOp::stringTo<uint8_t>(msxTok);
113 if (!msx) {
114 throw MSXException("Invalid msx character value, expected an "
115 "integer in range 0x00..0xff, but got: ", msxTok);
116 }
117
118 auto unicodeTok = getToken(line);
119 if ((unicodeTok.size() >= 5) && (unicodeTok[0] == '<') &&
120 (unicodeTok[3] == '>') && (unicodeTok[4] == '+')) {
121 // In some files the <unicode> field is preceded with an annotation like:
122 // <LR>+0x0020 left-to-right
123 // <RL>+0x0020 right-to-left
124 // <RV>+0x0020 reverse-video
125 // Just strip out that annotation and ignore. Current
126 // implementation assumes the code 'LR', 'RL', 'RV' is
127 // exactly two characters long.
128 unicodeTok.remove_prefix(5);
129 }
130 auto unicode = StringOp::stringTo<uint32_t>(unicodeTok);
131 if (!unicode || *unicode > 0x10ffff) {
132 throw MSXException("Invalid unicode character value, expected an "
133 "integer in range 0..0x10ffff, but got: ", unicodeTok);
134 }
135
136 if (!line.empty()) {
137 throw MSXException("Syntax error, expected \"<msx-char> <unicode>\", "
138 "but got: ", origLine);
139 }
140
141 // There can be duplicates (e.g. in 'MSXVIDAR.TXT'), in that
142 // case only keep the first entry ...
143 if (msx2unicode[*msx] == uint32_t(-1)) {
144 msx2unicode[*msx] = *unicode;
145 }
146 // ... but (for now) keep all unicode->msx mappings (duplicates are removed below).
147 unicode2msx.emplace_back(*unicode, *msx);
148 }
149
150 // Sort on unicode (for later binary-search). If there are duplicate
151 // unicodes (with different msx-code), then keep the first entry (hence
152 // use stable_sort).
153 ranges::stable_sort(unicode2msx, {}, &Entry::unicode);
154 unicode2msx.erase(ranges::unique(unicode2msx, {}, &Entry::unicode), end(unicode2msx));
155}
156
158 std::span<const uint8_t> msx, const std::function<uint32_t(uint8_t)>& fallback) const
159{
160 std::string utf8;
161 utf8.reserve(msx.size()); // possibly underestimation, but that's fine
162 auto out = std::back_inserter(utf8);
163 for (auto m : msx) {
164 auto u = msx2unicode[m];
165 auto u2 = (u != uint32_t(-1)) ? u : fallback(m);
166 out = utf8::unchecked::append(u2, out);
167 }
168 return utf8;
169}
170
171std::vector<uint8_t> MsxChar2Unicode::utf8ToMsx(
172 std::string_view utf8, const std::function<uint8_t(uint32_t)>& fallback) const
173{
174 std::vector<uint8_t> msx;
175 auto it = utf8.begin(), et = utf8.end();
176 while (it != et) {
177 auto u = utf8::unchecked::next(it);
178 auto m = binary_find(unicode2msx, u, {}, &Entry::unicode);
179 msx.push_back(m ? m->msx : fallback(u));
180 }
181 return msx;
182}
183
184std::string MsxChar2Unicode::msxToUtf8(std::span<const uint8_t> msx, char fallback) const
185{
186 return msxToUtf8(msx, [&](uint32_t) { return fallback; });
187}
188
189std::vector<uint8_t> MsxChar2Unicode::utf8ToMsx(std::string_view utf8, char fallback) const
190{
191 return utf8ToMsx(utf8, [&](uint8_t) { return fallback; });
192}
193
194
195} // namespace openmsx
Definition: one_of.hh:7
std::string resolve(std::string_view filename) const
Definition: FileContext.cc:79
std::span< const uint8_t > mmap()
Map file in memory.
Definition: File.cc:102
std::vector< uint8_t > utf8ToMsx(std::string_view utf8, const std::function< uint8_t(uint32_t)> &fallback) const
TODO.
MsxChar2Unicode(std::string_view mappingName)
std::string msxToUtf8(std::span< const uint8_t > msx, const std::function< uint32_t(uint8_t)> &fallback) const
TODO.
constexpr double e
Definition: Math.hh:21
This file implemented 3 utility functions:
Definition: Autofire.cc:9
const FileContext & systemFileContext()
Definition: FileContext.cc:155
auto unique(ForwardRange &&range)
Definition: ranges.hh:222
constexpr void fill(ForwardRange &&range, const T &value)
Definition: ranges.hh:305
void stable_sort(RandomAccessRange &&range)
Definition: ranges.hh:76
uint32_t next(octet_iterator &it)
octet_iterator append(uint32_t cp, octet_iterator result)
auto * binary_find(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:438
TemporaryString tmpStrCat(Ts &&... ts)
Definition: strCat.hh:693
constexpr auto end(const zstring_view &x)