openMSX
MsxChar2Unicode.cc
Go to the documentation of this file.
1#include "MsxChar2Unicode.hh"
2
3#include "File.hh"
4#include "FileContext.hh"
5#include "FileException.hh"
6#include "MSXException.hh"
7
8#include "StringOp.hh"
9#include "one_of.hh"
10#include "ranges.hh"
11#include "strCat.hh"
12#include "utf8_unchecked.hh"
13#include "xrange.hh"
14
15#include <bit>
16
17namespace openmsx {
18
19MsxChar2Unicode::MsxChar2Unicode(std::string_view mappingName)
20{
21 ranges::fill(msx2unicode, uint32_t(-1));
22
23 std::string filename;
24 try {
25 filename = systemFileContext().resolve(
26 tmpStrCat("unicodemaps/character_set_mappings/", mappingName));
27 } catch (FileException& e) {
28 throw MSXException("Couldn't find MSX character mapping file that was specified in unicodemap: ", mappingName, " (", e.getMessage(), ")");
29 }
30 try {
31 File file(filename);
32 auto buf = file.mmap();
33 parseVid(std::string_view(std::bit_cast<const char*>(buf.data()), buf.size()));
34 } catch (FileException&) {
35 throw MSXException("Couldn't load MSX character mapping file that was specified in unicodemap: ", filename);
36 } catch (MSXException& e) {
37 throw MSXException(e.getMessage(), " in ", filename);
38 }
39}
40
/* Split off the first line of 'text' and return it.
 * A line ends at the first newline character ('LF'); a 'CR' directly in front
 * of that newline is treated as part of the line terminator. The terminator is
 * removed from 'text' but is not part of the returned line. When 'text' holds
 * no newline at all, the whole remainder is returned and 'text' becomes empty.
 */
static constexpr std::string_view getLine(std::string_view& text)
{
	const auto newline = text.find('\n');
	if (newline == std::string_view::npos) {
		// Last line of the text, not terminated by a newline.
		const auto lastLine = text;
		text = std::string_view{};
		return lastLine;
	}

	// Accept both 'LF' and 'CR LF' line endings.
	auto length = newline;
	if ((length != 0) && (text[length - 1] == '\r')) {
		--length;
	}
	const auto line = text.substr(0, length);
	text.remove_prefix(newline + 1);
	return line;
}
56
/* Return 'line' without its trailing comment.
 * A comment starts at the first '#' character and runs till the end of the
 * line. A line without '#' is returned unchanged.
 */
[[nodiscard]] static constexpr std::string_view stripComments(std::string_view line)
{
	// find() returns 'npos' when there is no '#'; substr() clamps that
	// count to the line length, so the whole line is kept in that case.
	return line.substr(0, line.find('#'));
}
68
/* Is the given character a field separator (whitespace)?
 * Only space, tab and carriage-return count. Newline and hash-mark never reach
 * this function: they are already removed by getLine() and stripComments().
 */
[[nodiscard]] static constexpr bool isSep(char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
		return true;
	default:
		return false;
	}
}
76
77/* Remove one token from 'line' and return it.
78 * Tokens are separated by one or more separator character as defined by 'isSep()'.
79 * Those separators are removed from 'line' but not included in the return value.
80 * The assumption is that there are no leading separator characters before calling this function.
81 */
82static constexpr std::string_view getToken(std::string_view& line)
83{
84 size_t s = line.size();
85 size_t i = 0;
86 while ((i < s) && !isSep(line[i])) {
87 ++i;
88 }
89 auto result = line.substr(0, i);
90 while ((i < s) && isSep(line[i])) {
91 ++i;
92 }
93 line.remove_prefix(i);
94 return result;
95}
96
97void MsxChar2Unicode::parseVid(std::string_view file)
98{
99 // The general syntax of this file is
100 // <msx-char> <unicode> # comments
101 // Fields are separated via whitespace (tabs or spaces).
102 // For example:
103 // 0x2A 0x002A # ASTERISK
104
105 // Usually each msx-char only has a single corresponding unicode, and
106 // then 'unicode2msx.size()' will be (close to) 256. But for example
107 // this is not the case in 'MSXVIDAR.TXT'.
108 unicode2msx.reserve(256);
109
110 while (!file.empty()) {
111 auto origLine = getLine(file);
112 auto line = stripComments(origLine);
113
114 auto msxTok = getToken(line);
115 if (msxTok.empty()) continue; // empty line (or only whitespace / comments)
116 auto msx = StringOp::stringTo<uint8_t>(msxTok);
117 if (!msx) {
118 throw MSXException("Invalid msx character value, expected an "
119 "integer in range 0x00..0xff, but got: ", msxTok);
120 }
121
122 auto unicodeTok = getToken(line);
123 if ((unicodeTok.size() >= 5) && (unicodeTok[0] == '<') &&
124 (unicodeTok[3] == '>') && (unicodeTok[4] == '+')) {
125 // In some files the <unicode> field is preceded with an annotation like:
126 // <LR>+0x0020 left-to-right
127 // <RL>+0x0020 right-to-left
128 // <RV>+0x0020 reverse-video
129 // Just strip out that annotation and ignore. Current
130 // implementation assumes the code 'LR', 'RL', 'RV' is
131 // exactly two characters long.
132 unicodeTok.remove_prefix(5);
133 }
134 auto unicode = StringOp::stringTo<uint32_t>(unicodeTok);
135 if (!unicode || *unicode > 0x10ffff) {
136 throw MSXException("Invalid unicode character value, expected an "
137 "integer in range 0..0x10ffff, but got: ", unicodeTok);
138 }
139
140 if (!line.empty()) {
141 throw MSXException("Syntax error, expected \"<msx-char> <unicode>\", "
142 "but got: ", origLine);
143 }
144
145 // There can be duplicates (e.g. in 'MSXVIDAR.TXT'), in that
146 // case only keep the first entry ...
147 if (msx2unicode[*msx] == uint32_t(-1)) {
148 msx2unicode[*msx] = *unicode;
149 }
150 // ... but (for now) keep all unicode->msx mappings (duplicates are removed below).
151 unicode2msx.emplace_back(*unicode, *msx);
152 }
153
154 // Sort on unicode (for later binary-search). If there are duplicate
155 // unicodes (with different msx-code), then keep the first entry (hence
156 // use stable_sort).
157 ranges::stable_sort(unicode2msx, {}, &Entry::unicode);
158 unicode2msx.erase(ranges::unique(unicode2msx, {}, &Entry::unicode), end(unicode2msx));
159}
160
162 std::span<const uint8_t> msx, const std::function<uint32_t(uint8_t)>& fallback) const
163{
164 std::string utf8;
165 utf8.reserve(msx.size()); // possibly underestimation, but that's fine
166 auto out = std::back_inserter(utf8);
167 for (auto m : msx) {
168 auto u = msx2unicode[m];
169 auto u2 = (u != uint32_t(-1)) ? u : fallback(m);
170 out = utf8::unchecked::append(u2, out);
171 }
172 return utf8;
173}
174
175std::vector<uint8_t> MsxChar2Unicode::utf8ToMsx(
176 std::string_view utf8, const std::function<uint8_t(uint32_t)>& fallback) const
177{
178 std::vector<uint8_t> msx;
179 auto it = utf8.begin(), et = utf8.end();
180 while (it != et) {
181 auto u = utf8::unchecked::next(it);
182 auto m = binary_find(unicode2msx, u, {}, &Entry::unicode);
183 msx.push_back(m ? m->msx : fallback(u));
184 }
185 return msx;
186}
187
188std::string MsxChar2Unicode::msxToUtf8(std::span<const uint8_t> msx, char fallback) const
189{
190 return msxToUtf8(msx, [&](uint32_t) { return fallback; });
191}
192
193std::vector<uint8_t> MsxChar2Unicode::utf8ToMsx(std::string_view utf8, char fallback) const
194{
195 return utf8ToMsx(utf8, [&](uint8_t) { return fallback; });
196}
197
198
199} // namespace openmsx
std::string resolve(std::string_view filename) const
std::span< const uint8_t > mmap()
Map file in memory.
Definition File.cc:102
std::vector< uint8_t > utf8ToMsx(std::string_view utf8, const std::function< uint8_t(uint32_t)> &fallback) const
TODO.
MsxChar2Unicode(std::string_view mappingName)
std::string msxToUtf8(std::span< const uint8_t > msx, const std::function< uint32_t(uint8_t)> &fallback) const
TODO.
This file implemented 3 utility functions:
Definition Autofire.cc:11
const FileContext & systemFileContext()
auto unique(ForwardRange &&range)
Definition ranges.hh:224
constexpr void fill(ForwardRange &&range, const T &value)
Definition ranges.hh:315
void stable_sort(RandomAccessRange &&range)
Definition ranges.hh:78
uint32_t next(octet_iterator &it)
octet_iterator append(uint32_t cp, octet_iterator result)
auto * binary_find(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition ranges.hh:448
TemporaryString tmpStrCat(Ts &&... ts)
Definition strCat.hh:742
constexpr auto end(const zstring_view &x)