openMSX
MsxChar2Unicode.cc
Go to the documentation of this file.
1 #include "MsxChar2Unicode.hh"
2 #include "File.hh"
3 #include "FileContext.hh"
4 #include "FileException.hh"
5 #include "MSXException.hh"
6 #include "StringOp.hh"
7 #include "one_of.hh"
8 #include "ranges.hh"
9 #include "strCat.hh"
10 #include "utf8_unchecked.hh"
11 #include "xrange.hh"
12 
13 namespace openmsx {
14 
15 MsxChar2Unicode::MsxChar2Unicode(std::string_view mappingName)
16 {
17  ranges::fill(msx2unicode, uint32_t(-1));
18 
19  std::string filename;
20  try {
22  tmpStrCat("unicodemaps/character_set_mappings/", mappingName));
23  } catch (FileException& e) {
24  throw MSXException("Couldn't find MSX character mapping file that was specified in unicodemap: ", mappingName, " (", e.getMessage(), ")");
25  }
26  try {
27  File file(filename);
28  auto buf = file.mmap();
29  parseVid(std::string_view(reinterpret_cast<const char*>(buf.data()), buf.size()));
30  } catch (FileException&) {
31  throw MSXException("Couldn't load MSX character mapping file that was specified in unicodemap: ", filename);
32  } catch (MSXException& e) {
33  throw MSXException(e.getMessage(), " in ", filename);
34  }
35 }
36 
/* Split off the first line of 'text' and return it.
 * A line is everything up to the first newline character. That newline (and
 * a '\r' directly in front of it) is consumed from 'text', but is not part of
 * the returned line. If there is no newline, all remaining text is returned
 * and 'text' becomes empty.
 */
static constexpr std::string_view getLine(std::string_view& text)
{
	const auto newline = text.find('\n');
	if (newline == std::string_view::npos) {
		// last line, not terminated by a newline
		return std::exchange(text, {});
	}

	// accept both 'LF' and 'CR LF' line endings
	auto lineLen = newline;
	if ((newline != 0) && (text[newline - 1] == '\r')) {
		--lineLen;
	}

	auto firstLine = text.substr(0, lineLen);
	text.remove_prefix(newline + 1);
	return firstLine;
}
52 
/* Return 'line' without its trailing comment (if any).
 * A comment starts at the first '#' character and extends to the end of the
 * line. A line without '#' is returned unchanged.
 */
[[nodiscard]] static constexpr std::string_view stripComments(std::string_view line)
{
	// When there is no '#', find() gives 'npos' and substr() then simply
	// keeps the whole line.
	return line.substr(0, line.find('#'));
}
64 
/* Is the given character a separator, that is: a space, a tab or a
 * carriage-return?
 * Newline and hash-mark never reach this function, other functions have
 * already removed them.
 */
[[nodiscard]] static constexpr bool isSep(char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
		return true; // whitespace
	default:
		return false;
	}
}
72 
73 /* Remove one token from 'line' and return it.
74  * Tokens are separated by one or more separator character as defined by 'isSep()'.
75  * Those separators are removed from 'line' but not included in the return value.
76  * The assumption is that there are no leading separator characters before calling this function.
77  */
78 static constexpr std::string_view getToken(std::string_view& line)
79 {
80  size_t s = line.size();
81  size_t i = 0;
82  while ((i < s) && !isSep(line[i])) {
83  ++i;
84  }
85  auto result = line.substr(0, i);
86  while ((i < s) && isSep(line[i])) {
87  ++i;
88  }
89  line.remove_prefix(i);
90  return result;
91 }
92 
93 void MsxChar2Unicode::parseVid(std::string_view file)
94 {
95  // The general syntax of this file is
96  // <msx-char> <unicode> # comments
97  // Fields are separated via whitespace (tabs or spaces).
98  // For example:
99  // 0x2A 0x002A # ASTERISK
100 
101  // Usually each msx-char only has a single corresponding unicode, and
102  // then 'unicode2msx.size()' will be (close to) 256. But for example
103  // this is not the case in 'MSXVIDAR.TXT'.
104  unicode2msx.reserve(256);
105 
106  while (!file.empty()) {
107  auto origLine = getLine(file);
108  auto line = stripComments(origLine);
109 
110  auto msxTok = getToken(line);
111  if (msxTok.empty()) continue; // empty line (or only whitespace / comments)
112  auto msx = StringOp::stringTo<uint8_t>(msxTok);
113  if (!msx) {
114  throw MSXException("Invalid msx character value, expected an "
115  "integer in range 0x00..0xff, but got: ", msxTok);
116  }
117 
118  auto unicodeTok = getToken(line);
119  if ((unicodeTok.size() >= 5) && (unicodeTok[0] == '<') &&
120  (unicodeTok[3] == '>') && (unicodeTok[4] == '+')) {
121  // In some files the <unicode> field is preceded with an annotation like:
122  // <LR>+0x0020 left-to-right
123  // <RL>+0x0020 right-to-left
124  // <RV>+0x0020 reverse-video
125  // Just strip out that annotation and ignore. Current
126  // implementation assumes the code 'LR', 'RL', 'RV' is
127  // exactly two characters long.
128  unicodeTok.remove_prefix(5);
129  }
130  auto unicode = StringOp::stringTo<uint32_t>(unicodeTok);
131  if (!unicode || *unicode > 0x10ffff) {
132  throw MSXException("Invalid unicode character value, expected an "
133  "integer in range 0..0x10ffff, but got: ", unicodeTok);
134  }
135 
136  if (!line.empty()) {
137  throw MSXException("Syntax error, expected \"<msx-char> <unicode>\", "
138  "but got: ", origLine);
139  }
140 
141  // There can be duplicates (e.g. in 'MSXVIDAR.TXT'), in that
142  // case only keep the first entry ...
143  if (msx2unicode[*msx] == uint32_t(-1)) {
144  msx2unicode[*msx] = *unicode;
145  }
146  // ... but (for now) keep all unicode->msx mappings (duplicates are removed below).
147  unicode2msx.emplace_back(*unicode, *msx);
148  }
149 
150  // Sort on unicode (for later binary-search). If there are duplicate
151  // unicodes (with different msx-code), then keep the first entry (hence
152  // use stable_sort).
153  ranges::stable_sort(unicode2msx, {}, &Entry::unicode);
154  unicode2msx.erase(ranges::unique(unicode2msx, {}, &Entry::unicode), end(unicode2msx));
155 }
156 
158  std::span<const uint8_t> msx, std::function<uint32_t(uint8_t)> fallback) const
159 {
160  std::string utf8;
161  utf8.reserve(msx.size()); // possibly underestimation, but that's fine
162  auto out = std::back_inserter(utf8);
163  for (auto m : msx) {
164  auto u = msx2unicode[m];
165  auto u2 = (u != uint32_t(-1)) ? u : fallback(m);
166  out = utf8::unchecked::append(u2, out);
167  }
168  return utf8;
169 }
170 
171 std::vector<uint8_t> MsxChar2Unicode::utf8ToMsx(
172  std::string_view utf8, std::function<uint8_t(uint32_t)> fallback) const
173 {
174  std::vector<uint8_t> msx;
175  auto it = utf8.begin(), et = utf8.end();
176  while (it != et) {
177  auto u = utf8::unchecked::next(it);
178  auto i = ranges::lower_bound(unicode2msx, u, {}, &Entry::unicode);
179  auto m = ((i != end(unicode2msx)) && (i->unicode == u))
180  ? i->msx
181  : fallback(u);
182  msx.push_back(m);
183  }
184  return msx;
185 }
186 
187 } // namespace openmsx
Definition: one_of.hh:7
std::string resolve(std::string_view filename) const
Definition: FileContext.cc:79
std::span< const uint8_t > mmap()
Map file in memory.
Definition: File.cc:101
std::string msxToUtf8(std::span< const uint8_t > msx, std::function< uint32_t(uint8_t)> fallback) const
TODO.
std::vector< uint8_t > utf8ToMsx(std::string_view utf8, std::function< uint8_t(uint32_t)> fallback) const
TODO.
MsxChar2Unicode(std::string_view mappingName)
constexpr double e
Definition: Math.hh:18
This file implements 3 utility functions:
Definition: Autofire.cc:9
const FileContext & systemFileContext()
Definition: FileContext.cc:155
constexpr const char *const filename
auto unique(ForwardRange &&range)
Definition: ranges.hh:181
constexpr void fill(ForwardRange &&range, const T &value)
Definition: ranges.hh:256
void stable_sort(RandomAccessRange &&range)
Definition: ranges.hh:60
auto lower_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:99
uint32_t next(octet_iterator &it)
octet_iterator append(uint32_t cp, octet_iterator result)
TemporaryString tmpStrCat(Ts &&... ts)
Definition: strCat.hh:617
constexpr auto end(const zstring_view &x)