openMSX
FilePoolCore.cc
Go to the documentation of this file.
1 #include "FilePoolCore.hh"
2 #include "File.hh"
3 #include "FileException.hh"
4 #include "foreach_file.hh"
5 #include "Date.hh"
6 #include "Timer.hh"
7 #include "one_of.hh"
8 #include "ranges.hh"
9 #include <fstream>
10 #include <optional>
11 #include <tuple>
12 
13 using std::string;
14 
15 namespace openmsx {
16 
17 class CompareSha1 {
18 public:
19  CompareSha1(const FilePoolCore::Pool& pool_) : pool(pool_) {}
20  template<typename X, typename Y>
21  [[nodiscard]] bool operator()(const X& x, const Y& y) const {
22  return get(x) < get(y);
23  }
24 private:
25  [[nodiscard]] const Sha1Sum& get(const Sha1Sum& sum) const { return sum; }
26  [[nodiscard]] const Sha1Sum& get(FilePoolCore::Index idx) const { return pool[idx].sum; }
27 private:
28  const FilePoolCore::Pool& pool;
29 };
30 
31 
32 FilePoolCore::FilePoolCore(string filecache_,
33  std::function<Directories()> getDirectories_,
34  std::function<void(const string&)> reportProgress_)
35  : filecache(std::move(filecache_))
36  , getDirectories(getDirectories_)
37  , reportProgress(reportProgress_)
38 {
39  try {
40  readSha1sums();
41  } catch (MSXException&) {
42  // ignore, probably .filecache doesn't exist yet
43  }
44 }
45 
47 {
48  if (needWrite) {
49  writeSha1sums();
50  }
51 }
52 
53 void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const string& filename)
54 {
55  stringBuffer.push_back(filename);
56  auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
57  auto it = ranges::upper_bound(sha1Index, sum, CompareSha1(pool));
58  sha1Index.insert(it, idx);
59  filenameIndex.insert(idx);
60  needWrite = true;
61 }
62 
63 FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, Entry& entry)
64 {
65  // There can be multiple entries for the same sha1, look for the specific one.
66  for (auto [b, e] = ranges::equal_range(sha1Index, entry.sum, CompareSha1(pool)); b != e; ++b) {
67  if (*b == idx) {
68  return b;
69  }
70  }
71  assert(false); return sha1Index.end();
72 }
73 
74 void FilePoolCore::remove(Sha1Index::iterator it)
75 {
76  auto idx = *it;
77  filenameIndex.erase(idx);
78  pool.remove(idx);
79  sha1Index.erase(it);
80  needWrite = true;
81 }
82 
83 void FilePoolCore::remove(Index idx, Entry& entry)
84 {
85  remove(getSha1Iterator(idx, entry));
86 }
87 
88 void FilePoolCore::remove(Index idx)
89 {
90  remove(idx, pool[idx]);
91 }
92 
93 // Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
94 // re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
95 // Internally this method doesn't actually sort, it merely rotates the
96 // elements.
97 // Returns false if the new position is before (or at) the old position.
98 // Returns true if the new position is after the old position.
99 bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
100 {
101  needWrite = true;
102  auto newIt = ranges::upper_bound(sha1Index, newSum, CompareSha1(pool));
103  entry.sum = newSum; // update sum
104  if (newIt > it) {
105  // move to back
106  rotate(it, it + 1, newIt);
107  return true;
108  } else {
109  if (newIt < it) {
110  // move to front
111  rotate(newIt, it, it + 1);
112  } else {
113  // (unlikely) sha1sum has changed, but after
114  // resorting item would remain in the same
115  // position
116  }
117  return false;
118  }
119 }
120 
121 bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
122 {
123  return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
124 }
125 
126 time_t FilePoolCore::Entry::getTime()
127 {
128  if (time == Date::INVALID_TIME_T) {
129  time = Date::fromString(timeStr);
130  }
131  return time;
132 }
133 
134 void FilePoolCore::Entry::setTime(time_t t)
135 {
136  time = t;
137  timeStr = nullptr;
138 }
139 
140 // returns: <sha1, time-string, filename>
141 static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
142  char* line, char* line_end)
143 {
144  if ((line_end - line) <= 68) return {}; // minumum length (only filename is variable)
145 
146  // only perform quick sanity check on date/time format
147  if (line[40] != ' ') return {}; // two space between sha1sum and date
148  if (line[41] != ' ') return {};
149  if (line[45] != ' ') return {}; // space between day-of-week and month
150  if (line[49] != ' ') return {}; // space between month and day of month
151  if (line[52] != ' ') return {}; // space between day of month and hour
152  if (line[55] != ':') return {}; // colon between hour and minutes
153  if (line[58] != ':') return {}; // colon between minutes and seconds
154  if (line[61] != ' ') return {}; // space between seconds and year
155  if (line[66] != ' ') return {}; // two spaces between date and filename
156  if (line[67] != ' ') return {};
157 
158  Sha1Sum sha1(Sha1Sum::UninitializedTag{});
159  try {
160  sha1.parse40(line);
161  } catch (MSXException&) {
162  return {};
163  }
164 
165  const char* timeStr = line + 42; // not guaranteed to be a correct date/time
166  line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
167 
168  std::string_view filename(line + 68, line_end - (line + 68)); // TODO c++20 [begin, end)
169 
170  return std::tuple{sha1, timeStr, filename};
171 }
172 
173 void FilePoolCore::readSha1sums()
174 {
175  assert(sha1Index.empty());
176  assert(fileMem.empty());
177 
178  File file(filecache);
179  auto size = file.getSize();
180  fileMem.resize(size + 1);
181  file.read(fileMem.data(), size);
182  fileMem[size] = '\n'; // ensure there's always a '\n' at the end
183 
184  // Process each line.
185  // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
186  char* data = fileMem.data();
187  char* data_end = data + size + 1;
188  while (data != data_end) {
189  // memchr() seems better optimized than std::find_if()
190  char* it = static_cast<char*>(memchr(data, '\n', data_end - data));
191  if (it == nullptr) it = data_end;
192  if ((it != data) && (it[-1] == '\r')) --it;
193 
194  if (auto r = parse(data, it)) {
195  auto [sum, timeStr, filename] = *r;
196  sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
197  // sha1Index not yet guaranteed sorted
198  }
199 
200  data = std::find_if(it + 1, data_end, [](char c) {
201  return c != one_of('\n', '\r');
202  });
203  }
204 
205  if (!ranges::is_sorted(sha1Index, CompareSha1(pool))) {
206  // This should _rarely_ happen. In fact it should only happen
207  // when .filecache was manually edited. Though because it's
208  // very important that pool is indeed sorted I've added this
209  // safety mechanism.
210  ranges::sort(sha1Index, CompareSha1(pool));
211  }
212 
213  // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
214  auto n = sha1Index.size();
215  filenameIndex.reserve(n);
216  while (n != 0) { // sha1Index might change while iterating ...
217  --n;
218  Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
219  bool inserted = filenameIndex.insert(idx);
220  if (!inserted) {
221  // ERROR: there may not be multiple entries for the same filename
222  // Should only happen when .filecache was manually edited.
223  remove(idx);
224  }
225  }
226 }
227 
228 void FilePoolCore::writeSha1sums()
229 {
230  std::ofstream file;
231  FileOperations::openofstream(file, filecache);
232  if (!file.is_open()) {
233  return;
234  }
235  for (auto idx : sha1Index) {
236  const auto& entry = pool[idx];
237  file << entry.sum.toString() << " ";
238  if (entry.timeStr) {
239  file << entry.timeStr;
240  } else {
241  assert(entry.time != Date::INVALID_TIME_T);
242  file << Date::toString(entry.time);
243  }
244  file << " " << entry.filename << '\n';
245  }
246 }
247 
248 File FilePoolCore::getFile(FileType fileType, const Sha1Sum& sha1sum)
249 {
250  File result = getFromPool(sha1sum);
251  if (result.is_open()) return result;
252 
253  // not found in cache, need to scan directories
254  stop = false;
255  ScanProgress progress;
256  progress.lastTime = Timer::getTime();
257  progress.amountScanned = 0;
258 
259  for (auto& [path, types] : getDirectories()) {
260  if ((types & fileType) != FileType::NONE) {
261  result = scanDirectory(sha1sum, FileOperations::expandTilde(path), path, progress);
262  if (result.is_open()) return result;
263  }
264  }
265 
266  return result; // not found
267 }
268 
269 Sha1Sum FilePoolCore::calcSha1sum(File& file)
270 {
271  // Calculate sha1 in several steps so that we can show progress
272  // information. We take a fixed step size for an efficient calculation.
273  constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
274 
275  auto data = file.mmap();
276 
277  SHA1 sha1;
278  size_t size = data.size();
279  size_t done = 0;
280  size_t remaining = size;
281  auto lastShowedProgress = Timer::getTime();
282  bool everShowedProgress = false;
283 
284  auto report = [&](size_t percentage) {
285  reportProgress(strCat("Calculating SHA1 sum for ", file.getOriginalName(),
286  "... ", percentage, '%'));
287  };
288  // Loop over all-but-the last blocks. For small files this loop is skipped.
289  while (remaining > STEP_SIZE) {
290  sha1.update(&data[done], STEP_SIZE);
291  done += STEP_SIZE;
292  remaining -= STEP_SIZE;
293 
294  auto now = Timer::getTime();
295  if ((now - lastShowedProgress) > 250'000) { // 4Hz
296  report((100 * done) / size);
297  lastShowedProgress = now;
298  everShowedProgress = true;
299  }
300  }
301  // last block
302  if (remaining) {
303  sha1.update(&data[done], remaining);
304  }
305  if (everShowedProgress) {
306  report(100);
307  }
308  return sha1.digest();
309 }
310 
311 File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
312 {
313  auto [b, e] = ranges::equal_range(sha1Index, sha1sum, CompareSha1(pool));
314  // use indices instead of iterators
315  auto i = distance(begin(sha1Index), b);
316  auto last = distance(begin(sha1Index), e);
317  while (i != last) {
318  auto it = begin(sha1Index) + i;
319  auto& entry = pool[*it];
320  if (entry.getTime() == Date::INVALID_TIME_T) {
321  // Invalid time/date format. Remove from
322  // database and continue searching.
323  remove(it);
324  --last;
325  continue;
326  }
327  try {
328  File file(entry.filename);
329  auto newTime = file.getModificationDate();
330  if (entry.getTime() == newTime) {
331  // When modification time is unchanged, assume
332  // sha1sum is also unchanged. So avoid
333  // expensive sha1sum calculation.
334  return file;
335  }
336  entry.setTime(newTime); // update timestamp
337  needWrite = true;
338  auto newSum = calcSha1sum(file);
339  if (newSum == sha1sum) {
340  // Modification time was changed, but
341  // (recalculated) sha1sum is still the same.
342  return file;
343  }
344  // Sha1sum has changed: update sha1sum, move entry to
345  // new position new sum and continue searching.
346  if (adjustSha1(it, entry, newSum)) {
347  // after
348  --last; // no ++i
349  } else {
350  // before (or at)
351  ++i;
352  }
353  } catch (FileException&) {
354  // Error reading file: remove from db and continue
355  // searching.
356  remove(it);
357  --last;
358  }
359  }
360  return File(); // not found
361 }
362 
363 File FilePoolCore::scanDirectory(
364  const Sha1Sum& sha1sum, const string& directory, const string& poolPath,
365  ScanProgress& progress)
366 {
367  File result;
368  auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
369  if (stop) {
370  // Scanning can take a long time. Allow to exit
371  // openmsx when it takes too long. Stop scanning
372  // by pretending we didn't find the file.
373  assert(!result.is_open());
374  return false; // abort foreach_file_recursive
375  }
376  result = scanFile(sha1sum, path, st, poolPath, progress);
377  return !result.is_open(); // abort traversal when found
378  };
379  foreach_file_recursive(directory, fileAction);
380  return result;
381 }
382 
383 File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const string& filename,
384  const FileOperations::Stat& st, const string& poolPath,
385  ScanProgress& progress)
386 {
387  ++progress.amountScanned;
388  // Periodically send a progress message with the current filename
389  auto now = Timer::getTime();
390  if (now > (progress.lastTime + 250'000)) { // 4Hz
391  progress.lastTime = now;
392  reportProgress(strCat(
393  "Searching for file with sha1sum ", sha1sum.toString(),
394  "...\nIndexing filepool ", poolPath, ": [",
395  progress.amountScanned, "]: ",
396  std::string_view(filename).substr(poolPath.size())));
397  }
398 
399  auto time = FileOperations::getModificationDate(st);
400  if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
401  // not in pool
402  try {
403  File file(filename);
404  auto sum = calcSha1sum(file);
405  insert(sum, time, filename);
406  if (sum == sha1sum) {
407  return file;
408  }
409  } catch (FileException&) {
410  // ignore
411  }
412  } else {
413  // already in pool
414  assert(filename == entry->filename);
415  try {
416  if (entry->getTime() == time) {
417  // db is still up to date
418  if (entry->sum == sha1sum) {
419  return File(filename);
420  }
421  } else {
422  // db outdated
423  File file(filename);
424  auto sum = calcSha1sum(file);
425  entry->setTime(time);
426  adjustSha1(idx, *entry, sum);
427  if (sum == sha1sum) {
428  return file;
429  }
430  }
431  } catch (FileException&) {
432  // error reading file, remove from db
433  remove(idx, *entry);
434  }
435  }
436  return File(); // not found
437 }
438 
439 std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
440 {
441  auto it = filenameIndex.find(filename);
442  if (!it) return {Index(-1), nullptr};
443 
444  Index idx = *it;
445  auto& entry = pool[idx];
446  assert(entry.filename == filename);
447  if (entry.getTime() == Date::INVALID_TIME_T) {
448  // invalid date/time string, remove from db
449  remove(idx, entry);
450  return {Index(-1), nullptr};
451  }
452  return {idx, &entry};
453 }
454 
455 Sha1Sum FilePoolCore::getSha1Sum(File& file)
456 {
457  auto time = file.getModificationDate();
458  const auto& filename = file.getURL();
459 
460  auto [idx, entry] = findInDatabase(filename);
461  if (idx != Index(-1)) {
462  if (entry->getTime() == time) {
463  // in database and modification time matches,
464  // assume sha1sum also matches
465  return entry->sum;
466  }
467  }
468 
469  // not in database or timestamp mismatch
470  auto sum = calcSha1sum(file);
471  if (idx == Index(-1)) {
472  // was not yet in database, insert new entry
473  insert(sum, time, filename);
474  } else {
475  // was already in database, but with wrong timestamp (and sha1sum)
476  entry->setTime(time);
477  adjustSha1(idx, *entry, sum);
478  }
479  return sum;
480 }
481 
482 } // namespace openmsx
one_of.hh
FileException.hh
openmsx::SHA1
Helper class to perform a sha1 calculation.
Definition: sha1.hh:79
openmsx::File::mmap
span< uint8_t > mmap()
Map file in memory.
Definition: File.cc:93
gl::rotate
mat4 rotate(float angle, const vec3 &axis)
Definition: gl_transform.hh:58
SimpleHashSet::reserve
void reserve(size_t n)
Definition: SimpleHashSet.hh:63
Timer.hh
ranges::sort
void sort(RandomAccessRange &&range)
Definition: ranges.hh:35
openmsx::MemBuffer::empty
bool empty() const
No memory allocated?
Definition: MemBuffer.hh:103
openmsx::FileType
FileType
Definition: FilePoolCore.hh:22
utf8::unchecked::size
size_t size(std::string_view utf8)
Definition: utf8_unchecked.hh:227
Date.hh
openmsx::FilePoolCore::Directories
std::vector< Dir > Directories
Definition: FilePoolCore.hh:44
openmsx::FileOperations::Stat
struct stat Stat
Definition: FileOperations.hh:218
t
TclObject t
Definition: TclObject_test.cc:264
ranges.hh
openmsx::Date::INVALID_TIME_T
constexpr time_t INVALID_TIME_T
Definition: Date.hh:10
openmsx::Timer::getTime
uint64_t getTime()
Get current (real) time in us.
Definition: Timer.cc:7
openmsx::MSXException
Definition: MSXException.hh:10
SimpleHashSet::insert
bool insert(Value2 &&val)
Definition: SimpleHashSet.hh:84
openmsx::FileOperations::openofstream
void openofstream(std::ofstream &stream, const std::string &filename)
Open an ofstream in a platform-independent manner.
Definition: FileOperations.cc:367
openmsx::CompareSha1
Definition: FilePoolCore.cc:17
openmsx::FilePoolCore::CompareSha1
friend class CompareSha1
Definition: FilePoolCore.hh:170
openmsx::MemBuffer::resize
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:111
openmsx::FileOperations::expandTilde
string expandTilde(string_view path)
Expand the '~' character to the users home directory.
Definition: FileOperations.cc:195
File.hh
ObjectPool::remove
void remove(Index idx)
Definition: ObjectPool.hh:93
one_of
Definition: one_of.hh:7
openmsx::File::getOriginalName
std::string getOriginalName()
Get Original filename for this object.
Definition: File.cc:138
openmsx::filename
constexpr const char *const filename
Definition: FirmwareSwitch.cc:10
ObjectPool< Entry >
FilePoolCore.hh
openmsx::Date::fromString
time_t fromString(const char *p)
Definition: Date.cc:29
openmsx::File::is_open
bool is_open() const
Return true iff this file handle refers to an open file.
Definition: File.hh:61
openmsx::FilePoolCore::getFile
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
Definition: FilePoolCore.cc:248
SimpleHashSet::erase
bool erase(const Value2 &val)
Definition: SimpleHashSet.hh:106
ranges::find
auto find(InputRange &&range, const T &value)
Definition: ranges.hh:107
openmsx::Sha1Sum
This class represents the result of a sha1 calculation (a 160-bit value).
Definition: sha1.hh:20
openmsx::Date::toString
std::string toString(time_t time)
Definition: Date.cc:150
openmsx::FileType::NONE
@ NONE
openmsx::x
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:1416
openmsx::CompareSha1::operator()
bool operator()(const X &x, const Y &y) const
Definition: FilePoolCore.cc:21
openmsx::File
Definition: File.hh:16
openmsx::FilePoolCore::FilePoolCore
FilePoolCore(std::string filecache, std::function< Directories()> getDirectories, std::function< void(const std::string &)> reportProgress)
Definition: FilePoolCore.cc:32
foreach_file.hh
openmsx::foreach_file_recursive
bool foreach_file_recursive(std::string path, FileAction fileAction)
Definition: foreach_file.hh:163
openmsx::MemBuffer::data
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
openmsx::CompareSha1::CompareSha1
CompareSha1(const FilePoolCore::Pool &pool_)
Definition: FilePoolCore.cc:19
ranges::find_if
auto find_if(InputRange &&range, UnaryPredicate pred)
Definition: ranges.hh:113
strCat
std::string strCat(Ts &&...ts)
Definition: strCat.hh:573
ranges::is_sorted
bool is_sorted(ForwardRange &&range)
Definition: ranges.hh:23
ranges::upper_bound
auto upper_bound(ForwardRange &&range, const T &value)
Definition: ranges.hh:83
ranges::equal_range
auto equal_range(ForwardRange &&range, const T &value)
Definition: ranges.hh:95
openmsx
This file implemented 3 utility functions:
Definition: Autofire.cc:5
ObjectPool::emplace
EmplaceResult emplace(Args &&...args)
Definition: ObjectPool.hh:73
openmsx::FilePoolCore::~FilePoolCore
~FilePoolCore()
Definition: FilePoolCore.cc:46
sum
auto sum(InputRange &&range)
Definition: stl.hh:293
openmsx::SHA1::update
void update(const uint8_t *data, size_t len)
Incrementally calculate the hash value.
Definition: utils/sha1.cc:318