openMSX
FilePoolCore.cc
Go to the documentation of this file.
1 #include "FilePoolCore.hh"
2 #include "File.hh"
3 #include "FileException.hh"
4 #include "foreach_file.hh"
5 #include "Date.hh"
6 #include "Timer.hh"
7 #include "one_of.hh"
8 #include "ranges.hh"
9 #include <fstream>
10 #include <optional>
11 #include <tuple>
12 
13 namespace openmsx {
14 
15 struct GetSha1 {
17 
18  [[nodiscard]] const Sha1Sum& operator()(FilePoolCore::Index idx) const {
19  return pool[idx].sum;
20  }
21 };
22 
23 
24 FilePoolCore::FilePoolCore(std::string filecache_,
25  std::function<Directories()> getDirectories_,
26  std::function<void(std::string_view)> reportProgress_)
27  : filecache(std::move(filecache_))
28  , getDirectories(getDirectories_)
29  , reportProgress(reportProgress_)
30 {
31  try {
32  readSha1sums();
33  } catch (MSXException&) {
34  // ignore, probably .filecache doesn't exist yet
35  }
36 }
37 
39 {
40  if (needWrite) {
41  writeSha1sums();
42  }
43 }
44 
45 void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const std::string& filename)
46 {
47  stringBuffer.push_back(filename);
48  auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
49  auto it = ranges::upper_bound(sha1Index, sum, {}, GetSha1{pool});
50  sha1Index.insert(it, idx);
51  filenameIndex.insert(idx);
52  needWrite = true;
53 }
54 
55 FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, Entry& entry)
56 {
57  // There can be multiple entries for the same sha1, look for the specific one.
58  for (auto [b, e] = ranges::equal_range(sha1Index, entry.sum, {}, GetSha1{pool}); b != e; ++b) {
59  if (*b == idx) {
60  return b;
61  }
62  }
63  assert(false); return sha1Index.end();
64 }
65 
66 void FilePoolCore::remove(Sha1Index::iterator it)
67 {
68  auto idx = *it;
69  filenameIndex.erase(idx);
70  pool.remove(idx);
71  sha1Index.erase(it);
72  needWrite = true;
73 }
74 
75 void FilePoolCore::remove(Index idx, Entry& entry)
76 {
77  remove(getSha1Iterator(idx, entry));
78 }
79 
80 void FilePoolCore::remove(Index idx)
81 {
82  remove(idx, pool[idx]);
83 }
84 
85 // Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
86 // re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
87 // Internally this method doesn't actually sort, it merely rotates the
88 // elements.
89 // Returns false if the new position is before (or at) the old position.
90 // Returns true if the new position is after the old position.
91 bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
92 {
93  needWrite = true;
94  auto newIt = ranges::upper_bound(sha1Index, newSum, {}, GetSha1{pool});
95  entry.sum = newSum; // update sum
96  if (newIt > it) {
97  // move to back
98  rotate(it, it + 1, newIt);
99  return true;
100  } else {
101  if (newIt < it) {
102  // move to front
103  rotate(newIt, it, it + 1);
104  } else {
105  // (unlikely) sha1sum has changed, but after
106  // resorting item would remain in the same
107  // position
108  }
109  return false;
110  }
111 }
112 
113 bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
114 {
115  return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
116 }
117 
118 time_t FilePoolCore::Entry::getTime()
119 {
120  if (time == Date::INVALID_TIME_T) {
121  time = Date::fromString(timeStr);
122  }
123  return time;
124 }
125 
126 void FilePoolCore::Entry::setTime(time_t t)
127 {
128  time = t;
129  timeStr = nullptr;
130 }
131 
132 // returns: <sha1, time-string, filename>
133 static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
134  char* line, char* line_end)
135 {
136  if ((line_end - line) <= 68) return {}; // minumum length (only filename is variable)
137 
138  // only perform quick sanity check on date/time format
139  if (line[40] != ' ') return {}; // two space between sha1sum and date
140  if (line[41] != ' ') return {};
141  if (line[45] != ' ') return {}; // space between day-of-week and month
142  if (line[49] != ' ') return {}; // space between month and day of month
143  if (line[52] != ' ') return {}; // space between day of month and hour
144  if (line[55] != ':') return {}; // colon between hour and minutes
145  if (line[58] != ':') return {}; // colon between minutes and seconds
146  if (line[61] != ' ') return {}; // space between seconds and year
147  if (line[66] != ' ') return {}; // two spaces between date and filename
148  if (line[67] != ' ') return {};
149 
150  Sha1Sum sha1(Sha1Sum::UninitializedTag{});
151  try {
152  sha1.parse40(line);
153  } catch (MSXException&) {
154  return {};
155  }
156 
157  const char* timeStr = line + 42; // not guaranteed to be a correct date/time
158  line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
159 
160  std::string_view filename(line + 68, line_end - (line + 68)); // TODO c++20 [begin, end)
161 
162  return std::tuple{sha1, timeStr, filename};
163 }
164 
165 void FilePoolCore::readSha1sums()
166 {
167  assert(sha1Index.empty());
168  assert(fileMem.empty());
169 
170  File file(filecache);
171  auto size = file.getSize();
172  fileMem.resize(size + 1);
173  file.read(fileMem.data(), size);
174  fileMem[size] = '\n'; // ensure there's always a '\n' at the end
175 
176  // Process each line.
177  // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
178  char* data = fileMem.data();
179  char* data_end = data + size + 1;
180  while (data != data_end) {
181  // memchr() seems better optimized than std::find_if()
182  char* it = static_cast<char*>(memchr(data, '\n', data_end - data));
183  if (it == nullptr) it = data_end;
184  if ((it != data) && (it[-1] == '\r')) --it;
185 
186  if (auto r = parse(data, it)) {
187  auto [sum, timeStr, filename] = *r;
188  sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
189  // sha1Index not yet guaranteed sorted
190  }
191 
192  data = std::find_if(it + 1, data_end, [](char c) {
193  return c != one_of('\n', '\r');
194  });
195  }
196 
197  if (!ranges::is_sorted(sha1Index, {}, GetSha1{pool})) {
198  // This should _rarely_ happen. In fact it should only happen
199  // when .filecache was manually edited. Though because it's
200  // very important that pool is indeed sorted I've added this
201  // safety mechanism.
202  ranges::sort(sha1Index, {}, GetSha1{pool});
203  }
204 
205  // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
206  auto n = sha1Index.size();
207  filenameIndex.reserve(n);
208  while (n != 0) { // sha1Index might change while iterating ...
209  --n;
210  Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
211  bool inserted = filenameIndex.insert(idx);
212  if (!inserted) {
213  // ERROR: there may not be multiple entries for the same filename
214  // Should only happen when .filecache was manually edited.
215  remove(idx);
216  }
217  }
218 }
219 
220 void FilePoolCore::writeSha1sums()
221 {
222  std::ofstream file;
223  FileOperations::openofstream(file, filecache);
224  if (!file.is_open()) {
225  return;
226  }
227  for (auto idx : sha1Index) {
228  const auto& entry = pool[idx];
229  file << entry.sum.toString() << " ";
230  if (entry.timeStr) {
231  file << entry.timeStr;
232  } else {
233  assert(entry.time != Date::INVALID_TIME_T);
234  file << Date::toString(entry.time);
235  }
236  file << " " << entry.filename << '\n';
237  }
238 }
239 
240 File FilePoolCore::getFile(FileType fileType, const Sha1Sum& sha1sum)
241 {
242  File result = getFromPool(sha1sum);
243  if (result.is_open()) return result;
244 
245  // not found in cache, need to scan directories
246  stop = false;
247  ScanProgress progress;
248  progress.lastTime = Timer::getTime();
249  progress.amountScanned = 0;
250 
251  for (auto& [path, types] : getDirectories()) {
252  if ((types & fileType) != FileType::NONE) {
253  result = scanDirectory(sha1sum, FileOperations::expandTilde(std::string(path)), path, progress);
254  if (result.is_open()) return result;
255  }
256  }
257 
258  return result; // not found
259 }
260 
261 Sha1Sum FilePoolCore::calcSha1sum(File& file)
262 {
263  // Calculate sha1 in several steps so that we can show progress
264  // information. We take a fixed step size for an efficient calculation.
265  constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
266 
267  auto data = file.mmap();
268 
269  SHA1 sha1;
270  size_t size = data.size();
271  size_t done = 0;
272  size_t remaining = size;
273  auto lastShowedProgress = Timer::getTime();
274  bool everShowedProgress = false;
275 
276  auto report = [&](size_t percentage) {
277  reportProgress(tmpStrCat("Calculating SHA1 sum for ", file.getOriginalName(),
278  "... ", percentage, '%'));
279  };
280  // Loop over all-but-the last blocks. For small files this loop is skipped.
281  while (remaining > STEP_SIZE) {
282  sha1.update({&data[done], STEP_SIZE});
283  done += STEP_SIZE;
284  remaining -= STEP_SIZE;
285 
286  auto now = Timer::getTime();
287  if ((now - lastShowedProgress) > 250'000) { // 4Hz
288  report((100 * done) / size);
289  lastShowedProgress = now;
290  everShowedProgress = true;
291  }
292  }
293  // last block
294  if (remaining) {
295  sha1.update({&data[done], remaining});
296  }
297  if (everShowedProgress) {
298  report(100);
299  }
300  return sha1.digest();
301 }
302 
303 File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
304 {
305  auto [b, e] = ranges::equal_range(sha1Index, sha1sum, {}, GetSha1{pool});
306  // use indices instead of iterators
307  auto i = distance(begin(sha1Index), b);
308  auto last = distance(begin(sha1Index), e);
309  while (i != last) {
310  auto it = begin(sha1Index) + i;
311  auto& entry = pool[*it];
312  if (entry.getTime() == Date::INVALID_TIME_T) {
313  // Invalid time/date format. Remove from
314  // database and continue searching.
315  remove(it);
316  --last;
317  continue;
318  }
319  try {
320  File file(std::string(entry.filename));
321  auto newTime = file.getModificationDate();
322  if (entry.getTime() == newTime) {
323  // When modification time is unchanged, assume
324  // sha1sum is also unchanged. So avoid
325  // expensive sha1sum calculation.
326  return file;
327  }
328  entry.setTime(newTime); // update timestamp
329  needWrite = true;
330  auto newSum = calcSha1sum(file);
331  if (newSum == sha1sum) {
332  // Modification time was changed, but
333  // (recalculated) sha1sum is still the same.
334  return file;
335  }
336  // Sha1sum has changed: update sha1sum, move entry to
337  // new position new sum and continue searching.
338  if (adjustSha1(it, entry, newSum)) {
339  // after
340  --last; // no ++i
341  } else {
342  // before (or at)
343  ++i;
344  }
345  } catch (FileException&) {
346  // Error reading file: remove from db and continue
347  // searching.
348  remove(it);
349  --last;
350  }
351  }
352  return File(); // not found
353 }
354 
355 File FilePoolCore::scanDirectory(
356  const Sha1Sum& sha1sum, const std::string& directory, std::string_view poolPath,
357  ScanProgress& progress)
358 {
359  File result;
360  auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
361  if (stop) {
362  // Scanning can take a long time. Allow to exit
363  // openmsx when it takes too long. Stop scanning
364  // by pretending we didn't find the file.
365  assert(!result.is_open());
366  return false; // abort foreach_file_recursive
367  }
368  result = scanFile(sha1sum, path, st, poolPath, progress);
369  return !result.is_open(); // abort traversal when found
370  };
371  foreach_file_recursive(directory, fileAction);
372  return result;
373 }
374 
375 File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const std::string& filename,
376  const FileOperations::Stat& st, std::string_view poolPath,
377  ScanProgress& progress)
378 {
379  ++progress.amountScanned;
380  // Periodically send a progress message with the current filename
381  auto now = Timer::getTime();
382  if (now > (progress.lastTime + 250'000)) { // 4Hz
383  progress.lastTime = now;
384  reportProgress(tmpStrCat(
385  "Searching for file with sha1sum ", sha1sum.toString(),
386  "...\nIndexing filepool ", poolPath, ": [",
387  progress.amountScanned, "]: ",
388  std::string_view(filename).substr(poolPath.size())));
389  }
390 
391  auto time = FileOperations::getModificationDate(st);
392  if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
393  // not in pool
394  try {
395  File file(filename);
396  auto sum = calcSha1sum(file);
397  insert(sum, time, filename);
398  if (sum == sha1sum) {
399  return file;
400  }
401  } catch (FileException&) {
402  // ignore
403  }
404  } else {
405  // already in pool
406  assert(filename == entry->filename);
407  try {
408  if (entry->getTime() == time) {
409  // db is still up to date
410  if (entry->sum == sha1sum) {
411  return File(filename);
412  }
413  } else {
414  // db outdated
415  File file(filename);
416  auto sum = calcSha1sum(file);
417  entry->setTime(time);
418  adjustSha1(idx, *entry, sum);
419  if (sum == sha1sum) {
420  return file;
421  }
422  }
423  } catch (FileException&) {
424  // error reading file, remove from db
425  remove(idx, *entry);
426  }
427  }
428  return File(); // not found
429 }
430 
431 std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
432 {
433  auto it = filenameIndex.find(filename);
434  if (!it) return {Index(-1), nullptr};
435 
436  Index idx = *it;
437  auto& entry = pool[idx];
438  assert(entry.filename == filename);
439  if (entry.getTime() == Date::INVALID_TIME_T) {
440  // invalid date/time string, remove from db
441  remove(idx, entry);
442  return {Index(-1), nullptr};
443  }
444  return {idx, &entry};
445 }
446 
448 {
449  auto time = file.getModificationDate();
450  const std::string& filename = file.getURL();
451 
452  auto [idx, entry] = findInDatabase(filename);
453  if (idx != Index(-1)) {
454  if (entry->getTime() == time) {
455  // in database and modification time matches,
456  // assume sha1sum also matches
457  return entry->sum;
458  }
459  }
460 
461  // not in database or timestamp mismatch
462  auto sum = calcSha1sum(file);
463  if (idx == Index(-1)) {
464  // was not yet in database, insert new entry
465  insert(sum, time, filename);
466  } else {
467  // was already in database, but with wrong timestamp (and sha1sum)
468  entry->setTime(time);
469  adjustSha1(idx, *entry, sum);
470  }
471  return sum;
472 }
473 
474 } // namespace openmsx
TclObject t
void remove(Index idx)
Definition: ObjectPool.hh:95
EmplaceResult emplace(Args &&...args)
Definition: ObjectPool.hh:75
Value * find(const Value2 &val) const
bool insert(Value2 &&val)
void reserve(size_t n)
bool erase(const Value2 &val)
Definition: one_of.hh:7
Sha1Sum getSha1Sum(File &file)
Calculate sha1sum for the given File object.
FilePoolCore(std::string filecache, std::function< Directories()> getDirectories, std::function< void(std::string_view)> reportProgress)
Definition: FilePoolCore.cc:24
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
friend struct GetSha1
std::vector< Dir > Directories
Definition: FilePoolCore.hh:44
span< const uint8_t > mmap()
Map file in memory.
Definition: File.cc:101
std::string_view getOriginalName()
Get Original filename for this object.
Definition: File.cc:146
time_t getModificationDate()
Get the date/time of last modification.
Definition: File.cc:157
bool is_open() const
Return true iff this file handle refers to an open file.
Definition: File.hh:63
const std::string & getURL() const
Returns the URL of this file object.
Definition: File.cc:136
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:111
bool empty() const
No memory allocated?
Definition: MemBuffer.hh:103
Helper class to perform a sha1 calculation.
Definition: sha1.hh:81
Sha1Sum digest()
Get the final hash.
Definition: utils/sha1.cc:359
void update(span< const uint8_t > data)
Incrementally calculate the hash value.
Definition: utils/sha1.cc:316
This class represents the result of a sha1 calculation (a 160-bit value).
Definition: sha1.hh:22
mat4 rotate(float angle, const vec3 &axis)
Definition: gl_transform.hh:58
std::string toString(time_t time)
Definition: Date.cc:150
constexpr time_t INVALID_TIME_T
Definition: Date.hh:10
time_t fromString(const char *p)
Definition: Date.cc:29
void openofstream(std::ofstream &stream, zstring_view filename)
Open an ofstream in a platform-independent manner.
string expandTilde(string path)
Expand the '~' character to the users home directory.
time_t getModificationDate(const Stat &st)
Get the date/time of last modification.
uint64_t getTime()
Get current (real) time in us.
Definition: Timer.cc:7
This file implemented 3 utility functions:
Definition: Autofire.cc:9
constexpr const char *const filename
bool foreach_file_recursive(std::string path, FileAction fileAction)
bool is_sorted(ForwardRange &&range, Compare comp={}, Proj proj={})
Definition: ranges.hh:25
auto find_if(InputRange &&range, UnaryPredicate pred)
Definition: ranges.hh:143
auto upper_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:94
auto equal_range(ForwardRange &&range, const T &value, Compare comp={})
Definition: ranges.hh:103
void sort(RandomAccessRange &&range)
Definition: ranges.hh:34
size_t size(std::string_view utf8)
auto distance(octet_iterator first, octet_iterator last)
auto sum(InputRange &&range, Proj proj={})
Definition: stl.hh:245
TemporaryString tmpStrCat(Ts &&... ts)
Definition: strCat.hh:659
const Sha1Sum & operator()(FilePoolCore::Index idx) const
Definition: FilePoolCore.cc:18
const FilePoolCore::Pool & pool
Definition: FilePoolCore.cc:16
constexpr auto begin(const zstring_view &x)
Definition: zstring_view.hh:83