openMSX
FilePoolCore.cc
Go to the documentation of this file.
1#include "FilePoolCore.hh"
2
3#include "File.hh"
4#include "FileException.hh"
5#include "foreach_file.hh"
6
7#include "Date.hh"
8#include "Timer.hh"
9#include "one_of.hh"
10#include "ranges.hh"
11
12#include <fstream>
13#include <optional>
14#include <tuple>
15
16namespace openmsx {
17
18struct GetSha1 {
20
21 [[nodiscard]] const Sha1Sum& operator()(FilePoolCore::Index idx) const {
22 return pool[idx].sum;
23 }
24};
25
26
27FilePoolCore::FilePoolCore(std::string fileCache_,
28 std::function<Directories()> getDirectories_,
29 std::function<void(std::string_view, float)> reportProgress_)
30 : fileCache(std::move(fileCache_))
31 , getDirectories(std::move(getDirectories_))
32 , reportProgress(std::move(reportProgress_))
33{
34 try {
35 readSha1sums();
36 } catch (MSXException&) {
37 // ignore, probably .filecache doesn't exist yet
38 }
39}
40
42{
43 if (needWrite) {
44 writeSha1sums();
45 }
46}
47
48void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const std::string& filename)
49{
50 stringBuffer.push_back(filename);
51 auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
52 auto it = ranges::upper_bound(sha1Index, sum, {}, GetSha1{pool});
53 sha1Index.insert(it, idx);
54 filenameIndex.insert(idx);
55 needWrite = true;
56}
57
58FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, const Entry& entry)
59{
60 // There can be multiple entries for the same sha1, look for the specific one.
61 for (auto [b, e] = ranges::equal_range(sha1Index, entry.sum, {}, GetSha1{pool}); b != e; ++b) {
62 if (*b == idx) {
63 return b;
64 }
65 }
66 assert(false); return sha1Index.end();
67}
68
69void FilePoolCore::remove(Sha1Index::iterator it)
70{
71 auto idx = *it;
72 filenameIndex.erase(idx);
73 pool.remove(idx);
74 sha1Index.erase(it);
75 needWrite = true;
76}
77
78void FilePoolCore::remove(Index idx, const Entry& entry)
79{
80 remove(getSha1Iterator(idx, entry));
81}
82
83void FilePoolCore::remove(Index idx)
84{
85 remove(idx, pool[idx]);
86}
87
88// Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
89// re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
90// Internally this method doesn't actually sort, it merely rotates the
91// elements.
92// Returns false if the new position is before (or at) the old position.
93// Returns true if the new position is after the old position.
94bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
95{
96 needWrite = true;
97 auto newIt = ranges::upper_bound(sha1Index, newSum, {}, GetSha1{pool});
98 entry.sum = newSum; // update sum
99 if (newIt > it) {
100 // move to back
101 rotate(it, it + 1, newIt);
102 return true;
103 } else {
104 if (newIt < it) {
105 // move to front
106 rotate(newIt, it, it + 1);
107 } else {
108 // (unlikely) sha1sum has changed, but after
109 // resorting item would remain in the same
110 // position
111 }
112 return false;
113 }
114}
115
116bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
117{
118 return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
119}
120
121time_t FilePoolCore::Entry::getTime()
122{
123 if (time == Date::INVALID_TIME_T) {
124 time = Date::fromString(std::span<const char, 24>{timeStr, 24});
125 }
126 return time;
127}
128
129void FilePoolCore::Entry::setTime(time_t t)
130{
131 time = t;
132 timeStr = nullptr;
133}
134
135// returns: <sha1, time-string, filename>
136static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
137 std::span<char> line)
138{
139 if (line.size() <= 68) return {}; // minumum length (only filename is variable)
140
141 // only perform quick sanity check on date/time format
142 if (line[40] != ' ') return {}; // two space between sha1sum and date
143 if (line[41] != ' ') return {};
144 if (line[45] != ' ') return {}; // space between day-of-week and month
145 if (line[49] != ' ') return {}; // space between month and day of month
146 if (line[52] != ' ') return {}; // space between day of month and hour
147 if (line[55] != ':') return {}; // colon between hour and minutes
148 if (line[58] != ':') return {}; // colon between minutes and seconds
149 if (line[61] != ' ') return {}; // space between seconds and year
150 if (line[66] != ' ') return {}; // two spaces between date and filename
151 if (line[67] != ' ') return {};
152
153 Sha1Sum sha1(Sha1Sum::UninitializedTag{});
154 try {
155 sha1.parse40(subspan<40>(line));
156 } catch (MSXException&) {
157 return {};
158 }
159
160 const char* timeStr = &line[42]; // not guaranteed to be a correct date/time
161 line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
162
163 std::string_view filename(&line[68], line.size() - 68); // TODO c++20 [begin, end)
164
165 return std::tuple{sha1, timeStr, filename};
166}
167
168void FilePoolCore::readSha1sums()
169{
170 assert(sha1Index.empty());
171 assert(fileMem.empty());
172
173 File file(fileCache);
174 auto size = file.getSize();
175 fileMem.resize(size + 1);
176 file.read(fileMem.first(size));
177 fileMem[size] = '\n'; // ensure there's always a '\n' at the end
178
179 // Process each line.
180 // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
181 char* data = fileMem.begin();
182 char* data_end = fileMem.end();
183 while (data != data_end) {
184 // memchr() seems better optimized than std::find_if()
185 auto* it = static_cast<char*>(memchr(data, '\n', data_end - data));
186 if (it == nullptr) it = data_end;
187 if ((it != data) && (it[-1] == '\r')) --it;
188
189 if (auto r = parse({data, it})) {
190 auto [sum, timeStr, filename] = *r;
191 sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
192 // sha1Index not yet guaranteed sorted
193 }
194
195 data = std::find_if(it + 1, data_end, [](char c) {
196 return c != one_of('\n', '\r');
197 });
198 }
199
200 if (!ranges::is_sorted(sha1Index, {}, GetSha1{pool})) {
201 // This should _rarely_ happen. In fact it should only happen
202 // when .filecache was manually edited. Though because it's
203 // very important that pool is indeed sorted I've added this
204 // safety mechanism.
205 ranges::sort(sha1Index, {}, GetSha1{pool});
206 }
207
208 // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
209 auto n = sha1Index.size();
210 filenameIndex.reserve(n);
211 while (n != 0) { // sha1Index might change while iterating ...
212 --n;
213 Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
214 bool inserted = filenameIndex.insert(idx);
215 if (!inserted) {
216 // ERROR: there may not be multiple entries for the same filename
217 // Should only happen when .filecache was manually edited.
218 remove(idx);
219 }
220 }
221}
222
223void FilePoolCore::writeSha1sums()
224{
225 std::ofstream file;
226 FileOperations::openOfStream(file, fileCache);
227 if (!file.is_open()) {
228 return;
229 }
230 for (auto idx : sha1Index) {
231 const auto& entry = pool[idx];
232 file << entry.sum.toString() << " ";
233 if (entry.timeStr) {
234 file << entry.timeStr;
235 } else {
236 assert(entry.time != Date::INVALID_TIME_T);
237 file << Date::toString(entry.time);
238 }
239 file << " " << entry.filename << '\n';
240 }
241}
242
244{
245 File result = getFromPool(sha1sum);
246 if (result.is_open()) return result;
247
248 // not found in cache, need to scan directories
249 stop = false;
250 ScanProgress progress {
251 .lastTime = Timer::getTime(),
252 };
253
254 for (const auto& [path, types] : getDirectories()) {
255 if ((types & fileType) != FileType::NONE) {
256 result = scanDirectory(sha1sum, FileOperations::expandTilde(std::string(path)), path, progress);
257 if (result.is_open()) {
258 if (progress.printed) {
259 reportProgress(tmpStrCat("Found file with sha1sum ", sha1sum.toString()), 1.0f);
260 }
261 return result;
262 }
263 }
264 }
265
266 if (progress.printed) {
267 reportProgress(tmpStrCat("Did not find file with sha1sum ", sha1sum.toString()), 1.0f);
268 }
269 return result; // not found
270}
271
272Sha1Sum FilePoolCore::calcSha1sum(File& file) const
273{
274 // Calculate sha1 in several steps so that we can show progress
275 // information. We take a fixed step size for an efficient calculation.
276 constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
277
278 auto data = file.mmap();
279
280 SHA1 sha1;
281 size_t size = data.size();
282 size_t done = 0;
283 size_t remaining = size;
284 auto lastShowedProgress = Timer::getTime();
285 bool everShowedProgress = false;
286
287 auto report = [&](float fraction) {
288 reportProgress(tmpStrCat("Calculating SHA1 sum for ", file.getOriginalName()),
289 fraction);
290 };
291 // Loop over all-but-the last blocks. For small files this loop is skipped.
292 while (remaining > STEP_SIZE) {
293 sha1.update({&data[done], STEP_SIZE});
294 done += STEP_SIZE;
295 remaining -= STEP_SIZE;
296
297 auto now = Timer::getTime();
298 if ((now - lastShowedProgress) > 250'000) { // 4Hz
299 report(float(done) / float(size));
300 lastShowedProgress = now;
301 everShowedProgress = true;
302 }
303 }
304 // last block
305 if (remaining) {
306 sha1.update({&data[done], remaining});
307 }
308 if (everShowedProgress) {
309 report(1.0f);
310 }
311 return sha1.digest();
312}
313
314File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
315{
316 auto [b, e] = ranges::equal_range(sha1Index, sha1sum, {}, GetSha1{pool});
317 // use indices instead of iterators
318 auto i = distance(begin(sha1Index), b);
319 auto last = distance(begin(sha1Index), e);
320 while (i != last) {
321 auto it = begin(sha1Index) + i;
322 auto& entry = pool[*it];
323 if (entry.getTime() == Date::INVALID_TIME_T) {
324 // Invalid time/date format. Remove from
325 // database and continue searching.
326 remove(it);
327 --last;
328 continue;
329 }
330 try {
331 File file(std::string(entry.filename));
332 auto newTime = file.getModificationDate();
333 if (entry.getTime() == newTime) {
334 // When modification time is unchanged, assume
335 // sha1sum is also unchanged. So avoid
336 // expensive sha1sum calculation.
337 return file;
338 }
339 entry.setTime(newTime); // update timestamp
340 needWrite = true;
341 auto newSum = calcSha1sum(file);
342 if (newSum == sha1sum) {
343 // Modification time was changed, but
344 // (recalculated) sha1sum is still the same.
345 return file;
346 }
347 // Sha1sum has changed: update sha1sum, move entry to
348 // new position new sum and continue searching.
349 if (adjustSha1(it, entry, newSum)) {
350 // after
351 --last; // no ++i
352 } else {
353 // before (or at)
354 ++i;
355 }
356 } catch (FileException&) {
357 // Error reading file: remove from db and continue
358 // searching.
359 remove(it);
360 --last;
361 }
362 }
363 return {}; // not found
364}
365
366File FilePoolCore::scanDirectory(
367 const Sha1Sum& sha1sum, const std::string& directory, std::string_view poolPath,
368 ScanProgress& progress)
369{
370 File result;
371 auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
372 if (stop) {
373 // Scanning can take a long time. Allow to exit
374 // openmsx when it takes too long. Stop scanning
375 // by pretending we didn't find the file.
376 assert(!result.is_open());
377 return false; // abort foreach_file_recursive
378 }
379 result = scanFile(sha1sum, path, st, poolPath, progress);
380 return !result.is_open(); // abort traversal when found
381 };
382 foreach_file_recursive(directory, fileAction);
383 return result;
384}
385
386File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const std::string& filename,
387 const FileOperations::Stat& st, std::string_view poolPath,
388 ScanProgress& progress)
389{
390 ++progress.amountScanned;
391 // Periodically send a progress message with the current filename
392 if (auto now = Timer::getTime();
393 now > (progress.lastTime + 250'000)) { // 4Hz
394 progress.lastTime = now;
395 progress.printed = true;
396 reportProgress(tmpStrCat(
397 "Searching for file with sha1sum ", sha1sum.toString(),
398 "...\nIndexing filepool ", poolPath, ": [",
399 progress.amountScanned, "]: ",
400 std::string_view(filename).substr(poolPath.size())),
401 -1.0f); // unknown progress
402 }
403
405 if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
406 // not in pool
407 try {
408 File file(filename);
409 auto sum = calcSha1sum(file);
410 insert(sum, time, filename);
411 if (sum == sha1sum) {
412 return file;
413 }
414 } catch (FileException&) {
415 // ignore
416 }
417 } else {
418 // already in pool
419 assert(filename == entry->filename);
420 try {
421 if (entry->getTime() == time) {
422 // db is still up to date
423 if (entry->sum == sha1sum) {
424 return File(filename);
425 }
426 } else {
427 // db outdated
428 File file(filename);
429 auto sum = calcSha1sum(file);
430 entry->setTime(time);
431 adjustSha1(idx, *entry, sum);
432 if (sum == sha1sum) {
433 return file;
434 }
435 }
436 } catch (FileException&) {
437 // error reading file, remove from db
438 remove(idx, *entry);
439 }
440 }
441 return {}; // not found
442}
443
444std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
445{
446 auto it = filenameIndex.find(filename);
447 if (!it) return {Index(-1), nullptr};
448
449 Index idx = *it;
450 auto& entry = pool[idx];
451 assert(entry.filename == filename);
452 if (entry.getTime() == Date::INVALID_TIME_T) {
453 // invalid date/time string, remove from db
454 remove(idx, entry);
455 return {Index(-1), nullptr};
456 }
457 return {idx, &entry};
458}
459
461{
462 auto time = file.getModificationDate();
463 const std::string& filename = file.getURL();
464
465 auto [idx, entry] = findInDatabase(filename);
466 if ((idx != Index(-1)) && (entry->getTime() == time)) {
467 // in database and modification time matches,
468 // assume sha1sum also matches
469 return entry->sum;
470 }
471
472 // not in database or timestamp mismatch
473 auto sum = calcSha1sum(file);
474 if (idx == Index(-1)) {
475 // was not yet in database, insert new entry
476 insert(sum, time, filename);
477 } else {
478 // was already in database, but with wrong timestamp (and sha1sum)
479 entry->setTime(time);
480 adjustSha1(idx, *entry, sum);
481 }
482 return sum;
483}
484
485} // namespace openmsx
TclObject t
void remove(Index idx)
Definition ObjectPool.hh:96
EmplaceResult emplace(Args &&...args)
Definition ObjectPool.hh:76
Value * find(const Value2 &val) const
bool insert(Value2 &&val)
bool erase(const Value2 &val)
void reserve(size_t n)
Sha1Sum getSha1Sum(File &file)
Calculate sha1sum for the given File object.
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
std::vector< Dir > Directories
FilePoolCore(std::string fileCache, std::function< Directories()> getDirectories, std::function< void(std::string_view, float)> reportProgress)
std::span< const uint8_t > mmap()
Map file in memory.
Definition File.cc:102
std::string_view getOriginalName()
Get Original filename for this object.
Definition File.cc:147
time_t getModificationDate()
Get the date/time of last modification.
Definition File.cc:158
bool is_open() const
Return true iff this file handle refers to an open file.
Definition File.hh:65
const std::string & getURL() const
Returns the URL of this file object.
Definition File.cc:137
std::span< const T > first(size_t n) const
Definition MemBuffer.hh:127
void resize(size_t size)
Grow or shrink the memory block.
Definition MemBuffer.hh:156
bool empty() const
Definition MemBuffer.hh:100
Helper class to perform a sha1 calculation.
Definition sha1.hh:67
Sha1Sum digest()
Get the final hash.
void update(std::span< const uint8_t > data)
Incrementally calculate the hash value.
This class represents the result of a sha1 calculation (a 160-bit value).
Definition sha1.hh:24
std::string toString() const
constexpr double e
Definition Math.hh:21
mat4 rotate(float angle, const vec3 &axis)
std::string toString(time_t time)
Definition Date.cc:153
time_t fromString(std::span< const char, 24 > s)
Definition Date.cc:33
constexpr time_t INVALID_TIME_T
Definition Date.hh:11
string expandTilde(string path)
Expand the '~' character to the users home directory.
void openOfStream(std::ofstream &stream, zstring_view filename)
Open an ofstream in a platform-independent manner.
time_t getModificationDate(const Stat &st)
Get the date/time of last modification.
uint64_t getTime()
Get current (real) time in us.
Definition Timer.cc:7
This file implemented 3 utility functions:
Definition Autofire.cc:11
bool foreach_file_recursive(std::string path, FileAction fileAction)
bool is_sorted(ForwardRange &&range, Compare comp={}, Proj proj={})
Definition ranges.hh:42
auto upper_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition ranges.hh:126
auto equal_range(ForwardRange &&range, const T &value, Compare comp={})
Definition ranges.hh:135
constexpr void sort(RandomAccessRange &&range)
Definition ranges.hh:51
STL namespace.
size_t size(std::string_view utf8)
auto distance(octet_iterator first, octet_iterator last)
constexpr auto sum(InputRange &&range, Proj proj={})
Definition stl.hh:248
TemporaryString tmpStrCat(Ts &&... ts)
Definition strCat.hh:742
const Sha1Sum & operator()(FilePoolCore::Index idx) const
const FilePoolCore::Pool & pool
constexpr auto begin(const zstring_view &x)