openMSX
FilePoolCore.cc
Go to the documentation of this file.
1#include "FilePoolCore.hh"
2
3#include "File.hh"
4#include "FileException.hh"
5#include "foreach_file.hh"
6
7#include "Date.hh"
8#include "Timer.hh"
9#include "one_of.hh"
10#include "ranges.hh"
11
12#include <algorithm>
13#include <fstream>
14#include <optional>
15#include <tuple>
16
17namespace openmsx {
18
19struct GetSha1 {
21
22 [[nodiscard]] const Sha1Sum& operator()(FilePoolCore::Index idx) const {
23 return pool[idx].sum;
24 }
25};
26
27
28FilePoolCore::FilePoolCore(std::string fileCache_,
29 std::function<Directories()> getDirectories_,
30 std::function<void(std::string_view, float)> reportProgress_)
31 : fileCache(std::move(fileCache_))
32 , getDirectories(std::move(getDirectories_))
33 , reportProgress(std::move(reportProgress_))
34{
35 try {
36 readSha1sums();
37 } catch (MSXException&) {
38 // ignore, probably .filecache doesn't exist yet
39 }
40}
41
43{
44 if (needWrite) {
45 writeSha1sums();
46 }
47}
48
49void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const std::string& filename)
50{
51 stringBuffer.push_back(filename);
52 auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
53 auto it = std::ranges::upper_bound(sha1Index, sum, {}, GetSha1{pool});
54 sha1Index.insert(it, idx);
55 filenameIndex.insert(idx);
56 needWrite = true;
57}
58
59FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, const Entry& entry)
60{
61 // There can be multiple entries for the same sha1, look for the specific one.
62 for (auto [b, e] = std::ranges::equal_range(sha1Index, entry.sum, {}, GetSha1{pool}); b != e; ++b) {
63 if (*b == idx) {
64 return b;
65 }
66 }
67 assert(false); return sha1Index.end();
68}
69
70void FilePoolCore::remove(Sha1Index::iterator it)
71{
72 auto idx = *it;
73 filenameIndex.erase(idx);
74 pool.remove(idx);
75 sha1Index.erase(it);
76 needWrite = true;
77}
78
79void FilePoolCore::remove(Index idx, const Entry& entry)
80{
81 remove(getSha1Iterator(idx, entry));
82}
83
84void FilePoolCore::remove(Index idx)
85{
86 remove(idx, pool[idx]);
87}
88
89// Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
90// re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
91// Internally this method doesn't actually sort, it merely rotates the
92// elements.
93// Returns false if the new position is before (or at) the old position.
94// Returns true if the new position is after the old position.
95bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
96{
97 needWrite = true;
98 auto newIt = std::ranges::upper_bound(sha1Index, newSum, {}, GetSha1{pool});
99 entry.sum = newSum; // update sum
100 if (newIt > it) {
101 // move to back
102 rotate(it, it + 1, newIt);
103 return true;
104 } else {
105 if (newIt < it) {
106 // move to front
107 rotate(newIt, it, it + 1);
108 } else {
109 // (unlikely) sha1sum has changed, but after
110 // resorting item would remain in the same
111 // position
112 }
113 return false;
114 }
115}
116
117bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
118{
119 return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
120}
121
122time_t FilePoolCore::Entry::getTime()
123{
124 if (time == Date::INVALID_TIME_T) {
125 time = Date::fromString(std::span<const char, 24>{timeStr, 24});
126 }
127 return time;
128}
129
130void FilePoolCore::Entry::setTime(time_t t)
131{
132 time = t;
133 timeStr = nullptr;
134}
135
136// returns: <sha1, time-string, filename>
137static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
138 std::span<char> line)
139{
140 if (line.size() <= 68) return {}; // minumum length (only filename is variable)
141
142 // only perform quick sanity check on date/time format
143 if (line[40] != ' ') return {}; // two space between sha1sum and date
144 if (line[41] != ' ') return {};
145 if (line[45] != ' ') return {}; // space between day-of-week and month
146 if (line[49] != ' ') return {}; // space between month and day of month
147 if (line[52] != ' ') return {}; // space between day of month and hour
148 if (line[55] != ':') return {}; // colon between hour and minutes
149 if (line[58] != ':') return {}; // colon between minutes and seconds
150 if (line[61] != ' ') return {}; // space between seconds and year
151 if (line[66] != ' ') return {}; // two spaces between date and filename
152 if (line[67] != ' ') return {};
153
154 Sha1Sum sha1(Sha1Sum::UninitializedTag{});
155 try {
156 sha1.parse40(subspan<40>(line));
157 } catch (MSXException&) {
158 return {};
159 }
160
161 const char* timeStr = &line[42]; // not guaranteed to be a correct date/time
162 line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
163
164 std::string_view filename(&line[68], line.size() - 68); // TODO c++20 [begin, end)
165
166 return std::tuple{sha1, timeStr, filename};
167}
168
169void FilePoolCore::readSha1sums()
170{
171 assert(sha1Index.empty());
172 assert(fileMem.empty());
173
174 File file(fileCache);
175 auto size = file.getSize();
176 fileMem.resize(size + 1);
177 file.read(fileMem.first(size));
178 fileMem[size] = '\n'; // ensure there's always a '\n' at the end
179
180 // Process each line.
181 // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
182 char* data = fileMem.begin();
183 char* data_end = fileMem.end();
184 while (data != data_end) {
185 // memchr() seems better optimized than std::find_if()
186 auto* it = static_cast<char*>(memchr(data, '\n', data_end - data));
187 if (it == nullptr) it = data_end;
188 if ((it != data) && (it[-1] == '\r')) --it;
189
190 if (auto r = parse({data, it})) {
191 auto [sum, timeStr, filename] = *r;
192 sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
193 // sha1Index not yet guaranteed sorted
194 }
195
196 data = std::find_if(it + 1, data_end, [](char c) {
197 return c != one_of('\n', '\r');
198 });
199 }
200
201 if (!ranges::is_sorted(sha1Index, {}, GetSha1{pool})) {
202 // This should _rarely_ happen. In fact it should only happen
203 // when .filecache was manually edited. Though because it's
204 // very important that pool is indeed sorted I've added this
205 // safety mechanism.
206 ranges::sort(sha1Index, {}, GetSha1{pool});
207 }
208
209 // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
210 auto n = sha1Index.size();
211 filenameIndex.reserve(n);
212 while (n != 0) { // sha1Index might change while iterating ...
213 --n;
214 Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
215 bool inserted = filenameIndex.insert(idx);
216 if (!inserted) {
217 // ERROR: there may not be multiple entries for the same filename
218 // Should only happen when .filecache was manually edited.
219 remove(idx);
220 }
221 }
222}
223
224void FilePoolCore::writeSha1sums()
225{
226 std::ofstream file;
227 FileOperations::openOfStream(file, fileCache);
228 if (!file.is_open()) {
229 return;
230 }
231 for (auto idx : sha1Index) {
232 const auto& entry = pool[idx];
233 file << entry.sum.toString() << " ";
234 if (entry.timeStr) {
235 file << entry.timeStr;
236 } else {
237 assert(entry.time != Date::INVALID_TIME_T);
238 file << Date::toString(entry.time);
239 }
240 file << " " << entry.filename << '\n';
241 }
242}
243
245{
246 File result = getFromPool(sha1sum);
247 if (result.is_open()) return result;
248
249 // not found in cache, need to scan directories
250 stop = false;
251 ScanProgress progress {
252 .lastTime = Timer::getTime(),
253 };
254
255 for (const auto& [path, types] : getDirectories()) {
256 if ((types & fileType) != FileType::NONE) {
257 result = scanDirectory(sha1sum, FileOperations::expandTilde(std::string(path)), path, progress);
258 if (result.is_open()) {
259 if (progress.printed) {
260 reportProgress(tmpStrCat("Found file with sha1sum ", sha1sum.toString()), 1.0f);
261 }
262 return result;
263 }
264 }
265 }
266
267 if (progress.printed) {
268 reportProgress(tmpStrCat("Did not find file with sha1sum ", sha1sum.toString()), 1.0f);
269 }
270 return result; // not found
271}
272
273Sha1Sum FilePoolCore::calcSha1sum(File& file) const
274{
275 // Calculate sha1 in several steps so that we can show progress
276 // information. We take a fixed step size for an efficient calculation.
277 constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
278
279 auto data = file.mmap();
280
281 SHA1 sha1;
282 size_t size = data.size();
283 size_t done = 0;
284 size_t remaining = size;
285 auto lastShowedProgress = Timer::getTime();
286 bool everShowedProgress = false;
287
288 auto report = [&](float fraction) {
289 reportProgress(tmpStrCat("Calculating SHA1 sum for ", file.getOriginalName()),
290 fraction);
291 };
292 // Loop over all-but-the last blocks. For small files this loop is skipped.
293 while (remaining > STEP_SIZE) {
294 sha1.update({&data[done], STEP_SIZE});
295 done += STEP_SIZE;
296 remaining -= STEP_SIZE;
297
298 auto now = Timer::getTime();
299 if ((now - lastShowedProgress) > 250'000) { // 4Hz
300 report(float(done) / float(size));
301 lastShowedProgress = now;
302 everShowedProgress = true;
303 }
304 }
305 // last block
306 if (remaining) {
307 sha1.update({&data[done], remaining});
308 }
309 if (everShowedProgress) {
310 report(1.0f);
311 }
312 return sha1.digest();
313}
314
315File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
316{
317 auto [b, e] = std::ranges::equal_range(sha1Index, sha1sum, {}, GetSha1{pool});
318 // use indices instead of iterators
319 auto i = distance(begin(sha1Index), b);
320 auto last = distance(begin(sha1Index), e);
321 while (i != last) {
322 auto it = begin(sha1Index) + i;
323 auto& entry = pool[*it];
324 if (entry.getTime() == Date::INVALID_TIME_T) {
325 // Invalid time/date format. Remove from
326 // database and continue searching.
327 remove(it);
328 --last;
329 continue;
330 }
331 try {
332 File file(std::string(entry.filename));
333 auto newTime = file.getModificationDate();
334 if (entry.getTime() == newTime) {
335 // When modification time is unchanged, assume
336 // sha1sum is also unchanged. So avoid
337 // expensive sha1sum calculation.
338 return file;
339 }
340 entry.setTime(newTime); // update timestamp
341 needWrite = true;
342 auto newSum = calcSha1sum(file);
343 if (newSum == sha1sum) {
344 // Modification time was changed, but
345 // (recalculated) sha1sum is still the same.
346 return file;
347 }
348 // Sha1sum has changed: update sha1sum, move entry to
349 // new position new sum and continue searching.
350 if (adjustSha1(it, entry, newSum)) {
351 // after
352 --last; // no ++i
353 } else {
354 // before (or at)
355 ++i;
356 }
357 } catch (FileException&) {
358 // Error reading file: remove from db and continue
359 // searching.
360 remove(it);
361 --last;
362 }
363 }
364 return {}; // not found
365}
366
367File FilePoolCore::scanDirectory(
368 const Sha1Sum& sha1sum, const std::string& directory, std::string_view poolPath,
369 ScanProgress& progress)
370{
371 File result;
372 auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
373 if (stop) {
374 // Scanning can take a long time. Allow to exit
375 // openmsx when it takes too long. Stop scanning
376 // by pretending we didn't find the file.
377 assert(!result.is_open());
378 return false; // abort foreach_file_recursive
379 }
380 result = scanFile(sha1sum, path, st, poolPath, progress);
381 return !result.is_open(); // abort traversal when found
382 };
383 foreach_file_recursive(directory, fileAction);
384 return result;
385}
386
387File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const std::string& filename,
388 const FileOperations::Stat& st, std::string_view poolPath,
389 ScanProgress& progress)
390{
391 ++progress.amountScanned;
392 // Periodically send a progress message with the current filename
393 if (auto now = Timer::getTime();
394 now > (progress.lastTime + 250'000)) { // 4Hz
395 progress.lastTime = now;
396 progress.printed = true;
397 reportProgress(tmpStrCat(
398 "Searching for file with sha1sum ", sha1sum.toString(),
399 "...\nIndexing filepool ", poolPath, ": [",
400 progress.amountScanned, "]: ",
401 std::string_view(filename).substr(poolPath.size())),
402 -1.0f); // unknown progress
403 }
404
406 if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
407 // not in pool
408 try {
409 File file(filename);
410 auto sum = calcSha1sum(file);
411 insert(sum, time, filename);
412 if (sum == sha1sum) {
413 return file;
414 }
415 } catch (FileException&) {
416 // ignore
417 }
418 } else {
419 // already in pool
420 assert(filename == entry->filename);
421 try {
422 if (entry->getTime() == time) {
423 // db is still up to date
424 if (entry->sum == sha1sum) {
425 return File(filename);
426 }
427 } else {
428 // db outdated
429 File file(filename);
430 auto sum = calcSha1sum(file);
431 entry->setTime(time);
432 adjustSha1(idx, *entry, sum);
433 if (sum == sha1sum) {
434 return file;
435 }
436 }
437 } catch (FileException&) {
438 // error reading file, remove from db
439 remove(idx, *entry);
440 }
441 }
442 return {}; // not found
443}
444
445std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
446{
447 auto it = filenameIndex.find(filename);
448 if (!it) return {Index(-1), nullptr};
449
450 Index idx = *it;
451 auto& entry = pool[idx];
452 assert(entry.filename == filename);
453 if (entry.getTime() == Date::INVALID_TIME_T) {
454 // invalid date/time string, remove from db
455 remove(idx, entry);
456 return {Index(-1), nullptr};
457 }
458 return {idx, &entry};
459}
460
462{
463 auto time = file.getModificationDate();
464 const std::string& filename = file.getURL();
465
466 auto [idx, entry] = findInDatabase(filename);
467 if ((idx != Index(-1)) && (entry->getTime() == time)) {
468 // in database and modification time matches,
469 // assume sha1sum also matches
470 return entry->sum;
471 }
472
473 // not in database or timestamp mismatch
474 auto sum = calcSha1sum(file);
475 if (idx == Index(-1)) {
476 // was not yet in database, insert new entry
477 insert(sum, time, filename);
478 } else {
479 // was already in database, but with wrong timestamp (and sha1sum)
480 entry->setTime(time);
481 adjustSha1(idx, *entry, sum);
482 }
483 return sum;
484}
485
486} // namespace openmsx
TclObject t
void remove(Index idx)
Definition ObjectPool.hh:96
EmplaceResult emplace(Args &&...args)
Definition ObjectPool.hh:76
Value * find(const Value2 &val) const
bool insert(Value2 &&val)
bool erase(const Value2 &val)
void reserve(size_t n)
Sha1Sum getSha1Sum(File &file)
Calculate sha1sum for the given File object.
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
std::vector< Dir > Directories
FilePoolCore(std::string fileCache, std::function< Directories()> getDirectories, std::function< void(std::string_view, float)> reportProgress)
std::span< const uint8_t > mmap()
Map file in memory.
Definition File.cc:106
std::string_view getOriginalName()
Get Original filename for this object.
Definition File.cc:151
time_t getModificationDate()
Get the date/time of last modification.
Definition File.cc:162
bool is_open() const
Return true iff this file handle refers to an open file.
Definition File.hh:65
const std::string & getURL() const
Returns the URL of this file object.
Definition File.cc:141
std::span< const T > first(size_t n) const
Definition MemBuffer.hh:127
void resize(size_t size)
Grow or shrink the memory block.
Definition MemBuffer.hh:156
bool empty() const
Definition MemBuffer.hh:100
Helper class to perform a sha1 calculation.
Definition sha1.hh:67
Sha1Sum digest()
Get the final hash.
void update(std::span< const uint8_t > data)
Incrementally calculate the hash value.
This class represents the result of a sha1 calculation (a 160-bit value).
Definition sha1.hh:24
std::string toString() const
constexpr double e
Definition Math.hh:21
mat4 rotate(float angle, const vec3 &axis)
std::string toString(time_t time)
Definition Date.cc:153
time_t fromString(std::span< const char, 24 > s)
Definition Date.cc:33
constexpr time_t INVALID_TIME_T
Definition Date.hh:11
string expandTilde(string path)
Expand the '~' character to the users home directory.
void openOfStream(std::ofstream &stream, zstring_view filename)
Open an ofstream in a platform-independent manner.
time_t getModificationDate(const Stat &st)
Get the date/time of last modification.
uint64_t getTime()
Get current (real) time in us.
Definition Timer.cc:7
This file implemented 3 utility functions:
Definition Autofire.cc:11
bool foreach_file_recursive(std::string path, FileAction fileAction)
bool is_sorted(ForwardRange &&range, Compare comp={}, Proj proj={})
Definition ranges.hh:42
constexpr void sort(RandomAccessRange &&range)
Definition ranges.hh:51
STL namespace.
size_t size(std::string_view utf8)
auto distance(octet_iterator first, octet_iterator last)
constexpr auto sum(InputRange &&range, Proj proj={})
Definition stl.hh:248
TemporaryString tmpStrCat(Ts &&... ts)
Definition strCat.hh:742
const Sha1Sum & operator()(FilePoolCore::Index idx) const
const FilePoolCore::Pool & pool
constexpr auto begin(const zstring_view &x)