openMSX
FilePoolCore.cc
Go to the documentation of this file.
1#include "FilePoolCore.hh"
2#include "File.hh"
3#include "FileException.hh"
4#include "foreach_file.hh"
5#include "Date.hh"
6#include "Timer.hh"
7#include "one_of.hh"
8#include "ranges.hh"
9#include <fstream>
10#include <optional>
11#include <tuple>
12
13namespace openmsx {
14
15struct GetSha1 {
17
18 [[nodiscard]] const Sha1Sum& operator()(FilePoolCore::Index idx) const {
19 return pool[idx].sum;
20 }
21};
22
23
24FilePoolCore::FilePoolCore(std::string fileCache_,
25 std::function<Directories()> getDirectories_,
26 std::function<void(std::string_view, float)> reportProgress_)
27 : fileCache(std::move(fileCache_))
28 , getDirectories(std::move(getDirectories_))
29 , reportProgress(std::move(reportProgress_))
30{
31 try {
32 readSha1sums();
33 } catch (MSXException&) {
34 // ignore, probably .filecache doesn't exist yet
35 }
36}
37
39{
40 if (needWrite) {
41 writeSha1sums();
42 }
43}
44
45void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const std::string& filename)
46{
47 stringBuffer.push_back(filename);
48 auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
49 auto it = ranges::upper_bound(sha1Index, sum, {}, GetSha1{pool});
50 sha1Index.insert(it, idx);
51 filenameIndex.insert(idx);
52 needWrite = true;
53}
54
55FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, Entry& entry)
56{
57 // There can be multiple entries for the same sha1, look for the specific one.
58 for (auto [b, e] = ranges::equal_range(sha1Index, entry.sum, {}, GetSha1{pool}); b != e; ++b) {
59 if (*b == idx) {
60 return b;
61 }
62 }
63 assert(false); return sha1Index.end();
64}
65
66void FilePoolCore::remove(Sha1Index::iterator it)
67{
68 auto idx = *it;
69 filenameIndex.erase(idx);
70 pool.remove(idx);
71 sha1Index.erase(it);
72 needWrite = true;
73}
74
75void FilePoolCore::remove(Index idx, Entry& entry)
76{
77 remove(getSha1Iterator(idx, entry));
78}
79
80void FilePoolCore::remove(Index idx)
81{
82 remove(idx, pool[idx]);
83}
84
85// Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
86// re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
87// Internally this method doesn't actually sort, it merely rotates the
88// elements.
89// Returns false if the new position is before (or at) the old position.
90// Returns true if the new position is after the old position.
91bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
92{
93 needWrite = true;
94 auto newIt = ranges::upper_bound(sha1Index, newSum, {}, GetSha1{pool});
95 entry.sum = newSum; // update sum
96 if (newIt > it) {
97 // move to back
98 rotate(it, it + 1, newIt);
99 return true;
100 } else {
101 if (newIt < it) {
102 // move to front
103 rotate(newIt, it, it + 1);
104 } else {
105 // (unlikely) sha1sum has changed, but after
106 // resorting item would remain in the same
107 // position
108 }
109 return false;
110 }
111}
112
113bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
114{
115 return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
116}
117
118time_t FilePoolCore::Entry::getTime()
119{
120 if (time == Date::INVALID_TIME_T) {
121 time = Date::fromString(std::span<const char, 24>{timeStr, 24});
122 }
123 return time;
124}
125
126void FilePoolCore::Entry::setTime(time_t t)
127{
128 time = t;
129 timeStr = nullptr;
130}
131
132// returns: <sha1, time-string, filename>
133static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
134 std::span<char> line)
135{
136 if (line.size() <= 68) return {}; // minumum length (only filename is variable)
137
138 // only perform quick sanity check on date/time format
139 if (line[40] != ' ') return {}; // two space between sha1sum and date
140 if (line[41] != ' ') return {};
141 if (line[45] != ' ') return {}; // space between day-of-week and month
142 if (line[49] != ' ') return {}; // space between month and day of month
143 if (line[52] != ' ') return {}; // space between day of month and hour
144 if (line[55] != ':') return {}; // colon between hour and minutes
145 if (line[58] != ':') return {}; // colon between minutes and seconds
146 if (line[61] != ' ') return {}; // space between seconds and year
147 if (line[66] != ' ') return {}; // two spaces between date and filename
148 if (line[67] != ' ') return {};
149
150 Sha1Sum sha1(Sha1Sum::UninitializedTag{});
151 try {
152 sha1.parse40(subspan<40>(line));
153 } catch (MSXException&) {
154 return {};
155 }
156
157 const char* timeStr = &line[42]; // not guaranteed to be a correct date/time
158 line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
159
160 std::string_view filename(&line[68], line.size() - 68); // TODO c++20 [begin, end)
161
162 return std::tuple{sha1, timeStr, filename};
163}
164
165void FilePoolCore::readSha1sums()
166{
167 assert(sha1Index.empty());
168 assert(fileMem.empty());
169
170 File file(fileCache);
171 auto size = file.getSize();
172 fileMem.resize(size + 1);
173 file.read(std::span{fileMem.data(), size});
174 fileMem[size] = '\n'; // ensure there's always a '\n' at the end
175
176 // Process each line.
177 // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
178 char* data = fileMem.data();
179 char* data_end = data + size + 1;
180 while (data != data_end) {
181 // memchr() seems better optimized than std::find_if()
182 char* it = static_cast<char*>(memchr(data, '\n', data_end - data));
183 if (it == nullptr) it = data_end;
184 if ((it != data) && (it[-1] == '\r')) --it;
185
186 if (auto r = parse({data, it})) {
187 auto [sum, timeStr, filename] = *r;
188 sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
189 // sha1Index not yet guaranteed sorted
190 }
191
192 data = std::find_if(it + 1, data_end, [](char c) {
193 return c != one_of('\n', '\r');
194 });
195 }
196
197 if (!ranges::is_sorted(sha1Index, {}, GetSha1{pool})) {
198 // This should _rarely_ happen. In fact it should only happen
199 // when .filecache was manually edited. Though because it's
200 // very important that pool is indeed sorted I've added this
201 // safety mechanism.
202 ranges::sort(sha1Index, {}, GetSha1{pool});
203 }
204
205 // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
206 auto n = sha1Index.size();
207 filenameIndex.reserve(n);
208 while (n != 0) { // sha1Index might change while iterating ...
209 --n;
210 Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
211 bool inserted = filenameIndex.insert(idx);
212 if (!inserted) {
213 // ERROR: there may not be multiple entries for the same filename
214 // Should only happen when .filecache was manually edited.
215 remove(idx);
216 }
217 }
218}
219
220void FilePoolCore::writeSha1sums()
221{
222 std::ofstream file;
223 FileOperations::openOfStream(file, fileCache);
224 if (!file.is_open()) {
225 return;
226 }
227 for (auto idx : sha1Index) {
228 const auto& entry = pool[idx];
229 file << entry.sum.toString() << " ";
230 if (entry.timeStr) {
231 file << entry.timeStr;
232 } else {
233 assert(entry.time != Date::INVALID_TIME_T);
234 file << Date::toString(entry.time);
235 }
236 file << " " << entry.filename << '\n';
237 }
238}
239
241{
242 File result = getFromPool(sha1sum);
243 if (result.is_open()) return result;
244
245 // not found in cache, need to scan directories
246 stop = false;
247 ScanProgress progress {
248 .lastTime = Timer::getTime(),
249 };
250
251 for (auto& [path, types] : getDirectories()) {
252 if ((types & fileType) != FileType::NONE) {
253 result = scanDirectory(sha1sum, FileOperations::expandTilde(std::string(path)), path, progress);
254 if (result.is_open()) {
255 if (progress.printed) {
256 reportProgress(tmpStrCat("Found file with sha1sum ", sha1sum.toString()), 1.0f);
257 }
258 return result;
259 }
260 }
261 }
262
263 if (progress.printed) {
264 reportProgress(tmpStrCat("Did not find file with sha1sum ", sha1sum.toString()), 1.0f);
265 }
266 return result; // not found
267}
268
269Sha1Sum FilePoolCore::calcSha1sum(File& file)
270{
271 // Calculate sha1 in several steps so that we can show progress
272 // information. We take a fixed step size for an efficient calculation.
273 constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
274
275 auto data = file.mmap();
276
277 SHA1 sha1;
278 size_t size = data.size();
279 size_t done = 0;
280 size_t remaining = size;
281 auto lastShowedProgress = Timer::getTime();
282 bool everShowedProgress = false;
283
284 auto report = [&](float fraction) {
285 reportProgress(tmpStrCat("Calculating SHA1 sum for ", file.getOriginalName()),
286 fraction);
287 };
288 // Loop over all-but-the last blocks. For small files this loop is skipped.
289 while (remaining > STEP_SIZE) {
290 sha1.update({&data[done], STEP_SIZE});
291 done += STEP_SIZE;
292 remaining -= STEP_SIZE;
293
294 auto now = Timer::getTime();
295 if ((now - lastShowedProgress) > 250'000) { // 4Hz
296 report(float(done) / float(size));
297 lastShowedProgress = now;
298 everShowedProgress = true;
299 }
300 }
301 // last block
302 if (remaining) {
303 sha1.update({&data[done], remaining});
304 }
305 if (everShowedProgress) {
306 report(1.0f);
307 }
308 return sha1.digest();
309}
310
311File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
312{
313 auto [b, e] = ranges::equal_range(sha1Index, sha1sum, {}, GetSha1{pool});
314 // use indices instead of iterators
315 auto i = distance(begin(sha1Index), b);
316 auto last = distance(begin(sha1Index), e);
317 while (i != last) {
318 auto it = begin(sha1Index) + i;
319 auto& entry = pool[*it];
320 if (entry.getTime() == Date::INVALID_TIME_T) {
321 // Invalid time/date format. Remove from
322 // database and continue searching.
323 remove(it);
324 --last;
325 continue;
326 }
327 try {
328 File file(std::string(entry.filename));
329 auto newTime = file.getModificationDate();
330 if (entry.getTime() == newTime) {
331 // When modification time is unchanged, assume
332 // sha1sum is also unchanged. So avoid
333 // expensive sha1sum calculation.
334 return file;
335 }
336 entry.setTime(newTime); // update timestamp
337 needWrite = true;
338 auto newSum = calcSha1sum(file);
339 if (newSum == sha1sum) {
340 // Modification time was changed, but
341 // (recalculated) sha1sum is still the same.
342 return file;
343 }
344 // Sha1sum has changed: update sha1sum, move entry to
345 // new position new sum and continue searching.
346 if (adjustSha1(it, entry, newSum)) {
347 // after
348 --last; // no ++i
349 } else {
350 // before (or at)
351 ++i;
352 }
353 } catch (FileException&) {
354 // Error reading file: remove from db and continue
355 // searching.
356 remove(it);
357 --last;
358 }
359 }
360 return {}; // not found
361}
362
363File FilePoolCore::scanDirectory(
364 const Sha1Sum& sha1sum, const std::string& directory, std::string_view poolPath,
365 ScanProgress& progress)
366{
367 File result;
368 auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
369 if (stop) {
370 // Scanning can take a long time. Allow to exit
371 // openmsx when it takes too long. Stop scanning
372 // by pretending we didn't find the file.
373 assert(!result.is_open());
374 return false; // abort foreach_file_recursive
375 }
376 result = scanFile(sha1sum, path, st, poolPath, progress);
377 return !result.is_open(); // abort traversal when found
378 };
379 foreach_file_recursive(directory, fileAction);
380 return result;
381}
382
383File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const std::string& filename,
384 const FileOperations::Stat& st, std::string_view poolPath,
385 ScanProgress& progress)
386{
387 ++progress.amountScanned;
388 // Periodically send a progress message with the current filename
389 auto now = Timer::getTime();
390 if (now > (progress.lastTime + 250'000)) { // 4Hz
391 progress.lastTime = now;
392 progress.printed = true;
393 reportProgress(tmpStrCat(
394 "Searching for file with sha1sum ", sha1sum.toString(),
395 "...\nIndexing filepool ", poolPath, ": [",
396 progress.amountScanned, "]: ",
397 std::string_view(filename).substr(poolPath.size())),
398 -1.0f); // unknown progress
399 }
400
402 if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
403 // not in pool
404 try {
405 File file(filename);
406 auto sum = calcSha1sum(file);
407 insert(sum, time, filename);
408 if (sum == sha1sum) {
409 return file;
410 }
411 } catch (FileException&) {
412 // ignore
413 }
414 } else {
415 // already in pool
416 assert(filename == entry->filename);
417 try {
418 if (entry->getTime() == time) {
419 // db is still up to date
420 if (entry->sum == sha1sum) {
421 return File(filename);
422 }
423 } else {
424 // db outdated
425 File file(filename);
426 auto sum = calcSha1sum(file);
427 entry->setTime(time);
428 adjustSha1(idx, *entry, sum);
429 if (sum == sha1sum) {
430 return file;
431 }
432 }
433 } catch (FileException&) {
434 // error reading file, remove from db
435 remove(idx, *entry);
436 }
437 }
438 return {}; // not found
439}
440
441std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
442{
443 auto it = filenameIndex.find(filename);
444 if (!it) return {Index(-1), nullptr};
445
446 Index idx = *it;
447 auto& entry = pool[idx];
448 assert(entry.filename == filename);
449 if (entry.getTime() == Date::INVALID_TIME_T) {
450 // invalid date/time string, remove from db
451 remove(idx, entry);
452 return {Index(-1), nullptr};
453 }
454 return {idx, &entry};
455}
456
458{
459 auto time = file.getModificationDate();
460 const std::string& filename = file.getURL();
461
462 auto [idx, entry] = findInDatabase(filename);
463 if (idx != Index(-1)) {
464 if (entry->getTime() == time) {
465 // in database and modification time matches,
466 // assume sha1sum also matches
467 return entry->sum;
468 }
469 }
470
471 // not in database or timestamp mismatch
472 auto sum = calcSha1sum(file);
473 if (idx == Index(-1)) {
474 // was not yet in database, insert new entry
475 insert(sum, time, filename);
476 } else {
477 // was already in database, but with wrong timestamp (and sha1sum)
478 entry->setTime(time);
479 adjustSha1(idx, *entry, sum);
480 }
481 return sum;
482}
483
484} // namespace openmsx
TclObject t
void remove(Index idx)
Definition ObjectPool.hh:96
EmplaceResult emplace(Args &&...args)
Definition ObjectPool.hh:76
bool insert(Value2 &&val)
Value * find(const Value2 &val) const
void reserve(size_t n)
bool erase(const Value2 &val)
Sha1Sum getSha1Sum(File &file)
Calculate sha1sum for the given File object.
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
std::vector< Dir > Directories
FilePoolCore(std::string fileCache, std::function< Directories()> getDirectories, std::function< void(std::string_view, float)> reportProgress)
std::span< const uint8_t > mmap()
Map file in memory.
Definition File.cc:102
std::string_view getOriginalName()
Get Original filename for this object.
Definition File.cc:147
time_t getModificationDate()
Get the date/time of last modification.
Definition File.cc:158
bool is_open() const
Return true iff this file handle refers to an open file.
Definition File.hh:63
const std::string & getURL() const
Returns the URL of this file object.
Definition File.cc:137
void resize(size_t size)
Grow or shrink the memory block.
Definition MemBuffer.hh:111
bool empty() const
No memory allocated?
Definition MemBuffer.hh:103
const T * data() const
Returns pointer to the start of the memory buffer.
Definition MemBuffer.hh:81
Helper class to perform a sha1 calculation.
Definition sha1.hh:80
Sha1Sum digest()
Get the final hash.
void update(std::span< const uint8_t > data)
Incrementally calculate the hash value.
This class represents the result of a sha1 calculation (a 160-bit value).
Definition sha1.hh:23
std::string toString() const
constexpr double e
Definition Math.hh:21
mat4 rotate(float angle, const vec3 &axis)
std::string toString(time_t time)
Definition Date.cc:152
time_t fromString(std::span< const char, 24 > s)
Definition Date.cc:32
constexpr time_t INVALID_TIME_T
Definition Date.hh:11
string expandTilde(string path)
Expand the '~' character to the users home directory.
void openOfStream(std::ofstream &stream, zstring_view filename)
Open an ofstream in a platform-independent manner.
time_t getModificationDate(const Stat &st)
Get the date/time of last modification.
uint64_t getTime()
Get current (real) time in us.
Definition Timer.cc:7
This file implemented 3 utility functions:
Definition Autofire.cc:9
bool foreach_file_recursive(std::string path, FileAction fileAction)
bool is_sorted(ForwardRange &&range, Compare comp={}, Proj proj={})
Definition ranges.hh:40
auto upper_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition ranges.hh:124
auto equal_range(ForwardRange &&range, const T &value, Compare comp={})
Definition ranges.hh:133
constexpr void sort(RandomAccessRange &&range)
Definition ranges.hh:49
STL namespace.
size_t size(std::string_view utf8)
auto distance(octet_iterator first, octet_iterator last)
auto sum(InputRange &&range, Proj proj={})
Definition stl.hh:245
TemporaryString tmpStrCat(Ts &&... ts)
Definition strCat.hh:742
const Sha1Sum & operator()(FilePoolCore::Index idx) const
const FilePoolCore::Pool & pool
constexpr auto begin(const zstring_view &x)