openMSX
FilePoolCore.cc
Go to the documentation of this file.
1#include "FilePoolCore.hh"
2#include "File.hh"
3#include "FileException.hh"
4#include "foreach_file.hh"
5#include "Date.hh"
6#include "Timer.hh"
7#include "one_of.hh"
8#include "ranges.hh"
9#include <fstream>
10#include <optional>
11#include <tuple>
12
13namespace openmsx {
14
15struct GetSha1 {
17
18 [[nodiscard]] const Sha1Sum& operator()(FilePoolCore::Index idx) const {
19 return pool[idx].sum;
20 }
21};
22
23
24FilePoolCore::FilePoolCore(std::string fileCache_,
25 std::function<Directories()> getDirectories_,
26 std::function<void(std::string_view)> reportProgress_)
27 : fileCache(std::move(fileCache_))
28 , getDirectories(std::move(getDirectories_))
29 , reportProgress(std::move(reportProgress_))
30{
31 try {
32 readSha1sums();
33 } catch (MSXException&) {
34 // ignore, probably .filecache doesn't exist yet
35 }
36}
37
39{
40 if (needWrite) {
41 writeSha1sums();
42 }
43}
44
45void FilePoolCore::insert(const Sha1Sum& sum, time_t time, const std::string& filename)
46{
47 stringBuffer.push_back(filename);
48 auto idx = pool.emplace(sum, time, stringBuffer.back()).idx;
49 auto it = ranges::upper_bound(sha1Index, sum, {}, GetSha1{pool});
50 sha1Index.insert(it, idx);
51 filenameIndex.insert(idx);
52 needWrite = true;
53}
54
55FilePoolCore::Sha1Index::iterator FilePoolCore::getSha1Iterator(Index idx, Entry& entry)
56{
57 // There can be multiple entries for the same sha1, look for the specific one.
58 for (auto [b, e] = ranges::equal_range(sha1Index, entry.sum, {}, GetSha1{pool}); b != e; ++b) {
59 if (*b == idx) {
60 return b;
61 }
62 }
63 assert(false); return sha1Index.end();
64}
65
66void FilePoolCore::remove(Sha1Index::iterator it)
67{
68 auto idx = *it;
69 filenameIndex.erase(idx);
70 pool.remove(idx);
71 sha1Index.erase(it);
72 needWrite = true;
73}
74
75void FilePoolCore::remove(Index idx, Entry& entry)
76{
77 remove(getSha1Iterator(idx, entry));
78}
79
80void FilePoolCore::remove(Index idx)
81{
82 remove(idx, pool[idx]);
83}
84
85// Change the sha1sum of the element pointed to by 'it' into 'newSum'. Also
86// re-arrange the items so that 'sha1Index' remains sorted on sha1sum.
87// Internally this method doesn't actually sort, it merely rotates the
88// elements.
89// Returns false if the new position is before (or at) the old position.
90// Returns true if the new position is after the old position.
91bool FilePoolCore::adjustSha1(Sha1Index::iterator it, Entry& entry, const Sha1Sum& newSum)
92{
93 needWrite = true;
94 auto newIt = ranges::upper_bound(sha1Index, newSum, {}, GetSha1{pool});
95 entry.sum = newSum; // update sum
96 if (newIt > it) {
97 // move to back
98 rotate(it, it + 1, newIt);
99 return true;
100 } else {
101 if (newIt < it) {
102 // move to front
103 rotate(newIt, it, it + 1);
104 } else {
105 // (unlikely) sha1sum has changed, but after
106 // resorting item would remain in the same
107 // position
108 }
109 return false;
110 }
111}
112
113bool FilePoolCore::adjustSha1(Index idx, Entry& entry, const Sha1Sum& newSum)
114{
115 return adjustSha1(getSha1Iterator(idx, entry), entry, newSum);
116}
117
118time_t FilePoolCore::Entry::getTime()
119{
120 if (time == Date::INVALID_TIME_T) {
121 time = Date::fromString(std::span<const char, 24>{timeStr, 24});
122 }
123 return time;
124}
125
126void FilePoolCore::Entry::setTime(time_t t)
127{
128 time = t;
129 timeStr = nullptr;
130}
131
132// returns: <sha1, time-string, filename>
133static std::optional<std::tuple<Sha1Sum, const char*, std::string_view>> parse(
134 std::span<char> line)
135{
136 if (line.size() <= 68) return {}; // minumum length (only filename is variable)
137
138 // only perform quick sanity check on date/time format
139 if (line[40] != ' ') return {}; // two space between sha1sum and date
140 if (line[41] != ' ') return {};
141 if (line[45] != ' ') return {}; // space between day-of-week and month
142 if (line[49] != ' ') return {}; // space between month and day of month
143 if (line[52] != ' ') return {}; // space between day of month and hour
144 if (line[55] != ':') return {}; // colon between hour and minutes
145 if (line[58] != ':') return {}; // colon between minutes and seconds
146 if (line[61] != ' ') return {}; // space between seconds and year
147 if (line[66] != ' ') return {}; // two spaces between date and filename
148 if (line[67] != ' ') return {};
149
150 Sha1Sum sha1(Sha1Sum::UninitializedTag{});
151 try {
152 sha1.parse40(subspan<40>(line));
153 } catch (MSXException&) {
154 return {};
155 }
156
157 const char* timeStr = &line[42]; // not guaranteed to be a correct date/time
158 line[66] = '\0'; // zero-terminate timeStr, so that it can be printed
159
160 std::string_view filename(&line[68], line.size() - 68); // TODO c++20 [begin, end)
161
162 return std::tuple{sha1, timeStr, filename};
163}
164
165void FilePoolCore::readSha1sums()
166{
167 assert(sha1Index.empty());
168 assert(fileMem.empty());
169
170 File file(fileCache);
171 auto size = file.getSize();
172 fileMem.resize(size + 1);
173 file.read(std::span{fileMem.data(), size});
174 fileMem[size] = '\n'; // ensure there's always a '\n' at the end
175
176 // Process each line.
177 // Assume lines are separated by "\n", "\r\n" or "\n\r" (but not "\r").
178 char* data = fileMem.data();
179 char* data_end = data + size + 1;
180 while (data != data_end) {
181 // memchr() seems better optimized than std::find_if()
182 char* it = static_cast<char*>(memchr(data, '\n', data_end - data));
183 if (it == nullptr) it = data_end;
184 if ((it != data) && (it[-1] == '\r')) --it;
185
186 if (auto r = parse({data, it})) {
187 auto [sum, timeStr, filename] = *r;
188 sha1Index.push_back(pool.emplace(sum, timeStr, filename).idx);
189 // sha1Index not yet guaranteed sorted
190 }
191
192 data = std::find_if(it + 1, data_end, [](char c) {
193 return c != one_of('\n', '\r');
194 });
195 }
196
197 if (!ranges::is_sorted(sha1Index, {}, GetSha1{pool})) {
198 // This should _rarely_ happen. In fact it should only happen
199 // when .filecache was manually edited. Though because it's
200 // very important that pool is indeed sorted I've added this
201 // safety mechanism.
202 ranges::sort(sha1Index, {}, GetSha1{pool});
203 }
204
205 // 'pool' is populated, 'sha1Index' is sorted, now build 'filenameIndex'
206 auto n = sha1Index.size();
207 filenameIndex.reserve(n);
208 while (n != 0) { // sha1Index might change while iterating ...
209 --n;
210 Index idx = sha1Index[n]; // ... therefor use indices instead of iterators
211 bool inserted = filenameIndex.insert(idx);
212 if (!inserted) {
213 // ERROR: there may not be multiple entries for the same filename
214 // Should only happen when .filecache was manually edited.
215 remove(idx);
216 }
217 }
218}
219
220void FilePoolCore::writeSha1sums()
221{
222 std::ofstream file;
223 FileOperations::openOfStream(file, fileCache);
224 if (!file.is_open()) {
225 return;
226 }
227 for (auto idx : sha1Index) {
228 const auto& entry = pool[idx];
229 file << entry.sum.toString() << " ";
230 if (entry.timeStr) {
231 file << entry.timeStr;
232 } else {
233 assert(entry.time != Date::INVALID_TIME_T);
234 file << Date::toString(entry.time);
235 }
236 file << " " << entry.filename << '\n';
237 }
238}
239
241{
242 File result = getFromPool(sha1sum);
243 if (result.is_open()) return result;
244
245 // not found in cache, need to scan directories
246 stop = false;
247 ScanProgress progress;
248 progress.lastTime = Timer::getTime();
249 progress.amountScanned = 0;
250
251 for (auto& [path, types] : getDirectories()) {
252 if ((types & fileType) != FileType::NONE) {
253 result = scanDirectory(sha1sum, FileOperations::expandTilde(std::string(path)), path, progress);
254 if (result.is_open()) return result;
255 }
256 }
257
258 return result; // not found
259}
260
261Sha1Sum FilePoolCore::calcSha1sum(File& file)
262{
263 // Calculate sha1 in several steps so that we can show progress
264 // information. We take a fixed step size for an efficient calculation.
265 constexpr size_t STEP_SIZE = 1024 * 1024; // 1MB
266
267 auto data = file.mmap();
268
269 SHA1 sha1;
270 size_t size = data.size();
271 size_t done = 0;
272 size_t remaining = size;
273 auto lastShowedProgress = Timer::getTime();
274 bool everShowedProgress = false;
275
276 auto report = [&](size_t percentage) {
277 reportProgress(tmpStrCat("Calculating SHA1 sum for ", file.getOriginalName(),
278 "... ", percentage, '%'));
279 };
280 // Loop over all-but-the last blocks. For small files this loop is skipped.
281 while (remaining > STEP_SIZE) {
282 sha1.update({&data[done], STEP_SIZE});
283 done += STEP_SIZE;
284 remaining -= STEP_SIZE;
285
286 auto now = Timer::getTime();
287 if ((now - lastShowedProgress) > 250'000) { // 4Hz
288 report((100 * done) / size);
289 lastShowedProgress = now;
290 everShowedProgress = true;
291 }
292 }
293 // last block
294 if (remaining) {
295 sha1.update({&data[done], remaining});
296 }
297 if (everShowedProgress) {
298 report(100);
299 }
300 return sha1.digest();
301}
302
303File FilePoolCore::getFromPool(const Sha1Sum& sha1sum)
304{
305 auto [b, e] = ranges::equal_range(sha1Index, sha1sum, {}, GetSha1{pool});
306 // use indices instead of iterators
307 auto i = distance(begin(sha1Index), b);
308 auto last = distance(begin(sha1Index), e);
309 while (i != last) {
310 auto it = begin(sha1Index) + i;
311 auto& entry = pool[*it];
312 if (entry.getTime() == Date::INVALID_TIME_T) {
313 // Invalid time/date format. Remove from
314 // database and continue searching.
315 remove(it);
316 --last;
317 continue;
318 }
319 try {
320 File file(std::string(entry.filename));
321 auto newTime = file.getModificationDate();
322 if (entry.getTime() == newTime) {
323 // When modification time is unchanged, assume
324 // sha1sum is also unchanged. So avoid
325 // expensive sha1sum calculation.
326 return file;
327 }
328 entry.setTime(newTime); // update timestamp
329 needWrite = true;
330 auto newSum = calcSha1sum(file);
331 if (newSum == sha1sum) {
332 // Modification time was changed, but
333 // (recalculated) sha1sum is still the same.
334 return file;
335 }
336 // Sha1sum has changed: update sha1sum, move entry to
337 // new position new sum and continue searching.
338 if (adjustSha1(it, entry, newSum)) {
339 // after
340 --last; // no ++i
341 } else {
342 // before (or at)
343 ++i;
344 }
345 } catch (FileException&) {
346 // Error reading file: remove from db and continue
347 // searching.
348 remove(it);
349 --last;
350 }
351 }
352 return {}; // not found
353}
354
355File FilePoolCore::scanDirectory(
356 const Sha1Sum& sha1sum, const std::string& directory, std::string_view poolPath,
357 ScanProgress& progress)
358{
359 File result;
360 auto fileAction = [&](const std::string& path, const FileOperations::Stat& st) {
361 if (stop) {
362 // Scanning can take a long time. Allow to exit
363 // openmsx when it takes too long. Stop scanning
364 // by pretending we didn't find the file.
365 assert(!result.is_open());
366 return false; // abort foreach_file_recursive
367 }
368 result = scanFile(sha1sum, path, st, poolPath, progress);
369 return !result.is_open(); // abort traversal when found
370 };
371 foreach_file_recursive(directory, fileAction);
372 return result;
373}
374
375File FilePoolCore::scanFile(const Sha1Sum& sha1sum, const std::string& filename,
376 const FileOperations::Stat& st, std::string_view poolPath,
377 ScanProgress& progress)
378{
379 ++progress.amountScanned;
380 // Periodically send a progress message with the current filename
381 auto now = Timer::getTime();
382 if (now > (progress.lastTime + 250'000)) { // 4Hz
383 progress.lastTime = now;
384 reportProgress(tmpStrCat(
385 "Searching for file with sha1sum ", sha1sum.toString(),
386 "...\nIndexing filepool ", poolPath, ": [",
387 progress.amountScanned, "]: ",
388 std::string_view(filename).substr(poolPath.size())));
389 }
390
392 if (auto [idx, entry] = findInDatabase(filename); idx == Index(-1)) {
393 // not in pool
394 try {
395 File file(filename);
396 auto sum = calcSha1sum(file);
397 insert(sum, time, filename);
398 if (sum == sha1sum) {
399 return file;
400 }
401 } catch (FileException&) {
402 // ignore
403 }
404 } else {
405 // already in pool
406 assert(filename == entry->filename);
407 try {
408 if (entry->getTime() == time) {
409 // db is still up to date
410 if (entry->sum == sha1sum) {
411 return File(filename);
412 }
413 } else {
414 // db outdated
415 File file(filename);
416 auto sum = calcSha1sum(file);
417 entry->setTime(time);
418 adjustSha1(idx, *entry, sum);
419 if (sum == sha1sum) {
420 return file;
421 }
422 }
423 } catch (FileException&) {
424 // error reading file, remove from db
425 remove(idx, *entry);
426 }
427 }
428 return {}; // not found
429}
430
431std::pair<FilePoolCore::Index, FilePoolCore::Entry*> FilePoolCore::findInDatabase(std::string_view filename)
432{
433 auto it = filenameIndex.find(filename);
434 if (!it) return {Index(-1), nullptr};
435
436 Index idx = *it;
437 auto& entry = pool[idx];
438 assert(entry.filename == filename);
439 if (entry.getTime() == Date::INVALID_TIME_T) {
440 // invalid date/time string, remove from db
441 remove(idx, entry);
442 return {Index(-1), nullptr};
443 }
444 return {idx, &entry};
445}
446
448{
449 auto time = file.getModificationDate();
450 const std::string& filename = file.getURL();
451
452 auto [idx, entry] = findInDatabase(filename);
453 if (idx != Index(-1)) {
454 if (entry->getTime() == time) {
455 // in database and modification time matches,
456 // assume sha1sum also matches
457 return entry->sum;
458 }
459 }
460
461 // not in database or timestamp mismatch
462 auto sum = calcSha1sum(file);
463 if (idx == Index(-1)) {
464 // was not yet in database, insert new entry
465 insert(sum, time, filename);
466 } else {
467 // was already in database, but with wrong timestamp (and sha1sum)
468 entry->setTime(time);
469 adjustSha1(idx, *entry, sum);
470 }
471 return sum;
472}
473
474} // namespace openmsx
TclObject t
void remove(Index idx)
Definition: ObjectPool.hh:96
EmplaceResult emplace(Args &&...args)
Definition: ObjectPool.hh:76
bool insert(Value2 &&val)
Value * find(const Value2 &val) const
void reserve(size_t n)
bool erase(const Value2 &val)
Definition: one_of.hh:7
Sha1Sum getSha1Sum(File &file)
Calculate sha1sum for the given File object.
FilePoolCore(std::string fileCache, std::function< Directories()> getDirectories, std::function< void(std::string_view)> reportProgress)
Definition: FilePoolCore.cc:24
File getFile(FileType fileType, const Sha1Sum &sha1sum)
Search file with the given sha1sum.
friend struct GetSha1
std::vector< Dir > Directories
Definition: FilePoolCore.hh:44
std::span< const uint8_t > mmap()
Map file in memory.
Definition: File.cc:102
std::string_view getOriginalName()
Get Original filename for this object.
Definition: File.cc:147
time_t getModificationDate()
Get the date/time of last modification.
Definition: File.cc:158
bool is_open() const
Return true iff this file handle refers to an open file.
Definition: File.hh:63
const std::string & getURL() const
Returns the URL of this file object.
Definition: File.cc:137
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:111
bool empty() const
No memory allocated?
Definition: MemBuffer.hh:103
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
Helper class to perform a sha1 calculation.
Definition: sha1.hh:80
Sha1Sum digest()
Get the final hash.
Definition: utils/sha1.cc:355
void update(std::span< const uint8_t > data)
Incrementally calculate the hash value.
Definition: utils/sha1.cc:313
This class represents the result of a sha1 calculation (a 160-bit value).
Definition: sha1.hh:23
constexpr double e
Definition: Math.hh:21
mat4 rotate(float angle, const vec3 &axis)
Definition: gl_transform.hh:58
std::string toString(time_t time)
Definition: Date.cc:152
time_t fromString(std::span< const char, 24 > s)
Definition: Date.cc:32
constexpr time_t INVALID_TIME_T
Definition: Date.hh:11
string expandTilde(string path)
Expand the '~' character to the users home directory.
void openOfStream(std::ofstream &stream, zstring_view filename)
Open an ofstream in a platform-independent manner.
time_t getModificationDate(const Stat &st)
Get the date/time of last modification.
uint64_t getTime()
Get current (real) time in us.
Definition: Timer.cc:7
This file implemented 3 utility functions:
Definition: Autofire.cc:9
bool foreach_file_recursive(std::string path, FileAction fileAction)
bool is_sorted(ForwardRange &&range, Compare comp={}, Proj proj={})
Definition: ranges.hh:40
auto find_if(InputRange &&range, UnaryPredicate pred)
Definition: ranges.hh:173
auto upper_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:124
auto equal_range(ForwardRange &&range, const T &value, Compare comp={})
Definition: ranges.hh:133
constexpr void sort(RandomAccessRange &&range)
Definition: ranges.hh:49
STL namespace.
size_t size(std::string_view utf8)
auto distance(octet_iterator first, octet_iterator last)
auto sum(InputRange &&range, Proj proj={})
Definition: stl.hh:236
TemporaryString tmpStrCat(Ts &&... ts)
Definition: strCat.hh:610
const Sha1Sum & operator()(FilePoolCore::Index idx) const
Definition: FilePoolCore.cc:18
const FilePoolCore::Pool & pool
Definition: FilePoolCore.cc:16
constexpr auto begin(const zstring_view &x)