From e35ad8015e166a32e1d97cbcc337e7e7492be6c1 Mon Sep 17 00:00:00 2001 From: David Schultz Date: Mon, 19 Jul 2021 16:33:56 -0500 Subject: [PATCH] Add optional sqlite cache for metadata lookup. Greatly speeds up rebuilding for large libraries. --- sigal/__init__.py | 10 +-- sigal/cache.py | 102 ++++++++++++++++++++++++++++++ sigal/gallery.py | 113 +++++++++++++++++++++------------- sigal/settings.py | 1 + sigal/templates/sigal.conf.py | 4 ++ sigal/utils.py | 7 +++ tests/test_cache.py | 94 ++++++++++++++++++++++++++++ tests/test_utils.py | 15 +++++ 8 files changed, 300 insertions(+), 46 deletions(-) create mode 100644 sigal/cache.py create mode 100644 tests/test_cache.py diff --git a/sigal/__init__.py b/sigal/__init__.py index de22a55..3f5e795 100644 --- a/sigal/__init__.py +++ b/sigal/__init__.py @@ -108,13 +108,15 @@ def init(path): ) @option('--title', help="Title of the gallery (overrides the title setting.") @option('-n', '--ncpu', help="Number of cpu to use (default: all)") +@option('--cache', help="Cache file path") def build( - source, destination, debug, verbose, quiet, force, config, theme, title, ncpu + source, destination, debug, verbose, quiet, force, config, theme, title, + ncpu, cache ): """Run sigal to process a directory. - If provided, 'source', 'destination' and 'theme' will override the - corresponding values from the settings file. + If provided, 'source', 'destination', 'theme' and 'cache' will override + the corresponding values from the settings file. 
""" if sum([debug, verbose, quiet]) > 1: @@ -139,7 +141,7 @@ def build( start_time = time.time() settings = read_settings(config) - for key in ('source', 'destination', 'theme'): + for key in ('source', 'destination', 'theme', 'cache'): arg = locals()[key] if arg is not None: settings[key] = os.path.abspath(arg) diff --git a/sigal/cache.py b/sigal/cache.py new file mode 100644 index 0000000..3b68208 --- /dev/null +++ b/sigal/cache.py @@ -0,0 +1,102 @@ +from pickle import loads, dumps +import logging +import os +from os.path import isfile +import sqlite3 + +from .utils import get_mod_date + + +class Cache: + """ + Uses sqlite3 to cache file data for faster lookup. + + A cache is considered good if the file modification date is accurate + to the second. + + Cache contents can either be a string or a dict (such as metadata). + """ + def __init__(self, settings): + self.settings = settings + self.con = None + self.logger = logging.getLogger(__name__) + + def __enter__(self): + if (not self.con) and self.settings.get('cache', None): + self.con = sqlite3.connect(':memory:') + if isfile(self.settings['cache']): + self.logger.info('Loading cache from file %s', + self.settings['cache']) + try: + disk_con = sqlite3.connect(self.settings['cache']) + with self.con: + disk_con.backup(self.con) + disk_con.close() + except sqlite3.OperationalError: + self.logger.warning('Cache db is corrupt, deleting') + os.remove(self.settings['cache']) + + sql = ('CREATE TABLE IF NOT EXISTS ' + 'cache(path NOT NULL PRIMARY KEY, mod, data)') + self.con.execute(sql) + + return self + + def __exit__(self, exc_type, exc_value, traceback): + if not self.con: + return + + if (not exc_type) and self.settings.get('cache', None): + self.logger.info('Saving cache to file %s', + self.settings['cache']) + disk_con = sqlite3.connect(self.settings['cache']) + with disk_con: + self.con.backup(disk_con) + disk_con.close() + self.con.close() + self.con = None + + def read(self, path): + if not self.con: + 
return None + + self.logger.debug('Reading from cache: %s', path) + + mod_date = int(get_mod_date(path)) + + cur = self.con.execute('SELECT mod,data FROM cache WHERE path = ?', + (path, )) + row = cur.fetchone() + + if row and mod_date == row[0]: + return row[1] + else: + return None + + def read_dict(self, path): + if not self.con: + return None + + data = self.read(path) + if data: + return loads(data) + else: + return None + + def write(self, path, data): + if not self.con: + return + + self.logger.debug('Writing to cache: %s', path) + + mod_date = int(get_mod_date(path)) + + self.con.execute('REPLACE INTO cache (path, mod, data) VALUES (?, ?, ?)', + (path, mod_date, data)) + self.con.commit() + + def write_dict(self, path, data): + if not self.con: + return + + self.write(path, dumps(data)) diff --git a/sigal/gallery.py b/sigal/gallery.py index 220c878..0987f5c 100644 --- a/sigal/gallery.py +++ b/sigal/gallery.py @@ -41,6 +41,7 @@ from natsort import natsort_keygen, ns from PIL import Image as PILImage from . import image, signals, video +from .cache import Cache from .image import get_exif_tags, get_image_metadata, get_size, process_image from .settings import Status, get_thumb from .utils import ( @@ -49,6 +50,7 @@ from .utils import ( check_or_create_dir, copy, get_mime, + get_mod_date, is_valid_html5_video, read_markdown, url_from_path, @@ -57,6 +59,10 @@ from .video import process_video from .writer import AlbumListPageWriter, AlbumPageWriter +# metadata cache +CACHE = None + + class Media: """Base Class for media files. 
@@ -202,13 +208,18 @@ class Media: descfile = splitext(self.src_path)[0] + '.md' if isfile(descfile): - meta = read_markdown(descfile) + meta = None + if CACHE: + meta = CACHE.read_dict(descfile) + if not meta: + meta = read_markdown(descfile) + if CACHE: + CACHE.write_dict(descfile, meta) for key, val in meta.items(): setattr(self, key, val) def _get_file_date(self): - stat = os.stat(self.src_path) - return datetime.fromtimestamp(stat.st_mtime) + return datetime.fromtimestamp(get_mod_date(self.src_path)) class Image(Media): @@ -247,7 +258,15 @@ class Image(Media): def _get_metadata(self): super()._get_metadata() - self.file_metadata = get_image_metadata(self.src_path) + + meta = None + if CACHE: + meta = CACHE.read_dict(self.src_path) + if not meta: + meta = get_image_metadata(self.src_path) + if CACHE: + CACHE.write_dict(self.src_path, meta) + self.file_metadata = meta # If a title or description hasn't been obtained by other means, look # for the information in IPTC fields @@ -407,7 +426,13 @@ class Album: self.title = os.path.basename(self.path if self.path != '.' 
else self.src_path) if isfile(descfile): - meta = read_markdown(descfile) + meta = None + if CACHE: + meta = CACHE.read_dict(descfile) + if not meta: + meta = read_markdown(descfile) + if CACHE: + CACHE.write_dict(descfile, meta) for key, val in meta.items(): setattr(self, key, val) @@ -640,6 +665,7 @@ class Album: class Gallery: def __init__(self, settings, ncpu=None, quiet=False): + global CACHE self.settings = settings self.logger = logging.getLogger(__name__) self.stats = defaultdict(int) @@ -664,43 +690,46 @@ class Gallery: ) self.progressbar_target = None if show_progress else Devnull() - for path, dirs, files in os.walk(src_path, followlinks=True, topdown=False): - if show_progress: - print("\rCollecting albums " + next(progressChars), end="") - relpath = os.path.relpath(path, src_path) - - # Test if the directory match the ignore_dirs settings - if ignore_dirs and any( - fnmatch.fnmatch(relpath, ignore) for ignore in ignore_dirs - ): - self.logger.info('Ignoring %s', relpath) - continue - - # Remove files that match the ignore_files settings - if ignore_files: - files_path = {join(relpath, f) for f in files} - for ignore in ignore_files: - files_path -= set(fnmatch.filter(files_path, ignore)) - - self.logger.debug('Files before filtering: %r', files) - files = [os.path.split(f)[1] for f in files_path] - self.logger.debug('Files after filtering: %r', files) - - # Remove sub-directories that have been ignored in a previous - # iteration (as topdown=False, sub-directories are processed before - # their parent - for d in dirs[:]: - path = join(relpath, d) if relpath != '.' 
else d - if path not in albums.keys(): - dirs.remove(d) - - album = Album(relpath, settings, dirs, files, self) - - if not album.medias and not album.albums: - self.logger.info('Skip empty album: %r', album) - else: - album.create_output_directories() - albums[relpath] = album + with Cache(settings) as cache: + CACHE = cache + for path, dirs, files in os.walk(src_path, followlinks=True, topdown=False): + if show_progress: + print("\rCollecting albums " + next(progressChars), end="") + relpath = os.path.relpath(path, src_path) + + # Test if the directory matches the ignore_dirs settings + if ignore_dirs and any( + fnmatch.fnmatch(relpath, ignore) for ignore in ignore_dirs + ): + self.logger.info('Ignoring %s', relpath) + continue + + # Remove files that match the ignore_files settings + if ignore_files: + files_path = {join(relpath, f) for f in files} + for ignore in ignore_files: + files_path -= set(fnmatch.filter(files_path, ignore)) + + self.logger.debug('Files before filtering: %r', files) + files = [os.path.split(f)[1] for f in files_path] + self.logger.debug('Files after filtering: %r', files) + + # Remove sub-directories that have been ignored in a previous + # iteration (as topdown=False, sub-directories are processed before + # their parent) + for d in dirs[:]: + path = join(relpath, d) if relpath != '.' 
else d + if path not in albums.keys(): + dirs.remove(d) + + album = Album(relpath, settings, dirs, files, self) + + if not album.medias and not album.albums: + self.logger.info('Skip empty album: %r', album) + else: + album.create_output_directories() + albums[relpath] = album + CACHE = None if show_progress: print("\rCollecting albums, done.") diff --git a/sigal/settings.py b/sigal/settings.py index 6b5352e..475570b 100644 --- a/sigal/settings.py +++ b/sigal/settings.py @@ -30,6 +30,7 @@ _DEFAULT_CONFIG = { 'albums_sort_attr': 'name', 'albums_sort_reverse': False, 'autorotate_images': True, + 'cache': None, 'colorbox_column_size': 3, 'copy_exif_data': False, 'datetime_format': '%c', diff --git a/sigal/templates/sigal.conf.py b/sigal/templates/sigal.conf.py index ba876fd..d8c4bfd 100644 --- a/sigal/templates/sigal.conf.py +++ b/sigal/templates/sigal.conf.py @@ -19,6 +19,10 @@ source = 'pictures' # `sigal build` command (default: '_build') # destination = '_build' +# Cache file. Can be set here or as an argument to the `sigal build` command. +# Uses sqlite3 to cache markdown and image metadata for faster re-processing. +# cache = None + # Theme : # - colorbox (default), galleria, photoswipe, or the path to a custom theme # directory diff --git a/sigal/utils.py b/sigal/utils.py index f8779ad..8e5173f 100644 --- a/sigal/utils.py +++ b/sigal/utils.py @@ -18,6 +18,7 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. 
+from functools import lru_cache import os import shutil from urllib.parse import quote @@ -64,6 +65,12 @@ def check_or_create_dir(path): os.makedirs(path) +@lru_cache(maxsize=1024) +def get_mod_date(path): + """Get modification date for a path, caching result with LRU cache.""" + return os.path.getmtime(path) + + def url_from_path(path): """Transform path to url, converting backslashes to slashes if needed.""" diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..d35eaba --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,94 @@ +import os + +from sigal.utils import read_markdown +from sigal.cache import Cache + + +CURRENT_DIR = os.path.dirname(__file__) +SAMPLE_DIR = os.path.join(CURRENT_DIR, 'sample') +TEST_META = os.path.join(SAMPLE_DIR, 'pictures/dir1/test1/11.md') + + +def test_no_cache(tmpdir): + data = 'testing' + with Cache({}) as c: + assert c.read(TEST_META) is None + c.write(TEST_META, data) + assert c.read(TEST_META) is None + + data = {'test': True} + with Cache({}) as c: + assert c.read_dict(TEST_META) is None + c.write_dict(TEST_META, data) + assert c.read_dict(TEST_META) is None + + +def test_cache(tmpdir): + settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')} + data = 'testing' + with Cache(settings) as c: + assert c.read(TEST_META) is None + c.write(TEST_META, data) + assert c.read(TEST_META) == data + + assert os.path.isfile(settings['cache']) + + with Cache(settings) as c: + assert c.read(TEST_META) == data + + +def test_cache_dict(tmpdir): + settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')} + data = read_markdown(TEST_META) + with Cache(settings) as c: + assert c.read_dict(TEST_META) is None + c.write_dict(TEST_META, data) + assert c.read_dict(TEST_META) == data + + assert os.path.isfile(settings['cache']) + + with Cache(settings) as c: + assert c.read_dict(TEST_META) == data + + +def test_invalid_cache(tmpdir): + """ + Test an invalid cache file. + Should delete bad file and continue. 
+ """ + settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')} + with open(settings['cache'], 'w') as f: + f.write('invalid') + + data = 'testing' + with Cache(settings) as c: + assert c.read(TEST_META) is None + c.write(TEST_META, data) + assert c.read(TEST_META) == data + + assert os.path.isfile(settings['cache']) + + with Cache(settings) as c: + assert c.read(TEST_META) == data + + +def test_cache_exception(tmpdir): + """ + Test what happens when an exception occurs. + Cache should not be saved. + """ + settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')} + data = 'testing' + try: + with Cache(settings) as c: + assert c.read(TEST_META) is None + c.write(TEST_META, data) + assert c.read(TEST_META) == data + raise RuntimeError() + except RuntimeError: + pass + + assert not os.path.isfile(settings['cache']) + + with Cache(settings) as c: + assert c.read(TEST_META) is None diff --git a/tests/test_utils.py b/tests/test_utils.py index e782612..9c3e081 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,6 @@ import os from pathlib import Path +from time import sleep from sigal import utils @@ -50,6 +51,20 @@ def test_check_or_create_dir(tmpdir): assert os.path.isdir(path) +def test_get_mod_date(tmp_path): + path = tmp_path / 'foo' + path.touch() + start = utils.get_mod_date(str(path)) + sleep(.1) + path.touch() + end = utils.get_mod_date(str(path)) + assert start == end # cache is working + + utils.get_mod_date.cache_clear() + end = utils.get_mod_date(str(path)) + assert start != end + + def test_url_from_path(): assert utils.url_from_path(os.sep.join(['foo', 'bar'])) == 'foo/bar'