Browse Source

Add optional sqlite cache for metadata lookup. Greatly speeds up rebuilding for large libraries.

pull/440/head
David Schultz 5 years ago
parent
commit
e35ad8015e
  1. 10
      sigal/__init__.py
  2. 102
      sigal/cache.py
  3. 113
      sigal/gallery.py
  4. 1
      sigal/settings.py
  5. 4
      sigal/templates/sigal.conf.py
  6. 7
      sigal/utils.py
  7. 94
      tests/test_cache.py
  8. 15
      tests/test_utils.py

10
sigal/__init__.py

@@ -108,13 +108,15 @@ def init(path):
)
@option('--title', help="Title of the gallery (overrides the title setting.")
@option('-n', '--ncpu', help="Number of cpu to use (default: all)")
@option('--cache', help="Cache file path")
def build(
source, destination, debug, verbose, quiet, force, config, theme, title, ncpu
source, destination, debug, verbose, quiet, force, config, theme, title,
ncpu, cache
):
"""Run sigal to process a directory.
If provided, 'source', 'destination' and 'theme' will override the
corresponding values from the settings file.
If provided, 'source', 'destination', 'theme' and 'cache' will override
the corresponding values from the settings file.
"""
if sum([debug, verbose, quiet]) > 1:
@@ -139,7 +141,7 @@ def build(
start_time = time.time()
settings = read_settings(config)
for key in ('source', 'destination', 'theme'):
for key in ('source', 'destination', 'theme', 'cache'):
arg = locals()[key]
if arg is not None:
settings[key] = os.path.abspath(arg)

102
sigal/cache.py

@@ -0,0 +1,102 @@
from pickle import loads, dumps
import logging
import os
from os.path import isfile
import sqlite3
from .utils import get_mod_date
class Cache:
    """Sqlite-backed cache of per-file data for faster rebuilds.

    While in use the cache lives in an in-memory sqlite database; it is
    loaded from / saved back to ``settings['cache']`` on entry/exit of the
    context manager (saving only happens on a clean exit).  An entry is
    considered valid if the file's modification time, truncated to whole
    seconds, still matches the stored value.

    Values can be plain strings (:meth:`read`/:meth:`write`) or pickleable
    objects such as metadata dicts (:meth:`read_dict`/:meth:`write_dict`).
    If no ``cache`` path is configured, every method is a no-op.
    """

    def __init__(self, settings):
        self.settings = settings
        # sqlite3 connection; stays None until __enter__ (or forever if
        # caching is disabled).
        self.con = None
        self.logger = logging.getLogger(__name__)

    def __enter__(self):
        """Open the in-memory db, seeded from the on-disk cache if present."""
        if (not self.con) and self.settings.get('cache'):
            self.con = sqlite3.connect(':memory:')
            if isfile(self.settings['cache']):
                self.logger.info('Loading cache from file %s',
                                 self.settings['cache'])
                try:
                    disk_con = sqlite3.connect(self.settings['cache'])
                    try:
                        with self.con:
                            disk_con.backup(self.con)
                    finally:
                        # Always close the disk connection, even on error:
                        # os.remove() below would fail on Windows with the
                        # file still open.
                        disk_con.close()
                except sqlite3.DatabaseError:
                    # DatabaseError (superclass of OperationalError) also
                    # covers "file is not a database" on corrupt files.
                    self.logger.warning('Cache db is corrupt, deleting')
                    os.remove(self.settings['cache'])
            sql = ('CREATE TABLE IF NOT EXISTS '
                   'cache(path NOT NULL PRIMARY KEY, mod, data)')
            self.con.execute(sql)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Persist the cache to disk (only on a clean exit) and close it."""
        if not self.con:
            return
        if (not exc_type) and self.settings.get('cache'):
            self.logger.info('Saving cache to file %s',
                             self.settings['cache'])
            disk_con = sqlite3.connect(self.settings['cache'])
            try:
                with disk_con:
                    self.con.backup(disk_con)
            finally:
                disk_con.close()
        self.con.close()
        self.con = None

    def read(self, path):
        """Return the cached string for *path*, or None if absent or stale."""
        if not self.con:
            return None
        self.logger.debug('Reading from cache: %s', path)
        mod_date = int(get_mod_date(path))
        cur = self.con.execute('SELECT mod,data FROM cache WHERE path = ?',
                               (path, ))
        row = cur.fetchone()
        if row and mod_date == row[0]:
            return row[1]
        return None

    def read_dict(self, path):
        """Return the cached, unpickled object for *path*, or None."""
        if not self.con:
            return None
        data = self.read(path)
        # Unpickling is trusted here: the cache file is produced locally
        # by sigal itself, never fetched from an external source.
        return loads(data) if data else None

    def write(self, path, data):
        """Store *data* for *path*, keyed to its current modification time."""
        if not self.con:
            return
        self.logger.debug('Writing to cache: %s', path)
        mod_date = int(get_mod_date(path))
        self.con.execute(
            'REPLACE INTO cache (path, mod, data) VALUES (?, ?, ?)',
            (path, mod_date, data))
        self.con.commit()

    def write_dict(self, path, data):
        """Pickle *data* and store it for *path*."""
        if not self.con:
            return
        self.write(path, dumps(data))

113
sigal/gallery.py

@@ -41,6 +41,7 @@ from natsort import natsort_keygen, ns
from PIL import Image as PILImage
from . import image, signals, video
from .cache import Cache
from .image import get_exif_tags, get_image_metadata, get_size, process_image
from .settings import Status, get_thumb
from .utils import (
@@ -49,6 +50,7 @@ from .utils import (
check_or_create_dir,
copy,
get_mime,
get_mod_date,
is_valid_html5_video,
read_markdown,
url_from_path,
@@ -57,6 +59,10 @@ from .video import process_video
from .writer import AlbumListPageWriter, AlbumPageWriter
# metadata cache
CACHE = None
class Media:
"""Base Class for media files.
@@ -202,13 +208,18 @@ class Media:
descfile = splitext(self.src_path)[0] + '.md'
if isfile(descfile):
meta = read_markdown(descfile)
meta = None
if CACHE:
meta = CACHE.read_dict(descfile)
if not meta:
meta = read_markdown(descfile)
if CACHE:
CACHE.write_dict(descfile, meta)
for key, val in meta.items():
setattr(self, key, val)
def _get_file_date(self):
stat = os.stat(self.src_path)
return datetime.fromtimestamp(stat.st_mtime)
return datetime.fromtimestamp(get_mod_date(self.src_path))
class Image(Media):
@@ -247,7 +258,15 @@ class Image(Media):
def _get_metadata(self):
super()._get_metadata()
self.file_metadata = get_image_metadata(self.src_path)
meta = None
if CACHE:
meta = CACHE.read_dict(self.src_path)
if not meta:
meta = get_image_metadata(self.src_path)
if CACHE:
CACHE.write_dict(self.src_path, meta)
self.file_metadata = meta
# If a title or description hasn't been obtained by other means, look
# for the information in IPTC fields
@@ -407,7 +426,13 @@ class Album:
self.title = os.path.basename(self.path if self.path != '.' else self.src_path)
if isfile(descfile):
meta = read_markdown(descfile)
meta = None
if CACHE:
meta = CACHE.read_dict(descfile)
if not meta:
meta = read_markdown(descfile)
if CACHE:
CACHE.write_dict(descfile, meta)
for key, val in meta.items():
setattr(self, key, val)
@@ -640,6 +665,7 @@ class Album:
class Gallery:
def __init__(self, settings, ncpu=None, quiet=False):
global CACHE
self.settings = settings
self.logger = logging.getLogger(__name__)
self.stats = defaultdict(int)
@@ -664,43 +690,46 @@
)
self.progressbar_target = None if show_progress else Devnull()
for path, dirs, files in os.walk(src_path, followlinks=True, topdown=False):
if show_progress:
print("\rCollecting albums " + next(progressChars), end="")
relpath = os.path.relpath(path, src_path)
# Test if the directory match the ignore_dirs settings
if ignore_dirs and any(
fnmatch.fnmatch(relpath, ignore) for ignore in ignore_dirs
):
self.logger.info('Ignoring %s', relpath)
continue
# Remove files that match the ignore_files settings
if ignore_files:
files_path = {join(relpath, f) for f in files}
for ignore in ignore_files:
files_path -= set(fnmatch.filter(files_path, ignore))
self.logger.debug('Files before filtering: %r', files)
files = [os.path.split(f)[1] for f in files_path]
self.logger.debug('Files after filtering: %r', files)
# Remove sub-directories that have been ignored in a previous
# iteration (as topdown=False, sub-directories are processed before
# their parent
for d in dirs[:]:
path = join(relpath, d) if relpath != '.' else d
if path not in albums.keys():
dirs.remove(d)
album = Album(relpath, settings, dirs, files, self)
if not album.medias and not album.albums:
self.logger.info('Skip empty album: %r', album)
else:
album.create_output_directories()
albums[relpath] = album
with Cache(settings) as cache:
CACHE = cache
for path, dirs, files in os.walk(src_path, followlinks=True, topdown=False):
if show_progress:
print("\rCollecting albums " + next(progressChars), end="")
relpath = os.path.relpath(path, src_path)
# Test if the directory match the ignore_dirs settings
if ignore_dirs and any(
fnmatch.fnmatch(relpath, ignore) for ignore in ignore_dirs
):
self.logger.info('Ignoring %s', relpath)
continue
# Remove files that match the ignore_files settings
if ignore_files:
files_path = {join(relpath, f) for f in files}
for ignore in ignore_files:
files_path -= set(fnmatch.filter(files_path, ignore))
self.logger.debug('Files before filtering: %r', files)
files = [os.path.split(f)[1] for f in files_path]
self.logger.debug('Files after filtering: %r', files)
# Remove sub-directories that have been ignored in a previous
# iteration (as topdown=False, sub-directories are processed before
# their parent
for d in dirs[:]:
path = join(relpath, d) if relpath != '.' else d
if path not in albums.keys():
dirs.remove(d)
album = Album(relpath, settings, dirs, files, self)
if not album.medias and not album.albums:
self.logger.info('Skip empty album: %r', album)
else:
album.create_output_directories()
albums[relpath] = album
CACHE = None
if show_progress:
print("\rCollecting albums, done.")

1
sigal/settings.py

@@ -30,6 +30,7 @@ _DEFAULT_CONFIG = {
'albums_sort_attr': 'name',
'albums_sort_reverse': False,
'autorotate_images': True,
'cache': None,
'colorbox_column_size': 3,
'copy_exif_data': False,
'datetime_format': '%c',

4
sigal/templates/sigal.conf.py

@@ -19,6 +19,10 @@ source = 'pictures'
# `sigal build` command (default: '_build')
# destination = '_build'
# Cache file. Can be set here or as an argument to the `sigal build` command.
# Uses sqlite3 to cache markdown contents for faster re-processing.
# cache = None
# Theme :
# - colorbox (default), galleria, photoswipe, or the path to a custom theme
# directory

7
sigal/utils.py

@@ -18,6 +18,7 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
from functools import lru_cache
import os
import shutil
from urllib.parse import quote
@@ -64,6 +65,12 @@ def check_or_create_dir(path):
os.makedirs(path)
@lru_cache(maxsize=1024)
def get_mod_date(path):
    """Return the modification time of *path* (seconds), memoized via LRU."""
    return os.stat(path).st_mtime
def url_from_path(path):
"""Transform path to url, converting backslashes to slashes if needed."""

94
tests/test_cache.py

@@ -0,0 +1,94 @@
import os
from sigal.utils import read_markdown
from sigal.cache import Cache
# Directory containing this test module.
CURRENT_DIR = os.path.dirname(__file__)
# Bundled sample gallery used as fixture data.
SAMPLE_DIR = os.path.join(CURRENT_DIR, 'sample')
# A markdown metadata file from the sample gallery, used as the cache key.
TEST_META = os.path.join(SAMPLE_DIR, 'pictures/dir1/test1/11.md')
def test_no_cache(tmpdir):
    """With no 'cache' path configured, reads and writes are no-ops."""
    with Cache({}) as cache:
        assert cache.read(TEST_META) is None
        cache.write(TEST_META, 'testing')
        assert cache.read(TEST_META) is None

    with Cache({}) as cache:
        assert cache.read_dict(TEST_META) is None
        cache.write_dict(TEST_META, {'test': True})
        assert cache.read_dict(TEST_META) is None
def test_cache(tmpdir):
    """String values round-trip through the cache and survive a reload."""
    settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')}
    payload = 'testing'

    with Cache(settings) as cache:
        assert cache.read(TEST_META) is None
        cache.write(TEST_META, payload)
        assert cache.read(TEST_META) == payload

    # The cache must have been persisted to disk and be reloadable.
    assert os.path.isfile(settings['cache'])
    with Cache(settings) as cache:
        assert cache.read(TEST_META) == payload
def test_cache_dict(tmpdir):
    """Dict values (parsed markdown metadata) round-trip through the cache."""
    settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')}
    meta = read_markdown(TEST_META)

    with Cache(settings) as cache:
        assert cache.read_dict(TEST_META) is None
        cache.write_dict(TEST_META, meta)
        assert cache.read_dict(TEST_META) == meta

    # The cache must have been persisted to disk and be reloadable.
    assert os.path.isfile(settings['cache'])
    with Cache(settings) as cache:
        assert cache.read_dict(TEST_META) == meta
def test_invalid_cache(tmpdir):
    """A corrupt cache file is deleted and processing continues normally."""
    settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')}
    with open(settings['cache'], 'w') as fp:
        fp.write('invalid')

    with Cache(settings) as cache:
        # The bad file was discarded, so the cache starts out empty.
        assert cache.read(TEST_META) is None
        cache.write(TEST_META, 'testing')
        assert cache.read(TEST_META) == 'testing'

    # A fresh, valid cache file was written in its place.
    assert os.path.isfile(settings['cache'])
    with Cache(settings) as cache:
        assert cache.read(TEST_META) == 'testing'
def test_cache_exception(tmpdir):
    """An exception inside the context must prevent saving the cache."""
    settings = {'cache': os.path.join(tmpdir, 'cache.sqlite')}
    try:
        with Cache(settings) as cache:
            assert cache.read(TEST_META) is None
            cache.write(TEST_META, 'testing')
            assert cache.read(TEST_META) == 'testing'
            raise RuntimeError()
    except RuntimeError:
        pass

    # Nothing was flushed to disk, so a fresh cache comes up empty.
    assert not os.path.isfile(settings['cache'])
    with Cache(settings) as cache:
        assert cache.read(TEST_META) is None

15
tests/test_utils.py

@@ -1,5 +1,6 @@
import os
from pathlib import Path
from time import sleep
from sigal import utils
@@ -50,6 +51,20 @@ def test_check_or_create_dir(tmpdir):
assert os.path.isdir(path)
def test_get_mod_date(tmp_path):
    """get_mod_date() returns a cached mtime until the LRU cache is cleared."""
    path = tmp_path / 'foo'
    path.touch()
    start = utils.get_mod_date(str(path))

    # Bump the mtime explicitly with os.utime instead of sleep()+touch():
    # this keeps the test fast and immune to coarse filesystem timestamp
    # resolution (1-2s on some filesystems), which made the original flaky.
    os.utime(str(path), (start + 10, start + 10))
    assert utils.get_mod_date(str(path)) == start  # cache is working

    utils.get_mod_date.cache_clear()
    end = utils.get_mod_date(str(path))
    assert end != start
    assert end > start
def test_url_from_path():
assert utils.url_from_path(os.sep.join(['foo', 'bar'])) == 'foo/bar'

Loading…
Cancel
Save