Browse Source

Merge pull request #443 from dsschult/metadata_caching

Extend caching plugin to handle markdown and iptc metadata
pull/456/head
Simon Conseil 4 years ago committed by GitHub
parent
commit
045509647f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 125
      sigal/gallery.py
  2. 127
      sigal/plugins/extended_caching.py
  3. 7
      sigal/utils.py
  4. 93
      tests/test_extended_caching.py
  5. 9
      tests/test_image.py

125
sigal/gallery.py

@ -51,6 +51,7 @@ from .utils import (
check_or_create_dir,
copy,
get_mime,
get_mod_date,
is_valid_html5_video,
read_markdown,
url_from_path,
@ -96,12 +97,6 @@ class Media:
self.logger = logging.getLogger(__name__)
self.file_metadata = None
self._get_metadata()
# default: title is the filename
if not self.title:
self.title = self.basename
signals.media_initialized.send(self)
def __repr__(self):
@ -190,27 +185,45 @@ class Media:
return
return url_from_path(self.thumb_name)
def _get_metadata(self):
"""Get image metadata from filename.md: title, description, meta."""
self.description = ''
@cached_property
def description(self):
    """Description of this media, read from the ``<imagename>.md`` file.

    Empty string when the Markdown metadata provides no description.
    """
    md_meta = self.markdown_metadata
    return md_meta.get('description', '')
self.title = ''
"""Title extracted from the Markdown <imagename>.md file."""
@cached_property
def title(self):
    """Title from the Markdown metadata, or the filename as fallback."""
    md_title = self.markdown_metadata.get('title', '')
    if md_title:
        return md_title
    return self.basename
self.meta = {}
@cached_property
def meta(self):
    """Additional key/value metadata from the ``<imagename>.md`` file."""
    md_meta = self.markdown_metadata
    return md_meta.get('meta', {})
descfile = splitext(self.src_path)[0] + '.md'
if isfile(descfile):
meta = read_markdown(descfile)
for key, val in meta.items():
setattr(self, key, val)
@cached_property
def markdown_metadata(self):
    """Metadata parsed from filename.md (title, description, meta), cached."""
    parsed = self._get_markdown_metadata()
    return parsed
@property
def markdown_metadata_filepath(self):
    """Path of the Markdown sidecar: the source path with a ``.md`` suffix."""
    root, _ext = splitext(self.src_path)
    return root + '.md'
def _get_markdown_metadata(self):
"""Get metadata from filename.md."""
meta = {'title': '', 'description': '', 'meta': {}}
if isfile(self.markdown_metadata_filepath):
meta.update(read_markdown(self.markdown_metadata_filepath))
return meta
@cached_property
def file_metadata(self):
    """Type-specific metadata; the base media type provides none."""
    metadata = {}
    return metadata
def _get_file_date(self):
    """Return the source file's modification time as a naive datetime.

    Uses the LRU-cached ``get_mod_date`` helper instead of a direct
    ``os.stat`` call, so repeated lookups of the same path are cheap.
    """
    return datetime.fromtimestamp(get_mod_date(self.src_path))
class Image(Media):
@ -247,21 +260,23 @@ class Image(Media):
else None
)
def _get_metadata(self):
super()._get_metadata()
self.file_metadata = get_image_metadata(self.src_path)
@cached_property
def file_metadata(self):
    """Image file metadata (Exif and IPTC)"""
    extracted = get_image_metadata(self.src_path)
    return extracted
def _get_markdown_metadata(self):
    """Get metadata from filename.md, with IPTC fields as fallback.

    When the Markdown sidecar supplies no title or description, fill
    them in from the image's IPTC metadata (empty string if absent).
    """
    meta = super()._get_markdown_metadata()
    # If a title or description hasn't been obtained by other means, look
    # for the information in IPTC fields
    if not meta['title']:
        meta['title'] = self.file_metadata['iptc'].get('title', '')
    if not meta['description']:
        meta['description'] = self.file_metadata['iptc'].get('description', '')
    return meta
@cached_property
def raw_exif(self):
@ -358,7 +373,6 @@ class Album:
self.dst_path = join(settings['destination'], path)
self.logger = logging.getLogger(__name__)
self._get_metadata()
# optionally add index.html to the URLs
self.url_ext = self.output_file if settings['index_in_url'] else ''
@ -411,27 +425,42 @@ class Album:
def __iter__(self):
return iter(self.medias)
def _get_metadata(self):
"""Get album metadata from `description_file` (`index.md`):
-> title, thumbnail image, description
@cached_property
def description(self):
    """Album description read from the Markdown index.md file."""
    md_meta = self.markdown_metadata
    return md_meta.get('description', '')
"""
descfile = join(self.src_path, self.description_file)
self.description = ''
self.meta = {}
# default: get title from directory name
self.title = os.path.basename(self.path if self.path != '.' else self.src_path)
@cached_property
def title(self):
    """Album title from index.md, else the directory's base name."""
    md_title = self.markdown_metadata.get('title', '')
    if md_title:
        return md_title
    fallback = self.path if self.path != '.' else self.src_path
    return os.path.basename(fallback)
if isfile(descfile):
meta = read_markdown(descfile)
for key, val in meta.items():
setattr(self, key, val)
@cached_property
def meta(self):
    """Additional key/value metadata from the Markdown index.md file."""
    md_meta = self.markdown_metadata
    return md_meta.get('meta', {})
@cached_property
def author(self):
    """Author from the Markdown index.md metadata, else from the settings.

    Falls back to ``settings['author']`` (or None) when index.md declares
    no author.
    """
    try:
        # metadata values are sequences; take the first entry
        return self.meta['author'][0]
    except KeyError:
        return self.settings.get('author')
@property
def markdown_metadata_filepath(self):
    """Location of the album's description file inside its source dir."""
    md_path = join(self.src_path, self.description_file)
    return md_path
@cached_property
def markdown_metadata(self):
    """Metadata parsed from the description file: title, description, meta.

    Always returns a dict with those three keys; values come from the
    Markdown file when it exists, empty defaults otherwise.
    """
    result = {'title': '', 'description': '', 'meta': {}}
    md_path = self.markdown_metadata_filepath
    if isfile(md_path):
        result.update(read_markdown(md_path))
    return result
def create_output_directories(self):
"""Create output directories for thumbnails and original images."""

127
sigal/plugins/extended_caching.py

@ -22,63 +22,130 @@
2.5s instead of 30s)
This plugin allows extended caching, which is useful for large galleries. Once
a gallery has been built it caches the exif-data of the contained images in the
gallery target folder. Before the next run it restores them so that the image
does not have to be parsed again. For large galleries this can speed up the
creation of index files dramatically.
a gallery has been built it caches all metadata for all media (markdown, exif,
iptc) in the gallery target folder. Before the next run it restores them so
that the image and metadata files do not have to be parsed again. For large
galleries this can speed up the creation of index files dramatically.
"""
import logging
import os
import pickle
from sigal import signals
from .. import signals
from ..utils import get_mod_date
logger = logging.getLogger(__name__)
def load_metadata(album):
    """Load the cached metadata of an album and all of its media.

    Cache entries are applied only when the corresponding source file has
    not been modified since the entry was written (mod-date comparison),
    so stale cache data is never restored.
    """
    if not hasattr(album.gallery, "metadataCache"):
        _restore_cache(album.gallery)
    cache = album.gallery.metadataCache

    # load album metadata
    key = os.path.join(album.path, '_index')
    if key in cache:
        data = cache[key]
        # check if the album's markdown file has changed
        try:
            mod_date = int(get_mod_date(album.markdown_metadata_filepath))
        except FileNotFoundError:
            pass
        else:
            if data.get('mod_date', -1) >= mod_date:
                # cache is good
                if 'markdown_metadata' in data:
                    album.markdown_metadata = data['markdown_metadata']

    # load media metadata
    for media in album.medias:
        key = os.path.join(media.path, media.dst_filename)
        if key in cache:
            data = cache[key]
            # check if the source file has changed
            try:
                mod_date = int(get_mod_date(media.src_path))
            except FileNotFoundError:
                continue
            if data.get('mod_date', -1) < mod_date:
                continue  # file_metadata needs updating
            if 'file_metadata' in data:
                media.file_metadata = data['file_metadata']
            if 'exif' in data:
                media.exif = data['exif']
            # check if the markdown sidecar file has changed
            try:
                mod_date = int(get_mod_date(media.markdown_metadata_filepath))
            except FileNotFoundError:
                continue
            if data.get('meta_mod_date', -1) < mod_date:
                continue  # markdown_metadata needs updating
            if 'markdown_metadata' in data:
                media.markdown_metadata = data['markdown_metadata']
def _restore_cache(gallery):
"""Restores the exif data cache from the cache file"""
cachePath = os.path.join(gallery.settings["destination"], ".exif_cache")
"""Restores the metadata cache from the cache file"""
cachePath = os.path.join(gallery.settings["destination"], ".metadata_cache")
try:
if os.path.exists(cachePath):
with open(cachePath, "rb") as cacheFile:
gallery.exifCache = pickle.load(cacheFile)
logger.debug("Loaded cache with %d entries", len(gallery.exifCache))
gallery.metadataCache = pickle.load(cacheFile)
logger.debug("Loaded cache with %d entries", len(gallery.metadataCache))
else:
gallery.exifCache = {}
gallery.metadataCache = {}
except Exception as e:
logger.warn("Could not load cache: %s", e)
gallery.exifCache = {}
logger.warning("Could not load cache: %s", e)
gallery.metadataCache = {}
def save_cache(gallery):
"""Stores the exif data of all images in the gallery"""
if hasattr(gallery, "exifCache"):
cache = gallery.exifCache
if hasattr(gallery, "metadataCache"):
cache = gallery.metadataCache
else:
cache = gallery.exifCache = {}
cache = gallery.metadataCache = {}
for album in gallery.albums.values():
for image in album.images:
cache[os.path.join(image.path, image.dst_filename)] = image.exif
cachePath = os.path.join(gallery.settings["destination"], ".exif_cache")
try:
data = {
'mod_date': int(get_mod_date(album.markdown_metadata_filepath)),
'markdown_metadata': album.markdown_metadata,
}
cache[os.path.join(album.path, '_index')] = data
except FileNotFoundError:
pass
for media in album.medias:
data = {}
try:
mod_date = int(get_mod_date(media.src_path))
except FileNotFoundError:
continue
else:
data['mod_date'] = mod_date
data['file_metadata'] = media.file_metadata
if hasattr(media, 'exif'):
data['exif'] = media.exif
try:
meta_mod_date = int(get_mod_date(media.markdown_metadata_filepath))
except FileNotFoundError:
pass
else:
data['meta_mod_date'] = meta_mod_date
data['markdown_metadata'] = media.markdown_metadata
cache[os.path.join(media.path, media.dst_filename)] = data
cachePath = os.path.join(gallery.settings["destination"], ".metadata_cache")
if len(cache) == 0:
if os.path.exists(cachePath):
@ -88,7 +155,7 @@ def save_cache(gallery):
try:
with open(cachePath, "wb") as cacheFile:
pickle.dump(cache, cacheFile)
logger.debug("Stored cache with %d entries", len(gallery.exifCache))
logger.debug("Stored cache with %d entries", len(gallery.metadataCache))
except Exception as e:
logger.warn("Could not store cache: %s", e)
os.remove(cachePath)
@ -96,4 +163,4 @@ def save_cache(gallery):
def register(settings):
    """Hook the plugin into sigal: save on gallery build, load per album."""
    signals.gallery_build.connect(save_cache)
    signals.album_initialized.connect(load_metadata)

7
sigal/utils.py

@ -18,6 +18,7 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
from functools import lru_cache
import os
import shutil
from urllib.parse import quote
@ -64,6 +65,12 @@ def check_or_create_dir(path):
os.makedirs(path)
@lru_cache(maxsize=1024)
def get_mod_date(path):
    """Modification timestamp of *path*, memoized with a bounded LRU cache."""
    mtime = os.path.getmtime(path)
    return mtime
def url_from_path(path):
"""Transform path to url, converting backslashes to slashes if needed."""

93
tests/test_extended_caching.py

@ -1,7 +1,7 @@
import os
import pickle
from sigal.gallery import Gallery
from sigal.gallery import Gallery, Image
from sigal.plugins import extended_caching
CURRENT_DIR = os.path.dirname(__file__)
@ -12,16 +12,47 @@ def test_save_cache(settings, tmpdir):
gal = Gallery(settings, ncpu=1)
extended_caching.save_cache(gal)
cachePath = os.path.join(settings['destination'], ".exif_cache")
cachePath = os.path.join(settings['destination'], ".metadata_cache")
assert os.path.isfile(cachePath)
with open(cachePath, "rb") as cacheFile:
cache = pickle.load(cacheFile)
assert cache["exifTest/21.jpg"] == gal.albums["exifTest"].medias[0].exif
assert cache["exifTest/22.jpg"] == gal.albums["exifTest"].medias[1].exif
assert cache["exifTest/noexif.png"] == gal.albums["exifTest"].medias[2].exif
# test exif
album = gal.albums["exifTest"]
cache_img = cache["exifTest/21.jpg"]
assert cache_img["exif"] == album.medias[0].exif
assert 'markdown_metadata' not in cache_img
assert cache_img["file_metadata"] == album.medias[0].file_metadata
cache_img = cache["exifTest/22.jpg"]
assert cache_img["exif"] == album.medias[1].exif
assert 'markdown_metadata' not in cache_img
assert cache_img["file_metadata"] == album.medias[1].file_metadata
cache_img = cache["exifTest/noexif.png"]
assert cache_img["exif"] == album.medias[2].exif
assert 'markdown_metadata' not in cache_img
assert cache_img["file_metadata"] == album.medias[2].file_metadata
# test iptc and md
album = gal.albums["iptcTest"]
assert cache["iptcTest/_index"]["markdown_metadata"] == album.markdown_metadata
cache_img = cache["iptcTest/1.jpg"]
assert cache_img["file_metadata"] == album.medias[0].file_metadata
assert 'markdown_metadata' not in cache_img
cache_img = cache["iptcTest/2.jpg"]
assert cache_img["markdown_metadata"] == album.medias[1].markdown_metadata
# test if file disappears
gal.albums["exifTest"].medias.append(Image("foooo.jpg", "exifTest", settings))
extended_caching.save_cache(gal)
with open(cachePath, "rb") as cacheFile:
cache = pickle.load(cacheFile)
assert "exifTest/foooo.jpg" not in cache
def test_restore_cache(settings, tmpdir):
@ -30,22 +61,64 @@ def test_restore_cache(settings, tmpdir):
gal2 = Gallery(settings, ncpu=1)
extended_caching.save_cache(gal1)
extended_caching._restore_cache(gal2)
assert gal1.exifCache == gal2.exifCache
assert gal1.metadataCache == gal2.metadataCache
# test bad cache
cachePath = os.path.join(settings['destination'], ".metadata_cache")
with open(cachePath, 'w') as f:
f.write('bad pickle file')
extended_caching._restore_cache(gal2)
assert gal2.metadataCache == {}
def test_load_exif(settings, tmpdir):
    """Exif values from the cache override media when mod_date is newer."""
    settings['destination'] = str(tmpdir)
    gal1 = Gallery(settings, ncpu=1)
    gal1.albums["exifTest"].medias[2].exif = "blafoo"

    # set mod_date in future, to force these values
    gal1.metadataCache = {
        "exifTest/21.jpg": {"exif": "Foo", "mod_date": 100000000000},
        "exifTest/22.jpg": {"exif": "Bar", "mod_date": 100000000000},
    }

    extended_caching.load_metadata(gal1.albums["exifTest"])
    assert gal1.albums["exifTest"].medias[0].exif == "Foo"
    assert gal1.albums["exifTest"].medias[1].exif == "Bar"
    assert gal1.albums["exifTest"].medias[2].exif == "blafoo"

    # check if setting gallery.metadataCache works
    gal2 = Gallery(settings, ncpu=1)
    extended_caching.save_cache(gal1)
    extended_caching.load_metadata(gal2.albums["exifTest"])
    assert gal2.albums["exifTest"].medias[0].exif == "Foo"
    assert gal2.albums["exifTest"].medias[1].exif == "Bar"
    assert gal2.albums["exifTest"].medias[2].exif == "blafoo"
def test_load_metadata_missing(settings, tmpdir):
    """Missing files and stale entries must be skipped without raising."""
    settings['destination'] = str(tmpdir)
    gallery = Gallery(settings, ncpu=1)
    extended_caching.save_cache(gallery)
    assert gallery.metadataCache

    # test if file disappears
    gallery.albums["exifTest"].medias.append(
        Image("foooo.jpg", "exifTest", settings)
    )

    # set mod_date to -1 to force cache update
    gallery.metadataCache = {
        "exifTest/_index": {"mod_date": -1},
        "exifTest/21.jpg": {"exif": "Foo", "mod_date": -1},
        "exifTest/foooo.jpg": {"exif": "Foo"},
        "dir1/test2/22.jpg": {
            "exif": "Bar",
            "mod_date": 100000000000,
            "meta_mod_date": -1,
            "markdown_metadata": "Bar",
        },
    }

    # errors should all be caught
    extended_caching.load_metadata(gallery.albums["exifTest"])
    assert gallery.albums["exifTest"].medias[0].exif != "Foo"
    assert gallery.albums["exifTest"].medias[-1].exif != "Foo"

    extended_caching.load_metadata(gallery.albums["dir1/test2"])
    assert gallery.albums["dir1/test2"].medias[1].exif == "Bar"
    assert gallery.albums["dir1/test2"].medias[1].markdown_metadata != "Bar"

9
tests/test_image.py

@ -12,6 +12,7 @@ from sigal.image import (
get_exif_data,
get_exif_tags,
get_iptc_data,
get_image_metadata,
get_size,
process_image,
)
@ -241,6 +242,14 @@ def test_get_iptc_data(caplog):
assert ['IPTC Error in'] == [log.message[:13] for log in caplog.records]
def test_get_image_metadata_exceptions():
    # A nonexistent image must yield empty metadata rather than raise
    missing_name = 'bad_image.jpg'
    missing_path = os.path.join(CURRENT_DIR, 'sample', missing_name)
    result = get_image_metadata(missing_path)
    assert result == {'exif': {}, 'iptc': {}, 'size': {}}
def test_iso_speed_ratings():
data = {'ISOSpeedRatings': ()}
simple = get_exif_tags(data)

Loading…
Cancel
Save