Cache

arkindex_worker.cache

Database mappings and helper methods for the experimental worker caching feature.

On methods that support caching, reads are served from the local database, and writes go to both the Arkindex API and the database, reducing network usage.
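
For example, once the cache database has been initialised (see init_cache_db below), cached rows can be read with ordinary Peewee queries instead of API calls. This is a minimal sketch; the "page" element type is only an illustration.

from arkindex_worker.cache import CachedElement

# Reads are served by the local SQLite cache through Peewee queries
pages = CachedElement.select().where(CachedElement.type == "page")
for element in pages:
    print(element.id, element.type)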

Classes

JSONField

Bases: Field

A Peewee field that stores a JSON payload as a string and parses it automatically.

Version

Bases: Model

Cache version table, used to warn about incompatible cache databases when a worker uses an outdated version of base-worker.

CachedImage

Bases: Model

Cache image table

CachedElement

Bases: Model

Cache element table

Functions
open_image
open_image(*args, max_size=None, **kwargs)

Open this element’s image as a Pillow image. This does not crop the image to the element’s polygon. IIIF servers with maxWidth, maxHeight or maxArea restrictions on image size are not supported.

Parameters:

| Name     | Type          | Description                                                      | Default |
| -------- | ------------- | ---------------------------------------------------------------- | ------- |
| *args    |               | Positional arguments passed to arkindex_worker.image.open_image  | ()      |
| max_size | Optional[int] | Subresolution of the image.                                       | None    |
| **kwargs |               | Keyword arguments passed to arkindex_worker.image.open_image     | {}      |

Returns:

| Type  | Description     |
| ----- | --------------- |
| Image | A Pillow image. |

Raises:

| Type       | Description                                                |
| ---------- | ---------------------------------------------------------- |
| ValueError | When this element does not have an image ID or a polygon. |

Source code in arkindex_worker/cache.py
def open_image(self, *args, max_size: Optional[int] = None, **kwargs) -> Image:
    """
    Open this element's image as a Pillow image.
    This does not crop the image to the element's polygon.
    IIIF servers with maxWidth, maxHeight or maxArea restrictions on image size are not supported.

    :param *args: Positional arguments passed to [arkindex_worker.image.open_image][]
    :param max_size: Subresolution of the image.
    :param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][]
    :raises ValueError: When this element does not have an image ID or a polygon.
    :return: A Pillow image.
    """
    from arkindex_worker.image import open_image, polygon_bounding_box

    if not self.image_id or not self.polygon:
        raise ValueError(f"Element {self.id} has no image")

    # Always fetch the image from the bounding box when size differs from full image
    bounding_box = polygon_bounding_box(self.polygon)
    if (
        bounding_box.width != self.image.width
        or bounding_box.height != self.image.height
    ):
        box = f"{bounding_box.x},{bounding_box.y},{bounding_box.width},{bounding_box.height}"
    else:
        box = "full"

    if max_size is None:
        resize = "full"
    else:
        # Do not resize for polygons that do not exactly match the images
        # as the resize is made directly by the IIIF server using the box parameter
        if (
            bounding_box.width != self.image.width
            or bounding_box.height != self.image.height
        ):
            resize = "full"

        # Do not resize when the image is below the maximum size
        elif self.image.width <= max_size and self.image.height <= max_size:
            resize = "full"
        else:
            ratio = max_size / max(self.image.width, self.image.height)
            new_width, new_height = int(self.image.width * ratio), int(
                self.image.height * ratio
            )
            resize = f"{new_width},{new_height}"

    url = self.image.url
    if not url.endswith("/"):
        url += "/"

    return open_image(
        f"{url}{box}/{resize}/0/default.jpg",
        *args,
        rotation_angle=self.rotation_angle,
        mirrored=self.mirrored,
        **kwargs,
    )
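
As a usage sketch, a cached element's image can be opened and downscaled in one call. The element ID below is a placeholder, not a real identifier.

from arkindex_worker.cache import CachedElement

# Placeholder ID: look up an element already present in the cache
element = CachedElement.get(CachedElement.id == "11111111-1111-1111-1111-111111111111")

# Fetch the element's image through IIIF, capped at 1000 pixels on its largest side
image = element.open_image(max_size=1000)
image.save("element.jpg")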

CachedTranscription

Bases: Model

Cache transcription table

CachedClassification

Bases: Model

Cache classification table

CachedEntity

Bases: Model

Cache entity table

CachedTranscriptionEntity

Bases: Model

Cache transcription entity table

Functions

init_cache_db

init_cache_db(path)

Create the cache database at the given path

Parameters:

| Name | Type | Description                              | Default  |
| ---- | ---- | ---------------------------------------- | -------- |
| path | str  | Where the new database should be created | required |
Source code in arkindex_worker/cache.py
def init_cache_db(path: str):
    """
    Create the cache database on the given path
    :param path: Where the new database should be created
    """
    db.init(
        path,
        pragmas={
            # SQLite ignores foreign keys and check constraints by default!
            "foreign_keys": 1,
            "ignore_check_constraints": 0,
        },
    )
    db.connect()
    logger.info(f"Connected to cache on {path}")

create_tables

create_tables()

Creates the tables in the cache DB only if they do not already exist.

Source code in arkindex_worker/cache.py
def create_tables():
    """
    Creates the tables in the cache DB only if they do not already exist.
    """
    db.create_tables(MODELS)

create_version_table

create_version_table()

Creates the Version table in the cache DB. This step must be kept separate from the creation of the other tables, since the table and its one and only Version entry should only be created when the cache is built from scratch.

Source code in arkindex_worker/cache.py
def create_version_table():
    """
    Creates the Version table in the cache DB.
    This step must be independent from other tables creation since we only
    want to create the table and add the one and only Version entry when the
    cache is created from scratch.
    """
    db.create_tables([Version])
    Version.create(version=SQL_VERSION)
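
Putting the setup helpers together, a fresh cache is typically created in the following order; the database filename is an example.

from arkindex_worker.cache import init_cache_db, create_tables, create_version_table

# Create and connect to a new cache database
init_cache_db("db.sqlite")

# Create the data tables, then record the cache version in its own table
create_tables()
create_version_table()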

check_version

check_version(cache_path)

Check that the cache database uses the expected cache version (SQL_VERSION)

Parameters:

| Name       | Type             | Description                          | Default  |
| ---------- | ---------------- | ------------------------------------ | -------- |
| cache_path | Union[str, Path] | Path towards a local SQLite database | required |
Source code in arkindex_worker/cache.py
def check_version(cache_path: Union[str, Path]):
    """
    Check the validity of the SQLite version

    :param cache_path: Path towards a local SQLite database
    """
    with SqliteDatabase(cache_path) as provided_db:
        with provided_db.bind_ctx([Version]):
            try:
                version = Version.get().version
            except OperationalError:
                version = None

            assert (
                version == SQL_VERSION
            ), f"The SQLite database {cache_path} does not have the correct cache version, it should be {SQL_VERSION}"
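
For example, a worker can validate a parent task's database before reusing it; the path below is a placeholder.

from arkindex_worker.cache import check_version

# Raises an AssertionError when the database was built with another cache version
try:
    check_version("/data/parent_task/db.sqlite")
except AssertionError as e:
    print(f"Incompatible cache: {e}")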

retrieve_parents_cache_path

retrieve_parents_cache_path(
    parent_ids, data_dir="/data", chunk=None
)

Retrieve the paths to the given parents' cache databases in the data directory

Parameters:

| Name       | Type | Description                                                | Default  |
| ---------- | ---- | ---------------------------------------------------------- | -------- |
| parent_ids | list | List of element IDs to search                              | required |
| data_dir   | str  | Base folder to look for the databases in                   | '/data'  |
| chunk      | int  | Index of the chunk of the db that might contain the paths  | None     |

Returns:

| Type | Description                     |
| ---- | ------------------------------- |
| list | The corresponding list of paths |

Source code in arkindex_worker/cache.py
def retrieve_parents_cache_path(
    parent_ids: list, data_dir: str = "/data", chunk: int = None
) -> list:
    """
    Retrieve the paths to the given parents' cache databases in the data directory
    :param parent_ids: List of element IDs to search
    :param data_dir: Base folder to look for the databases in
    :param chunk: Index of the chunk of the db that might contain the paths
    :return: The corresponding list of paths
    """
    assert isinstance(parent_ids, list)
    assert os.path.isdir(data_dir)

    # Handle possible chunk in parent task name
    # This is needed to support the init_elements databases
    filenames = [
        "db.sqlite",
    ]
    if chunk is not None:
        filenames.append(f"db_{chunk}.sqlite")

    # Find all the paths for these databases
    return list(
        filter(
            lambda p: os.path.isfile(p),
            [
                os.path.join(data_dir, parent, name)
                for parent in parent_ids
                for name in filenames
            ],
        )
    )
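
A sketch of a typical call, with placeholder parent task IDs standing in for real ones:

from arkindex_worker.cache import retrieve_parents_cache_path

# Looks for /data/<parent_id>/db.sqlite, plus db_1.sqlite since a chunk index is given
paths = retrieve_parents_cache_path(
    ["init_elements_task_id", "previous_task_id"],
    data_dir="/data",
    chunk=1,
)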

merge_parents_cache

merge_parents_cache(paths, current_database)

Merge all the potential parent tasks' databases into the existing local one

Parameters:

| Name             | Type | Description                          | Default  |
| ---------------- | ---- | ------------------------------------ | -------- |
| paths            | list | Paths to the parent cache databases  | required |
| current_database | str  | Path to the current database         | required |
Source code in arkindex_worker/cache.py
def merge_parents_cache(paths: list, current_database: str):
    """
    Merge all the potential parent tasks' databases into the existing local one
    :param paths: Paths to the parent cache databases
    :param current_database: Path to the current database
    """
    assert os.path.exists(current_database)

    if not paths:
        logger.info("No parents cache to use")
        return

    # Open a connection on current database
    connection = sqlite3.connect(current_database)
    cursor = connection.cursor()

    # Merge each table into the local database
    for idx, path in enumerate(paths):
        # Check that the parent cache uses a compatible version
        check_version(path)

        with SqliteDatabase(path) as source:
            with source.bind_ctx(MODELS):
                source.create_tables(MODELS)

        logger.info(f"Merging parent db {path} into {current_database}")
        statements = [
            "PRAGMA page_size=80000;",
            "PRAGMA synchronous=OFF;",
            f"ATTACH DATABASE '{path}' AS source_{idx};",
            f"REPLACE INTO images SELECT * FROM source_{idx}.images;",
            f"REPLACE INTO elements SELECT * FROM source_{idx}.elements;",
            f"REPLACE INTO transcriptions SELECT * FROM source_{idx}.transcriptions;",
            f"REPLACE INTO classifications SELECT * FROM source_{idx}.classifications;",
        ]

        for statement in statements:
            cursor.execute(statement)
        connection.commit()
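
Combined with retrieve_parents_cache_path, merging the parent caches into the current database typically looks like the sketch below; all paths and IDs are placeholders.

from arkindex_worker.cache import merge_parents_cache, retrieve_parents_cache_path

# Find the parent tasks' databases, then merge their rows into the local cache
parent_paths = retrieve_parents_cache_path(["parent_task_id"], data_dir="/data")
merge_parents_cache(parent_paths, "/data/current_task/db.sqlite")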