Skip to content

Entity

arkindex_worker.worker.entity

ElementsWorker methods for entities.

Attributes

Classes

MissingEntityType

Bases: Exception

Raised when the specified entity type was not found in the corpus and the worker cannot create it.

EntityMixin

Functions
check_required_entity_types
check_required_entity_types(
    entity_types: list[str], create_missing: bool = True
)

Checks that every entity type needed is available in the corpus. Missing ones may be created automatically if needed.

Parameters:

Name Type Description Default
entity_types list[str]

Entity type names to search.

required
create_missing bool

Whether the missing types should be created. Defaults to True.

True

Raises:

Type Description
MissingEntityType

When an entity type is missing and cannot be created.

Source code in arkindex_worker/worker/entity.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@unsupported_cache
def check_required_entity_types(
    self, entity_types: list[str], create_missing: bool = True
):
    """Ensure that each requested entity type exists in the corpus.

    The corpus entity types are loaded first when ``self.entity_types`` is empty.
    Any requested type that is absent is then created through the
    ``CreateEntityType`` endpoint, unless ``create_missing`` is disabled.

    :param entity_types: Entity type names to search.
    :param create_missing: Whether the missing types should be created. Defaults to True.
    :raises MissingEntityType: When an entity type is missing and ``create_missing`` is False.
    """
    # Lazily load the entity types of the corpus when none are known yet
    if not self.entity_types:
        self.list_corpus_entity_types()

    for type_name in entity_types:
        if type_name not in self.entity_types:
            # The caller asked not to create anything: report the missing type
            if not create_missing:
                raise MissingEntityType(
                    f"Entity type `{type_name}` was not in the corpus."
                )

            # Create the type and remember its ID for later lookups
            created_type = self.api_client.request(
                "CreateEntityType",
                body={
                    "name": type_name,
                    "corpus": self.corpus_id,
                },
            )
            self.entity_types[type_name] = created_type["id"]
            logger.info(f"Created a new entity type with name `{type_name}`.")
create_entity
create_entity(
    name: str, type: str, metas=None, validated=None
)

Create an entity on the given corpus. If cache support is enabled, a CachedEntity will also be created.

Parameters:

Name Type Description Default
name str

Name of the entity.

required
type str

Type of the entity.

required
Source code in arkindex_worker/worker/entity.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def create_entity(
    self,
    name: str,
    type: str,
    metas=None,
    validated=None,
):
    """
    Create an entity in the worker's corpus through the ``CreateEntity`` endpoint.
    If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.

    :param name: Name of the entity.
    :param type: Name of the entity type. It must be known in the corpus entity types.
    :param metas: Optional dict of metadata sent with the entity. Any falsy value is replaced by an empty dict.
    :param validated: Optional boolean sent as the ``validated`` field of the entity.
    :returns: ID of the created entity, or None if the worker is in read-only mode.
    """
    assert name and isinstance(
        name, str
    ), "name shouldn't be null and should be of type str"
    assert type and isinstance(
        type, str
    ), "type shouldn't be null and should be of type str"
    if metas:
        assert isinstance(metas, dict), "metas should be of type dict"
    else:
        # Falsy values (None, empty containers, ...) all become an empty dict
        metas = {}
    assert validated is None or isinstance(
        validated, bool
    ), "validated should be of type bool"

    if self.is_read_only:
        logger.warning("Cannot create entity as this worker is in read-only mode")
        return

    # Lazily load the entity types of the corpus when none are known yet
    if not self.entity_types:
        self.list_corpus_entity_types()

    # Resolve the type name to its ID
    entity_type_id = self.entity_types.get(type)
    assert entity_type_id, f"Entity type `{type}` not found in the corpus."

    payload = {
        "name": name,
        "type_id": entity_type_id,
        "metas": metas,
        "validated": validated,
        "corpus": self.corpus_id,
        "worker_run_id": self.worker_run_id,
    }
    entity = self.api_client.request("CreateEntity", body=payload)

    if not self.use_cache:
        return entity["id"]

    # Mirror the created entity in the local cache; a cache failure is only logged
    cached_row = {
        "id": entity["id"],
        "type": type,
        "name": name,
        "validated": False if validated is None else validated,
        "metas": metas,
        "worker_run_id": self.worker_run_id,
    }
    try:
        CachedEntity.insert_many([cached_row]).execute()
    except IntegrityError as e:
        logger.warning(f"Couldn't save created entity in local cache: {e}")

    return entity["id"]
create_transcription_entity
create_transcription_entity(
    transcription: Transcription,
    entity: str,
    offset: int,
    length: int,
    confidence: float | None = None,
) -> dict[str, str | int] | None

Create a link between an existing entity and an existing transcription. If cache support is enabled, a CachedTranscriptionEntity will also be created.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entity str

UUID of the existing entity.

required
offset int

Starting position of the entity in the transcription’s text, as a 0-based index.

required
length int

Length of the entity in the transcription’s text.

required
confidence float | None

Optional confidence score between 0 and 1.

None

Returns:

Type Description
dict[str, str | int] | None

A dict as returned by the CreateTranscriptionEntity API endpoint, or None if the worker is in read-only mode.

Source code in arkindex_worker/worker/entity.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def create_transcription_entity(
    self,
    transcription: Transcription,
    entity: str,
    offset: int,
    length: int,
    confidence: float | None = None,
) -> dict[str, str | int] | None:
    """
    Link an existing entity to an existing transcription.
    If cache support is enabled, a `CachedTranscriptionEntity` will also be created.

    :param transcription: Transcription to create the entity on.
    :param entity: UUID of the existing entity.
    :param offset: Starting position of the entity in the transcription's text,
       as a 0-based index.
    :param length: Length of the entity in the transcription's text.
    :param confidence: Optional confidence score between 0 and 1.
    :returns: A dict as returned by the ``CreateTranscriptionEntity`` API endpoint,
       or None if the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"
    assert entity and isinstance(
        entity, str
    ), "entity shouldn't be null and should be of type str"
    assert (
        isinstance(offset, int) and offset >= 0
    ), "offset shouldn't be null and should be a positive integer"
    assert (
        isinstance(length, int) and length > 0
    ), "length shouldn't be null and should be a strictly positive integer"
    assert confidence is None or (
        isinstance(confidence, float) and 0 <= confidence <= 1
    ), "confidence should be null or a float in [0..1] range"

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entity as this worker is in read-only mode"
        )
        return

    payload = {
        "entity": entity,
        "length": length,
        "offset": offset,
        "worker_run_id": self.worker_run_id,
    }
    # The confidence is only sent when one was provided
    if confidence is not None:
        payload["confidence"] = confidence

    transcription_ent = self.api_client.request(
        "CreateTranscriptionEntity",
        id=transcription.id,
        body=payload,
    )

    if not self.use_cache:
        return transcription_ent

    # Mirror the created link in the local cache; a cache failure is only logged
    try:
        CachedTranscriptionEntity.create(
            transcription=transcription.id,
            entity=entity,
            offset=offset,
            length=length,
            worker_run_id=self.worker_run_id,
            confidence=confidence,
        )
    except IntegrityError as e:
        logger.warning(
            f"Couldn't save created transcription entity in local cache: {e}"
        )
    return transcription_ent
create_transcription_entities
create_transcription_entities(
    transcription: Transcription,
    entities: list[Entity],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> list[dict[str, str]]

Create multiple entities attached to a transcription in a single API request.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entities list[Entity]

List of dicts, one per element. Each dict can have the following keys: name (str) Required. Name of the entity. type_id (str) Required. ID of the EntityType of the entity. length (int) Required. Length of the entity in the transcription’s text. offset (int) Required. Starting position of the entity in the transcription’s text, as a 0-based index. confidence (float or None) Optional confidence score, between 0.0 and 1.0.

required
batch_size int

The size of each batch, which will be used to split the publication to avoid API errors.

DEFAULT_BATCH_SIZE

Returns:

Type Description
list[dict[str, str]]

List of dicts, with each dict having two keys, transcription_entity_id and entity_id, holding the UUID of each created object.

Source code in arkindex_worker/worker/entity.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
@unsupported_cache
@batch_publication
def create_transcription_entities(
    self,
    transcription: Transcription,
    entities: list[Entity],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> list[dict[str, str]]:
    """
    Create multiple entities attached to a transcription, publishing them in batches
    through the ``CreateTranscriptionEntities`` API endpoint.

    :param transcription: Transcription to create the entity on.
    :param entities: List of dicts, one per entity. Each dict can have the following keys:

        name (str)
           Required. Name of the entity.

        type_id (str)
           Required. ID of the EntityType of the entity.

        length (int)
           Required. Length of the entity in the transcription's text.

        offset (int)
           Required. Starting position of the entity in the transcription's text, as a 0-based index.

        confidence (float or None)
            Optional confidence score, between 0.0 and 1.0.

    :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.

    :return: List of dicts, with each dict having two keys, `transcription_entity_id` and `entity_id`,
        holding the UUID of each created object. Nothing is returned when the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be of type Transcription"

    assert entities and isinstance(
        entities, list
    ), "entities shouldn't be null and should be of type list"

    # Validate every entity before sending anything to the API
    for position, item in enumerate(entities):
        where = f"Entity at index {position} in entities"

        assert isinstance(item, dict), f"{where}: Should be of type dict"

        item_name = item.get("name")
        assert item_name and isinstance(
            item_name, str
        ), f"{where}: name shouldn't be null and should be of type str"

        item_type_id = item.get("type_id")
        assert item_type_id and isinstance(
            item_type_id, str
        ), f"{where}: type_id shouldn't be null and should be of type str"

        item_offset = item.get("offset")
        assert (
            isinstance(item_offset, int) and item_offset >= 0
        ), f"{where}: offset shouldn't be null and should be a positive integer"

        item_length = item.get("length")
        assert (
            isinstance(item_length, int) and item_length > 0
        ), f"{where}: length shouldn't be null and should be a strictly positive integer"

        item_confidence = item.get("confidence")
        assert item_confidence is None or (
            isinstance(item_confidence, float) and 0 <= item_confidence <= 1
        ), f"{where}: confidence should be None or a float in [0..1] range"

    # Two entities are duplicates when offset, length, name and type all match
    identity = itemgetter("offset", "length", "name", "type_id")
    distinct = {identity(item) for item in entities}
    assert len(entities) == len(distinct), "entities should be unique"

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entities in bulk as this worker is in read-only mode"
        )
        return

    # One API request per batch; the results are concatenated in batch order
    created_entities = []
    for batch in make_batches(entities, "entity", batch_size):
        response = self.api_client.request(
            "CreateTranscriptionEntities",
            id=transcription.id,
            body={
                "worker_run_id": self.worker_run_id,
                "entities": batch,
            },
        )
        created_entities.extend(response["entities"])

    return created_entities
list_transcription_entities
list_transcription_entities(
    transcription: Transcription,
    worker_version: str | bool | None = None,
    worker_run: str | bool | None = None,
)

List existing entities on a transcription. This method does not support cache.

Warns:

The following parameters are deprecated:

  • worker_version in favor of worker_run

Parameters:

Name Type Description Default
transcription Transcription

The transcription to list entities on.

required
worker_version str | bool | None

Deprecated Restrict to entities created by a worker version with this UUID. Set to False to look for manually created entities.

None
worker_run str | bool | None

Restrict to entities created by a worker run with this UUID. Set to False to look for manually created entities.

None
Source code in arkindex_worker/worker/entity.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
def list_transcription_entities(
    self,
    transcription: Transcription,
    worker_version: str | bool | None = None,
    worker_run: str | bool | None = None,
):
    """
    List existing entities on a transcription.
    This method does not support cache.

    Warns:
    ----
    The following parameters are **deprecated**:

    - `worker_version` in favor of `worker_run`

    :param transcription: The transcription to list entities on.
    :param worker_version: **Deprecated** Restrict to entities created by a worker version with this UUID. Set to False to look for manually created entities.
    :param worker_run: Restrict to entities created by a worker run with this UUID. Set to False to look for manually created entities.
    :returns: The result of paginating the ``ListTranscriptionEntities`` API endpoint with the requested filters.
    """
    query_params = {}
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"

    if worker_version is not None:
        # stacklevel=2 attributes the warning to the code calling this method,
        # so that the deprecated usage can actually be located (and is not
        # reported as coming from this library line).
        warn(
            "`worker_version` usage is deprecated. Consider using `worker_run` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        assert isinstance(
            worker_version, str | bool
        ), "worker_version should be of type str or bool"

        # True has no meaning as a filter: only False (manual entities) is allowed
        if isinstance(worker_version, bool):
            assert (
                worker_version is False
            ), "if of type bool, worker_version can only be set to False"
        query_params["worker_version"] = worker_version
    if worker_run is not None:
        assert isinstance(
            worker_run, str | bool
        ), "worker_run should be of type str or bool"
        # True has no meaning as a filter: only False (manual entities) is allowed
        if isinstance(worker_run, bool):
            assert (
                worker_run is False
            ), "if of type bool, worker_run can only be set to False"
        query_params["worker_run"] = worker_run

    return self.api_client.paginate(
        "ListTranscriptionEntities", id=transcription.id, **query_params
    )
list_corpus_entities
list_corpus_entities(
    name: str | None = None, parent: Element | None = None
)

List all entities in the worker’s corpus and store them in the self.entities cache.

Parameters:

Name Type Description Default
name str | None

Filter entities by part of their name (case-insensitive)

None
parent Element | None

Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.

None
Source code in arkindex_worker/worker/entity.py
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
def list_corpus_entities(
    self,
    name: str | None = None,
    parent: Element | None = None,
):
    """
    Fetch the entities of the worker's corpus and store them in the ``self.entities`` cache,
    indexed by entity ID.

    :param name: Filter entities by part of their name (case-insensitive)
    :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
    """
    filters = {}

    if name is not None:
        assert name and isinstance(name, str), "name should be of type str"
        filters["name"] = name

    if parent is not None:
        assert isinstance(parent, Element), "parent should be of type Element"
        filters["parent"] = parent.id

    # Collect every page of results before replacing the cache
    entities_by_id = {}
    for entity in self.api_client.paginate(
        "ListCorpusEntities", id=self.corpus_id, **filters
    ):
        entities_by_id[entity["id"]] = entity
    self.entities = entities_by_id

    count = len(self.entities)
    noun = pluralize("entity", count)
    logger.info(f"Loaded {count} {noun} in corpus ({self.corpus_id})")
list_corpus_entity_types
list_corpus_entity_types()

Loads available entity types in corpus.

Source code in arkindex_worker/worker/entity.py
403
404
405
406
407
408
409
410
411
412
413
414
415
416
def list_corpus_entity_types(self):
    """
    Load the entity types available in the corpus into ``self.entity_types``,
    as a mapping from entity type name to entity type ID.
    """
    # Collect every page of results before replacing the mapping
    ids_by_name = {}
    for entity_type in self.api_client.paginate(
        "ListCorpusEntityTypes", id=self.corpus_id
    ):
        ids_by_name[entity_type["name"]] = entity_type["id"]
    self.entity_types = ids_by_name

    count = len(self.entity_types)
    noun = pluralize("type", count)
    logger.info(f"Loaded {count} entity {noun} in corpus ({self.corpus_id}).")

Functions