Skip to content

Entity

arkindex_worker.worker.entity

ElementsWorker methods for entities.

MissingEntityType

Bases: Exception

Raised when the specified entity type was not found in the corpus and the worker cannot create it.

EntityMixin

Bases: object

check_required_entity_types

check_required_entity_types(
    entity_types, create_missing=True
)

Checks that every entity type needed is available in the corpus. Missing ones may be created automatically if needed.

Parameters:

Name Type Description Default
entity_types List[str]

Entity type names to search.

required
create_missing bool

Whether the missing types should be created. Defaults to True.

True

Raises:

Type Description
MissingEntityType

When an entity type is missing and cannot be created.

Source code in arkindex_worker/worker/entity.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def check_required_entity_types(
    self, entity_types: List[str], create_missing: bool = True
):
    """Ensure that each requested entity type is known for the corpus.

    The corpus entity types are loaded first when none are stored on the worker yet.
    Types that are still unknown are then either created through the
    ``CreateEntityType`` endpoint or reported as an error.

    :param entity_types: Entity type names to search.
    :param create_missing: Whether the missing types should be created. Defaults to True.
    :raises MissingEntityType: When an entity type is missing and ``create_missing`` is False.
    """
    # Populate the name -> ID mapping when it is still empty
    if not self.entity_types:
        self.list_corpus_entity_types()

    for entity_type in entity_types:
        already_known = entity_type in self.entity_types
        if already_known:
            continue

        if create_missing:
            # Create the type and remember the ID returned by the API
            payload = {
                "name": entity_type,
                "corpus": self.corpus_id,
            }
            response = self.request("CreateEntityType", body=payload)
            self.entity_types[entity_type] = response["id"]
            logger.info(f"Created a new entity type with name `{entity_type}`.")
        else:
            # Creation was not requested: stop at the first missing type
            raise MissingEntityType(
                f"Entity type `{entity_type}` was not in the corpus."
            )

create_entity

create_entity(
    element, name, type, metas=dict(), validated=None
)

Create an entity on the given corpus. If cache support is enabled, a CachedEntity will also be created.

Parameters:

Name Type Description Default
element Union[Element, CachedElement]

An element on which the entity will be reported with the Reporter. This does not have any effect on the entity itself.

required
name str

Name of the entity.

required
type str

Type of the entity.

required
Source code in arkindex_worker/worker/entity.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def create_entity(
    self,
    element: Union[Element, CachedElement],
    name: str,
    type: str,
    metas: Optional[dict] = None,
    validated: Optional[bool] = None,
):
    """
    Create an entity on the given corpus.
    If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.

    :param element: An element on which the entity will be reported with the [Reporter][arkindex_worker.reporting.Reporter].
       This does not have any effect on the entity itself.
    :param name: Name of the entity.
    :param type: Type of the entity.
    :param metas: Optional dict sent as the ``metas`` field of the entity.
       When omitted or None, a new empty dict is used.
    :param validated: Optional boolean sent as the ``validated`` field of the entity.
       The local cache stores False when it is None.
    :returns: The ``id`` field of the ``CreateEntity`` API response,
       or None if the worker is in read-only mode.
    """
    # Use a fresh dict per call: a `dict()` default is built once at definition
    # time and would be shared by every call that omits `metas`.
    if metas is None:
        metas = {}

    assert element and isinstance(
        element, (Element, CachedElement)
    ), "element shouldn't be null and should be an Element or CachedElement"
    assert name and isinstance(
        name, str
    ), "name shouldn't be null and should be of type str"
    assert type and isinstance(
        type, str
    ), "type shouldn't be null and should be of type str"
    if metas:
        assert isinstance(metas, dict), "metas should be of type dict"
    if validated is not None:
        assert isinstance(validated, bool), "validated should be of type bool"
    if self.is_read_only:
        logger.warning("Cannot create entity as this worker is in read-only mode")
        return

    # Retrieve entity_type ID
    if not self.entity_types:
        # Load entity_types of corpus
        self.list_corpus_entity_types()

    entity_type_id = self.entity_types.get(type)
    assert entity_type_id, f"Entity type `{type}` not found in the corpus."

    entity = self.request(
        "CreateEntity",
        body={
            "name": name,
            "type_id": entity_type_id,
            "metas": metas,
            "validated": validated,
            "corpus": self.corpus_id,
            "worker_run_id": self.worker_run_id,
        },
    )
    self.report.add_entity(element.id, entity["id"], entity_type_id, name)

    if self.use_cache:
        # Store entity in local cache; a failure here is only logged
        try:
            to_insert = [
                {
                    "id": entity["id"],
                    "type": type,
                    "name": name,
                    "validated": validated if validated is not None else False,
                    "metas": metas,
                    "worker_run_id": self.worker_run_id,
                }
            ]
            CachedEntity.insert_many(to_insert).execute()
        except IntegrityError as e:
            logger.warning(f"Couldn't save created entity in local cache: {e}")

    return entity["id"]

create_transcription_entity

create_transcription_entity(
    transcription, entity, offset, length, confidence=None
)

Create a link between an existing entity and an existing transcription. If cache support is enabled, a CachedTranscriptionEntity will also be created.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entity str

UUID of the existing entity.

required
offset int

Starting position of the entity in the transcription’s text, as a 0-based index.

required
length int

Length of the entity in the transcription’s text.

required
confidence Optional[float]

Optional confidence score between 0 and 1.

None

Returns:

Type Description
Optional[Dict[str, Union[str, int]]]

A dict as returned by the CreateTranscriptionEntity API endpoint, or None if the worker is in read-only mode.

Source code in arkindex_worker/worker/entity.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def create_transcription_entity(
    self,
    transcription: Transcription,
    entity: str,
    offset: int,
    length: int,
    confidence: Optional[float] = None,
) -> Optional[Dict[str, Union[str, int]]]:
    """
    Link an existing entity to an existing transcription.
    When cache support is enabled, a `CachedTranscriptionEntity` is created as well.

    :param transcription: Transcription to create the entity on.
    :param entity: UUID of the existing entity.
    :param offset: Starting position of the entity in the transcription's text,
       as a 0-based index.
    :param length: Length of the entity in the transcription's text.
    :param confidence: Optional confidence score between 0 and 1.
    :returns: A dict as returned by the ``CreateTranscriptionEntity`` API endpoint,
       or None if the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"
    assert entity and isinstance(
        entity, str
    ), "entity shouldn't be null and should be of type str"
    # isinstance() already rejects None, so no separate null check is needed
    assert (
        isinstance(offset, int) and offset >= 0
    ), "offset shouldn't be null and should be a positive integer"
    assert (
        isinstance(length, int) and length > 0
    ), "length shouldn't be null and should be a strictly positive integer"
    if confidence is not None:
        assert (
            isinstance(confidence, float) and 0 <= confidence <= 1
        ), "confidence should be null or a float in [0..1] range"

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entity as this worker is in read-only mode"
        )
        return

    # The confidence is only sent to the API when one was provided
    optional_fields = {} if confidence is None else {"confidence": confidence}
    payload = {
        "entity": entity,
        "length": length,
        "offset": offset,
        "worker_run_id": self.worker_run_id,
        **optional_fields,
    }

    created_link = self.request(
        "CreateTranscriptionEntity",
        id=transcription.id,
        body=payload,
    )
    self.report.add_transcription_entity(entity, transcription, created_link)

    if not self.use_cache:
        return created_link

    # Mirror the new link in the local cache; a failure is only logged
    try:
        CachedTranscriptionEntity.create(
            transcription=transcription.id,
            entity=entity,
            offset=offset,
            length=length,
            worker_run_id=self.worker_run_id,
            confidence=confidence,
        )
    except IntegrityError as e:
        logger.warning(
            f"Couldn't save created transcription entity in local cache: {e}"
        )
    return created_link

create_transcription_entities

create_transcription_entities(transcription, entities)

Create multiple entities attached to a transcription in a single API request.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entities List[Entity]

List of dicts, one per element. Each dict can have the following keys: name (str) Required. Name of the entity. type_id (str) Required. ID of the EntityType of the entity. length (int) Required. Length of the entity in the transcription’s text. offset (int) Required. Starting position of the entity in the transcription’s text, as a 0-based index. confidence (float or None) Optional confidence score, between 0.0 and 1.0.

required

Returns:

Type Description
List[Dict[str, str]]

List of dicts, with each dict having two keys, transcription_entity_id and entity_id, holding the UUID of each created object.

Source code in arkindex_worker/worker/entity.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
def create_transcription_entities(
    self,
    transcription: Transcription,
    entities: List[Entity],
) -> List[Dict[str, str]]:
    """
    Create several entities on a transcription with one API request.

    Every item of ``entities`` is validated before anything is sent.

    :param transcription: Transcription to create the entity on.
       It must expose a non-empty ``element`` attribute, used for reporting.
    :param entities: List of dicts, one per element. Each dict can have the following keys:

        name (str)
           Required. Name of the entity.

        type_id (str)
           Required. ID of the EntityType of the entity.

        length (int)
           Required. Length of the entity in the transcription's text.

        offset (int)
           Required. Starting position of the entity in the transcription's text, as a 0-based index.

        confidence (float or None)
            Optional confidence score, between 0.0 and 1.0.

    :return: List of dicts, with each dict having two keys, `transcription_entity_id` and `entity_id`,
       holding the UUID of each created object. Nothing is returned when the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be of type Transcription"

    # The linked element is needed to report the created entities
    assert (
        hasattr(transcription, "element") and transcription.element
    ), f"No element linked to {transcription}"

    assert entities and isinstance(
        entities, list
    ), "entities shouldn't be null and should be of type list"

    for index, candidate in enumerate(entities):
        assert isinstance(
            candidate, dict
        ), f"Entity at index {index} in entities: Should be of type dict"

        entity_name = candidate.get("name")
        assert entity_name and isinstance(
            entity_name, str
        ), f"Entity at index {index} in entities: name shouldn't be null and should be of type str"

        entity_type_id = candidate.get("type_id")
        assert entity_type_id and isinstance(
            entity_type_id, str
        ), f"Entity at index {index} in entities: type_id shouldn't be null and should be of type str"

        # isinstance() already rejects None for the two integer fields
        start = candidate.get("offset")
        assert (
            isinstance(start, int) and start >= 0
        ), f"Entity at index {index} in entities: offset shouldn't be null and should be a positive integer"

        size = candidate.get("length")
        assert (
            isinstance(size, int) and size > 0
        ), f"Entity at index {index} in entities: length shouldn't be null and should be a strictly positive integer"

        score = candidate.get("confidence")
        if score is not None:
            assert (
                isinstance(score, float) and 0 <= score <= 1
            ), f"Entity at index {index} in entities: confidence should be None or a float in [0..1] range"

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entities in bulk as this worker is in read-only mode"
        )
        return

    payload = {
        "worker_run_id": self.worker_run_id,
        "entities": entities,
    }
    response = self.request(
        "CreateTranscriptionEntities",
        id=transcription.id,
        body=payload,
    )

    # Pair each input dict with the created objects at the same position
    for requested, created in zip(entities, response["entities"]):
        new_entity_id = created["entity_id"]

        # Report entity creation
        self.report.add_entity(
            transcription.element.id,
            new_entity_id,
            requested.get("type_id"),
            requested.get("name"),
        )

        # Report transcription entity creation
        self.report.add_transcription_entity(
            new_entity_id,
            transcription,
            created["transcription_entity_id"],
        )

    return response["entities"]

list_transcription_entities

list_transcription_entities(
    transcription, worker_version=None
)

List existing entities on a transcription. This method does not support cache.

Parameters:

Name Type Description Default
transcription Transcription

The transcription to list entities on.

required
worker_version Optional[Union[str, bool]]

Restrict to entities created by a worker version with this UUID. Set to False to look for manually created entities.

None
Source code in arkindex_worker/worker/entity.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
def list_transcription_entities(
    self,
    transcription: Transcription,
    worker_version: Optional[Union[str, bool]] = None,
):
    """
    List existing entities on a transcription.
    This method does not support cache.

    :param transcription: The transcription to list entities on.
    :param worker_version: Restrict to entities created by a worker version with this UUID.
       Set to False to look for manually created entities. True is rejected.
    :returns: The result of paginating the ``ListTranscriptionEntities`` API endpoint.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"

    filters = {}
    if worker_version is not None:
        assert isinstance(
            worker_version, (str, bool)
        ), "worker_version should be of type str or bool"

        # Among booleans, only False is an accepted value
        assert (
            not isinstance(worker_version, bool) or worker_version is False
        ), "if of type bool, worker_version can only be set to False"
        filters["worker_version"] = worker_version

    return self.api_client.paginate(
        "ListTranscriptionEntities", id=transcription.id, **filters
    )

list_corpus_entities

list_corpus_entities(name=None, parent=None)

List all entities in the worker’s corpus. This method does not support cache.

Parameters:

Name Type Description Default
name Optional[str]

Filter entities by part of their name (case-insensitive)

None
parent Optional[Element]

Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.

None
Source code in arkindex_worker/worker/entity.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
def list_corpus_entities(
    self,
    name: Optional[str] = None,
    parent: Optional[Element] = None,
):
    """
    List all entities in the worker's corpus.
    This method does not support cache.

    :param name: Filter entities by part of their name (case-insensitive)
    :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
    :returns: The result of paginating the ``ListCorpusEntities`` API endpoint.
    """
    filters = {}

    # Only filters that were actually provided are sent to the API
    if name is not None:
        assert name and isinstance(name, str), "name should be of type str"
        filters.update(name=name)

    if parent is not None:
        assert isinstance(parent, Element), "parent should be of type Element"
        filters.update(parent=parent.id)

    return self.api_client.paginate(
        "ListCorpusEntities", id=self.corpus_id, **filters
    )

list_corpus_entity_types

list_corpus_entity_types()

Loads available entity types in corpus.

Source code in arkindex_worker/worker/entity.py
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def list_corpus_entity_types(
    self,
):
    """
    Load the entity types available in the corpus.

    Stores a mapping from entity type name to entity type ID in ``self.entity_types``,
    built from the ``ListCorpusEntityTypes`` API endpoint.
    """
    corpus_types = self.api_client.paginate(
        "ListCorpusEntityTypes", id=self.corpus_id
    )
    # The attribute is only replaced once the whole listing has been read
    self.entity_types = {item["name"]: item["id"] for item in corpus_types}
    logger.info(
        f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
    )

check_required_entity_types

check_required_entity_types(
    entity_types, create_missing=True
)

Checks that every entity type needed is available in the corpus. Missing ones may be created automatically if needed.

Parameters:

Name Type Description Default
entity_types List[str]

Entity type names to search.

required
create_missing bool

Whether the missing types should be created. Defaults to True.

True

Raises:

Type Description
MissingEntityType

When an entity type is missing and cannot be created.

Source code in arkindex_worker/worker/entity.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def check_required_entity_types(
    self, entity_types: List[str], create_missing: bool = True
):
    """Ensure that each requested entity type is known for the corpus.

    The corpus entity types are loaded first when none are stored on the worker yet.
    Types that are still unknown are then either created through the
    ``CreateEntityType`` endpoint or reported as an error.

    :param entity_types: Entity type names to search.
    :param create_missing: Whether the missing types should be created. Defaults to True.
    :raises MissingEntityType: When an entity type is missing and ``create_missing`` is False.
    """
    # Populate the name -> ID mapping when it is still empty
    if not self.entity_types:
        self.list_corpus_entity_types()

    for entity_type in entity_types:
        already_known = entity_type in self.entity_types
        if already_known:
            continue

        if create_missing:
            # Create the type and remember the ID returned by the API
            payload = {
                "name": entity_type,
                "corpus": self.corpus_id,
            }
            response = self.request("CreateEntityType", body=payload)
            self.entity_types[entity_type] = response["id"]
            logger.info(f"Created a new entity type with name `{entity_type}`.")
        else:
            # Creation was not requested: stop at the first missing type
            raise MissingEntityType(
                f"Entity type `{entity_type}` was not in the corpus."
            )

create_entity

create_entity(
    element, name, type, metas=dict(), validated=None
)

Create an entity on the given corpus. If cache support is enabled, a CachedEntity will also be created.

Parameters:

Name Type Description Default
element Union[Element, CachedElement]

An element on which the entity will be reported with the Reporter. This does not have any effect on the entity itself.

required
name str

Name of the entity.

required
type str

Type of the entity.

required
Source code in arkindex_worker/worker/entity.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def create_entity(
    self,
    element: Union[Element, CachedElement],
    name: str,
    type: str,
    metas: Optional[dict] = None,
    validated: Optional[bool] = None,
):
    """
    Create an entity on the given corpus.
    If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.

    :param element: An element on which the entity will be reported with the [Reporter][arkindex_worker.reporting.Reporter].
       This does not have any effect on the entity itself.
    :param name: Name of the entity.
    :param type: Type of the entity.
    :param metas: Optional dict sent as the ``metas`` field of the entity.
       When omitted or None, a new empty dict is used.
    :param validated: Optional boolean sent as the ``validated`` field of the entity.
       The local cache stores False when it is None.
    :returns: The ``id`` field of the ``CreateEntity`` API response,
       or None if the worker is in read-only mode.
    """
    # Use a fresh dict per call: a `dict()` default is built once at definition
    # time and would be shared by every call that omits `metas`.
    if metas is None:
        metas = {}

    assert element and isinstance(
        element, (Element, CachedElement)
    ), "element shouldn't be null and should be an Element or CachedElement"
    assert name and isinstance(
        name, str
    ), "name shouldn't be null and should be of type str"
    assert type and isinstance(
        type, str
    ), "type shouldn't be null and should be of type str"
    if metas:
        assert isinstance(metas, dict), "metas should be of type dict"
    if validated is not None:
        assert isinstance(validated, bool), "validated should be of type bool"
    if self.is_read_only:
        logger.warning("Cannot create entity as this worker is in read-only mode")
        return

    # Retrieve entity_type ID
    if not self.entity_types:
        # Load entity_types of corpus
        self.list_corpus_entity_types()

    entity_type_id = self.entity_types.get(type)
    assert entity_type_id, f"Entity type `{type}` not found in the corpus."

    entity = self.request(
        "CreateEntity",
        body={
            "name": name,
            "type_id": entity_type_id,
            "metas": metas,
            "validated": validated,
            "corpus": self.corpus_id,
            "worker_run_id": self.worker_run_id,
        },
    )
    self.report.add_entity(element.id, entity["id"], entity_type_id, name)

    if self.use_cache:
        # Store entity in local cache; a failure here is only logged
        try:
            to_insert = [
                {
                    "id": entity["id"],
                    "type": type,
                    "name": name,
                    "validated": validated if validated is not None else False,
                    "metas": metas,
                    "worker_run_id": self.worker_run_id,
                }
            ]
            CachedEntity.insert_many(to_insert).execute()
        except IntegrityError as e:
            logger.warning(f"Couldn't save created entity in local cache: {e}")

    return entity["id"]

create_transcription_entity

create_transcription_entity(
    transcription, entity, offset, length, confidence=None
)

Create a link between an existing entity and an existing transcription. If cache support is enabled, a CachedTranscriptionEntity will also be created.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entity str

UUID of the existing entity.

required
offset int

Starting position of the entity in the transcription’s text, as a 0-based index.

required
length int

Length of the entity in the transcription’s text.

required
confidence Optional[float]

Optional confidence score between 0 and 1.

None

Returns:

Type Description
Optional[Dict[str, Union[str, int]]]

A dict as returned by the CreateTranscriptionEntity API endpoint, or None if the worker is in read-only mode.

Source code in arkindex_worker/worker/entity.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def create_transcription_entity(
    self,
    transcription: Transcription,
    entity: str,
    offset: int,
    length: int,
    confidence: Optional[float] = None,
) -> Optional[Dict[str, Union[str, int]]]:
    """
    Link an existing entity to an existing transcription.
    When cache support is enabled, a `CachedTranscriptionEntity` is created as well.

    :param transcription: Transcription to create the entity on.
    :param entity: UUID of the existing entity.
    :param offset: Starting position of the entity in the transcription's text,
       as a 0-based index.
    :param length: Length of the entity in the transcription's text.
    :param confidence: Optional confidence score between 0 and 1.
    :returns: A dict as returned by the ``CreateTranscriptionEntity`` API endpoint,
       or None if the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"
    assert entity and isinstance(
        entity, str
    ), "entity shouldn't be null and should be of type str"
    # isinstance() already rejects None, so no separate null check is needed
    assert (
        isinstance(offset, int) and offset >= 0
    ), "offset shouldn't be null and should be a positive integer"
    assert (
        isinstance(length, int) and length > 0
    ), "length shouldn't be null and should be a strictly positive integer"
    if confidence is not None:
        assert (
            isinstance(confidence, float) and 0 <= confidence <= 1
        ), "confidence should be null or a float in [0..1] range"

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entity as this worker is in read-only mode"
        )
        return

    # The confidence is only sent to the API when one was provided
    optional_fields = {} if confidence is None else {"confidence": confidence}
    payload = {
        "entity": entity,
        "length": length,
        "offset": offset,
        "worker_run_id": self.worker_run_id,
        **optional_fields,
    }

    created_link = self.request(
        "CreateTranscriptionEntity",
        id=transcription.id,
        body=payload,
    )
    self.report.add_transcription_entity(entity, transcription, created_link)

    if not self.use_cache:
        return created_link

    # Mirror the new link in the local cache; a failure is only logged
    try:
        CachedTranscriptionEntity.create(
            transcription=transcription.id,
            entity=entity,
            offset=offset,
            length=length,
            worker_run_id=self.worker_run_id,
            confidence=confidence,
        )
    except IntegrityError as e:
        logger.warning(
            f"Couldn't save created transcription entity in local cache: {e}"
        )
    return created_link

create_transcription_entities

create_transcription_entities(transcription, entities)

Create multiple entities attached to a transcription in a single API request.

Parameters:

Name Type Description Default
transcription Transcription

Transcription to create the entity on.

required
entities List[Entity]

List of dicts, one per element. Each dict can have the following keys: name (str) Required. Name of the entity. type_id (str) Required. ID of the EntityType of the entity. length (int) Required. Length of the entity in the transcription’s text. offset (int) Required. Starting position of the entity in the transcription’s text, as a 0-based index. confidence (float or None) Optional confidence score, between 0.0 and 1.0.

required

Returns:

Type Description
List[Dict[str, str]]

List of dicts, with each dict having two keys, transcription_entity_id and entity_id, holding the UUID of each created object.

Source code in arkindex_worker/worker/entity.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
def create_transcription_entities(
    self,
    transcription: Transcription,
    entities: List[Entity],
) -> List[Dict[str, str]]:
    """
    Create several entities on one transcription through a single API request.

    All items of ``entities`` are validated before anything is sent, so a
    malformed item aborts the whole call with an ``AssertionError``.
    Each created entity and transcription entity is also recorded in the
    worker report.

    :param transcription: Transcription to create the entities on.
    :param entities: List of dicts, one per entity. Each dict can have the following keys:

        name (str)
           Required. Name of the entity.

        type_id (str)
           Required. ID of the EntityType of the entity.

        length (int)
           Required. Length of the entity in the transcription's text.

        offset (int)
           Required. Starting position of the entity in the transcription's text, as a 0-based index.

        confidence (float or None)
            Optional confidence score, between 0.0 and 1.0.

    :return: List of dicts, with each dict having two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object. Nothing is returned when the worker is in read-only mode.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be of type Transcription"

    # The report calls below read transcription.element.id
    assert (
        hasattr(transcription, "element") and transcription.element
    ), f"No element linked to {transcription}"

    assert entities and isinstance(
        entities, list
    ), "entities shouldn't be null and should be of type list"

    # Validate every item up front so that no partial batch reaches the API
    for position, item in enumerate(entities):
        prefix = f"Entity at index {position} in entities: "

        assert isinstance(item, dict), prefix + "Should be of type dict"

        entity_name = item.get("name")
        assert entity_name and isinstance(entity_name, str), (
            prefix + "name shouldn't be null and should be of type str"
        )

        entity_type_id = item.get("type_id")
        assert entity_type_id and isinstance(entity_type_id, str), (
            prefix + "type_id shouldn't be null and should be of type str"
        )

        # A missing key yields None, which fails the isinstance check
        start = item.get("offset")
        assert isinstance(start, int) and start >= 0, (
            prefix + "offset shouldn't be null and should be a positive integer"
        )

        span = item.get("length")
        assert isinstance(span, int) and span > 0, (
            prefix
            + "length shouldn't be null and should be a strictly positive integer"
        )

        score = item.get("confidence")
        assert score is None or (isinstance(score, float) and 0 <= score <= 1), (
            prefix + "confidence should be None or a float in [0..1] range"
        )

    if self.is_read_only:
        logger.warning(
            "Cannot create transcription entities in bulk as this worker is in read-only mode"
        )
        return

    response = self.request(
        "CreateTranscriptionEntities",
        id=transcription.id,
        body={
            "worker_run_id": self.worker_run_id,
            "entities": entities,
        },
    )
    created = response["entities"]

    # Pair each submitted entity with the IDs returned for it, in order
    for submitted, ids in zip(entities, created):
        # Report entity creation
        self.report.add_entity(
            transcription.element.id,
            ids["entity_id"],
            submitted.get("type_id"),
            submitted.get("name"),
        )

        # Report transcription entity creation
        self.report.add_transcription_entity(
            ids["entity_id"],
            transcription,
            ids["transcription_entity_id"],
        )

    return created

list_transcription_entities

list_transcription_entities(
    transcription, worker_version=None
)

List existing entities on a transcription. This method does not support cache.

Parameters:

Name Type Description Default
transcription Transcription

The transcription to list entities on.

required
worker_version Optional[Union[str, bool]]

Restrict to entities created by a worker version with this UUID. Set to False to look for manually created entities.

None
Source code in arkindex_worker/worker/entity.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
def list_transcription_entities(
    self,
    transcription: Transcription,
    worker_version: Optional[Union[str, bool]] = None,
):
    """
    List existing entities on a transcription.
    This method does not support cache.

    :param transcription: The transcription to list entities on.
    :param worker_version: Restrict to entities created by a worker version with this UUID. Set to False to look for manually created entities.
    :return: The result of paginating the ``ListTranscriptionEntities`` endpoint.
    """
    assert transcription and isinstance(
        transcription, Transcription
    ), "transcription shouldn't be null and should be a Transcription"

    filters = {}
    if worker_version is not None:
        assert isinstance(
            worker_version, (str, bool)
        ), "worker_version should be of type str or bool"

        # True is never a valid filter: the only accepted boolean is False
        assert (
            not isinstance(worker_version, bool) or worker_version is False
        ), "if of type bool, worker_version can only be set to False"

        filters["worker_version"] = worker_version

    return self.api_client.paginate(
        "ListTranscriptionEntities", id=transcription.id, **filters
    )

list_corpus_entities

list_corpus_entities(name=None, parent=None)

List all entities in the worker’s corpus. This method does not support cache.

Parameters:

Name Type Description Default
name Optional[str]

Filter entities by part of their name (case-insensitive)

None
parent Optional[Element]

Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.

None
Source code in arkindex_worker/worker/entity.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
def list_corpus_entities(
    self,
    name: Optional[str] = None,
    parent: Optional[Element] = None,
):
    """
    List all entities in the worker's corpus.
    This method does not support cache.

    :param name: Filter entities by part of their name (case-insensitive).
    :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
    :return: The result of paginating the ``ListCorpusEntities`` endpoint.
    """
    # Validate both optional filters first, name before parent
    assert name is None or (
        name and isinstance(name, str)
    ), "name should be of type str"
    assert parent is None or isinstance(
        parent, Element
    ), "parent should be of type Element"

    # Only filters that were actually provided are sent to the API
    filters = {}
    if name is not None:
        filters["name"] = name
    if parent is not None:
        filters["parent"] = parent.id

    return self.api_client.paginate(
        "ListCorpusEntities", id=self.corpus_id, **filters
    )

list_corpus_entity_types

list_corpus_entity_types()

Loads available entity types in corpus.

Source code in arkindex_worker/worker/entity.py
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def list_corpus_entity_types(self):
    """
    Loads available entity types in corpus.

    Stores a mapping of entity type name to entity type ID in
    ``self.entity_types``, replacing any previous value, and logs how many
    types were loaded.
    """
    ids_by_name = {}
    for item in self.api_client.paginate("ListCorpusEntityTypes", id=self.corpus_id):
        type_name, type_id = item["name"], item["id"]
        ids_by_name[type_name] = type_id

    # Publish the mapping only once the whole listing has been consumed
    self.entity_types = ids_by_name
    logger.info(
        f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
    )