Source code for cltk.morphology.lat

""" This module provides a Decliner for Latin. Given a lemma, the Decliner will provide each grammatically valid form.

This work is based on the lexical and linguistic data built for and by the Collatinus Team ( https://github.com/biblissima/collatinus ).
This module hence inherits the license from the original project. The objective of this module is to port part of Collatinus to CLTK.

"""

__author__ = ["Thibault Clerice"]
# credits also to "Yves Ouvrard" and "Philippe Verkerk"
__license__ = "GPL v3"


import json
import os
import re
from typing import Dict, List, Tuple, Union

from cltk.core.exceptions import CLTKException
from cltk.utils import CLTK_DATA_DIR


class CollatinusDecliner:
    """Latin Decliner based on Collatinus data and approach to declining words for Latin

    .. code-block:: python

       # Ensure you have downloaded the corpus lat_models_cltk before running this
       from cltk.morphology.lat import CollatinusDecliner

       decliner = CollatinusDecliner()
       print(decliner.decline("via"))

        [
            ('via', '--s----n-'), ('via', '--s----v-'), ('viam', '--s----a-'), ('viae', '--s----g-'),
            ('viae', '--s----d-'), ('via', '--s----b-'), ('viae', '--p----n-'), ('viae', '--p----v-'),
            ('vias', '--p----a-'), ('viarum', '--p----g-'), ('viis', '--p----d-'), ('viis', '--p----b-')
        ]

    """

    # Matches the numeric index appended to homograph lemmas.
    _dism = re.compile(r"(\d+)")

    def __init__(self):
        """Load the Collatinus JSON data from the CLTK data directory.

        :raise OSError: When the data file cannot be opened
        """
        path = os.path.join(
            CLTK_DATA_DIR, "lat/model/lat_models_cltk/lemmata/collatinus/collected.json"
        )
        path = os.path.expanduser(path)
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(path, encoding="utf-8") as data_file:
            self._data = json.load(data_file)

        self._models = self._data["models"]
        self._lemmas = self._data["lemmas"]
        self._mapped = self._data["maps"]

    def __getPOS(self, key):
        """Get POS tag for key

        :param key: Key Index of Collatinus Morphos
        :return: Part-Of-Speech tag
        """
        return self._data["pos"][str(key)]

    def _remove_disambiguation(self, root):
        """Remove disambiguation index from lemma root

        :param root: Root in Collatinus
        :return: Cleaned root
        """
        return CollatinusDecliner._dism.sub("", root)

    def _getRoots(self, lemma, model):
        """Retrieve the known roots of a lemma

        :param lemma: Canonical form of the word (lemma)
        :type lemma: str
        :param model: Model data from the loaded data. Can be passed by decline();
            when falsy, the model registered for the lemma is used.
        :type model: dict
        :return: Dictionary of roots with their root identifier as key
        :rtype: dict
        :raise CLTKException: When the lemma is unknown to our data
        """
        if lemma not in self._lemmas:
            raise CLTKException("%s is unknown" % lemma)

        root_ids = {"K": "lemma", "1": "geninf", "2": "perf"}

        lemma_entry = self._lemmas[lemma]
        if "quantity" in lemma_entry and lemma_entry["quantity"]:
            canonical = lemma_entry["quantity"]
        else:
            canonical = self._remove_disambiguation(lemma_entry["lemma"])
        canonical_roots = canonical.split(",")

        # Roots explicitly registered on the lemma entry (comma separated).
        original_roots = {
            root_id: lemma_entry[root_name].split(",")
            for root_id, root_name in root_ids.items()
            if root_id != "K" and lemma_entry[root_name]
        }
        returned_roots = {}

        if not model:
            model = self._models[lemma_entry["model"]]

        # For each registered root in the model,
        for model_root_id, model_root_data in model["R"].items():
            if model_root_data[0] == "K":
                # K is equivalent to the canonical form
                roots = list(canonical_roots)
            else:
                # Otherwise we have a deletion count and characters to add
                deletion = int(model_root_data[0])
                addition = model_root_data[1] or ""
                # A deletion of 0 keeps the whole form: ``s[:-0]`` would be
                # ``s[:0]``, i.e. the empty string.
                roots = [
                    (lemma_root[:-deletion] if deletion else lemma_root) + addition
                    for lemma_root in canonical_roots
                ]

            roots.extend(original_roots.get(model_root_id, []))
            # De-duplicate while keeping first-seen order so that the output
            # does not depend on string hash randomization.
            returned_roots[model_root_id] = list(dict.fromkeys(roots))

        original_roots.update(returned_roots)
        return original_roots

    def decline(
        self, lemma: str, flatten: bool = False, collatinus_dict: bool = False
    ) -> Union[List[Tuple[str, str]], List[str], Dict[int, List[str]]]:
        """ Decline a lemma

        .. warning:: POS are incomplete as we do not detect the type outside of verbs, participle and adjective.

        :raise CLTKException: When the lemma is unknown to our data

        :param lemma: Lemma (Canonical form) to decline
        :type lemma: str
        :param flatten: If set to True, returns a list of forms without natural language information about them
        :type flatten: bool
        :param collatinus_dict: If set to True (and flatten is False), returns a dictionary of grammatically valid \
        forms, including variants, with keys corresponding to morpho information.
        :type collatinus_dict: bool
        :return: By default, list of tuples where first value is the form and second the pos, \
        ie [("sum", "v1ppip---")]; a list of forms when flatten is True; a dict when collatinus_dict is True
        :rtype: list or dict

        """
        if lemma not in self._lemmas:
            if lemma in self._mapped and self._mapped[lemma] in self._lemmas:
                # Resolve the alternative spelling once, then use the resolved
                # lemma for every subsequent lookup.
                lemma = self._mapped[lemma]
            else:
                raise CLTKException("%s is unknown" % lemma)
        lemma_entry = self._lemmas[lemma]
        model = self._models[lemma_entry["model"]]

        # Get the roots
        roots = self._getRoots(lemma, model=model)
        # Get the known forms in numeric key order
        keys = sorted(int(key) for key in model["des"])

        # Generate the return dict: every root combined with every ending
        forms = {
            key: [
                root + ending
                for root_id, endings in model["des"][str(key)]
                for root in roots[root_id]
                for ending in endings
            ]
            for key in keys
        }

        # sufd means we have the original forms of the parent but we add a suffix
        if model["sufd"]:
            for key, iter_forms in forms.items():
                forms[key] = [
                    form + sufd for sufd in model["sufd"] for form in iter_forms
                ]

        # suf adds suffixed variants next to the existing forms
        if model["suf"]:
            # Snapshot so that each suffix applies to the unsuffixed forms only
            cached_forms = {key: list(value) for key, value in forms.items()}

            # The format is [suffix characters, [modified forms]]
            for suffix, modified_forms in model["suf"]:
                for modified_form in modified_forms:
                    forms[modified_form] += [
                        form + suffix for form in cached_forms[modified_form]
                    ]

        # If some forms do not exist, we delete them
        for abs_form in model["abs"]:
            forms.pop(abs_form, None)

        if flatten:
            return [form for case_forms in forms.values() for form in case_forms]
        if collatinus_dict:
            return forms
        return [
            (form, self.__getPOS(key))
            for key, case_forms in forms.items()
            for form in case_forms
        ]

    @property
    def lemmas(self) -> Dict[str, Dict[str, str]]:
        """Lemma entries loaded from the Collatinus data, keyed by lemma."""
        return self._lemmas
Source code for cltk.morphology.lat

The Classical Language Toolkit

Navigation

Related Topics