# Source code for cltk.lemmatize.grc

"""Module for lemmatizing Ancient Greek
"""

__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License. See LICENSE."

import os
from typing import List

from cltk.lemmatize.backoff import (
    DictLemmatizer,
    IdentityLemmatizer,
    RegexpLemmatizer,
    UnigramLemmatizer,
)
from cltk.utils import CLTK_DATA_DIR
from cltk.utils.file_operations import open_pickle

# Regex substitution patterns handed to RegexpLemmatizer: each entry is a
# (pattern, replacement) pair.  This one reduces inflected -ων stems
# (e.g. gen. "-ωνος", dat. "-ωνι") back to the bare "-ων" form.
# NOTE(review): the "να" alternative matches tokens ending "...ωννα",
# not "...ωνα" — possibly intended to be "α"; verify against test data.
greek_sub_patterns = [("(ων)(ος|ι|να)$", r"ων")]

# Directory holding the pickled backoff-lemmatizer models; expects the
# ``grc_models_cltk`` corpus to have been downloaded under CLTK_DATA_DIR.
models_path = os.path.normpath(
    os.path.join(CLTK_DATA_DIR, "grc/model/grc_models_cltk/lemmata/backoff")
)


class GreekBackoffLemmatizer:
    """Suggested backoff chain; includes at least one of each type of
    major sequential backoff class from backoff.py.
    """

    def __init__(
        self: object, train: List[list] = None, seed: int = 3, verbose: bool = False
    ):
        """Load the pickled models and assemble the backoff chain.

        :param train: optional training sentences.
            NOTE(review): currently ignored — ``self.train`` is always
            overwritten by the pickled training corpus loaded below.
        :param seed: random seed used when shuffling the training data,
            so the train/test split is reproducible.
        :param verbose: if True, sub-lemmatizers report which source
            produced each lemma (and ``evaluate()`` is disabled).
        :raises FileNotFoundError: if the ``grc_models_cltk`` corpus has
            not been downloaded into ``cltk_data``.
        """
        self.models_path = models_path
        missing_models_message = "GreekBackoffLemmatizer requires the ```grc_models_cltk``` to be in cltk_data. Please load this corpus."
        try:
            self.train = open_pickle(
                os.path.join(self.models_path, "greek_lemmatized_sents.pickle")
            )
            self.GREEK_OLD_MODEL = open_pickle(
                os.path.join(self.models_path, "greek_lemmata_cltk.pickle")
            )
            self.GREEK_MODEL = open_pickle(
                os.path.join(self.models_path, "greek_model.pickle")
            )
        except FileNotFoundError as err:
            # Re-raise the same exception type with a corpus-download hint;
            # ``from err`` makes the causal chain explicit in the traceback.
            raise type(err)(missing_models_message) from err
        self.greek_sub_patterns = greek_sub_patterns
        self.seed = seed
        self.VERBOSE = verbose

        def _randomize_data(train: List[list], seed: int):
            # Deterministically shuffle (seeded), then split into
            # POS-training, lemma-training, and held-out test sentences.
            import random

            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            # Keep only (token, lemma) pairs; drops any extra tuple fields.
            lem_train_sents = [
                [(item[0], item[1]) for item in sent] for sent in train
            ]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]
            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
            self.train, self.seed
        )
        self._define_lemmatizer()

    def _define_lemmatizer(self: object):
        """Build the backoff chain, most-specific lookup first.

        Runtime lookup order is backoff5 -> backoff1: each lemmatizer
        consults its ``backoff`` only for tokens it cannot resolve,
        ending in IdentityLemmatizer so every token receives a lemma.
        """
        # Suggested backoff chain--should be tested for optimal order
        self.backoff0 = None
        self.backoff1 = IdentityLemmatizer(verbose=self.VERBOSE)
        self.backoff2 = DictLemmatizer(
            lemmas=self.GREEK_OLD_MODEL,
            source="Morpheus Lemmas",
            backoff=self.backoff1,
            verbose=self.VERBOSE,
        )
        self.backoff3 = RegexpLemmatizer(
            self.greek_sub_patterns,
            source="CLTK Greek Regex Patterns",
            backoff=self.backoff2,
            verbose=self.VERBOSE,
        )
        self.backoff4 = UnigramLemmatizer(
            self.train_sents,
            source="CLTK Sentence Training Data",
            backoff=self.backoff3,
            verbose=self.VERBOSE,
        )
        self.backoff5 = DictLemmatizer(
            lemmas=self.GREEK_MODEL,
            source="Greek Model",
            backoff=self.backoff4,
            verbose=self.VERBOSE,
        )
        self.lemmatizer = self.backoff5

    def lemmatize(self: object, tokens: List[str]):
        """Lemmatize a list of words.

        >>> lemmatizer = GreekBackoffLemmatizer()
        >>> lemmatizer.lemmatize("κατέβην χθὲς εἰς Πειραιᾶ μετὰ Γλαύκωνος τοῦ Ἀρίστωνος".split())
        [('κατέβην', 'καταβαίνω'), ('χθὲς', 'χθὲς'), ('εἰς', 'εἰς'), ('Πειραιᾶ', 'Πειραιεύς'), \
('μετὰ', 'μετά'), ('Γλαύκωνος', 'Γλαύκων'), ('τοῦ', 'ὁ'), ('Ἀρίστωνος', 'Ἀρίστων')]
        """
        lemmas = self.lemmatizer.lemmatize(tokens)
        return lemmas

    def evaluate(self: object):
        """Score the chain against the held-out test sentences.

        :raises AssertionError: if the lemmatizer was built with
            ``verbose=True`` (the verbose output format is incompatible
            with the underlying ``evaluate`` implementation).
        """
        if self.VERBOSE:
            raise AssertionError(
                "evaluate() method only works when verbose: bool = False"
            )
        return self.lemmatizer.evaluate(self.test_sents)

    def __repr__(self: object):
        # Plain string literal: the original used an f-string with no
        # placeholders (pointless f-prefix).
        return "<BackoffGreekLemmatizer v0.1>"

    def __call__(self, token: str) -> str:
        # Convenience: lemmatize a single token and return just the lemma's
        # token slot ([0][0] is the input token of the first pair).
        return self.lemmatize([token])[0][0]