# Source code for cltk.alphabet.text_normalization

"""Functions for preprocessing texts. Not language-specific."""

from unicodedata import normalize


def cltk_normalize(text: str, compatibility: bool = True) -> str:
    """Return ``text`` in a composed Unicode normalization form.

    When ``compatibility`` is true (the default) the text is normalized
    to NFKC; otherwise it is normalized to NFC.

    >>> cltk_normalize("\ufb01")
    'fi'
    >>> cltk_normalize("\ufb01", compatibility=False) == "\ufb01"
    True
    """
    form = "NFKC" if compatibility else "NFC"
    return normalize(form, text)
def remove_non_ascii(input_string: str) -> str:
    """Remove non-ASCII characters.

    Every character whose code point is 128 or higher is dropped; the
    remaining characters are returned in their original order.

    Source: http://stackoverflow.com/a/1342373

    >>> remove_non_ascii("h\u00e9llo")
    'hllo'
    """
    return "".join(char for char in input_string if ord(char) < 128)
def remove_non_latin(input_string: str, also_keep=None) -> str:
    """Remove non-Latin characters.

    Only the unaccented letters ``A``-``Z`` and ``a``-``z`` and the
    space character are kept. ``also_keep`` should be a list (any
    iterable of strings is accepted) which will add chars (e.g.
    punctuation) that will not be filtered. The ``also_keep`` argument
    is never modified.

    >>> remove_non_latin("abc, 123!", also_keep=[","])
    'abc, '
    """
    # Copy into a fresh list: the previous ``also_keep += [" "]`` extended
    # the caller's list in place, leaking a space into it on every call.
    extra_chars = list(also_keep) if also_keep else []
    extra_chars.append(" ")

    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Precompute the allowed characters once for constant-time membership.
    allowed = frozenset(upper + upper.lower() + "".join(extra_chars))

    return "".join(char for char in input_string if char in allowed)