diff --git a/cltk/corpus/persian/alphabet.py b/cltk/corpus/persian/alphabet.py
index e3f253c0f..40ff55993 100644
--- a/cltk/corpus/persian/alphabet.py
+++ b/cltk/corpus/persian/alphabet.py
@@ -1,22 +1,149 @@
 """Persian alphabet"""
+__author__ = "Iman Nazari"
 
-#Persian digits from 0 to 9
-DIGITS = ['۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹']
+# HAMZEH Family
+HAMZEH = '\u0621'
+ALEF_WITH_MAD = '\u0622'
+ALEF_WITH_HAMZEH_ABOVE = '\u0623'
+VE_WITH_HAMZA_ABOVE = '\u0624'
+ALEF_WITH_HAMZEH_BELOW = '\u0625'
+YE_WITH_HAMZA_ABOVE = '\u0626'
 
-#Diacritics
-SHORT_VOWELS = ['َ', 'ِ', 'ُ', 'ْ']
+# Letters (phonemes), in the same order that ALPHABETIC_ORDER numbers them.
+# HE is U+062D and HE2 is U+0647; GHAF is U+0642 and GAF is U+06AF.
+ALEF = '\u0627'
+BE = '\u0628'
+PE = '\u067e'
+TE = '\u062a'
+SE = '\u062b'
+JIM = '\u062c'
+CHE = '\u0686'
+HE = '\u062d'
+KHE = '\u062e'
+DAL = '\u062f'
+ZAL = '\u0630'
+RE = '\u0631'
+ZE = '\u0632'
+ZHE = '\u0698'
+SIN = '\u0633'
+SHIN = '\u0634'
+SAD = '\u0635'
+ZAD = '\u0636'
+TA = '\u0637'
+ZA = '\u0638'
+EYN = '\u0639'
+GHEYN = '\u063a'
+FE = '\u0641'
+GHAF = '\u0642'
+KAF = '\u06a9'
+GAF = '\u06af'  # must differ from GHAF, otherwise the two share one ALPHABETIC_ORDER key
+LAM = '\u0644'
+MIM = '\u0645'
+NOON = '\u0646'
+VAV = '\u0648'
+HE2 = '\u0647'
+YE = '\u06cc'
 
-#The Persian Alphabet
-LONG_VOWELS = ['ا', 'و', 'ی']
+# Punctuation marks
+COMMA = '\u060C'
+SEMICOLON = '\u061B'
+QUESTION = '\u061F'
 
-CONSONANTS = ['ء', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه']
+# Other symbols
+PERCENT = '\u066a'
+DECIMAL = '\u066b'
+THOUSANDS = '\u066c'
 
-#Special Characters
-SPECIAL = ['آ', 'ۀ']
+# Necessary for writing
+KESHIDEGI = '\u0640'
+ZERO_WIDTH_NON_JOINER = '\u200c'
+ZERO_WIDTH_NONE_JOINER = ZERO_WIDTH_NON_JOINER  # misspelled name, kept as an alias
+ZERO_WIDTH_JOINER = '\u200d'
 
-#Tanvins
-TANVIN = [ 'ٌ' , 'ٍ' , 'ً']
+# Tanvins
+# (U+064B, U+064C, U+064D)
+TANVIN_FATHE = '\u064b'
+TANVIN_ZAMME = '\u064c'
+TANVIN_KASRE = '\u064d'
 
-#Tashdid
-TASHDID = 'ّ'
+# Vowels
+# (the vowel marks U+064E, U+064F, U+0650)
+FATHE = '\u064e'
+ZAMME = '\u064f'
+KASRE = '\u0650'
+
+# Diacritics
+TASHDID = '\u0651'
+SOKUN = '\u0652'
+MAD = '\u0653'
+
+HAMZEH_FAMILY = (
+    HAMZEH,
+    ALEF_WITH_MAD,
+    ALEF_WITH_HAMZEH_ABOVE,
+    VE_WITH_HAMZA_ABOVE,
+    ALEF_WITH_HAMZEH_BELOW,
+    YE_WITH_HAMZA_ABOVE,
+)
+
+ALPHABETIC_ORDER = {
+    ALEF: 1,
+    BE: 2,
+    PE: 3,
+    TE: 4,
+    SE: 5,
+    JIM: 6,
+    CHE: 7,
+    HE: 8,
+    KHE: 9,
+    DAL: 10,
+    ZAL: 11,
+    RE: 12,
+    ZE: 13,
+    ZHE: 14,
+    SIN: 15,
+    SHIN: 16,
+    SAD: 17,
+    ZAD: 18,
+    TA: 19,
+    ZA: 20,
+    EYN: 21,
+    GHEYN: 22,
+    FE: 23,
+    GHAF: 24,
+    KAF: 25,
+    GAF: 26,
+    LAM: 27,
+    MIM: 28,
+    NOON: 29,
+    VAV: 30,
+    HE2: 31,
+    YE: 32
+}
+
+NUMERALS = {
+    0: '۰',
+    1: '۱',
+    2: '۲',
+    3: '۳',
+    4: '۴',
+    5: '۵',
+    6: '۶',
+    7: '۷',
+    8: '۸',
+    9: '۹'
+}
+
+NUMERALS_WRITINGS = {
+    0: "صفر",
+    1: "یک",
+    2: "دو",
+    3: "سه",
+    4: "چهار",
+    5: "پنج",
+    6: "شش",
+    7: "هفت",
+    8: "هشت",
+    9: "نه"
+}
 
diff --git a/cltk/corpus/persian/persian_utils.py b/cltk/corpus/persian/persian_utils.py
new file mode 100644
index 000000000..af3b6ae73
--- /dev/null
+++ b/cltk/corpus/persian/persian_utils.py
@@ -0,0 +1,130 @@
+"""Normalise Persian text with character-replacement tables.
+
+``standardize`` applies ``replacementDict``; ``standardize4Word2vec`` applies
+``replacementDict4Word2vec`` and then collapses runs of spaces.
+"""
+import re
+import cltk.corpus.persian.alphabet as alphabet
+from cltk.corpus.arabic.alphabet import *
+
+# Each rule replaces every character listed in "characters" with "to_be".
+to_reform = [
+    {
+        "characters": [
+            HAMZA,
+            HAMZA_BELOW,
+            HAMZA_ABOVE,
+            HAMZA_ISOLATED,
+
+            MINI_ALEF,
+            SMALL_ALEF,
+            SMALL_WAW,
+            SMALL_YEH,
+
+            KASHEEDA,
+            FATHATAN,
+            DAMMATAN,
+            KASRATAN,
+            FATHA,
+            DAMMA,
+            KASRA,
+            SHADDA,
+            SUKUN,
+            alphabet.THOUSANDS,
+            alphabet.DECIMAL
+        ],
+        "to_be": ""
+    },
+    {
+        "characters": [
+            ALEF_MADDA,
+            ALEF_WASLA,
+            HAMZA_BELOW_ALEF,
+            HAMZA_ABOVE_ALEF,
+        ],
+        "to_be": alphabet.ALEF
+    },
+    {
+        "characters": [
+            ALEF_MAKSURA,
+            YEH,
+        ],
+        "to_be": alphabet.YE
+    },
+    {
+        "characters": [KAF],
+        "to_be": alphabet.KAF
+    },
+    {
+        "characters": [
+            LAM_ALEF,
+            LAM_ALEF_HAMZA_ABOVE,
+            LAM_ALEF_HAMZA_BELOW,
+            LAM_ALEF_MADDA_ABOVE,
+        ],
+        "to_be": alphabet.LAM + alphabet.ALEF
+    },
+    {
+        "characters": [TEH_MARBUTA],
+        "to_be": alphabet.HE2
+    },
+]
+
+# Flatten the rules into one character -> replacement table.
+replacementDict = {}
+for rule in to_reform:
+    for character in rule["characters"]:
+        replacementDict[character] = rule["to_be"]
+
+# A shaped form gets the replacement of its original form, or the original
+# form itself when no rule above covers it.
+for originalForm, shapedForms in SHAPED_FORMS.items():
+    for form in shapedForms:
+        replacementDict[form] = replacementDict.get(originalForm, originalForm)
+
+# Copied before the digit entries are added, so each table maps digits its own way.
+replacementDict4Word2vec = replacementDict.copy()
+
+# replacementDict: digits from both Arabic numeral tables -> alphabet.NUMERALS.
+for i in range(10):
+    replacementDict[EASTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i]
+    replacementDict[WESTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i]
+
+# replacementDict4Word2vec: every digit -> its space-padded written name.
+for i in range(10):
+    replacementDict4Word2vec[EASTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]
+    replacementDict4Word2vec[WESTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]
+    replacementDict4Word2vec[alphabet.NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]
+
+# Punctuation becomes a space.  The backslash is written as "\\" and the brackets
+# unescaped, because "\[" and "\]" are invalid escapes in a normal string literal.
+for char in '!"#%\'()*+,-./:;<=>?@\\[]^_`{|}~’”“′‘؟؛«»،٪':
+    replacementDict4Word2vec[char] = " "
+
+replacementRegex4Word2vec = re.compile("(%s)" % "|".join(map(re.escape, replacementDict4Word2vec.keys())))
+replacementRegex = re.compile("(%s)" % "|".join(map(re.escape, replacementDict.keys())))
+# Runs of spaces are collapsed by a pattern of their own: stored as a table key,
+# " +" would be passed through re.escape and match only a literal space + "+".
+multipleSpacesRegex = re.compile(" +")
+
+
+def standardize(text):
+    """Return ``text`` with every character in ``replacementDict`` replaced.
+
+    :param text: string to normalise
+    :return: copy of ``text`` after one substitution pass
+    """
+    return replacementRegex.sub(lambda mo: replacementDict[mo.group(0)], text)
+
+
+def standardize4Word2vec(text):
+    """Return ``text`` rewritten with ``replacementDict4Word2vec``.
+
+    Digits and punctuation are replaced by space-padded text, so runs of
+    spaces are collapsed to a single space afterwards.
+
+    :param text: string to normalise
+    :return: copy of ``text`` after substitution and space collapsing
+    """
+    replaced = replacementRegex4Word2vec.sub(lambda mo: replacementDict4Word2vec[mo.group(0)], text)
+    return multipleSpacesRegex.sub(" ", replaced)
diff --git a/requirements.txt b/requirements.txt
index 55a5bdd79..959f084fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -41,7 +41,7 @@ mypy==0.610
 nbconvert==5.3.1
 nbformat==4.4.0
 nltk==3.3
-notebook==5.5.0
+notebook==6.4.10
 numpy==1.14.5
 pandocfilters==1.4.2
 parso==0.3.1