Skip to content
Open
152 changes: 139 additions & 13 deletions cltk/corpus/persian/alphabet.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,148 @@
"""Persian alphabet"""

__author__ = "Iman Nazari"

# Persian digits 0-9 in ascending order
# (EXTENDED ARABIC-INDIC DIGIT ZERO..NINE, U+06F0..U+06F9).
DIGITS = [chr(0x06F0 + value) for value in range(10)]
# HAMZEH family: the bare hamza plus the letters carrying a hamza or madda.
# Written with Unicode character names; the code point is noted beside each.
HAMZEH = "\N{ARABIC LETTER HAMZA}"  # U+0621
ALEF_WITH_MAD = "\N{ARABIC LETTER ALEF WITH MADDA ABOVE}"  # U+0622
ALEF_WITH_HAMZEH_ABOVE = "\N{ARABIC LETTER ALEF WITH HAMZA ABOVE}"  # U+0623
VE_WITH_HAMZA_ABOVE = "\N{ARABIC LETTER WAW WITH HAMZA ABOVE}"  # U+0624
ALEF_WITH_HAMZEH_BELOW = "\N{ARABIC LETTER ALEF WITH HAMZA BELOW}"  # U+0625
YE_WITH_HAMZA_ABOVE = "\N{ARABIC LETTER YEH WITH HAMZA ABOVE}"  # U+0626

#Diacritics
SHORT_VOWELS = ['َ', 'ِ', 'ُ', 'ْ']
# واج‌ها
# Phonemes
# The 32 letters of the Persian alphabet, in alphabetical order.
# Each constant is written with its Unicode character name; the code point
# is noted beside it.
ALEF = "\N{ARABIC LETTER ALEF}"  # U+0627
BE = "\N{ARABIC LETTER BEH}"  # U+0628
PE = "\N{ARABIC LETTER PEH}"  # U+067E
TE = "\N{ARABIC LETTER TEH}"  # U+062A
SE = "\N{ARABIC LETTER THEH}"  # U+062B
JIM = "\N{ARABIC LETTER JEEM}"  # U+062C
CHE = "\N{ARABIC LETTER TCHEH}"  # U+0686
HE = "\N{ARABIC LETTER HAH}"  # U+062D
KHE = "\N{ARABIC LETTER KHAH}"  # U+062E
DAL = "\N{ARABIC LETTER DAL}"  # U+062F
ZAL = "\N{ARABIC LETTER THAL}"  # U+0630
RE = "\N{ARABIC LETTER REH}"  # U+0631
ZE = "\N{ARABIC LETTER ZAIN}"  # U+0632
ZHE = "\N{ARABIC LETTER JEH}"  # U+0698
SIN = "\N{ARABIC LETTER SEEN}"  # U+0633
SHIN = "\N{ARABIC LETTER SHEEN}"  # U+0634
SAD = "\N{ARABIC LETTER SAD}"  # U+0635
ZAD = "\N{ARABIC LETTER DAD}"  # U+0636
TA = "\N{ARABIC LETTER TAH}"  # U+0637
ZA = "\N{ARABIC LETTER ZAH}"  # U+0638
EYN = "\N{ARABIC LETTER AIN}"  # U+0639
GHEYN = "\N{ARABIC LETTER GHAIN}"  # U+063A
FE = "\N{ARABIC LETTER FEH}"  # U+0641
GHAF = "\N{ARABIC LETTER QAF}"  # U+0642
KAF = "\N{ARABIC LETTER KEHEH}"  # U+06A9 (Persian kaf, not Arabic U+0643)
# GAF is U+06AF. It must not share U+0642 with GHAF, otherwise the two
# letters are indistinguishable and collide when used as dictionary keys.
GAF = "\N{ARABIC LETTER GAF}"  # U+06AF
LAM = "\N{ARABIC LETTER LAM}"  # U+0644
MIM = "\N{ARABIC LETTER MEEM}"  # U+0645
NOON = "\N{ARABIC LETTER NOON}"  # U+0646
VAV = "\N{ARABIC LETTER WAW}"  # U+0648
HE2 = "\N{ARABIC LETTER HEH}"  # U+0647
YE = "\N{ARABIC LETTER FARSI YEH}"  # U+06CC

#The Persian Alphabet
LONG_VOWELS = ['ا', 'و', 'ی']
# Punctuation marks (Arabic-script forms of comma, semicolon, question mark)
COMMA = "\N{ARABIC COMMA}"  # U+060C
SEMICOLON = "\N{ARABIC SEMICOLON}"  # U+061B
QUESTION = "\N{ARABIC QUESTION MARK}"  # U+061F

CONSONANTS = ['ء', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه']
# Other symbols: percent sign and the numeric separators
PERCENT = "\N{ARABIC PERCENT SIGN}"  # U+066A
DECIMAL = "\N{ARABIC DECIMAL SEPARATOR}"  # U+066B
THOUSANDS = "\N{ARABIC THOUSANDS SEPARATOR}"  # U+066C

#Special Characters
SPECIAL = ['آ', 'ۀ']
# Characters necessary for writing: the elongation stroke and the two
# zero-width characters that control letter joining
KESHIDEGI = "\N{ARABIC TATWEEL}"  # U+0640
ZERO_WIDTH_NONE_JOINER = "\N{ZERO WIDTH NON-JOINER}"  # U+200C
ZERO_WIDTH_JOINER = "\N{ZERO WIDTH JOINER}"  # U+200D

#Tanvins
TANVIN = [ 'ٌ' , 'ٍ' , 'ً']
# تنوین‌ها
# Tanvins (nunation marks), one per short vowel
TANVIN_FATHE = "\N{ARABIC FATHATAN}"  # U+064B
TANVIN_ZAMME = "\N{ARABIC DAMMATAN}"  # U+064C
TANVIN_KASRE = "\N{ARABIC KASRATAN}"  # U+064D

#Tashdid
TASHDID = 'ّ'
# واکه‌ها یا مصوت‌ها یا حروف صدادار
# Vowels (short-vowel marks)
FATHE = "\N{ARABIC FATHA}"  # U+064E
ZAMME = "\N{ARABIC DAMMA}"  # U+064F
KASRE = "\N{ARABIC KASRA}"  # U+0650

# Diacritics
TASHDID = "\N{ARABIC SHADDA}"  # U+0651
SOKUN = "\N{ARABIC SUKUN}"  # U+0652
MAD = "\N{ARABIC MADDAH ABOVE}"  # U+0653

# All hamzeh-family characters gathered in one tuple, listed in the same
# order in which the individual constants are defined.
HAMZEH_FAMILY = (
    HAMZEH, ALEF_WITH_MAD,
    ALEF_WITH_HAMZEH_ABOVE, VE_WITH_HAMZA_ABOVE,
    ALEF_WITH_HAMZEH_BELOW, YE_WITH_HAMZA_ABOVE,
)

# Maps each letter to its 1-based position in the alphabet. Positions are
# assigned by enumerating the letters in alphabetical order; as with any dict,
# if two constants hold the same character the later position is kept.
ALPHABETIC_ORDER = {
    letter: position
    for position, letter in enumerate(
        (
            ALEF, BE, PE, TE, SE, JIM, CHE, HE,
            KHE, DAL, ZAL, RE, ZE, ZHE, SIN, SHIN,
            SAD, ZAD, TA, ZA, EYN, GHEYN, FE, GHAF,
            KAF, GAF, LAM, MIM, NOON, VAV, HE2, YE,
        ),
        start=1,
    )
}

# Maps each integer 0-9 to its Persian digit character
# (EXTENDED ARABIC-INDIC DIGIT ZERO..NINE, U+06F0..U+06F9).
NUMERALS = {value: chr(0x06F0 + value) for value in range(10)}

# Maps each integer 0-9 to the Persian word for that number. The words are
# listed in numeric order and keyed by their position.
NUMERALS_WRITINGS = dict(enumerate((
    "صفر",  # 0
    "یک",  # 1
    "دو",  # 2
    "سه",  # 3
    "چهار",  # 4
    "پنج",  # 5
    "شش",  # 6
    "هفت",  # 7
    "هشت",  # 8
    "نه",  # 9
)))
99 changes: 99 additions & 0 deletions cltk/corpus/persian/persian_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import re
import cltk.corpus.persian.alphabet as alphabet
from cltk.corpus.arabic.alphabet import *

def _reform_rule(to_be, *characters):
    """Build one rule: each of *characters* is to be rewritten as *to_be*."""
    return {"characters": list(characters), "to_be": to_be}


# Ordered normalisation rules. Each rule lists source characters and the text
# they are replaced with. Names without the ``alphabet.`` prefix are not
# defined in this module; presumably they come from the wildcard import of
# the Arabic alphabet module.
to_reform = [
    # Removed outright (replaced by the empty string): hamza marks, small
    # letters, the elongation stroke, vowel/diacritic marks and the Persian
    # thousands/decimal separators.
    _reform_rule(
        "",
        HAMZA,
        HAMZA_BELOW,
        HAMZA_ABOVE,
        HAMZA_ISOLATED,
        MINI_ALEF,
        SMALL_ALEF,
        SMALL_WAW,
        SMALL_YEH,
        KASHEEDA,
        FATHATAN,
        DAMMATAN,
        KASRATAN,
        FATHA,
        DAMMA,
        KASRA,
        SHADDA,
        SUKUN,
        alphabet.THOUSANDS,
        alphabet.DECIMAL,
    ),
    # Alef variants become the plain Persian alef.
    _reform_rule(
        alphabet.ALEF,
        ALEF_MADDA,
        ALEF_WASLA,
        HAMZA_BELOW_ALEF,
        HAMZA_ABOVE_ALEF,
    ),
    # Yeh variants become the Persian ye.
    _reform_rule(alphabet.YE, ALEF_MAKSURA, YEH),
    # The imported KAF becomes the Persian kaf.
    _reform_rule(alphabet.KAF, KAF),
    # Lam-alef ligatures are expanded to the two separate letters.
    _reform_rule(
        alphabet.LAM + alphabet.ALEF,
        LAM_ALEF,
        LAM_ALEF_HAMZA_ABOVE,
        LAM_ALEF_HAMZA_BELOW,
        LAM_ALEF_MADDA_ABOVE,
    ),
    # Teh marbuta becomes the Persian he.
    _reform_rule(alphabet.HE2, TEH_MARBUTA),
]

# Lookup table used by standardize(): source character -> replacement text.
# Filled in rule order, so a character listed in two rules keeps the later one.
replacementDict = {}
for rule in to_reform:
    for character in rule["characters"]:
        replacementDict[character] = rule["to_be"]

# Every shaped form is treated like the key it is listed under: it receives
# that key's replacement when one exists, otherwise the key itself.
# (Presumably SHAPED_FORMS maps a base letter to its presentation forms.)
for originalForm, shapedForms in SHAPED_FORMS.items():
    for form in shapedForms:
        replacementDict[form] = replacementDict.get(originalForm, originalForm)

# Copied *before* the digit entries are added, so the word2vec table can map
# digits to number words while the plain table maps them to Persian digits.
replacementDict4Word2vec = replacementDict.copy()

# Plain table: both foreign digit sets are rewritten as Persian digits.
for i in range(10):
    replacementDict[EASTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i]
    replacementDict[WESTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i]

# word2vec table: every digit (including Persian ones) is rewritten as its
# number word padded with a space on each side.
for i in range(10):
    replacementDict4Word2vec[EASTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]
    replacementDict4Word2vec[WESTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]
    replacementDict4Word2vec[alphabet.NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i]

# Punctuation is replaced by a single space. Backslashes are spelled "\\" so
# the literal holds no invalid escape sequences; the resulting characters
# (including the backslash itself) are unchanged.
for char in '[!"#%\'()*+,-./:;<=>?@\\[\\]^_`{|}~’”“′‘\\\\]؟؛«»،٪':
    replacementDict4Word2vec[char] = " "

# NOTE(review): every key is passed through re.escape below, so this entry
# matches a literal space followed by "+", not a run of spaces.
replacementDict4Word2vec[" +"] = " "

# One alternation per table; keys are escaped so they match literally.
replacementRegex4Word2vec = re.compile("(%s)" % "|".join(map(re.escape, replacementDict4Word2vec.keys())))
replacementRegex = re.compile("(%s)" % "|".join(map(re.escape, replacementDict.keys())))

def standardize(text):
    """Return *text* with every character known to ``replacementDict`` replaced.

    Each match of ``replacementRegex`` is looked up in ``replacementDict`` and
    substituted by the mapped text; everything else is left untouched.
    """
    def _replacement(match):
        # group(0) is the full matched text, i.e. the dictionary key.
        return replacementDict[match.group(0)]

    return replacementRegex.sub(_replacement, text)

def standardize4Word2vec(text):
    """Return *text* normalised for word2vec-style tokenisation.

    Each match of ``replacementRegex4Word2vec`` is substituted by its entry in
    ``replacementDict4Word2vec``. Runs of spaces are then collapsed to a
    single space.

    The collapse is done as an explicit second pass because the table keys
    are regex-escaped (so a " +" key only matches a literal space followed by
    "+") and because a single substitution pass never re-scans the padding
    spaces that the replacements themselves insert.
    """
    substituted = replacementRegex4Word2vec.sub(
        lambda mo: replacementDict4Word2vec[mo.group(0)], text)
    return re.sub(" +", " ", substituted)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ mypy==0.610
nbconvert==5.3.1
nbformat==4.4.0
nltk==3.3
notebook==5.5.0
notebook==6.4.10
numpy==1.14.5
pandocfilters==1.4.2
parso==0.3.1
Expand Down