From 31b74dd3c18a51e068bf57c21f94a2aeeb5fc1e4 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Wed, 6 Jun 2018 22:11:27 +0430 Subject: [PATCH 01/14] Update Persian alphabet Just starting... --- cltk/corpus/persian/alphabet.py | 62 ++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/cltk/corpus/persian/alphabet.py b/cltk/corpus/persian/alphabet.py index e3f253c0f..c1fd956c2 100644 --- a/cltk/corpus/persian/alphabet.py +++ b/cltk/corpus/persian/alphabet.py @@ -1,22 +1,44 @@ """Persian alphabet""" - -#Persian digits from 0 to 9 -DIGITS = ['۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹'] - -#Diacritics -SHORT_VOWELS = ['َ', 'ِ', 'ُ', 'ْ'] - -#The Persian Alphabet -LONG_VOWELS = ['ا', 'و', 'ی'] - -CONSONANTS = ['ء', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه'] - -#Special Characters -SPECIAL = ['آ', 'ۀ'] - -#Tanvins -TANVIN = [ 'ٌ' , 'ٍ' , 'ً'] - -#Tashdid -TASHDID = 'ّ' +__author__ = "Iman Nazari" + +# HAMZEH Family +ALEF_WITH_HAMZEH_ABOVE = '\u0623' +ALEF_WITH_HAMZEH_BELOW = '\u0625' + +# Extensions +ALEF_WITH_MAD = '\u0622' + +# Alphabet +ALEF = '\u0627' +BE = '\u0628' +PE = '\u067e' +TE = '\u062a' +SE = '\u062b' +JIM = '\u062c' +CHE = '\u0686' +HE = '\u062d' +KHE = '\u062e' +DAL = '\u062f' +ZAL = '\u0630' +RE = '\u0631' +ZE = '\u0632' +ZHE = '\u0698' +SIN = '\u0633' +SHIN = '\u0634' +SAD = '\u0635' +ZAD = '\u0636' +TA = '\u0637' +ZA = '\u0638' +EYN = '\u0639' +GHEYN = '\u063a' +FE = '\u0641' +GHAF = '\u0642' +KAF = '\u06a9' +GAF = '\u0642' +LAM = '\u0644' +MIM = '\u0645' +NOON = '\u0646' +VAV = '\u0648' +HE2 = '\u0647' +YE = '\u06cc' From abe40c7f14c69d6fbef7e01bb7cd49e7d7fffaed Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Thu, 7 Jun 2018 12:43:27 +0430 Subject: [PATCH 02/14] Persian alphabet The first version of the Persian alphabet --- cltk/corpus/persian/alphabet.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/cltk/corpus/persian/alphabet.py b/cltk/corpus/persian/alphabet.py index c1fd956c2..0252f485b 100644 --- a/cltk/corpus/persian/alphabet.py +++ b/cltk/corpus/persian/alphabet.py @@ -3,11 +3,12 @@ __author__ = "Iman Nazari" # HAMZEH Family +HAMZEH = '\u0621' +ALEF_WITH_MAD = '\u0622' ALEF_WITH_HAMZEH_ABOVE = '\u0623' +VE_WITH_HAMZA_ABOVE = '\u0624' ALEF_WITH_HAMZEH_BELOW = '\u0625' - -# Extensions -ALEF_WITH_MAD = '\u0622' +YE_WITH_HAMZA_ABOVE = '\u0626' # Alphabet ALEF = '\u0627' @@ -42,3 +43,29 @@ VAV = '\u0648' HE2 = '\u0647' YE = '\u06cc' + +# Punctuation marks +COMMA = '\u060C' +SEMICOLON = '\u061B' +QUESTION = '\u061F' + +# Other symbols +PERCENT = '\u066a' +DECIMAL = '\u066b' +THOUSANDS = '\u066c' + +# Necessary for writing +KESHIDEGI = '\u0640' +ZERO_WIDTH_NONE_JOINER = '\u200c' +ZERO_WIDTH_JOINER = '\u200d' + +# Diacritics +TANVIN_FATHE = '\u064b' +TANVIN_ZAMME = '\u064c' +TANVIN_KASRE = '\u064d' +FATHE = '\u064e' +ZAMME = '\u064f' +KASRE = '\u0650' +TASHDID = '\u0651' +SOKUN = '\u0652' +MAD = '\u0653' From 442425fdf9f6ed47e6fa7d19de004ebc9f2fa099 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Thu, 7 Jun 2018 16:11:59 +0430 Subject: [PATCH 03/14] Update alphabet.py --- cltk/corpus/persian/alphabet.py | 58 +++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/cltk/corpus/persian/alphabet.py b/cltk/corpus/persian/alphabet.py index 0252f485b..040245802 100644 --- a/cltk/corpus/persian/alphabet.py +++ b/cltk/corpus/persian/alphabet.py @@ -1,5 +1,6 @@ """Persian alphabet""" + __author__ = "Iman Nazari" # HAMZEH Family @@ -10,7 +11,8 @@ ALEF_WITH_HAMZEH_BELOW = '\u0625' YE_WITH_HAMZA_ABOVE = '\u0626' -# Alphabet +# واج‌ها +# Phonemes ALEF = '\u0627' BE = '\u0628' PE = '\u067e' @@ -59,13 +61,65 @@ ZERO_WIDTH_NONE_JOINER = '\u200c' ZERO_WIDTH_JOINER = '\u200d' -# Diacritics +# تنوین‌ها +# Tanvins TANVIN_FATHE = '\u064b' TANVIN_ZAMME = '\u064c' TANVIN_KASRE = '\u064d' + +# واکه‌ها یا مصوت‌ها یا حروف صدادار +# Vowels FATHE = '\u064e' ZAMME = '\u064f' KASRE = '\u0650' + +# Diacritics TASHDID = '\u0651' SOKUN = '\u0652' MAD = '\u0653' + +HAMZEH_FAMILY = ( + HAMZEH, + ALEF_WITH_MAD, + ALEF_WITH_HAMZEH_ABOVE, + VE_WITH_HAMZA_ABOVE, + ALEF_WITH_HAMZEH_BELOW, + YE_WITH_HAMZA_ABOVE, +) + +ALPHABETIC_ORDER = { + ALEF: 1, + BE: 2, + PE: 3, + TE: 4, + SE: 5, + JIM: 6, + CHE: 7, + HE: 8, + KHE: 9, + DAL: 10, + ZAL: 11, + RE: 12, + ZE: 13, + ZHE: 14, + SIN: 15, + SHIN: 16, + SAD: 17, + ZAD: 18, + TA: 19, + ZA: 20, + EYN: 21, + GHEYN: 22, + FE: 23, + GHAF: 24, + KAF: 25, + GAF: 26, + LAM: 27, + MIM: 28, + NOON: 29, + VAV: 30, + HE2: 31, + YE: 32 +} + +PERSIAN_NUMERALS = ["۰","۱","۲","۳","۴","۵","۶","۷","۸","۹"] From 7a2710e5fd1b16f4932882a053ead7008bd28496 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Mon, 23 Jul 2018 08:23:19 +0430 Subject: [PATCH 04/14] Add a dictionary of Arabic alphabet It is needed to standardize text --- cltk/corpus/persian/ArabicAlphabet.py | 462 ++++++++++++++++++++++++++ 1 file changed, 462 insertions(+) create mode 100644 cltk/corpus/persian/ArabicAlphabet.py diff --git a/cltk/corpus/persian/ArabicAlphabet.py b/cltk/corpus/persian/ArabicAlphabet.py new file mode 100644 index 000000000..00aaddf24 --- /dev/null +++ b/cltk/corpus/persian/ArabicAlphabet.py @@ -0,0 +1,462 @@ +""" + Arabic alphabet + source 1 : pyarabic 'https://github.com/linuxscout/pyarabic' + source 2 : arabicstemmer 'https://github.com/assem-ch/arabicstemmer/blob/master/algorithm/stemmer.sbl' +""" +__author__ = 'Lakhdar Benzahia ' + +# Arabic letters + +# Hamza letter +HAMZA = '\u0621' +HAMZA_ABOVE_ALEF = '\u0623' +HAMZA_BELOW_ALEF = '\u0625' +ALEF_MADDA = '\u0622' +HAMZA_ABOVE_WAW = '\u0624' +HAMZA_ABOVE_YEH = '\u0626' + +ALEF = '\u0627' +ALEF_MAKSURA = '\u0649' +BEH = '\u0628' +TEH_MARBUTA = '\u0629' +TEH = '\u062a' +THEH = '\u062b' +JEEM = '\u062c' +HAH = '\u062d' +KHAH = '\u062e' +DAL = '\u062f' +THEL = '\u0630' +REH = '\u0631' +ZAIN = '\u0632' +SEEN = '\u0633' +SHEEN = '\u0634' +SAD = '\u0635' +DAD = '\u0636' +TAH = '\u0637' +ZAH = '\u0638' +AIN = '\u0639' +GHAIN = '\u063a' +FEH = '\u0641' +QAF = '\u0642' +KAF = '\u0643' +LAM = '\u0644' +MEEM = '\u0645' +NOON = '\u0646' +HEH = '\u0647' +WAW = '\u0648' +YEH = '\u064a' + +MINI_ALEF = '\u0670' +ALEF_WASLA = '\u0671' +MADDA_ABOVE = '\u0653' +HAMZA_ABOVE = '\u0654' +HAMZA_BELOW = '\u0655' + +# Small Letters +SMALL_ALEF = "\u0670" +SMALL_WAW = "\u06E5" +SMALL_YEH = "\u06E6" + +# Ligatures Lam-Alef +LAM_ALEF = '\ufefb' +LAM_ALEF_HAMZA_ABOVE = '\ufef7' +LAM_ALEF_HAMZA_BELOW = '\ufef9' +LAM_ALEF_MADDA_ABOVE = '\ufef5' + + +SIMPLE_LAM_ALEF = '\u0644\u0627' +SIMPLE_LAM_ALEF_HAMZA_ABOVE = '\u0644\u0623' +SIMPLE_LAM_ALEF_HAMZA_BELOW = '\u0644\u0625' +SIMPLE_LAM_ALEF_MADDA_ABOVE = '\u0644\u0622' + +# shaped forms +LAM_ALEF_ISOLATED = '\ufefb' +LAM_ALEF_FINAL = '\ufefc' + +LAM_ALEF_HAMZA_ABOVE_ISOLATED = '\ufef7' +LAM_ALEF_HAMZA_ABOVE_FINAL = '\ufef8' + +LAM_ALEF_HAMZA_BELOW_ISOLATED = '\ufef9' +LAM_ALEF_HAMZA_BELOW_FINAL = '\ufefa' + +LAM_ALEF_MADDA_ABOVE_ISOLATED = '\ufef5' +LAM_ALEF_MADDA_ABOVE_FINAL = '\ufef6' + +HAMZA_ISOLATED = '\ufe80' + +ALEF_HAMZA_ABOVE_ISOLATED = '\ufe83' +ALEF_HAMZA_ABOVE_FINAL = '\ufe84' + +ALEF_HAMZA_BELOW_ISOLATED = '\ufe87' +ALEF_HAMZA_BELOW_FINAL = '\ufe88' + +YEH_HAMZA_INITIAL = '\ufe8b' +YEH_HAMZA_MEDIAL = '\ufe8c' +YEH_HAMZA_ISOLATED = '\ufe89' +YEH_HAMZA_FINAL = '\ufe8a' + +ALEF_MADDA_ISOLATED = '\ufe81' +ALEF_MADDA_FINAL = '\ufe82' + +WAW_HAMZA_ISOLATED = '\ufe85' +WAW_HAMZA_FINAL = '\ufe86' + +ALEF_ISOLATED = '\ufe8d' +ALEF_FINAL = '\ufe8e' + +BEH_ISOLATED = '\ufe8f' +BEH_FINAL = '\ufe90' +BEH_INITIAL = '\ufe91' +BEH_MEDIAL = '\ufe92' + +TEH_MARBUTA_ISOLATED = '\ufe93' +TEH_MARBUTA_FINAL = '\ufe94' + +TEH_INITIAL = '\ufe97' +TEH_MEDIAL = '\ufe98' +TEH_ISOLATED = '\ufe95' +TEH_FINAL = '\ufe96' + +THEH_INITIAL = '\ufe9b' +THEH_MEDIAL = '\ufe9c' +THEH_FINAL = '\ufe9a' +THEH_ISOLATED = '\ufe99' + +JEEM_INITIAL = '\ufe9f' +JEEM_MEDIAL = '\ufea0' +JEEM_ISOLATED = '\ufe9d' +JEEM_FINAL = '\ufe9e' + +HAH_INITIAL = '\ufea3' +HAH_MEDIAL = '\ufea4' +HAH_ISOLATED = '\ufea1' +HAH_FINAL = '\ufea2' + +KHAH_INITIAL = '\ufea7' +KHAH_MEDIAL = '\ufea8' +KHAH_ISOLATED = '\ufea5' +KHAH_FINAL = '\ufea6' + +DAL_ISOLATED = '\ufea9' +DAL_FINAL = '\ufeaa' + +THEL_ISOLATED = '\ufeab' +THEL_FINAL = '\ufeac' + +REH_ISOLATED = '\ufead' +REH_FINAL = '\ufeae' + +ZAIN_ISOLATED = '\ufeaf' +ZAIN_FINAL = '\ufeb0' + +SEEN_INITIAL = '\ufeb3' +SEEN_MEDIAL = '\ufeb4' +SEEN_ISOLATED = '\ufeb1' +SEEN_FINAL = '\ufeb2' + +SHEEN_INITIAL = '\ufeb7' +SHEEN_MEDIAL = '\ufeb8' +SHEEN_ISOLATED = '\ufeb5' +SHEEN_FINAL = '\ufeb6' + +SAD_INITIAL = '\ufebb' +SAD_MEDIAL = '\ufebc' +SAD_ISOLATED = '\ufeb9' +SAD_FINAL = '\ufeba' + +DAD_INITIAL = '\ufebf' +DAD_MEDIAL = '\ufec0' +DAD_ISOLATED = '\ufebd' +DAD_FINAL = '\ufebe' + +TAH_INITIAL = '\ufec3' +TAH_MEDIAL = '\ufec4' +TAH_ISOLATED = '\ufec1' +TAH_FINAL = '\ufec2' + +ZAH_INITIAL = '\ufec7' +ZAH_MEDIAL = '\ufec8' +ZAH_ISOLATED = '\ufec5' +ZAH_FINAL = '\ufec6' + +AIN_INITIAL = '\ufecb' +AIN_MEDIAL = '\ufecc' +AIN_ISOLATED = '\ufec9' +AIN_FINAL = '\ufeca' + +GHAIN_INITIAL = '\ufecf' +GHAIN_MEDIAL = '\ufed0' +GHAIN_ISOLATED = '\ufecd' +GHAIN_FINAL = '\ufece' + +FEH_INITIAL = '\ufed3' +FEH_MEDIAL = '\ufed4' +FEH_ISOLATED = '\ufed1' +FEH_FINAL = '\ufed2' + +QAF_INITIAL = '\ufed7' +QAF_MEDIAL = '\ufed8' +QAF_ISOLATED = '\ufed5' +QAF_FINAL = '\ufed6' + +KAF_INITIAL = '\ufedb' +KAF_MEDIAL = '\ufedC' +KAF_ISOLATED = '\ufed9' +KAF_FINAL = '\ufeda' + +LAM_INITIAL = '\ufedf' +LAM_MEDIAL = '\ufed0' +LAM_ISOLATED = '\ufedd' +LAM_FINAL = '\ufede' + +MEEM_INITIAL = '\ufee3' +MEEM_MEDIAL = '\ufee4' +MEEM_ISOLATED = '\ufee1' +MEEM_FINAL = '\ufee2' + +NOON_INITIAL = '\ufee7' +NOON_MEDIAL = '\ufee8' +NOON_ISOLATED = '\ufee5' +NOON_FINAL = '\ufee6' + +HEH_INITIAL = '\ufeeb' +HEH_MEDIAL = '\ufeec' +HEH_ISOLATED = '\ufee9' +HEH_FINAL = '\ufeea' + +WAW_ISOLATED = '\ufeed' +WAW_FINAL = '\ufeee' + +ALEF_MAKSURA_ISOLATED = '\ufeef' +ALEF_MAKSURA_FINAL = '\ufef0' + +YEH_INITIAL = '\ufef3' +YEH_MEDIAL = '\ufef4' +YEH_ISOLATED = '\ufef1' +YEH_FINAL = '\ufef2' + +# Punctuation marks +COMMA = '\u060C' +SEMICOLON = '\u061B' +QUESTION = '\u061F' + +# Kasheeda, Tatweel +KASHEEDA = '\u0640' + +# Other symbols +PERCENT = '\u066a' +DECIMAL = '\u066b' +THOUSANDS = '\u066c' +STAR = '\u066d' +FULL_STOP = '\u06d4' +BYTE_ORDER_MARK = '\ufeff' + +#Diacritics +FATHATAN = '\u064b' +DAMMATAN = '\u064c' +KASRATAN = '\u064d' +FATHA = '\u064e' +DAMMA = '\u064f' +KASRA = '\u0650' +SHADDA = '\u0651' +SUKUN = '\u0652' + +# groups + +HAMZAT = ( HAMZA, + HAMZA_ABOVE_ALEF, + HAMZA_BELOW_ALEF, + ALEF_MADDA, + HAMZA_ABOVE_WAW, + HAMZA_ABOVE_YEH, + HAMZA_ABOVE, + HAMZA_BELOW + ) + +ALEFAT = ( + ALEF, + ALEF_MADDA, + HAMZA_BELOW_ALEF, + HAMZA_ABOVE_ALEF, + ALEF_WASLA, + ALEF_MAKSURA, + SMALL_ALEF, + ) + +WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA) + +YEHLIKE = (YEH, HAMZA_ABOVE_YEH, ALEF_MAKSURA, SMALL_YEH) + +WAWLIKE = (WAW, HAMZA_ABOVE_WAW, SMALL_WAW) + +TEHLIKE = (TEH, TEH_MARBUTA) + +SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH) + + +LETTERS = ( + ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH, + DAL, THEL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, + AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH, + HAMZA, ALEF_MADDA, HAMZA_ABOVE_ALEF, HAMZA_ABOVE_WAW, HAMZA_BELOW_ALEF, + HAMZA_ABOVE_YEH, + ) + +TASHKEEL = (FATHATAN, DAMMATAN, KASRATAN, + FATHA, DAMMA, KASRA, + SUKUN, SHADDA + ) + +HARAKAT = (FATHATAN, DAMMATAN, KASRATAN, + FATHA, DAMMA, KASRA, + SUKUN + ) + +SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN) + +TANWEEN = (FATHATAN, DAMMATAN, KASRATAN) + + +NOT_DEF_HARAKA = KASHEEDA + +LIGATURES_LAM_ALEF = (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE) + + +ALPHABETIC_ORDER = { + ALEF: 1, + BEH: 2, + TEH: 3, + TEH_MARBUTA: 3, + THEH: 4, + JEEM: 5, + HAH: 6, + KHAH: 7, + DAL: 8, + THEL: 9, + REH: 10, + ZAIN: 11, + SEEN: 12, + SHEEN: 13, + SAD: 14, + DAD: 15, + TAH: 16, + ZAH: 17, + AIN: 18, + GHAIN: 19, + FEH: 20, + QAF: 21, + KAF: 22, + LAM: 23, + MEEM: 24, + NOON: 25, + HEH: 26, + WAW: 27, + YEH: 28, + HAMZA: 29, + + ALEF_MADDA: 29, + HAMZA_ABOVE_ALEF: 29, + HAMZA_ABOVE_WAW: 29, + HAMZA_BELOW_ALEF: 29, + HAMZA_ABOVE_YEH: 29, + } + +NAMES = { + ALEF: "ألف", + BEH: "باء", + TEH: 'تاء', + TEH_MARBUTA: 'تاء مربوطة', + THEH: 'ثاء', + JEEM: 'جيم', + HAH: 'حاء', + KHAH: 'خاء', + DAL: 'دال', + THEL: 'ذال', + REH: 'راء', + ZAIN: 'زاي', + SEEN: 'سين', + SHEEN: 'شين', + SAD: 'صاد', + DAD: 'ضاد', + TAH: 'طاء', + ZAH: 'ظاء', + AIN: 'عين', + GHAIN: 'غين', + FEH: 'فاء', + QAF: 'قاف', + KAF: 'كاف', + LAM: 'لام', + MEEM: 'ميم', + NOON: 'نون', + HEH: 'هاء', + WAW: 'واو', + YEH: 'ياء', + HAMZA: 'همزة', + + KASHEEDA: 'تطويل', + ALEF_MADDA: 'ألف ممدودة', + ALEF_MAKSURA: 'ألف مقصورة', + HAMZA_ABOVE_ALEF: 'همزة على الألف', + HAMZA_ABOVE_WAW: 'همزة على الواو', + HAMZA_BELOW_ALEF: 'همزة تحت الألف', + HAMZA_ABOVE_YEH: 'همزة على الياء', + FATHATAN: 'فتحتان', + DAMMATAN: 'ضمتان', + KASRATAN: 'كسرتان', + FATHA: 'فتحة', + DAMMA: 'ضمة', + KASRA: 'كسرة', + SHADDA: 'شدة', + SUKUN: 'سكون', + } + +SHAPED_FORMS = { + HAMZA: (HAMZA_ISOLATED), + HAMZA_ABOVE_ALEF: (ALEF_HAMZA_ABOVE_ISOLATED, ALEF_HAMZA_ABOVE_FINAL), + HAMZA_BELOW_ALEF: (ALEF_HAMZA_BELOW_ISOLATED, ALEF_HAMZA_BELOW_FINAL), + HAMZA_ABOVE_YEH: (YEH_HAMZA_ISOLATED, YEH_HAMZA_INITIAL, YEH_HAMZA_MEDIAL, YEH_HAMZA_FINAL), + ALEF_MADDA: (ALEF_MADDA_ISOLATED, ALEF_MADDA_FINAL), + HAMZA_ABOVE_WAW: (WAW_HAMZA_ISOLATED, WAW_HAMZA_FINAL), + ALEF: (ALEF_ISOLATED, ALEF_FINAL), + BEH: (BEH_ISOLATED, BEH_FINAL, BEH_INITIAL, BEH_MEDIAL), + TEH_MARBUTA: (TEH_MARBUTA_ISOLATED, TEH_MARBUTA_FINAL), + TEH: (TEH_ISOLATED, TEH_INITIAL, TEH_MEDIAL, TEH_FINAL), + THEH: (THEH_ISOLATED, THEH_INITIAL, THEH_MEDIAL, THEH_FINAL), + JEEM: (JEEM_ISOLATED, JEEM_INITIAL, JEEM_MEDIAL, JEEM_FINAL), + HAH: (HAH_ISOLATED, HAH_INITIAL, HAH_MEDIAL, HAH_FINAL), + KHAH: (KHAH_ISOLATED, KHAH_INITIAL, KHAH_MEDIAL, KHAH_FINAL), + DAL: (DAL_ISOLATED, DAL_FINAL), + THEL: (THEL_ISOLATED, THEL_FINAL), + REH: (REH_ISOLATED, REH_FINAL), + ZAIN: (ZAIN_ISOLATED, ZAIN_FINAL), + SEEN: (SEEN_ISOLATED, SEEN_INITIAL, SEEN_MEDIAL, SEEN_FINAL), + SHEEN: (SHEEN_ISOLATED, SHEEN_INITIAL, SHEEN_MEDIAL, SHEEN_FINAL), + SAD: (SAD_ISOLATED, SAD_INITIAL, SAD_MEDIAL, SAD_FINAL), + DAD: (DAD_ISOLATED, DAD_INITIAL, DAD_MEDIAL, DAD_FINAL), + TAH: (TAH_ISOLATED, TAH_INITIAL, TAH_MEDIAL, TAH_FINAL), + ZAH: (ZAH_ISOLATED, ZAH_INITIAL, ZAH_MEDIAL, ZAH_FINAL), + AIN: (AIN_ISOLATED, AIN_INITIAL, AIN_MEDIAL, AIN_FINAL), + GHAIN: (GHAIN_ISOLATED, GHAIN_INITIAL, GHAIN_MEDIAL, GHAIN_FINAL), + FEH: (FEH_ISOLATED, FEH_INITIAL, FEH_MEDIAL, FEH_FINAL), + QAF: (QAF_ISOLATED, QAF_INITIAL, QAF_MEDIAL, QAF_FINAL), + KAF: (KAF_ISOLATED, KAF_INITIAL, KAF_MEDIAL, KAF_FINAL), + LAM: (LAM_ISOLATED, LAM_INITIAL, LAM_MEDIAL, LAM_FINAL), + MEEM: (MEEM_ISOLATED, MEEM_INITIAL, MEEM_MEDIAL, MEEM_FINAL), + NOON: (NOON_ISOLATED, NOON_INITIAL, NOON_MEDIAL, NOON_FINAL), + HEH: (HEH_ISOLATED, HEH_INITIAL, HEH_MEDIAL, HEH_FINAL), + WAW: (WAW_ISOLATED, WAW_FINAL), + ALEF_MAKSURA: (ALEF_MAKSURA_ISOLATED, ALEF_MAKSURA_FINAL), + YEH: (YEH_ISOLATED, YEH_INITIAL, YEH_MEDIAL, YEH_FINAL), + LAM_ALEF: (LAM_ALEF_ISOLATED, LAM_ALEF_FINAL), + LAM_ALEF_HAMZA_ABOVE: (LAM_ALEF_HAMZA_ABOVE_ISOLATED, LAM_ALEF_HAMZA_ABOVE_FINAL), + LAM_ALEF_HAMZA_BELOW: (LAM_ALEF_HAMZA_BELOW_ISOLATED, LAM_ALEF_HAMZA_BELOW_FINAL), + LAM_ALEF_MADDA_ABOVE: (LAM_ALEF_MADDA_ABOVE_ISOLATED, LAM_ALEF_MADDA_ABOVE_FINAL) + } + + +PUNCTUATION_MARKS = [COMMA, SEMICOLON, QUESTION] + + +WESTERN_ARABIC_NUMERALS = ['0','1','2','3','4','5','6','7','8','9'] + +EASTERN_ARABIC_NUMERALS = ['۰', '۱', '۲', '۳', '٤', '۵', '٦', '۷', '۸', '۹'] From eff3692166f9cb6371ab1afb9fa9c08c148ab298 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Mon, 23 Jul 2018 08:33:17 +0430 Subject: [PATCH 05/14] Delete ArabicAlphabet.py --- cltk/corpus/persian/ArabicAlphabet.py | 462 -------------------------- 1 file changed, 462 deletions(-) delete mode 100644 cltk/corpus/persian/ArabicAlphabet.py diff --git a/cltk/corpus/persian/ArabicAlphabet.py b/cltk/corpus/persian/ArabicAlphabet.py deleted file mode 100644 index 00aaddf24..000000000 --- a/cltk/corpus/persian/ArabicAlphabet.py +++ /dev/null @@ -1,462 +0,0 @@ -""" - Arabic alphabet - source 1 : pyarabic 'https://github.com/linuxscout/pyarabic' - source 2 : arabicstemmer 'https://github.com/assem-ch/arabicstemmer/blob/master/algorithm/stemmer.sbl' -""" -__author__ = 'Lakhdar Benzahia ' - -# Arabic letters - -# Hamza letter -HAMZA = '\u0621' -HAMZA_ABOVE_ALEF = '\u0623' -HAMZA_BELOW_ALEF = '\u0625' -ALEF_MADDA = '\u0622' -HAMZA_ABOVE_WAW = '\u0624' -HAMZA_ABOVE_YEH = '\u0626' - -ALEF = '\u0627' -ALEF_MAKSURA = '\u0649' -BEH = '\u0628' -TEH_MARBUTA = '\u0629' -TEH = '\u062a' -THEH = '\u062b' -JEEM = '\u062c' -HAH = '\u062d' -KHAH = '\u062e' -DAL = '\u062f' -THEL = '\u0630' -REH = '\u0631' -ZAIN = '\u0632' -SEEN = '\u0633' -SHEEN = '\u0634' -SAD = '\u0635' -DAD = '\u0636' -TAH = '\u0637' -ZAH = '\u0638' -AIN = '\u0639' -GHAIN = '\u063a' -FEH = '\u0641' -QAF = '\u0642' -KAF = '\u0643' -LAM = '\u0644' -MEEM = '\u0645' -NOON = '\u0646' -HEH = '\u0647' -WAW = '\u0648' -YEH = '\u064a' - -MINI_ALEF = '\u0670' -ALEF_WASLA = '\u0671' -MADDA_ABOVE = '\u0653' -HAMZA_ABOVE = '\u0654' -HAMZA_BELOW = '\u0655' - -# Small Letters -SMALL_ALEF = "\u0670" -SMALL_WAW = "\u06E5" -SMALL_YEH = "\u06E6" - -# Ligatures Lam-Alef -LAM_ALEF = '\ufefb' -LAM_ALEF_HAMZA_ABOVE = '\ufef7' -LAM_ALEF_HAMZA_BELOW = '\ufef9' -LAM_ALEF_MADDA_ABOVE = '\ufef5' - - -SIMPLE_LAM_ALEF = '\u0644\u0627' -SIMPLE_LAM_ALEF_HAMZA_ABOVE = '\u0644\u0623' -SIMPLE_LAM_ALEF_HAMZA_BELOW = '\u0644\u0625' -SIMPLE_LAM_ALEF_MADDA_ABOVE = '\u0644\u0622' - -# shaped forms -LAM_ALEF_ISOLATED = '\ufefb' -LAM_ALEF_FINAL = '\ufefc' - -LAM_ALEF_HAMZA_ABOVE_ISOLATED = '\ufef7' -LAM_ALEF_HAMZA_ABOVE_FINAL = '\ufef8' - -LAM_ALEF_HAMZA_BELOW_ISOLATED = '\ufef9' -LAM_ALEF_HAMZA_BELOW_FINAL = '\ufefa' - -LAM_ALEF_MADDA_ABOVE_ISOLATED = '\ufef5' -LAM_ALEF_MADDA_ABOVE_FINAL = '\ufef6' - -HAMZA_ISOLATED = '\ufe80' - -ALEF_HAMZA_ABOVE_ISOLATED = '\ufe83' -ALEF_HAMZA_ABOVE_FINAL = '\ufe84' - -ALEF_HAMZA_BELOW_ISOLATED = '\ufe87' -ALEF_HAMZA_BELOW_FINAL = '\ufe88' - -YEH_HAMZA_INITIAL = '\ufe8b' -YEH_HAMZA_MEDIAL = '\ufe8c' -YEH_HAMZA_ISOLATED = '\ufe89' -YEH_HAMZA_FINAL = '\ufe8a' - -ALEF_MADDA_ISOLATED = '\ufe81' -ALEF_MADDA_FINAL = '\ufe82' - -WAW_HAMZA_ISOLATED = '\ufe85' -WAW_HAMZA_FINAL = '\ufe86' - -ALEF_ISOLATED = '\ufe8d' -ALEF_FINAL = '\ufe8e' - -BEH_ISOLATED = '\ufe8f' -BEH_FINAL = '\ufe90' -BEH_INITIAL = '\ufe91' -BEH_MEDIAL = '\ufe92' - -TEH_MARBUTA_ISOLATED = '\ufe93' -TEH_MARBUTA_FINAL = '\ufe94' - -TEH_INITIAL = '\ufe97' -TEH_MEDIAL = '\ufe98' -TEH_ISOLATED = '\ufe95' -TEH_FINAL = '\ufe96' - -THEH_INITIAL = '\ufe9b' -THEH_MEDIAL = '\ufe9c' -THEH_FINAL = '\ufe9a' -THEH_ISOLATED = '\ufe99' - -JEEM_INITIAL = '\ufe9f' -JEEM_MEDIAL = '\ufea0' -JEEM_ISOLATED = '\ufe9d' -JEEM_FINAL = '\ufe9e' - -HAH_INITIAL = '\ufea3' -HAH_MEDIAL = '\ufea4' -HAH_ISOLATED = '\ufea1' -HAH_FINAL = '\ufea2' - -KHAH_INITIAL = '\ufea7' -KHAH_MEDIAL = '\ufea8' -KHAH_ISOLATED = '\ufea5' -KHAH_FINAL = '\ufea6' - -DAL_ISOLATED = '\ufea9' -DAL_FINAL = '\ufeaa' - -THEL_ISOLATED = '\ufeab' -THEL_FINAL = '\ufeac' - -REH_ISOLATED = '\ufead' -REH_FINAL = '\ufeae' - -ZAIN_ISOLATED = '\ufeaf' -ZAIN_FINAL = '\ufeb0' - -SEEN_INITIAL = '\ufeb3' -SEEN_MEDIAL = '\ufeb4' -SEEN_ISOLATED = '\ufeb1' -SEEN_FINAL = '\ufeb2' - -SHEEN_INITIAL = '\ufeb7' -SHEEN_MEDIAL = '\ufeb8' -SHEEN_ISOLATED = '\ufeb5' -SHEEN_FINAL = '\ufeb6' - -SAD_INITIAL = '\ufebb' -SAD_MEDIAL = '\ufebc' -SAD_ISOLATED = '\ufeb9' -SAD_FINAL = '\ufeba' - -DAD_INITIAL = '\ufebf' -DAD_MEDIAL = '\ufec0' -DAD_ISOLATED = '\ufebd' -DAD_FINAL = '\ufebe' - -TAH_INITIAL = '\ufec3' -TAH_MEDIAL = '\ufec4' -TAH_ISOLATED = '\ufec1' -TAH_FINAL = '\ufec2' - -ZAH_INITIAL = '\ufec7' -ZAH_MEDIAL = '\ufec8' -ZAH_ISOLATED = '\ufec5' -ZAH_FINAL = '\ufec6' - -AIN_INITIAL = '\ufecb' -AIN_MEDIAL = '\ufecc' -AIN_ISOLATED = '\ufec9' -AIN_FINAL = '\ufeca' - -GHAIN_INITIAL = '\ufecf' -GHAIN_MEDIAL = '\ufed0' -GHAIN_ISOLATED = '\ufecd' -GHAIN_FINAL = '\ufece' - -FEH_INITIAL = '\ufed3' -FEH_MEDIAL = '\ufed4' -FEH_ISOLATED = '\ufed1' -FEH_FINAL = '\ufed2' - -QAF_INITIAL = '\ufed7' -QAF_MEDIAL = '\ufed8' -QAF_ISOLATED = '\ufed5' -QAF_FINAL = '\ufed6' - -KAF_INITIAL = '\ufedb' -KAF_MEDIAL = '\ufedC' -KAF_ISOLATED = '\ufed9' -KAF_FINAL = '\ufeda' - -LAM_INITIAL = '\ufedf' -LAM_MEDIAL = '\ufed0' -LAM_ISOLATED = '\ufedd' -LAM_FINAL = '\ufede' - -MEEM_INITIAL = '\ufee3' -MEEM_MEDIAL = '\ufee4' -MEEM_ISOLATED = '\ufee1' -MEEM_FINAL = '\ufee2' - -NOON_INITIAL = '\ufee7' -NOON_MEDIAL = '\ufee8' -NOON_ISOLATED = '\ufee5' -NOON_FINAL = '\ufee6' - -HEH_INITIAL = '\ufeeb' -HEH_MEDIAL = '\ufeec' -HEH_ISOLATED = '\ufee9' -HEH_FINAL = '\ufeea' - -WAW_ISOLATED = '\ufeed' -WAW_FINAL = '\ufeee' - -ALEF_MAKSURA_ISOLATED = '\ufeef' -ALEF_MAKSURA_FINAL = '\ufef0' - -YEH_INITIAL = '\ufef3' -YEH_MEDIAL = '\ufef4' -YEH_ISOLATED = '\ufef1' -YEH_FINAL = '\ufef2' - -# Punctuation marks -COMMA = '\u060C' -SEMICOLON = '\u061B' -QUESTION = '\u061F' - -# Kasheeda, Tatweel -KASHEEDA = '\u0640' - -# Other symbols -PERCENT = '\u066a' -DECIMAL = '\u066b' -THOUSANDS = '\u066c' -STAR = '\u066d' -FULL_STOP = '\u06d4' -BYTE_ORDER_MARK = '\ufeff' - -#Diacritics -FATHATAN = '\u064b' -DAMMATAN = '\u064c' -KASRATAN = '\u064d' -FATHA = '\u064e' -DAMMA = '\u064f' -KASRA = '\u0650' -SHADDA = '\u0651' -SUKUN = '\u0652' - -# groups - -HAMZAT = ( HAMZA, - HAMZA_ABOVE_ALEF, - HAMZA_BELOW_ALEF, - ALEF_MADDA, - HAMZA_ABOVE_WAW, - HAMZA_ABOVE_YEH, - HAMZA_ABOVE, - HAMZA_BELOW - ) - -ALEFAT = ( - ALEF, - ALEF_MADDA, - HAMZA_BELOW_ALEF, - HAMZA_ABOVE_ALEF, - ALEF_WASLA, - ALEF_MAKSURA, - SMALL_ALEF, - ) - -WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA) - -YEHLIKE = (YEH, HAMZA_ABOVE_YEH, ALEF_MAKSURA, SMALL_YEH) - -WAWLIKE = (WAW, HAMZA_ABOVE_WAW, SMALL_WAW) - -TEHLIKE = (TEH, TEH_MARBUTA) - -SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH) - - -LETTERS = ( - ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH, - DAL, THEL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, - AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH, - HAMZA, ALEF_MADDA, HAMZA_ABOVE_ALEF, HAMZA_ABOVE_WAW, HAMZA_BELOW_ALEF, - HAMZA_ABOVE_YEH, - ) - -TASHKEEL = (FATHATAN, DAMMATAN, KASRATAN, - FATHA, DAMMA, KASRA, - SUKUN, SHADDA - ) - -HARAKAT = (FATHATAN, DAMMATAN, KASRATAN, - FATHA, DAMMA, KASRA, - SUKUN - ) - -SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN) - -TANWEEN = (FATHATAN, DAMMATAN, KASRATAN) - - -NOT_DEF_HARAKA = KASHEEDA - -LIGATURES_LAM_ALEF = (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE) - - -ALPHABETIC_ORDER = { - ALEF: 1, - BEH: 2, - TEH: 3, - TEH_MARBUTA: 3, - THEH: 4, - JEEM: 5, - HAH: 6, - KHAH: 7, - DAL: 8, - THEL: 9, - REH: 10, - ZAIN: 11, - SEEN: 12, - SHEEN: 13, - SAD: 14, - DAD: 15, - TAH: 16, - ZAH: 17, - AIN: 18, - GHAIN: 19, - FEH: 20, - QAF: 21, - KAF: 22, - LAM: 23, - MEEM: 24, - NOON: 25, - HEH: 26, - WAW: 27, - YEH: 28, - HAMZA: 29, - - ALEF_MADDA: 29, - HAMZA_ABOVE_ALEF: 29, - HAMZA_ABOVE_WAW: 29, - HAMZA_BELOW_ALEF: 29, - HAMZA_ABOVE_YEH: 29, - } - -NAMES = { - ALEF: "ألف", - BEH: "باء", - TEH: 'تاء', - TEH_MARBUTA: 'تاء مربوطة', - THEH: 'ثاء', - JEEM: 'جيم', - HAH: 'حاء', - KHAH: 'خاء', - DAL: 'دال', - THEL: 'ذال', - REH: 'راء', - ZAIN: 'زاي', - SEEN: 'سين', - SHEEN: 'شين', - SAD: 'صاد', - DAD: 'ضاد', - TAH: 'طاء', - ZAH: 'ظاء', - AIN: 'عين', - GHAIN: 'غين', - FEH: 'فاء', - QAF: 'قاف', - KAF: 'كاف', - LAM: 'لام', - MEEM: 'ميم', - NOON: 'نون', - HEH: 'هاء', - WAW: 'واو', - YEH: 'ياء', - HAMZA: 'همزة', - - KASHEEDA: 'تطويل', - ALEF_MADDA: 'ألف ممدودة', - ALEF_MAKSURA: 'ألف مقصورة', - HAMZA_ABOVE_ALEF: 'همزة على الألف', - HAMZA_ABOVE_WAW: 'همزة على الواو', - HAMZA_BELOW_ALEF: 'همزة تحت الألف', - HAMZA_ABOVE_YEH: 'همزة على الياء', - FATHATAN: 'فتحتان', - DAMMATAN: 'ضمتان', - KASRATAN: 'كسرتان', - FATHA: 'فتحة', - DAMMA: 'ضمة', - KASRA: 'كسرة', - SHADDA: 'شدة', - SUKUN: 'سكون', - } - -SHAPED_FORMS = { - HAMZA: (HAMZA_ISOLATED), - HAMZA_ABOVE_ALEF: (ALEF_HAMZA_ABOVE_ISOLATED, ALEF_HAMZA_ABOVE_FINAL), - HAMZA_BELOW_ALEF: (ALEF_HAMZA_BELOW_ISOLATED, ALEF_HAMZA_BELOW_FINAL), - HAMZA_ABOVE_YEH: (YEH_HAMZA_ISOLATED, YEH_HAMZA_INITIAL, YEH_HAMZA_MEDIAL, YEH_HAMZA_FINAL), - ALEF_MADDA: (ALEF_MADDA_ISOLATED, ALEF_MADDA_FINAL), - HAMZA_ABOVE_WAW: (WAW_HAMZA_ISOLATED, WAW_HAMZA_FINAL), - ALEF: (ALEF_ISOLATED, ALEF_FINAL), - BEH: (BEH_ISOLATED, BEH_FINAL, BEH_INITIAL, BEH_MEDIAL), - TEH_MARBUTA: (TEH_MARBUTA_ISOLATED, TEH_MARBUTA_FINAL), - TEH: (TEH_ISOLATED, TEH_INITIAL, TEH_MEDIAL, TEH_FINAL), - THEH: (THEH_ISOLATED, THEH_INITIAL, THEH_MEDIAL, THEH_FINAL), - JEEM: (JEEM_ISOLATED, JEEM_INITIAL, JEEM_MEDIAL, JEEM_FINAL), - HAH: (HAH_ISOLATED, HAH_INITIAL, HAH_MEDIAL, HAH_FINAL), - KHAH: (KHAH_ISOLATED, KHAH_INITIAL, KHAH_MEDIAL, KHAH_FINAL), - DAL: (DAL_ISOLATED, DAL_FINAL), - THEL: (THEL_ISOLATED, THEL_FINAL), - REH: (REH_ISOLATED, REH_FINAL), - ZAIN: (ZAIN_ISOLATED, ZAIN_FINAL), - SEEN: (SEEN_ISOLATED, SEEN_INITIAL, SEEN_MEDIAL, SEEN_FINAL), - SHEEN: (SHEEN_ISOLATED, SHEEN_INITIAL, SHEEN_MEDIAL, SHEEN_FINAL), - SAD: (SAD_ISOLATED, SAD_INITIAL, SAD_MEDIAL, SAD_FINAL), - DAD: (DAD_ISOLATED, DAD_INITIAL, DAD_MEDIAL, DAD_FINAL), - TAH: (TAH_ISOLATED, TAH_INITIAL, TAH_MEDIAL, TAH_FINAL), - ZAH: (ZAH_ISOLATED, ZAH_INITIAL, ZAH_MEDIAL, ZAH_FINAL), - AIN: (AIN_ISOLATED, AIN_INITIAL, AIN_MEDIAL, AIN_FINAL), - GHAIN: (GHAIN_ISOLATED, GHAIN_INITIAL, GHAIN_MEDIAL, GHAIN_FINAL), - FEH: (FEH_ISOLATED, FEH_INITIAL, FEH_MEDIAL, FEH_FINAL), - QAF: (QAF_ISOLATED, QAF_INITIAL, QAF_MEDIAL, QAF_FINAL), - KAF: (KAF_ISOLATED, KAF_INITIAL, KAF_MEDIAL, KAF_FINAL), - LAM: (LAM_ISOLATED, LAM_INITIAL, LAM_MEDIAL, LAM_FINAL), - MEEM: (MEEM_ISOLATED, MEEM_INITIAL, MEEM_MEDIAL, MEEM_FINAL), - NOON: (NOON_ISOLATED, NOON_INITIAL, NOON_MEDIAL, NOON_FINAL), - HEH: (HEH_ISOLATED, HEH_INITIAL, HEH_MEDIAL, HEH_FINAL), - WAW: (WAW_ISOLATED, WAW_FINAL), - ALEF_MAKSURA: (ALEF_MAKSURA_ISOLATED, ALEF_MAKSURA_FINAL), - YEH: (YEH_ISOLATED, YEH_INITIAL, YEH_MEDIAL, YEH_FINAL), - LAM_ALEF: (LAM_ALEF_ISOLATED, LAM_ALEF_FINAL), - LAM_ALEF_HAMZA_ABOVE: (LAM_ALEF_HAMZA_ABOVE_ISOLATED, LAM_ALEF_HAMZA_ABOVE_FINAL), - LAM_ALEF_HAMZA_BELOW: (LAM_ALEF_HAMZA_BELOW_ISOLATED, LAM_ALEF_HAMZA_BELOW_FINAL), - LAM_ALEF_MADDA_ABOVE: (LAM_ALEF_MADDA_ABOVE_ISOLATED, LAM_ALEF_MADDA_ABOVE_FINAL) - } - - -PUNCTUATION_MARKS = [COMMA, SEMICOLON, QUESTION] - - -WESTERN_ARABIC_NUMERALS = ['0','1','2','3','4','5','6','7','8','9'] - -EASTERN_ARABIC_NUMERALS = ['۰', '۱', '۲', '۳', '٤', '۵', '٦', '۷', '۸', '۹'] From 4259bd61f443be7bb5478e597d20dc47d52b7815 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Mon, 23 Jul 2018 08:40:16 +0430 Subject: [PATCH 06/14] Add Utilities A help to make sure you're using the standard Persian not main Arabic --- cltk/corpus/persian/PersianUtils.py | 91 +++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 cltk/corpus/persian/PersianUtils.py diff --git a/cltk/corpus/persian/PersianUtils.py b/cltk/corpus/persian/PersianUtils.py new file mode 100644 index 000000000..1dcb4f0c8 --- /dev/null +++ b/cltk/corpus/persian/PersianUtils.py @@ -0,0 +1,91 @@ +import re +import alphabet +from cltk.corpus.arabic.alphabet import * + +toReform = [ + { + "characters": [ + HAMZA, + HAMZA_BELOW, + HAMZA_ABOVE, + HAMZA_ISOLATED, + + MINI_ALEF, + SMALL_ALEF, + SMALL_WAW, + SMALL_YEH, + + KASHEEDA, + FATHATAN, + DAMMATAN, + KASRATAN, + FATHA, + DAMMA, + KASRA, + SHADDA, + SUKUN, + alphabet.THOUSANDS, + alphabet.DECIMAL + ], + "toBe": "" + }, + { + "characters": [ + ALEF_MADDA, + ALEF_WASLA, + HAMZA_BELOW_ALEF, + HAMZA_ABOVE_ALEF, + ], + "toBe": alphabet.ALEF + }, + { + "characters": [ + ALEF_MAKSURA, + YEH, + ], + "toBe": alphabet.YE + }, + { + "characters": [KAF], + "toBe": alphabet.KAF + }, + { + "characters": [ + LAM_ALEF, + LAM_ALEF_HAMZA_ABOVE, + LAM_ALEF_HAMZA_BELOW, + LAM_ALEF_MADDA_ABOVE, + ], + "toBe": alphabet.LAM + alphabet.ALEF + }, + { + "characters": [TEH_MARBUTA], + "toBe": alphabet.HE2 + }, +] + +replacementDict = {} +for rule in toReform: + for character in rule["characters"]: + replacementDict[character] = rule["toBe"] + +for originalForm, shapedForms in SHAPED_FORMS.items(): + for form in shapedForms: + replacementDict[form] = replacementDict.get(originalForm, originalForm) + +for i in range(10): + replacementDict[EASTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i] + replacementDict[WESTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i] + # Use the commented parts for Word2Vec embeddings + # replacementDict[alphabet.NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i] + + +# for char in '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~’”“′‘\\\]؟؛«»،٪': +# replacementDict[char] = " " +# +# replacementDict[" +"] = " " + +replacementRegex = re.compile("(%s)" % "|".join(map(re.escape, replacementDict.keys()))) + +def standardize(text): + return replacementRegex.sub(lambda mo: replacementDict[mo.string[mo.start():mo.end()]], text) From dcbacfd661a8db80691df173ee61e7a643725f2b Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Mon, 23 Jul 2018 08:46:38 +0430 Subject: [PATCH 07/14] Update Persian Alphabet Add Numerals --- cltk/corpus/persian/alphabet.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/cltk/corpus/persian/alphabet.py b/cltk/corpus/persian/alphabet.py index 040245802..40ff55993 100644 --- a/cltk/corpus/persian/alphabet.py +++ b/cltk/corpus/persian/alphabet.py @@ -1,6 +1,5 @@ """Persian alphabet""" - __author__ = "Iman Nazari" # HAMZEH Family @@ -122,4 +121,28 @@ YE: 32 } -PERSIAN_NUMERALS = ["۰","۱","۲","۳","۴","۵","۶","۷","۸","۹"] +NUMERALS = { + 0: '۰', + 1: '۱', + 2: '۲', + 3: '۳', + 4: '۴', + 5: '۵', + 6: '۶', + 7: '۷', + 8: '۸', + 9: '۹' +} + +NUMERALS_WRITINGS = { + 0: "صفر", + 1: "یک", + 2: "دو", + 3: "سه", + 4: "چهار", + 5: "پنج", + 6: "شش", + 7: "هفت", + 8: "هشت", + 9: "نه" +} From 149d57c71a270e828d096fabb7151f6abb6edabe Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Mon, 23 Jul 2018 08:47:11 +0430 Subject: [PATCH 08/14] Update PersianUtils.py --- cltk/corpus/persian/PersianUtils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cltk/corpus/persian/PersianUtils.py b/cltk/corpus/persian/PersianUtils.py index 1dcb4f0c8..aa83a708b 100644 --- a/cltk/corpus/persian/PersianUtils.py +++ b/cltk/corpus/persian/PersianUtils.py @@ -1,5 +1,5 @@ import re -import alphabet +import cltk.corpus.persian.alphabet as alphabet from cltk.corpus.arabic.alphabet import * toReform = [ From 499e1cd3991acbf4a64c5b2e9af43c2245da8e4f Mon Sep 17 00:00:00 2001 From: "Kyle P. Johnson" Date: Mon, 23 Jul 2018 10:16:42 -0700 Subject: [PATCH 09/14] Change name for pep8 --- cltk/corpus/persian/{PersianUtils.py => persian_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cltk/corpus/persian/{PersianUtils.py => persian_utils.py} (100%) diff --git a/cltk/corpus/persian/PersianUtils.py b/cltk/corpus/persian/persian_utils.py similarity index 100% rename from cltk/corpus/persian/PersianUtils.py rename to cltk/corpus/persian/persian_utils.py From 67519b0acae934d87f18bbeea7bb782803f7679a Mon Sep 17 00:00:00 2001 From: "Kyle P. Johnson" Date: Mon, 23 Jul 2018 10:20:24 -0700 Subject: [PATCH 10/14] Change namespaces for pep8 --- cltk/corpus/persian/persian_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cltk/corpus/persian/persian_utils.py b/cltk/corpus/persian/persian_utils.py index aa83a708b..e53236b79 100644 --- a/cltk/corpus/persian/persian_utils.py +++ b/cltk/corpus/persian/persian_utils.py @@ -2,7 +2,7 @@ import cltk.corpus.persian.alphabet as alphabet from cltk.corpus.arabic.alphabet import * -toReform = [ +to_reform = [ { "characters": [ HAMZA, @@ -27,7 +27,7 @@ alphabet.THOUSANDS, alphabet.DECIMAL ], - "toBe": "" + "to_be": "" }, { "characters": [ @@ -36,18 +36,18 @@ HAMZA_BELOW_ALEF, HAMZA_ABOVE_ALEF, ], - "toBe": alphabet.ALEF + "to_be": alphabet.ALEF }, { "characters": [ ALEF_MAKSURA, YEH, ], - "toBe": alphabet.YE + "to_be": alphabet.YE }, { "characters": [KAF], - "toBe": alphabet.KAF + "to_be": alphabet.KAF }, { "characters": [ @@ -56,18 +56,18 @@ LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE, ], - "toBe": alphabet.LAM + alphabet.ALEF + "to_be": alphabet.LAM + alphabet.ALEF }, { "characters": [TEH_MARBUTA], - "toBe": alphabet.HE2 + "to_be": alphabet.HE2 }, ] replacementDict = {} for rule in toReform: for character in rule["characters"]: - replacementDict[character] = rule["toBe"] + replacementDict[character] = rule["to_be"] for originalForm, shapedForms in SHAPED_FORMS.items(): for form in shapedForms: From f3a122777da71a61af0ae8e2819d192951f211fc Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Thu, 2 Aug 2018 20:12:39 +0430 Subject: [PATCH 11/14] Add Word2Vec needed function for Persian Add a function named standardize4Word2vec that standardizes text for a Word2Vec training. This function is almost all you need to do for preprocessing. --- cltk/corpus/persian/persian_utils.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cltk/corpus/persian/persian_utils.py b/cltk/corpus/persian/persian_utils.py index e53236b79..c025c4302 100644 --- a/cltk/corpus/persian/persian_utils.py +++ b/cltk/corpus/persian/persian_utils.py @@ -72,20 +72,28 @@ for originalForm, shapedForms in SHAPED_FORMS.items(): for form in shapedForms: replacementDict[form] = replacementDict.get(originalForm, originalForm) + +replacementDict4Word2vec = replacementDict.copy() for i in range(10): replacementDict[EASTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i] replacementDict[WESTERN_ARABIC_NUMERALS[i]] = alphabet.NUMERALS[i] - # Use the commented parts for Word2Vec embeddings - # replacementDict[alphabet.NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i] +for i in range(10): + replacementDict4Word2vec[EASTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i] + replacementDict4Word2vec[WESTERN_ARABIC_NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i] + replacementDict4Word2vec[alphabet.NUMERALS[i]] = " %s " % alphabet.NUMERALS_WRITINGS[i] + +for char in '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~’”“′‘\\\]؟؛«»،٪': + replacementDict4Word2vec[char] = " " -# for char in '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~’”“′‘\\\]؟؛«»،٪': -# replacementDict[char] = " " -# -# replacementDict[" +"] = " " +replacementDict4Word2vec[" +"] = " " +replacementRegex4Word2vec = re.compile("(%s)" % "|".join(map(re.escape, replacementDict4Word2vec.keys()))) replacementRegex = re.compile("(%s)" % "|".join(map(re.escape, replacementDict.keys()))) def standardize(text): return replacementRegex.sub(lambda mo: replacementDict[mo.string[mo.start():mo.end()]], text) + +def standardize4Word2vec(text): + return replacementRegex.sub(lambda mo: replacementDict4Word2vec[mo.string[mo.start():mo.end()]], text) From e6759ce5b4cc44d6b3a4082b3621ca750958e5a1 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Thu, 2 Aug 2018 20:46:12 +0430 Subject: [PATCH 12/14] Update persian_utils.py --- cltk/corpus/persian/persian_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cltk/corpus/persian/persian_utils.py b/cltk/corpus/persian/persian_utils.py index c025c4302..9aae6e4fc 100644 --- a/cltk/corpus/persian/persian_utils.py +++ b/cltk/corpus/persian/persian_utils.py @@ -65,7 +65,7 @@ ] replacementDict = {} -for rule in toReform: +for rule in to_reform: for character in rule["characters"]: replacementDict[character] = rule["to_be"] From 87fb0c2b08d8401e1fec422dd1e2eb0a12dfa0c7 Mon Sep 17 00:00:00 2001 From: Iman Nazari <32216131+ishto7@users.noreply.github.com> Date: Thu, 2 Aug 2018 22:49:54 +0430 Subject: [PATCH 13/14] Update persian_utils.py --- cltk/corpus/persian/persian_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cltk/corpus/persian/persian_utils.py b/cltk/corpus/persian/persian_utils.py index 9aae6e4fc..af3b6ae73 100644 --- a/cltk/corpus/persian/persian_utils.py +++ b/cltk/corpus/persian/persian_utils.py @@ -96,4 +96,4 @@ def standardize(text): return replacementRegex.sub(lambda mo: replacementDict[mo.string[mo.start():mo.end()]], text) def standardize4Word2vec(text): - return replacementRegex.sub(lambda mo: replacementDict4Word2vec[mo.string[mo.start():mo.end()]], text) + return replacementRegex4Word2vec.sub(lambda mo: replacementDict4Word2vec[mo.string[mo.start():mo.end()]], text) From 65f036264fa605f4f8ad583c128de2c2f8b40531 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Fri, 8 Apr 2022 04:18:22 +0000 Subject: [PATCH 14/14] fix: requirements.txt to reduce vulnerabilities The following vulnerabilities are fixed by pinning transitive dependencies: - https://snyk.io/vuln/SNYK-PYTHON-NOTEBOOK-2441824 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 55a5bdd79..959f084fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,7 +41,7 @@ mypy==0.610 nbconvert==5.3.1 nbformat==4.4.0 nltk==3.3 -notebook==5.5.0 +notebook==6.4.10 numpy==1.14.5 pandocfilters==1.4.2 parso==0.3.1