Multi-Label-Classification-Using-MI-and-SVM/Preprocessing.py at master · irwanafandi24/Multi-Label-Classification-Using-MI-and-SVM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 26 00:30:26 2019

@author: Irwan
"""

import re #regular expression
from nltk.tokenize import word_tokenize #tokenisasi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory #stopword
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #stemming
import xlwt

# Shared Sastrawi helpers, built once when this module is imported.
# `stemmer` and `stopword` are the objects meant for the stemming and
# stop-word-removal steps; in the preprocessing functions visible in this file
# those two steps are currently disabled, so the objects are created but unused.
# NOTE(review): other modules may import these names - confirm before renaming.
factori = StemmerFactory()
stemmer = factori.create_stemmer()
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

class hadisClass(object):
    """Plain record for one spreadsheet row: a hadith text plus three values.

    Attributes are set directly from the constructor arguments:
        hadis: the text of the hadith (first column of the row).
        k1, k2, k3: the three values that follow the text in the row
            (presumably the class labels - confirm against the data file).
    """

    def __init__(self, hadis, k1, k2, k3):
        # Store the four fields exactly as given; no validation or conversion.
        self.hadis, self.k1, self.k2, self.k3 = hadis, k1, k2, k3

def openFile(wb):
    """Collect the hadith texts and row records from every sheet of a workbook.

    Args:
        wb: workbook object exposing ``sheets()``; every sheet must provide
            ``nrows`` and ``cell(row, col).value`` (looks like an xlrd
            workbook - confirm against the caller).

    Returns:
        A tuple ``(hadisContent, items)``:
        hadisContent is the list of column-0 values (the raw texts), and
        items is a list of ``hadisClass`` objects built from the first four
        columns of each row. Both lists are in sheet order, then row order.
    """
    column_count = 4                    # only the first four columns are read
    texts, records = [], []
    for sheet in wb.sheets():
        for row_idx in range(sheet.nrows):
            # Read the first four cells of this row in column order.
            row_values = [sheet.cell(row_idx, col_idx).value
                          for col_idx in range(column_count)]
            texts.append(row_values[0])                 # column 0 holds the text
            records.append(hadisClass(*row_values))     # one record per row
    return texts, records

def preprocessingProcess(hadisContent):
    """Clean, lower-case and tokenize every text of ``hadisContent`` in place.

    For each element the function removes every character that is not an
    ASCII letter or whitespace, lower-cases the result and splits it into
    tokens with ``word_tokenize``. The element is then replaced by its token
    list.

    Args:
        hadisContent: mutable sequence (list) of strings. It is modified in
            place: each string is overwritten by its list of tokens.

    Returns:
        The same ``hadisContent`` object, now holding one token list per text.
    """
    # Fix: the original class was ``[^a-zA-z\s]``. The range ``A-z`` also spans
    # the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were
    # not removed. A raw string is used so ``\s`` is not an invalid str escape.
    non_letters = re.compile(r'[^a-zA-Z\s]')            # compiled once for the loop

    for i, text in enumerate(hadisContent):
        print(i)                                        # index of the document being processed
        cleaned = non_letters.sub('', text)             # cleaning
        casefold = cleaned.lower()                      # case folding
        # NOTE: stop-word removal (stopword.remove) and stemming (stemmer.stem)
        # would run here before tokenization; they are disabled because
        # stemming makes the run very slow.
        hadisContent[i] = word_tokenize(casefold)       # tokenization
    return hadisContent

def preprocessingInput(hadisContent):                                           #just for input data from keyboard
    """Clean, lower-case and tokenize a single text.

    Args:
        hadisContent: one string (per the original comment, text typed in
            by the user).

    Returns:
        The list of tokens produced by ``word_tokenize`` after every
        character that is not an ASCII letter or whitespace has been removed
        and the text has been lower-cased.
    """
    # Fix: the original class was ``[^a-zA-z\s]``. The range ``A-z`` also spans
    # the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were
    # not removed. A raw string is used so ``\s`` is not an invalid str escape.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', hadisContent)                          # cleaning
    casefold = cleaned.lower()                                                  # case folding
    # NOTE: stop-word removal (stopword.remove) and stemming (stemmer.stem)
    # would run here before tokenization; they are disabled because stemming
    # makes the run very slow.
    return word_tokenize(casefold)                                              # tokenization

def saveHasil(filename, sheet, content, p1, p2, p3):
    """Write texts and three parallel result columns to an .xls workbook.

    Row ``r`` of the sheet receives ``content[r]`` in column 0 and
    ``str(p1[r])``, ``str(p2[r])``, ``str(p3[r])`` in columns 1-3.

    Args:
        filename: file name of the workbook; it is saved under ``Output/``.
        sheet: name of the single worksheet that is created.
        content: sequence of values for column 0, one per row.
        p1, p2, p3: sequences indexed like ``content``; each entry is
            converted with ``str`` before being written.

    Returns:
        None. The workbook is written to ``'Output/' + filename``.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet(sheet)

    for row, text in enumerate(content):
        worksheet.write(row, 0, text)
        # Columns 1..3 hold the stringified entries of p1, p2 and p3.
        for col, column_values in enumerate((p1, p2, p3), start=1):
            worksheet.write(row, col, str(column_values[row]))

    workbook.save('Output/' + filename)