Textmining/test.py at master · LucaZH/Textmining · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import spacy

nlp = spacy.load("fr_core_news_md")
with open("file.txt", "r", encoding="utf-8") as file:
    text = file.read()
    doc = nlp(text)

    for ent in doc.ents:
        print(ent.text, ent.label_)
import re
import spacy

nlp = spacy.load("fr_core_news_md")
class TextAnalyzer:
    def __init__(self, content):
        self.content = content
        self.ponctuation = re.compile(r'[^\w\s]')
    def clean(self,content):
        for ponct in self.ponctuation.findall(content):
            content= content.replace(ponct,"\spec+")
        content=content.replace("\n", "\spec+")
        content=content.replace("\spec+", " ")
        return content.lower()
    def count_space(self):
        espace = 0
        for element in self.content:
            if element == " ":
                espace += 1
        return espace

    def count_caractere(self):
        caractere = 0
        for element in self.content:
            caractere += 1
        return caractere

    def count_word(self):
        content = self.content
        for ponct in self.ponctuation.findall(content):
            content = content.replace("'", "’")
            if ponct != "’":
                content = content.replace(ponct, "\spec+")
            elif ponct == "’":
                pass
        content = content.replace("\n", "\spec+")
        content = content.replace("\spec+", " ")
        return len(content.split())

    def get_sentence(self):
        doc = nlp(self.content)
        sentences = list(doc.sents)
        return len(sentences)

    def findkeyword(self,total):
        cleancontents=self.content.split()
        with open("stopwords.txt" , encoding="utf-8") as stopwords:
            g=stopwords.read()
            nb=0
            ocurence = []
            r=self.clean(g).split()
            for i in range(len(cleancontents)):
                if cleancontents[i] in r:
                        pass
                else:
                    for j in range(len(cleancontents)):
                        if cleancontents[i]==cleancontents[j] :
                            nb+=1
                    element={
                            "mot":cleancontents[i] ,
                            "occurence": nb,
                            "frequence": round((nb/total)*100,3)
                            }
                    if element not in ocurence:
                        ocurence.append(element)
                nb=0
        return sorted(ocurence,key=lambda x:  x["occurence"],reverse=True)[:8]
    def count_paragraphs(self):
        paragraphs = self.content.split("\n\n")
        return len(paragraphs)

    def count_verbs(self):
        doc = nlp(self.content)
        verbs = [token for token in doc if token.pos_ == "VERB"]
        return len(verbs)

    def ner(self):
        doc = nlp(self.content)
        entities = [(word, word.ent_iob_, word.ent_type_) for word in doc]
        return entities
# with open("file.txt", encoding="utf-8") as f:
#     content = f.read()

# analyzer = TextAnalyzer(content)

# print("Nombre d'espaces :", analyzer.count_space())
# print("Nombre de caractères :", analyzer.count_caractere())
# print("Nombre de mots :", analyzer.count_word())
# print("Nombre de phrases :", analyzer.getsentence())
# print("Nombre de paragraphes :", analyzer.count_paragraphs())
# print("Nombre de verbes :", analyzer.count_verbs())
# print("Entités nommées :", analyzer.ner())
# print("Mot clés:", analyzer.findkeyword(analyzer.count_word()))

# doc = nlp(content)
# for ent in doc.ents:
#     print(ent.text, spacy.explain(f"{ent.label_}")()