-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtraitement.py
More file actions
90 lines (77 loc) · 2.51 KB
/
traitement.py
File metadata and controls
90 lines (77 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
import spacy
from collections import Counter
nlp = spacy.load("fr_core_news_md")
ponctuation = re.compile(r'[^\w\s]')
def readfile(filename):
    """Read the file at *filename* and return its entire contents as UTF-8 text."""
    with open(filename, encoding="utf-8") as handle:
        return handle.read()
def count_space(content):
    """Return the number of ASCII space characters (" ") in *content*.

    Uses str.count instead of the original manual loop — same result,
    one C-level pass.
    """
    return content.count(" ")
def count_caractere(content):
    """Return the total number of characters in *content*.

    The original counted characters one by one in a loop; len() gives the
    identical result directly.
    """
    return len(content)
def count_word(content):
    """Count the words in *content*.

    Straight apostrophes are normalized to the French curly apostrophe
    ("’"), which is kept so that elisions like "l'arbre" count as a single
    word. Every other punctuation character is turned into a space, then
    the text is split on whitespace.

    Fixes in this rewrite:
    - the original built a literal "\\spec+" marker string (an invalid
      escape sequence, now a SyntaxWarning) and its replace-loop corrupted
      the text whenever the input mixed "+" or "\\" with other punctuation;
    - the dead ``elif ponct == "’": pass`` branch is removed.
    """
    content = content.replace("'", "’")
    # Replace every punctuation char except the apostrophe with a space.
    return len(re.sub(r"[^\w\s’]", " ", content).split())
def clean(content):
    """Return *content* lower-cased, with every punctuation character and
    newline replaced by a single space.

    Fixes in this rewrite: the original routed replacements through a
    literal "\\spec+" marker (an invalid escape sequence) whose replace-loop
    corrupted text containing "+" or "\\" mixed with other punctuation; a
    single re.sub does the same substitution safely.
    """
    return re.sub(r"[^\w\s]", " ", content).replace("\n", " ").lower()
def find_keyword(content, total):
    """Return the 8 most frequent non-stopword tokens of *content*.

    Each result is a dict with keys "mot" (the word), "occurence" (its
    count — key name kept for caller compatibility) and "frequence"
    (count / *total* * 100, rounded to 3 decimals). Stopwords are loaded
    from the local file ``stopwords.txt``.

    Fixes in this rewrite: the original counted each word with a nested
    O(n²) loop, tested stopword membership against a list (O(n) per
    lookup) and deduplicated by scanning the result list; Counter + a set
    give the same output. Ties in most_common keep first-seen order, which
    matches the original stable sort over first occurrences.
    """
    words = clean(content).split()
    with open("stopwords.txt", encoding="utf-8") as f:
        stopwords = set(clean(f.read()).split())
    counts = Counter(w for w in words if w not in stopwords)
    return [
        {
            "mot": word,
            "occurence": count,
            "frequence": round((count / total) * 100, 3),
        }
        for word, count in counts.most_common(8)
    ]
def count_paragraphs(content):
    """Count paragraphs, i.e. chunks separated by a blank line ("\\n\\n").

    Splitting on "\\n\\n" yields one more piece than there are separators,
    so the paragraph count is the number of non-overlapping "\\n\\n"
    occurrences plus one.
    """
    return content.count("\n\n") + 1
def verbs(content):
    """Return the spaCy tokens of *content* whose part-of-speech tag is VERB."""
    return [tok for tok in nlp(content) if tok.pos_ == "VERB"]
def verb_occurence(content):
    """Map each verb of *content* (lower-cased surface form) to its occurrence count."""
    return Counter(tok.text.lower() for tok in verbs(content))
def ner(content):
    """Return one (token, IOB tag, entity type) triple per token of *content*."""
    triples = []
    for tok in nlp(content):
        triples.append((tok, tok.ent_iob_, tok.ent_type_))
    return triples
def get_sentence(content):
    """Return the number of sentences the spaCy pipeline detects in *content*."""
    return sum(1 for _ in nlp(content).sents)