-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtraitement.py
More file actions
90 lines (77 loc) · 2.51 KB
/
traitement.py
File metadata and controls
90 lines (77 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
import spacy
from collections import Counter
nlp = spacy.load("fr_core_news_md")
ponctuation = re.compile(r'[^\w\s]')
def readfile(filename):
    """Read the file at *filename* and return its entire contents as UTF-8 text."""
    with open(filename, encoding="utf-8") as handle:
        return handle.read()
def count_space(content):
    """Return the number of ASCII space characters (" ") in *content*.

    Uses str.count instead of the original manual loop — same result,
    one C-level pass.
    """
    return content.count(" ")
def count_caractere(content):
    """Return the total number of characters in *content*.

    The original counted characters one by one in a loop; len() gives the
    identical result directly.
    """
    return len(content)
def count_word(content):
    """Count the words in *content*.

    Straight apostrophes are normalized to the French curly apostrophe
    ("’"), which is kept so that elisions like "l'arbre" count as a single
    word. Every other punctuation character is turned into a space, then
    the text is split on whitespace.

    Fixes in this rewrite:
    - the original built a literal "\\spec+" marker string (an invalid
      escape sequence, now a SyntaxWarning) and its replace-loop corrupted
      the text whenever the input mixed "+" or "\\" with other punctuation;
    - the dead ``elif ponct == "’": pass`` branch is removed.
    """
    content = content.replace("'", "’")
    # Replace every punctuation char except the apostrophe with a space.
    return len(re.sub(r"[^\w\s’]", " ", content).split())
def clean(content):
    """Return *content* lower-cased, with every punctuation character and
    newline replaced by a single space.

    Fixes in this rewrite: the original routed replacements through a
    literal "\\spec+" marker (an invalid escape sequence) whose replace-loop
    corrupted text containing "+" or "\\" mixed with other punctuation; a
    single re.sub does the same substitution safely.
    """
    return re.sub(r"[^\w\s]", " ", content).replace("\n", " ").lower()
def find_keyword(content, total):
    """Return the 8 most frequent non-stopword tokens of *content*.

    Each result is a dict with keys "mot" (the word), "occurence" (its
    count — key name kept for caller compatibility) and "frequence"
    (count / *total* * 100, rounded to 3 decimals). Stopwords are loaded
    from the local file ``stopwords.txt``.

    Fixes in this rewrite: the original counted each word with a nested
    O(n²) loop, tested stopword membership against a list (O(n) per
    lookup) and deduplicated by scanning the result list; Counter + a set
    give the same output. Ties in most_common keep first-seen order, which
    matches the original stable sort over first occurrences.
    """
    words = clean(content).split()
    with open("stopwords.txt", encoding="utf-8") as f:
        stopwords = set(clean(f.read()).split())
    counts = Counter(w for w in words if w not in stopwords)
    return [
        {
            "mot": word,
            "occurence": count,
            "frequence": round((count / total) * 100, 3),
        }
        for word, count in counts.most_common(8)
    ]
def count_paragraphs(content):
    """Count paragraphs, i.e. chunks separated by a blank line ("\\n\\n").

    Splitting on "\\n\\n" yields one more piece than there are separators,
    so the paragraph count is the number of non-overlapping "\\n\\n"
    occurrences plus one.
    """
    return content.count("\n\n") + 1
def verbs(content):
    """Return the spaCy tokens of *content* whose part-of-speech tag is VERB."""
    return [tok for tok in nlp(content) if tok.pos_ == "VERB"]
def verb_occurence(content):
    """Map each verb of *content* (lower-cased surface form) to its occurrence count."""
    return Counter(tok.text.lower() for tok in verbs(content))
def ner(content):
    """Return one (token, IOB tag, entity type) triple per token of *content*."""
    triples = []
    for tok in nlp(content):
        triples.append((tok, tok.ent_iob_, tok.ent_type_))
    return triples
def get_sentence(content):
    """Return the number of sentences the spaCy pipeline detects in *content*."""
    return sum(1 for _ in nlp(content).sents)