-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprint.py
More file actions
150 lines (114 loc) · 3.83 KB
/
print.py
File metadata and controls
150 lines (114 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Leftover smoke-test output from the original tutorial script.
print("Hello World!")
# docx2txt: DOCX text extraction; PyPDF2: PDF reading (writer/merger unused here).
import docx2txt
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
def doctotext(m):
    """Extract the text of a DOCX file as a single space-joined line.

    Args:
        m: Path to the .docx file.

    Returns:
        All non-empty lines of the document, tabs replaced with spaces,
        joined by single spaces.
    """
    raw = docx2txt.process(m)
    pieces = []
    for chunk in raw.split('\n'):
        if chunk:
            pieces.append(chunk.replace('\t', ' '))
    return ' '.join(pieces)
# Extracting text from PDF
def pdftotext(m):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        m: Path to the .pdf file.

    Returns:
        The text of all pages, concatenated in page order.
    """
    text = ''
    # Bug fix: the original opened the file and never closed it; a context
    # manager guarantees the handle is released even on error.
    with open(m, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        # Iterate pages directly instead of a manual while-counter.
        # (Removed a leftover debug print of reader.getFields().)
        for page_number in range(reader.numPages):
            text += reader.getPage(page_number).extractText()
    return text
# Main entry point: pick the input file and extract its text.
if __name__ == '__main__':
    FilePath = 'AI.pdf'
    # (Removed a dead `FilePath.lower().endswith(...)` expression whose
    # result was discarded.)
    if FilePath.endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.endswith('.pdf'):
        # Bug fix: extract from the selected file, not a hard-coded
        # 'Sample.pdf'.
        textinput = pdftotext(FilePath)
    else:
        # Bug fix: define textinput so the print below cannot raise
        # NameError on an unsupported extension; also fixed the message.
        textinput = ''
        print("File not supported")
    # Show a short preview of the extracted text.
    print(textinput[0:10])
import spacy
import en_core_web_sm
from spacy.matcher import Matcher
# load pre-trained model (small English pipeline)
nlp = en_core_web_sm.load()
# initialize matcher with a vocab
# NOTE(review): this matcher is module-level shared state; functions that
# add patterns to it will accumulate them across calls.
matcher = Matcher(nlp.vocab)
def extract_name(resume_text):
    """Return the first pair of consecutive proper nouns in the text.

    First and last names are assumed to appear as two adjacent PROPN
    tokens (typically at the top of a resume).

    Args:
        resume_text: Raw resume text.

    Returns:
        The matched two-token span text, or None when no match is found.
    """
    nlp_text = nlp(resume_text)
    # Bug fix: use a fresh matcher per call instead of re-adding the NAME
    # pattern to the shared module-level matcher on every invocation,
    # which accumulated duplicate patterns. (Leftover debug prints of
    # the raw match list were also removed.)
    name_matcher = Matcher(nlp.vocab)
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    name_matcher.add('NAME', [pattern])
    matches = name_matcher(nlp_text)
    for match_id, start, end in matches:
        # Return the first match only, as the original did.
        return nlp_text[start:end].text
    # Explicit None when no proper-noun pair is found.
    return None
# Report the candidate name extracted from the resume text.
print('Name: ',extract_name(textinput))
import re
from nltk.corpus import stopwords
# Grab all general English stop words.
STOPWORDS = set(stopwords.words('english'))
# Education degree keywords; tokens are upper-cased before comparison.
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    # Bug fix: a missing comma in the original ('SSC' 'HSC') silently
    # concatenated the two literals into 'SSCHSC', so neither matched.
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII',
]
def extract_education(resume_text):
    """Extract education degrees and, when present, their year.

    Args:
        resume_text: Raw resume text.

    Returns:
        A list whose items are (degree, year) tuples when a 19xx/20xx
        year appears in the same sentence as the degree, or bare degree
        strings otherwise.
    """
    nlp_text = nlp(resume_text)
    # Sentence-tokenize via spaCy.
    sentences = [str(sent).strip() for sent in nlp_text.sents]
    # Map cleaned degree token -> the sentence it appeared in.
    edu = {}
    for sentence in sentences:
        for token in sentence.split():
            # Strip punctuation so e.g. 'B.E.,' compares as 'BE'.
            cleaned = re.sub(r'[?|$|.|!|,]', r'', token)
            if cleaned.upper() in EDUCATION and cleaned not in STOPWORDS:
                # Bug fix: store the sentence once; the original stored
                # text + nlp_text[index], i.e. the sentence twice.
                edu[cleaned] = sentence
    # Bug fix: the original pattern used the invalid quantifier r'\d{}',
    # which matches a literal '{}' and therefore never found a year.
    # Compiled once, outside the loop.
    year_pattern = re.compile(r'((20|19)\d{2})')
    education = []
    for degree, sentence in edu.items():
        year = year_pattern.search(sentence)
        if year:
            education.append((degree, year.group(0)))
        else:
            education.append(degree)
    return education
# Report the education qualifications detected in the resume text.
print('Qualification: ',extract_education(textinput))
import pandas as pd
import spacy
# NOTE(review): this rebinds the module-level `nlp` already created via
# en_core_web_sm.load() above — same model, redundant reload; confirm
# whether it can be removed.
nlp = spacy.load('en_core_web_sm')
def extract_skills(resume_text, skills=None):
    """Return the known skills that appear as tokens in the resume.

    Args:
        resume_text: Raw resume text.
        skills: Optional iterable of skill names to search for.
            Defaults to the original hard-coded ['JavaScript', 'SQL'],
            so existing callers are unaffected.

    Returns:
        A list of matching tokens in order of appearance (duplicates
        preserved, as in the original).
    """
    # Generalization: the skill vocabulary is now a parameter instead of
    # a hard-coded constant. (A leftover debug print of the skill list
    # was removed.)
    if skills is None:
        skills = ['JavaScript', 'SQL']
    nlp_text = nlp(resume_text)
    # Remove stop words while word-tokenizing.
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # Set membership is O(1) per lookup vs O(n) on a list.
    skill_set = set(skills)
    return [token for token in tokens if token in skill_set]
# Report the skills detected in the resume text.
print ('Skills',extract_skills(textinput))