-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
95 lines (81 loc) · 2.9 KB
/
preprocessing.py
File metadata and controls
95 lines (81 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# preprocessing.py
import re
import string
import html
from bs4 import BeautifulSoup
import pandas as pd
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Optional: download required NLTK packages
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
# --- STEP 1: Handle missing values ---
def check_and_handle_missing_values(df):
    """Drop every row that contains at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame with NaN-containing rows removed (or the original frame
        unchanged when no values were missing).
    """
    # Count missing *cells*; dropna() below removes whole *rows*, so the
    # two numbers are reported separately (the original message conflated them).
    missing_cells = df.isnull().sum().sum()
    if missing_cells > 0:
        rows_before = len(df)
        df = df.dropna()
        print(
            f"Dropped {rows_before - len(df)} rows containing "
            f"{missing_cells} missing values. Remaining rows: {len(df)}"
        )
    else:
        print("No missing values found.")
    return df
# --- STEP 2: Remove duplicates ---
def remove_duplicates(df, subset=None, keep='first'):
    """Remove duplicate rows and report how many were actually dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified in place.
    subset : list[str] | None
        Columns to consider when identifying duplicates (all columns if None).
    keep : {'first', 'last', False}
        Passed through to ``drop_duplicates``; which member of each
        duplicate group survives.

    Returns
    -------
    pandas.DataFrame
        The deduplicated frame.
    """
    # Count with the SAME `keep` value used for dropping, so `removed`
    # equals the number of rows actually discarded.  (The original used
    # keep=False here, which counts every member of each duplicate group
    # and therefore overstated the removal count.)
    removed = df.duplicated(subset=subset, keep=keep).sum()
    if removed > 0:
        df = df.drop_duplicates(subset=subset, keep=keep)
        print(f"Removed {removed} duplicate rows. Remaining rows: {len(df)}")
    else:
        print("No duplicates found.")
    return df
# --- STEP 3: NLP preprocessing ---
def get_wordnet_pos(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first letter of the tag matters ('J' adjective, 'V' verb,
    'N' noun, 'R' adverb); anything else falls back to noun, which is
    also WordNetLemmatizer's default.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)
def remove_links(text):
    """Delete http(s)://... and www.... URLs from *text*.

    A URL is any run of non-whitespace starting with 'http' or 'www.';
    surrounding whitespace is left untouched.
    """
    url_pattern = re.compile(r'http\S+|www\.\S+')
    return url_pattern.sub('', text)
def clean_html_with_code(text):
    """Flatten an HTML fragment to plain text, preserving code snippets.

    ``<pre><code>`` blocks are wrapped in [CODE]...[/CODE] markers,
    inline ``<code>`` text is kept verbatim, all other text nodes are
    concatenated, HTML entities are unescaped, and whitespace runs are
    collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw HTML markup.

    Returns
    -------
    str
        Cleaned, single-spaced plain text.
    """
    soup = BeautifulSoup(text, "html.parser")
    output = []
    for elem in soup.descendants:
        if elem.name == 'code' and elem.parent.name == 'pre':
            # Fenced code block: mark its boundaries explicitly.
            output.append("[CODE]\n" + elem.get_text(strip=True) + "\n[/CODE]")
        elif elem.name == 'code':
            # Inline code: keep its text without markers.
            output.append(elem.get_text(strip=True))
        elif elem.name is None and elem.find_parent('code') is None:
            # Plain text node.  Skip strings nested inside <code>: those
            # were already emitted via get_text() above, and without this
            # guard `descendants` would yield them a second time,
            # duplicating every code snippet in the output.
            output.append(elem.strip())
    # Unescape entities and drop empty fragments before joining.
    combined = ' '.join([html.unescape(str(x)) for x in output if str(x).strip()])
    # Collapse all whitespace runs to single spaces.
    return re.sub(r'\s+', ' ', combined).strip()
def remove_punctuation(text):
    """Strip ASCII punctuation, lowercase, and trim leading whitespace.

    Uses a single str.translate pass to delete every character in
    string.punctuation, then normalizes case and the left edge.
    """
    delete_punct = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(delete_punct)
    return cleaned.lower().lstrip()
def remove_stop_words_and_lemmatization(text):
    """Full text-cleaning pipeline.

    Strips URLs, HTML, and punctuation, then removes English stop words
    and lemmatizes every remaining token according to its POS tag.

    Parameters
    ----------
    text : str
        Raw text (may contain HTML markup and URLs).

    Returns
    -------
    str
        Space-joined cleaned and lemmatized tokens.
    """
    # Apply the cleaning stages in order: URLs first, then HTML, then
    # punctuation/case normalization.
    for cleaner in (remove_links, clean_html_with_code, remove_punctuation):
        text = cleaner(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    kept = []
    for word, tag in pos_tag(word_tokenize(text)):
        if word.lower() in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return " ".join(kept)
def preprocess_row(row):
    """Clean the 'title' and 'question_body' fields of *row* in place.

    Applies remove_stop_words_and_lemmatization to both text fields and
    returns the (mutated) row, so it can be used with DataFrame.apply.
    """
    for field in ("title", "question_body"):
        row[field] = remove_stop_words_and_lemmatization(row[field])
    return row