# SentimentAnalysis/analysis.py (NukaNarendra/SentimentAnalysis, branch: main)

from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def get_sentiment(text):
    """Return the TextBlob sentiment of *text* as ``(polarity, subjectivity)``.

    Args:
        text: The string handed to ``TextBlob`` for analysis.

    Returns:
        A 2-tuple holding the ``polarity`` and ``subjectivity`` attributes of
        the blob's ``sentiment`` result, in that order.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Score every cleaned document, then split the (polarity, subjectivity)
# pairs into two parallel sequences that become new DataFrame columns.
sentiment_pairs = df['cleaned_text'].apply(get_sentiment)
polarity_values, subjectivity_values = zip(*sentiment_pairs)
df['polarity'] = polarity_values
df['subjectivity'] = subjectivity_values

# Standardize both sentiment features before clustering.
scaler = StandardScaler()
sentiment_columns = df[['polarity', 'subjectivity']]
features = scaler.fit_transform(sentiment_columns)

# Group the documents into five clusters in the scaled sentiment space.
# random_state is fixed so repeated runs use the same seed.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(features)
df['cluster'] = cluster_labels


# KMeans label ids carry no ordering, so rank the clusters by their mean
# polarity (ascending) and map each one to a score from 1 upward: the
# cluster with the lowest mean polarity gets 1.
cluster_avg = df.groupby('cluster')['polarity'].mean().sort_values()
sentiment_mapping = {
    cluster: rank for rank, cluster in enumerate(cluster_avg.index, start=1)
}

df['sentiment_score'] = df['cluster'].map(sentiment_mapping)


import spacy

from collections import Counter

# Load the spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# Entity labels that count as "key" entities. Built once as a set so the
# membership test below does not rebuild a list for every entity.
# NOTE(review): "BRAND" does not appear in the label scheme published for the
# stock en_core_web_sm model, so it presumably never matches — confirm.
_KEY_ENTITY_LABELS = {"ORG", "PERSON", "PRODUCT", "GPE", "BRAND"}

# Count lower-cased entity mentions across all documents. nlp.pipe processes
# the texts as a batched stream (spaCy's recommended way to handle many
# texts) and yields one Doc per input text, in input order.
_entity_counts = Counter(
    ent.text.lower()
    for doc in nlp.pipe(df['cleaned_text'])
    for ent in doc.ents
    if ent.label_ in _KEY_ENTITY_LABELS
)

# Keep the module-level result a plain dict, as before.
key_entities = dict(_entity_counts)

# Top 10
top_entities = dict(_entity_counts.most_common(10))


from sentence_transformers import SentenceTransformer

# Encode every cleaned document with the 'all-mpnet-base-v2'
# SentenceTransformer model; the result feeds the topic clustering below.
model = SentenceTransformer('all-mpnet-base-v2')
documents = df['cleaned_text'].tolist()
embeddings = model.encode(documents)


# Cluster the embeddings into five topic groups. This rebinds `kmeans`, so
# the estimator fitted on the sentiment features earlier in the script is no
# longer reachable under that name.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
topic_labels = kmeans.fit_predict(embeddings)
df['topic_cluster'] = topic_labels
from collections import Counter

def get_dominant_words(texts, num_words=5):
    """Return the most frequent whitespace-separated tokens in *texts*.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        num_words: Maximum number of tokens to return (default 5).

    Returns:
        A list of at most ``num_words`` tokens, ordered from most to least
        frequent as reported by ``Counter.most_common``.
    """
    frequencies = Counter()
    for document in texts:
        # Feed tokens in document order so first-seen order matches a single
        # pass over the concatenated token list.
        frequencies.update(document.split())
    ranked = frequencies.most_common(num_words)
    return [token for token, _count in ranked]

# Print the dominant words of every topic cluster. The cluster count is read
# from the fitted estimator rather than repeated as a literal, so this report
# stays in step with the KMeans configuration that produced `topic_cluster`.
for cluster_id in range(kmeans.n_clusters):
    in_cluster = df['topic_cluster'] == cluster_id
    cluster_texts = df.loc[in_cluster, 'cleaned_text'].tolist()
    print(f"Cluster {cluster_id}: {get_dominant_words(cluster_texts)}")

# Write the DataFrame, including the columns added by this script, to CSV
# without the index column.
df.to_csv("final_output.csv", index=False)