-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
63 lines (43 loc) · 1.86 KB
/
analysis.py
File metadata and controls
63 lines (43 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
def get_sentiment(text):
    """Return the TextBlob sentiment of *text* as ``(polarity, subjectivity)``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity
# Score every cleaned text, then split the (polarity, subjectivity) pairs
# into their own columns.
sentiment_pairs = df['cleaned_text'].apply(get_sentiment)
df['polarity'], df['subjectivity'] = zip(*sentiment_pairs)

# Normalize: fit a StandardScaler on the two sentiment columns.
scaler = StandardScaler()
features = scaler.fit_transform(df[['polarity', 'subjectivity']])

# KMeans clustering: assign every row to one of five clusters.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(features)

# Rank the clusters by mean polarity (ascending): the cluster with the
# lowest mean polarity maps to score 1, the next one to 2, and so on.
mean_polarity = df.groupby('cluster')['polarity'].mean()
cluster_avg = mean_polarity.sort_values()
sentiment_mapping = {
    cluster: rank
    for rank, cluster in enumerate(cluster_avg.index, start=1)
}
df['sentiment_score'] = df['cluster'].map(sentiment_mapping)
from collections import Counter

import spacy

nlp = spacy.load("en_core_web_sm")

# Entity labels that are counted as "key" entities.
# NOTE(review): "BRAND" does not appear among the NER labels documented for
# en_core_web_sm, so it probably never matches; it is kept here so the
# filter is unchanged -- confirm and drop if it is indeed dead.
entity_labels = {"ORG", "PERSON", "PRODUCT", "GPE", "BRAND"}

# Count how often each key entity (lower-cased) occurs across all texts.
# nlp.pipe processes the texts as a batched stream, which is spaCy's
# recommended way to run the pipeline over many documents instead of
# calling nlp() once per text.
key_entities = Counter(
    ent.text.lower()
    for doc in nlp.pipe(df['cleaned_text'])
    for ent in doc.ents
    if ent.label_ in entity_labels
)

# Top 10 entities by mention count, as a plain dict of {entity: count}.
top_entities = dict(key_entities.most_common(10))
from sentence_transformers import SentenceTransformer

# Embed every cleaned text with the 'all-mpnet-base-v2' model.
model = SentenceTransformer('all-mpnet-base-v2')
cleaned_texts = df['cleaned_text'].tolist()
embeddings = model.encode(cleaned_texts)

# Group the embeddings into five topic clusters.
# NOTE: this rebinds `kmeans`, so any KMeans model previously stored under
# that name is no longer reachable through it.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
topic_labels = kmeans.fit_predict(embeddings)
df['topic_cluster'] = topic_labels
from collections import Counter
def get_dominant_words(texts, num_words=5):
    """Return the most frequent whitespace-separated words across *texts*.

    Args:
        texts: Iterable of strings. Entries that are not strings (for
            example ``None`` or a float NaN from a missing value) are
            skipped instead of raising ``AttributeError``.
        num_words: Maximum number of words to return.

    Returns:
        A list of up to ``num_words`` words, most frequent first. Words
        with equal counts keep the order in which they were first seen.
    """
    # Feed each text's tokens straight into the counter so no list of all
    # tokens has to be built first.
    counts = Counter()
    for text in texts:
        if isinstance(text, str):
            counts.update(text.split())
    return [word for word, _ in counts.most_common(num_words)]
# Print the most frequent words of each topic cluster (ids 0 through 4).
for cluster_id in range(5):
    in_cluster = df['topic_cluster'] == cluster_id
    texts = df.loc[in_cluster, 'cleaned_text'].tolist()
    print(f"Cluster {cluster_id}: {get_dominant_words(texts)}")

# Write the DataFrame to CSV without the index column.
df.to_csv("final_output.csv", index=False)