-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTF_IDF.py
More file actions
73 lines (58 loc) · 2.45 KB
/
TF_IDF.py
File metadata and controls
73 lines (58 loc) · 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# tfidf_headlines.py
# Minimal TF-IDF for headlines.csv → saves sparse matrix + vectorizer + feature names
import json
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text as sktext
import joblib
# ---------- CONFIG ----------
# Input/output locations.
CSV_PATH = Path("headlines.csv")  # change if stored elsewhere
TEXT_COL = "headline"  # text column in your CSV
OUT_DIR = Path("tfidf_out")  # outputs will be saved here
# Vectorizer settings (sane defaults for headlines)
MAX_FEATURES = 80000  # cap vocab size (protect RAM)
NGRAM_RANGE = (1, 2)  # unigrams + bigrams
MIN_DF = 3  # drop terms appearing in <3 docs
MAX_DF = 0.95  # drop terms appearing in >95% of docs
USE_STOPWORDS = True  # set to False to keep every token
DTYPE = np.float32  # half the memory of the float64 default
# ---------- LOAD ----------
# Fail fast: verify the file and the expected text column exist before any
# heavy work, with messages that say exactly what is missing.
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {CSV_PATH.resolve()}")
df = pd.read_csv(CSV_PATH)
if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not in CSV. Columns: {df.columns.tolist()}")
# Blanks instead of NaN, everything coerced to str, then a plain list of docs.
column = df[TEXT_COL].fillna("")
texts = column.astype(str).tolist()
print(f"[INFO] Loaded {len(texts)} documents from {CSV_PATH.name}")
# ---------- TF-IDF ----------
# BUG FIX: the original computed `stop_words` from USE_STOPWORDS but then
# hard-coded stop_words='english' in the vectorizer call, so the
# USE_STOPWORDS toggle documented in CONFIG had no effect. The computed
# value is now actually passed through ("english" enables sklearn's
# built-in English list; None keeps every token).
stop_words = "english" if USE_STOPWORDS else None
vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=NGRAM_RANGE,
    min_df=MIN_DF,
    max_df=MAX_DF,
    stop_words=stop_words,
    lowercase=True,
    dtype=DTYPE,
    norm="l2",           # unit-length rows so cosine similarity is a dot product
    sublinear_tf=True,   # log(1+tf) dampens repeated terms within a doc
)
X = vectorizer.fit_transform(texts)  # CSR sparse matrix [num_docs, vocab_size]
print(f"[INFO] TF-IDF shape: {X.shape} (rows=docs, cols=features)")
# ---------- SAVE ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
# 1) Sparse matrix (reload later with scipy.sparse.load_npz)
sparse.save_npz(OUT_DIR / "tfidf_matrix.npz", X)
# 2) Vectorizer (to reuse the exact same vocabulary/IDF weights later)
joblib.dump(vectorizer, OUT_DIR / "tfidf_vectorizer.joblib")
# 3) Feature names (for inspection)
# FIX: explicit UTF-8 plus ensure_ascii=False so non-ASCII tokens round-trip
# readably and the dump does not depend on the platform's locale encoding.
with open(OUT_DIR / "tfidf_feature_names.json", "w", encoding="utf-8") as f:
    json.dump(vectorizer.get_feature_names_out().tolist(), f, ensure_ascii=False)
# 4) Tiny preview (nonzeros per doc) to sanity-check sparsity
# FIX: X.getnnz(axis=1) counts per-row nonzeros for any sparse format;
# np.diff(X.indptr) assumed CSR and would silently count per-column on CSC.
preview = pd.DataFrame({"row_id": np.arange(X.shape[0]),
                        "nnz": X.getnnz(axis=1)})
preview.to_csv(OUT_DIR / "preview_row_sparsity.csv", index=False)
print(f"[OK] Saved artifacts to: {OUT_DIR.resolve()}")