javedsha · chenziqi66 · Apr 14, 2026
diff --git a/Text+Classification+using+python,+scikit+and+nltk.py b/Text+Classification+using+python,+scikit+and+nltk.py
@@ -1,120 +1,103 @@
-
 # coding: utf-8
 
-# In[1]:
-
-#Loading the data set - training data.
+# Imports - per PEP 8, all imports are grouped at the top of the file
+import nltk
+from nltk.stem.snowball import SnowballStemmer
+import numpy as np
 from sklearn.datasets import fetch_20newsgroups
-twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import SGDClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
 
 
-# In[4]:
+# Ensure the NLTK stopwords corpus is present (non-interactive); SnowballStemmer(ignore_stopwords=True) loads it.
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords', quiet=True)
 
-# You can check the target names (categories) and some data files by following commands.
-twenty_train.target_names #prints all the categories
 
+# Loading the data set - training data.
+twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
 
-# In[5]:
 
-print("\n".join(twenty_train.data[0].split("\n")[:3])) #prints first line of the first data file
+# You can check the target names (categories) and some data files by following commands.
+twenty_train.target_names  # lists all the categories (echoed only in a notebook/REPL; a script run shows nothing)
+
 
+print("\n".join(twenty_train.data[0].split("\n")[:3]))  # prints the first three lines of the first data file
 
-# In[6]:
 
 # Extracting features from text files
-from sklearn.feature_extraction.text import CountVectorizer
 count_vect = CountVectorizer()
 X_train_counts = count_vect.fit_transform(twenty_train.data)
 X_train_counts.shape
 
 
-# In[7]:
-
 # TF-IDF
-from sklearn.feature_extraction.text import TfidfTransformer
 tfidf_transformer = TfidfTransformer()
 X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
 X_train_tfidf.shape
 
 
-# In[9]:
-
 # Machine Learning
 # Training Naive Bayes (NB) classifier on training data.
-from sklearn.naive_bayes import MultinomialNB
 clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
 
 
-# In[14]:
-
 # Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
-# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
+# The names 'vect', 'tfidf' and 'clf' are arbitrary but will be used later.
 # We will be using the 'text_clf' going forward.
-from sklearn.pipeline import Pipeline
 
 text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
 
 text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
 
 
-# In[15]:
-
 # Performance of NB Classifier
-import numpy as np
 twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
 predicted = text_clf.predict(twenty_test.data)
 np.mean(predicted == twenty_test.target)
 
 
-# In[16]:
-
 # Training Support Vector Machines - SVM and calculating its performance
-
-from sklearn.linear_model import SGDClassifier
+# Note: max_iter replaces the obsolete n_iter parameter; tol=None disables early stopping so exactly 5 epochs run, as n_iter=5 did.
 text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
-                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])
+                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, tol=None, random_state=42))])
 
 text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
 predicted_svm = text_clf_svm.predict(twenty_test.data)
 np.mean(predicted_svm == twenty_test.target)
 
 
-# In[18]:
-
 # Grid Search
-# Here, we are creating a list of parameters for which we would like to do performance tuning. 
-# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
+# Here, we are creating a list of parameters for which we would like to do performance tuning.
+# All the parameter names start with the pipeline step name (remember the arbitrary names we gave).
 # E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.
 
-from sklearn.model_selection import GridSearchCV
 parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
 
 
-# In[19]:
-
-# Next, we create an instance of the grid search by passing the classifier, parameters 
+# Next, we create an instance of the grid search by passing the classifier, parameters
 # and n_jobs=-1 which tells to use multiple cores from user machine.
 
 gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
 gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
 
 
-# In[23]:
-
 # To see the best mean score and the params, run the following code
 
 gs_clf.best_score_
 gs_clf.best_params_
 
 # Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
-# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.
-
+# and the corresponding parameters are {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}.
 
-# In[24]:
 
 # Similarly doing grid search for SVM
-from sklearn.model_selection import GridSearchCV
-parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}
+parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf-svm__alpha': (1e-2, 1e-3)}
 
 gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
 gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
@@ -124,43 +107,29 @@
 gs_clf_svm.best_params_
 
 
-# In[25]:
-
 # NLTK
 # Removing stop words
-from sklearn.pipeline import Pipeline
-text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
+text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])
 
 
-# In[26]:
-
 # Stemming Code
-
-import nltk
-nltk.download()
-
-from nltk.stem.snowball import SnowballStemmer
 stemmer = SnowballStemmer("english", ignore_stopwords=True)
 
+
 class StemmedCountVectorizer(CountVectorizer):
     def build_analyzer(self):
         analyzer = super(StemmedCountVectorizer, self).build_analyzer()
         return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
-
+
+
 stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
 
-text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
+text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                              ('mnb', MultinomialNB(fit_prior=False))])
 
 text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
 
 predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
 
 np.mean(predicted_mnb_stemmed == twenty_test.target)
-
-
-# In[ ]:
-
-
-
diff --git a/__pycache__/Text+Classification+using+python,+scikit+and+nltk.cpython-312.pyc b/__pycache__/Text+Classification+using+python,+scikit+and+nltk.cpython-312.pyc