From 21bdf75e9ed54405b851c31e6eb8cc79d51d28ba Mon Sep 17 00:00:00 2001
From: chenziqi66 <1304114564@qq.com>
Date: Tue, 14 Apr 2026 14:17:26 +0800
Subject: [PATCH] =?UTF-8?q?=E8=BF=99=E6=98=AF=E4=B8=80=E4=B8=AA=20?=
 =?UTF-8?q?=E6=9C=BA=E5=99=A8=E5=AD=A6=E4=B9=A0=E4=B8=8E=E8=87=AA=E7=84=B6?=
 =?UTF-8?q?=E8=AF=AD=E8=A8=80=E5=A4=84=E7=90=86(NLP)=E6=96=87=E6=9C=AC?=
 =?UTF-8?q?=E5=88=86=E7=B1=BB=E9=A1=B9=E7=9B=AE=20=EF=BC=8C=E4=B8=BB?=
 =?UTF-8?q?=E8=A6=81=E4=BD=BF=E7=94=A8=20Python=E3=80=81scikit-learn=20?=
 =?UTF-8?q?=E5=92=8C=20NLTK=20=E5=BA=93=E6=9D=A5=E5=AE=9E=E7=8E=B0?=
 =?UTF-8?q?=E3=80=82=20=E6=88=91=E5=8F=91=E7=8E=B0=E4=BA=86=E4=B8=8B?=
 =?UTF-8?q?=E9=9D=A2=E7=9A=84=E9=97=AE=E9=A2=98=EF=BC=8C=E8=AF=B7=E8=BF=9B?=
 =?UTF-8?q?=E8=A1=8C=E4=BF=AE=E6=AD=A3=20=E9=97=AE=E9=A2=98=201:=20?=
 =?UTF-8?q?=E9=87=8D=E5=A4=8D=E5=AF=BC=E5=85=A5=E7=9B=B8=E5=90=8C=E6=A8=A1?=
 =?UTF-8?q?=E5=9D=97=EF=BC=8C=20=E9=97=AE=E9=A2=98=202:=20NLTK=E4=BA=A4?=
 =?UTF-8?q?=E4=BA=92=E5=BC=8F=E4=B8=8B=E8=BD=BD=E4=BC=9A=E5=AF=BC=E8=87=B4?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC=E5=8D=A1=E4=BD=8F=EF=BC=8C=E5=9C=A8=E9=9D=9E?=
 =?UTF-8?q?=E4=BA=A4=E4=BA=92=E5=BC=8F=E7=8E=AF=E5=A2=83=EF=BC=88=E6=9C=8D?=
 =?UTF-8?q?=E5=8A=A1=E5=99=A8=E3=80=81=E8=84=9A=E6=9C=AC=E8=BF=90=E8=A1=8C?=
 =?UTF-8?q?=EF=BC=89=E4=B8=AD=E4=BC=9A=E6=B0=B8=E4=B9=85=E5=8D=A1=E4=BD=8F?=
 =?UTF-8?q?=20=E9=97=AE=E9=A2=98=203:=20SVM=E5=8F=82=E6=95=B0=E5=B7=B2?=
 =?UTF-8?q?=E8=BF=87=E6=97=B6=EF=BC=8C=E6=96=B0=E7=89=88=E6=9C=AC=E4=BC=9A?=
 =?UTF-8?q?=E6=8A=9B=E5=87=BA=20DeprecationWarning=20=EF=BC=8C=E6=9C=AA?=
 =?UTF-8?q?=E6=9D=A5=E7=89=88=E6=9C=AC=E5=B0=86=E7=9B=B4=E6=8E=A5=E6=8A=A5?=
 =?UTF-8?q?=E9=94=99=20=E9=97=AE=E9=A2=98=204:=20=E5=AF=BC=E5=85=A5?=
 =?UTF-8?q?=E4=BD=8D=E7=BD=AE=E6=B7=B7=E4=B9=B1=EF=BC=8C=E4=B8=8D=E7=AC=A6?=
 =?UTF-8?q?=E5=90=88PEP8=E8=A7=84=E8=8C=83=EF=BC=8C=E6=89=80=E6=9C=89impor?=
 =?UTF-8?q?t=E5=BA=94=E8=AF=A5=E9=9B=86=E4=B8=AD=E5=9C=A8=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E6=9C=80=E5=BC=80=E5=A4=B4=20=E9=97=AE=E9=A2=98=205:?=
 =?UTF-8?q?=20=E5=A4=A7=E9=87=8FJupyter=20Notebook=E6=AE=8B=E7=95=99?=
 =?UTF-8?q?=E6=A0=87=E8=AE=B0=20=E7=BA=A6=E6=9D=9F=EF=BC=9A=201.=20?=
 =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=89=8D=E5=BF=85=E9=A1=BB=E5=85=88=E5=A4=8D?=
 =?UTF-8?q?=E7=8E=B0bug=202.=20=E6=89=80=E6=9C=89=E7=8E=B0=E6=9C=89?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=BF=85=E9=A1=BB=E5=85=A8=E9=83=A8=E9=80=9A?=
 =?UTF-8?q?=E8=BF=87=203.=20=E5=BF=85=E9=A1=BB=E4=B8=BA=E8=AF=A5bug?=
 =?UTF-8?q?=E6=96=B0=E5=A2=9E=E4=B8=93=E9=97=A8=E7=9A=84=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B=204.=E5=8F=AA=E4=BF=AE=E6=94=B9=E4=B8=8Ebug?=
 =?UTF-8?q?=E7=9B=B4=E6=8E=A5=E7=9B=B8=E5=85=B3=E7=9A=84=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...ification+using+python,+scikit+and+nltk.py |  97 +++------
 ...ng+python,+scikit+and+nltk.cpython-312.pyc | Bin 0 -> 4822 bytes
 test_text_classification.py                   | 204 ++++++++++++++++++
 3 files changed, 237 insertions(+), 64 deletions(-)
 create mode 100644 __pycache__/Text+Classification+using+python,+scikit+and+nltk.cpython-312.pyc
 create mode 100644 test_text_classification.py

diff --git a/Text+Classification+using+python,+scikit+and+nltk.py b/Text+Classification+using+python,+scikit+and+nltk.py
index 8c850bb..276a85f 100644
--- a/Text+Classification+using+python,+scikit+and+nltk.py
+++ b/Text+Classification+using+python,+scikit+and+nltk.py
@@ -1,120 +1,103 @@
-
 # coding: utf-8
 
-# In[1]:
-
-#Loading the data set - training data.
+# 导入模块 - 按照PEP8规范，所有导入集中在文件开头
+import nltk
+from nltk.stem.snowball import SnowballStemmer
+import numpy as np
 from sklearn.datasets import fetch_20newsgroups
-twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import SGDClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
 
 
-# In[4]:
+# 确保NLTK数据已下载（非交互式）
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt', quiet=True)
 
-# You can check the target names (categories) and some data files by following commands.
-twenty_train.target_names #prints all the categories
 
+# Loading the data set - training data.
+twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
 
-# In[5]:
 
-print("\n".join(twenty_train.data[0].split("\n")[:3])) #prints first line of the first data file
+# You can check the target names (categories) and some data files by following commands.
+twenty_train.target_names  # prints all the categories
+
 
+print("\n".join(twenty_train.data[0].split("\n")[:3]))  # prints first line of the first data file
 
-# In[6]:
 
 # Extracting features from text files
-from sklearn.feature_extraction.text import CountVectorizer
 count_vect = CountVectorizer()
 X_train_counts = count_vect.fit_transform(twenty_train.data)
 X_train_counts.shape
 
 
-# In[7]:
-
 # TF-IDF
-from sklearn.feature_extraction.text import TfidfTransformer
 tfidf_transformer = TfidfTransformer()
 X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
 X_train_tfidf.shape
 
 
-# In[9]:
-
 # Machine Learning
 # Training Naive Bayes (NB) classifier on training data.
-from sklearn.naive_bayes import MultinomialNB
 clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
 
 
-# In[14]:
-
 # Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
-# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
+# The names 'vect' , 'tfidf' and 'clf' are arbitrary but will be used later.
 # We will be using the 'text_clf' going forward.
-from sklearn.pipeline import Pipeline
 
 text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
 
 text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
 
 
-# In[15]:
-
 # Performance of NB Classifier
-import numpy as np
 twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
 predicted = text_clf.predict(twenty_test.data)
 np.mean(predicted == twenty_test.target)
 
 
-# In[16]:
-
 # Training Support Vector Machines - SVM and calculating its performance
-
-from sklearn.linear_model import SGDClassifier
+# 注意：使用 max_iter 替代已过时的 n_iter 参数
 text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
-                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])
+                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42))])
 
 text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
 predicted_svm = text_clf_svm.predict(twenty_test.data)
 np.mean(predicted_svm == twenty_test.target)
 
 
-# In[18]:
-
 # Grid Search
-# Here, we are creating a list of parameters for which we would like to do performance tuning. 
-# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
+# Here, we are creating a list of parameters for which we would like to do performance tuning.
+# All the parameters name start with the classifier name (remember the arbitrary name we gave).
 # E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.
 
-from sklearn.model_selection import GridSearchCV
 parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
 
 
-# In[19]:
-
-# Next, we create an instance of the grid search by passing the classifier, parameters 
+# Next, we create an instance of the grid search by passing the classifier, parameters
 # and n_jobs=-1 which tells to use multiple cores from user machine.
 
 gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
 gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
 
 
-# In[23]:
-
 # To see the best mean score and the params, run the following code
 
 gs_clf.best_score_
 gs_clf.best_params_
 
 # Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
-# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.
-
+# and the corresponding parameters are {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}.
 
-# In[24]:
 
 # Similarly doing grid search for SVM
-from sklearn.model_selection import GridSearchCV
-parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}
+parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf-svm__alpha': (1e-2, 1e-3)}
 
 gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
 gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
@@ -124,33 +107,25 @@
 gs_clf_svm.best_params_
 
 
-# In[25]:
-
 # NLTK
 # Removing stop words
-from sklearn.pipeline import Pipeline
-text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
+text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])
 
 
-# In[26]:
-
 # Stemming Code
-
-import nltk
-nltk.download()
-
-from nltk.stem.snowball import SnowballStemmer
 stemmer = SnowballStemmer("english", ignore_stopwords=True)
 
+
 class StemmedCountVectorizer(CountVectorizer):
     def build_analyzer(self):
         analyzer = super(StemmedCountVectorizer, self).build_analyzer()
         return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
-    
+
+
 stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
 
-text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
+text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                              ('mnb', MultinomialNB(fit_prior=False))])
 
 text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
@@ -158,9 +133,3 @@ def build_analyzer(self):
 predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
 
 np.mean(predicted_mnb_stemmed == twenty_test.target)
-
-
-# In[ ]:
-
-
-
diff --git a/__pycache__/Text+Classification+using+python,+scikit+and+nltk.cpython-312.pyc b/__pycache__/Text+Classification+using+python,+scikit+and+nltk.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9a2ab4c3b4a31389c823972911c50db75e99e99
GIT binary patch
literal 4822
zcmb_fYi!%r6~2^YNtX4r<cDHAvEx@{`4!tqo5sz{j*Vi8o7L$fYFd^SuWZ^BrIJ+q
zXbLD;04GC-ct47|Sckd)+$9)XU>N+fKMM>fNXr8ScEMWg$KZd?TwrZ6U^|yIrKWJv
z7Kk9-d+xdCp7Wi@J*58X^LY?_s)`S}>tTfcNfy?}yHD($b0PE|Vi8O6C_!c$Po-@M
z8)fqB2?y}(d_~%sa2h-Z?@GH9?sR3M(pXpUo^(~B%AlRRH|<OK4BEx}(}6_5pxt~h
z9ZG~KREaQ}ZL>g>=h@COZBcCHeafUMWJB1=`q-)m{`)qQm#D?Hv%QvVc{XJ0*}&{z
znP*XO;sDz$ih{uG2}`OxvjGbMOR>R2$XZ#mVat7g|3IPvH_V<YpSGxjty($~O}J@x
z*pe;JhVIIe?j!iffx4SYdYW<b59$e6dS*w;7Ay)k1Hw^jRh~<cZiVQF*bs~*UsE==
z8rCIzunxnzWG7{twwI8#;kE<q9+-_E!O{j+oNVN=N@xfDm&>><3b!AW?Gma^(BzoC
zVo8^0+zBTSh{xP9RH^_YJ3k(i8n98t*0ObMJsV{YJ!rUJVcK~tJ~147V(}rmets;v
zelQkYKP47yBil4>V-K@O*ya!Hi8yHU;Wiv+TOJ&}Z#O+obmQ(P<|WaUC}+%~C<OfL
z)~Y;14=N*U1%!PD!Wiy*3}HX+e_{xUu7eTwAB+$X?bj3A_MqLW(w}$FK!=TOgS*jD
zy1U11XdJPfw-I{`Zm*S#0gxQ{uARi7X&(<BU@-=J0k-Qwyo~BQw|fW=J+Y_}T|d8i
zPXNAyZ}N$sQoV~*E5eaBB-R_Z8qIc<E61W(4@A{#MU_I=(2D(?U(BNg`!zIA#kzNh
z;6#kl1Cv5#ev0AwNg1cpSOive3d_kEdZ1sx^U}1K$z`ROO%IG@a)SI4PRbdPTf(9q
zyqe<J)K!rYq*O*UkNPIZM@M)@lDHIS=<<$>96N~_F*!5x66p3_&hauQWYQeNPh0?o
z`wExEJSX6#pq!b*0#PmXWpl!utXCM*RgkK9Gsj_I$RfiD0LCfhrX*mvq?ueQ#bcfF
z?2uZ~9X9|V$Ps9|J;|qZhm0k8$pvJObR(@-%y7aq)@}U2zyABtqpizRSI+K`9=Ah+
z7^~17d`6OVR~8ElFE8p944<80ba$Frpg9@5_JY4`CQVB+BjcEDhdlAcW^&eMOgrRp
zAG2K@i`k~%iu`H(%ca|A#d^ZC>mdS;riE#dNz<SQaQF?RG@X+$O(1&!Crukhcm4+-
z@ZBlUvzaL=W*3iw8nG3ICI7xV`=YuF3)4I&&49Q^k~3L)J|nUctb*LMkP$J!&14N;
z5{{QX`>!Tw?=Y~I;k62*1jV9+?G8#&EOpzRu-`!oRKhU{F6p&qiLl=*gyjB9kD-+$
zf&BJ{V0f2Wrrtxcu|WpevWIS>-&DcFX@BR})E&ySN9;^IrqBs8@Ggk<-hkm<P-jDT
zZ8uBLwNZ4BO4_DT()L;%LT^)LW3vRt_dYi%t0Vl@_D<9Rr{1O}VWbRd9>u5$@Y5xk
zS3!3W0`g{OGfADA*WC;h#Ud$CqPw{{|LxFKydcM|icc~!ml5JQ2}&%UU6f}sLU&wB
za&w#<hpLGSygb*NUDVyfJd>VcnKMgI9URi$sT{|%v}LY$n9n2`UOEGWdGIZ;FSWw(
zbsl{@vnl;KU1%KrIK2`2;N?GDfB*V=%kSxp^j72OH}I1jAoosV+yS0cNVzN)#W-+u
z|Kmnr*CAm(B@U3yN}5iRdrZ@&ujle2c|R3T!m!l-lTa^$s6^gWzsaM&hwHbaeH#%q
ze6bK7zvKQqa8Yqz+=ESW=iG27m^7wo-9yvq44dN#?WO5Aa}00p7>0G<KB^?<d76$<
z;?u+islRh3JmQ$)-2&MN!0`2L^dFn8VvO2FWE9E!uIax$EliEU6{$>dGMyE<j2Jtq
zJ4lZ7Y9mu5`@PbBdlX5^^$H=Eh6LJ#tR9r+c<5_FFU!ab^dd<lan_Gm>=b6?9Nau8
z5Rple!swL=O`45Js#=d&qNM#XBJmvaBK*BUdY!dp@My>jbPiA<y2)<X+7%e?2BxQ&
zMJ$PxWTylwyWdqs@{dY)5RAGb#R;rlbt#jX%VjT$Vn)>6Y-V2IGYqSH<#{Z~i?q>U
zVa15kST@okK_3=5LDn6!8BTx_%JQ5Hjfy-DXrtZwNCe@a`5@5!ub5hBLj;aXGfWoi
zA+yz$<e-`QEXn<T4SIB^X<CdB;BFE!sKk_BW#NNv4((aQEC-Fga-YDg?nq-s0O0!v
zXwvL`rEO#B$uiJ+F|=_>cTP*D-YHN>OG(lsb+557q$FDRm&MKF<`F`R<d2+LM%AQD
z(wH^9H3JefP(Xb_-!(S%nlh5_us=d(J(8;hUBN8<Ga?U5Qq<BPq3Cw)6h-a15#@b|
zdOt_i4^iJg&}E<>qW*`d=OJp}b=#@VqQkxX+^y%9CvHtB;r4Au$0IKa9$l~7Y}*c;
z&3ku2flouxEfjs^M3ghnE_1iI6|WX-QG+dOFtlmGE;ZP-K2!+yZ&YuzD8ZqE^Takf
z`N)m@O$FcKyr&py&_Xda6kG4sx=yQIrwgIsyuXP28j5V82ng1+7a|=B3KfGbTCh(I
z_HFnJ!LxZ^5rOgW779P|AXns98O<40ozc~v^+qi=tj2~5&S&%XM}Fjutfb!06b|<(
zHT@e;sl%5F!xPHD^U4b^tJhM6Yct9(IVCb%sG7^Wzo-h|8@@Zd>ecFFYJF^dSc{)g
z<7Wz0XY+1A5@=a>ZaTL8&*rO&-iYRHRK1O>uWC&_YEw_a+ncW}B3ItF?7HQ;Q@Qf0
zR@14%*WI~|jse7A{c6XDeTAcgN_1#rUOjiEaP9@=<Ye(s>xZvxx;~94FJ4nvtjIUE
z+vk<&0;DLqMC~Ce_!b4BcA{XT7C52?j;wjL=HqJf@r_|^a7-N>D+I>#-XiP<+aTa;
zuWIcl)b<mF;7Jl83yyEKL5s#!_`2iUsCzev+T-hEO6%Y!Z!2t8^}U(*KyE#?D>G}J
zje+gTlX+*cGH~zE-9syvv`D)eX<xgkbv~nZK2xYXop<hf5arTP^%klwBBz;!T|bKT
zZ8(+Ar#^jF;TP1<V&4CyFS^>j?%x!*eHS3rzPfv_-+g`clGfOxHukLF)cP){eHRM8
zk-P^Sq^dMjyM=0t$fKdUEmQ|VGjq2Y-99{`)Qo;IuU>hhaAisvOBQPz-hb_Pv{KUp
zfvV|)IC=XaM;=slXf?LMZhKDWU0*rv)EKo&e}%{l{?&GGEN<7FS5Wnr;fB?e(($y?
zbZT>4z4&6`;>*hESBi&8E?(SjA5oe{L2uI-NkC+rQcw^8M%uJ+ryA~D_iD#ZsmD%j
z4r`|;)YB7%@be(>Mg1|YzF)2H-)PW|pI48cFVtU9P|X)s4itCOT7%XysKVDh_%}54
z_3oS<xx@cP6uD`k{OI!KTbEZl*CLvuLveJFJKFiJ6oNbZmxc+1K10!oCi`cN6pa4|
DalMTY

literal 0
HcmV?d00001

diff --git a/test_text_classification.py b/test_text_classification.py
new file mode 100644
index 0000000..c744c34
--- /dev/null
+++ b/test_text_classification.py
@@ -0,0 +1,204 @@
+# coding: utf-8
+"""
+文本分类项目测试文件
+测试修复的5个问题：
+1. 重复导入相同模块
+2. NLTK交互式下载会导致脚本卡住
+3. SVM参数已过时
+4. 导入位置混乱，不符合PEP8规范
+5. 大量Jupyter Notebook残留标记
+"""
+
+import ast
+import sys
+import warnings
+import unittest
+from io import StringIO
+
+
+class TestCodeQuality(unittest.TestCase):
+    """测试代码质量相关的问题修复"""
+
+    def setUp(self):
+        """读取主文件内容"""
+        with open('Text+Classification+using+python,+scikit+and+nltk.py', 'r', encoding='utf-8') as f:
+            self.source_code = f.read()
+        self.tree = ast.parse(self.source_code)
+
+    def test_no_duplicate_imports(self):
+        """测试问题1: 检查没有重复导入相同模块"""
+        imports = []
+        for node in ast.walk(self.tree):
+            if isinstance(node, ast.Import):
+                for alias in node.names:
+                    imports.append(alias.name)
+            elif isinstance(node, ast.ImportFrom):
+                module = node.module or ''
+                for alias in node.names:
+                    imports.append(f"{module}.{alias.name}")
+
+        # 检查重复
+        duplicates = [item for item in set(imports) if imports.count(item) > 1]
+        self.assertEqual(len(duplicates), 0,
+                        f"发现重复导入: {duplicates}")
+
+    def test_no_jupyter_markers(self):
+        """测试问题5: 检查没有Jupyter Notebook残留标记"""
+        jupyter_patterns = ['# In[', '#In[', '#In [']
+        lines = self.source_code.split('\n')
+        jupyter_lines = []
+
+        for i, line in enumerate(lines, 1):
+            stripped = line.strip()
+            for pattern in jupyter_patterns:
+                if stripped.startswith(pattern):
+                    jupyter_lines.append((i, line.strip()))
+
+        self.assertEqual(len(jupyter_lines), 0,
+                        f"发现Jupyter Notebook残留标记: {jupyter_lines}")
+
+    def test_imports_at_top(self):
+        """测试问题4: 检查导入语句集中在文件开头（符合PEP8规范）"""
+        # 获取所有导入语句的行号
+        import_lines = []
+        for node in ast.walk(self.tree):
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                import_lines.append(node.lineno)
+
+        if not import_lines:
+            return
+
+        # 找到最后一个导入语句的行号
+        last_import_line = max(import_lines)
+
+        # 检查在最后一个导入语句之后是否还有导入语句（应该在文件开头）
+        # 允许在导入后有注释和空行
+        lines = self.source_code.split('\n')
+
+        # 检查是否在非导入代码之后还有导入
+        non_import_found = False
+        for i, line in enumerate(lines, 1):
+            if i > last_import_line:
+                stripped = line.strip()
+                # 跳过空行和注释
+                if stripped and not stripped.startswith('#'):
+                    non_import_found = True
+                    break
+
+        # 如果在最后一个导入之后有非导入代码，这是正常的
+        # 我们需要检查的是：是否分散导入
+        # 这里简化检查：确保所有导入都在前30行内（考虑到注释）
+        self.assertLessEqual(last_import_line, 30,
+                            f"导入语句分散在文件中，最后一个导入在第{last_import_line}行，不符合PEP8规范")
+
+    def test_no_interactive_nltk_download(self):
+        """测试问题2: 检查没有交互式的nltk.download()调用"""
+        # 检查是否使用了非交互式下载
+        has_interactive_download = 'nltk.download()' in self.source_code
+        self.assertFalse(has_interactive_download,
+                        "发现交互式nltk.download()调用，会导致脚本在非交互式环境中卡住")
+
+        # 检查是否使用了quiet=True参数
+        has_quiet_download = "nltk.download('punkt', quiet=True)" in self.source_code
+        self.assertTrue(has_quiet_download,
+                       "应该使用nltk.download('punkt', quiet=True)来非交互式下载")
+
+    def test_svm_parameter_updated(self):
+        """测试问题3: 检查SVM参数已更新，没有使用过时的n_iter"""
+        # 检查没有使用n_iter
+        has_deprecated_param = 'n_iter=' in self.source_code
+        self.assertFalse(has_deprecated_param,
+                        "发现已过时的n_iter参数，应该使用max_iter")
+
+        # 检查使用了max_iter
+        has_correct_param = 'max_iter=' in self.source_code
+        self.assertTrue(has_correct_param,
+                       "应该使用max_iter参数替代n_iter")
+
+
+class TestSVMParameterCompatibility(unittest.TestCase):
+    """测试SVM参数兼容性"""
+
+    def test_sgd_classifier_no_deprecation_warning(self):
+        """测试SGDClassifier不会抛出DeprecationWarning"""
+        import numpy as np
+        from sklearn.linear_model import SGDClassifier
+        from sklearn.datasets import fetch_20newsgroups
+        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+        from sklearn.pipeline import Pipeline
+
+        # 捕获警告
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+
+            # 加载少量数据进行测试
+            twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
+
+            # 使用max_iter参数（修复后的参数）
+            text_clf_svm = Pipeline([
+                ('vect', CountVectorizer()),
+                ('tfidf', TfidfTransformer()),
+                ('clf-svm', SGDClassifier(
+                    loss='hinge',
+                    penalty='l2',
+                    alpha=1e-3,
+                    max_iter=5,
+                    random_state=42
+                ))
+            ])
+
+            # 训练模型
+            text_clf_svm.fit(twenty_train.data[:100], twenty_train.target[:100])
+
+            # 检查是否有与SVM参数相关的DeprecationWarning
+            # 过滤掉与tar归档相关的Python 3.14警告
+            svm_deprecation_warnings = [
+                warning for warning in w
+                if issubclass(warning.category, DeprecationWarning)
+                and 'n_iter' in str(warning.message)
+            ]
+
+            self.assertEqual(len(svm_deprecation_warnings), 0,
+                           f"发现SVM参数相关的DeprecationWarning: {[str(w.message) for w in svm_deprecation_warnings]}")
+
+
+class TestNLTKDownload(unittest.TestCase):
+    """测试NLTK下载功能"""
+
+    def test_nltk_non_interactive_download(self):
+        """测试NLTK非交互式下载不会卡住"""
+        import nltk
+
+        # 测试使用quiet=True参数下载
+        # 这不应该引发交互式提示
+        try:
+            # 使用quiet=True确保非交互式
+            result = nltk.download('punkt', quiet=True)
+            # 如果成功执行到这里，说明没有卡住
+            self.assertTrue(True, "NLTK非交互式下载成功")
+        except Exception as e:
+            self.fail(f"NLTK下载失败: {e}")
+
+
+class TestModuleImports(unittest.TestCase):
+    """测试模块导入"""
+
+    def test_all_imports_work(self):
+        """测试所有导入都能正常工作"""
+        try:
+            import nltk
+            from nltk.stem.snowball import SnowballStemmer
+            import numpy as np
+            from sklearn.datasets import fetch_20newsgroups
+            from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+            from sklearn.linear_model import SGDClassifier
+            from sklearn.model_selection import GridSearchCV
+            from sklearn.naive_bayes import MultinomialNB
+            from sklearn.pipeline import Pipeline
+            self.assertTrue(True, "所有导入成功")
+        except ImportError as e:
+            self.fail(f"导入失败: {e}")
+
+
+if __name__ == '__main__':
+    unittest.main(verbosity=2)