# Sentiment classification of Meituan restaurant reviews: compare a
# MultinomialNB baseline against LogisticRegression with several
# solver/penalty combinations, each wrapped in a
# CountVectorizer -> classifier pipeline.
#
# NOTE(review): this chunk was scraped from a notebook; interleaved cell
# outputs (estimator reprs, accuracies, timings, a ConvergenceWarning)
# have been stripped out and the observed numbers kept as comments.
# `vect` (a CountVectorizer configured with a Chinese/English stop-word
# list and token_pattern '(?u)\\b[^\\d\\W]\\w+\\b', per the printed repr)
# and the splits `X_train`, `X_test`, `y_train`, `y_test` (with a
# pre-tokenized `cut_comment` column) are defined in earlier cells not
# shown here -- TODO confirm against the full notebook.

import time

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Load the raw data; only the review text and star rating are used.
df = pd.read_excel("all_data_meituan.xlsx")[["comment", "star"]]
df.head()

# --- Baseline: multinomial naive Bayes ---------------------------------
nb = MultinomialNB()
pipe = make_pipeline(vect, nb)   # vectorize then classify as one estimator
pipe.steps                       # inspect the pipeline stages
pipe.fit(X_train.cut_comment, y_train)
y_pred = pipe.predict(X_test.cut_comment)   # transform + predict in one call
metrics.accuracy_score(y_test, y_pred)      # observed: ~0.8293

# --- Logistic regression, default solver -------------------------------
lr = LogisticRegression()
pipe_lr = make_pipeline(vect, lr)
pipe_lr.steps
pipe_lr.fit(X_train.cut_comment, y_train)
y_pred_lr = pipe_lr.predict(X_test.cut_comment)
metrics.accuracy_score(y_test, y_pred_lr)   # observed: ~0.8726

# --- Logistic regression with the saga solver --------------------------
# With the default max_iter=100, saga raises a ConvergenceWarning: the
# coefficients do not converge.  That is not a weakness of saga (it
# converges faster than other solvers on large data sets); stochastic
# average gradient simply needs more iterations, hence max_iter=10000.
lr_solver = LogisticRegression(solver="saga", max_iter=10000)
pipe_lr1 = make_pipeline(vect, lr_solver)
pipe_lr1.steps
pipe_lr1.fit(X_train.cut_comment, y_train)
y_pred_lr1 = pipe_lr1.predict(X_test.cut_comment)
metrics.accuracy_score(y_test, y_pred_lr1)  # observed: ~0.8739

# --- Cross-validated C search: L2 penalty, saga solver -----------------
t1 = time.time()
lrvc = LogisticRegressionCV(
    Cs=[0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 10],
    scoring="accuracy",
    random_state=42,
    solver="saga",
    max_iter=10000,
    penalty="l2",
)
pipe = make_pipeline(vect, lrvc)
print(pipe.get_params)
pipe.fit(X_train.cut_comment, y_train)
y_pred = pipe.predict(X_test.cut_comment)
print(metrics.accuracy_score(y_test, y_pred))   # observed: ~0.8994
t2 = time.time()
print("time spent l2,saga", t2 - t1)            # observed: ~5.0 s

# --- Cross-validated C search: L1 penalty, saga solver -----------------
t1 = time.time()
lrvc = LogisticRegressionCV(
    Cs=[0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 10],
    scoring="accuracy",
    random_state=42,
    solver="saga",
    max_iter=10000,
    penalty="l1",
)
pipe_cvl1 = make_pipeline(vect, lrvc)
print(pipe_cvl1.get_params)
pipe_cvl1.fit(X_train.cut_comment, y_train)
y_pred = pipe_cvl1.predict(X_test.cut_comment)
print(metrics.accuracy_score(y_test, y_pred))   # observed: ~0.9159
t2 = time.time()
print("time spent l1,saga", t2 - t1)            # observed: ~64 s

# --- Cross-validated C search: L1 penalty, liblinear solver ------------
# liblinear converges far faster than saga here (~0.22 s vs ~64 s) with
# essentially the same accuracy; its trade-off is that it does not
# support true multinomial multi-class fitting.
t3 = time.time()
lrvc = LogisticRegressionCV(
    Cs=[0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 10],
    scoring="accuracy",
    random_state=42,
    solver="liblinear",
    max_iter=10000,
    penalty="l1",
)
pipe_cvl1 = make_pipeline(vect, lrvc)
print(pipe_cvl1.get_params)
pipe_cvl1.fit(X_train.cut_comment, y_train)
y_pred = pipe_cvl1.predict(X_test.cut_comment)
# Fix: the scraped source read print("accuracy":...) -- a SyntaxError
# produced by an output fragment fused into the call.
print("accuracy:", metrics.accuracy_score(y_test, y_pred))  # observed: ~0.9121
t4 = time.time()
print("time spent l1 liblinear ", t4 - t3)      # observed: ~0.22 s
免责声明:本文系网络转载或改编,未找到原创作者,版权归原作者所有。如涉及版权,请联系删除。