label_mapping = {'汽车': 1, '财经': 2, '科技': 3, '健康': 4, '体育': 5, '教育': 6, '文化': 7, '军事': 8, '娱乐': 9,
'时尚': 0}
df_train['label'] = df_train['label'].map(label_mapping)
print(df_train.head())
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values)
print(x_train[0][1])
words = []
for line_index in range(len(x_train)):
words.append(' '.join(x_train[line_index]))
print(words[0])
print(len(words))
# 计算词频构造向量
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
test_words = []
for line_index in range(len(x_test)):
test_words.append(' '.join(x_test[line_index]))
print(test_words[0])
print(len(test_words))
print(classifier.score(vec.transform(test_words), y_test))
# tf-idf构造词向量
vec2 = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec2.fit(words)
classifier = MultinomialNB()
classifier.fit(vec2.transform(words), y_train)
print(classifier.score(vec2.transform(test_words), y_test))
# 词频构造多维向量形式构造词向量
vec3 = CountVectorizer(analyzer='word', max_features=4000, lowercase=False, ngram_range=(1, 2))
vec3.fit(words)
classifier = MultinomialNB()
classifier.fit(vec3.transform(words), y_train)
print(classifier.score(vec3.transform(test_words), y_test))
# tfidf构造多维向量形式构造词向量
vec4 = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False, ngram_range=(1, 2))
vec4.fit(words)
classifier = MultinomialNB()
classifier.fit(vec4.transform(words), y_train)
print(classifier.score(vec4.transform(test_words), y_test))
可以看出不同方法构成词向量对结果产生了影响,使用tf-idf方法构建词向量比单纯使用词频构建词向量准确率高一些,将词向量扩充多维比不扩充准确率稍微高一些
|