Naive Bayes Classification
Table of Contents
Preface
1. Classifying with Conditional Probability
2. Getting Bilibili 小黑屋 Danmaku Data
3. Code
References
Preface
Naive Bayes classification is an application of Bayesian probability theory. It classifies data and is a very commonly used classification algorithm.
1. Classifying with Conditional Probability
Compute the conditional probability of each class given the data, and assign the sample to whichever class has the larger probability.
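For reference, this is just Bayes' rule applied per class (standard naive Bayes notation, nothing specific to this post's data):

P(c \mid x) = \frac{P(x \mid c)\,P(c)}{P(x)}, \qquad \hat{c} = \arg\max_{c \in \{0,1\}} P(x \mid c)\,P(c)

Since P(x) is identical for every class, it drops out of the comparison; that is why only the numerators need to be computed below.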
In practice the computation uses log probabilities, which avoids multiplicative underflow when many small probabilities are multiplied together. At initialization, the numerator and denominator counts are set to nonzero values (with denominator > numerator), so that no probability is exactly zero and no division by zero occurs. When comparing the conditional probabilities of different classes, only the numerators need to be computed, since the denominator P(x) is the same for every class.
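A minimal sketch of these two tricks (Laplace-style smoothing plus log probabilities); the counts and vectors below are made up for illustration, not taken from the post's data:

import numpy as np

word_counts = np.array([3, 0, 7])   # occurrences of each word within one class
total = word_counts.sum()

# Smoothing: numerators start at 1 and the denominator gains 2,
# so no word probability is exactly 0 and we never divide by 0.
smoothed = (word_counts + 1) / (total + 2.0)

# Log space: a product of many small probabilities underflows,
# but the sum of their logs does not.
log_probs = np.log(smoothed)
doc = np.array([1, 1, 0])            # binary bag-of-words vector for one document
score = np.sum(doc * log_probs)      # log P(doc | class), before adding the prior
print(score)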
2. Getting Bilibili 小黑屋 Danmaku Data
See the earlier post "爬取B站小黑屋" (crawling the Bilibili blackroom). The data obtained has the following JSON format:
[
    {
        'type': xxx,     // the ban information
        'article': xxx   // the actual danmaku text
    }
]
The dataset used here has 2108 records in total; the first 2050 serve as the training set and the rest as the test set. Because tokenization simply splits the text into individual Chinese characters and symbols (not a very reasonable scheme), the error rate is fairly high, around 25%.
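As a quick illustration of this character-level split (the sample string below is made up, not from the dataset):

# Character-level splitting as used in the code below:
# every character or symbol becomes its own token.
danmaku = "开挂真无聊!"
tokens = [ch for ch in danmaku]
print(tokens)   # ['开', '挂', '真', '无', '聊', '!']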
3. Code
import numpy as np
import json


class Bayes():
    def __init__(self):
        pass

    def loadData(self, url):
        """Load the crawled JSON and convert ban types to binary labels."""
        with open(url, 'r', encoding='utf-8') as f:
            data_dect = json.load(f)
        label = []
        posting_list = []
        self.data_size = len(data_dect)
        for item in data_dect:
            label.append(item['type'])
            # Character-level tokenization: each character is one "word".
            posting_list.append([word for word in item['article']])
        return_label = []
        for example in label:
            if example == '永久封禁':  # '永久封禁' = permanently banned
                return_label.append(1)
            else:
                return_label.append(0)
        return posting_list, return_label

    def creat_vocabulary_list(self, data_set):
        """Build the vocabulary as the union of all document token sets."""
        vocabulary_list = set([])
        for document in data_set:
            vocabulary_list = vocabulary_list | set(document)
        return list(vocabulary_list)

    def is_word_in_vocab(self, vocab_list, input_set):
        """Convert a document into a binary bag-of-words vector."""
        return_vector = [0] * len(vocab_list)
        for word in input_set:
            if word in vocab_list:
                return_vector[vocab_list.index(word)] = 1
            else:
                print("Word: %s is not in the vocabulary!" % word)
        return return_vector

    def train_naive_bayes(self, train_set, train_category_set):
        """Estimate the class prior and per-word log probabilities."""
        num_train_document = len(train_set)
        num_words = len(train_set[0])
        p_abusive = sum(train_category_set) / num_train_document
        # Smoothing: numerators start at 1, denominators at 2,
        # so no probability is 0 and there is no division by zero.
        p_abusive_num = np.ones(num_words)
        p_unabusive_num = np.ones(num_words)
        p_abusive_all_num = 2.0
        p_unabusive_all_num = 2.0
        for i in range(num_train_document):
            if train_category_set[i] == 1:
                p_abusive_num += train_set[i]
                p_abusive_all_num += np.sum(train_set[i])
            else:
                p_unabusive_num += train_set[i]
                p_unabusive_all_num += np.sum(train_set[i])
        # Log probabilities avoid multiplicative underflow at classify time.
        p_abusive_vec = np.log(p_abusive_num / p_abusive_all_num)
        p_unabusive_vec = np.log(p_unabusive_num / p_unabusive_all_num)
        return p_abusive, p_abusive_vec, p_unabusive_vec

    def classify(self, input_x, p_abusive, p_abusive_vec, p_unabusive_vec):
        """Pick the class with the larger log posterior; P(x) cancels."""
        p1 = np.sum(input_x * p_abusive_vec) + np.log(p_abusive)
        p0 = np.sum(input_x * p_unabusive_vec) + np.log(1.0 - p_abusive)
        if p1 > p0:
            return 1
        else:
            return 0

    def test(self):
        # Path from the original post; adjust to your local copy.
        URL = r'2020\ML\ML_action\\3.NaiveBayes\data\blackroom.json'
        posting_list, class_vec = self.loadData(URL)
        vocab_list = self.creat_vocabulary_list(posting_list)
        print("len = ", len(posting_list))
        # First 2050 records for training, the rest for testing.
        train_size = 2050
        test_list = posting_list[train_size:]
        test_label = class_vec[train_size:]
        posting_list = posting_list[:train_size]
        class_vec = class_vec[:train_size]
        train_set = []
        for posting_document in posting_list:
            train_set.append(self.is_word_in_vocab(vocab_list, posting_document))
        index = 0
        rate = 0
        pAb, p1v, p0v = self.train_naive_bayes(np.array(train_set), np.array(class_vec))
        print(pAb, p1v, p0v)
        for example in test_list:
            test_input_data = np.array(self.is_word_in_vocab(vocab_list, example))
            print(example)
            test_result = self.classify(test_input_data, pAb, p1v, p0v)
            if test_result != test_label[index]:
                rate += 1
            index += 1
        print("error rate: %f" % float(rate / len(test_label)))


DEBUG = True
nb = Bayes()
if DEBUG:
    nb.test()
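For completeness, a minimal usage sketch: train on the whole file, then score a single new danmaku with the methods defined above. The file path and the sample string are placeholders, not values from the post:

import numpy as np

nb = Bayes()
posting_list, class_vec = nb.loadData(r'data/blackroom.json')  # assumed local path
vocab_list = nb.creat_vocabulary_list(posting_list)
train_set = [nb.is_word_in_vocab(vocab_list, doc) for doc in posting_list]
pAb, p1v, p0v = nb.train_naive_bayes(np.array(train_set), np.array(class_vec))

new_danmaku = [ch for ch in "这是一条测试弹幕"]   # character-level tokens
x = np.array(nb.is_word_in_vocab(vocab_list, new_danmaku))
print(nb.classify(x, pAb, p1v, p0v))             # 1 = the permanently-banned class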
References
Machine Learning in Action (机器学习实战)
https://github.com/apachecn/AiLearning/blob/master/docs/ml/4.%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF.md
https://www.cnblogs.com/jpcflyer/p/11069659.html